From 7e2c4ed21622a4fefab58e709528a14866e0d133 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 1 Jul 2020 14:03:16 -0400 Subject: [PATCH] data type that starts off using a set but converts to a bloom filter when large This adds a dep on hashable, but it's a free dependency, since unordered-containers already pulled it in. Using unordered-containers for the set seems to make sense, since it hashes and bloom filter hashes too. (Though different hashes.) I dunno, never quite know if I should use unordered-containers or containers. --- Annex/BloomFilter.hs | 93 ++++++++++++++++++++++++++++++++++++++++++-- debian/control | 1 + git-annex.cabal | 1 + 3 files changed, 92 insertions(+), 3 deletions(-) diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs index 544fdeeccc..5b735b5b8b 100644 --- a/Annex/BloomFilter.hs +++ b/Annex/BloomFilter.hs @@ -1,10 +1,13 @@ {- git-annex bloom filter - - - Copyright 2010-2015 Joey Hess + - Copyright 2010-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + module Annex.BloomFilter where import Annex.Common @@ -12,6 +15,10 @@ import qualified Annex import Utility.Bloom import Control.Monad.ST +import Data.STRef +import Data.Hashable +import qualified Data.HashSet as S +import qualified Data.ByteString as B {- A bloom filter capable of holding half a million keys with a - false positive rate of 1 in 10000000 uses around 16 mb of memory, @@ -28,7 +35,7 @@ bloomBitsHashes = do case safeSuggestSizing capacity (1 / fromIntegral accuracy) of Left e -> do warning $ "bloomfilter " ++ e ++ "; falling back to sane value" - -- precaulculated value for 500000 (1/10000000) + -- precalculated value for 500000 (1/10000000) return (16777216,23) Right v -> return v @@ -40,7 +47,10 @@ bloomBitsHashes = do - Once the action completes, the mutable filter is frozen - for later use. -} -genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v) +genBloomFilter + :: Utility.Bloom.Hashable v + => ((v -> Annex ()) -> Annex ()) + -> Annex (Bloom v) genBloomFilter populate = do (numbits, numhashes) <- bloomBitsHashes bloom <- lift $ newMB (cheapHashes numhashes) numbits @@ -51,3 +61,80 @@ genBloomFilter populate = do bloomFilter :: [v] -> Bloom v -> [v] bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l + +{- This starts off as a HashSet, and is only converted to a bloom filter + - if it grows too large. + - + - The advantage is, if it stays a HashSet, queries do not have false + - positives. + -} +data BloomableFilter v + = BloomableFilter (S.HashSet (BloomBytes v)) + | BloomedFilter (Bloom (BloomBytes v)) + deriving (Show) + +newtype BloomBytes v = BloomBytes B.ByteString + deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable) + +class BloomableBytes v where + toBloomBytes :: v -> BloomBytes v + +instance BloomableBytes B.ByteString where + toBloomBytes = BloomBytes + +instance BloomableBytes Key where + toBloomBytes = BloomBytes . serializeKey' + +{- The HashSet is grown until it uses around as much memory as the bloom + - filter is configured to use. So peak memory use is 2x when the HashSet + - is full and is being converted to the bloom filter. + -} +genBloomableFilter + :: BloomableBytes v + => ((v -> Annex ()) -> Annex ()) + -> Annex (BloomableFilter v) +genBloomableFilter populate = do + (numbits, numhashes) <- bloomBitsHashes + -- A HashSet is a tree, so there's some memory overhead beyond + -- storing the values. Use 2/3 of the memory for storing + -- the values, and reserve the rest for that overhead. + let maxsz = (numbits `div` 8) `div` 3 * 2 + bv <- lift $ newMB (cheapHashes numhashes) numbits + sv <- lift $ newSTRef S.empty + szv <- lift $ newSTRef (Just 0) + populate $ \v -> lift $ readSTRef szv >>= \case + Just n + | n < maxsz -> do + let bb@(BloomBytes b) = toBloomBytes v + modifySTRef' sv (S.insert bb) + modifySTRef' szv (fmap (+ B.length b)) + | otherwise -> do + s <- readSTRef sv + forM_ (S.toList s) $ + insertMB bv + modifySTRef' sv (const S.empty) + modifySTRef' szv (const Nothing) + insertMB bv (toBloomBytes v) + Nothing -> insertMB bv (toBloomBytes v) + lift $ readSTRef szv >>= \case + Just _ -> BloomableFilter <$> readSTRef sv + Nothing -> BloomedFilter <$> unsafeFreezeMB bv + where + lift = liftIO . stToIO + +data ElemWithFalsePositives + = ElemNo + | ElemYes + | ElemProbablyYes + deriving (Show) + +elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives +elemB' v (BloomedFilter b) = + case elemB (toBloomBytes v) b of + True -> ElemProbablyYes + False -> ElemNo +elemB' v (BloomableFilter s) = + case S.member (toBloomBytes v) s of + True -> ElemYes + False -> ElemNo + diff --git a/debian/control b/debian/control index 06d9f9f044..224b360f8b 100644 --- a/debian/control +++ b/debian/control @@ -29,6 +29,7 @@ Build-Depends: libghc-aeson-dev, libghc-tagsoup-dev, libghc-unordered-containers-dev, + libghc-hashable-dev, libghc-ifelse-dev, libghc-bloomfilter-dev, libghc-edit-distance-dev, diff --git a/git-annex.cabal b/git-annex.cabal index c11f8e96b0..17c9a94720 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -361,6 +361,7 @@ Executable git-annex vector, tagsoup, unordered-containers, + hashable, feed (>= 1.0.0), regex-tdfa, socks,