data type that starts off using a set but converts to a bloom filter when large

This adds a dep on hashable, but it's a free dependency, since unordered-containers already pulled it in. Using unordered-containers for the set seems to make sense, since it hashes and bloom filter hashes too. (Though different hashes.) I dunno, never quite know if I should use unordered-containers or containers.
2020-07-01 14:03:16 -04:00 · 2020-07-01 14:03:16 -04:00 · 7e2c4ed216
commit 7e2c4ed216
parent 424b1912d6
3 changed files with 92 additions and 3 deletions
--- a/Annex/BloomFilter.hs
+++ b/Annex/BloomFilter.hs
@ -1,10 +1,13 @@
 {- git-annex bloom filter
 -
- - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2020 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 module Annex.BloomFilter where
 import Annex.Common
@ -12,6 +15,10 @@ import qualified Annex
 import Utility.Bloom
 import Control.Monad.ST
 import Data.STRef
 import Data.Hashable
 import qualified Data.HashSet as S
 import qualified Data.ByteString as B
 {- A bloom filter capable of holding half a million keys with a
 - false positive rate of 1 in 10000000 uses around 16 mb of memory,
@ -28,7 +35,7 @@ bloomBitsHashes = do
 	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
 		Left e -> do
 			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-			-- precaulculated value for 500000 (1/10000000)
+			-- precalculated value for 500000 (1/10000000)
 			return (16777216,23)
 		Right v -> return v
@ -40,7 +47,10 @@ bloomBitsHashes = do
 - Once the action completes, the mutable filter is frozen
 - for later use.
 -}
-genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
+genBloomFilter
 	:: Utility.Bloom.Hashable v
 	=> ((v -> Annex ()) -> Annex ())
 	-> Annex (Bloom v)
 genBloomFilter populate = do
 	(numbits, numhashes) <- bloomBitsHashes
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
@ -51,3 +61,80 @@ genBloomFilter populate = do
 {- Keeps only the values from the list that the bloom filter reports
 - as not being members. -}
 bloomFilter :: [v] -> Bloom v -> [v]
 bloomFilter l bloom = [ v | v <- l, v `notElemB` bloom ]
 {- This starts off as a HashSet, and is only converted to a bloom filter
 - if it grows too large.
 - 
 - The advantage is, if it stays a HashSet, queries do not have false
 - positives.
 -}
 data BloomableFilter v
 	-- Exact representation: a set of the values added so far.
 	= BloomableFilter (S.HashSet (BloomBytes v))
 	-- Bloom filter representation, used once the set grew too large.
 	| BloomedFilter (Bloom (BloomBytes v))
 	deriving (Show)
 -- The bytes that are stored in the HashSet or the bloom filter for a
 -- value. The type parameter v does not appear in the representation;
 -- it only records which type the bytes were made from.
 newtype BloomBytes v = BloomBytes B.ByteString
 	-- Instances of both Hashable classes are derived from the
 	-- ByteString, so the same value can be put in either a HashSet
 	-- or a bloom filter.
 	deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable)
 -- Types whose values can be stored in a BloomableFilter.
 class BloomableBytes v where
 	-- Converts a value to the bytes that represent it in the filter.
 	toBloomBytes :: v -> BloomBytes v
 -- A ByteString is stored as-is.
 instance BloomableBytes B.ByteString where
 	toBloomBytes bytes = BloomBytes bytes
 -- A Key is stored in the form produced by serializeKey'.
 instance BloomableBytes Key where
 	toBloomBytes k = BloomBytes (serializeKey' k)
 {- Generates a BloomableFilter from the values that the populate action
 - passes to the callback it is given.
 -
 - The HashSet is grown until it uses around as much memory as the bloom
 - filter is configured to use. So peak memory use is 2x when the HashSet
 - is full and is being converted to the bloom filter.
 -
 - A value that is already in the HashSet is not counted again toward
 - that limit, so duplicate values do not cause an early conversion
 - to the bloom filter.
 -}
 genBloomableFilter
 	:: BloomableBytes v
 	=> ((v -> Annex ()) -> Annex ())
 	-> Annex (BloomableFilter v)
 genBloomableFilter populate = do
 	(numbits, numhashes) <- bloomBitsHashes
 	-- A HashSet is a tree, so there's some memory overhead beyond
 	-- storing the values. Use 2/3 of the memory for storing
 	-- the values, and reserve the rest for that overhead.
 	let maxsz = (numbits `div` 8) `div` 3 * 2
 	bv <- lift $ newMB (cheapHashes numhashes) numbits
 	sv <- lift $ newSTRef S.empty
 	-- Just the number of bytes stored in the HashSet so far,
 	-- or Nothing once the bloom filter has taken over.
 	szv <- lift $ newSTRef (Just 0)
 	populate $ \v -> lift $ do
 		let bb@(BloomBytes b) = toBloomBytes v
 		readSTRef szv >>= \case
 			Just n
 				| n < maxsz -> do
 					s <- readSTRef sv
 					-- Only a value that is new to the set
 					-- makes it larger, so only then is
 					-- the byte count increased.
 					if S.member bb s
 						then return ()
 						else do
 							writeSTRef sv $! S.insert bb s
 							writeSTRef szv $! Just $! n + B.length b
 				| otherwise -> do
 					-- The set is full. Move its contents
 					-- into the bloom filter, and empty it
 					-- so its memory can be freed.
 					s <- readSTRef sv
 					mapM_ (insertMB bv) (S.toList s)
 					writeSTRef sv S.empty
 					writeSTRef szv Nothing
 					insertMB bv bb
 			Nothing -> insertMB bv bb
 	-- Return whichever representation ended up being used.
 	lift $ readSTRef szv >>= \case
 		Just _ -> BloomableFilter <$> readSTRef sv
 		Nothing -> BloomedFilter <$> unsafeFreezeMB bv
  where
 	lift = liftIO . stToIO
 -- Result of checking whether a value is in a BloomableFilter.
 data ElemWithFalsePositives
 	-- The value was not found in the filter.
 	= ElemNo
 	-- The value was found in the HashSet.
 	| ElemYes
 	-- The value was found in the bloom filter, which can have
 	-- false positives.
 	| ElemProbablyYes
 	deriving (Show)
 {- Checks if a value is in a BloomableFilter. A match in the HashSet
 - is ElemYes, while a match in the bloom filter is only
 - ElemProbablyYes. -}
 elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives
 elemB' v = \case
 	BloomedFilter b
 		| elemB bytes b -> ElemProbablyYes
 		| otherwise -> ElemNo
 	BloomableFilter s
 		| S.member bytes s -> ElemYes
 		| otherwise -> ElemNo
  where
 	bytes = toBloomBytes v
--- a/debian/control
+++ b/debian/control
@ -29,6 +29,7 @@ Build-Depends:
 	libghc-aeson-dev,
 	libghc-tagsoup-dev,
 	libghc-unordered-containers-dev,
 	libghc-hashable-dev,
 	libghc-ifelse-dev,
 	libghc-bloomfilter-dev,
 	libghc-edit-distance-dev,
--- a/git-annex.cabal
+++ b/git-annex.cabal
@ -361,6 +361,7 @@ Executable git-annex
   vector,
   tagsoup,
   unordered-containers,
   hashable,
   feed (>= 1.0.0),
   regex-tdfa,
   socks,