data type that starts off using a set but converts to a bloom filter when large

This adds a dep on hashable, but it's a free dependency, since unordered-containers already pulled it in. Using unordered-containers for the set seems to make sense, since it hashes and bloom filter hashes too. (Though different hashes.) I dunno, never quite know if I should use unordered-containers or containers.
2020-07-01 14:03:16 -04:00 · 2020-07-01 14:03:16 -04:00 · 7e2c4ed216
commit 7e2c4ed216
parent 424b1912d6
3 changed files with 92 additions and 3 deletions
--- a/Annex/BloomFilter.hs
+++ b/Annex/BloomFilter.hs
@ -1,10 +1,13 @@
 {- git-annex bloom filter
 -
- - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2020 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+
 module Annex.BloomFilter where

 import Annex.Common
@ -12,6 +15,10 @@ import qualified Annex
 import Utility.Bloom

 import Control.Monad.ST
+import Data.STRef
+import Data.Hashable
+import qualified Data.HashSet as S
+import qualified Data.ByteString as B

 {- A bloom filter capable of holding half a million keys with a
 - false positive rate of 1 in 10000000 uses around 16 mb of memory,
@ -28,7 +35,7 @@ bloomBitsHashes = do
 	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
 		Left e -> do
 			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-			-- precaulculated value for 500000 (1/10000000)
+			-- precalculated value for 500000 (1/10000000)
 			return (16777216,23)
 		Right v -> return v

@ -40,7 +47,10 @@ bloomBitsHashes = do
 - Once the action completes, the mutable filter is frozen
 - for later use.
 -}
-genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
+genBloomFilter
+	:: Utility.Bloom.Hashable v
+	=> ((v -> Annex ()) -> Annex ())
+	-> Annex (Bloom v)
 genBloomFilter populate = do
 	(numbits, numhashes) <- bloomBitsHashes
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
@ -51,3 +61,80 @@ genBloomFilter populate = do

 bloomFilter :: [v] -> Bloom v -> [v]
 bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
+
+{- This starts off as a HashSet, and is only converted to a bloom filter
+ - if it grows too large.
+ - 
+ - The advantage is, if it stays a HashSet, queries do not have false
+ - positives.
+ -}
+data BloomableFilter v
+	= BloomableFilter (S.HashSet (BloomBytes v))
+	| BloomedFilter (Bloom (BloomBytes v))
+	deriving (Show)
+
+newtype BloomBytes v = BloomBytes B.ByteString
+	deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable)
+
+class BloomableBytes v where
+	toBloomBytes :: v -> BloomBytes v
+
+instance BloomableBytes B.ByteString where
+	toBloomBytes = BloomBytes
+
+instance BloomableBytes Key where
+	toBloomBytes = BloomBytes . serializeKey'
+
+{- The HashSet is grown until it uses around as much memory as the bloom
+ - filter is configured to use. So peak memory use is 2x when the HashSet
+ - is full and is being converted to the bloom filter.
+ -}
+genBloomableFilter
+	:: BloomableBytes v
+	=> ((v -> Annex ()) -> Annex ())
+	-> Annex (BloomableFilter v)
+genBloomableFilter populate = do
+	(numbits, numhashes) <- bloomBitsHashes
+	-- A HashSet is a tree, so there's some memory overhead beyond
+	-- storing the values. Use 2/3 of the memory for storing
+	-- the values, and reserve the rest for that overhead.
+	let maxsz = (numbits `div` 8) `div` 3 * 2
+	bv <- lift $ newMB (cheapHashes numhashes) numbits
+	sv <- lift $ newSTRef S.empty
+	szv <- lift $ newSTRef (Just 0)
+	populate $ \v -> lift $ readSTRef szv >>= \case
+		Just n 
+			| n < maxsz -> do
+				let bb@(BloomBytes b) = toBloomBytes v
+				modifySTRef' sv (S.insert bb)
+				modifySTRef' szv (fmap (+ B.length b))
+			| otherwise -> do
+				s <- readSTRef sv
+				forM_ (S.toList s) $
+					insertMB bv
+				modifySTRef' sv (const S.empty)
+				modifySTRef' szv (const Nothing)
+				insertMB bv (toBloomBytes v)
+		Nothing -> insertMB bv (toBloomBytes v)
+	lift $ readSTRef szv >>= \case
+		Just _ -> BloomableFilter <$> readSTRef sv
+		Nothing -> BloomedFilter <$> unsafeFreezeMB bv
+  where
+	lift = liftIO . stToIO
+
+data ElemWithFalsePositives
+	= ElemNo
+	| ElemYes
+	| ElemProbablyYes
+	deriving (Show)
+
+elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives
+elemB' v (BloomedFilter b) =
+	case elemB (toBloomBytes v) b of
+		True -> ElemProbablyYes
+		False -> ElemNo
+elemB' v (BloomableFilter s) =
+	case S.member (toBloomBytes v) s of
+		True -> ElemYes
+		False -> ElemNo
+
--- a/debian/control
+++ b/debian/control
@ -29,6 +29,7 @@ Build-Depends:
 	libghc-aeson-dev,
 	libghc-tagsoup-dev,
 	libghc-unordered-containers-dev,
+	libghc-hashable-dev,
 	libghc-ifelse-dev,
 	libghc-bloomfilter-dev,
 	libghc-edit-distance-dev,
--- a/git-annex.cabal
+++ b/git-annex.cabal
@ -361,6 +361,7 @@ Executable git-annex
   vector,
   tagsoup,
   unordered-containers,
+   hashable,
   feed (>= 1.0.0),
   regex-tdfa,
   socks,