From 7e2c4ed21622a4fefab58e709528a14866e0d133 Mon Sep 17 00:00:00 2001
From: Joey Hess <joeyh@joeyh.name>
Date: Wed, 1 Jul 2020 14:03:16 -0400
Subject: [PATCH] data type that starts off using a set but converts to a bloom
 filter when large

This adds a dep on hashable, but it's a free dependency, since
unordered-containers already pulled it in.

Using unordered-containers for the set seems to make sense, since it
hashes and bloom filter hashes too. (Though different hashes.)
I dunno, never quite know if I should use unordered-containers or containers.
---
 Annex/BloomFilter.hs | 93 ++++++++++++++++++++++++++++++++++++++++++--
 debian/control       |  1 +
 git-annex.cabal      |  1 +
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs
index 544fdeeccc..5b735b5b8b 100644
--- a/Annex/BloomFilter.hs
+++ b/Annex/BloomFilter.hs
@@ -1,10 +1,13 @@
 {- git-annex bloom filter
  -
- - Copyright 2010-2015 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2020 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
 
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+
 module Annex.BloomFilter where
 
 import Annex.Common
@@ -12,6 +15,10 @@ import qualified Annex
 import Utility.Bloom
 
 import Control.Monad.ST
+import Data.STRef
+import Data.Hashable
+import qualified Data.HashSet as S
+import qualified Data.ByteString as B
 
 {- A bloom filter capable of holding half a million keys with a
  - false positive rate of 1 in 10000000 uses around 16 mb of memory,
@@ -28,7 +35,7 @@ bloomBitsHashes = do
 	case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
 		Left e -> do
 			warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-			-- precaulculated value for 500000 (1/10000000)
+			-- precalculated value for 500000 (1/10000000)
 			return (16777216,23)
 		Right v -> return v
 
@@ -40,7 +47,10 @@ bloomBitsHashes = do
  - Once the action completes, the mutable filter is frozen
  - for later use.
  -}
-genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
+genBloomFilter
+	:: Utility.Bloom.Hashable v
+	=> ((v -> Annex ()) -> Annex ())
+	-> Annex (Bloom v)
 genBloomFilter populate = do
 	(numbits, numhashes) <- bloomBitsHashes
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
@@ -51,3 +61,80 @@ genBloomFilter populate = do
 
 bloomFilter :: [v] -> Bloom v -> [v]
 bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
+
+{- This starts off as a HashSet, and is only converted to a bloom filter
+ - if it grows too large.
+ - 
+ - The advantage is, if it stays a HashSet, queries do not have false
+ - positives.
+ -}
+data BloomableFilter v
+	= BloomableFilter (S.HashSet (BloomBytes v))
+	| BloomedFilter (Bloom (BloomBytes v))
+	deriving (Show)
+
+newtype BloomBytes v = BloomBytes B.ByteString
+	deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable)
+
+class BloomableBytes v where
+	toBloomBytes :: v -> BloomBytes v
+
+instance BloomableBytes B.ByteString where
+	toBloomBytes = BloomBytes
+
+instance BloomableBytes Key where
+	toBloomBytes = BloomBytes . serializeKey'
+
+{- The HashSet is grown until it uses around as much memory as the bloom
+ - filter is configured to use. So peak memory use is 2x when the HashSet
+ - is full and is being converted to the bloom filter.
+ -}
+genBloomableFilter
+	:: BloomableBytes v
+	=> ((v -> Annex ()) -> Annex ())
+	-> Annex (BloomableFilter v)
+genBloomableFilter populate = do
+	(numbits, numhashes) <- bloomBitsHashes
+	-- A HashSet is a tree, so there's some memory overhead beyond
+	-- storing the values. Use 2/3 of the memory for storing
+	-- the values, and reserve the rest for that overhead.
+	let maxsz = (numbits `div` 8) `div` 3 * 2
+	bv <- lift $ newMB (cheapHashes numhashes) numbits
+	sv <- lift $ newSTRef S.empty
+	szv <- lift $ newSTRef (Just 0)
+	populate $ \v -> lift $ readSTRef szv >>= \case
+		Just n 
+			| n < maxsz -> do
+				let bb@(BloomBytes b) = toBloomBytes v
+				modifySTRef' sv (S.insert bb)
+				modifySTRef' szv (fmap (+ B.length b))
+			| otherwise -> do
+				s <- readSTRef sv
+				forM_ (S.toList s) $
+					insertMB bv
+				modifySTRef' sv (const S.empty)
+				modifySTRef' szv (const Nothing)
+				insertMB bv (toBloomBytes v)
+		Nothing -> insertMB bv (toBloomBytes v)
+	lift $ readSTRef szv >>= \case
+		Just _ -> BloomableFilter <$> readSTRef sv
+		Nothing -> BloomedFilter <$> unsafeFreezeMB bv
+  where
+	lift = liftIO . stToIO
+
+data ElemWithFalsePositives
+	= ElemNo
+	| ElemYes
+	| ElemProbablyYes
+	deriving (Show)
+
+elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives
+elemB' v (BloomedFilter b) =
+	case elemB (toBloomBytes v) b of
+		True -> ElemProbablyYes
+		False -> ElemNo
+elemB' v (BloomableFilter s) =
+	case S.member (toBloomBytes v) s of
+		True -> ElemYes
+		False -> ElemNo
+
diff --git a/debian/control b/debian/control
index 06d9f9f044..224b360f8b 100644
--- a/debian/control
+++ b/debian/control
@@ -29,6 +29,7 @@ Build-Depends:
 	libghc-aeson-dev,
 	libghc-tagsoup-dev,
 	libghc-unordered-containers-dev,
+	libghc-hashable-dev,
 	libghc-ifelse-dev,
 	libghc-bloomfilter-dev,
 	libghc-edit-distance-dev,
diff --git a/git-annex.cabal b/git-annex.cabal
index c11f8e96b0..17c9a94720 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -361,6 +361,7 @@ Executable git-annex
    vector,
    tagsoup,
    unordered-containers,
+   hashable,
    feed (>= 1.0.0),
    regex-tdfa,
    socks,