data type that starts off using a set but converts to a bloom filter when large
This adds a dep on hashable, but it's a free dependency, since unordered-containers already pulled it in. Using unordered-containers for the set seems to make sense, since it hashes and bloom filter hashes too. (Though different hashes.) I dunno, never quite know if I should use unordered-containers or containers.
This commit is contained in:
parent
424b1912d6
commit
7e2c4ed216
3 changed files with 92 additions and 3 deletions
|
@ -1,10 +1,13 @@
|
||||||
{- git-annex bloom filter
|
{- git-annex bloom filter
|
||||||
-
|
-
|
||||||
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
|
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE LambdaCase #-}
|
||||||
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
|
|
||||||
module Annex.BloomFilter where
|
module Annex.BloomFilter where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
|
@ -12,6 +15,10 @@ import qualified Annex
|
||||||
import Utility.Bloom
|
import Utility.Bloom
|
||||||
|
|
||||||
import Control.Monad.ST
|
import Control.Monad.ST
|
||||||
|
import Data.STRef
|
||||||
|
import Data.Hashable
|
||||||
|
import qualified Data.HashSet as S
|
||||||
|
import qualified Data.ByteString as B
|
||||||
|
|
||||||
{- A bloom filter capable of holding half a million keys with a
|
{- A bloom filter capable of holding half a million keys with a
|
||||||
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
|
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
|
||||||
|
@ -28,7 +35,7 @@ bloomBitsHashes = do
|
||||||
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
|
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
|
||||||
Left e -> do
|
Left e -> do
|
||||||
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
|
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
|
||||||
-- precaulculated value for 500000 (1/10000000)
|
-- precalculated value for 500000 (1/10000000)
|
||||||
return (16777216,23)
|
return (16777216,23)
|
||||||
Right v -> return v
|
Right v -> return v
|
||||||
|
|
||||||
|
@ -40,7 +47,10 @@ bloomBitsHashes = do
|
||||||
- Once the action completes, the mutable filter is frozen
|
- Once the action completes, the mutable filter is frozen
|
||||||
- for later use.
|
- for later use.
|
||||||
-}
|
-}
|
||||||
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
|
genBloomFilter
|
||||||
|
:: Utility.Bloom.Hashable v
|
||||||
|
=> ((v -> Annex ()) -> Annex ())
|
||||||
|
-> Annex (Bloom v)
|
||||||
genBloomFilter populate = do
|
genBloomFilter populate = do
|
||||||
(numbits, numhashes) <- bloomBitsHashes
|
(numbits, numhashes) <- bloomBitsHashes
|
||||||
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
||||||
|
@ -51,3 +61,80 @@ genBloomFilter populate = do
|
||||||
|
|
||||||
bloomFilter :: [v] -> Bloom v -> [v]
|
bloomFilter :: [v] -> Bloom v -> [v]
|
||||||
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
|
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
|
||||||
|
|
||||||
|
{- This starts off as a HashSet, and is only converted to a bloom filter
|
||||||
|
- if it grows too large.
|
||||||
|
-
|
||||||
|
- The advantage is, if it stays a HashSet, queries do not have false
|
||||||
|
- positives.
|
||||||
|
-}
|
||||||
|
data BloomableFilter v
|
||||||
|
= BloomableFilter (S.HashSet (BloomBytes v))
|
||||||
|
| BloomedFilter (Bloom (BloomBytes v))
|
||||||
|
deriving (Show)
|
||||||
|
|
||||||
|
newtype BloomBytes v = BloomBytes B.ByteString
|
||||||
|
deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable)
|
||||||
|
|
||||||
|
class BloomableBytes v where
|
||||||
|
toBloomBytes :: v -> BloomBytes v
|
||||||
|
|
||||||
|
instance BloomableBytes B.ByteString where
|
||||||
|
toBloomBytes = BloomBytes
|
||||||
|
|
||||||
|
instance BloomableBytes Key where
|
||||||
|
toBloomBytes = BloomBytes . serializeKey'
|
||||||
|
|
||||||
|
{- The HashSet is grown until it uses around as much memory as the bloom
|
||||||
|
- filter is configured to use. So peak memory use is 2x when the HashSet
|
||||||
|
- is full and is being converted to the bloom filter.
|
||||||
|
-}
|
||||||
|
genBloomableFilter
|
||||||
|
:: BloomableBytes v
|
||||||
|
=> ((v -> Annex ()) -> Annex ())
|
||||||
|
-> Annex (BloomableFilter v)
|
||||||
|
genBloomableFilter populate = do
|
||||||
|
(numbits, numhashes) <- bloomBitsHashes
|
||||||
|
-- A HashSet is a tree, so there's some memory overhead beyond
|
||||||
|
-- storing the values. Use 2/3 of the memory for storing
|
||||||
|
-- the values, and reserve the rest for that overhead.
|
||||||
|
let maxsz = (numbits `div` 8) `div` 3 * 2
|
||||||
|
bv <- lift $ newMB (cheapHashes numhashes) numbits
|
||||||
|
sv <- lift $ newSTRef S.empty
|
||||||
|
szv <- lift $ newSTRef (Just 0)
|
||||||
|
populate $ \v -> lift $ readSTRef szv >>= \case
|
||||||
|
Just n
|
||||||
|
| n < maxsz -> do
|
||||||
|
let bb@(BloomBytes b) = toBloomBytes v
|
||||||
|
modifySTRef' sv (S.insert bb)
|
||||||
|
modifySTRef' szv (fmap (+ B.length b))
|
||||||
|
| otherwise -> do
|
||||||
|
s <- readSTRef sv
|
||||||
|
forM_ (S.toList s) $
|
||||||
|
insertMB bv
|
||||||
|
modifySTRef' sv (const S.empty)
|
||||||
|
modifySTRef' szv (const Nothing)
|
||||||
|
insertMB bv (toBloomBytes v)
|
||||||
|
Nothing -> insertMB bv (toBloomBytes v)
|
||||||
|
lift $ readSTRef szv >>= \case
|
||||||
|
Just _ -> BloomableFilter <$> readSTRef sv
|
||||||
|
Nothing -> BloomedFilter <$> unsafeFreezeMB bv
|
||||||
|
where
|
||||||
|
lift = liftIO . stToIO
|
||||||
|
|
||||||
|
data ElemWithFalsePositives
|
||||||
|
= ElemNo
|
||||||
|
| ElemYes
|
||||||
|
| ElemProbablyYes
|
||||||
|
deriving (Show)
|
||||||
|
|
||||||
|
elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives
|
||||||
|
elemB' v (BloomedFilter b) =
|
||||||
|
case elemB (toBloomBytes v) b of
|
||||||
|
True -> ElemProbablyYes
|
||||||
|
False -> ElemNo
|
||||||
|
elemB' v (BloomableFilter s) =
|
||||||
|
case S.member (toBloomBytes v) s of
|
||||||
|
True -> ElemYes
|
||||||
|
False -> ElemNo
|
||||||
|
|
||||||
|
|
1
debian/control
vendored
1
debian/control
vendored
|
@ -29,6 +29,7 @@ Build-Depends:
|
||||||
libghc-aeson-dev,
|
libghc-aeson-dev,
|
||||||
libghc-tagsoup-dev,
|
libghc-tagsoup-dev,
|
||||||
libghc-unordered-containers-dev,
|
libghc-unordered-containers-dev,
|
||||||
|
libghc-hashable-dev,
|
||||||
libghc-ifelse-dev,
|
libghc-ifelse-dev,
|
||||||
libghc-bloomfilter-dev,
|
libghc-bloomfilter-dev,
|
||||||
libghc-edit-distance-dev,
|
libghc-edit-distance-dev,
|
||||||
|
|
|
@ -361,6 +361,7 @@ Executable git-annex
|
||||||
vector,
|
vector,
|
||||||
tagsoup,
|
tagsoup,
|
||||||
unordered-containers,
|
unordered-containers,
|
||||||
|
hashable,
|
||||||
feed (>= 1.0.0),
|
feed (>= 1.0.0),
|
||||||
regex-tdfa,
|
regex-tdfa,
|
||||||
socks,
|
socks,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue