Revert "data type that starts off using a set but converts to a bloom filter when large"
This reverts commit 7e2c4ed216
.
I was not able to use this in the end..
See comment in the previous commit.
This commit is contained in:
parent
00c9eb4c78
commit
087b7ee66a
3 changed files with 3 additions and 92 deletions
|
@ -1,13 +1,10 @@
|
||||||
{- git-annex bloom filter
|
{- git-annex bloom filter
|
||||||
-
|
-
|
||||||
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
|
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
{-# LANGUAGE LambdaCase #-}
|
|
||||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
|
||||||
|
|
||||||
module Annex.BloomFilter where
|
module Annex.BloomFilter where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
|
@ -15,10 +12,6 @@ import qualified Annex
|
||||||
import Utility.Bloom
|
import Utility.Bloom
|
||||||
|
|
||||||
import Control.Monad.ST
|
import Control.Monad.ST
|
||||||
import Data.STRef
|
|
||||||
import Data.Hashable
|
|
||||||
import qualified Data.HashSet as S
|
|
||||||
import qualified Data.ByteString as B
|
|
||||||
|
|
||||||
{- A bloom filter capable of holding half a million keys with a
|
{- A bloom filter capable of holding half a million keys with a
|
||||||
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
|
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
|
||||||
|
@ -35,7 +28,7 @@ bloomBitsHashes = do
|
||||||
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
|
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
|
||||||
Left e -> do
|
Left e -> do
|
||||||
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
|
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
|
||||||
-- precalculated value for 500000 (1/10000000)
|
-- precaulculated value for 500000 (1/10000000)
|
||||||
return (16777216,23)
|
return (16777216,23)
|
||||||
Right v -> return v
|
Right v -> return v
|
||||||
|
|
||||||
|
@ -47,10 +40,7 @@ bloomBitsHashes = do
|
||||||
- Once the action completes, the mutable filter is frozen
|
- Once the action completes, the mutable filter is frozen
|
||||||
- for later use.
|
- for later use.
|
||||||
-}
|
-}
|
||||||
genBloomFilter
|
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
|
||||||
:: Utility.Bloom.Hashable v
|
|
||||||
=> ((v -> Annex ()) -> Annex ())
|
|
||||||
-> Annex (Bloom v)
|
|
||||||
genBloomFilter populate = do
|
genBloomFilter populate = do
|
||||||
(numbits, numhashes) <- bloomBitsHashes
|
(numbits, numhashes) <- bloomBitsHashes
|
||||||
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
||||||
|
@ -61,80 +51,3 @@ genBloomFilter populate = do
|
||||||
|
|
||||||
bloomFilter :: [v] -> Bloom v -> [v]
|
bloomFilter :: [v] -> Bloom v -> [v]
|
||||||
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
|
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
|
||||||
|
|
||||||
{- This starts off as a HashSet, and is only converted to a bloom filter
|
|
||||||
- if it grows too large.
|
|
||||||
-
|
|
||||||
- The advantage is, if it stays a HashSet, queries do not have false
|
|
||||||
- positives.
|
|
||||||
-}
|
|
||||||
data BloomableFilter v
|
|
||||||
= BloomableFilter (S.HashSet (BloomBytes v))
|
|
||||||
| BloomedFilter (Bloom (BloomBytes v))
|
|
||||||
deriving (Show)
|
|
||||||
|
|
||||||
newtype BloomBytes v = BloomBytes B.ByteString
|
|
||||||
deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable)
|
|
||||||
|
|
||||||
class BloomableBytes v where
|
|
||||||
toBloomBytes :: v -> BloomBytes v
|
|
||||||
|
|
||||||
instance BloomableBytes B.ByteString where
|
|
||||||
toBloomBytes = BloomBytes
|
|
||||||
|
|
||||||
instance BloomableBytes Key where
|
|
||||||
toBloomBytes = BloomBytes . serializeKey'
|
|
||||||
|
|
||||||
{- The HashSet is grown until it uses around as much memory as the bloom
|
|
||||||
- filter is configured to use. So peak memory use is 2x when the HashSet
|
|
||||||
- is full and is being converted to the bloom filter.
|
|
||||||
-}
|
|
||||||
genBloomableFilter
|
|
||||||
:: BloomableBytes v
|
|
||||||
=> ((v -> Annex ()) -> Annex ())
|
|
||||||
-> Annex (BloomableFilter v)
|
|
||||||
genBloomableFilter populate = do
|
|
||||||
(numbits, numhashes) <- bloomBitsHashes
|
|
||||||
-- A HashSet is a tree, so there's some memory overhead beyond
|
|
||||||
-- storing the values. Use 2/3 of the memory for storing
|
|
||||||
-- the values, and reserve the rest for that overhead.
|
|
||||||
let maxsz = (numbits `div` 8) `div` 3 * 2
|
|
||||||
bv <- lift $ newMB (cheapHashes numhashes) numbits
|
|
||||||
sv <- lift $ newSTRef S.empty
|
|
||||||
szv <- lift $ newSTRef (Just 0)
|
|
||||||
populate $ \v -> lift $ readSTRef szv >>= \case
|
|
||||||
Just n
|
|
||||||
| n < maxsz -> do
|
|
||||||
let bb@(BloomBytes b) = toBloomBytes v
|
|
||||||
modifySTRef' sv (S.insert bb)
|
|
||||||
modifySTRef' szv (fmap (+ B.length b))
|
|
||||||
| otherwise -> do
|
|
||||||
s <- readSTRef sv
|
|
||||||
forM_ (S.toList s) $
|
|
||||||
insertMB bv
|
|
||||||
modifySTRef' sv (const S.empty)
|
|
||||||
modifySTRef' szv (const Nothing)
|
|
||||||
insertMB bv (toBloomBytes v)
|
|
||||||
Nothing -> insertMB bv (toBloomBytes v)
|
|
||||||
lift $ readSTRef szv >>= \case
|
|
||||||
Just _ -> BloomableFilter <$> readSTRef sv
|
|
||||||
Nothing -> BloomedFilter <$> unsafeFreezeMB bv
|
|
||||||
where
|
|
||||||
lift = liftIO . stToIO
|
|
||||||
|
|
||||||
data ElemWithFalsePositives
|
|
||||||
= ElemNo
|
|
||||||
| ElemYes
|
|
||||||
| ElemProbablyYes
|
|
||||||
deriving (Show)
|
|
||||||
|
|
||||||
elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives
|
|
||||||
elemB' v (BloomedFilter b) =
|
|
||||||
case elemB (toBloomBytes v) b of
|
|
||||||
True -> ElemProbablyYes
|
|
||||||
False -> ElemNo
|
|
||||||
elemB' v (BloomableFilter s) =
|
|
||||||
case S.member (toBloomBytes v) s of
|
|
||||||
True -> ElemYes
|
|
||||||
False -> ElemNo
|
|
||||||
|
|
||||||
|
|
1
debian/control
vendored
1
debian/control
vendored
|
@ -29,7 +29,6 @@ Build-Depends:
|
||||||
libghc-aeson-dev,
|
libghc-aeson-dev,
|
||||||
libghc-tagsoup-dev,
|
libghc-tagsoup-dev,
|
||||||
libghc-unordered-containers-dev,
|
libghc-unordered-containers-dev,
|
||||||
libghc-hashable-dev,
|
|
||||||
libghc-ifelse-dev,
|
libghc-ifelse-dev,
|
||||||
libghc-bloomfilter-dev,
|
libghc-bloomfilter-dev,
|
||||||
libghc-edit-distance-dev,
|
libghc-edit-distance-dev,
|
||||||
|
|
|
@ -361,7 +361,6 @@ Executable git-annex
|
||||||
vector,
|
vector,
|
||||||
tagsoup,
|
tagsoup,
|
||||||
unordered-containers,
|
unordered-containers,
|
||||||
hashable,
|
|
||||||
feed (>= 1.0.0),
|
feed (>= 1.0.0),
|
||||||
regex-tdfa,
|
regex-tdfa,
|
||||||
socks,
|
socks,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue