diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs index 5b735b5b8b..544fdeeccc 100644 --- a/Annex/BloomFilter.hs +++ b/Annex/BloomFilter.hs @@ -1,13 +1,10 @@ {- git-annex bloom filter - - - Copyright 2010-2020 Joey Hess + - Copyright 2010-2015 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE LambdaCase #-} -{-# LANGUAGE GeneralizedNewtypeDeriving #-} - module Annex.BloomFilter where import Annex.Common @@ -15,10 +12,6 @@ import qualified Annex import Utility.Bloom import Control.Monad.ST -import Data.STRef -import Data.Hashable -import qualified Data.HashSet as S -import qualified Data.ByteString as B {- A bloom filter capable of holding half a million keys with a - false positive rate of 1 in 10000000 uses around 16 mb of memory, @@ -35,7 +28,7 @@ bloomBitsHashes = do case safeSuggestSizing capacity (1 / fromIntegral accuracy) of Left e -> do warning $ "bloomfilter " ++ e ++ "; falling back to sane value" - -- precalculated value for 500000 (1/10000000) + -- precaulculated value for 500000 (1/10000000) return (16777216,23) Right v -> return v @@ -47,10 +40,7 @@ bloomBitsHashes = do - Once the action completes, the mutable filter is frozen - for later use. -} -genBloomFilter - :: Utility.Bloom.Hashable v - => ((v -> Annex ()) -> Annex ()) - -> Annex (Bloom v) +genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v) genBloomFilter populate = do (numbits, numhashes) <- bloomBitsHashes bloom <- lift $ newMB (cheapHashes numhashes) numbits @@ -61,80 +51,3 @@ genBloomFilter populate = do bloomFilter :: [v] -> Bloom v -> [v] bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l - -{- This starts off as a HashSet, and is only converted to a bloom filter - - if it grows too large. - - - - The advantage is, if it stays a HashSet, queries do not have false - - positives. - -} -data BloomableFilter v - = BloomableFilter (S.HashSet (BloomBytes v)) - | BloomedFilter (Bloom (BloomBytes v)) - deriving (Show) - -newtype BloomBytes v = BloomBytes B.ByteString - deriving (Show, Eq, Utility.Bloom.Hashable, Data.Hashable.Hashable) - -class BloomableBytes v where - toBloomBytes :: v -> BloomBytes v - -instance BloomableBytes B.ByteString where - toBloomBytes = BloomBytes - -instance BloomableBytes Key where - toBloomBytes = BloomBytes . serializeKey' - -{- The HashSet is grown until it uses around as much memory as the bloom - - filter is configured to use. So peak memory use is 2x when the HashSet - - is full and is being converted to the bloom filter. - -} -genBloomableFilter - :: BloomableBytes v - => ((v -> Annex ()) -> Annex ()) - -> Annex (BloomableFilter v) -genBloomableFilter populate = do - (numbits, numhashes) <- bloomBitsHashes - -- A HashSet is a tree, so there's some memory overhead beyond - -- storing the values. Use 2/3 of the memory for storing - -- the values, and reserve the rest for that overhead. - let maxsz = (numbits `div` 8) `div` 3 * 2 - bv <- lift $ newMB (cheapHashes numhashes) numbits - sv <- lift $ newSTRef S.empty - szv <- lift $ newSTRef (Just 0) - populate $ \v -> lift $ readSTRef szv >>= \case - Just n - | n < maxsz -> do - let bb@(BloomBytes b) = toBloomBytes v - modifySTRef' sv (S.insert bb) - modifySTRef' szv (fmap (+ B.length b)) - | otherwise -> do - s <- readSTRef sv - forM_ (S.toList s) $ - insertMB bv - modifySTRef' sv (const S.empty) - modifySTRef' szv (const Nothing) - insertMB bv (toBloomBytes v) - Nothing -> insertMB bv (toBloomBytes v) - lift $ readSTRef szv >>= \case - Just _ -> BloomableFilter <$> readSTRef sv - Nothing -> BloomedFilter <$> unsafeFreezeMB bv - where - lift = liftIO . stToIO - -data ElemWithFalsePositives - = ElemNo - | ElemYes - | ElemProbablyYes - deriving (Show) - -elemB' :: (BloomableBytes v) => v -> BloomableFilter v -> ElemWithFalsePositives -elemB' v (BloomedFilter b) = - case elemB (toBloomBytes v) b of - True -> ElemProbablyYes - False -> ElemNo -elemB' v (BloomableFilter s) = - case S.member (toBloomBytes v) s of - True -> ElemYes - False -> ElemNo - diff --git a/debian/control b/debian/control index 224b360f8b..06d9f9f044 100644 --- a/debian/control +++ b/debian/control @@ -29,7 +29,6 @@ Build-Depends: libghc-aeson-dev, libghc-tagsoup-dev, libghc-unordered-containers-dev, - libghc-hashable-dev, libghc-ifelse-dev, libghc-bloomfilter-dev, libghc-edit-distance-dev, diff --git a/git-annex.cabal b/git-annex.cabal index 17c9a94720..c11f8e96b0 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -361,7 +361,6 @@ Executable git-annex vector, tagsoup, unordered-containers, - hashable, feed (>= 1.0.0), regex-tdfa, socks,