2015-06-16 21:58:15 +00:00
|
|
|
{- git-annex bloom filter
|
|
|
|
-
|
2020-07-02 00:12:19 +00:00
|
|
|
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
|
2015-06-16 21:58:15 +00:00
|
|
|
-
|
2019-03-13 19:48:14 +00:00
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
2015-06-16 21:58:15 +00:00
|
|
|
-}
|
|
|
|
|
|
|
|
module Annex.BloomFilter where
|
|
|
|
|
2016-01-20 20:36:33 +00:00
|
|
|
import Annex.Common
|
2015-06-16 21:58:15 +00:00
|
|
|
import qualified Annex
|
|
|
|
import Utility.Bloom
|
|
|
|
|
|
|
|
import Control.Monad.ST
|
|
|
|
|
|
|
|
{- A bloom filter capable of holding half a million keys with a
|
|
|
|
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
|
|
|
|
- so will easily fit on even my lowest memory systems.
|
|
|
|
-}
|
|
|
|
bloomCapacity :: Annex Int
|
|
|
|
bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
|
|
|
|
bloomAccuracy :: Annex Int
|
|
|
|
bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
|
|
|
|
bloomBitsHashes :: Annex (Int, Int)
|
|
|
|
bloomBitsHashes = do
|
|
|
|
capacity <- bloomCapacity
|
|
|
|
accuracy <- bloomAccuracy
|
|
|
|
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
|
|
|
|
Left e -> do
|
filter out control characters in warning messages
Converted warning and similar to use StringContainingQuotedPath. Most
warnings are static strings, some do refer to filepaths that need to be
quoted, and others don't need quoting.
Note that, since quote filters out control characters of even
UnquotedString, this makes all warnings safe, even when an attacker
sneaks in a control character in some other way.
When json is being output, no quoting is done, since json gets its own
quoting.
This does, as a side effect, make warning messages in json output not
be indented. The indentation is only needed to offset warning messages
underneath the display of the file they apply to, so that's ok.
Sponsored-by: Brett Eisenberg on Patreon
2023-04-10 18:47:32 +00:00
|
|
|
warning $ UnquotedString $
|
|
|
|
"bloomfilter " ++ e ++ "; falling back to sane value"
|
2020-07-02 00:12:19 +00:00
|
|
|
-- precaulculated value for 500000 (1/10000000)
|
2015-06-16 21:58:15 +00:00
|
|
|
return (16777216,23)
|
|
|
|
Right v -> return v
|
|
|
|
|
|
|
|
{- Creates a bloom filter, and runs an action to populate it.
|
|
|
|
-
|
|
|
|
- The action is passed a callback that it can use to feed values into the
|
|
|
|
- bloom filter.
|
|
|
|
-
|
|
|
|
- Once the action completes, the mutable filter is frozen
|
|
|
|
- for later use.
|
|
|
|
-}
|
2020-07-02 00:12:19 +00:00
|
|
|
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
|
2015-06-16 22:37:41 +00:00
|
|
|
genBloomFilter populate = do
|
2015-06-16 21:58:15 +00:00
|
|
|
(numbits, numhashes) <- bloomBitsHashes
|
|
|
|
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
2015-06-16 22:38:12 +00:00
|
|
|
populate $ \v -> lift $ insertMB bloom v
|
2015-06-16 21:58:15 +00:00
|
|
|
lift $ unsafeFreezeMB bloom
|
|
|
|
where
|
|
|
|
lift = liftIO . stToIO
|
|
|
|
|
2016-01-27 13:43:23 +00:00
|
|
|
bloomFilter :: [v] -> Bloom v -> [v]
|
2015-06-16 22:37:41 +00:00
|
|
|
bloomFilter l bloom = filter (\v -> v `notElemB` bloom) l
|