Increased the default annex.bloomaccuracy from 1000 to 10000000

This makes git annex unused use around 48 mb more memory than it did before,
but the massive increase in accuracy makes this worthwhile for all but the
smallest systems.

Also, I want to use the bloom filter for sync --all --content, to avoid
dropping files that the preferred content doesn't want, and 1/1000
false positives would be far too many in that use case, even if it were
acceptable for unused.

Actual memory use numbers:

1000: 21.06user 3.42system 0:26.40elapsed 92%CPU (0avgtext+0avgdata 501552maxresident)k
1000000: 21.41user 3.55system 0:26.84elapsed 93%CPU (0avgtext+0avgdata 549496maxresident)k
10000000: 21.84user 3.52system 0:27.89elapsed 90%CPU (0avgtext+0avgdata 549920maxresident)k

Based on these numbers, 10 million seemed a better pick than 1 million.
This commit is contained in:
Joey Hess 2015-06-16 17:58:15 -04:00
parent f7350b7c33
commit 8b74aec3ea
6 changed files with 76 additions and 56 deletions

53
Annex/BloomFilter.hs Normal file
View file

@ -0,0 +1,53 @@
{- git-annex bloom filter
-
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.BloomFilter where
import Common.Annex
import qualified Annex
import Utility.Bloom
import Control.Monad.ST
{- A bloom filter capable of holding half a million keys with a
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
- so will easily fit on even my lowest memory systems.
-}
bloomCapacity :: Annex Int
bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
bloomAccuracy :: Annex Int
bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
bloomBitsHashes :: Annex (Int, Int)
bloomBitsHashes = do
capacity <- bloomCapacity
accuracy <- bloomAccuracy
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
Left e -> do
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-- precaulculated value for 500000 (1/10000000)
return (16777216,23)
Right v -> return v
{- Creates a bloom filter, and runs an action to populate it.
-
- The action is passed a callback that it can use to feed values into the
- bloom filter.
-
- Once the action completes, the mutable filter is frozen
- for later use.
-}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
(numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom (convert v)
lift $ unsafeFreezeMB bloom
where
lift = liftIO . stToIO
bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l