Increased the default annex.bloomaccuracy from 1000 to 10000000

This makes git annex unused use around 48 mb more memory than it did before,
but the massive increase in accuracy makes this worthwhile for all but the
smallest systems.

Also, I want to use the bloom filter for sync --all --content, to avoid
dropping files that the preferred content doesn't want, and 1/1000
false positives would be far too many in that use case, even if it were
acceptable for unused.

Actual memory use numbers:

1000: 21.06user 3.42system 0:26.40elapsed 92%CPU (0avgtext+0avgdata 501552maxresident)k
1000000: 21.41user 3.55system 0:26.84elapsed 93%CPU (0avgtext+0avgdata 549496maxresident)k
10000000: 21.84user 3.52system 0:27.89elapsed 90%CPU (0avgtext+0avgdata 549920maxresident)k

Based on these numbers, 10 million seemed a better pick than 1 million.
This commit is contained in:
Joey Hess 2015-06-16 17:58:15 -04:00
parent f7350b7c33
commit 8b74aec3ea
6 changed files with 76 additions and 56 deletions

53
Annex/BloomFilter.hs Normal file
View file

@ -0,0 +1,53 @@
{- git-annex bloom filter
-
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.BloomFilter where
import Common.Annex
import qualified Annex
import Utility.Bloom
import Control.Monad.ST
{- A bloom filter capable of holding half a million keys with a
- false positive rate of 1 in 10000000 uses around 16 mb of memory,
- so will easily fit on even my lowest memory systems.
-}
bloomCapacity :: Annex Int
bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
bloomAccuracy :: Annex Int
bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig
bloomBitsHashes :: Annex (Int, Int)
bloomBitsHashes = do
capacity <- bloomCapacity
accuracy <- bloomAccuracy
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
Left e -> do
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-- precaulculated value for 500000 (1/10000000)
return (16777216,23)
Right v -> return v
{- Creates a bloom filter, and runs an action to populate it.
-
- The action is passed a callback that it can use to feed values into the
- bloom filter.
-
- Once the action completes, the mutable filter is frozen
- for later use.
-}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
(numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom (convert v)
lift $ unsafeFreezeMB bloom
where
lift = liftIO . stToIO
bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l

View file

@ -16,7 +16,6 @@ import Data.Tuple
import Data.Ord import Data.Ord
import Common.Annex import Common.Annex
import qualified Command.Unused
import qualified Git import qualified Git
import qualified Annex import qualified Annex
import qualified Remote import qualified Remote
@ -39,6 +38,8 @@ import Types.TrustLevel
import Types.FileMatcher import Types.FileMatcher
import qualified Limit import qualified Limit
import Messages.JSON (DualDisp(..)) import Messages.JSON (DualDisp(..))
import Annex.BloomFilter
import qualified Command.Unused
-- a named computation that produces a statistic -- a named computation that produces a statistic
type Stat = StatState (Maybe (String, StatState String)) type Stat = StatState (Maybe (String, StatState String))
@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k
bloom_info :: Stat bloom_info :: Stat
bloom_info = simpleStat "bloom filter size" $ do bloom_info = simpleStat "bloom filter size" $ do
localkeys <- countKeys <$> cachedPresentData localkeys <- countKeys <$> cachedPresentData
capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity capacity <- fromIntegral <$> lift bloomCapacity
let note = aside $ let note = aside $
if localkeys >= capacity if localkeys >= capacity
then "appears too small for this repository; adjust annex.bloomcapacity" then "appears too small for this repository; adjust annex.bloomcapacity"
else showPercentage 1 (percentage capacity localkeys) ++ " full" else showPercentage 1 (percentage capacity localkeys) ++ " full"
-- Two bloom filters are used at the same time, so double the size -- Two bloom filters are used at the same time when running
-- of one. -- git-annex unused, so double the size of one.
sizer <- lift mkSizer sizer <- lift mkSizer
size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$> size <- sizer memoryUnits False . (* 2) . fromIntegral . fst <$>
lift Command.Unused.bloomBitsHashes lift bloomBitsHashes
return $ size ++ note return $ size ++ note

View file

@ -45,6 +45,7 @@ import Annex.UUID
import Logs.UUID import Logs.UUID
import Annex.AutoMerge import Annex.AutoMerge
import Annex.Ssh import Annex.Ssh
import Utility.Bloom
import Control.Concurrent.MVar import Control.Concurrent.MVar
import qualified Data.Map as M import qualified Data.Map as M

View file

@ -9,7 +9,6 @@
module Command.Unused where module Command.Unused where
import Control.Monad.ST
import qualified Data.Map as M import qualified Data.Map as M
import Common.Annex import Common.Annex
@ -32,7 +31,7 @@ import Types.Key
import Types.RefSpec import Types.RefSpec
import Git.FilePath import Git.FilePath
import Logs.View (is_branchView) import Logs.View (is_branchView)
import Utility.Bloom import Annex.BloomFilter
cmd :: [Command] cmd :: [Command]
cmd = [withOptions [unusedFromOption, refSpecOption] $ cmd = [withOptions [unusedFromOption, refSpecOption] $
@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel
firstlevel = withKeysReferencedM firstlevel = withKeysReferencedM
secondlevel = withKeysReferencedInGit refspec secondlevel = withKeysReferencedInGit refspec
{- A bloom filter capable of holding half a million keys with a
- false positive rate of 1 in 1000 uses around 8 mb of memory,
- so will easily fit on even my lowest memory systems.
-}
bloomCapacity :: Annex Int
bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig
bloomAccuracy :: Annex Int
bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig
bloomBitsHashes :: Annex (Int, Int)
bloomBitsHashes = do
capacity <- bloomCapacity
accuracy <- bloomAccuracy
case safeSuggestSizing capacity (1 / fromIntegral accuracy) of
Left e -> do
warning $ "bloomfilter " ++ e ++ "; falling back to sane value"
-- precaulculated value for 500000 (1/1000)
return (8388608,10)
Right v -> return v
{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
- to populate it.
-
- The action is passed a callback that it can use to feed values into the
- bloom filter.
-
- Once the action completes, the mutable filter is frozen
- for later use.
-}
genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
genBloomFilter convert populate = do
(numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom (convert v)
lift $ unsafeFreezeMB bloom
where
lift = liftIO . stToIO
bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
{- Given an initial value, folds it with each key referenced by {- Given an initial value, folds it with each key referenced by
- symlinks in the git repo. -} - symlinks in the git repo. -}
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v withKeysReferenced :: v -> (Key -> v -> v) -> Annex v

4
debian/changelog vendored
View file

@ -44,6 +44,10 @@ git-annex (5.20150616) UNRELEASED; urgency=medium
* Fix incremental backup standard preferred content expression to match * Fix incremental backup standard preferred content expression to match
its documentation, which says it does not want files that have reached its documentation, which says it does not want files that have reached
a backup repository. a backup repository.
* Increased the default annex.bloomaccuracy from 1000 to 10000000.
This makes git annex unused use up to 16 mb more memory than it did
before, but the massive increase in accuracy makes this worthwhile
for all but the smallest systems.
-- Joey Hess <id@joeyh.name> Sat, 30 May 2015 02:07:18 -0400 -- Joey Hess <id@joeyh.name> Sat, 30 May 2015 02:07:18 -0400

View file

@ -830,20 +830,22 @@ Here are all the supported configuration settings.
* `annex.bloomcapacity` * `annex.bloomcapacity`
The `git annex unused` command uses a bloom filter to determine The `git annex unused` and `git annex sync --content` commands use
what data is no longer used. The default bloom filter is sized to handle a bloom filter to determine what files are present in eg, the work tree.
up to 500000 keys. If your repository is larger than that, The default bloom filter is sized to handle
you can adjust this to avoid `git annex unused` not noticing some unused up to 500000 files. If your repository is larger than that,
data files. Increasing this will make `git-annex unused` consume more memory; you should increase this value. Larger values will
make `git-annex unused` and `git annex sync --content` consume more memory;
run `git annex info` for memory usage numbers. run `git annex info` for memory usage numbers.
* `annex.bloomaccuracy` * `annex.bloomaccuracy`
Adjusts the accuracy of the bloom filter used by Adjusts the accuracy of the bloom filter used by
`git annex unused`. The default accuracy is 1000 -- `git annex unused` and `git annex sync --content`.
1 unused file out of 1000 will be missed by `git annex unused`. Increasing The default accuracy is 10000000 -- 1 unused file out of 10000000
the accuracy will make `git annex unused` consume more memory; will be missed by `git annex unused`. Increasing the accuracy will make
run `git annex info` for memory usage numbers. `git annex unused` consume more memory; run `git annex info`
for memory usage numbers.
* `annex.sshcaching` * `annex.sshcaching`