diff --git a/Annex/BloomFilter.hs b/Annex/BloomFilter.hs new file mode 100644 index 0000000000..3dcd8140bd --- /dev/null +++ b/Annex/BloomFilter.hs @@ -0,0 +1,53 @@ +{- git-annex bloom filter + - + - Copyright 2010-2015 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Annex.BloomFilter where + +import Common.Annex +import qualified Annex +import Utility.Bloom + +import Control.Monad.ST + +{- A bloom filter capable of holding half a million keys with a + - false positive rate of 1 in 10000000 uses around 16 mb of memory, + - so will easily fit on even my lowest memory systems. + -} +bloomCapacity :: Annex Int +bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig +bloomAccuracy :: Annex Int +bloomAccuracy = fromMaybe 10000000 . annexBloomAccuracy <$> Annex.getGitConfig +bloomBitsHashes :: Annex (Int, Int) +bloomBitsHashes = do + capacity <- bloomCapacity + accuracy <- bloomAccuracy + case safeSuggestSizing capacity (1 / fromIntegral accuracy) of + Left e -> do + warning $ "bloomfilter " ++ e ++ "; falling back to sane value" + -- precalculated value for 500000 (1/10000000) + return (16777216,23) + Right v -> return v + +{- Creates a bloom filter, and runs an action to populate it. + - + - The action is passed a callback that it can use to feed values into the + - bloom filter. + - + - Once the action completes, the mutable filter is frozen + - for later use. + -} +genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) +genBloomFilter convert populate = do + (numbits, numhashes) <- bloomBitsHashes + bloom <- lift $ newMB (cheapHashes numhashes) numbits + _ <- populate $ \v -> lift $ insertMB bloom (convert v) + lift $ unsafeFreezeMB bloom + where + lift = liftIO . 
stToIO + +bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v] +bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l diff --git a/Command/Info.hs b/Command/Info.hs index f5fa9c6bf7..e6e0194ce8 100644 --- a/Command/Info.hs +++ b/Command/Info.hs @@ -16,7 +16,6 @@ import Data.Tuple import Data.Ord import Common.Annex -import qualified Command.Unused import qualified Git import qualified Annex import qualified Remote @@ -39,6 +38,8 @@ import Types.TrustLevel import Types.FileMatcher import qualified Limit import Messages.JSON (DualDisp(..)) +import Annex.BloomFilter +import qualified Command.Unused -- a named computation that produces a statistic type Stat = StatState (Maybe (String, StatState String)) @@ -330,17 +331,17 @@ key_name k = simpleStat "key" $ pure $ key2file k bloom_info :: Stat bloom_info = simpleStat "bloom filter size" $ do localkeys <- countKeys <$> cachedPresentData - capacity <- fromIntegral <$> lift Command.Unused.bloomCapacity + capacity <- fromIntegral <$> lift bloomCapacity let note = aside $ if localkeys >= capacity then "appears too small for this repository; adjust annex.bloomcapacity" else showPercentage 1 (percentage capacity localkeys) ++ " full" - -- Two bloom filters are used at the same time, so double the size - -- of one. + -- Two bloom filters are used at the same time when running + -- git-annex unused, so double the size of one. sizer <- lift mkSizer size <- sizer memoryUnits False . (* 2) . fromIntegral . 
fst <$> - lift Command.Unused.bloomBitsHashes + lift bloomBitsHashes return $ size ++ note diff --git a/Command/Sync.hs b/Command/Sync.hs index 88449384d1..80ecce43ee 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -45,6 +45,7 @@ import Annex.UUID import Logs.UUID import Annex.AutoMerge import Annex.Ssh +import Utility.Bloom import Control.Concurrent.MVar import qualified Data.Map as M diff --git a/Command/Unused.hs b/Command/Unused.hs index 4f844081ad..82a6052900 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -9,7 +9,6 @@ module Command.Unused where -import Control.Monad.ST import qualified Data.Map as M import Common.Annex @@ -32,7 +31,7 @@ import Types.Key import Types.RefSpec import Git.FilePath import Logs.View (is_branchView) -import Utility.Bloom +import Annex.BloomFilter cmd :: [Command] cmd = [withOptions [unusedFromOption, refSpecOption] $ @@ -172,46 +171,6 @@ excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel firstlevel = withKeysReferencedM secondlevel = withKeysReferencedInGit refspec -{- A bloom filter capable of holding half a million keys with a - - false positive rate of 1 in 1000 uses around 8 mb of memory, - - so will easily fit on even my lowest memory systems. - -} -bloomCapacity :: Annex Int -bloomCapacity = fromMaybe 500000 . annexBloomCapacity <$> Annex.getGitConfig -bloomAccuracy :: Annex Int -bloomAccuracy = fromMaybe 1000 . annexBloomAccuracy <$> Annex.getGitConfig -bloomBitsHashes :: Annex (Int, Int) -bloomBitsHashes = do - capacity <- bloomCapacity - accuracy <- bloomAccuracy - case safeSuggestSizing capacity (1 / fromIntegral accuracy) of - Left e -> do - warning $ "bloomfilter " ++ e ++ "; falling back to sane value" - -- precaulculated value for 500000 (1/1000) - return (8388608,10) - Right v -> return v - -{- Creates a bloom filter, and runs an action, such as withKeysReferenced, - - to populate it. 
- - - - The action is passed a callback that it can use to feed values into the - - bloom filter. - - - - Once the action completes, the mutable filter is frozen - - for later use. - -} -genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t) -genBloomFilter convert populate = do - (numbits, numhashes) <- bloomBitsHashes - bloom <- lift $ newMB (cheapHashes numhashes) numbits - _ <- populate $ \v -> lift $ insertMB bloom (convert v) - lift $ unsafeFreezeMB bloom - where - lift = liftIO . stToIO - -bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v] -bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l - {- Given an initial value, folds it with each key referenced by - symlinks in the git repo. -} withKeysReferenced :: v -> (Key -> v -> v) -> Annex v diff --git a/debian/changelog b/debian/changelog index c7b4f34e6b..0c04ef8e01 100644 --- a/debian/changelog +++ b/debian/changelog @@ -44,6 +44,10 @@ git-annex (5.20150616) UNRELEASED; urgency=medium * Fix incremental backup standard preferred content expression to match its documentation, which says it does not want files that have reached a backup repository. + * Increased the default annex.bloomaccuracy from 1000 to 10000000. + This makes git annex unused use up to 16 mb more memory than it did + before, but the massive increase in accuracy makes this worthwhile + for all but the smallest systems. -- Joey Hess Sat, 30 May 2015 02:07:18 -0400 diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index e7c80f3cd0..c90ef5ec2f 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -830,20 +830,22 @@ Here are all the supported configuration settings. * `annex.bloomcapacity` - The `git annex unused` command uses a bloom filter to determine - what data is no longer used. The default bloom filter is sized to handle - up to 500000 keys. 
If your repository is larger than that, - you can adjust this to avoid `git annex unused` not noticing some unused - data files. Increasing this will make `git-annex unused` consume more memory; + The `git annex unused` and `git annex sync --content` commands use + a bloom filter to determine what files are present in, e.g., the work tree. + The default bloom filter is sized to handle + up to 500000 files. If your repository is larger than that, + you should increase this value. Larger values will + make `git annex unused` and `git annex sync --content` consume more memory; run `git annex info` for memory usage numbers. * `annex.bloomaccuracy` Adjusts the accuracy of the bloom filter used by - `git annex unused`. The default accuracy is 1000 -- - 1 unused file out of 1000 will be missed by `git annex unused`. Increasing - the accuracy will make `git annex unused` consume more memory; - run `git annex info` for memory usage numbers. + `git annex unused` and `git annex sync --content`. + The default accuracy is 10000000 -- 1 unused file out of 10000000 + will be missed by `git annex unused`. Increasing the accuracy will make + `git annex unused` and `git annex sync --content` consume more memory; run `git annex info` + for memory usage numbers. * `annex.sshcaching`