added second stage bloom filter

2012-03-12 15:21:20 -04:00 · 2012-03-12 15:21:20 -04:00 · faf3a94fa7
commit faf3a94fa7
parent 32f9742a88
1 changed files with 69 additions and 45 deletions
--- a/Command/Unused.hs
+++ b/Command/Unused.hs
@ -69,9 +69,7 @@ checkUnused = chain 0
 			return []
 		findunused False = do
 			showAction "checking for unused data"
-			b <- genBloomFilter show withKeysReferenced'
+			excludeReferenced =<< getKeysPresent
 			bloomFilter b <$> getKeysPresent
 			-- TODO: check branches
 		chain _ [] = next $ return True
 		chain v (a:as) = do
 			v' <- a v
@ -145,28 +143,32 @@ dropMsg (Just r) = dropMsg' $ " --from " ++ Remote.name r
 dropMsg' :: String -> String
 dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\n"
-{- Finds keys in the list that are not referenced in the git repository. -}
+{- Finds keys in the list that are not referenced in the git repository.
 -
 - Strategy:
 -
 - * Build a bloom filter of all keys referenced by symlinks. This 
 -   is the fastest one to build and will filter out most keys.
 - * If keys remain, build a second bloom filter of keys referenced by
 -   all branches.
 - * The list is streamed through these bloom filters lazily, so both will
 -   exist at the same time. This means that twice the memory is used,
 -   but they're relatively small, so the added complexity of using a
 -   mutable bloom filter does not seem worthwhile.
 - * Generating the second bloom filter can take quite a while, since
 -   it needs enumerating all keys in all git branches. But, the common
 -   case, if the second filter is needed, is for some keys to be globally
 -   unused, and in that case, no short-circuit is possible.
 -   Short-circuiting if the first filter filters all the keys handles the
 -   other common case.
 -}
 excludeReferenced :: [Key] -> Annex [Key]
-excludeReferenced [] = return [] -- optimisation
+excludeReferenced ks = runfilter firstlevel ks >>= runfilter secondlevel
 excludeReferenced l = do
 	let s = S.fromList l
 	!s' <- withKeysReferenced s S.delete
 	go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"])
 	where
-		-- Skip the git-annex branches, and get all other unique refs.
+		runfilter _ [] = return [] -- optimisation
-		refs = map (Git.Ref .  snd) .
+		runfilter a l = bloomFilter show l <$> genBloomFilter show a
-			nubBy uniqref .
+		firstlevel = withKeysReferencedM
-			filter ourbranches .
+		secondlevel = withKeysReferencedInGit
 			map (separate (== ' ')) . lines
 		uniqref (a, _) (b, _) = a == b
 		ourbranchend = '/' : show Annex.Branch.name
 		ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
 		go s [] = return $ S.toList s
 		go s (r:rs)
 			| s == S.empty = return [] -- optimisation
 			| otherwise = do
 				s' <- withKeysReferencedInGit r s S.delete
 				go s' rs
 {- Finds items in the first, smaller list, that are not
 - present in the second, larger list.
@ -180,7 +182,7 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
 	where
 		remove a b = foldl (flip S.delete) b a
-{- Creates a bloom filter, and runs an action, such as withKeysReferenced',
+{- Creates a bloom filter, and runs an action, such as withKeysReferenced,
 - to populate it.
 -
 - The action is passed a callback that it can use to feed values into the
@ -189,29 +191,36 @@ exclude smaller larger = S.toList $ remove larger $ S.fromList smaller
 - Once the action completes, the mutable filter is frozen
 - for later use.
 -}
-genBloomFilter :: Hashable t => (v -> t) -> (() -> (v -> () -> Annex ()) -> Annex b) -> Annex (Bloom t)
+genBloomFilter :: Hashable t => (v -> t) -> ((v -> Annex ()) -> Annex b) -> Annex (Bloom t)
 genBloomFilter convert populate = do
-	-- A bloom filter capable of holding one million keys with a
+	-- A bloom filter capable of holding half a million keys with a
-	-- false positive rate of 0.1% uses 16 mb of memory.
+	-- false positive rate of 0.1% uses around 8 mb of memory.
 	-- TODO: make this configurable, for the really large repos,
 	-- or really low false positive rates.
-	let (numbits, numhashes) = suggestSizing 1000000 0.0001
+	let (numbits, numhashes) = suggestSizing 500000 0.001
 	bloom <- lift $ newMB (cheapHashes numhashes) numbits
-	_ <- populate () $ \v _ -> lift $ insertMB bloom (convert v)
+	_ <- populate $ \v -> lift $ insertMB bloom (convert v)
 	lift $ unsafeFreezeMB bloom
 	where
 		lift = liftIO . stToIO
-bloomFilter :: Bloom String -> [Key] -> [Key]
+bloomFilter :: Hashable t => (v -> t) -> [v] -> Bloom t -> [v]
-bloomFilter b l = filter (\k -> show k `notElemB` b) l
+bloomFilter convert l bloom = filter (\k -> convert k `notElemB` bloom) l
-{- Given an initial value, mutates it using an action for each
+{- Given an initial value, folds it with each key referenced by
- - key referenced by symlinks in the git repo. -}
+ - symlinks in the git repo. -}
 withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
-withKeysReferenced initial a = withKeysReferenced' initial reta
+withKeysReferenced initial a = withKeysReferenced' initial folda
 	where
-		reta k v = return $ a k v
+		folda k v = return $ a k v
 {- Runs an action on each referenced key in the git repo. -}
 withKeysReferencedM :: (Key -> Annex ()) -> Annex ()
 withKeysReferencedM a = withKeysReferenced' () calla
 	where
 		calla k _ = a k
 withKeysReferenced' :: v -> (Key -> v -> Annex v) -> Annex v
 withKeysReferenced' initial a = go initial =<< files
 	where
@ -227,21 +236,36 @@ withKeysReferenced' initial a = go initial =<< files
 					!v' <- a k v
 					go v' fs
-withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v
+
-withKeysReferencedInGit ref initial a = do
+withKeysReferencedInGit :: (Key -> Annex ()) -> Annex ()
-	showAction $ "checking " ++ Git.Ref.describe ref
+withKeysReferencedInGit a = do
-	go initial =<< inRepo (LsTree.lsTree ref)
+	rs <- relevantrefs <$> showref
 	forM_ rs (withKeysReferencedInGitRef a)
 	where
-		go v [] = return v
+		showref = inRepo $ Git.Command.pipeRead [Param "show-ref"]
-		go v (l:ls)
+		relevantrefs = map (Git.Ref .  snd) .
 			nubBy uniqref .
 			filter ourbranches .
 			map (separate (== ' ')) . lines
 		uniqref (x, _) (y, _) = x == y
 		ourbranchend = '/' : show Annex.Branch.name
 		ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
 withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
 withKeysReferencedInGitRef a ref = do
 	showAction $ "checking " ++ Git.Ref.describe ref
 	go =<< inRepo (LsTree.lsTree ref)
 	where
 		go [] = return ()
 		go (l:ls)
 			| isSymLink (LsTree.mode l) = do
 				content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
 				case fileKey (takeFileName $ L.unpack content) of
-					Nothing -> go v ls
+					Nothing -> go ls
 					Just k -> do
-						let !v' = a k v
+						a k
-						go v' ls
+						go ls
-			| otherwise = go v ls
+			| otherwise = go ls
 {- Looks in the specified directory for bad/tmp keys, and returns a list
 - of those that might still have value, or might be stale and removable.