unused: Reduce memory usage significantly.
Much of the memory bloat turned out to be due to getKeysReferenced containing a mapM, which is strict and buffered the whole list rather than streaming it. The other half of the bloat was due to building a temporary Set in order to call S.difference. While that is more CPU-efficient, I switched to successive S.delete calls, since that lets a whole git annex unused run in less than 8 MB of memory. The whole Set of keys with content available is still stored in memory, so running unused in a repo with a whole lot of file content will still use more memory. In a repo containing 6000 files, it needed 40 MB. Note that the status command still uses the bloatful getKeysReferenced.
This commit is contained in:
parent
a13949bf37
commit
b086e32c63
3 changed files with 49 additions and 22 deletions
|
@ -155,9 +155,9 @@ unusedKeys = do
|
||||||
excludeReferenced :: [Key] -> Annex [Key]
|
excludeReferenced :: [Key] -> Annex [Key]
|
||||||
excludeReferenced [] = return [] -- optimisation
|
excludeReferenced [] = return [] -- optimisation
|
||||||
excludeReferenced l = do
|
excludeReferenced l = do
|
||||||
c <- inRepo $ Git.Command.pipeRead [Param "show-ref"]
|
let s = S.fromList l
|
||||||
removewith (getKeysReferenced : map getKeysReferencedInGit (refs c))
|
!s' <- withKeysReferenced s S.delete
|
||||||
(S.fromList l)
|
go s' =<< refs <$> (inRepo $ Git.Command.pipeRead [Param "show-ref"])
|
||||||
where
|
where
|
||||||
-- Skip the git-annex branches, and get all other unique refs.
|
-- Skip the git-annex branches, and get all other unique refs.
|
||||||
refs = map (Git.Ref . snd) .
|
refs = map (Git.Ref . snd) .
|
||||||
|
@ -167,13 +167,12 @@ excludeReferenced l = do
|
||||||
uniqref (a, _) (b, _) = a == b
|
uniqref (a, _) (b, _) = a == b
|
||||||
ourbranchend = '/' : show Annex.Branch.name
|
ourbranchend = '/' : show Annex.Branch.name
|
||||||
ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
|
ourbranches (_, b) = not $ ourbranchend `isSuffixOf` b
|
||||||
removewith [] s = return $ S.toList s
|
go s [] = return $ S.toList s
|
||||||
removewith (a:as) s
|
go s (r:rs)
|
||||||
| s == S.empty = return [] -- optimisation
|
| s == S.empty = return [] -- optimisation
|
||||||
| otherwise = do
|
| otherwise = do
|
||||||
referenced <- a
|
!s' <- withKeysReferencedInGit r s S.delete
|
||||||
let !s' = s `S.difference` S.fromList referenced
|
go s' rs
|
||||||
removewith as s'
|
|
||||||
|
|
||||||
{- Finds items in the first, smaller list, that are not
|
{- Finds items in the first, smaller list, that are not
|
||||||
- present in the second, larger list.
|
- present in the second, larger list.
|
||||||
|
@ -195,20 +194,37 @@ getKeysReferenced = do
|
||||||
keypairs <- mapM Backend.lookupFile files
|
keypairs <- mapM Backend.lookupFile files
|
||||||
return $ map fst $ catMaybes keypairs
|
return $ map fst $ catMaybes keypairs
|
||||||
|
|
||||||
{- List of keys referenced by symlinks in a git ref. -}
|
{- Given an initial value, folds a pure update function over each
|
||||||
getKeysReferencedInGit :: Git.Ref -> Annex [Key]
|
- key referenced by symlinks in the git repo. -}
|
||||||
getKeysReferencedInGit ref = do
|
withKeysReferenced :: v -> (Key -> v -> v) -> Annex v
|
||||||
showAction $ "checking " ++ Git.Ref.describe ref
|
withKeysReferenced initial a = do
|
||||||
findkeys [] =<< inRepo (LsTree.lsTree ref)
|
top <- fromRepo Git.workTree
|
||||||
|
go initial =<< inRepo (LsFiles.inRepo [top])
|
||||||
where
|
where
|
||||||
findkeys c [] = return c
|
go v [] = return v
|
||||||
findkeys c (l:ls)
|
go v (f:fs) = do
|
||||||
|
x <- Backend.lookupFile f
|
||||||
|
case x of
|
||||||
|
Nothing -> go v fs
|
||||||
|
Just (k, _) -> do
|
||||||
|
let !v' = a k v
|
||||||
|
go v' fs
|
||||||
|
|
||||||
|
withKeysReferencedInGit :: Git.Ref -> v -> (Key -> v -> v) -> Annex v
|
||||||
|
withKeysReferencedInGit ref initial a = do
|
||||||
|
showAction $ "checking " ++ Git.Ref.describe ref
|
||||||
|
go initial =<< inRepo (LsTree.lsTree ref)
|
||||||
|
where
|
||||||
|
go v [] = return v
|
||||||
|
go v (l:ls)
|
||||||
| isSymLink (LsTree.mode l) = do
|
| isSymLink (LsTree.mode l) = do
|
||||||
content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
|
content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
|
||||||
case fileKey (takeFileName $ L.unpack content) of
|
case fileKey (takeFileName $ L.unpack content) of
|
||||||
Nothing -> findkeys c ls
|
Nothing -> go v ls
|
||||||
Just k -> findkeys (k:c) ls
|
Just k -> do
|
||||||
| otherwise = findkeys c ls
|
let !v' = a k v
|
||||||
|
go v' ls
|
||||||
|
| otherwise = go v ls
|
||||||
|
|
||||||
{- Looks in the specified directory for bad/tmp keys, and returns a list
|
{- Looks in the specified directory for bad/tmp keys, and returns a list
|
||||||
- of those that might still have value, or might be stale and removable.
|
- of those that might still have value, or might be stale and removable.
|
||||||
|
|
3
debian/changelog
vendored
3
debian/changelog
vendored
|
@ -3,6 +3,9 @@ git-annex (3.20120310) UNRELEASED; urgency=low
|
||||||
* fsck: Fix up any broken links and misplaced content caused by the
|
* fsck: Fix up any broken links and misplaced content caused by the
|
||||||
directory hash calculation bug fixed in the last release.
|
directory hash calculation bug fixed in the last release.
|
||||||
* sync: Sync to lower cost remotes first.
|
* sync: Sync to lower cost remotes first.
|
||||||
|
* unused: Reduce memory usage significantly. Still not constant
|
||||||
|
space, but now only needs to store the set of keys whose contents
|
||||||
|
are present in the annex in memory.
|
||||||
|
|
||||||
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400
|
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,20 @@
|
||||||
`git-annex unused` has to compare large sets of data
|
`git-annex unused` has to compare large sets of data
|
||||||
(all keys with content present in the repository,
|
(all keys with content present in the repository,
|
||||||
with all keys used by files in the repository), and so
|
with all keys used by files in the repository), and so
|
||||||
uses more memory than git-annex typically needs; around
|
uses more memory than git-annex typically needs.
|
||||||
50 mb when run in a repository with 80 thousand files.
|
|
||||||
|
|
||||||
(Used to be 80 mb, but implementation improved.)
|
It used to be a lot worse (hundreds of megabytes).
|
||||||
|
|
||||||
I would like to reduce this. One idea is to use a bloom filter.
|
Now it only needs enough memory to store a Set of all Keys that currently
|
||||||
|
have content in the annex. On a lightly populated repository, it runs in
|
||||||
|
quite low memory use (around 8 MB) even if the git repo has 100 thousand
|
||||||
|
files. On a repository with lots of file contents, it will use more.
|
||||||
|
|
||||||
|
Still, I would like to reduce this to purely constant memory use,
|
||||||
|
as running in constant memory no matter the repo size is a git-annex design
|
||||||
|
goal.
|
||||||
|
|
||||||
|
One idea is to use a bloom filter.
|
||||||
For example, construct a bloom filter of all keys used by files in
|
For example, construct a bloom filter of all keys used by files in
|
||||||
the repository. Then for each key with content present, check if it's
|
the repository. Then for each key with content present, check if it's
|
||||||
in the bloom filter. Since there can be false positives, this might
|
in the bloom filter. Since there can be false positives, this might
|
||||||
|
|
Loading…
Add table
Reference in a new issue