Improve memory usage when git fsck finds a great many broken objects.

From 1.7 gb to 900 mb on 300 thousand unique reported shas.

When shas are not unique, this streams much better than before, so won't
buffer the full list before putting them into the Set and throwing away
dups. And when fsck output includes ignorable lines, especially
dangling object lines, they won't be buffered in memory at all.
This commit is contained in:
Joey Hess 2014-03-10 15:12:54 -04:00
parent aab40f02ca
commit 0e0d396b27
2 changed files with 19 additions and 3 deletions

View file

@ -23,6 +23,7 @@ import Utility.Batch
import qualified Git.Version
import qualified Data.Set as S
import System.Process (std_out, std_err)
type MissingObjects = S.Set Sha
@ -46,9 +47,17 @@ findBroken batchmode r = do
(command', params') <- if batchmode
then toBatchCommand (command, params)
else return (command, params)
(output, fsckok) <- processTranscript command' (toCommand params') Nothing
let objs = findShas supportsNoDangling output
badobjs <- findMissing objs r
p@(_, _, _, pid) <- createProcess $
(proc command' (toCommand params'))
{ std_out = CreatePipe
, std_err = CreatePipe
}
bad1 <- readMissingObjs r supportsNoDangling (stdoutHandle p)
bad2 <- readMissingObjs r supportsNoDangling (stderrHandle p)
fsckok <- checkSuccessProcess pid
let badobjs = S.union bad1 bad2
if S.null badobjs && not fsckok
then return FsckFailed
else return $ FsckFoundMissing badobjs
@ -69,6 +78,11 @@ knownMissing (FsckFoundMissing s) = s
findMissing :: [Sha] -> Repo -> IO MissingObjects
findMissing objs r = S.fromList <$> filterM (`isMissing` r) objs
readMissingObjs :: Repo -> Bool -> Handle -> IO MissingObjects
readMissingObjs r supportsNoDangling h = do
objs <- findShas supportsNoDangling <$> hGetContents h
findMissing objs r
isMissing :: Sha -> Repo -> IO Bool
isMissing s r = either (const True) (const False) <$> tryIO dump
where

2
debian/changelog vendored
View file

@ -5,6 +5,8 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
* webapp: Added a "Sync now" item to each repository's menu.
* unused: In direct mode, files that are deleted from the work tree
are no longer incorrectly detected as unused.
* repair: Improve memory usage when git fsck finds a great many broken
objects.
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400