From 0e0d396b2786b8e3c583d85fab44519803b3b8c0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 10 Mar 2014 15:12:54 -0400 Subject: [PATCH] Improve memory usage when git fsck finds a great many broken objects. From 1.7 gb to 900 mb on 300 thousand unique reported shas. When shas are not unique, this streams much better than before, so won't buffer the full list before putting them into the Set and throwing away dups. And when fsck output includes ignorable lines, especially dangling object lines, they won't be buffered in memory at all. --- Git/Fsck.hs | 20 +++++++++++++++++--- debian/changelog | 2 ++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Git/Fsck.hs b/Git/Fsck.hs index e90683bc0e..b3948cb1da 100644 --- a/Git/Fsck.hs +++ b/Git/Fsck.hs @@ -23,6 +23,7 @@ import Utility.Batch import qualified Git.Version import qualified Data.Set as S +import System.Process (std_out, std_err) type MissingObjects = S.Set Sha @@ -46,9 +47,17 @@ findBroken batchmode r = do (command', params') <- if batchmode then toBatchCommand (command, params) else return (command, params) - (output, fsckok) <- processTranscript command' (toCommand params') Nothing - let objs = findShas supportsNoDangling output - badobjs <- findMissing objs r + + p@(_, _, _, pid) <- createProcess $ + (proc command' (toCommand params')) + { std_out = CreatePipe + , std_err = CreatePipe + } + bad1 <- readMissingObjs r supportsNoDangling (stdoutHandle p) + bad2 <- readMissingObjs r supportsNoDangling (stderrHandle p) + fsckok <- checkSuccessProcess pid + let badobjs = S.union bad1 bad2 + if S.null badobjs && not fsckok then return FsckFailed else return $ FsckFoundMissing badobjs @@ -69,6 +78,11 @@ knownMissing (FsckFoundMissing s) = s findMissing :: [Sha] -> Repo -> IO MissingObjects findMissing objs r = S.fromList <$> filterM (`isMissing` r) objs +readMissingObjs :: Repo -> Bool -> Handle -> IO MissingObjects +readMissingObjs r supportsNoDangling h = do + objs <- findShas supportsNoDangling <$> hGetContents h + findMissing objs r + isMissing :: Sha -> Repo -> IO Bool isMissing s r = either (const True) (const False) <$> tryIO dump where diff --git a/debian/changelog b/debian/changelog index 26153c5fdc..7ff502ad35 100644 --- a/debian/changelog +++ b/debian/changelog @@ -5,6 +5,8 @@ git-annex (5.20140307) UNRELEASED; urgency=medium * webapp: Added a "Sync now" item to each repository's menu. * unused: In direct mode, files that are deleted from the work tree are no longer incorrectly detected as unused. + * repair: Improve memory usage when git fsck finds a great many broken + objects. -- Joey Hess Thu, 06 Mar 2014 16:17:01 -0400