From 7ebd98d8d829005c7dae38b789146d98e6800e5b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 14 Feb 2012 14:35:52 -0400 Subject: [PATCH] fix memory leak when staging the journal The list of files had to be retained until the end so it could be deleted. Also, a list of update-index lines was generated and only then fed into it. Now everything streams in constant space. --- Annex/Branch.hs | 23 ++++++------ Git/HashObject.hs | 47 +++++++++++++++---------- debian/changelog | 3 +- doc/bugs/git_annex_add_memory_leak.mdwn | 21 +++++------ 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 72a98ac167..f20c87b4a3 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -1,6 +1,6 @@ {- management of the git-annex branch - - - Copyright 2011 Joey Hess + - Copyright 2011-2012 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -32,7 +32,7 @@ import qualified Git.Command import qualified Git.Ref import qualified Git.Branch import qualified Git.UnionMerge -import qualified Git.HashObject +import Git.HashObject import qualified Git.Index import Annex.CatFile @@ -307,13 +307,14 @@ stageJournal = do fs <- getJournalFiles g <- gitRepo withIndex $ liftIO $ do - let dir = gitAnnexJournalDir g - let paths = map (dir </>) fs - (shas, cleanup) <- Git.HashObject.hashFiles paths g - Git.UnionMerge.update_index g $ - index_lines shas (map fileJournal fs) - cleanup - mapM_ removeFile paths + h <- hashObjectStart g + Git.UnionMerge.stream_update_index g + [genstream (gitAnnexJournalDir g) h fs] + hashObjectStop h where - index_lines shas = map genline .
zip shas - genline (sha, file) = Git.UnionMerge.update_index_line sha file + genstream dir h fs streamer = forM_ fs $ \file -> do + let path = dir </> file + sha <- hashFile h path + streamer $ Git.UnionMerge.update_index_line + sha (fileJournal file) + removeFile path diff --git a/Git/HashObject.hs b/Git/HashObject.hs index ac74f02577..200fedbd27 100644 --- a/Git/HashObject.hs +++ b/Git/HashObject.hs @@ -1,6 +1,6 @@ {- git hash-object interface - - - Copyright 2011 Joey Hess + - Copyright 2011-2012 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -11,23 +11,32 @@ import Common import Git import Git.Command -{- Injects a set of files into git, returning the shas of the objects - - and an IO action to call once the the shas have been used. -} -hashFiles :: [FilePath] -> Repo -> IO ([Sha], IO ()) -hashFiles paths repo = do - (pid, fromh, toh) <- hPipeBoth "git" $ toCommand $ git_hash_object repo +type HashObjectHandle = (PipeHandle, Handle, Handle) + +{- Starts git hash-object and returns a handle. -} +hashObjectStart :: Repo -> IO HashObjectHandle +hashObjectStart repo = do + r@(_, _, toh) <- hPipeBoth "git" $ + toCommand $ gitCommandLine params repo fileEncoding toh - _ <- forkProcess (feeder toh) - hClose toh - shas <- map Ref . lines <$> hGetContents fromh - return (shas, ender fromh pid) + return r where - git_hash_object = gitCommandLine - [Param "hash-object", Param "-w", Param "--stdin-paths"] - feeder toh = do - hPutStr toh $ unlines paths - hClose toh - exitSuccess - ender fromh pid = do - hClose fromh - forceSuccess pid + params = + [ Param "hash-object" + , Param "-w" + , Param "--stdin-paths" + ] + +{- Stops git hash-object. -} +hashObjectStop :: HashObjectHandle -> IO () +hashObjectStop (pid, from, to) = do + hClose to + hClose from + forceSuccess pid + +{- Injects a file into git, returning the sha of the object.
-} +hashFile :: HashObjectHandle -> FilePath -> IO Sha +hashFile (_, from, to) file = do + hPutStrLn to file + hFlush to + Ref <$> hGetLine from diff --git a/debian/changelog b/debian/changelog index 23ade624c1..9317a52915 100644 --- a/debian/changelog +++ b/debian/changelog @@ -24,8 +24,7 @@ git-annex (3.20120124) UNRELEASED; urgency=low its head), and records the size in the key. * Fixed to use the strict state monad, to avoid leaking all kinds of memory due to lazy state update thunks when adding/fixing many files. - * Fixed a memory leak due to excessive strictness when committing journal - files. + * Fixed some memory leaks that occurred when committing journal files. * whereis: Prints the urls of files that the web special remote knows about. -- Joey Hess Tue, 24 Jan 2012 16:21:55 -0400 diff --git a/doc/bugs/git_annex_add_memory_leak.mdwn b/doc/bugs/git_annex_add_memory_leak.mdwn index b6ae60f7bd..891ba318f6 100644 --- a/doc/bugs/git_annex_add_memory_leak.mdwn +++ b/doc/bugs/git_annex_add_memory_leak.mdwn @@ -12,26 +12,27 @@ A history of the leaks: * Originally, `git annex add` remembered all the files it had added, and fed them to git at the end. Of course that made its memory use grow, so it was fixed to periodically - flush its buffer. Affected versions: before 0.20110417 + flush its buffer. Fixed in version 0.20110417. * Something called a "lazy state monad" caused "thunks" to build up and memory to leak. Also affected other git annex commands than `add`. Adding files using a SHA* backend hit the worst. Fixed in versions afer 3.20120123. -* A strange GHC bug seemed to be responsible for another leak. - (In particular, a child process was forked. All the child did - was read filenames from one pipe and shove them reformatted out - another pipe. For some reason, it steadily grew in size.) - Code was rewritten in a way that happens to avoid that leak. - Apparently fixed in versions afer 3.20120123, but this one is not - well understood. 
- * Committing journal files turned out to have another memory leak. After adding a lot of files ran out of memory, this left the journal - behind and could affect other git-anne commands. Fixed in versions afer + behind and could affect other git-annex commands. Fixed in versions after 3.20120123. +* Something is still causing a slow leak when adding files. + I tested by adding many copies of the whole Linux kernel + tree into the annex using the WORM backend, and once + it had added 1 million files, git-annex used ~100 MB of RAM. + That's 100 bytes leaked per file on average .. roughly the + size of a filename? It's worth noting that `git add` uses more memory + than that in such a large tree. + **not fixed yet** + * (Note that `git ls-files --others`, which is used to find files to add, also uses surpsisingly large amounts of memory when you have a lot of files. It buffers