fix memory leak when staging the journal
The list of files had to be retained until the end so the files could be deleted. Also, the whole list of update-index lines was generated first and only then fed to update-index. Now everything streams in constant space.
This commit is contained in:
parent
cdd6cdbb67
commit
7ebd98d8d8
4 changed files with 52 additions and 42 deletions
|
@ -1,6 +1,6 @@
|
||||||
{- management of the git-annex branch
|
{- management of the git-annex branch
|
||||||
-
|
-
|
||||||
- Copyright 2011 Joey Hess <joey@kitenet.net>
|
- Copyright 2011-2012 Joey Hess <joey@kitenet.net>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU GPL version 3 or higher.
|
- Licensed under the GNU GPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -32,7 +32,7 @@ import qualified Git.Command
|
||||||
import qualified Git.Ref
|
import qualified Git.Ref
|
||||||
import qualified Git.Branch
|
import qualified Git.Branch
|
||||||
import qualified Git.UnionMerge
|
import qualified Git.UnionMerge
|
||||||
import qualified Git.HashObject
|
import Git.HashObject
|
||||||
import qualified Git.Index
|
import qualified Git.Index
|
||||||
import Annex.CatFile
|
import Annex.CatFile
|
||||||
|
|
||||||
|
@ -307,13 +307,14 @@ stageJournal = do
|
||||||
fs <- getJournalFiles
|
fs <- getJournalFiles
|
||||||
g <- gitRepo
|
g <- gitRepo
|
||||||
withIndex $ liftIO $ do
|
withIndex $ liftIO $ do
|
||||||
let dir = gitAnnexJournalDir g
|
h <- hashObjectStart g
|
||||||
let paths = map (dir </>) fs
|
Git.UnionMerge.stream_update_index g
|
||||||
(shas, cleanup) <- Git.HashObject.hashFiles paths g
|
[genstream (gitAnnexJournalDir g) h fs]
|
||||||
Git.UnionMerge.update_index g $
|
hashObjectStop h
|
||||||
index_lines shas (map fileJournal fs)
|
|
||||||
cleanup
|
|
||||||
mapM_ removeFile paths
|
|
||||||
where
|
where
|
||||||
index_lines shas = map genline . zip shas
|
genstream dir h fs streamer = forM_ fs $ \file -> do
|
||||||
genline (sha, file) = Git.UnionMerge.update_index_line sha file
|
let path = dir </> file
|
||||||
|
sha <- hashFile h path
|
||||||
|
streamer $ Git.UnionMerge.update_index_line
|
||||||
|
sha (fileJournal file)
|
||||||
|
removeFile path
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{- git hash-object interface
|
{- git hash-object interface
|
||||||
-
|
-
|
||||||
- Copyright 2011 Joey Hess <joey@kitenet.net>
|
- Copyright 2011-2012 Joey Hess <joey@kitenet.net>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU GPL version 3 or higher.
|
- Licensed under the GNU GPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -11,23 +11,32 @@ import Common
|
||||||
import Git
|
import Git
|
||||||
import Git.Command
|
import Git.Command
|
||||||
|
|
||||||
{- Injects a set of files into git, returning the shas of the objects
|
type HashObjectHandle = (PipeHandle, Handle, Handle)
|
||||||
- and an IO action to call once the shas have been used. -}
|
|
||||||
hashFiles :: [FilePath] -> Repo -> IO ([Sha], IO ())
|
{- Starts git hash-object and returns a handle. -}
|
||||||
hashFiles paths repo = do
|
hashObjectStart :: Repo -> IO HashObjectHandle
|
||||||
(pid, fromh, toh) <- hPipeBoth "git" $ toCommand $ git_hash_object repo
|
hashObjectStart repo = do
|
||||||
|
r@(_, _, toh) <- hPipeBoth "git" $
|
||||||
|
toCommand $ gitCommandLine params repo
|
||||||
fileEncoding toh
|
fileEncoding toh
|
||||||
_ <- forkProcess (feeder toh)
|
return r
|
||||||
hClose toh
|
|
||||||
shas <- map Ref . lines <$> hGetContents fromh
|
|
||||||
return (shas, ender fromh pid)
|
|
||||||
where
|
where
|
||||||
git_hash_object = gitCommandLine
|
params =
|
||||||
[Param "hash-object", Param "-w", Param "--stdin-paths"]
|
[ Param "hash-object"
|
||||||
feeder toh = do
|
, Param "-w"
|
||||||
hPutStr toh $ unlines paths
|
, Param "--stdin-paths"
|
||||||
hClose toh
|
]
|
||||||
exitSuccess
|
|
||||||
ender fromh pid = do
|
{- Stops git hash-object. -}
|
||||||
hClose fromh
|
hashObjectStop :: HashObjectHandle -> IO ()
|
||||||
forceSuccess pid
|
hashObjectStop (pid, from, to) = do
|
||||||
|
hClose to
|
||||||
|
hClose from
|
||||||
|
forceSuccess pid
|
||||||
|
|
||||||
|
{- Injects a file into git, returning the sha of the object. -}
|
||||||
|
hashFile :: HashObjectHandle -> FilePath -> IO Sha
|
||||||
|
hashFile (_, from, to) file = do
|
||||||
|
hPutStrLn to file
|
||||||
|
hFlush to
|
||||||
|
Ref <$> hGetLine from
|
||||||
|
|
3
debian/changelog
vendored
3
debian/changelog
vendored
|
@ -24,8 +24,7 @@ git-annex (3.20120124) UNRELEASED; urgency=low
|
||||||
its head), and records the size in the key.
|
its head), and records the size in the key.
|
||||||
* Fixed to use the strict state monad, to avoid leaking all kinds of memory
|
* Fixed to use the strict state monad, to avoid leaking all kinds of memory
|
||||||
due to lazy state update thunks when adding/fixing many files.
|
due to lazy state update thunks when adding/fixing many files.
|
||||||
* Fixed a memory leak due to excessive strictness when committing journal
|
* Fixed some memory leaks that occurred when committing journal files.
|
||||||
files.
|
|
||||||
* whereis: Prints the urls of files that the web special remote knows about.
|
* whereis: Prints the urls of files that the web special remote knows about.
|
||||||
|
|
||||||
-- Joey Hess <joeyh@debian.org> Tue, 24 Jan 2012 16:21:55 -0400
|
-- Joey Hess <joeyh@debian.org> Tue, 24 Jan 2012 16:21:55 -0400
|
||||||
|
|
|
@ -12,26 +12,27 @@ A history of the leaks:
|
||||||
* Originally, `git annex add` remembered all the files
|
* Originally, `git annex add` remembered all the files
|
||||||
it had added, and fed them to git at the end. Of course
|
it had added, and fed them to git at the end. Of course
|
||||||
that made its memory use grow, so it was fixed to periodically
|
that made its memory use grow, so it was fixed to periodically
|
||||||
flush its buffer. Affected versions: before 0.20110417
|
flush its buffer. Fixed in version 0.20110417.
|
||||||
|
|
||||||
* Something called a "lazy state monad" caused "thunks" to build
|
* Something called a "lazy state monad" caused "thunks" to build
|
||||||
up and memory to leak. Also affected other git annex commands
|
up and memory to leak. Also affected other git annex commands
|
||||||
than `add`. Adding files using a SHA* backend hit the worst.
|
than `add`. Adding files using a SHA* backend hit the worst.
|
||||||
Fixed in versions after 3.20120123.
|
Fixed in versions after 3.20120123.
|
||||||
|
|
||||||
* A strange GHC bug seemed to be responsible for another leak.
|
|
||||||
(In particular, a child process was forked. All the child did
|
|
||||||
was read filenames from one pipe and shove them reformatted out
|
|
||||||
another pipe. For some reason, it steadily grew in size.)
|
|
||||||
Code was rewritten in a way that happens to avoid that leak.
|
|
||||||
Apparently fixed in versions after 3.20120123, but this one is not
|
|
||||||
well understood.
|
|
||||||
|
|
||||||
* Committing journal files turned out to have another memory leak.
|
* Committing journal files turned out to have another memory leak.
|
||||||
After adding a lot of files ran out of memory, this left the journal
|
After adding a lot of files ran out of memory, this left the journal
|
||||||
behind and could affect other git-anne commands. Fixed in versions after
|
behind and could affect other git-annex commands. Fixed in versions after
|
||||||
3.20120123.
|
3.20120123.
|
||||||
|
|
||||||
|
* Something is still causing a slow leak when adding files.
|
||||||
|
I tested by adding many copies of the whole linux kernel
|
||||||
|
tree into the annex using the WORM backend, and once
|
||||||
|
it had added 1 million files, git-annex used ~100 mb of ram.
|
||||||
|
That's 100 bytes leaked per file on average .. roughly the
|
||||||
|
size of a filename? It's worth noting that `git add` uses more memory
|
||||||
|
than that in such a large tree.
|
||||||
|
**not fixed yet**
|
||||||
|
|
||||||
* (Note that `git ls-files --others`, which is used to find files to add,
|
* (Note that `git ls-files --others`, which is used to find files to add,
|
||||||
also uses surprisingly large amounts
|
also uses surprisingly large amounts
|
||||||
of memory when you have a lot of files. It buffers
|
of memory when you have a lot of files. It buffers
|
||||||
|
|
Loading…
Reference in a new issue