more completely solve catKey memory leak

Done using a mode witness, which ensures it's fixed everywhere.

Fixing catFileKey was a bear, because git cat-file does not provide a
nice way to query for the mode of a file and there is no other efficient
way to do it. Oh, for libgit2..

Note that I am looking at tree objects from HEAD, rather than the index.
Because I cat-file cannot show a tree object for the index.
So this fix is technically incomplete. The only cases where it matters
are:

1. A new large file has been directly staged in git, but not committed.
2. A file that was committed to HEAD as a symlink has been staged
   directly in the index.

This could be fixed a lot better using libgit2.
This commit is contained in:
Joey Hess 2013-09-19 16:30:37 -04:00
parent f26c996dc6
commit 006cf7976f
10 changed files with 71 additions and 27 deletions

View file

@ -8,6 +8,7 @@
module Annex.CatFile ( module Annex.CatFile (
catFile, catFile,
catObject, catObject,
catTree,
catObjectDetails, catObjectDetails,
catFileHandle, catFileHandle,
catKey, catKey,
@ -17,6 +18,7 @@ module Annex.CatFile (
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M import qualified Data.Map as M
import System.PosixCompat.Types
import Common.Annex import Common.Annex
import qualified Git import qualified Git
@ -24,6 +26,7 @@ import qualified Git.CatFile
import qualified Annex import qualified Annex
import Git.Types import Git.Types
import Git.FilePath import Git.FilePath
import Git.FileMode
catFile :: Git.Branch -> FilePath -> Annex L.ByteString catFile :: Git.Branch -> FilePath -> Annex L.ByteString
catFile branch file = do catFile branch file = do
@ -35,6 +38,11 @@ catObject ref = do
h <- catFileHandle h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref liftIO $ Git.CatFile.catObject h ref
catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
catTree ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catTree h ref
catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha)) catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha))
catObjectDetails ref = do catObjectDetails ref = do
h <- catFileHandle h <- catFileHandle
@ -55,13 +63,39 @@ catFileHandle = do
Annex.changeState $ \s -> s { Annex.catfilehandles = m' } Annex.changeState $ \s -> s { Annex.catfilehandles = m' }
return h return h
{- From the Sha or Ref of a symlink back to the key. -} {- From the Sha or Ref of a symlink back to the key.
catKey :: Ref -> Annex (Maybe Key) -
catKey ref = do - Requires a mode witness, to guarantee that the file is a symlink.
l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref -}
return $ if isLinkToAnnex l catKey :: Ref -> FileMode -> Annex (Maybe Key)
then fileKey $ takeFileName l catKey ref mode
else Nothing | isSymLink mode = do
l <- fromInternalGitPath . encodeW8 . L.unpack <$> catObject ref
return $ if isLinkToAnnex l
then fileKey $ takeFileName l
else Nothing
| otherwise = return Nothing
{- Looks up the file mode corresponding to the Ref using the running
- cat-file.
-
- Currently this always has to look in HEAD, because cat-file --batch
- does not offer a way to specify that we want to look up a tree object
- in the index. So if the index has a file staged not as a symlink,
- and it is a sylink in head, the wrong mode is gotten. This is a bug.
- Also, we have to assume the file is a symlink if it's not yet committed
- to HEAD.
-}
catKeyChecked :: Bool -> Ref -> Annex (Maybe Key)
catKeyChecked needhead ref@(Ref r) =
catKey ref =<< findmode <$> catTree treeref
where
pathparts = split "/" r
dir = intercalate "/" $ take (length pathparts - 1) pathparts
file = fromMaybe "" $ lastMaybe pathparts
treeref = Ref $ if needhead then "HEAD" ++ dir ++ "/" else dir ++ "/"
findmode = fromMaybe symLinkMode . headMaybe .
map snd . filter (\p -> fst p == file)
{- From a file in the repository back to the key. {- From a file in the repository back to the key.
- -
@ -76,7 +110,8 @@ catKey ref = do
- -
- For command-line git-annex use, that doesn't matter. It's perfectly - For command-line git-annex use, that doesn't matter. It's perfectly
- reasonable for things staged in the index after the currently running - reasonable for things staged in the index after the currently running
- git-annex process to not be noticed by it. - git-annex process to not be noticed by it. However, we do want to see
- what's in the index, since it may have uncommitted changes not in HEAD>
- -
- For the assistant, this is much more of a problem, since it commits - For the assistant, this is much more of a problem, since it commits
- files and then needs to be able to immediately look up their keys. - files and then needs to be able to immediately look up their keys.
@ -89,8 +124,8 @@ catKey ref = do
catKeyFile :: FilePath -> Annex (Maybe Key) catKeyFile :: FilePath -> Annex (Maybe Key)
catKeyFile f = ifM (Annex.getState Annex.daemon) catKeyFile f = ifM (Annex.getState Annex.daemon)
( catKeyFileHEAD f ( catKeyFileHEAD f
, catKey $ Ref $ ":./" ++ f , catKeyChecked True (Ref $ ":./" ++ f)
) )
catKeyFileHEAD :: FilePath -> Annex (Maybe Key) catKeyFileHEAD :: FilePath -> Annex (Maybe Key)
catKeyFileHEAD f = catKey $ Ref $ "HEAD:./" ++ f catKeyFileHEAD f = catKeyChecked False (Ref $ "HEAD:./" ++ f)

View file

@ -14,7 +14,6 @@ import qualified Git.Merge
import qualified Git.DiffTree as DiffTree import qualified Git.DiffTree as DiffTree
import Git.Sha import Git.Sha
import Git.Types import Git.Types
import Git.FileMode
import Annex.CatFile import Annex.CatFile
import qualified Annex.Queue import qualified Annex.Queue
import Logs.Location import Logs.Location
@ -46,9 +45,7 @@ stageDirect = do
- efficiently as we can, by getting any key that's associated - efficiently as we can, by getting any key that's associated
- with it in git, as well as its stat info. -} - with it in git, as well as its stat info. -}
go (file, Just sha, Just mode) = do go (file, Just sha, Just mode) = do
shakey <- if isSymLink mode shakey <- catKey sha mode
then catKey sha
else return Nothing
mstat <- liftIO $ catchMaybeIO $ getSymbolicLinkStatus file mstat <- liftIO $ catchMaybeIO $ getSymbolicLinkStatus file
filekey <- isAnnexLink file filekey <- isAnnexLink file
case (shakey, filekey, mstat, toInodeCache =<< mstat) of case (shakey, filekey, mstat, toInodeCache =<< mstat) of
@ -149,10 +146,9 @@ mergeDirectCleanup d oldsha newsha = do
where where
go getsha getmode a araw go getsha getmode a araw
| getsha item == nullSha = noop | getsha item == nullSha = noop
| isSymLink (getmode item) = | otherwise =
maybe (araw f) (\k -> void $ a k f) maybe (araw f) (\k -> void $ a k f)
=<< catKey (getsha item) =<< catKey (getsha item) (getmode item)
| otherwise = araw f
f = DiffTree.file item f = DiffTree.file item
moveout = removeDirect moveout = removeDirect

View file

@ -78,7 +78,7 @@ perform = do
return Nothing return Nothing
| otherwise -> | otherwise ->
maybe noop (fromdirect f) maybe noop (fromdirect f)
=<< catKey sha =<< catKey sha mode
_ -> noop _ -> noop
go _ = noop go _ = noop

View file

@ -16,7 +16,6 @@ import qualified Git.Ref
import Annex.CatFile import Annex.CatFile
import Annex.Content.Direct import Annex.Content.Direct
import Git.Sha import Git.Sha
import Git.FileMode
def :: [Command] def :: [Command]
def = [command "pre-commit" paramPaths seek SectionPlumbing def = [command "pre-commit" paramPaths seek SectionPlumbing
@ -48,8 +47,8 @@ startDirect _ = next $ do
withkey (Git.DiffTree.srcsha diff) (Git.DiffTree.srcmode diff) removeAssociatedFile withkey (Git.DiffTree.srcsha diff) (Git.DiffTree.srcmode diff) removeAssociatedFile
withkey (Git.DiffTree.dstsha diff) (Git.DiffTree.dstmode diff) addAssociatedFile withkey (Git.DiffTree.dstsha diff) (Git.DiffTree.dstmode diff) addAssociatedFile
where where
withkey sha mode a = when (sha /= nullSha && isSymLink mode) $ do withkey sha mode a = when (sha /= nullSha) $ do
k <- catKey sha k <- catKey sha mode
case k of case k of
Nothing -> noop Nothing -> noop
Just key -> void $ a key (Git.DiffTree.file diff) Just key -> void $ a key (Git.DiffTree.file diff)

View file

@ -29,6 +29,7 @@ import qualified Remote.Git
import Types.Key import Types.Key
import Config import Config
import Annex.ReplaceFile import Annex.ReplaceFile
import Git.FileMode
import Data.Hash.MD5 import Data.Hash.MD5
@ -321,7 +322,7 @@ resolveMerge' u
case msha of case msha of
Nothing -> a Nothing Nothing -> a Nothing
Just sha -> do Just sha -> do
key <- catKey sha key <- catKey sha symLinkMode
maybe (return False) (a . Just) key maybe (return False) (a . Just) key
{- The filename to use when resolving a conflicted merge of a file, {- The filename to use when resolving a conflicted merge of a file,

View file

@ -130,4 +130,4 @@ catTree h treeref = go <$> catObjectDetails h treeref
parsemodefile b = parsemodefile b =
let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b) let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b)
in (file, readmode modestr) in (file, readmode modestr)
readmode = fst . Prelude.head . readOct readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct

View file

@ -13,8 +13,11 @@ import Utility.FileMode
import System.PosixCompat.Types import System.PosixCompat.Types
symLinkMode :: FileMode
symLinkMode = 40960
{- Git uses a special file mode to indicate a symlink. This is the case {- Git uses a special file mode to indicate a symlink. This is the case
- even on Windows, so we hard code the valuse here, rather than using - even on Windows, so we hard code the valuse here, rather than using
- System.Posix.Files.symbolicLinkMode. -} - System.Posix.Files.symbolicLinkMode. -}
isSymLink :: FileMode -> Bool isSymLink :: FileMode -> Bool
isSymLink = checkMode 40960 isSymLink = checkMode symLinkMode

4
debian/changelog vendored
View file

@ -18,8 +18,8 @@ git-annex (4.20130912) UNRELEASED; urgency=low
numcopies levels. (--fast avoids calculating these) numcopies levels. (--fast avoids calculating these)
* gcrypt: Ensure that signing key is set to one of the participants keys. * gcrypt: Ensure that signing key is set to one of the participants keys.
* webapp: Show encryption information when editing a remote. * webapp: Show encryption information when editing a remote.
* sync, pre-commit, indirect: Avoid unnecessarily catting non-symlink * Avoid unnecessarily catting non-symlink files from git, which can be
files from git, which can be so large it runs out of memory. so large it runs out of memory.
-- Joey Hess <joeyh@debian.org> Thu, 12 Sep 2013 12:14:46 -0400 -- Joey Hess <joeyh@debian.org> Thu, 12 Sep 2013 12:14:46 -0400

View file

@ -61,3 +61,9 @@ commit git-annex: out of memory (requested 985661440 bytes)
# End of transcript or log. # End of transcript or log.
"""]] """]]
> [[fixed|done]]. However, if you saw this behavior,
> you have large files checked directly into git. You may
> want to examine your repository and use git filter-branch to clean
> it up.
> --[[Joey]]

View file

@ -77,4 +77,8 @@ Any thoughts on how I can get git-annex (esp. fsck) to complete would be appreci
Thanks Thanks
Giovanni Giovanni
[[!tag moreinfo]] > [[fixed|done]]. However, if you saw this behavior,
> you have large files checked directly into git. You may
> want to examine your repository and use git filter-branch to clean
> it up.
> --[[Joey]]