diff --git a/Annex/Locations.hs b/Annex/Locations.hs index 7e52dc6115..8fd3dfce01 100644 --- a/Annex/Locations.hs +++ b/Annex/Locations.hs @@ -38,6 +38,8 @@ module Annex.Locations ( gitAnnexFsckDbDir, gitAnnexFsckDbLock, gitAnnexFsckResultsLog, + gitAnnexSmudgeLog, + gitAnnexSmudgeLock, gitAnnexExportDbDir, gitAnnexExportLock, gitAnnexScheduleState, @@ -312,6 +314,14 @@ gitAnnexFsckDbLock u r = gitAnnexFsckDir u r "fsck.lck" gitAnnexFsckResultsLog :: UUID -> Git.Repo -> FilePath gitAnnexFsckResultsLog u r = gitAnnexDir r "fsckresults" fromUUID u +{- .git/annex/smudge.log is used to log smudges worktree files that need to + - be updated. -} +gitAnnexSmudgeLog :: Git.Repo -> FilePath +gitAnnexSmudgeLog r = gitAnnexDir r "smudge.log" + +gitAnnexSmudgeLock :: Git.Repo -> FilePath +gitAnnexSmudgeLock r = gitAnnexDir r "smudge.lck" + {- .git/annex/export/uuid/ is used to store information about - exports to special remotes. -} gitAnnexExportDir :: UUID -> Git.Repo -> FilePath diff --git a/CHANGELOG b/CHANGELOG index 4a698a6619..2ea7896924 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -21,6 +21,11 @@ git-annex (6.20181012) UNRELEASED; urgency=medium get confused about whether a locked file's content was present. * Fix concurrency bug that occurred on the first download from an exporttree remote. + * smudge: The smudge filter no longer provides git with annexed + file content, to avoid a git memory leak, and because that did not + honor annex.thin. Now git annex smudge --update has to be run + after a checkout to update unlocked files in the working tree + with annexed file contents. -- Joey Hess Sat, 13 Oct 2018 00:52:02 -0400 diff --git a/Command/Smudge.hs b/Command/Smudge.hs index 75dc6a17bb..d335cf82b5 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -8,12 +8,12 @@ module Command.Smudge where import Command -import qualified Annex import Annex.Content import Annex.Link import Annex.FileMatcher import Annex.Ingest import Annex.CatFile +import Logs.Smudge import Logs.Location import qualified Database.Keys import qualified Git.BuildVersion @@ -29,43 +29,46 @@ cmd = noCommit $ noMessages $ "git smudge filter" paramFile (seek <$$> optParser) -data SmudgeOptions = SmudgeOptions +data SmudgeOptions = UpdateOption | SmudgeOptions { smudgeFile :: FilePath , cleanOption :: Bool } optParser :: CmdParamsDesc -> Parser SmudgeOptions -optParser desc = SmudgeOptions - <$> argument str ( metavar desc ) - <*> switch ( long "clean" <> help "clean filter" ) +optParser desc = smudgeoptions <|> updateoption + where + smudgeoptions = SmudgeOptions + <$> argument str ( metavar desc ) + <*> switch ( long "clean" <> help "clean filter" ) + updateoption = flag' UpdateOption + ( long "update" <> help "populate annexed worktree files" ) seek :: SmudgeOptions -> CommandSeek -seek o = commandAction $ - (if cleanOption o then clean else smudge) (smudgeFile o) +seek (SmudgeOptions f False) = commandAction (smudge f) +seek (SmudgeOptions f True) = commandAction (clean f) +seek UpdateOption = commandAction update -- Smudge filter is fed git file content, and if it's a pointer to an --- available annex object, should output its content. +-- available annex object, git expects it to output its content. +-- +-- However, this does not do that. It outputs the pointer, and records +-- the filename in the smudge log. Git hooks run after commands like checkout +-- then run git annex smudge --update which populates the work tree files +-- with annex content. This is done for several reasons: +-- +-- * To support annex.thin +-- * Because git currently buffers the whole object received from the +-- smudge filter in memory, which is a problem with large files. smudge :: FilePath -> CommandStart smudge file = do b <- liftIO $ B.hGetContents stdin case parseLinkOrPointer b of - Nothing -> liftIO $ B.putStr b + Nothing -> noop Just k -> do - Database.Keys.addAssociatedFile k =<< inRepo (toTopFilePath file) - -- A previous unlocked checkout of the file may have - -- led to the annex object getting modified; - -- don't provide such modified content as it - -- will be confusing. inAnnex will detect such - -- modifications. - ifM (inAnnex k) - ( do - content <- calcRepo (gitAnnexLocation k) - whenM (annexThin <$> Annex.getGitConfig) $ - warning $ "Not able to honor annex.thin when git is checking out " ++ file ++ " (run git annex fix to re-thin files)" - liftIO $ B.putStr . fromMaybe b - =<< catchMaybeIO (B.readFile content) - , liftIO $ B.putStr b - ) + topfile <- inRepo (toTopFilePath file) + Database.Keys.addAssociatedFile k topfile + smudgeLog k topfile + liftIO $ B.putStr b stop -- Clean filter is fed file content on stdin, decides if a file @@ -151,3 +154,19 @@ getMoveRaceRecovery k file = void $ tryNonAsync $ populatePointerFile (Restage False) k obj file >>= \case Nothing -> return () Just ic -> Database.Keys.addInodeCaches k [ic] + +update :: CommandStart +update = do + updateSmudged (Restage True) + stop + +updateSmudged :: Restage -> Annex () +updateSmudged restage = streamSmudged $ \k topf -> do + f <- fromRepo $ fromTopFilePath topf + whenM (inAnnex k) $ do + obj <- calcRepo (gitAnnexLocation k) + unlessM (isJust <$> populatePointerFile restage k obj f) $ + liftIO (isPointerFile f) >>= \case + Just k' | k' == k -> toplevelWarning False $ + "unable to populate worktree file " ++ f + _ -> noop diff --git a/Logs/File.hs b/Logs/File.hs index 6676dbb7ef..3226b23067 100644 --- a/Logs/File.hs +++ b/Logs/File.hs @@ -2,26 +2,59 @@ - - Copyright 2018 Joey Hess - - - Licensed under the GNU GPL version 3 or higher. + - Licensed under the GNU AGPL version 3 or higher. -} -module Logs.File where +module Logs.File (writeLogFile, appendLogFile, streamLogFile) where import Annex.Common import Annex.Perms +import Annex.LockFile +import qualified Git import Utility.Tmp -- | Writes content to a file, replacing the file atomically, and -- making the new file have whatever permissions the git repository is -- configured to use. Creates the parent directory when necessary. writeLogFile :: FilePath -> String -> Annex () -writeLogFile f c = go `catchNonAsync` \_e -> do - -- Most of the time, the directory will exist, so this is only - -- done if writing the file fails. - createAnnexDirectory (parentDir f) - go +writeLogFile f c = createDirWhenNeeded f $ viaTmp writelog f c where - go = viaTmp writelog f c writelog f' c' = do liftIO $ writeFile f' c' setAnnexFilePerm f' + +-- | Appends a line to a log file, first locking it to prevent +-- concurrent writers. +appendLogFile :: FilePath -> (Git.Repo -> FilePath) -> String -> Annex () +appendLogFile f lck c = createDirWhenNeeded f $ withExclusiveLock lck $ do + liftIO $ withFile f AppendMode $ \h -> hPutStrLn h c + setAnnexFilePerm f + +-- | Streams lines from a log file, and then empties the file at the end. +-- +-- If the action is interrupted or throws an exception, the log file is +-- left unchanged. +-- +-- Does nothing if the log file does not exist. +-- +-- Locking is used to prevent writes to to the log file while this +-- is running. +streamLogFile :: FilePath -> (Git.Repo -> FilePath) -> (String -> Annex ()) -> Annex () +streamLogFile f lck a = withExclusiveLock lck $ bracketOnError setup cleanup go + where + setup = liftIO $ tryWhenExists $ openFile f ReadMode + cleanup Nothing = noop + cleanup (Just h) = liftIO $ hClose h + go Nothing = noop + go (Just h) = do + mapM_ a =<< liftIO (lines <$> hGetContents h) + liftIO $ hClose h + liftIO $ writeFile f "" + setAnnexFilePerm f + +createDirWhenNeeded :: FilePath -> Annex () -> Annex () +createDirWhenNeeded f a = a `catchNonAsync` \_e -> do + -- Most of the time, the directory will exist, so this is only + -- done if writing the file fails. + createAnnexDirectory (parentDir f) + a diff --git a/Logs/Smudge.hs b/Logs/Smudge.hs new file mode 100644 index 0000000000..3a1fca8d46 --- /dev/null +++ b/Logs/Smudge.hs @@ -0,0 +1,40 @@ +{- git-annex smudge log file + - + - Copyright 2018 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Logs.Smudge where + +import Annex.Common +import Git.FilePath +import Logs.File + +-- | Log a smudged file. +smudgeLog :: Key -> TopFilePath -> Annex () +smudgeLog k f = do + logf <- fromRepo gitAnnexSmudgeLog + appendLogFile logf gitAnnexSmudgeLock $ + key2file k ++ " " ++ getTopFilePath f + +-- | Streams all smudged files, and then empties the log at the end. +-- +-- If the action is interrupted or throws an exception, the log file is +-- left unchanged. +-- +-- Locking is used to prevent new items being added to the log while this +-- is running. +streamSmudged :: (Key -> TopFilePath -> Annex ()) -> Annex () +streamSmudged a = do + logf <- fromRepo gitAnnexSmudgeLog + streamLogFile logf gitAnnexSmudgeLock $ \l -> + case parse l of + Nothing -> noop + Just (k, f) -> a k f + where + parse l = + let (ks, f) = separate (== ' ') l + in do + k <- file2key ks + return (k, asTopFilePath f) diff --git a/doc/git-annex-smudge.mdwn b/doc/git-annex-smudge.mdwn index 3b34ab7dc9..98de666754 100644 --- a/doc/git-annex-smudge.mdwn +++ b/doc/git-annex-smudge.mdwn @@ -6,11 +6,13 @@ git-annex smudge - git filter driver for git-annex git annex smudge [--clean] file +git annex smudge --update + # DESCRIPTION This command lets git-annex be used as a git filter driver which lets -annexed files in the git repository to be unlocked at all times, instead -of being symlinks. +annexed files in the git repository to be unlocked, instead +of being symlinks, and lets `git add` store files in the annex. When adding a file with `git add`, the annex.largefiles config is consulted to decide if a given file should be added to git as-is, @@ -32,6 +34,16 @@ contents: * filter=annex .* !filter +The smudge filter does not provide git with the content of annexed files, +because that would be slow and triggers memory leaks in git. Instead, +it records which worktree files need to be updated, and +`git annex smudge --update` later updates the work tree to contain +the content. That is run by several git hooks, including post-checkout +and post-merge. However, a few git commands, notably `git stash` and +`git cherry-pick`, do not run any hooks, so after using those commands +you can manually run `git annex smudge --update` to update the working +tree. + # SEE ALSO [[git-annex]](1) diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index fa8a6f3aac..2ee23d5787 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1032,11 +1032,6 @@ Here are all the supported configuration settings. After setting (or unsetting) this, you should run `git annex fix` to fix up the annexed files in the work tree to be hard links (or copies). - Note that `annex.thin` is not honored when git updates an annexed file - in the working tree. So when `git checkout` or `git merge` updates the - working tree, a second copy of annexed files will result. You can run - `git-annex fix` to fix up the hard links after running such git commands. - * `annex.delayadd` Makes the watch and assistant commands delay for the specified number of diff --git a/git-annex.cabal b/git-annex.cabal index a2c6b4df0c..dfa30b1246 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -880,6 +880,7 @@ Executable git-annex Logs.Schedule Logs.SingleValue Logs.SingleValue.Pure + Logs.Smudge Logs.TimeStamp Logs.Transfer Logs.Transitions