defer updating unlocked files until after smudge filter

The smuge filter no longer provides git with annexed file content, to
avoid a git memory leak, and because that did not honor annex.thin.

git annex smudge --update has to be run after a checkout to update
unlocked files in the working tree with annexed file contents.

No hooks yet to run it.

This commit was sponsored by Nick Piper on Patreon.
This commit is contained in:
Joey Hess 2018-10-25 14:43:13 -04:00
parent f2a4db724c
commit 917a2c6095
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 154 additions and 39 deletions

View file

@ -38,6 +38,8 @@ module Annex.Locations (
gitAnnexFsckDbDir,
gitAnnexFsckDbLock,
gitAnnexFsckResultsLog,
gitAnnexSmudgeLog,
gitAnnexSmudgeLock,
gitAnnexExportDbDir,
gitAnnexExportLock,
gitAnnexScheduleState,
@ -312,6 +314,14 @@ gitAnnexFsckDbLock u r = gitAnnexFsckDir u r </> "fsck.lck"
gitAnnexFsckResultsLog :: UUID -> Git.Repo -> FilePath
gitAnnexFsckResultsLog u r = gitAnnexDir r </> "fsckresults" </> fromUUID u
{- .git/annex/smudge.log is used to log smudges worktree files that need to
- be updated. -}
gitAnnexSmudgeLog :: Git.Repo -> FilePath
gitAnnexSmudgeLog r = gitAnnexDir r </> "smudge.log"
gitAnnexSmudgeLock :: Git.Repo -> FilePath
gitAnnexSmudgeLock r = gitAnnexDir r </> "smudge.lck"
{- .git/annex/export/uuid/ is used to store information about
- exports to special remotes. -}
gitAnnexExportDir :: UUID -> Git.Repo -> FilePath

View file

@ -21,6 +21,11 @@ git-annex (6.20181012) UNRELEASED; urgency=medium
get confused about whether a locked file's content was present.
* Fix concurrency bug that occurred on the first download from an
exporttree remote.
* smudge: The smudge filter no longer provides git with annexed
file content, to avoid a git memory leak, and because that did not
honor annex.thin. Now git annex smudge --update has to be run
after a checkout to update unlocked files in the working tree
with annexed file contents.
-- Joey Hess <id@joeyh.name> Sat, 13 Oct 2018 00:52:02 -0400

View file

@ -8,12 +8,12 @@
module Command.Smudge where
import Command
import qualified Annex
import Annex.Content
import Annex.Link
import Annex.FileMatcher
import Annex.Ingest
import Annex.CatFile
import Logs.Smudge
import Logs.Location
import qualified Database.Keys
import qualified Git.BuildVersion
@ -29,43 +29,46 @@ cmd = noCommit $ noMessages $
"git smudge filter"
paramFile (seek <$$> optParser)
data SmudgeOptions = SmudgeOptions
data SmudgeOptions = UpdateOption | SmudgeOptions
{ smudgeFile :: FilePath
, cleanOption :: Bool
}
optParser :: CmdParamsDesc -> Parser SmudgeOptions
optParser desc = SmudgeOptions
optParser desc = smudgeoptions <|> updateoption
where
smudgeoptions = SmudgeOptions
<$> argument str ( metavar desc )
<*> switch ( long "clean" <> help "clean filter" )
updateoption = flag' UpdateOption
( long "update" <> help "populate annexed worktree files" )
seek :: SmudgeOptions -> CommandSeek
seek o = commandAction $
(if cleanOption o then clean else smudge) (smudgeFile o)
seek (SmudgeOptions f False) = commandAction (smudge f)
seek (SmudgeOptions f True) = commandAction (clean f)
seek UpdateOption = commandAction update
-- Smudge filter is fed git file content, and if it's a pointer to an
-- available annex object, should output its content.
-- available annex object, git expects it to output its content.
--
-- However, this does not do that. It outputs the pointer, and records
-- the filename in the smudge log. Git hooks run after commands like checkout
-- then run git annex smudge --update which populates the work tree files
-- with annex content. This is done for several reasons:
--
-- * To support annex.thin
-- * Because git currently buffers the whole object received from the
-- smudge filter in memory, which is a problem with large files.
smudge :: FilePath -> CommandStart
smudge file = do
b <- liftIO $ B.hGetContents stdin
case parseLinkOrPointer b of
Nothing -> liftIO $ B.putStr b
Nothing -> noop
Just k -> do
Database.Keys.addAssociatedFile k =<< inRepo (toTopFilePath file)
-- A previous unlocked checkout of the file may have
-- led to the annex object getting modified;
-- don't provide such modified content as it
-- will be confusing. inAnnex will detect such
-- modifications.
ifM (inAnnex k)
( do
content <- calcRepo (gitAnnexLocation k)
whenM (annexThin <$> Annex.getGitConfig) $
warning $ "Not able to honor annex.thin when git is checking out " ++ file ++ " (run git annex fix to re-thin files)"
liftIO $ B.putStr . fromMaybe b
=<< catchMaybeIO (B.readFile content)
, liftIO $ B.putStr b
)
topfile <- inRepo (toTopFilePath file)
Database.Keys.addAssociatedFile k topfile
smudgeLog k topfile
liftIO $ B.putStr b
stop
-- Clean filter is fed file content on stdin, decides if a file
@ -151,3 +154,19 @@ getMoveRaceRecovery k file = void $ tryNonAsync $
populatePointerFile (Restage False) k obj file >>= \case
Nothing -> return ()
Just ic -> Database.Keys.addInodeCaches k [ic]
update :: CommandStart
update = do
updateSmudged (Restage True)
stop
updateSmudged :: Restage -> Annex ()
updateSmudged restage = streamSmudged $ \k topf -> do
f <- fromRepo $ fromTopFilePath topf
whenM (inAnnex k) $ do
obj <- calcRepo (gitAnnexLocation k)
unlessM (isJust <$> populatePointerFile restage k obj f) $
liftIO (isPointerFile f) >>= \case
Just k' | k' == k -> toplevelWarning False $
"unable to populate worktree file " ++ f
_ -> noop

View file

@ -2,26 +2,59 @@
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.File where
module Logs.File (writeLogFile, appendLogFile, streamLogFile) where
import Annex.Common
import Annex.Perms
import Annex.LockFile
import qualified Git
import Utility.Tmp
-- | Writes content to a file, replacing the file atomically, and
-- making the new file have whatever permissions the git repository is
-- configured to use. Creates the parent directory when necessary.
writeLogFile :: FilePath -> String -> Annex ()
writeLogFile f c = go `catchNonAsync` \_e -> do
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
go
writeLogFile f c = createDirWhenNeeded f $ viaTmp writelog f c
where
go = viaTmp writelog f c
writelog f' c' = do
liftIO $ writeFile f' c'
setAnnexFilePerm f'
-- | Appends a line to a log file, first locking it to prevent
-- concurrent writers.
appendLogFile :: FilePath -> (Git.Repo -> FilePath) -> String -> Annex ()
appendLogFile f lck c = createDirWhenNeeded f $ withExclusiveLock lck $ do
liftIO $ withFile f AppendMode $ \h -> hPutStrLn h c
setAnnexFilePerm f
-- | Streams lines from a log file, and then empties the file at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Does nothing if the log file does not exist.
--
-- Locking is used to prevent writes to to the log file while this
-- is running.
streamLogFile :: FilePath -> (Git.Repo -> FilePath) -> (String -> Annex ()) -> Annex ()
streamLogFile f lck a = withExclusiveLock lck $ bracketOnError setup cleanup go
where
setup = liftIO $ tryWhenExists $ openFile f ReadMode
cleanup Nothing = noop
cleanup (Just h) = liftIO $ hClose h
go Nothing = noop
go (Just h) = do
mapM_ a =<< liftIO (lines <$> hGetContents h)
liftIO $ hClose h
liftIO $ writeFile f ""
setAnnexFilePerm f
createDirWhenNeeded :: FilePath -> Annex () -> Annex ()
createDirWhenNeeded f a = a `catchNonAsync` \_e -> do
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
a

40
Logs/Smudge.hs Normal file
View file

@ -0,0 +1,40 @@
{- git-annex smudge log file
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.Smudge where
import Annex.Common
import Git.FilePath
import Logs.File
-- | Log a smudged file.
smudgeLog :: Key -> TopFilePath -> Annex ()
smudgeLog k f = do
logf <- fromRepo gitAnnexSmudgeLog
appendLogFile logf gitAnnexSmudgeLock $
key2file k ++ " " ++ getTopFilePath f
-- | Streams all smudged files, and then empties the log at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Locking is used to prevent new items being added to the log while this
-- is running.
streamSmudged :: (Key -> TopFilePath -> Annex ()) -> Annex ()
streamSmudged a = do
logf <- fromRepo gitAnnexSmudgeLog
streamLogFile logf gitAnnexSmudgeLock $ \l ->
case parse l of
Nothing -> noop
Just (k, f) -> a k f
where
parse l =
let (ks, f) = separate (== ' ') l
in do
k <- file2key ks
return (k, asTopFilePath f)

View file

@ -6,11 +6,13 @@ git-annex smudge - git filter driver for git-annex
git annex smudge [--clean] file
git annex smudge --update
# DESCRIPTION
This command lets git-annex be used as a git filter driver which lets
annexed files in the git repository to be unlocked at all times, instead
of being symlinks.
annexed files in the git repository to be unlocked, instead
of being symlinks, and lets `git add` store files in the annex.
When adding a file with `git add`, the annex.largefiles config is
consulted to decide if a given file should be added to git as-is,
@ -32,6 +34,16 @@ contents:
* filter=annex
.* !filter
The smudge filter does not provide git with the content of annexed files,
because that would be slow and triggers memory leaks in git. Instead,
it records which worktree files need to be updated, and
`git annex smudge --update` later updates the work tree to contain
the content. That is run by several git hooks, including post-checkout
and post-merge. However, a few git commands, notably `git stash` and
`git cherry-pick`, do not run any hooks, so after using those commands
you can manually run `git annex smudge --update` to update the working
tree.
# SEE ALSO
[[git-annex]](1)

View file

@ -1032,11 +1032,6 @@ Here are all the supported configuration settings.
After setting (or unsetting) this, you should run `git annex fix` to
fix up the annexed files in the work tree to be hard links (or copies).
Note that `annex.thin` is not honored when git updates an annexed file
in the working tree. So when `git checkout` or `git merge` updates the
working tree, a second copy of annexed files will result. You can run
`git-annex fix` to fix up the hard links after running such git commands.
* `annex.delayadd`
Makes the watch and assistant commands delay for the specified number of

View file

@ -880,6 +880,7 @@ Executable git-annex
Logs.Schedule
Logs.SingleValue
Logs.SingleValue.Pure
Logs.Smudge
Logs.TimeStamp
Logs.Transfer
Logs.Transitions