Merge branch 'delaysmudge' into v7

This commit is contained in:
Joey Hess 2018-10-25 17:01:14 -04:00
commit 6ceeb4bae7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
14 changed files with 236 additions and 60 deletions

View file

@ -46,6 +46,7 @@ module Annex.Content (
staleKeysPrune, staleKeysPrune,
pruneTmpWorkDirBefore, pruneTmpWorkDirBefore,
isUnmodified, isUnmodified,
isUnmodifiedCheap,
verifyKeyContent, verifyKeyContent,
VerifyConfig(..), VerifyConfig(..),
Verification(..), Verification(..),
@ -746,25 +747,38 @@ isUnmodified :: Key -> FilePath -> Annex Bool
isUnmodified key f = go =<< geti isUnmodified key f = go =<< geti
where where
go Nothing = return False go Nothing = return False
go (Just fc) = cheapcheck fc <||> expensivecheck fc go (Just fc) = isUnmodifiedCheap' key fc <||> expensivecheck fc
cheapcheck fc = anyM (compareInodeCaches fc)
=<< Database.Keys.getInodeCaches key
expensivecheck fc = ifM (verifyKeyContent RetrievalAllKeysSecure AlwaysVerify UnVerified key f) expensivecheck fc = ifM (verifyKeyContent RetrievalAllKeysSecure AlwaysVerify UnVerified key f)
( do ( do
liftIO $ print "content verified"
-- The file could have been modified while it was -- The file could have been modified while it was
-- being verified. Detect that. -- being verified. Detect that.
ifM (geti >>= maybe (return False) (compareInodeCaches fc)) ifM (geti >>= maybe (return False) (compareInodeCaches fc))
( do ( do
-- Update the InodeCache to avoid -- Update the InodeCache to avoid
-- performing this expensive check again. -- performing this expensive check again.
liftIO $ print "update inode cache"
Database.Keys.addInodeCaches key [fc] Database.Keys.addInodeCaches key [fc]
return True return True
, return False , return False
) )
, return False , do
liftIO $ print "content not verified"
return False
) )
geti = withTSDelta (liftIO . genInodeCache f) geti = withTSDelta (liftIO . genInodeCache f)
{- Cheap check if a file contains the unmodified content of the key,
- only checking the InodeCache of the key.
-}
isUnmodifiedCheap :: Key -> FilePath -> Annex Bool
isUnmodifiedCheap key f = maybe (return False) (isUnmodifiedCheap' key)
=<< withTSDelta (liftIO . genInodeCache f)
isUnmodifiedCheap' :: Key -> InodeCache -> Annex Bool
isUnmodifiedCheap' key fc =
anyM (compareInodeCaches fc) =<< Database.Keys.getInodeCaches key
{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and {- Moves a key out of .git/annex/objects/ into .git/annex/bad, and
- returns the file it was moved to. -} - returns the file it was moved to. -}
moveBad :: Key -> Annex FilePath moveBad :: Key -> Annex FilePath

View file

@ -32,6 +32,17 @@ postReceiveHook = Git.Hook "post-receive"
[ mkHookScript "git annex post-receive" [ mkHookScript "git annex post-receive"
] ]
postCheckoutHook :: Git.Hook
postCheckoutHook = Git.Hook "post-checkout" smudgeHook []
postMergeHook :: Git.Hook
postMergeHook = Git.Hook "post-merge" smudgeHook []
-- Only run git-annex smudge --update when git-annex supports it.
-- Older versions of git-annex didn't need this hook.
smudgeHook :: String
smudgeHook = mkHookScript "if git annex smudge --update >/dev/null 2>&1; then git-annex smudge --update; fi"
preCommitAnnexHook :: Git.Hook preCommitAnnexHook :: Git.Hook
preCommitAnnexHook = Git.Hook "pre-commit-annex" "" [] preCommitAnnexHook = Git.Hook "pre-commit-annex" "" []

View file

@ -38,7 +38,6 @@ import Logs.Location
import qualified Annex import qualified Annex
import qualified Annex.Queue import qualified Annex.Queue
import qualified Database.Keys import qualified Database.Keys
import qualified Git.Branch
import Config import Config
import Utility.InodeCache import Utility.InodeCache
import Annex.ReplaceFile import Annex.ReplaceFile

View file

@ -112,6 +112,9 @@ initialize' ai mversion = checkCanInitialize ai $ do
whenM versionSupportsUnlockedPointers $ do whenM versionSupportsUnlockedPointers $ do
configureSmudgeFilter configureSmudgeFilter
scanUnlockedFiles scanUnlockedFiles
unlessM isBareRepo $ do
hookWrite postCheckoutHook
hookWrite postMergeHook
checkAdjustedClone >>= \case checkAdjustedClone >>= \case
NeedUpgradeForAdjustedClone -> NeedUpgradeForAdjustedClone ->
void $ upgrade True versionForAdjustedClone void $ upgrade True versionForAdjustedClone

View file

@ -38,6 +38,8 @@ module Annex.Locations (
gitAnnexFsckDbDir, gitAnnexFsckDbDir,
gitAnnexFsckDbLock, gitAnnexFsckDbLock,
gitAnnexFsckResultsLog, gitAnnexFsckResultsLog,
gitAnnexSmudgeLog,
gitAnnexSmudgeLock,
gitAnnexExportDbDir, gitAnnexExportDbDir,
gitAnnexExportLock, gitAnnexExportLock,
gitAnnexScheduleState, gitAnnexScheduleState,
@ -312,6 +314,14 @@ gitAnnexFsckDbLock u r = gitAnnexFsckDir u r </> "fsck.lck"
gitAnnexFsckResultsLog :: UUID -> Git.Repo -> FilePath gitAnnexFsckResultsLog :: UUID -> Git.Repo -> FilePath
gitAnnexFsckResultsLog u r = gitAnnexDir r </> "fsckresults" </> fromUUID u gitAnnexFsckResultsLog u r = gitAnnexDir r </> "fsckresults" </> fromUUID u
{- .git/annex/smudge.log is used to log smudges worktree files that need to
- be updated. -}
gitAnnexSmudgeLog :: Git.Repo -> FilePath
gitAnnexSmudgeLog r = gitAnnexDir r </> "smudge.log"
gitAnnexSmudgeLock :: Git.Repo -> FilePath
gitAnnexSmudgeLock r = gitAnnexDir r </> "smudge.lck"
{- .git/annex/export/uuid/ is used to store information about {- .git/annex/export/uuid/ is used to store information about
- exports to special remotes. -} - exports to special remotes. -}
gitAnnexExportDir :: UUID -> Git.Repo -> FilePath gitAnnexExportDir :: UUID -> Git.Repo -> FilePath

View file

@ -21,6 +21,19 @@ git-annex (6.20181012) UNRELEASED; urgency=medium
get confused about whether a locked file's content was present. get confused about whether a locked file's content was present.
* Fix concurrency bug that occurred on the first download from an * Fix concurrency bug that occurred on the first download from an
exporttree remote. exporttree remote.
* smudge: The smudge filter no longer provides git with annexed
file content, to avoid a git memory leak, and because that did not
honor annex.thin. Now git annex smudge --update has to be run
after a checkout to update unlocked files in the working tree
with annexed file contents.
* init, upgrade: Install git post-checkout and post-merge hooks that run
git annex smudge --update.
* precommit: Run git annex smudge --update, because the post-merge
hook is not run when there is a merge conflict. So the work tree will
be updated when a commit is made to resolve the merge conflict.
* Note that git has no hooks run after git stash or git cherry-pick,
so the user will have to manually run git annex smudge --update
after such commands.
-- Joey Hess <id@joeyh.name> Sat, 13 Oct 2018 00:52:02 -0400 -- Joey Hess <id@joeyh.name> Sat, 13 Oct 2018 00:52:02 -0400

View file

@ -13,6 +13,7 @@ import Command
import Config import Config
import qualified Command.Add import qualified Command.Add
import qualified Command.Fix import qualified Command.Fix
import qualified Command.Smudge
import Annex.Direct import Annex.Direct
import Annex.Hook import Annex.Hook
import Annex.Link import Annex.Link
@ -54,11 +55,21 @@ seek ps = lockPreCommitHook $ ifM isDirect
flip withFilesToBeCommitted l $ \f -> commandAction $ flip withFilesToBeCommitted l $ \f -> commandAction $
maybe stop (Command.Fix.start Command.Fix.FixSymlinks f) maybe stop (Command.Fix.start Command.Fix.FixSymlinks f)
=<< isAnnexLink f =<< isAnnexLink f
-- inject unlocked files into the annex ifM versionSupportsUnlockedPointers
-- (not needed when repo version uses -- after a merge conflict or git
-- unlocked pointer files) -- cherry-pick or stash, pointer
unlessM versionSupportsUnlockedPointers $ -- files in the worktree won't
withFilesOldUnlockedToBeCommitted (commandAction . startInjectUnlocked) l -- be populated, so populate them
-- here
( Command.Smudge.updateSmudged
-- When there's a false index,
-- restaging the files won't work.
. Restage =<< liftIO Git.haveFalseIndex
-- inject unlocked files into the annex
-- (not needed when repo version uses
-- unlocked pointer files)
, withFilesOldUnlockedToBeCommitted (commandAction . startInjectUnlocked) l
)
) )
runAnnexHook preCommitAnnexHook runAnnexHook preCommitAnnexHook
-- committing changes to a view updates metadata -- committing changes to a view updates metadata

View file

@ -8,12 +8,12 @@
module Command.Smudge where module Command.Smudge where
import Command import Command
import qualified Annex
import Annex.Content import Annex.Content
import Annex.Link import Annex.Link
import Annex.FileMatcher import Annex.FileMatcher
import Annex.Ingest import Annex.Ingest
import Annex.CatFile import Annex.CatFile
import Logs.Smudge
import Logs.Location import Logs.Location
import qualified Database.Keys import qualified Database.Keys
import qualified Git.BuildVersion import qualified Git.BuildVersion
@ -29,43 +29,46 @@ cmd = noCommit $ noMessages $
"git smudge filter" "git smudge filter"
paramFile (seek <$$> optParser) paramFile (seek <$$> optParser)
data SmudgeOptions = SmudgeOptions data SmudgeOptions = UpdateOption | SmudgeOptions
{ smudgeFile :: FilePath { smudgeFile :: FilePath
, cleanOption :: Bool , cleanOption :: Bool
} }
optParser :: CmdParamsDesc -> Parser SmudgeOptions optParser :: CmdParamsDesc -> Parser SmudgeOptions
optParser desc = SmudgeOptions optParser desc = smudgeoptions <|> updateoption
<$> argument str ( metavar desc ) where
<*> switch ( long "clean" <> help "clean filter" ) smudgeoptions = SmudgeOptions
<$> argument str ( metavar desc )
<*> switch ( long "clean" <> help "clean filter" )
updateoption = flag' UpdateOption
( long "update" <> help "populate annexed worktree files" )
seek :: SmudgeOptions -> CommandSeek seek :: SmudgeOptions -> CommandSeek
seek o = commandAction $ seek (SmudgeOptions f False) = commandAction (smudge f)
(if cleanOption o then clean else smudge) (smudgeFile o) seek (SmudgeOptions f True) = commandAction (clean f)
seek UpdateOption = commandAction update
-- Smudge filter is fed git file content, and if it's a pointer to an -- Smudge filter is fed git file content, and if it's a pointer to an
-- available annex object, should output its content. -- available annex object, git expects it to output its content.
--
-- However, this does not do that. It outputs the pointer, and records
-- the filename in the smudge log. Git hooks run after commands like checkout
-- then run git annex smudge --update which populates the work tree files
-- with annex content. This is done for several reasons:
--
-- * To support annex.thin
-- * Because git currently buffers the whole object received from the
-- smudge filter in memory, which is a problem with large files.
smudge :: FilePath -> CommandStart smudge :: FilePath -> CommandStart
smudge file = do smudge file = do
b <- liftIO $ B.hGetContents stdin b <- liftIO $ B.hGetContents stdin
case parseLinkOrPointer b of case parseLinkOrPointer b of
Nothing -> liftIO $ B.putStr b Nothing -> noop
Just k -> do Just k -> do
Database.Keys.addAssociatedFile k =<< inRepo (toTopFilePath file) topfile <- inRepo (toTopFilePath file)
-- A previous unlocked checkout of the file may have Database.Keys.addAssociatedFile k topfile
-- led to the annex object getting modified; void $ smudgeLog k topfile
-- don't provide such modified content as it liftIO $ B.putStr b
-- will be confusing. inAnnex will detect such
-- modifications.
ifM (inAnnex k)
( do
content <- calcRepo (gitAnnexLocation k)
whenM (annexThin <$> Annex.getGitConfig) $
warning $ "Not able to honor annex.thin when git is checking out " ++ file ++ " (run git annex fix to re-thin files)"
liftIO $ B.putStr . fromMaybe b
=<< catchMaybeIO (B.readFile content)
, liftIO $ B.putStr b
)
stop stop
-- Clean filter is fed file content on stdin, decides if a file -- Clean filter is fed file content on stdin, decides if a file
@ -92,20 +95,31 @@ clean file = do
if Git.BuildVersion.older "2.5" if Git.BuildVersion.older "2.5"
then B.length b `seq` return () then B.length b `seq` return ()
else liftIO $ hClose stdin else liftIO $ hClose stdin
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its -- Optimization when the file is already annexed
-- backend does not change. -- and is unmodified.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey case oldkey of
-- Can't restage associated files because git add Nothing -> ingest oldkey
-- runs this and has the index locked. Just ko -> ifM (isUnmodifiedCheap ko file)
let norestage = Restage False ( liftIO $ emitPointer ko
liftIO . emitPointer , ingest oldkey
=<< postingest )
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
, liftIO $ B.hPut stdout b , liftIO $ B.hPut stdout b
) )
ingest oldkey = do
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
postingest (Just k, _) = do postingest (Just k, _) = do
logStatus k InfoPresent logStatus k InfoPresent
return k return k
@ -151,3 +165,19 @@ getMoveRaceRecovery k file = void $ tryNonAsync $
populatePointerFile (Restage False) k obj file >>= \case populatePointerFile (Restage False) k obj file >>= \case
Nothing -> return () Nothing -> return ()
Just ic -> Database.Keys.addInodeCaches k [ic] Just ic -> Database.Keys.addInodeCaches k [ic]
update :: CommandStart
update = do
updateSmudged (Restage True)
stop
updateSmudged :: Restage -> Annex ()
updateSmudged restage = streamSmudged $ \k topf -> do
f <- fromRepo $ fromTopFilePath topf
whenM (inAnnex k) $ do
obj <- calcRepo (gitAnnexLocation k)
unlessM (isJust <$> populatePointerFile restage k obj f) $
liftIO (isPointerFile f) >>= \case
Just k' | k' == k -> toplevelWarning False $
"unable to populate worktree file " ++ f
_ -> noop

View file

@ -2,26 +2,59 @@
- -
- Copyright 2018 Joey Hess <id@joeyh.name> - Copyright 2018 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
module Logs.File where module Logs.File (writeLogFile, appendLogFile, streamLogFile) where
import Annex.Common import Annex.Common
import Annex.Perms import Annex.Perms
import Annex.LockFile
import qualified Git
import Utility.Tmp import Utility.Tmp
-- | Writes content to a file, replacing the file atomically, and -- | Writes content to a file, replacing the file atomically, and
-- making the new file have whatever permissions the git repository is -- making the new file have whatever permissions the git repository is
-- configured to use. Creates the parent directory when necessary. -- configured to use. Creates the parent directory when necessary.
writeLogFile :: FilePath -> String -> Annex () writeLogFile :: FilePath -> String -> Annex ()
writeLogFile f c = go `catchNonAsync` \_e -> do writeLogFile f c = createDirWhenNeeded f $ viaTmp writelog f c
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
go
where where
go = viaTmp writelog f c
writelog f' c' = do writelog f' c' = do
liftIO $ writeFile f' c' liftIO $ writeFile f' c'
setAnnexFilePerm f' setAnnexFilePerm f'
-- | Appends a line to a log file, first locking it to prevent
-- concurrent writers.
appendLogFile :: FilePath -> (Git.Repo -> FilePath) -> String -> Annex ()
appendLogFile f lck c = createDirWhenNeeded f $ withExclusiveLock lck $ do
liftIO $ withFile f AppendMode $ \h -> hPutStrLn h c
setAnnexFilePerm f
-- | Streams lines from a log file, and then empties the file at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Does nothing if the log file does not exist.
--
-- Locking is used to prevent writes to to the log file while this
-- is running.
streamLogFile :: FilePath -> (Git.Repo -> FilePath) -> (String -> Annex ()) -> Annex ()
streamLogFile f lck a = withExclusiveLock lck $ bracketOnError setup cleanup go
where
setup = liftIO $ tryWhenExists $ openFile f ReadMode
cleanup Nothing = noop
cleanup (Just h) = liftIO $ hClose h
go Nothing = noop
go (Just h) = do
mapM_ a =<< liftIO (lines <$> hGetContents h)
liftIO $ hClose h
liftIO $ writeFile f ""
setAnnexFilePerm f
createDirWhenNeeded :: FilePath -> Annex () -> Annex ()
createDirWhenNeeded f a = a `catchNonAsync` \_e -> do
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
a

40
Logs/Smudge.hs Normal file
View file

@ -0,0 +1,40 @@
{- git-annex smudge log file
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.Smudge where
import Annex.Common
import Git.FilePath
import Logs.File
-- | Log a smudged file.
smudgeLog :: Key -> TopFilePath -> Annex ()
smudgeLog k f = do
logf <- fromRepo gitAnnexSmudgeLog
appendLogFile logf gitAnnexSmudgeLock $
key2file k ++ " " ++ getTopFilePath f
-- | Streams all smudged files, and then empties the log at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Locking is used to prevent new items being added to the log while this
-- is running.
streamSmudged :: (Key -> TopFilePath -> Annex ()) -> Annex ()
streamSmudged a = do
logf <- fromRepo gitAnnexSmudgeLog
streamLogFile logf gitAnnexSmudgeLock $ \l ->
case parse l of
Nothing -> noop
Just (k, f) -> a k f
where
parse l =
let (ks, f) = separate (== ' ') l
in do
k <- file2key ks
return (k, asTopFilePath f)

View file

@ -17,6 +17,7 @@ import Annex.Direct
import Annex.Content import Annex.Content
import Annex.CatFile import Annex.CatFile
import Annex.WorkTree import Annex.WorkTree
import Annex.Hook
import qualified Database.Keys import qualified Database.Keys
import qualified Annex.Content.Direct as Direct import qualified Annex.Content.Direct as Direct
import qualified Git import qualified Git
@ -63,6 +64,9 @@ upgrade automatic = do
- adjust branch. Instead, update HEAD manually. -} - adjust branch. Instead, update HEAD manually. -}
inRepo $ setHeadRef b inRepo $ setHeadRef b
configureSmudgeFilter configureSmudgeFilter
unlessM isBareRepo $ do
hookWrite postCheckoutHook
hookWrite postMergeHook
-- Inode sentinal file was only used in direct mode and when -- Inode sentinal file was only used in direct mode and when
-- locking down files as they were added. In v6, it's used more -- locking down files as they were added. In v6, it's used more
-- extensively, so make sure it exists, since old repos that didn't -- extensively, so make sure it exists, since old repos that didn't

View file

@ -6,11 +6,13 @@ git-annex smudge - git filter driver for git-annex
git annex smudge [--clean] file git annex smudge [--clean] file
git annex smudge --update
# DESCRIPTION # DESCRIPTION
This command lets git-annex be used as a git filter driver which lets This command lets git-annex be used as a git filter driver which lets
annexed files in the git repository to be unlocked at all times, instead annexed files in the git repository to be unlocked, instead
of being symlinks. of being symlinks, and lets `git add` store files in the annex.
When adding a file with `git add`, the annex.largefiles config is When adding a file with `git add`, the annex.largefiles config is
consulted to decide if a given file should be added to git as-is, consulted to decide if a given file should be added to git as-is,
@ -32,6 +34,16 @@ contents:
* filter=annex * filter=annex
.* !filter .* !filter
The smudge filter does not provide git with the content of annexed files,
because that would be slow and triggers memory leaks in git. Instead,
it records which worktree files need to be updated, and
`git annex smudge --update` later updates the work tree to contain
the content. That is run by several git hooks, including post-checkout
and post-merge. However, a few git commands, notably `git stash` and
`git cherry-pick`, do not run any hooks, so after using those commands
you can manually run `git annex smudge --update` to update the working
tree.
# SEE ALSO # SEE ALSO
[[git-annex]](1) [[git-annex]](1)

View file

@ -1032,11 +1032,6 @@ Here are all the supported configuration settings.
After setting (or unsetting) this, you should run `git annex fix` to After setting (or unsetting) this, you should run `git annex fix` to
fix up the annexed files in the work tree to be hard links (or copies). fix up the annexed files in the work tree to be hard links (or copies).
Note that `annex.thin` is not honored when git updates an annexed file
in the working tree. So when `git checkout` or `git merge` updates the
working tree, a second copy of annexed files will result. You can run
`git-annex fix` to fix up the hard links after running such git commands.
* `annex.delayadd` * `annex.delayadd`
Makes the watch and assistant commands delay for the specified number of Makes the watch and assistant commands delay for the specified number of

View file

@ -880,6 +880,7 @@ Executable git-annex
Logs.Schedule Logs.Schedule
Logs.SingleValue Logs.SingleValue
Logs.SingleValue.Pure Logs.SingleValue.Pure
Logs.Smudge
Logs.TimeStamp Logs.TimeStamp
Logs.Transfer Logs.Transfer
Logs.Transitions Logs.Transitions