Merge branch 'delaysmudge' into v7

This commit is contained in:
Joey Hess 2018-10-25 17:01:14 -04:00
commit 6ceeb4bae7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
14 changed files with 236 additions and 60 deletions

View file

@ -46,6 +46,7 @@ module Annex.Content (
staleKeysPrune,
pruneTmpWorkDirBefore,
isUnmodified,
isUnmodifiedCheap,
verifyKeyContent,
VerifyConfig(..),
Verification(..),
@ -746,25 +747,38 @@ isUnmodified :: Key -> FilePath -> Annex Bool
isUnmodified key f = go =<< geti
where
go Nothing = return False
go (Just fc) = cheapcheck fc <||> expensivecheck fc
cheapcheck fc = anyM (compareInodeCaches fc)
=<< Database.Keys.getInodeCaches key
go (Just fc) = isUnmodifiedCheap' key fc <||> expensivecheck fc
expensivecheck fc = ifM (verifyKeyContent RetrievalAllKeysSecure AlwaysVerify UnVerified key f)
( do
liftIO $ print "content verified"
-- The file could have been modified while it was
-- being verified. Detect that.
ifM (geti >>= maybe (return False) (compareInodeCaches fc))
( do
-- Update the InodeCache to avoid
-- performing this expensive check again.
liftIO $ print "update inode cache"
Database.Keys.addInodeCaches key [fc]
return True
, return False
)
, return False
, do
liftIO $ print "content not verified"
return False
)
geti = withTSDelta (liftIO . genInodeCache f)
{- Cheap check if a file contains the unmodified content of the key,
- only checking the InodeCache of the key.
-}
isUnmodifiedCheap :: Key -> FilePath -> Annex Bool
isUnmodifiedCheap key f = maybe (return False) (isUnmodifiedCheap' key)
=<< withTSDelta (liftIO . genInodeCache f)
isUnmodifiedCheap' :: Key -> InodeCache -> Annex Bool
isUnmodifiedCheap' key fc =
anyM (compareInodeCaches fc) =<< Database.Keys.getInodeCaches key
{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and
- returns the file it was moved to. -}
moveBad :: Key -> Annex FilePath

View file

@ -32,6 +32,17 @@ postReceiveHook = Git.Hook "post-receive"
[ mkHookScript "git annex post-receive"
]
postCheckoutHook :: Git.Hook
postCheckoutHook = Git.Hook "post-checkout" smudgeHook []
postMergeHook :: Git.Hook
postMergeHook = Git.Hook "post-merge" smudgeHook []
-- Only run git-annex smudge --update when git-annex supports it.
-- Older versions of git-annex didn't need this hook.
smudgeHook :: String
smudgeHook = mkHookScript "if git annex smudge --update >/dev/null 2>&1; then git-annex smudge --update; fi"
preCommitAnnexHook :: Git.Hook
preCommitAnnexHook = Git.Hook "pre-commit-annex" "" []

View file

@ -38,7 +38,6 @@ import Logs.Location
import qualified Annex
import qualified Annex.Queue
import qualified Database.Keys
import qualified Git.Branch
import Config
import Utility.InodeCache
import Annex.ReplaceFile

View file

@ -112,6 +112,9 @@ initialize' ai mversion = checkCanInitialize ai $ do
whenM versionSupportsUnlockedPointers $ do
configureSmudgeFilter
scanUnlockedFiles
unlessM isBareRepo $ do
hookWrite postCheckoutHook
hookWrite postMergeHook
checkAdjustedClone >>= \case
NeedUpgradeForAdjustedClone ->
void $ upgrade True versionForAdjustedClone

View file

@ -38,6 +38,8 @@ module Annex.Locations (
gitAnnexFsckDbDir,
gitAnnexFsckDbLock,
gitAnnexFsckResultsLog,
gitAnnexSmudgeLog,
gitAnnexSmudgeLock,
gitAnnexExportDbDir,
gitAnnexExportLock,
gitAnnexScheduleState,
@ -312,6 +314,14 @@ gitAnnexFsckDbLock u r = gitAnnexFsckDir u r </> "fsck.lck"
gitAnnexFsckResultsLog :: UUID -> Git.Repo -> FilePath
gitAnnexFsckResultsLog u r = gitAnnexDir r </> "fsckresults" </> fromUUID u
{- .git/annex/smudge.log is used to log smudges worktree files that need to
- be updated. -}
gitAnnexSmudgeLog :: Git.Repo -> FilePath
gitAnnexSmudgeLog r = gitAnnexDir r </> "smudge.log"
gitAnnexSmudgeLock :: Git.Repo -> FilePath
gitAnnexSmudgeLock r = gitAnnexDir r </> "smudge.lck"
{- .git/annex/export/uuid/ is used to store information about
- exports to special remotes. -}
gitAnnexExportDir :: UUID -> Git.Repo -> FilePath

View file

@ -21,6 +21,19 @@ git-annex (6.20181012) UNRELEASED; urgency=medium
get confused about whether a locked file's content was present.
* Fix concurrency bug that occurred on the first download from an
exporttree remote.
* smudge: The smudge filter no longer provides git with annexed
file content, to avoid a git memory leak, and because that did not
honor annex.thin. Now git annex smudge --update has to be run
after a checkout to update unlocked files in the working tree
with annexed file contents.
* init, upgrade: Install git post-checkout and post-merge hooks that run
git annex smudge --update.
* precommit: Run git annex smudge --update, because the post-merge
hook is not run when there is a merge conflict. So the work tree will
be updated when a commit is made to resolve the merge conflict.
* Note that git has no hooks run after git stash or git cherry-pick,
so the user will have to manually run git annex smudge --update
after such commands.
-- Joey Hess <id@joeyh.name> Sat, 13 Oct 2018 00:52:02 -0400

View file

@ -13,6 +13,7 @@ import Command
import Config
import qualified Command.Add
import qualified Command.Fix
import qualified Command.Smudge
import Annex.Direct
import Annex.Hook
import Annex.Link
@ -54,11 +55,21 @@ seek ps = lockPreCommitHook $ ifM isDirect
flip withFilesToBeCommitted l $ \f -> commandAction $
maybe stop (Command.Fix.start Command.Fix.FixSymlinks f)
=<< isAnnexLink f
-- inject unlocked files into the annex
-- (not needed when repo version uses
-- unlocked pointer files)
unlessM versionSupportsUnlockedPointers $
withFilesOldUnlockedToBeCommitted (commandAction . startInjectUnlocked) l
ifM versionSupportsUnlockedPointers
-- after a merge conflict or git
-- cherry-pick or stash, pointer
-- files in the worktree won't
-- be populated, so populate them
-- here
( Command.Smudge.updateSmudged
-- When there's a false index,
-- restaging the files won't work.
. Restage =<< liftIO Git.haveFalseIndex
-- inject unlocked files into the annex
-- (not needed when repo version uses
-- unlocked pointer files)
, withFilesOldUnlockedToBeCommitted (commandAction . startInjectUnlocked) l
)
)
runAnnexHook preCommitAnnexHook
-- committing changes to a view updates metadata

View file

@ -8,12 +8,12 @@
module Command.Smudge where
import Command
import qualified Annex
import Annex.Content
import Annex.Link
import Annex.FileMatcher
import Annex.Ingest
import Annex.CatFile
import Logs.Smudge
import Logs.Location
import qualified Database.Keys
import qualified Git.BuildVersion
@ -29,43 +29,46 @@ cmd = noCommit $ noMessages $
"git smudge filter"
paramFile (seek <$$> optParser)
data SmudgeOptions = SmudgeOptions
data SmudgeOptions = UpdateOption | SmudgeOptions
{ smudgeFile :: FilePath
, cleanOption :: Bool
}
optParser :: CmdParamsDesc -> Parser SmudgeOptions
optParser desc = SmudgeOptions
<$> argument str ( metavar desc )
<*> switch ( long "clean" <> help "clean filter" )
optParser desc = smudgeoptions <|> updateoption
where
smudgeoptions = SmudgeOptions
<$> argument str ( metavar desc )
<*> switch ( long "clean" <> help "clean filter" )
updateoption = flag' UpdateOption
( long "update" <> help "populate annexed worktree files" )
seek :: SmudgeOptions -> CommandSeek
seek o = commandAction $
(if cleanOption o then clean else smudge) (smudgeFile o)
seek (SmudgeOptions f False) = commandAction (smudge f)
seek (SmudgeOptions f True) = commandAction (clean f)
seek UpdateOption = commandAction update
-- Smudge filter is fed git file content, and if it's a pointer to an
-- available annex object, should output its content.
-- available annex object, git expects it to output its content.
--
-- However, this does not do that. It outputs the pointer, and records
-- the filename in the smudge log. Git hooks run after commands like checkout
-- then run git annex smudge --update which populates the work tree files
-- with annex content. This is done for several reasons:
--
-- * To support annex.thin
-- * Because git currently buffers the whole object received from the
-- smudge filter in memory, which is a problem with large files.
smudge :: FilePath -> CommandStart
smudge file = do
b <- liftIO $ B.hGetContents stdin
case parseLinkOrPointer b of
Nothing -> liftIO $ B.putStr b
Nothing -> noop
Just k -> do
Database.Keys.addAssociatedFile k =<< inRepo (toTopFilePath file)
-- A previous unlocked checkout of the file may have
-- led to the annex object getting modified;
-- don't provide such modified content as it
-- will be confusing. inAnnex will detect such
-- modifications.
ifM (inAnnex k)
( do
content <- calcRepo (gitAnnexLocation k)
whenM (annexThin <$> Annex.getGitConfig) $
warning $ "Not able to honor annex.thin when git is checking out " ++ file ++ " (run git annex fix to re-thin files)"
liftIO $ B.putStr . fromMaybe b
=<< catchMaybeIO (B.readFile content)
, liftIO $ B.putStr b
)
topfile <- inRepo (toTopFilePath file)
Database.Keys.addAssociatedFile k topfile
void $ smudgeLog k topfile
liftIO $ B.putStr b
stop
-- Clean filter is fed file content on stdin, decides if a file
@ -92,19 +95,30 @@ clean file = do
if Git.BuildVersion.older "2.5"
then B.length b `seq` return ()
else liftIO $ hClose stdin
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
-- Optimization when the file is already annexed
-- and is unmodified.
case oldkey of
Nothing -> ingest oldkey
Just ko -> ifM (isUnmodifiedCheap ko file)
( liftIO $ emitPointer ko
, ingest oldkey
)
, liftIO $ B.hPut stdout b
)
ingest oldkey = do
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
postingest (Just k, _) = do
logStatus k InfoPresent
@ -151,3 +165,19 @@ getMoveRaceRecovery k file = void $ tryNonAsync $
populatePointerFile (Restage False) k obj file >>= \case
Nothing -> return ()
Just ic -> Database.Keys.addInodeCaches k [ic]
update :: CommandStart
update = do
updateSmudged (Restage True)
stop
updateSmudged :: Restage -> Annex ()
updateSmudged restage = streamSmudged $ \k topf -> do
f <- fromRepo $ fromTopFilePath topf
whenM (inAnnex k) $ do
obj <- calcRepo (gitAnnexLocation k)
unlessM (isJust <$> populatePointerFile restage k obj f) $
liftIO (isPointerFile f) >>= \case
Just k' | k' == k -> toplevelWarning False $
"unable to populate worktree file " ++ f
_ -> noop

View file

@ -2,26 +2,59 @@
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.File where
module Logs.File (writeLogFile, appendLogFile, streamLogFile) where
import Annex.Common
import Annex.Perms
import Annex.LockFile
import qualified Git
import Utility.Tmp
-- | Writes content to a file, replacing the file atomically, and
-- making the new file have whatever permissions the git repository is
-- configured to use. Creates the parent directory when necessary.
writeLogFile :: FilePath -> String -> Annex ()
writeLogFile f c = go `catchNonAsync` \_e -> do
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
go
writeLogFile f c = createDirWhenNeeded f $ viaTmp writelog f c
where
go = viaTmp writelog f c
writelog f' c' = do
liftIO $ writeFile f' c'
setAnnexFilePerm f'
-- | Appends a line to a log file, first locking it to prevent
-- concurrent writers.
appendLogFile :: FilePath -> (Git.Repo -> FilePath) -> String -> Annex ()
appendLogFile f lck c = createDirWhenNeeded f $ withExclusiveLock lck $ do
liftIO $ withFile f AppendMode $ \h -> hPutStrLn h c
setAnnexFilePerm f
-- | Streams lines from a log file, and then empties the file at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Does nothing if the log file does not exist.
--
-- Locking is used to prevent writes to to the log file while this
-- is running.
streamLogFile :: FilePath -> (Git.Repo -> FilePath) -> (String -> Annex ()) -> Annex ()
streamLogFile f lck a = withExclusiveLock lck $ bracketOnError setup cleanup go
where
setup = liftIO $ tryWhenExists $ openFile f ReadMode
cleanup Nothing = noop
cleanup (Just h) = liftIO $ hClose h
go Nothing = noop
go (Just h) = do
mapM_ a =<< liftIO (lines <$> hGetContents h)
liftIO $ hClose h
liftIO $ writeFile f ""
setAnnexFilePerm f
createDirWhenNeeded :: FilePath -> Annex () -> Annex ()
createDirWhenNeeded f a = a `catchNonAsync` \_e -> do
-- Most of the time, the directory will exist, so this is only
-- done if writing the file fails.
createAnnexDirectory (parentDir f)
a

40
Logs/Smudge.hs Normal file
View file

@ -0,0 +1,40 @@
{- git-annex smudge log file
-
- Copyright 2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.Smudge where
import Annex.Common
import Git.FilePath
import Logs.File
-- | Log a smudged file.
smudgeLog :: Key -> TopFilePath -> Annex ()
smudgeLog k f = do
logf <- fromRepo gitAnnexSmudgeLog
appendLogFile logf gitAnnexSmudgeLock $
key2file k ++ " " ++ getTopFilePath f
-- | Streams all smudged files, and then empties the log at the end.
--
-- If the action is interrupted or throws an exception, the log file is
-- left unchanged.
--
-- Locking is used to prevent new items being added to the log while this
-- is running.
streamSmudged :: (Key -> TopFilePath -> Annex ()) -> Annex ()
streamSmudged a = do
logf <- fromRepo gitAnnexSmudgeLog
streamLogFile logf gitAnnexSmudgeLock $ \l ->
case parse l of
Nothing -> noop
Just (k, f) -> a k f
where
parse l =
let (ks, f) = separate (== ' ') l
in do
k <- file2key ks
return (k, asTopFilePath f)

View file

@ -17,6 +17,7 @@ import Annex.Direct
import Annex.Content
import Annex.CatFile
import Annex.WorkTree
import Annex.Hook
import qualified Database.Keys
import qualified Annex.Content.Direct as Direct
import qualified Git
@ -63,6 +64,9 @@ upgrade automatic = do
- adjust branch. Instead, update HEAD manually. -}
inRepo $ setHeadRef b
configureSmudgeFilter
unlessM isBareRepo $ do
hookWrite postCheckoutHook
hookWrite postMergeHook
-- Inode sentinal file was only used in direct mode and when
-- locking down files as they were added. In v6, it's used more
-- extensively, so make sure it exists, since old repos that didn't

View file

@ -6,11 +6,13 @@ git-annex smudge - git filter driver for git-annex
git annex smudge [--clean] file
git annex smudge --update
# DESCRIPTION
This command lets git-annex be used as a git filter driver which lets
annexed files in the git repository to be unlocked at all times, instead
of being symlinks.
annexed files in the git repository to be unlocked, instead
of being symlinks, and lets `git add` store files in the annex.
When adding a file with `git add`, the annex.largefiles config is
consulted to decide if a given file should be added to git as-is,
@ -32,6 +34,16 @@ contents:
* filter=annex
.* !filter
The smudge filter does not provide git with the content of annexed files,
because that would be slow and triggers memory leaks in git. Instead,
it records which worktree files need to be updated, and
`git annex smudge --update` later updates the work tree to contain
the content. That is run by several git hooks, including post-checkout
and post-merge. However, a few git commands, notably `git stash` and
`git cherry-pick`, do not run any hooks, so after using those commands
you can manually run `git annex smudge --update` to update the working
tree.
# SEE ALSO
[[git-annex]](1)

View file

@ -1032,11 +1032,6 @@ Here are all the supported configuration settings.
After setting (or unsetting) this, you should run `git annex fix` to
fix up the annexed files in the work tree to be hard links (or copies).
Note that `annex.thin` is not honored when git updates an annexed file
in the working tree. So when `git checkout` or `git merge` updates the
working tree, a second copy of annexed files will result. You can run
`git-annex fix` to fix up the hard links after running such git commands.
* `annex.delayadd`
Makes the watch and assistant commands delay for the specified number of

View file

@ -880,6 +880,7 @@ Executable git-annex
Logs.Schedule
Logs.SingleValue
Logs.SingleValue.Pure
Logs.Smudge
Logs.TimeStamp
Logs.Transfer
Logs.Transitions