From 48e9e129618bcceba5d76ce68a25e046cd72ea89 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 14 Aug 2018 14:22:23 -0400 Subject: [PATCH] finally fixed v6 get/drop git status After updating the worktree for an add/drop, update git's index, so git status will not show the files as modified. What actually happens is that the index update removes the inode information from the index. The next git status (or similar) run then has to do some work. It runs the clean filter. So, this depends on the clean filter being reasonably fast and on git not leaking memory when running it. Both problems were fixed in a96972015dd76271b46432151e15d5d38d7151ff, but only for git 2.5. Anyone using an older git will see very expensive git status after an add/drop. This uses the same git update-index queue as other parts of git-annex, so the actual index update is fairly efficient. Of course, updating the index does still have some overhead. The annex.queuesize config will control how often the index gets updated when working on a lot of files. This is an imperfect workaround... Added several todos about new problems this workaround causes. Still, this seems a lot better than the old behavior. This commit was supported by the NSF-funded DataLad project. --- Annex/Content.hs | 37 +++++++---- Annex/Ingest.hs | 29 ++++----- CHANGELOG | 2 + Command/Smudge.hs | 5 +- ..._c2380f19248abf98928743fabd88ed05._comment | 41 ------------ doc/todo/smudge.mdwn | 64 +++++++++---------- 6 files changed, 76 insertions(+), 102 deletions(-) delete mode 100644 doc/todo/Long_Running_Filter_Process/comment_2_c2380f19248abf98928743fabd88ed05._comment diff --git a/Annex/Content.hs b/Annex/Content.hs index 8011a82305..76a5454d32 100644 --- a/Annex/Content.hs +++ b/Annex/Content.hs @@ -24,6 +24,7 @@ module Annex.Content ( checkDiskSpace, needMoreDiskSpace, moveAnnex, + Restage(..), populatePointerFile, linkToAnnex, linkFromAnnex, @@ -545,7 +546,7 @@ moveAnnex key src = ifM (checkSecureHashes key) fs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key unless (null fs) $ do - mapM_ (populatePointerFile key dest) fs + mapM_ (populatePointerFile (Restage True) key dest) fs Database.Keys.storeInodeCaches key (dest:fs) ) storeindirect = storeobject =<< calcRepo (gitAnnexLocation key) @@ -586,14 +587,23 @@ checkSecureHashes key , return True ) -populatePointerFile :: Key -> FilePath -> FilePath -> Annex () -populatePointerFile k obj f = go =<< liftIO (isPointerFile f) +newtype Restage = Restage Bool + +{- Populates a pointer file with the content of a key. -} +populatePointerFile :: Restage -> Key -> FilePath -> FilePath -> Annex () +populatePointerFile (Restage restage) k obj f = go =<< liftIO (isPointerFile f) where go (Just k') | k == k' = do destmode <- liftIO $ catchMaybeIO $ fileMode <$> getFileStatus f liftIO $ nukeFile f ifM (linkOrCopy k obj f destmode) - ( thawContent f + ( do + thawContent f + -- The pointer file is re-staged, + -- so git won't think it's been modified. + when restage $ do + pointersha <- hashPointerFile k + stagePointerFile f destmode pointersha , liftIO $ writePointerFile f k destmode ) go _ = return () @@ -816,12 +826,6 @@ cleanObjectLoc key cleaner = do <=< catchMaybeIO $ removeDirectory dir {- Removes a key's file from .git/annex/objects/ - - - - When a key has associated pointer files, they are checked for - - modifications, and if unmodified, are reset. - - - - In direct mode, deletes the associated files or files, and replaces - - them with symlinks. -} removeAnnex :: ContentRemovalLock -> Annex () removeAnnex (ContentRemovalLock key) = withObjectLoc key remove removedirect @@ -834,22 +838,33 @@ removeAnnex (ContentRemovalLock key) = withObjectLoc key remove removedirect =<< Database.Keys.getAssociatedFiles key Database.Keys.removeInodeCaches key Direct.removeInodeCache key + + -- Check associated pointer file for modifications, and reset if + -- it's unmodified. resetpointer file = ifM (isUnmodified key file) ( do mode <- liftIO $ catchMaybeIO $ fileMode <$> getFileStatus file secureErase file liftIO $ nukeFile file liftIO $ writePointerFile file key mode - -- Can't delete the pointer file. + -- Re-stage the pointer, so git won't think it's + -- been modified. + pointersha <- hashPointerFile key + stagePointerFile file mode pointersha + -- Modified file, so leave it alone. -- If it was a hard link to the annex object, -- that object might have been frozen as part of the -- removal process, so thaw it. , void $ tryIO $ thawContent file ) + + -- In direct mode, deletes the associated files or files, and replaces + -- them with symlinks. removedirect fs = do cache <- Direct.recordedInodeCache key Direct.removeInodeCache key mapM_ (resetfile cache) fs + resetfile cache f = whenM (Direct.sameInodeCache f cache) $ do l <- calcRepo $ gitAnnexLink f key secureErase f diff --git a/Annex/Ingest.hs b/Annex/Ingest.hs index 1bc0815602..aa556b3711 100644 --- a/Annex/Ingest.hs +++ b/Annex/Ingest.hs @@ -140,14 +140,13 @@ ingestAdd' ld@(Just (LockedDown cfg source)) mk = do return (Just k) {- Ingests a locked down file into the annex. Does not update the working - - tree or the index. - -} + - tree or the index. -} ingest :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache) -ingest = ingest' Nothing +ingest ld mk = ingest' Nothing ld mk (Restage True) -ingest' :: Maybe Backend -> Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache) -ingest' _ Nothing _ = return (Nothing, Nothing) -ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delta -> do +ingest' :: Maybe Backend -> Maybe LockedDown -> Maybe Key -> Restage -> Annex (Maybe Key, Maybe InodeCache) +ingest' _ Nothing _ _ = return (Nothing, Nothing) +ingest' preferredbackend (Just (LockedDown cfg source)) mk restage = withTSDelta $ \delta -> do k <- case mk of Nothing -> do backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend @@ -172,7 +171,7 @@ ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delt golocked key mcache s = tryNonAsync (moveAnnex key $ contentLocation source) >>= \case Right True -> do - populateAssociatedFiles key source + populateAssociatedFiles key source restage success key mcache s Right False -> giveup "failed to add content to annex" Left e -> restoreFile (keyFilename source) key e @@ -186,7 +185,7 @@ ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delt linkToAnnex key (keyFilename source) (Just cache) >>= \case LinkAnnexFailed -> failure "failed to link to annex" _ -> do - finishIngestUnlocked' key source + finishIngestUnlocked' key source restage success key (Just cache) s gounlocked _ _ _ = failure "failed statting file" @@ -218,23 +217,23 @@ finishIngestDirect key source = do finishIngestUnlocked :: Key -> KeySource -> Annex () finishIngestUnlocked key source = do cleanCruft source - finishIngestUnlocked' key source + finishIngestUnlocked' key source (Restage True) -finishIngestUnlocked' :: Key -> KeySource -> Annex () -finishIngestUnlocked' key source = do +finishIngestUnlocked' :: Key -> KeySource -> Restage -> Annex () +finishIngestUnlocked' key source restage = do Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath (keyFilename source)) - populateAssociatedFiles key source + populateAssociatedFiles key source restage {- Copy to any other locations using the same key. -} -populateAssociatedFiles :: Key -> KeySource -> Annex () -populateAssociatedFiles key source = do +populateAssociatedFiles :: Key -> KeySource -> Restage -> Annex () +populateAssociatedFiles key source restage = do obj <- calcRepo (gitAnnexLocation key) g <- Annex.gitRepo ingestedf <- flip fromTopFilePath g <$> inRepo (toTopFilePath (keyFilename source)) afs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key forM_ (filter (/= ingestedf) afs) $ - populatePointerFile key obj + populatePointerFile restage key obj cleanCruft :: KeySource -> Annex () cleanCruft source = when (contentLocation source /= keyFilename source) $ diff --git a/CHANGELOG b/CHANGELOG index 7e12578c9a..67fdc8b22a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,6 +9,8 @@ git-annex (6.20180808) UNRELEASED; urgency=medium * v6 add: Take advantage of improved SIGPIPE handler in git 2.5 to speed up the clean filter by not reading the file content from the pipe. This also avoids git buffering the whole file content in memory. + * v6: After updating the worktree for an add/drop, update git's index, + so git status will not show the files as modified. -- Joey Hess Wed, 08 Aug 2018 11:24:08 -0400 diff --git a/Command/Smudge.hs b/Command/Smudge.hs index cfd327c8d0..c2e74c0768 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -97,7 +97,7 @@ clean file = do <$> catKeyFile file liftIO . emitPointer =<< go - =<< (\ld -> ingest' currbackend ld Nothing) + =<< (\ld -> ingest' currbackend ld Nothing norestage) =<< lockDown cfg file , liftIO $ B.hPut stdout b ) @@ -111,6 +111,9 @@ clean file = do { lockingFile = False , hardlinkFileTmp = False } + -- Can't restage associated files because git add runs this and has + -- the index locked. + norestage = Restage False shouldAnnex :: FilePath -> Annex Bool shouldAnnex file = do diff --git a/doc/todo/Long_Running_Filter_Process/comment_2_c2380f19248abf98928743fabd88ed05._comment b/doc/todo/Long_Running_Filter_Process/comment_2_c2380f19248abf98928743fabd88ed05._comment deleted file mode 100644 index 56cb1ab67e..0000000000 --- a/doc/todo/Long_Running_Filter_Process/comment_2_c2380f19248abf98928743fabd88ed05._comment +++ /dev/null @@ -1,41 +0,0 @@ -[[!comment format=mdwn - username="joey" - subject="""comment 2""" - date="2018-08-09T20:18:46Z" - content=""" -One of v6's big problems is that dropping or getting an annexed file -updates the file in the working tree, which makes git status think -the file is modified, even though the clean filter will output -the same pointer as before. Runing `git add` to clear it up is quite -expensive since the large file content has to be read. -Maybe a long-running filter process could avoid this problem. - ----- - -If git can be coaxed somehow into re-running the smudge filter, -git-annex could provide the new worktree content to git via it, -and let git update the working tree. - -Git would make a copy, which git-annex currently does, so the only -added overhead would be sending the file content to git down the pipe. -(Well and it won't use reflink for the copy on COW filesystems.) - -annex.thin is a complication, but it could be handled by hard linking the -work tree file that git writes back into the annex, overwriting the file that -was there. (This approach could also make git checkout of a branch honor -annex.thin.) - -How to make git re-run the smudge filter? It needs to want to update the -working tree. One way is to touch the worktree files and then run -`git checkout`. Although this risks losing modifications the user made -to the files so would need to be done with care. - -That seems like it would defer working tree updates until the git-annex -get command was done processing all files. Sometimes I want to use a -file while the same get command is still running for other files. -It might work to use the "delay" capability of the filter process -interface. Get git to re-smudge all affected files, and when it -asks for content for each, send "delayed". Then as git-annex gets -each file, respond to git's "list_available_blobs" with a single blob, -which git should request and use to update the working tree. -"""]] diff --git a/doc/todo/smudge.mdwn b/doc/todo/smudge.mdwn index e3df6b10d6..29746c6f9c 100644 --- a/doc/todo/smudge.mdwn +++ b/doc/todo/smudge.mdwn @@ -12,39 +12,39 @@ git-annex should use smudge/clean filters. # because it doesn't know it has that name # git commit clears up this mess -* Dropping a smudged file causes git status (and git annex status) - to show it as modified, because the timestamp has changed. - Getting a smudged file can also cause this. - Upgrading a direct mode repo also leaves files in this state. - User can use `git add` to clear it up, but better to avoid this, - by updating stat info in the index. +* If an annexed file's content is not present, and its pointer file + is copied to a new name and added, it does not get added as an + associated file. (If the content is present, it does get added.) - May need to use libgit2 to do this efficiently, cannot find - any plumbing except git-update-index, which is very inneficient for - smudged files; updating a file feeds its whole content through the clean - filter again. +* If an unlocked file's content is not present, and a new file with + identical content is added with `git add`, the unlocked file is + populated, but git-annex is unable to update the index, so git status + will say that it has been modified. - Part of the problem is that the clean filter needs to consume the whole - of stdin. (And git has to write the whole file content to stdout from the - file it mmaps). A more efficient smudge/clean interface that let the filter - read the file itself would let git-annex short-circuit when the file it's - cleaning is one it already knows about. I've proposed extending git with - such an interface: - +* If an annexed file is deleted in the index, and another annexed file + uses the same key, and git annex get/drop is run, the index update + that's done to prevent status showing the file as modified adds + the deleted file back to the index. - And developed a patch set: [[git-patches]] +* Also, if the user is getting files, and modifying files at the same + time, and they stage their modifications, the modification may get + unstaged in a race when a file is got and the updated worktree file + staged in the index. + + I don't know if this is worth worrying about, + because there's also of course a race where the modification to the + worktree file may get reverted when git-annex updates the content. Those + races are much smaller, but do exist. - > Thanks to [[!commit a96972015dd76271b46432151e15d5d38d7151ff]], - > the clean filter is now very quick, so, this can be fixed by running - > git update-index with files affected by get/drop. - > - > In case a file's content quickly changes after get/drop, git - > update-index would add the new content. To avoid this, use - > `git update-index --index-info`. The next run of `git status` - > then runs the clean filter, and will detect if the file has gotten - > modified after the get/drop. TODO +* get/drop operations on unlocked files lead to an update of the index. + Only one process can update the index at one time, so eg, git annex get + at the same time as a git commit may display a ugly warning + (or the git commit could fail to start if run at just the right time). -* Use git's new `filter..process` interface, which will + Two git-annex get processes can also try to update the index at the + same time and encounter this problem (git annex get -J is ok). + +* Potentially: Use git's new `filter..process` interface, which will let only 1 git-annex process be started by git when processing multiple files, and so should be faster. @@ -66,12 +66,8 @@ git-annex should use smudge/clean filters. git-annex adjust and git-annex sync could both use that internally when checking out the adjusted branch, and merging a branch into HEAD. - Since this approach modifies work tree files, it again causes git status - to think files are modified. So, the above todo item about that needs to - be sorted out first; it would not do for git annex adjust to cause - the whole work tree to be considered to be modified! - - My enhanced smudge/clean patch set also fixes this problem. + (My enhanced smudge/clean patch set also fixed this problem, in a much + nicer way...) * When git runs the smudge filter, it buffers all its output in ram before writing it to a file. So, checking out a branch with a large v6 unlocked files