From 293f95c2d6f63ce9725866f7b353a25448e308f6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 26 Dec 2019 15:05:36 -0400 Subject: [PATCH] analysis --- Command/Smudge.hs | 27 +++++- ..._ea82bf47752807531870651653160f1c._comment | 97 +++++++++++++++++++ 2 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_1_ea82bf47752807531870651653160f1c._comment diff --git a/Command/Smudge.hs b/Command/Smudge.hs index d8f6c08454..01141fb042 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -24,9 +24,12 @@ import Backend import Utility.Metered import Annex.InodeSentinal import Utility.InodeCache +import Utility.ThreadScheduler import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L +import Data.Time.Clock +import Data.Time.Clock.POSIX cmd :: Command cmd = noCommit $ noMessages $ @@ -88,11 +91,14 @@ clean file = do Just k -> do getMoveRaceRecovery k (toRawFilePath file) liftIO $ L.hPut stdout b - Nothing -> go b =<< catKeyFile (toRawFilePath file) + Nothing -> do + ic <- withTSDelta (liftIO . genInodeCache (toRawFilePath file)) + go ic b =<< catKeyFile (toRawFilePath file) + gitMtimeWorkaround ic ) stop where - go b oldkey = ifM (shouldAnnex file oldkey) + go ic b oldkey = ifM (shouldAnnex file ic oldkey) ( do -- Before git 2.5, failing to consume all stdin here -- would cause a SIGPIPE and crash it. @@ -160,8 +166,8 @@ clean file = do -- annexed content before, annex it. This handles cases such as renaming an -- unlocked annexed file followed by git add, which the user naturally -- expects to behave the same as git mv. -shouldAnnex :: FilePath -> Maybe Key -> Annex Bool -shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) +shouldAnnex :: FilePath -> Maybe InodeCache -> Maybe Key -> Annex Bool +shouldAnnex file mic moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) ( checkmatcher checkheuristics , checkheuristics ) @@ -174,7 +180,7 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) Just _ -> return True Nothing -> checkknowninode - checkknowninode = withTSDelta (liftIO . genInodeCache (toRawFilePath file)) >>= \case + checkknowninode = case mic of Nothing -> pure False Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus @@ -213,3 +219,14 @@ updateSmudged restage = streamSmudged $ \k topf -> do Just k' | k' == k -> toplevelWarning False $ "unable to populate worktree file " ++ fromRawFilePath f _ -> noop + +-- If the mtime of a worktree file is too close to the mtime of the index +-- file, git will distrust the index. Which results in it running +-- the clean filter again later. While that's not normally a problem, +-- +gitMtimeWorkaround :: Maybe InodeCache -> Annex () +gitMtimeWorkaround Nothing = noop +gitMtimeWorkaround (Just ic) = liftIO $ do + now <- getCurrentTime + when (abs (diffUTCTime (posixSecondsToUTCTime (inodeCacheToMtime ic)) now) < 1) $ do + threadDelaySeconds (Seconds 1) diff --git a/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_1_ea82bf47752807531870651653160f1c._comment b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_1_ea82bf47752807531870651653160f1c._comment new file mode 100644 index 0000000000..6f93c91ffa --- /dev/null +++ b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_1_ea82bf47752807531870651653160f1c._comment @@ -0,0 +1,97 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2019-12-26T16:56:38Z" + content=""" +The title makes it sound like a work tree file gets replaced with a +dangling pointer file, which is not the case. A worktree file that was +not annexed is is being added to the annex, if you choose to commit that +state. + +For whatever reason, git becomes confused about whether this file is +modified. I seem to recall that git distrusts information it recorded in +its own index if the mtime of the index file is too close to the +mtime recorded inside it, or something like that. (Likely as a +workaround for mtime granularity issues with various filesystems.) Whatever +the reason, git-annex is not involved in it; it will happen sometimes even +when git-annex has not initialized the repo and is not being used. + +It's not normally a problem that git gets confused or distrusts its +index or whatever, since all it does is stat the file, or +feed it through the clean filter again, and if the file is not +modified, nothing changes. + +Why does the clean filter decide to add the file to annex in this case? + +Well, because this is all happening inside this: + + git -c annex.largefiles=anything annex add -- file-annex + +And there you've told it to add all files to the annex with +annex.largefiles=anything. So it does. + +To complete the description of what happens: +`git-annex add` runs `git add` on the `file-annex` symlink it's adding. +`git add file-annex`, for whatever reason, decides to run the clean filter on +file-git. +The annex.largefiles=anything gets inherited through this chain of calls. + +While the resulting "change" does not get staged by `git add` +(it was never asked to operate on that file), the clean filter +duly ingests the content into the annex, and remembers its inode. +So when the clean filter later gets run by `git status`, it sees an inode +it knows it saw before, and assumes it should remain annexed. +(This is why the commit that checks for known inodes was fingered by the +bisection.) + +--- + +Note that, you can accomplish the same thing without setting +annex.largefiles, assuming a current version of git-annex: + + git add file-git + git annex add file-annex + +I think the only reason for setting annex.largefiles in either of the two +places you did is if there's a default value that you want to +temporarily override? + +---- + +Also, just touching file-git before the annex.largefiles=anything +operation causes the same problem, again git-annex add runs git add +file-annex, which runs the clean filter on file-git, which this time +is legitimately modified. + +--- + +Possible ways to improve this short of improving git's behavior: + +`git annex` could set annex.gitaddtoannex=false when it runs `git add`. +Since git-annex never relies on `git add` adding files to the annex, +that seems entirely safe to always do (perhaps even when running all git +commands aside from git-annex commands of course). But, that would +not help with a variant where rather than `git-annex add`, +this is run: + + git -c annex.largefiles=anything add file-annex + +The clean filter could delay long enough that git stops distrusting +its index based on timestamps. A 1 second sleep if the file's mtime +is too close to the current time works; I prototyped a patch doing that. +But, that does not deal with the case +mentioned above where file-git gets touched or legitimately modified. + +The clean filter could check if the file is already +in the index but is not annexed, and avoid converting it to annexed. +But that would prevent legitimate conversions from git to annexed +as well, which rely on the same kind of use of annex.largefiles. + +Temporary overrides of annex.largefiles could be ignored by the clean +filter. Same problem as previous. + +So, I think that fixing this will involve adding a new interface for +converting between git and annexed files that does not involve +-c annex.largefiles. That plus having the clean filter check for +non-annexed files seems like the best approach. +"""]]