From d2c48404a8774a136dd30a95958cbe7047a04704 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 30 Jun 2021 17:57:49 -0400 Subject: [PATCH] assistant: Avoid unncessary git repository repair In a situation where git fsck gets confused about a commit that is made while it's running. Sponsored-by: Graham Spencer on Patreon --- Assistant/Threads/Cronner.hs | 4 +-- Assistant/Threads/ProblemFixer.hs | 2 +- CHANGELOG | 7 +++++ Git/Fsck.hs | 20 +++++++++++--- Git/Repair.hs | 10 +++---- doc/bugs/assistant_repair_misfires.mdwn | 36 +++++++++++++++++++++---- 6 files changed, 62 insertions(+), 17 deletions(-) diff --git a/Assistant/Threads/Cronner.hs b/Assistant/Threads/Cronner.hs index a13acabc73..9bd173869d 100644 --- a/Assistant/Threads/Cronner.hs +++ b/Assistant/Threads/Cronner.hs @@ -185,7 +185,7 @@ runActivity' urlrenderer (ScheduledSelfFsck _ d) = do g <- liftAnnex gitRepo fsckresults <- showFscking urlrenderer Nothing $ tryNonAsync $ do void $ batchCommand program (Param "fsck" : annexFsckParams d) - Git.Fsck.findBroken True g + Git.Fsck.findBroken True False g u <- liftAnnex getUUID void $ repairWhenNecessary urlrenderer u Nothing fsckresults mapM_ reget =<< liftAnnex (dirKeys gitAnnexBadDir) @@ -214,7 +214,7 @@ runActivity' urlrenderer (ScheduledRemoteFsck u s d) = dispatch =<< liftAnnex (r fsckresults <- showFscking urlrenderer (Just rmt) $ tryNonAsync $ do void annexfscker if Git.repoIsLocal repo && not (Git.repoIsLocalUnknown repo) - then Just <$> Git.Fsck.findBroken True repo + then Just <$> Git.Fsck.findBroken True True repo else pure Nothing maybe noop (void . repairWhenNecessary urlrenderer u (Just rmt)) fsckresults diff --git a/Assistant/Threads/ProblemFixer.hs b/Assistant/Threads/ProblemFixer.hs index 5146932b25..0c29044dec 100644 --- a/Assistant/Threads/ProblemFixer.hs +++ b/Assistant/Threads/ProblemFixer.hs @@ -60,7 +60,7 @@ handleRemoteProblem' repo urlrenderer rmt ( do fixedlocks <- repairStaleGitLocks repo fsckresults <- showFscking urlrenderer (Just rmt) $ tryNonAsync $ - Git.Fsck.findBroken True repo + Git.Fsck.findBroken True True repo repaired <- repairWhenNecessary urlrenderer (Remote.uuid rmt) (Just rmt) fsckresults return $ fixedlocks || repaired , return False diff --git a/CHANGELOG b/CHANGELOG index cfbda6e29a..65bfcdc446 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +git-annex (8.20210631) UNRELEASED; urgency=medium + + * assistant: Avoid unncessary git repository repair in a situation where + git fsck gets confused about a commit that is made while it's running. + + -- Joey Hess Wed, 30 Jun 2021 17:55:10 -0400 + git-annex (8.20210630) upstream; urgency=medium * Fixed bug that interrupting git-annex repair (or assistant) while diff --git a/Git/Fsck.hs b/Git/Fsck.hs index 7440b92272..de11e89af4 100644 --- a/Git/Fsck.hs +++ b/Git/Fsck.hs @@ -1,4 +1,5 @@ {- git fsck interface +i it is not fully repoducibleI repeated the same steps - - Copyright 2013 Joey Hess - @@ -69,9 +70,17 @@ instance Monoid FsckOutput where - look for anything in its output (both stdout and stderr) that appears - to be a git sha. Not all such shas are of broken objects, so ask git - to try to cat the object, and see if it fails. + - + - Note that there is a possible false positive: When changes are being + - made to the repo while this is running, fsck might complain about a + - missing object that has not made it to disk yet. Catting the object + - then succeeds, so it's not included in the FsckResults. But, fsck then + - exits nonzero, and so FsckFailed is returned. Set ignorenonzeroexit + - to avoid this false positive, at the risk of perhaps missing a problem + - so bad that fsck crashes without outputting any missing shas. -} -findBroken :: Bool -> Repo -> IO FsckResults -findBroken batchmode r = do +findBroken :: Bool -> Bool -> Repo -> IO FsckResults +findBroken batchmode ignorenonzeroexit r = do let (command, params) = ("git", fsckParams r) (command', params') <- if batchmode then toBatchCommand (command, params) @@ -90,10 +99,10 @@ findBroken batchmode r = do fsckok <- checkSuccessProcess pid case mappend o1 o2 of FsckOutput badobjs truncated - | S.null badobjs && not fsckok -> return FsckFailed + | S.null badobjs && not fsckok -> return fsckfailed | otherwise -> return $ FsckFoundMissing badobjs truncated NoFsckOutput - | not fsckok -> return FsckFailed + | not fsckok -> return fsckfailed | otherwise -> return noproblem -- If all fsck output was duplicateEntries warnings, -- the repository is not broken, it just has some @@ -104,6 +113,9 @@ findBroken batchmode r = do maxobjs = 10000 noproblem = FsckFoundMissing S.empty False + fsckfailed + | ignorenonzeroexit = noproblem + | otherwise = FsckFailed foundBroken :: FsckResults -> Bool foundBroken FsckFailed = True diff --git a/Git/Repair.hs b/Git/Repair.hs index 144c96fd60..d221fa6335 100644 --- a/Git/Repair.hs +++ b/Git/Repair.hs @@ -476,7 +476,7 @@ runRepair :: (Ref -> Bool) -> Bool -> Repo -> IO (Bool, [Branch]) runRepair removablebranch forced g = do preRepair g putStrLn "Running git fsck ..." - fsckresult <- findBroken False g + fsckresult <- findBroken False False g if foundBroken fsckresult then do putStrLn "Fsck found problems, attempting repair." @@ -500,7 +500,7 @@ runRepairOf fsckresult removablebranch forced referencerepo g = do runRepair' :: (Ref -> Bool) -> FsckResults -> Bool -> Maybe FilePath -> Repo -> IO (Bool, [Branch]) runRepair' removablebranch fsckresult forced referencerepo g = do cleanCorruptObjects fsckresult g - missing <- findBroken False g + missing <- findBroken False False g stillmissing <- retrieveMissingObjects missing referencerepo g case stillmissing of FsckFoundMissing s t @@ -529,7 +529,7 @@ runRepair' removablebranch fsckresult forced referencerepo g = do | forced -> ifM (pure (repoIsLocalBare g) <||> checkIndex g) ( do cleanCorruptObjects FsckFailed g - stillmissing' <- findBroken False g + stillmissing' <- findBroken False False g case stillmissing' of FsckFailed -> return (False, []) FsckFoundMissing s t -> forcerepair s t @@ -575,7 +575,7 @@ runRepair' removablebranch fsckresult forced referencerepo g = do -- the repair process. if fscktruncated then do - fsckresult' <- findBroken False g + fsckresult' <- findBroken False False g case fsckresult' of FsckFailed -> do putStrLn "git fsck is failing" @@ -597,7 +597,7 @@ runRepair' removablebranch fsckresult forced referencerepo g = do removeWhenExistsWith R.removeLink (indexFile g) -- The corrupted index can prevent fsck from finding other -- problems, so re-run repair. - fsckresult' <- findBroken False g + fsckresult' <- findBroken False False g result <- runRepairOf fsckresult' removablebranch forced referencerepo g putStrLn "Removed the corrupted index file. You should look at what files are present in your working tree and git add them back to the index when appropriate." return result diff --git a/doc/bugs/assistant_repair_misfires.mdwn b/doc/bugs/assistant_repair_misfires.mdwn index 6d24adf888..21dfb19be8 100644 --- a/doc/bugs/assistant_repair_misfires.mdwn +++ b/doc/bugs/assistant_repair_misfires.mdwn @@ -14,13 +14,39 @@ repair. git fsck reported no problems at all. --[[Joey]] > the fsck exited nonzero for some reason, but did not detect > any misssing shas. > -> Reproed on my own laptop, with the family annex. However, -> it is not fully repoducible, I repeated the same steps -> again and it didn't happen. Those were: Clone; +> Reproed on my own laptop, with the family annex. This reproduces it about +> 50% of the time: Clone over ssh; git-annex init; > git remote rm origin; git annex schedule here 'fsck self 30m every day at -> any time'; git annex assistant +> any time'; git annex assistant; kill git-annex fsck process +> +> Confirmed that it's getting FsckFailed. > > Hypothesis: Maybe fsck is failing due to some other change -> that is being made to the git repo at the same time it's running? +> that is being made to the git repo by the assistant +> at the same time it's running? > I noticed some files being downloaded from the web at the same > time the failed fsck was running. +> +> Fsck output to stdout is empty, stderr is: +> +> missing commit 4da14c19140e4c240358af4518d83661713ab044 +> +> Intriguingly, that commit is present. It is a commit on the git-annex +> branch. And fscking again succeeds. So, fsck found a reference to a +> commit object that had not yet been written to disk. This feels like a +> bug in git, because if it were interrupted there the repo would be left +> in a bad state. +> +> Anyway, git-annex verifies that the commit is present, to double-check +> it understood fsck correctly. And it is. So it is not considered a +> problem. But, fsck still exits nonzero because it thinks there was a +> problem. And that's the problem. +> +> Fixed by making git-annex assistant ignore fsck nonzero +> exit status when it does not find any missing objects. +> Since any actual failure that makes fsck do that can't +> be distinguished from a false positive. I left git-annex repair +> unchanged, because if the user knows the repo is badly broken and explictly +> runs it, they would be surprised if it didn't repair. +> +> [[fixed|done]] --[[Joey]]