Assistant, repair: Fix ignoring of git fsck errors due to duplicate file entries in tree objects.

Joey Hess 2016-10-31 14:00:37 -04:00
parent 632e9c93f0
commit 2ad7b00e29
No known key found for this signature in database
GPG key ID: C910D9222512E3C7
3 changed files with 91 additions and 19 deletions

@@ -1,3 +1,10 @@
git-annex (6.20161028) UNRELEASED; urgency=medium
  * Assistant, repair: Fix ignoring of git fsck errors due to
    duplicate file entries in tree objects.
 -- Joey Hess <id@joeyh.name>  Mon, 31 Oct 2016 13:59:47 -0400
git-annex (6.20161027) unstable; urgency=medium
  * lock, smudge: Fix edge cases where data loss could occur in v6 mode

@@ -5,6 +5,8 @@
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE BangPatterns #-}
module Git.Fsck (
	FsckResults(..),
	MissingObjects,
@@ -25,8 +27,6 @@ import qualified Git.Version
import qualified Data.Set as S
import Control.Concurrent.Async
type MissingObjects = S.Set Sha
data FsckResults
	= FsckFoundMissing
		{ missingObjects :: MissingObjects
@@ -35,6 +35,25 @@ data FsckResults
	| FsckFailed
	deriving (Show)
data FsckOutput
	= FsckOutput MissingObjects Truncated
	| NoFsckOutput
	| AllDuplicateEntriesWarning
type MissingObjects = S.Set Sha
type Truncated = Bool
instance Monoid FsckOutput where
	mempty = NoFsckOutput
	mappend (FsckOutput s1 t1) (FsckOutput s2 t2) = FsckOutput (S.union s1 s2) (t1 || t2)
	mappend (FsckOutput s t) _ = FsckOutput s t
	mappend _ (FsckOutput s t) = FsckOutput s t
	mappend NoFsckOutput NoFsckOutput = NoFsckOutput
	mappend AllDuplicateEntriesWarning AllDuplicateEntriesWarning = AllDuplicateEntriesWarning
	mappend AllDuplicateEntriesWarning NoFsckOutput = AllDuplicateEntriesWarning
	mappend NoFsckOutput AllDuplicateEntriesWarning = AllDuplicateEntriesWarning
{- Runs fsck to find some of the broken objects in the repository.
- May not find all broken objects, if fsck fails on bad data in some of
- the broken objects it does find.
@@ -58,18 +77,24 @@ findBroken batchmode r = do
		{ std_out = CreatePipe
		, std_err = CreatePipe
		}
	(bad1, bad2) <- concurrently
		(readMissingObjs maxobjs r supportsNoDangling (stdoutHandle p))
		(readMissingObjs maxobjs r supportsNoDangling (stderrHandle p))
	(o1, o2) <- concurrently
		(parseFsckOutput maxobjs r supportsNoDangling (stdoutHandle p))
		(parseFsckOutput maxobjs r supportsNoDangling (stderrHandle p))
	fsckok <- checkSuccessProcess pid
	let truncated = S.size bad1 == maxobjs || S.size bad1 == maxobjs
	let badobjs = S.union bad1 bad2
	if S.null badobjs && not fsckok
		then return FsckFailed
		else return $ FsckFoundMissing badobjs truncated
	case mappend o1 o2 of
		FsckOutput badobjs truncated
			| S.null badobjs && not fsckok -> return FsckFailed
			| otherwise -> return $ FsckFoundMissing badobjs truncated
		NoFsckOutput
			| not fsckok -> return FsckFailed
			| otherwise -> return noproblem
		-- If all fsck output was duplicateEntries warnings,
		-- the repository is not broken, it just has some unusual
		-- tree objects in it. So ignore nonzero exit status.
		AllDuplicateEntriesWarning -> return noproblem
  where
	maxobjs = 10000
	noproblem = FsckFoundMissing S.empty False
foundBroken :: FsckResults -> Bool
foundBroken FsckFailed = True
@@ -87,10 +112,18 @@ knownMissing (FsckFoundMissing s _) = s
findMissing :: [Sha] -> Repo -> IO MissingObjects
findMissing objs r = S.fromList <$> filterM (`isMissing` r) objs
readMissingObjs :: Int -> Repo -> Bool -> Handle -> IO MissingObjects
readMissingObjs maxobjs r supportsNoDangling h = do
	objs <- take maxobjs . findShas supportsNoDangling <$> hGetContents h
	findMissing objs r
parseFsckOutput :: Int -> Repo -> Bool -> Handle -> IO FsckOutput
parseFsckOutput maxobjs r supportsNoDangling h = do
	ls <- lines <$> hGetContents h
	if null ls
		then return NoFsckOutput
		else if all ("duplicateEntries" `isInfixOf`) ls
			then return AllDuplicateEntriesWarning
			else do
				let shas = findShas supportsNoDangling ls
				let !truncated = length shas > maxobjs
				missingobjs <- findMissing (take maxobjs shas) r
				return $ FsckOutput missingobjs truncated
isMissing :: Sha -> Repo -> IO Bool
isMissing s r = either (const True) (const False) <$> tryIO dump
@@ -100,12 +133,10 @@ isMissing s r = either (const True) (const False) <$> tryIO dump
		, Param (fromRef s)
		] r
findShas :: Bool -> String -> [Sha]
findShas supportsNoDangling = catMaybes . map extractSha . concat . map words . filter wanted . lines
findShas :: Bool -> [String] -> [Sha]
findShas supportsNoDangling = catMaybes . map extractSha . concat . map words . filter wanted
  where
	wanted l
		-- Skip lines like "error in tree <sha>: duplicateEntries: contains duplicate file entries"
		| "duplicateEntries" `isInfixOf` l = False
		| supportsNoDangling = True
		| otherwise = not ("dangling " `isPrefixOf` l)

@@ -0,0 +1,34 @@
[[!comment format=mdwn
username="joey"
subject="""comment 1"""
date="2016-10-31T17:07:43Z"
content="""
This was supposed to be dealt with in version 6.20161027. But I see now
that the parser I thought was triggering on those missing object lines
was not really doing so (`extractSha` does not match in this case because
of the colon after the sha).
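
As a rough illustration of that parse failure (this is a stand-in sha
check, for illustration only, not git-annex's actual `extractSha`): the
word split out of an "error in tree &lt;sha&gt;: duplicateEntries: ..." line
still carries the trailing colon, so a strict 40-hex-character check
rejects it.

    import Data.Char (isHexDigit)

    -- Stand-in sha check: exactly 40 hex characters, nothing more.
    looksLikeSha :: String -> Bool
    looksLikeSha s = length s == 40 && all isHexDigit s

    main :: IO ()
    main = do
        print (looksLikeSha (replicate 40 'a'))         -- True
        print (looksLikeSha (replicate 40 'a' ++ ":"))  -- False; the colon sticks to the word
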
Instead, the problem seems to be that `git fsck` is exiting nonzero. So
git-annex assumes that fsck is failing without printing out any shas,
which is a condition that calls for repairs.
About all I can think to do is, if fsck outputs "duplicateEntries" and
no other lines at all, and exits nonzero, treat this as a success.
This risks ignoring other reasons fsck might exit nonzero, but hopefully
it would output something else in such a case. I've implemented this.
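
A minimal sketch of that decision logic, with made-up names for
illustration (the real change is the `FsckOutput`/`parseFsckOutput`
handling in the patch above, which also extracts the missing-object shas;
this only shows the exit-status question): a nonzero exit is treated as
breakage only when the output is not purely duplicateEntries warnings.

    import Data.List (isInfixOf)

    data Verdict = NeedsRepair | NotBroken
        deriving (Show, Eq)

    -- fsckok: did git fsck exit zero?
    -- ls: every line fsck printed on stdout and stderr.
    verdict :: Bool -> [String] -> Verdict
    verdict fsckok ls
        | null ls = if fsckok then NotBroken else NeedsRepair
        | all ("duplicateEntries" `isInfixOf`) ls = NotBroken
        | otherwise = NeedsRepair  -- simplified: the real code goes on to check which shas are missing
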
---
I am interested in getting at the root cause of the problem of
duplicate directory entries. It seems pretty likely to result from using
adjusted branches.
It would be useful to get more information about the trees that fsck
is warning about; are they part of existing or past adjusted branches
or not? Are they merge commits?
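
One way to start gathering that information, as a rough sketch (it assumes
you already have one of the tree shas from the fsck warnings;
`git cat-file -p` on a tree prints one entry per line with the file name
after a tab, so duplicated names can be listed directly):

    import Data.List (group, sort)
    import System.Process (readProcess)

    -- List the entry names that occur more than once in a tree object.
    duplicateNames :: String -> IO [String]
    duplicateNames treesha = do
        out <- readProcess "git" ["cat-file", "-p", treesha] ""
        let names = map (drop 1 . dropWhile (/= '\t')) (lines out)
        return (map head (filter ((> 1) . length) (group (sort names))))

Which refs reach those trees (and so whether adjusted branches are
involved) would still need to be checked separately.
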
(Previously: [[forum/how_to_disaster_recovery]])
Leaving this bug open since we really need to get at the root cause of
the problem.
"""]]