git-recover-repository 1/2 done

This commit is contained in:
Joey Hess 2013-10-20 17:50:51 -04:00
parent f482de1b76
commit 4f871f89ba
16 changed files with 431 additions and 43 deletions

11
.gitignore vendored
View file

@ -1,3 +1,7 @@
tags
Setup
*.hi
*.o
tmp
test
build-stamp
@ -9,7 +13,10 @@ Build/OSXMkLibs
git-annex
git-annex.1
git-annex-shell.1
git-union-merge
git-union-merge.1
git-recover-repository
git-recover-repository.1
doc/.ikiwiki
html
*.tix
@ -22,7 +29,3 @@ cabal-dev
# OSX related
.DS_Store
.virthualenv
tags
Setup
*.hi
*.o

View file

@ -43,7 +43,7 @@ catTree ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catTree h ref
catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha))
catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha, ObjectType))
catObjectDetails ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObjectDetails h ref

View file

@ -300,7 +300,7 @@ addLink file link mk = do
liftAnnex $ do
v <- catObjectDetails $ Ref $ ':':file
case v of
Just (currlink, sha)
Just (currlink, sha, _type)
| s2w8 link == L.unpack currlink ->
stageSymlink file sha
_ -> stageSymlink file =<< hashSymlink link

View file

@ -96,7 +96,7 @@ commit message branch parentrefs repo = do
pipeReadStrict [Param "write-tree"] repo
sha <- getSha "commit-tree" $ pipeWriteRead
(map Param $ ["commit-tree", show tree] ++ ps)
message repo
(Just $ flip hPutStr message) repo
run [Param "update-ref", Param $ show branch, Param $ show sha] repo
return sha
where

View file

@ -8,6 +8,7 @@
module Git.CatFile (
CatFileHandle,
catFileStart,
catFileStart',
catFileStop,
catFile,
catTree,
@ -18,8 +19,7 @@ module Git.CatFile (
import System.IO
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.Char
import System.Process (std_out, std_err)
import Data.Tuple.Utils
import Numeric
import System.Posix.Types
@ -30,13 +30,15 @@ import Git.Command
import Git.Types
import Git.FilePath
import qualified Utility.CoProcess as CoProcess
import Utility.Hash
data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo
catFileStart :: Repo -> IO CatFileHandle
catFileStart repo = do
coprocess <- CoProcess.rawMode =<< gitCoProcessStart True
catFileStart = catFileStart' True
catFileStart' :: Bool -> Repo -> IO CatFileHandle
catFileStart' restartable repo = do
coprocess <- CoProcess.rawMode =<< gitCoProcessStart restartable
[ Param "cat-file"
, Param "--batch"
] repo
@ -53,11 +55,10 @@ catFile h branch file = catObject h $ Ref $
{- Uses a running git cat-file read the content of an object.
- Objects that do not exist will have "" returned. -}
catObject :: CatFileHandle -> Ref -> IO L.ByteString
catObject h object = maybe L.empty fst <$> catObjectDetails h object
catObject h object = maybe L.empty fst3 <$> catObjectDetails h object
{- Gets both the content of an object, and its Sha. -}
catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha))
catObjectDetails (CatFileHandle hdl repo) object = CoProcess.query hdl send receive
catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, ObjectType))
catObjectDetails (CatFileHandle hdl _) object = CoProcess.query hdl send receive
where
query = show object
send to = hPutStrLn to query
@ -65,19 +66,18 @@ catObjectDetails (CatFileHandle hdl repo) object = CoProcess.query hdl send rece
header <- hGetLine from
case words header of
[sha, objtype, size]
| length sha == shaSize &&
isJust (readObjectType objtype) ->
case reads size of
[(bytes, "")] -> readcontent bytes from sha
| length sha == shaSize ->
case (readObjectType objtype, reads size) of
(Just t, [(bytes, "")]) -> readcontent t bytes from sha
_ -> dne
| otherwise -> dne
_
| header == show object ++ " missing" -> dne
| otherwise -> error $ "unknown response from git cat-file " ++ show (header, object)
readcontent bytes from sha = do
readcontent objtype bytes from sha = do
content <- S.hGet from bytes
eatchar '\n' from
return $ Just (L.fromChunks [content], Ref sha)
return $ Just (L.fromChunks [content], Ref sha, objtype)
dne = return Nothing
eatchar expected from = do
c <- hGetChar from
@ -88,8 +88,8 @@ catObjectDetails (CatFileHandle hdl repo) object = CoProcess.query hdl send rece
catTree :: CatFileHandle -> Ref -> IO [(FilePath, FileMode)]
catTree h treeref = go <$> catObjectDetails h treeref
where
go Nothing = []
go (Just (b, _)) = parsetree [] b
go (Just (b, _, TreeObject)) = parsetree [] b
go _ = []
parsetree c b = case L.break (== 0) b of
(modefile, rest)

View file

@ -1,6 +1,6 @@
{- running git commands
-
- Copyright 2010-2012 Joey Hess <joey@kitenet.net>
- Copyright 2010-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -85,13 +85,13 @@ pipeReadStrict params repo = assertLocal repo $
where
p = gitCreateProcess params repo
{- Runs a git command, feeding it input, and returning its output,
{- Runs a git command, feeding it an input, and returning its output,
- which is expected to be fairly small, since it's all read into memory
- strictly. -}
pipeWriteRead :: [CommandParam] -> String -> Repo -> IO String
pipeWriteRead params s repo = assertLocal repo $
pipeWriteRead :: [CommandParam] -> Maybe (Handle -> IO ()) -> Repo -> IO String
pipeWriteRead params writer repo = assertLocal repo $
writeReadProcessEnv "git" (toCommand $ gitCommandLine params repo)
(gitEnv repo) s (Just adjusthandle)
(gitEnv repo) writer (Just adjusthandle)
where
adjusthandle h = do
fileEncoding h

66
Git/Fsck.hs Normal file
View file

@ -0,0 +1,66 @@
{- git fsck interface
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Git.Fsck (
findBroken,
findMissing
) where
import Common
import Git
import Git.Command
import Git.Sha
import Git.CatFile
import qualified Data.Set as S
{- Runs fsck to find some of the broken objects in the repository.
- May not find all broken objects, if fsck fails on bad data in some of
- the broken objects it does find. If the fsck fails generally without
- finding any broken objects, returns Nothing.
-
- Strategy: Rather than parsing fsck's current specific output,
- look for anything in its output (both stdout and stderr) that appears
- to be a git sha. Not all such shas are of broken objects, so ask git
- to try to cat the object, and see if it fails.
-}
findBroken :: Repo -> IO (Maybe (S.Set Sha))
findBroken r = do
(output, fsckok) <- processTranscript "git" (toCommand $ fsckParams r) Nothing
let objs = parseFsckOutput output
badobjs <- findMissing objs r
if S.null badobjs && not fsckok
then return Nothing
else return $ Just badobjs
{- Finds objects that are missing from the git repsitory, or are corrupt.
- Note that catting a corrupt object will cause cat-file to crash. -}
findMissing :: [Sha] -> Repo -> IO (S.Set Sha)
findMissing objs r = go objs [] =<< start
where
start = catFileStart' False r
go [] c h = do
catFileStop h
return $ S.fromList c
go (o:os) c h = do
v <- tryIO $ isNothing <$> catObjectDetails h o
case v of
Left _ -> do
void $ tryIO $ catFileStop h
go os (o:c) =<< start
Right True -> go os (o:c) h
Right False -> go os c h
parseFsckOutput :: String -> [Sha]
parseFsckOutput = catMaybes . map extractSha . concat . map words . lines
fsckParams :: Repo -> [CommandParam]
fsckParams = gitCommandLine
[ Param "fsck"
, Param "--no-dangling"
, Param "--no-reflogs"
]

View file

@ -36,8 +36,11 @@ hashFile h file = CoProcess.query h send receive
{- Injects some content into git, returning its Sha. -}
hashObject :: ObjectType -> String -> Repo -> IO Sha
hashObject objtype content repo = getSha subcmd $
pipeWriteRead (map Param params) content repo
hashObject objtype content = hashObject' objtype (flip hPutStr content)
hashObject' :: ObjectType -> (Handle -> IO ()) -> Repo -> IO Sha
hashObject' objtype writer repo = getSha subcmd $
pipeWriteRead (map Param params) (Just writer) repo
where
subcmd = "hash-object"
params = [subcmd, "-t", show objtype, "-w", "--stdin", "--no-filters"]

29
Git/Objects.hs Normal file
View file

@ -0,0 +1,29 @@
{- .git/objects
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Git.Objects where
import Common
import Git
objectsDir :: Repo -> FilePath
objectsDir r = localGitDir r </> "objects"
packDir :: Repo -> FilePath
packDir r = objectsDir r </> "pack"
listPackFiles :: Repo -> IO [FilePath]
listPackFiles r = filter (".pack" `isSuffixOf`)
<$> catchDefaultIO [] (dirContents $ packDir r)
packIdxFile :: FilePath -> FilePath
packIdxFile = flip replaceExtension "idx"
looseObjectFile :: Repo -> Sha -> FilePath
looseObjectFile r sha = objectsDir r </> prefix </> rest
where
(prefix, rest) = splitAt 2 (show sha)

171
Git/RecoverRepository.hs Normal file
View file

@ -0,0 +1,171 @@
{- git repository recovery
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Git.RecoverRepository (
cleanCorruptObjects,
retrieveMissingObjects,
resetLocalBranches,
removeTrackingBranches,
) where
import Common
import Git
import Git.Command
import Git.Fsck
import Git.Objects
import Git.HashObject
import Git.Types
import qualified Git.Config
import qualified Git.Construct
import Utility.Tmp
import Utility.Monad
import Utility.Rsync
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import System.Log.Logger
{- Finds and removes corrupt objects from the repository, returning a list
- of all such objects, which need to be found elsewhere to finish
- recovery.
-
- Strategy: Run git fsck, remove objects it identifies as corrupt,
- and repeat until git fsck finds no new objects.
-
- To remove corrupt objects, unpack all packs, and remove the packs
- (to handle corrupt packs), and remove loose object files.
-}
cleanCorruptObjects :: Repo -> IO (S.Set Sha)
cleanCorruptObjects r = do
notice "Running git fsck ..."
check =<< findBroken r
where
check Nothing = do
notice "git fsck found a problem but no specific broken objects. Perhaps a corrupt pack file? Unpacking all pack files."
explodePacks r
retry S.empty
check (Just bad)
| S.null bad = return S.empty
| otherwise = do
notice $ unwords
[ "git fsck found"
, show (S.size bad)
, "broken objects. Unpacking all pack files."
]
explodePacks r
removeLoose r bad
retry bad
retry oldbad = do
notice "Re-running git fsck to see if it finds more problems."
v <- findBroken r
case v of
Nothing -> error $ unwords
[ "git fsck found a problem, which was not corrected after removing"
, show (S.size oldbad)
, "corrupt objects."
]
Just newbad -> do
removeLoose r newbad
let s = S.union oldbad newbad
if s == oldbad
then return s
else retry s
removeLoose :: Repo -> S.Set Sha -> IO ()
removeLoose r s = do
let fs = map (looseObjectFile r) (S.toList s)
count <- length <$> filterM doesFileExist fs
when (count > 0) $ do
notice $ unwords
[ "removing"
, show count
, "corrupt loose objects"
]
mapM_ nukeFile fs
explodePacks :: Repo -> IO ()
explodePacks r = mapM_ go =<< listPackFiles r
where
go packfile = do
-- May fail, if pack file is corrupt.
void $ tryIO $
pipeWrite [Param "unpack-objects"] r $ \h ->
L.hPut h =<< L.readFile packfile
nukeFile packfile
nukeFile $ packIdxFile packfile
{- Try to retrieve a set of missing objects, from the remotes of a
- repository. Returns any that could not be retreived.
-}
retrieveMissingObjects :: S.Set Sha -> Repo -> IO (S.Set Sha)
retrieveMissingObjects missing r
| S.null missing = return missing
| otherwise = withTmpDir "tmprepo" $ \tmpdir -> do
unlessM (boolSystem "git" [Params "init", File tmpdir]) $
error $ "failed to create temp repository in " ++ tmpdir
tmpr <- Git.Config.read =<< Git.Construct.fromAbsPath tmpdir
stillmissing <- pullremotes tmpr (remotes r) fetchrefstags missing
if S.null stillmissing
then return stillmissing
else pullremotes tmpr (remotes r) fetchallrefs stillmissing
where
pullremotes tmpr [] _ stillmissing = return stillmissing
pullremotes tmpr (rmt:rmts) fetchrefs s
| S.null s = return s
| otherwise = do
notice $ "Trying to recover missing objects from remote " ++ repoDescribe rmt
ifM (fetchsome rmt fetchrefs tmpr)
( do
void $ copyObjects tmpr r
stillmissing <- findMissing (S.toList s) r
pullremotes tmpr rmts fetchrefs stillmissing
, do
notice $ unwords
[ "failed to fetch from remote"
, repoDescribe rmt
, "(will continue without it, but making this remote available may improve recovery)"
]
pullremotes tmpr rmts fetchrefs s
)
fetchsome rmt ps = runBool $
[ Param "fetch"
, Param (repoLocation rmt)
, Params "--force --update-head-ok --quiet"
] ++ ps
-- fetch refs and tags
fetchrefstags = [ Param "+refs/heads/*:refs/heads/*", Param "--tags"]
-- Fetch all available refs (more likely to fail,
-- as the remote may have refs it refuses to send).
fetchallrefs = [ Param "+*:*" ]
{- Copies all objects from the src repository to the dest repository.
- This is done using rsync, so it copies all missing object, and all
- objects they rely on. -}
copyObjects :: Repo -> Repo -> IO Bool
copyObjects srcr destr = rsync
[ Param "-qr"
, File $ addTrailingPathSeparator $ objectsDir srcr
, File $ addTrailingPathSeparator $ objectsDir destr
]
{- To deal with missing objects that cannot be recovered, resets any
- local branches to point to an old commit before the missing
- objects.
-}
resetLocalBranches :: S.Set Sha -> Repo -> IO [Branch]
resetLocalBranches missing r = do
error "TODO"
{- To deal with missing objects that cannot be recovered, removes
- any remote tracking branches that reference them.
-}
removeTrackingBranches :: S.Set Sha -> Repo -> IO [Branch]
removeTrackingBranches missing r = do
error "TODO"
notice :: String -> IO ()
notice = noticeM "RecoverRepository"

View file

@ -27,8 +27,15 @@ git-annex.1: doc/git-annex.mdwn
git-annex-shell.1: doc/git-annex-shell.mdwn
./Build/mdwn2man git-annex-shell 1 doc/git-annex-shell.mdwn > git-annex-shell.1
# These are not built normally.
git-union-merge.1: doc/git-union-merge.mdwn
./Build/mdwn2man git-union-merge 1 doc/git-union-merge.mdwn > git-union-merge.1
git-recover-repository.1: doc/git-recover-repository.mdwn
./Build/mdwn2man git-recover-repository 1 doc/git-recover-repository.mdwn > git-recover-repository.1
git-union-merge:
$(GHC) --make -threaded $@
git-recover-repository:
$(GHC) --make -threaded $@
install-mans: $(mans)
install -d $(DESTDIR)$(PREFIX)/share/man/man1
@ -74,7 +81,8 @@ clean:
rm -rf tmp dist git-annex $(mans) configure *.tix .hpc \
doc/.ikiwiki html dist tags Build/SysConfig.hs build-stamp \
Setup Build/InstallDesktopFile Build/EvilSplicer \
Build/Standalone Build/OSXMkLibs
Build/Standalone Build/OSXMkLibs \
git-union-merge git-recover-repository
find -name \*.o -exec rm {} \;
find -name \*.hi -exec rm {} \;
@ -213,4 +221,4 @@ hdevtools:
hdevtools --stop-server || true
hdevtools check git-annex.hs -g -cpp -g -i -g -idist/build/git-annex/git-annex-tmp -g -i. -g -idist/build/autogen -g -Idist/build/autogen -g -Idist/build/git-annex/git-annex-tmp -g -IUtility -g -DWITH_TESTSUITE -g -DWITH_S3 -g -DWITH_ASSISTANT -g -DWITH_INOTIFY -g -DWITH_DBUS -g -DWITH_PAIRING -g -DWITH_XMPP -g -optP-include -g -optPdist/build/autogen/cabal_macros.h -g -odir -g dist/build/git-annex/git-annex-tmp -g -hidir -g dist/build/git-annex/git-annex-tmp -g -stubdir -g dist/build/git-annex/git-annex-tmp -g -threaded -g -Wall -g -XHaskell98 -g -XPackageImports
.PHONY: git-annex tags build-stamp
.PHONY: git-annex git-union-merge git-recover-repository tags build-stamp

View file

@ -72,17 +72,17 @@ readProcessEnv cmd args environ =
, env = environ
}
{- Writes a string to a process on its stdin,
{- Runs an action to write to a process on its stdin,
- returns its output, and also allows specifying the environment.
-}
writeReadProcessEnv
:: FilePath
-> [String]
-> Maybe [(String, String)]
-> String
-> (Maybe (Handle -> IO ()))
-> (Maybe (Handle -> IO ()))
-> IO String
writeReadProcessEnv cmd args environ input adjusthandle = do
writeReadProcessEnv cmd args environ writestdin adjusthandle = do
(Just inh, Just outh, _, pid) <- createProcess p
maybe (return ()) (\a -> a inh) adjusthandle
@ -94,7 +94,7 @@ writeReadProcessEnv cmd args environ input adjusthandle = do
_ <- forkIO $ E.evaluate (length output) >> putMVar outMVar ()
-- now write and flush any input
when (not (null input)) $ do hPutStr inh input; hFlush inh
maybe (return ()) (\a -> a inh >> hFlush inh) writestdin
hClose inh -- done with stdin
-- wait on the output

2
debian/changelog vendored
View file

@ -32,6 +32,8 @@ git-annex (4.20131003) UNRELEASED; urgency=low
* Windows: Deal with strange msysgit 1.8.4 behavior of not understanding
DOS formatted paths for --git-dir and --work-tree.
* Removed workaround for bug in git 1.8.4r0.
* Added git-recover-repository command to git-annex source
(not built by default; this needs to move to someplace else).
-- Joey Hess <joeyh@debian.org> Thu, 03 Oct 2013 15:41:24 -0400

View file

@ -87,18 +87,23 @@ git-recover-repository command.
### detailed design
Run `git fsck` and parse output to find bad objects, and determine
from its output if they are a commit, a tree, or a blob.
Check if there's a remote. If so, and if the bad objects are all
present on it, can simply get all bad objects from the remote,
and inject them back into .git/objects to recover:
Run `git fsck` and parse output to find bad objects. Note that
fsck may fall over and fail to print out all bad objects, when
files are corrupt. So if the fsck exits nonzero, need to collect all
bad objects it did find, and:
1. If the local repository contains packs, the packs may be corrupt.
So, start by using `git unpack-objects` to unpack all
packs it can handle (which may include parts of corrupt packs)
back to loose objects. And delete all packs.
2. Delete all loose corrupt objects.
Repeat until fsck finds no new problems.
Check if there's a remote. If so, and if the bad objects are all
present on it, can simply get all bad objects from the remote,
and inject them back into .git/objects to recover:
3. Make a new (bare) clone from the remote.
(Note: git does not seem to provide a way to fetch specific missing
objects from the remote. Also, cannot use `--reference` against

View file

@ -0,0 +1,28 @@
# NAME
git-recover-repository - Fix a broken git repository
# SYNOPSIS
git-recover-repository [--force]
# DESCRIPTION
This can fix a corrupt or broken git repository, which git fsck would
only complain has problems.
It does by deleting all corrupt objects, and retreiving all missing
objects that it can from the remotes of the repository.
If that is not sufficient to fully recover the repository, it can also
reset branches back to commits before the corruption happened. It will only
do this if run with the --force option, since that rewrites history
and throws out missing data.
# AUTHOR
Joey Hess <joey@kitenet.net>
<http://git-annex.branchable.com/>
Warning: Automatically converted into a man page by mdwn2man. Edit with care

73
git-recover-repository.hs Normal file
View file

@ -0,0 +1,73 @@
{- git-recover-repository program
-
- Copyright 2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
import System.Environment
import System.Log.Logger
import System.Log.Formatter
import System.Log.Handler (setFormatter)
import System.Log.Handler.Simple
import qualified Data.Set as S
import Common
import qualified Git.CurrentRepo
import qualified Git.RecoverRepository
import qualified Git.Config
header :: String
header = "Usage: git-recover-repository"
usage :: a
usage = error $ "bad parameters\n\n" ++ header
parseArgs :: IO Bool
parseArgs = do
args <- getArgs
return $ or $ map parse args
where
parse "--force" = True
parse _ = usage
enableDebugOutput :: IO ()
enableDebugOutput = do
s <- setFormatter
<$> streamHandler stderr NOTICE
<*> pure (simpleLogFormatter "$msg")
updateGlobalLogger rootLoggerName (setLevel DEBUG . setHandlers [s])
main :: IO ()
main = do
enableDebugOutput
forced <- parseArgs
g <- Git.Config.read =<< Git.CurrentRepo.get
missing <- Git.RecoverRepository.cleanCorruptObjects g
stillmissing <- Git.RecoverRepository.retrieveMissingObjects missing g
if S.null stillmissing
then putStr $ unlines
[ "Successfully recovered repository!"
, "You should run \"git fsck\" to make sure, but it looks like"
, "everything was recovered ok."
]
else do
putStrLn $ unwords
[ show (S.size stillmissing)
, "missing objects could not be recovered!"
]
if forced
then do
remotebranches <- Git.RecoverRepository.removeTrackingBranches stillmissing g
unless (null remotebranches) $
putStrLn $ unwords
[ "removed"
, show (length remotebranches)
, "remote tracking branches that referred to missing objects"
]
localbranches <- Git.RecoverRepository.resetLocalBranches stillmissing g
unless (null localbranches) $ do
putStrLn "Reset these local branches to old versions before the missing objects were committed:"
putStr $ unlines $ map show localbranches
else putStrLn "To force a recovery to a usable state, run this command again with the --force parameter."