From 5fa25a812a8a03af9f6a5fdb3d06eb4d89ee06f5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 13 Nov 2010 14:59:27 -0400 Subject: [PATCH] fsck improvements * fsck: Check if annex.numcopies is satisfied. * fsck: Verify the sha1 of files when the SHA1 backend is used. * fsck: Verify the size of files when the WORM backend is used. * fsck: Allow specifying individual files to fsk if fscking everything is not desired. * fsck: Fix bug, introduced in 0.04, in detection of unused data. --- Backend.hs | 7 ++++++- Backend/File.hs | 43 ++++++++++++++++++++++++++++++++++++------- Backend/SHA1.hs | 37 ++++++++++++++++++++++++++++++++----- Backend/URL.hs | 8 ++++++-- Backend/WORM.hs | 34 +++++++++++++++++++++++++++++++++- CmdLine.hs | 6 +++--- Command.hs | 10 ++++++++++ Command/Fsck.hs | 9 ++++----- Command/FsckFile.hs | 33 +++++++++++++++++++++++++++++++++ Core.hs | 18 +++++++++++++++--- Locations.hs | 5 +++++ TypeInternals.hs | 4 +++- debian/changelog | 11 +++++++++++ doc/git-annex.mdwn | 8 +++++--- doc/walkthrough.mdwn | 34 ++++++++++++++++++++++++++++++++++ 15 files changed, 236 insertions(+), 31 deletions(-) create mode 100644 Command/FsckFile.hs diff --git a/Backend.hs b/Backend.hs index 43b450736d..14af56bbfa 100644 --- a/Backend.hs +++ b/Backend.hs @@ -23,6 +23,7 @@ module Backend ( retrieveKeyFile, removeKey, hasKey, + fsckKey, lookupFile, chooseBackends ) where @@ -105,7 +106,7 @@ retrieveKeyFile backend key dest = (Internals.retrieveKeyFile backend) key dest {- Removes a key from a backend. -} removeKey :: Backend -> Key -> Annex Bool -removeKey backend key = (Internals.removeKey backend) key +removeKey backend key = (Internals.removeKey backend) key {- Checks if a key is present in its backend. -} hasKey :: Key -> Annex Bool @@ -113,6 +114,10 @@ hasKey key = do bs <- Annex.supportedBackends (Internals.hasKey (lookupBackendName bs $ backendName key)) key +{- Checks a key's backend for problems. -} +fsckKey :: Backend -> Key -> Annex Bool +fsckKey backend key = (Internals.fsckKey backend) key + {- Looks up the key and backend corresponding to an annexed file, - by examining what the file symlinks to. -} lookupFile :: FilePath -> Annex (Maybe (Key, Backend)) diff --git a/Backend/File.hs b/Backend/File.hs index 9178b830a5..9bda0d5718 100644 --- a/Backend/File.hs +++ b/Backend/File.hs @@ -4,15 +4,15 @@ - it relies on the file contents in .git/annex/ in this repo, - and other accessible repos. - - - This is an abstract backend; getKey has to be implemented to complete - - it. + - This is an abstract backend; name, getKey and fsckKey have to be implemented + - to complete it. - - Copyright 2010 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} -module Backend.File (backend) where +module Backend.File (backend, checkKey) where import Control.Monad.State import System.Directory @@ -34,7 +34,8 @@ backend = Backend { storeFileKey = dummyStore, retrieveKeyFile = copyKeyFile, removeKey = checkRemoveKey, - hasKey = checkKeyFile + hasKey = checkKeyFile, + fsckKey = mustProvide } mustProvide :: a @@ -97,14 +98,12 @@ checkRemoveKey key = do if (force) then return True else do - g <- Annex.gitRepo remotes <- Remotes.keyPossibilities key - let numcopies = read $ Git.configGet g config "1" + numcopies <- getNumCopies if (numcopies > length remotes) then notEnoughCopies numcopies (length remotes) [] else findcopies numcopies 0 remotes [] where - config = "annex.numcopies" findcopies need have [] bad = if (have >= need) then return True @@ -147,3 +146,33 @@ showTriedRemotes [] = return () showTriedRemotes remotes = showLongNote $ "I was unable to access these remotes: " ++ (Remotes.list remotes) + +getNumCopies :: Annex Int +getNumCopies = do + g <- Annex.gitRepo + return $ read $ Git.configGet g config "1" + where + config = "annex.numcopies" + +{- This is used to check that numcopies is satisfied for the key on fsck. + - This trusts the location log, and so checks all keys, even those with + - data not present in the current annex. + - + - The passed action is first run to allow backends deriving this one + - to do their own checks. + -} +checkKey :: (Key -> Annex Bool) -> Key -> Annex Bool +checkKey a key = do + a_ok <- a key + copies_ok <- checkKeyNumCopies key + return $ a_ok && copies_ok + +checkKeyNumCopies :: Key -> Annex Bool +checkKeyNumCopies key = do + remotes <- Remotes.keyPossibilities key + numcopies <- getNumCopies + if (length remotes < numcopies) + then do + showLongNote $ "only " ++ show (length remotes) ++ " of " ++ show numcopies ++ " copies" + return False + else return True diff --git a/Backend/SHA1.hs b/Backend/SHA1.hs index 5a232ec1db..8852e72e94 100644 --- a/Backend/SHA1.hs +++ b/Backend/SHA1.hs @@ -11,24 +11,51 @@ import Control.Monad.State import Data.String.Utils import System.Cmd.Utils import System.IO +import System.Directory import qualified Backend.File import TypeInternals import Messages +import qualified Annex +import Locations +import Core backend :: Backend backend = Backend.File.backend { name = "SHA1", - getKey = keyValue + getKey = keyValue, + fsckKey = Backend.File.checkKey checkKeySHA1 } --- checksum the file to get its key -keyValue :: FilePath -> Annex (Maybe Key) -keyValue file = do +sha1 :: FilePath -> Annex String +sha1 file = do showNote "checksum..." liftIO $ pOpen ReadFromPipe "sha1sum" [file] $ \h -> do line <- hGetLine h let bits = split " " line if (null bits) then error "sha1sum parse error" - else return $ Just $ Key ((name backend), bits !! 0) + else return $ bits !! 0 + +-- A key is a sha1 of its contents. +keyValue :: FilePath -> Annex (Maybe Key) +keyValue file = do + s <- sha1 file + return $ Just $ Key ((name backend), s) + +-- A key's sha1 is checked during fsck. +checkKeySHA1 :: Key -> Annex Bool +checkKeySHA1 key = do + g <- Annex.gitRepo + let file = annexLocation g key + present <- liftIO $ doesFileExist file + if (not present) + then return True + else do + s <- sha1 file + if (s == keyName key) + then return True + else do + dest <- moveBad key + showNote $ "bad file content (moved to "++dest++")" + return False diff --git a/Backend/URL.hs b/Backend/URL.hs index 830d343c53..b38ea71c96 100644 --- a/Backend/URL.hs +++ b/Backend/URL.hs @@ -20,8 +20,13 @@ backend = Backend { getKey = keyValue, storeFileKey = dummyStore, retrieveKeyFile = downloadUrl, + -- allow keys to be removed; presumably they can always be + -- downloaded again removeKey = dummyOk, - hasKey = dummyOk + -- similarly, keys are always assumed to be out there on the web + hasKey = dummyOk, + -- and nothing needed to fsck + fsckKey = dummyOk } -- cannot generate url from filename @@ -32,7 +37,6 @@ keyValue _ = return Nothing dummyStore :: FilePath -> Key -> Annex Bool dummyStore _ _ = return False --- allow keys to be removed; presumably they can always be downloaded again dummyOk :: Key -> Annex Bool dummyOk _ = return True diff --git a/Backend/WORM.hs b/Backend/WORM.hs index 848386ecd1..21b3876b90 100644 --- a/Backend/WORM.hs +++ b/Backend/WORM.hs @@ -10,14 +10,22 @@ module Backend.WORM (backend) where import Control.Monad.State import System.FilePath import System.Posix.Files +import System.Posix.Types +import System.Directory +import Data.String.Utils import qualified Backend.File import TypeInternals +import Locations +import qualified Annex +import Core +import Messages backend :: Backend backend = Backend.File.backend { name = "WORM", - getKey = keyValue + getKey = keyValue, + fsckKey = Backend.File.checkKey checkKeySize } -- The key is formed from the file size, modification time, and the @@ -36,3 +44,27 @@ keyValue file = do (show $ fileSize stat) base = takeFileName file sep = ":" + +{- Extracts the file size from a key. -} +keySize :: Key -> FileOffset +keySize key = read $ section !! 2 + where + section = split ":" (keyName key) + +{- The size of the data for a key is checked against the size encoded in + - the key. Note that the modification time is not checked. -} +checkKeySize :: Key -> Annex Bool +checkKeySize key = do + g <- Annex.gitRepo + let file = annexLocation g key + present <- liftIO $ doesFileExist file + if (not present) + then return True + else do + s <- liftIO $ getFileStatus file + if (fileSize s == keySize key) + then return True + else do + dest <- moveBad key + showNote $ "bad file size (moved to "++dest++")" + return False diff --git a/CmdLine.hs b/CmdLine.hs index efa541ebcc..a683be5c53 100644 --- a/CmdLine.hs +++ b/CmdLine.hs @@ -59,14 +59,14 @@ subCmds = "sets annexed content for a key using a temp file" , SubCommand "fix" path Command.Fix.seek "fix up symlinks to point to annexed content" - , SubCommand "fsck" nothing Command.Fsck.seek - "check annex for problems" + , SubCommand "fsck" maybepath Command.Fsck.seek + "check for problems" ] where path = "PATH ..." + maybepath = "[PATH ...]" key = "KEY ..." desc = "DESCRIPTION" - nothing = "" -- Each dashed command-line option results in generation of an action -- in the Annex monad that performs the necessary setting. diff --git a/Command.hs b/Command.hs index 21d636463e..4180155faf 100644 --- a/Command.hs +++ b/Command.hs @@ -146,6 +146,16 @@ withTempFile a params = return $ map a params withNothing :: SubCmdSeekNothing withNothing a _ = return [a] +{- Default to acting on all files matching the seek action if + - none are specified. -} +withAll :: SubCmdSeekStrings -> SubCmdSeekStrings +withAll w a params = do + if null params + then do + g <- Annex.gitRepo + w a [Git.workTree g] + else w a params + {- filter out files from the state directory -} notState :: FilePath -> Bool notState f = stateLoc /= take (length stateLoc) f diff --git a/Command/Fsck.hs b/Command/Fsck.hs index e5f0debe0f..b0b9f7bb6b 100644 --- a/Command/Fsck.hs +++ b/Command/Fsck.hs @@ -13,9 +13,10 @@ import Command import Types import Core import Messages +import qualified Command.FsckFile seek :: [SubCmdSeek] -seek = [withNothing start] +seek = [withNothing start, withAll withFilesInGit Command.FsckFile.start] {- Checks the whole annex for problems. -} start :: SubCmdStart @@ -26,11 +27,9 @@ start = do perform :: SubCmdPerform perform = do ok <- checkUnused - if (ok) + if ok then return $ Just $ return True - else do - showLongNote "Possible problems detected." - return Nothing + else return Nothing checkUnused :: Annex Bool checkUnused = do diff --git a/Command/FsckFile.hs b/Command/FsckFile.hs new file mode 100644 index 0000000000..2f9efa56ee --- /dev/null +++ b/Command/FsckFile.hs @@ -0,0 +1,33 @@ +{- git-annex command + - + - Copyright 2010 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Command.FsckFile where + +import Command +import qualified Backend +import Types +import Messages + +seek :: [SubCmdSeek] +seek = [withFilesInGit start] + +{- Checks a file's backend data for problems. -} +start :: SubCmdStartString +start file = isAnnexed file $ \(key, backend) -> do + inbackend <- Backend.hasKey key + if (not inbackend) + then return Nothing + else do + showStart "fsck" file + return $ Just $ perform key backend + +perform :: Key -> Backend -> SubCmdPerform +perform key backend = do + success <- Backend.fsckKey backend key + if (success) + then return $ Just $ return True + else return Nothing diff --git a/Core.hs b/Core.hs index 8497a7f368..789b369cc8 100644 --- a/Core.hs +++ b/Core.hs @@ -14,6 +14,7 @@ import System.Path import Control.Monad (when, unless, filterM) import System.Posix.Files import Data.Maybe +import System.FilePath import Types import Locations @@ -201,6 +202,16 @@ fromAnnex key dest = do renameFile file dest removeDirectory dir +{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and + - returns the directory it was moved to. -} +moveBad :: Key -> Annex FilePath +moveBad key = do + g <- Annex.gitRepo + let src = parentDir $ annexLocation g key + let dest = annexBadLocation g + liftIO $ renameDirectory src dest + return dest + {- List of keys whose content exists in .git/annex/objects/ -} getKeysPresent :: Annex [Key] getKeysPresent = do @@ -209,11 +220,12 @@ getKeysPresent = do getKeysPresent' :: FilePath -> Annex [Key] getKeysPresent' dir = do contents <- liftIO $ getDirectoryContents dir - files <- liftIO $ filterM isreg contents + files <- liftIO $ filterM present contents return $ map fileKey files where - isreg f = do - s <- getFileStatus $ dir ++ "/" ++ f + present d = do + s <- getFileStatus $ dir ++ "/" ++ d ++ "/" + ++ (takeFileName d) return $ isRegularFile s {- List of keys referenced by symlinks in the git repo. -} diff --git a/Locations.hs b/Locations.hs index 58244cef0e..c3bab285d4 100644 --- a/Locations.hs +++ b/Locations.hs @@ -13,6 +13,7 @@ module Locations ( annexLocation, annexLocationRelative, annexTmpLocation, + annexBadLocation, annexDir, annexObjectDir, @@ -59,6 +60,10 @@ annexObjectDir r = annexDir r ++ "/objects" annexTmpLocation :: Git.Repo -> FilePath annexTmpLocation r = annexDir r ++ "/tmp/" +{- .git-annex/bad is used for bad files found during fsck -} +annexBadLocation :: Git.Repo -> FilePath +annexBadLocation r = annexDir r ++ "/bad/" + {- Converts a key into a filename fragment. - - Escape "/" in the key name, to keep a flat tree of files and avoid diff --git a/TypeInternals.hs b/TypeInternals.hs index 4b5cff9d9f..3078224b15 100644 --- a/TypeInternals.hs +++ b/TypeInternals.hs @@ -75,7 +75,9 @@ data Backend = Backend { -- removes a key removeKey :: Key -> Annex Bool, -- checks if a backend is storing the content of a key - hasKey :: Key -> Annex Bool + hasKey :: Key -> Annex Bool, + -- called during fsck to check a key + fsckKey :: Key -> Annex Bool } instance Show Backend where diff --git a/debian/changelog b/debian/changelog index d1c3344ffa..71d163d116 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,14 @@ +git-annex (0.06) UNRELEASED; urgency=low + + * fsck: Check if annex.numcopies is satisfied. + * fsck: Verify the sha1 of files when the SHA1 backend is used. + * fsck: Verify the size of files when the WORM backend is used. + * fsck: Allow specifying individual files to fsk if fscking everything + is not desired. + * fsck: Fix bug, introduced in 0.04, in detection of unused data. + + -- Joey Hess Sat, 13 Nov 2010 14:08:58 -0400 + git-annex (0.05) unstable; urgency=low * Optimize both pre-commit and lock subcommands to not call git diff diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index d0bd3a754e..61a5962f1f 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -159,10 +159,12 @@ Many git-annex subcommands will stage changes for later `git commit` by you. git annex setkey --key=1287765018:3 /tmp/file -* fsck +* fsck [path ...] - This subcommand checks the whole annex for consistency, and warns - about any problems found. + With no parameters, this subcommand checks the whole annex for consistency, + and warns about any problems found. + + With parameters, only the specified files are checked. # OPTIONS diff --git a/doc/walkthrough.mdwn b/doc/walkthrough.mdwn index d6c0214ffd..7effb53178 100644 --- a/doc/walkthrough.mdwn +++ b/doc/walkthrough.mdwn @@ -276,3 +276,37 @@ significantly for really big files. To make SHA1 the detault, just add something like this to `.gitattributes`: * git-annex-backend=SHA1 + +## fsck: verifying your data + +You can use the fsck subcommand to check for problems in your data. +What can be checked depends on the [[backend|backends]] you've used to store +the data. For example, when you use the SHA1 backend, fsck will verify that +the checksums of your files are good. Fsck also checks that the annex.numcopies +setting is satisfied for all files, and it warns about any dangling values +in `.git/annex/objects/`. + + # git annex fsck + fsck (checking for unused data...) (checking files...) ok + +Fsck checks the entire repository for problems by default. But you can +also specify the files to check. +This is particularly useful if you're using sha1 and don't want to spend +a long time checksumming everything. + + # git annex fsck my_cool_big_file + fsck my_cool_big_file (checksum..) ok + +## fsck: When things go wrong + +Fsck never deletes possibly bad data; instead it will be moved to +`.git/annex/bad/` for you to review. Here is a sample of what fsck +might say about a badly messed up annex: + + # git annex fsck + fsck (checking for unused data...) + Some annexed data is no longer pointed to by any files in the repository. + If this data is no longer needed, it can be removed using git-annex dropkey: + WORM:1289672605:3:file + (checking files...) +