fsck improvements

* fsck: Check if annex.numcopies is satisfied.
* fsck: Verify the sha1 of files when the SHA1 backend is used.
* fsck: Verify the size of files when the WORM backend is used.
* fsck: Allow specifying individual files to fsk if fscking everything
  is not desired.
* fsck: Fix bug, introduced in 0.04, in detection of unused data.
This commit is contained in:
Joey Hess 2010-11-13 14:59:27 -04:00
parent d4d65a3c92
commit 5fa25a812a
15 changed files with 236 additions and 31 deletions

View file

@ -23,6 +23,7 @@ module Backend (
retrieveKeyFile,
removeKey,
hasKey,
fsckKey,
lookupFile,
chooseBackends
) where
@ -113,6 +114,10 @@ hasKey key = do
bs <- Annex.supportedBackends
(Internals.hasKey (lookupBackendName bs $ backendName key)) key
{- Checks a key's backend for problems. -}
fsckKey :: Backend -> Key -> Annex Bool
fsckKey backend key = (Internals.fsckKey backend) key
{- Looks up the key and backend corresponding to an annexed file,
- by examining what the file symlinks to. -}
lookupFile :: FilePath -> Annex (Maybe (Key, Backend))

View file

@ -4,15 +4,15 @@
- it relies on the file contents in .git/annex/ in this repo,
- and other accessible repos.
-
- This is an abstract backend; getKey has to be implemented to complete
- it.
- This is an abstract backend; name, getKey and fsckKey have to be implemented
- to complete it.
-
- Copyright 2010 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Backend.File (backend) where
module Backend.File (backend, checkKey) where
import Control.Monad.State
import System.Directory
@ -34,7 +34,8 @@ backend = Backend {
storeFileKey = dummyStore,
retrieveKeyFile = copyKeyFile,
removeKey = checkRemoveKey,
hasKey = checkKeyFile
hasKey = checkKeyFile,
fsckKey = mustProvide
}
mustProvide :: a
@ -97,14 +98,12 @@ checkRemoveKey key = do
if (force)
then return True
else do
g <- Annex.gitRepo
remotes <- Remotes.keyPossibilities key
let numcopies = read $ Git.configGet g config "1"
numcopies <- getNumCopies
if (numcopies > length remotes)
then notEnoughCopies numcopies (length remotes) []
else findcopies numcopies 0 remotes []
where
config = "annex.numcopies"
findcopies need have [] bad =
if (have >= need)
then return True
@ -147,3 +146,33 @@ showTriedRemotes [] = return ()
showTriedRemotes remotes =
showLongNote $ "I was unable to access these remotes: " ++
(Remotes.list remotes)
getNumCopies :: Annex Int
getNumCopies = do
g <- Annex.gitRepo
return $ read $ Git.configGet g config "1"
where
config = "annex.numcopies"
{- This is used to check that numcopies is satisfied for the key on fsck.
- This trusts the location log, and so checks all keys, even those with
- data not present in the current annex.
-
- The passed action is first run to allow backends deriving this one
- to do their own checks.
-}
checkKey :: (Key -> Annex Bool) -> Key -> Annex Bool
checkKey a key = do
a_ok <- a key
copies_ok <- checkKeyNumCopies key
return $ a_ok && copies_ok
checkKeyNumCopies :: Key -> Annex Bool
checkKeyNumCopies key = do
remotes <- Remotes.keyPossibilities key
numcopies <- getNumCopies
if (length remotes < numcopies)
then do
showLongNote $ "only " ++ show (length remotes) ++ " of " ++ show numcopies ++ " copies"
return False
else return True

View file

@ -11,24 +11,51 @@ import Control.Monad.State
import Data.String.Utils
import System.Cmd.Utils
import System.IO
import System.Directory
import qualified Backend.File
import TypeInternals
import Messages
import qualified Annex
import Locations
import Core
backend :: Backend
backend = Backend.File.backend {
name = "SHA1",
getKey = keyValue
getKey = keyValue,
fsckKey = Backend.File.checkKey checkKeySHA1
}
-- checksum the file to get its key
keyValue :: FilePath -> Annex (Maybe Key)
keyValue file = do
sha1 :: FilePath -> Annex String
sha1 file = do
showNote "checksum..."
liftIO $ pOpen ReadFromPipe "sha1sum" [file] $ \h -> do
line <- hGetLine h
let bits = split " " line
if (null bits)
then error "sha1sum parse error"
else return $ Just $ Key ((name backend), bits !! 0)
else return $ bits !! 0
-- A key is a sha1 of its contents.
keyValue :: FilePath -> Annex (Maybe Key)
keyValue file = do
s <- sha1 file
return $ Just $ Key ((name backend), s)
-- A key's sha1 is checked during fsck.
checkKeySHA1 :: Key -> Annex Bool
checkKeySHA1 key = do
g <- Annex.gitRepo
let file = annexLocation g key
present <- liftIO $ doesFileExist file
if (not present)
then return True
else do
s <- sha1 file
if (s == keyName key)
then return True
else do
dest <- moveBad key
showNote $ "bad file content (moved to "++dest++")"
return False

View file

@ -20,8 +20,13 @@ backend = Backend {
getKey = keyValue,
storeFileKey = dummyStore,
retrieveKeyFile = downloadUrl,
-- allow keys to be removed; presumably they can always be
-- downloaded again
removeKey = dummyOk,
hasKey = dummyOk
-- similarly, keys are always assumed to be out there on the web
hasKey = dummyOk,
-- and nothing needed to fsck
fsckKey = dummyOk
}
-- cannot generate url from filename
@ -32,7 +37,6 @@ keyValue _ = return Nothing
dummyStore :: FilePath -> Key -> Annex Bool
dummyStore _ _ = return False
-- allow keys to be removed; presumably they can always be downloaded again
dummyOk :: Key -> Annex Bool
dummyOk _ = return True

View file

@ -10,14 +10,22 @@ module Backend.WORM (backend) where
import Control.Monad.State
import System.FilePath
import System.Posix.Files
import System.Posix.Types
import System.Directory
import Data.String.Utils
import qualified Backend.File
import TypeInternals
import Locations
import qualified Annex
import Core
import Messages
backend :: Backend
backend = Backend.File.backend {
name = "WORM",
getKey = keyValue
getKey = keyValue,
fsckKey = Backend.File.checkKey checkKeySize
}
-- The key is formed from the file size, modification time, and the
@ -36,3 +44,27 @@ keyValue file = do
(show $ fileSize stat)
base = takeFileName file
sep = ":"
{- Extracts the file size from a key. -}
keySize :: Key -> FileOffset
keySize key = read $ section !! 2
where
section = split ":" (keyName key)
{- The size of the data for a key is checked against the size encoded in
- the key. Note that the modification time is not checked. -}
checkKeySize :: Key -> Annex Bool
checkKeySize key = do
g <- Annex.gitRepo
let file = annexLocation g key
present <- liftIO $ doesFileExist file
if (not present)
then return True
else do
s <- liftIO $ getFileStatus file
if (fileSize s == keySize key)
then return True
else do
dest <- moveBad key
showNote $ "bad file size (moved to "++dest++")"
return False

View file

@ -59,14 +59,14 @@ subCmds =
"sets annexed content for a key using a temp file"
, SubCommand "fix" path Command.Fix.seek
"fix up symlinks to point to annexed content"
, SubCommand "fsck" nothing Command.Fsck.seek
"check annex for problems"
, SubCommand "fsck" maybepath Command.Fsck.seek
"check for problems"
]
where
path = "PATH ..."
maybepath = "[PATH ...]"
key = "KEY ..."
desc = "DESCRIPTION"
nothing = ""
-- Each dashed command-line option results in generation of an action
-- in the Annex monad that performs the necessary setting.

View file

@ -146,6 +146,16 @@ withTempFile a params = return $ map a params
withNothing :: SubCmdSeekNothing
withNothing a _ = return [a]
{- Default to acting on all files matching the seek action if
- none are specified. -}
withAll :: SubCmdSeekStrings -> SubCmdSeekStrings
withAll w a params = do
if null params
then do
g <- Annex.gitRepo
w a [Git.workTree g]
else w a params
{- filter out files from the state directory -}
notState :: FilePath -> Bool
notState f = stateLoc /= take (length stateLoc) f

View file

@ -13,9 +13,10 @@ import Command
import Types
import Core
import Messages
import qualified Command.FsckFile
seek :: [SubCmdSeek]
seek = [withNothing start]
seek = [withNothing start, withAll withFilesInGit Command.FsckFile.start]
{- Checks the whole annex for problems. -}
start :: SubCmdStart
@ -26,11 +27,9 @@ start = do
perform :: SubCmdPerform
perform = do
ok <- checkUnused
if (ok)
if ok
then return $ Just $ return True
else do
showLongNote "Possible problems detected."
return Nothing
else return Nothing
checkUnused :: Annex Bool
checkUnused = do

33
Command/FsckFile.hs Normal file
View file

@ -0,0 +1,33 @@
{- git-annex command
-
- Copyright 2010 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Command.FsckFile where
import Command
import qualified Backend
import Types
import Messages
seek :: [SubCmdSeek]
seek = [withFilesInGit start]
{- Checks a file's backend data for problems. -}
start :: SubCmdStartString
start file = isAnnexed file $ \(key, backend) -> do
inbackend <- Backend.hasKey key
if (not inbackend)
then return Nothing
else do
showStart "fsck" file
return $ Just $ perform key backend
perform :: Key -> Backend -> SubCmdPerform
perform key backend = do
success <- Backend.fsckKey backend key
if (success)
then return $ Just $ return True
else return Nothing

18
Core.hs
View file

@ -14,6 +14,7 @@ import System.Path
import Control.Monad (when, unless, filterM)
import System.Posix.Files
import Data.Maybe
import System.FilePath
import Types
import Locations
@ -201,6 +202,16 @@ fromAnnex key dest = do
renameFile file dest
removeDirectory dir
{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and
- returns the directory it was moved to. -}
moveBad :: Key -> Annex FilePath
moveBad key = do
g <- Annex.gitRepo
let src = parentDir $ annexLocation g key
let dest = annexBadLocation g
liftIO $ renameDirectory src dest
return dest
{- List of keys whose content exists in .git/annex/objects/ -}
getKeysPresent :: Annex [Key]
getKeysPresent = do
@ -209,11 +220,12 @@ getKeysPresent = do
getKeysPresent' :: FilePath -> Annex [Key]
getKeysPresent' dir = do
contents <- liftIO $ getDirectoryContents dir
files <- liftIO $ filterM isreg contents
files <- liftIO $ filterM present contents
return $ map fileKey files
where
isreg f = do
s <- getFileStatus $ dir ++ "/" ++ f
present d = do
s <- getFileStatus $ dir ++ "/" ++ d ++ "/"
++ (takeFileName d)
return $ isRegularFile s
{- List of keys referenced by symlinks in the git repo. -}

View file

@ -13,6 +13,7 @@ module Locations (
annexLocation,
annexLocationRelative,
annexTmpLocation,
annexBadLocation,
annexDir,
annexObjectDir,
@ -59,6 +60,10 @@ annexObjectDir r = annexDir r ++ "/objects"
annexTmpLocation :: Git.Repo -> FilePath
annexTmpLocation r = annexDir r ++ "/tmp/"
{- .git-annex/bad is used for bad files found during fsck -}
annexBadLocation :: Git.Repo -> FilePath
annexBadLocation r = annexDir r ++ "/bad/"
{- Converts a key into a filename fragment.
-
- Escape "/" in the key name, to keep a flat tree of files and avoid

View file

@ -75,7 +75,9 @@ data Backend = Backend {
-- removes a key
removeKey :: Key -> Annex Bool,
-- checks if a backend is storing the content of a key
hasKey :: Key -> Annex Bool
hasKey :: Key -> Annex Bool,
-- called during fsck to check a key
fsckKey :: Key -> Annex Bool
}
instance Show Backend where

11
debian/changelog vendored
View file

@ -1,3 +1,14 @@
git-annex (0.06) UNRELEASED; urgency=low
* fsck: Check if annex.numcopies is satisfied.
* fsck: Verify the sha1 of files when the SHA1 backend is used.
* fsck: Verify the size of files when the WORM backend is used.
* fsck: Allow specifying individual files to fsk if fscking everything
is not desired.
* fsck: Fix bug, introduced in 0.04, in detection of unused data.
-- Joey Hess <joeyh@debian.org> Sat, 13 Nov 2010 14:08:58 -0400
git-annex (0.05) unstable; urgency=low
* Optimize both pre-commit and lock subcommands to not call git diff

View file

@ -159,10 +159,12 @@ Many git-annex subcommands will stage changes for later `git commit` by you.
git annex setkey --key=1287765018:3 /tmp/file
* fsck
* fsck [path ...]
This subcommand checks the whole annex for consistency, and warns
about any problems found.
With no parameters, this subcommand checks the whole annex for consistency,
and warns about any problems found.
With parameters, only the specified files are checked.
# OPTIONS

View file

@ -276,3 +276,37 @@ significantly for really big files. To make SHA1 the detault, just
add something like this to `.gitattributes`:
* git-annex-backend=SHA1
## fsck: verifying your data
You can use the fsck subcommand to check for problems in your data.
What can be checked depends on the [[backend|backends]] you've used to store
the data. For example, when you use the SHA1 backend, fsck will verify that
the checksums of your files are good. Fsck also checks that the annex.numcopies
setting is satisfied for all files, and it warns about any dangling values
in `.git/annex/objects/`.
# git annex fsck
fsck (checking for unused data...) (checking files...) ok
Fsck checks the entire repository for problems by default. But you can
also specify the files to check.
This is particularly useful if you're using sha1 and don't want to spend
a long time checksumming everything.
# git annex fsck my_cool_big_file
fsck my_cool_big_file (checksum..) ok
## fsck: When things go wrong
Fsck never deletes possibly bad data; instead it will be moved to
`.git/annex/bad/` for you to review. Here is a sample of what fsck
might say about a badly messed up annex:
# git annex fsck
fsck (checking for unused data...)
Some annexed data is no longer pointed to by any files in the repository.
If this data is no longer needed, it can be removed using git-annex dropkey:
WORM:1289672605:3:file
(checking files...)