Sped back up fsck, copy --from etc

All commands that often have to read a lot of information from
the git-annex branch should now be nearly as fast as before
the branch was introduced.

Before fsck was taking approximatly 3 hours, now it's running in 8 minutes.

The code is very nasty. It should be rewritten to read the header line
from git cat-file, and then read the specified number of bytes of content.
This commit is contained in:
Joey Hess 2011-06-29 21:23:40 -04:00
parent 8725fde5c6
commit e1c18ddec4
4 changed files with 54 additions and 36 deletions

View file

@ -26,9 +26,6 @@ import System.Cmd.Utils
import Data.Maybe import Data.Maybe
import Data.List import Data.List
import System.IO import System.IO
import System.Posix.IO
import System.Posix.Process
import System.Log.Logger
import Types.BranchState import Types.BranchState
import qualified GitRepo as Git import qualified GitRepo as Git
@ -142,7 +139,7 @@ commit message = whenM stageJournalFiles $ do
- data is read from it. Runs only once per git-annex run. -} - data is read from it. Runs only once per git-annex run. -}
update :: Annex () update :: Annex ()
update = do update = do
state <- Annex.getState Annex.branchstate state <- getState
unless (branchUpdated state) $ withIndex $ do unless (branchUpdated state) $ withIndex $ do
{- Since branches get merged into the index, it's important to {- Since branches get merged into the index, it's important to
- first stage the journal into the index. Otherwise, any - first stage the journal into the index. Otherwise, any
@ -226,39 +223,48 @@ get file = do
setCache file content setCache file content
return content return content
Nothing -> withIndexUpdate $ do Nothing -> withIndexUpdate $ do
g <- Annex.gitRepo content <- catFile file
content <- liftIO $ catch (cat g) (const $ return "")
setCache file content setCache file content
return content return content
{- Uses git cat-file in batch mode to read the content of a file.
-
- Only one process is run, and it persists and is used for all accesses. -}
catFile :: FilePath -> Annex String
catFile file = do
state <- getState
maybe (startup state) ask (catFileHandles state)
where where
cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g startup state = do
[Param "cat-file", Param "blob", Param $ ':':file] g <- Annex.gitRepo
let cmd = Git.gitCommandLine g
{- Runs a command, returning its output, ignoring nonzero exit [Param "cat-file", Param "--batch"]
- status, and discarding stderr. -} let gitcmd = join " " $ "git" : toCommand cmd
cmdOutput :: FilePath -> [String] -> IO String (_, from, to) <- liftIO $ hPipeBoth "sh"
cmdOutput cmd params = do -- want stderr on stdin for sentinal, and
pipepair <- createPipe -- to ignore other error messages
let callfunc _ = do ["-c", gitcmd ++ " 2>&1"]
closeFd (snd pipepair) setState state { catFileHandles = Just (from, to) }
h <- fdToHandle (fst pipepair) ask (from, to)
x <- hGetContentsStrict h ask (from, to) = do
hClose h _ <- liftIO $ do
return $! x hPutStr to $
let child = do fullname ++ ":" ++ file ++ "\n" ++
closeFd (fst pipepair) sentinal ++ "\n"
-- disable stderr output by this child, hFlush to
-- and since the logger uses it, also disable it return . unlines =<< readContent from []
liftIO $ updateGlobalLogger rootLoggerName $ setLevel EMERGENCY readContent from ls = do
closeFd stdError l <- liftIO $ hGetLine from
if l == sentinal_line
debugM "Utility.executeFile" $ cmd ++ " " ++ show params -- first line is blob info,
-- or maybe an error message
pid <- pOpen3Raw Nothing (Just (snd pipepair)) Nothing cmd params child then return $ drop 1 $ reverse ls
retval <- callfunc $! pid else readContent from (l:ls)
let rv = seq retval retval -- To find the end of a catted file, ask for a sentinal
_ <- getProcessStatus True False pid -- value that is always missing, and look for the error
return rv -- message. Utterly nasty, probably will break one day.
sentinal = ":"
sentinal_line = sentinal ++ " missing"
{- Lists all files on the branch. There may be duplicates in the list. -} {- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath] files :: Annex [FilePath]

View file

@ -7,11 +7,18 @@
module Types.BranchState where module Types.BranchState where
import System.IO
data BranchState = BranchState { data BranchState = BranchState {
branchUpdated :: Bool, branchUpdated :: Bool, -- has the branch been updated this run?
-- (from, to) handles used to talk to a git-cat-file process
catFileHandles :: Maybe (Handle, Handle),
-- the content of one file is cached
cachedFile :: Maybe FilePath, cachedFile :: Maybe FilePath,
cachedContent :: String cachedContent :: String
} }
startBranchState :: BranchState startBranchState :: BranchState
startBranchState = BranchState False Nothing "" startBranchState = BranchState False Nothing Nothing ""

3
debian/changelog vendored
View file

@ -3,6 +3,9 @@ git-annex (3.20110625) UNRELEASED; urgency=low
* Always ensure git-annex branch exists. * Always ensure git-annex branch exists.
* Modify location log parser to allow future expansion. * Modify location log parser to allow future expansion.
* --force will cause add, etc, to operate on ignored files. * --force will cause add, etc, to operate on ignored files.
* Sped back up fsck, copy --from, and other commands that often
have to read a lot of information from the git-annex branch. Should
now be nearly as fast as before the branch was introduced.
-- Joey Hess <joeyh@debian.org> Sun, 26 Jun 2011 21:01:06 -0400 -- Joey Hess <joeyh@debian.org> Sun, 26 Jun 2011 21:01:06 -0400

View file

@ -36,3 +36,5 @@ commands like whereis and add. --[[Joey]]
> Hmm, except that's actually an error message sent to stderr. Unless > Hmm, except that's actually an error message sent to stderr. Unless
> stderr is connected to stdout, it might be better to look for a known, > stderr is connected to stdout, it might be better to look for a known,
> empty object. Could just add a git-annex:empty file to that end. > empty object. Could just add a git-annex:empty file to that end.
[[done]] --[[Joey]]