From e1c18ddec455e5d1259ab46ccccbe6a9c7079de6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 29 Jun 2011 21:23:40 -0400 Subject: [PATCH] Sped back up fsck, copy --from etc All commands that often have to read a lot of information from the git-annex branch should now be nearly as fast as before the branch was introduced. Before fsck was taking approximatly 3 hours, now it's running in 8 minutes. The code is very nasty. It should be rewritten to read the header line from git cat-file, and then read the specified number of bytes of content. --- Branch.hs | 74 ++++++++++++++++++++----------------- Types/BranchState.hs | 11 +++++- debian/changelog | 3 ++ doc/todo/speed_up_fsck.mdwn | 2 + 4 files changed, 54 insertions(+), 36 deletions(-) diff --git a/Branch.hs b/Branch.hs index 26aad44070..4f568e36bc 100644 --- a/Branch.hs +++ b/Branch.hs @@ -26,9 +26,6 @@ import System.Cmd.Utils import Data.Maybe import Data.List import System.IO -import System.Posix.IO -import System.Posix.Process -import System.Log.Logger import Types.BranchState import qualified GitRepo as Git @@ -142,7 +139,7 @@ commit message = whenM stageJournalFiles $ do - data is read from it. Runs only once per git-annex run. -} update :: Annex () update = do - state <- Annex.getState Annex.branchstate + state <- getState unless (branchUpdated state) $ withIndex $ do {- Since branches get merged into the index, it's important to - first stage the journal into the index. Otherwise, any @@ -226,39 +223,48 @@ get file = do setCache file content return content Nothing -> withIndexUpdate $ do - g <- Annex.gitRepo - content <- liftIO $ catch (cat g) (const $ return "") + content <- catFile file setCache file content return content + +{- Uses git cat-file in batch mode to read the content of a file. + - + - Only one process is run, and it persists and is used for all accesses. -} +catFile :: FilePath -> Annex String +catFile file = do + state <- getState + maybe (startup state) ask (catFileHandles state) where - cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g - [Param "cat-file", Param "blob", Param $ ':':file] - -{- Runs a command, returning its output, ignoring nonzero exit - - status, and discarding stderr. -} -cmdOutput :: FilePath -> [String] -> IO String -cmdOutput cmd params = do - pipepair <- createPipe - let callfunc _ = do - closeFd (snd pipepair) - h <- fdToHandle (fst pipepair) - x <- hGetContentsStrict h - hClose h - return $! x - let child = do - closeFd (fst pipepair) - -- disable stderr output by this child, - -- and since the logger uses it, also disable it - liftIO $ updateGlobalLogger rootLoggerName $ setLevel EMERGENCY - closeFd stdError - - debugM "Utility.executeFile" $ cmd ++ " " ++ show params - - pid <- pOpen3Raw Nothing (Just (snd pipepair)) Nothing cmd params child - retval <- callfunc $! pid - let rv = seq retval retval - _ <- getProcessStatus True False pid - return rv + startup state = do + g <- Annex.gitRepo + let cmd = Git.gitCommandLine g + [Param "cat-file", Param "--batch"] + let gitcmd = join " " $ "git" : toCommand cmd + (_, from, to) <- liftIO $ hPipeBoth "sh" + -- want stderr on stdin for sentinal, and + -- to ignore other error messages + ["-c", gitcmd ++ " 2>&1"] + setState state { catFileHandles = Just (from, to) } + ask (from, to) + ask (from, to) = do + _ <- liftIO $ do + hPutStr to $ + fullname ++ ":" ++ file ++ "\n" ++ + sentinal ++ "\n" + hFlush to + return . unlines =<< readContent from [] + readContent from ls = do + l <- liftIO $ hGetLine from + if l == sentinal_line + -- first line is blob info, + -- or maybe an error message + then return $ drop 1 $ reverse ls + else readContent from (l:ls) + -- To find the end of a catted file, ask for a sentinal + -- value that is always missing, and look for the error + -- message. Utterly nasty, probably will break one day. + sentinal = ":" + sentinal_line = sentinal ++ " missing" {- Lists all files on the branch. There may be duplicates in the list. -} files :: Annex [FilePath] diff --git a/Types/BranchState.hs b/Types/BranchState.hs index 40d7f5c2c7..bc1d32e693 100644 --- a/Types/BranchState.hs +++ b/Types/BranchState.hs @@ -7,11 +7,18 @@ module Types.BranchState where +import System.IO + data BranchState = BranchState { - branchUpdated :: Bool, + branchUpdated :: Bool, -- has the branch been updated this run? + + -- (from, to) handles used to talk to a git-cat-file process + catFileHandles :: Maybe (Handle, Handle), + + -- the content of one file is cached cachedFile :: Maybe FilePath, cachedContent :: String } startBranchState :: BranchState -startBranchState = BranchState False Nothing "" +startBranchState = BranchState False Nothing Nothing "" diff --git a/debian/changelog b/debian/changelog index 5dff0bbe56..a87a89860a 100644 --- a/debian/changelog +++ b/debian/changelog @@ -3,6 +3,9 @@ git-annex (3.20110625) UNRELEASED; urgency=low * Always ensure git-annex branch exists. * Modify location log parser to allow future expansion. * --force will cause add, etc, to operate on ignored files. + * Sped back up fsck, copy --from, and other commands that often + have to read a lot of information from the git-annex branch. Should + now be nearly as fast as before the branch was introduced. -- Joey Hess Sun, 26 Jun 2011 21:01:06 -0400 diff --git a/doc/todo/speed_up_fsck.mdwn b/doc/todo/speed_up_fsck.mdwn index e22c01766d..5d5e867f80 100644 --- a/doc/todo/speed_up_fsck.mdwn +++ b/doc/todo/speed_up_fsck.mdwn @@ -36,3 +36,5 @@ commands like whereis and add. --[[Joey]] > Hmm, except that's actually an error message sent to stderr. Unless > stderr is connected to stdout, it might be better to look for a known, > empty object. Could just add a git-annex:empty file to that end. + +[[done]] --[[Joey]]