Sped back up fsck, copy --from etc

All commands that often have to read a lot of information from
the git-annex branch should now be nearly as fast as before
the branch was introduced.

Before fsck was taking approximatly 3 hours, now it's running in 8 minutes.

The code is very nasty. It should be rewritten to read the header line
from git cat-file, and then read the specified number of bytes of content.
This commit is contained in:
Joey Hess 2011-06-29 21:23:40 -04:00
parent 8725fde5c6
commit e1c18ddec4
4 changed files with 54 additions and 36 deletions

View file

@ -26,9 +26,6 @@ import System.Cmd.Utils
import Data.Maybe
import Data.List
import System.IO
import System.Posix.IO
import System.Posix.Process
import System.Log.Logger
import Types.BranchState
import qualified GitRepo as Git
@ -142,7 +139,7 @@ commit message = whenM stageJournalFiles $ do
- data is read from it. Runs only once per git-annex run. -}
update :: Annex ()
update = do
state <- Annex.getState Annex.branchstate
state <- getState
unless (branchUpdated state) $ withIndex $ do
{- Since branches get merged into the index, it's important to
- first stage the journal into the index. Otherwise, any
@ -226,39 +223,48 @@ get file = do
setCache file content
return content
Nothing -> withIndexUpdate $ do
g <- Annex.gitRepo
content <- liftIO $ catch (cat g) (const $ return "")
content <- catFile file
setCache file content
return content
{- Uses git cat-file in batch mode to read the content of a file.
-
- Only one process is run, and it persists and is used for all accesses. -}
catFile :: FilePath -> Annex String
catFile file = do
state <- getState
maybe (startup state) ask (catFileHandles state)
where
cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g
[Param "cat-file", Param "blob", Param $ ':':file]
{- Runs a command, returning its output, ignoring nonzero exit
- status, and discarding stderr. -}
cmdOutput :: FilePath -> [String] -> IO String
cmdOutput cmd params = do
pipepair <- createPipe
let callfunc _ = do
closeFd (snd pipepair)
h <- fdToHandle (fst pipepair)
x <- hGetContentsStrict h
hClose h
return $! x
let child = do
closeFd (fst pipepair)
-- disable stderr output by this child,
-- and since the logger uses it, also disable it
liftIO $ updateGlobalLogger rootLoggerName $ setLevel EMERGENCY
closeFd stdError
debugM "Utility.executeFile" $ cmd ++ " " ++ show params
pid <- pOpen3Raw Nothing (Just (snd pipepair)) Nothing cmd params child
retval <- callfunc $! pid
let rv = seq retval retval
_ <- getProcessStatus True False pid
return rv
startup state = do
g <- Annex.gitRepo
let cmd = Git.gitCommandLine g
[Param "cat-file", Param "--batch"]
let gitcmd = join " " $ "git" : toCommand cmd
(_, from, to) <- liftIO $ hPipeBoth "sh"
-- want stderr on stdin for sentinal, and
-- to ignore other error messages
["-c", gitcmd ++ " 2>&1"]
setState state { catFileHandles = Just (from, to) }
ask (from, to)
ask (from, to) = do
_ <- liftIO $ do
hPutStr to $
fullname ++ ":" ++ file ++ "\n" ++
sentinal ++ "\n"
hFlush to
return . unlines =<< readContent from []
readContent from ls = do
l <- liftIO $ hGetLine from
if l == sentinal_line
-- first line is blob info,
-- or maybe an error message
then return $ drop 1 $ reverse ls
else readContent from (l:ls)
-- To find the end of a catted file, ask for a sentinal
-- value that is always missing, and look for the error
-- message. Utterly nasty, probably will break one day.
sentinal = ":"
sentinal_line = sentinal ++ " missing"
{- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath]

View file

@ -7,11 +7,18 @@
module Types.BranchState where
import System.IO
data BranchState = BranchState {
branchUpdated :: Bool,
branchUpdated :: Bool, -- has the branch been updated this run?
-- (from, to) handles used to talk to a git-cat-file process
catFileHandles :: Maybe (Handle, Handle),
-- the content of one file is cached
cachedFile :: Maybe FilePath,
cachedContent :: String
}
startBranchState :: BranchState
startBranchState = BranchState False Nothing ""
startBranchState = BranchState False Nothing Nothing ""

3
debian/changelog vendored
View file

@ -3,6 +3,9 @@ git-annex (3.20110625) UNRELEASED; urgency=low
* Always ensure git-annex branch exists.
* Modify location log parser to allow future expansion.
* --force will cause add, etc, to operate on ignored files.
* Sped back up fsck, copy --from, and other commands that often
have to read a lot of information from the git-annex branch. Should
now be nearly as fast as before the branch was introduced.
-- Joey Hess <joeyh@debian.org> Sun, 26 Jun 2011 21:01:06 -0400

View file

@ -36,3 +36,5 @@ commands like whereis and add. --[[Joey]]
> Hmm, except that's actually an error message sent to stderr. Unless
> stderr is connected to stdout, it might be better to look for a known,
> empty object. Could just add a git-annex:empty file to that end.
[[done]] --[[Joey]]