add journaling to speed up changes to the git-annex branch

git is slow when the index file is large and has to be rewritten each time
a file is changed. To speed this up, added a journal where changes are
recorded before being fed into the index file and committed to the
git-annex branch. The entire journal can be fed into git with just 2
commands, and only one write of the index file.
This commit is contained in:
Joey Hess 2011-06-23 11:37:26 -04:00
parent 23e765b67c
commit 5f494154a3
7 changed files with 132 additions and 40 deletions

127
Branch.hs
View file

@ -33,6 +33,7 @@ import qualified Annex
import Utility import Utility
import Types import Types
import Messages import Messages
import Locations
{- Name of the branch that is used to store git-annex's information. -} {- Name of the branch that is used to store git-annex's information. -}
name :: String name :: String
@ -42,6 +43,8 @@ name = "git-annex"
fullname :: String fullname :: String
fullname = "refs/heads/" ++ name fullname = "refs/heads/" ++ name
{- Converts a fully qualified git ref into a short version for human
- consumptiom. -}
shortref :: String -> String shortref :: String -> String
shortref = remove "refs/heads/" . remove "refs/remotes/" shortref = remove "refs/heads/" . remove "refs/remotes/"
where where
@ -56,7 +59,8 @@ index g = Git.workTree g </> Git.gitDir g </> "index." ++ name
{- Populates the branch's index file with the current branch contents. {- Populates the branch's index file with the current branch contents.
- -
- Usually, this is only done when the index doesn't yet exist, and - Usually, this is only done when the index doesn't yet exist, and
- the index is used to build up changes to be commited to the branch. - the index is used to build up changes to be commited to the branch,
- and merge in changes from other branches.
-} -}
genIndex :: Git.Repo -> IO () genIndex :: Git.Repo -> IO ()
genIndex g = do genIndex g = do
@ -97,11 +101,6 @@ setCache file content = do
state <- getState state <- getState
setState state { cachedFile = Just file, cachedContent = content } setState state { cachedFile = Just file, cachedContent = content }
setCacheChanged :: FilePath -> String -> Annex ()
setCacheChanged file content = do
state <- getState
setState state { cachedFile = Just file, cachedContent = content, branchChanged = True }
invalidateCache :: Annex () invalidateCache :: Annex ()
invalidateCache = do invalidateCache = do
state <- getState state <- getState
@ -133,11 +132,11 @@ create = do
liftIO $ Git.runBool g "show-ref" liftIO $ Git.runBool g "show-ref"
[Param "--verify", Param "-q", Param ref] [Param "--verify", Param "-q", Param ref]
{- Commits any staged changes to the branch. -} {- Stages the journal, and commits staged changes to the branch. -}
commit :: String -> Annex () commit :: String -> Annex ()
commit message = do commit message = do
state <- getState staged <- stageJournalFiles
when (branchChanged state) $ do when staged $ do
g <- Annex.gitRepo g <- Annex.gitRepo
withIndex $ liftIO $ withIndex $ liftIO $
GitUnionMerge.commit g message fullname [fullname] GitUnionMerge.commit g message fullname [fullname]
@ -187,23 +186,27 @@ updateRef ref
liftIO $ GitUnionMerge.merge g [ref] liftIO $ GitUnionMerge.merge g [ref]
return $ Just ref return $ Just ref
{- Stages the content of a file into the branch's index. -} {- Records changed content of a file into the journal. -}
change :: FilePath -> String -> Annex () change :: FilePath -> String -> Annex ()
change file content = do change file content = do
g <- Annex.gitRepo setJournalFile file content
sha <- liftIO $ Git.hashObject g content setCache file content
withIndex $ liftIO $ Git.run g "update-index"
[ Param "--add", Param "--cacheinfo", Param "100644",
Param sha, File file]
setCacheChanged file content
{- Gets the content of a file on the branch, or content staged in the index {- Gets the content of a file on the branch, or content from the journal, or
- if it's newer. Returns an empty string if the file didn't exist yet. -} - staged in the index.
-
- Returns an empty string if the file doesn't exist yet. -}
get :: FilePath -> Annex String get :: FilePath -> Annex String
get file = do get file = do
cached <- getCache file cached <- getCache file
case cached of case cached of
Just content -> return content Just content -> return content
Nothing -> do
j <- getJournalFile file
case j of
Just content -> do
setCache file content
return content
Nothing -> withIndexUpdate $ do Nothing -> withIndexUpdate $ do
g <- Annex.gitRepo g <- Annex.gitRepo
content <- liftIO $ catch (cat g) (const $ return "") content <- liftIO $ catch (cat g) (const $ return "")
@ -231,9 +234,93 @@ cmdOutput cmd params = do
_ <- getProcessStatus True False pid _ <- getProcessStatus True False pid
return rv return rv
{- Lists all files on the branch. -} {- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath] files :: Annex [FilePath]
files = withIndexUpdate $ do files = withIndexUpdate $ do
g <- Annex.gitRepo g <- Annex.gitRepo
liftIO $ Git.pipeNullSplit g bfiles <- liftIO $ Git.pipeNullSplit g
[Params "ls-tree --name-only -r -z", Param fullname] [Params "ls-tree --name-only -r -z", Param fullname]
jfiles <- getJournalFiles
return $ jfiles ++ bfiles
{- Records content for a file in the branch to the journal.
-
- Using the journal, rather than immediatly staging content to the index
- avoids git needing to rewrite the index after every change. -}
setJournalFile :: FilePath -> String -> Annex ()
setJournalFile file content = do
g <- Annex.gitRepo
liftIO $ catch (write g) $ const $ do
createDirectoryIfMissing True $ gitAnnexJournalDir g
createDirectoryIfMissing True $ gitAnnexTmpDir g
write g
where
-- journal file is written atomically
write g = do
let jfile = journalFile g file
let tmpfile = gitAnnexTmpDir g </> takeFileName jfile
writeFile tmpfile content
renameFile tmpfile jfile
{- Gets journalled content for a file in the branch. -}
getJournalFile :: FilePath -> Annex (Maybe String)
getJournalFile file = do
g <- Annex.gitRepo
liftIO $ catch (liftM Just . readFileStrict $ journalFile g file)
(const $ return Nothing)
{- List of journal files. -}
getJournalFiles :: Annex [FilePath]
getJournalFiles = getJournalFilesRaw >>= return . map fileJournal
getJournalFilesRaw :: Annex [FilePath]
getJournalFilesRaw = do
g <- Annex.gitRepo
fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g)
(const $ return [])
return $ filter (\f -> f /= "." && f /= "..") fs
{- Stages all journal files into the index, and returns True if the index
- was modified. -}
stageJournalFiles :: Annex Bool
stageJournalFiles = do
l <- getJournalFilesRaw
if null l
then return False
else do
g <- Annex.gitRepo
withIndex $ liftIO $ stage g l
return True
where
stage g fs = do
let dir = gitAnnexJournalDir g
let paths = map (dir </>) fs
-- inject all the journal files directly into git
-- in one quick command
(h, s) <- Git.pipeWriteRead g [Param "hash-object",
Param "-w", Param "--stdin-paths"] $ unlines paths
-- update the index, also in just one command
GitUnionMerge.update_index g $
index_lines (lines s) $ map fileJournal fs
forceSuccess h
mapM_ removeFile paths
index_lines shas fs = map genline $ zip shas fs
genline (sha, file) = GitUnionMerge.update_index_line sha file
{- Produces a filename to use in the journal for a file on the branch.
-
- The journal typically won't have a lot of files in it, so the hashing
- used in the branch is not necessary, and all the files are put directly
- in the journal directory.
-}
journalFile :: Git.Repo -> FilePath -> FilePath
journalFile repo file = gitAnnexJournalDir repo </> concatMap mangle file
where
mangle '/' = "_"
mangle '_' = "__"
mangle c = [c]
{- Converts a journal file (relative to the journal dir) back to the
- filename on the branch. -}
fileJournal :: FilePath -> FilePath
fileJournal = replace "//" "_" . replace "_" "/"

View file

@ -7,7 +7,9 @@
module GitUnionMerge ( module GitUnionMerge (
merge, merge,
commit commit,
update_index,
update_index_line
) where ) where
import System.Cmd.Utils import System.Cmd.Utils
@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l)
togit ps content = Git.pipeWrite g (map Param ps) content togit ps content = Git.pipeWrite g (map Param ps) content
>>= forceSuccess >>= forceSuccess
{- Generates a line suitable to be fed into update-index, to add
- a given file with a given sha. -}
update_index_line :: String -> FilePath -> String
update_index_line sha file = "100644 blob " ++ sha ++ "\t" ++ file
{- Gets the contents of a tree in a format suitable for update_index. -} {- Gets the contents of a tree in a format suitable for update_index. -}
ls_tree :: Git.Repo -> String -> IO [String] ls_tree :: Git.Repo -> String -> IO [String]
ls_tree g x = Git.pipeNullSplit g $ ls_tree g x = Git.pipeNullSplit g $
@ -76,14 +83,13 @@ calc_merge g differ = do
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String) mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
[] -> return Nothing [] -> return Nothing
(sha:[]) -> return $ Just $ ls_tree_line sha (sha:[]) -> return $ Just $ update_index_line sha file
shas -> do shas -> do
content <- Git.pipeRead g $ map Param ("show":shas) content <- Git.pipeRead g $ map Param ("show":shas)
sha <- Git.hashObject g $ unionmerge content sha <- Git.hashObject g $ unionmerge content
return $ Just $ ls_tree_line sha return $ Just $ update_index_line sha file
where where
[_colonamode, _bmode, asha, bsha, _status] = words info [_colonamode, _bmode, asha, bsha, _status] = words info
ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file
nullsha = take Git.shaSize $ repeat '0' nullsha = take Git.shaSize $ repeat '0'
unionmerge = unlines . nub . lines unionmerge = unlines . nub . lines

View file

@ -17,6 +17,7 @@ module Locations (
gitAnnexBadDir, gitAnnexBadDir,
gitAnnexBadLocation, gitAnnexBadLocation,
gitAnnexUnusedLog, gitAnnexUnusedLog,
gitAnnexJournalDir,
isLinkToAnnex, isLinkToAnnex,
logFile, logFile,
logFileKey, logFileKey,

View file

@ -9,10 +9,9 @@ module Types.BranchState where
data BranchState = BranchState { data BranchState = BranchState {
branchUpdated :: Bool, branchUpdated :: Bool,
branchChanged :: Bool,
cachedFile :: Maybe FilePath, cachedFile :: Maybe FilePath,
cachedContent :: String cachedContent :: String
} }
startBranchState :: BranchState startBranchState :: BranchState
startBranchState = BranchState False False Nothing "" startBranchState = BranchState False Nothing ""

View file

@ -21,9 +21,13 @@ deleting or changing the file contents.
This branch is managed by git-annex, with the contents listed below. This branch is managed by git-annex, with the contents listed below.
Note that git-annex assumes only it will modify this branch. If you go in The file `.git/index.git-annex` is a separate git index file it uses
and make changes directly, it will probably revert your changes in its next to accumlate changes for the branch. Also, `.git/annex/journal/` is used
commit to the branch. to record changes before they are added to git.
Note that for speed reasons, git-annex assumes only it will modify this
branch. If you go in and make changes directly, it will probably revert
your changes in its next commit to the branch.
The best way to make changes to the git-annex branch is instead The best way to make changes to the git-annex branch is instead
to create a branch of it, with a name like "my/git-annex", and then to create a branch of it, with a name like "my/git-annex", and then

View file

@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to
upgrade every repository at once; it's sufficient to upgrade each upgrade every repository at once; it's sufficient to upgrade each
repository only when you next use it. repository only when you next use it.
This upgrade can be sped up by, before you start, making
.git/index.git-annex into a symlink to a file on a ramdisk.
For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex`
but, if you do that, be sure to remove the symlink after the upgrade!
After the upgrade is complete, commit the changes it staged. After the upgrade is complete, commit the changes it staged.
git commit -m "upgrade v2 to v3" git commit -m "upgrade v2 to v3"

View file

@ -42,7 +42,7 @@ main :: IO ()
main = do main = do
[aref, bref, newref] <- parseArgs [aref, bref, newref] <- parseArgs
g <- Git.configRead =<< Git.repoFromCwd g <- Git.configRead =<< Git.repoFromCwd
Git.useIndex (tmpIndex g) _ <- Git.useIndex (tmpIndex g)
setup g setup g
GitUnionMerge.merge g [aref, bref] GitUnionMerge.merge g [aref, bref]
GitUnionMerge.commit g "union merge" newref [aref, bref] GitUnionMerge.commit g "union merge" newref [aref, bref]