add journaling to speed up changes to the git-annex branch

git is slow when the index file is large and has to be rewritten each time
a file is changed. To speed this up, added a journal where changes are
recorded before being fed into the index file and committed to the
git-annex branch. The entire journal can be fed into git with just 2
commands, and only one write of the index file.
This commit is contained in:
Joey Hess 2011-06-23 11:37:26 -04:00
parent 23e765b67c
commit 5f494154a3
7 changed files with 132 additions and 40 deletions

137
Branch.hs
View file

@ -33,6 +33,7 @@ import qualified Annex
import Utility
import Types
import Messages
import Locations
{- Name of the branch that is used to store git-annex's information. -}
name :: String
@ -42,6 +43,8 @@ name = "git-annex"
fullname :: String
fullname = "refs/heads/" ++ name
{- Converts a fully qualified git ref into a short version for human
- consumption. -}
shortref :: String -> String
shortref = remove "refs/heads/" . remove "refs/remotes/"
where
@ -56,7 +59,8 @@ index g = Git.workTree g </> Git.gitDir g </> "index." ++ name
{- Populates the branch's index file with the current branch contents.
-
- Usually, this is only done when the index doesn't yet exist, and
- the index is used to build up changes to be committed to the branch.
- the index is used to build up changes to be committed to the branch,
- and merge in changes from other branches.
-}
genIndex :: Git.Repo -> IO ()
genIndex g = do
@ -97,11 +101,6 @@ setCache file content = do
state <- getState
setState state { cachedFile = Just file, cachedContent = content }
setCacheChanged :: FilePath -> String -> Annex ()
setCacheChanged file content = do
state <- getState
setState state { cachedFile = Just file, cachedContent = content, branchChanged = True }
invalidateCache :: Annex ()
invalidateCache = do
state <- getState
@ -133,11 +132,11 @@ create = do
liftIO $ Git.runBool g "show-ref"
[Param "--verify", Param "-q", Param ref]
{- Commits any staged changes to the branch. -}
{- Stages the journal, and commits staged changes to the branch. -}
commit :: String -> Annex ()
commit message = do
state <- getState
when (branchChanged state) $ do
staged <- stageJournalFiles
when staged $ do
g <- Annex.gitRepo
withIndex $ liftIO $
GitUnionMerge.commit g message fullname [fullname]
@ -187,28 +186,32 @@ updateRef ref
liftIO $ GitUnionMerge.merge g [ref]
return $ Just ref
{- Stages the content of a file into the branch's index. -}
{- Records changed content of a file into the journal. -}
change :: FilePath -> String -> Annex ()
change file content = do
g <- Annex.gitRepo
sha <- liftIO $ Git.hashObject g content
withIndex $ liftIO $ Git.run g "update-index"
[ Param "--add", Param "--cacheinfo", Param "100644",
Param sha, File file]
setCacheChanged file content
setJournalFile file content
setCache file content
{- Gets the content of a file on the branch, or content staged in the index
- if it's newer. Returns an empty string if the file didn't exist yet. -}
{- Gets the content of a file on the branch, or content from the journal, or
- staged in the index.
-
- Returns an empty string if the file doesn't exist yet. -}
get :: FilePath -> Annex String
get file = do
cached <- getCache file
case cached of
Just content -> return content
Nothing -> withIndexUpdate $ do
g <- Annex.gitRepo
content <- liftIO $ catch (cat g) (const $ return "")
setCache file content
return content
Nothing -> do
j <- getJournalFile file
case j of
Just content -> do
setCache file content
return content
Nothing -> withIndexUpdate $ do
g <- Annex.gitRepo
content <- liftIO $ catch (cat g) (const $ return "")
setCache file content
return content
where
cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g
[Param "cat-file", Param "blob", Param $ ':':file]
@ -231,9 +234,93 @@ cmdOutput cmd params = do
_ <- getProcessStatus True False pid
return rv
{- Lists all files on the branch. -}
{- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath]
files = withIndexUpdate $ do
g <- Annex.gitRepo
liftIO $ Git.pipeNullSplit g
bfiles <- liftIO $ Git.pipeNullSplit g
[Params "ls-tree --name-only -r -z", Param fullname]
jfiles <- getJournalFiles
return $ jfiles ++ bfiles
{- Writes new content for a file on the branch into the journal.
 -
 - Changes accumulate in the journal instead of being staged into the
 - index one at a time, which avoids git rewriting the index after every
 - single change. -}
setJournalFile :: FilePath -> String -> Annex ()
setJournalFile file content = do
    g <- Annex.gitRepo
    liftIO $ catch (store g) (const $ prepare g >> store g)
  where
    -- after a failed first attempt, make sure the journal and tmp
    -- directories exist before trying exactly once more
    prepare g = do
        createDirectoryIfMissing True $ gitAnnexJournalDir g
        createDirectoryIfMissing True $ gitAnnexTmpDir g
    -- the content is written to a temp file and then renamed into
    -- place, so the journal file is written atomically
    store g = do
        writeFile tmp content
        renameFile tmp dest
      where
        dest = journalFile g file
        tmp = gitAnnexTmpDir g </> takeFileName dest
{- Looks up the journalled content of a file on the branch.
 -
 - Yields Nothing when the journal file cannot be read. -}
getJournalFile :: FilePath -> Annex (Maybe String)
getJournalFile file = do
    g <- Annex.gitRepo
    liftIO $ catch (slurp $ journalFile g file) unavailable
  where
    slurp path = do
        content <- readFileStrict path
        return $ Just content
    -- any failure to read is treated as "nothing journalled"
    unavailable _ = return Nothing
{- Lists the journalled files, using their names on the branch rather
 - than their mangled names inside the journal directory. -}
getJournalFiles :: Annex [FilePath]
getJournalFiles = do
    raw <- getJournalFilesRaw
    return $ map fileJournal raw
{- Lists the entries of the journal directory exactly as they are named
 - on disk, leaving out "." and "..".
 -
 - If the directory cannot be listed, the result is an empty list. -}
getJournalFilesRaw :: Annex [FilePath]
getJournalFilesRaw = do
    g <- Annex.gitRepo
    entries <- liftIO $
        catch (getDirectoryContents $ gitAnnexJournalDir g) noEntries
    return [e | e <- entries, e `notElem` [".", ".."]]
  where
    noEntries _ = return []
{- Stages all journal files into the index, and returns True if the index
 - was modified.
 -
 - Returns False without touching the index when the journal is empty.
 - Journal files are only removed after git has hashed every one of them
 - and the index has been updated; if hash-object fails or returns the
 - wrong number of shas, an error is raised and the journal is left
 - in place. -}
stageJournalFiles :: Annex Bool
stageJournalFiles = do
    l <- getJournalFilesRaw
    if null l
        then return False
        else do
            g <- Annex.gitRepo
            withIndex $ liftIO $ stage g l
            return True
  where
    stage g fs = do
        let dir = gitAnnexJournalDir g
        let paths = map (dir </>) fs
        -- inject all the journal files directly into git
        -- in one quick command
        (h, s) <- Git.pipeWriteRead g [Param "hash-object",
            Param "-w", Param "--stdin-paths"] $ unlines paths
        let shas = lines s
        -- Consume all of hash-object's output (it may be read lazily)
        -- and check its exit status before the index is touched.
        length shas `seq` forceSuccess h
        -- Pairing shas with files by position is only sound when there
        -- is exactly one sha per file; a short list would otherwise be
        -- silently truncated and the unstaged journal files deleted.
        when (length shas /= length fs) $
            error "git hash-object did not return a sha for every journal file"
        -- update the index, also in just one command
        GitUnionMerge.update_index g $
            zipWith GitUnionMerge.update_index_line shas (map fileJournal fs)
        mapM_ removeFile paths
{- Maps a file on the branch to the file used for it in the journal.
 -
 - The journal typically holds few files, so the hashed directory layout
 - used on the branch is not needed; every file lives directly in the
 - journal directory. To flatten the path, each '/' becomes "_" and each
 - literal '_' is doubled to "__".
 -}
journalFile :: Git.Repo -> FilePath -> FilePath
journalFile repo file = gitAnnexJournalDir repo </> flattened
  where
    flattened = concat [escape c | c <- file]
    escape c = case c of
        '/' -> "_"
        '_' -> "__"
        _ -> [c]
{- Recovers the filename on the branch from the name of a journal file
 - (given relative to the journal dir).
 -
 - NOTE(review): a branch name in which '_' directly follows '/' (such as
 - "a/_b") looks like it would come back from journalFile as "a_/b";
 - confirm whether such names can occur on the branch. -}
fileJournal :: FilePath -> FilePath
fileJournal mangled = replace "//" "_" slashed
  where
    -- every underscore becomes a slash first; a doubled slash then
    -- marks what was originally an escaped underscore
    slashed = replace "_" "/" mangled

View file

@ -7,7 +7,9 @@
module GitUnionMerge (
merge,
commit
commit,
update_index,
update_index_line
) where
import System.Cmd.Utils
@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l)
togit ps content = Git.pipeWrite g (map Param ps) content
>>= forceSuccess
{- Builds one line of input for update-index that adds the given file
 - with the given sha, as a blob with mode 100644. -}
update_index_line :: String -> FilePath -> String
update_index_line sha file = concat ["100644 blob ", sha, "\t", file]
{- Gets the contents of a tree in a format suitable for update_index. -}
ls_tree :: Git.Repo -> String -> IO [String]
ls_tree g x = Git.pipeNullSplit g $
@ -76,14 +83,13 @@ calc_merge g differ = do
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
[] -> return Nothing
(sha:[]) -> return $ Just $ ls_tree_line sha
(sha:[]) -> return $ Just $ update_index_line sha file
shas -> do
content <- Git.pipeRead g $ map Param ("show":shas)
sha <- Git.hashObject g $ unionmerge content
return $ Just $ ls_tree_line sha
return $ Just $ update_index_line sha file
where
[_colonamode, _bmode, asha, bsha, _status] = words info
ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file
nullsha = take Git.shaSize $ repeat '0'
unionmerge = unlines . nub . lines

View file

@ -17,6 +17,7 @@ module Locations (
gitAnnexBadDir,
gitAnnexBadLocation,
gitAnnexUnusedLog,
gitAnnexJournalDir,
isLinkToAnnex,
logFile,
logFileKey,

View file

@ -9,10 +9,9 @@ module Types.BranchState where
data BranchState = BranchState {
branchUpdated :: Bool,
branchChanged :: Bool,
cachedFile :: Maybe FilePath,
cachedContent :: String
}
startBranchState :: BranchState
startBranchState = BranchState False False Nothing ""
startBranchState = BranchState False Nothing ""

View file

@ -21,9 +21,13 @@ deleting or changing the file contents.
This branch is managed by git-annex, with the contents listed below.
Note that git-annex assumes only it will modify this branch. If you go in
and make changes directly, it will probably revert your changes in its next
commit to the branch.
The file `.git/index.git-annex` is a separate git index file it uses
to accumulate changes for the branch. Also, `.git/annex/journal/` is used
to record changes before they are added to git.
Note that for speed reasons, git-annex assumes only it will modify this
branch. If you go in and make changes directly, it will probably revert
your changes in its next commit to the branch.
The best way to make changes to the git-annex branch is instead
to create a branch of it, with a name like "my/git-annex", and then

View file

@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to
upgrade every repository at once; it's sufficient to upgrade each
repository only when you next use it.
This upgrade can be sped up by, before you start, making
.git/index.git-annex into a symlink to a file on a ramdisk.
For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex`
but, if you do that, be sure to remove the symlink after the upgrade!
After the upgrade is complete, commit the changes it staged.
git commit -m "upgrade v2 to v3"

View file

@ -42,7 +42,7 @@ main :: IO ()
main = do
[aref, bref, newref] <- parseArgs
g <- Git.configRead =<< Git.repoFromCwd
Git.useIndex (tmpIndex g)
_ <- Git.useIndex (tmpIndex g)
setup g
GitUnionMerge.merge g [aref, bref]
GitUnionMerge.commit g "union merge" newref [aref, bref]