add journaling to speed up changes to the git-annex branch
git is slow when the index file is large and has to be rewritten each time a file is changed. To speed this up, added a journal where changes are recorded before being fed into the index file and committed to the git-annex branch. The entire journal can be fed into git with just 2 commands, and only one write of the index file.
This commit is contained in:
parent
23e765b67c
commit
5f494154a3
7 changed files with 132 additions and 40 deletions
127
Branch.hs
127
Branch.hs
|
@ -33,6 +33,7 @@ import qualified Annex
|
|||
import Utility
|
||||
import Types
|
||||
import Messages
|
||||
import Locations
|
||||
|
||||
{- Name of the branch that is used to store git-annex's information. -}
|
||||
name :: String
|
||||
|
@ -42,6 +43,8 @@ name = "git-annex"
|
|||
fullname :: String
|
||||
fullname = "refs/heads/" ++ name
|
||||
|
||||
{- Converts a fully qualified git ref into a short version for human
|
||||
- consumptiom. -}
|
||||
shortref :: String -> String
|
||||
shortref = remove "refs/heads/" . remove "refs/remotes/"
|
||||
where
|
||||
|
@ -56,7 +59,8 @@ index g = Git.workTree g </> Git.gitDir g </> "index." ++ name
|
|||
{- Populates the branch's index file with the current branch contents.
|
||||
-
|
||||
- Usually, this is only done when the index doesn't yet exist, and
|
||||
- the index is used to build up changes to be commited to the branch.
|
||||
- the index is used to build up changes to be commited to the branch,
|
||||
- and merge in changes from other branches.
|
||||
-}
|
||||
genIndex :: Git.Repo -> IO ()
|
||||
genIndex g = do
|
||||
|
@ -97,11 +101,6 @@ setCache file content = do
|
|||
state <- getState
|
||||
setState state { cachedFile = Just file, cachedContent = content }
|
||||
|
||||
setCacheChanged :: FilePath -> String -> Annex ()
|
||||
setCacheChanged file content = do
|
||||
state <- getState
|
||||
setState state { cachedFile = Just file, cachedContent = content, branchChanged = True }
|
||||
|
||||
invalidateCache :: Annex ()
|
||||
invalidateCache = do
|
||||
state <- getState
|
||||
|
@ -133,11 +132,11 @@ create = do
|
|||
liftIO $ Git.runBool g "show-ref"
|
||||
[Param "--verify", Param "-q", Param ref]
|
||||
|
||||
{- Commits any staged changes to the branch. -}
|
||||
{- Stages the journal, and commits staged changes to the branch. -}
|
||||
commit :: String -> Annex ()
|
||||
commit message = do
|
||||
state <- getState
|
||||
when (branchChanged state) $ do
|
||||
staged <- stageJournalFiles
|
||||
when staged $ do
|
||||
g <- Annex.gitRepo
|
||||
withIndex $ liftIO $
|
||||
GitUnionMerge.commit g message fullname [fullname]
|
||||
|
@ -187,23 +186,27 @@ updateRef ref
|
|||
liftIO $ GitUnionMerge.merge g [ref]
|
||||
return $ Just ref
|
||||
|
||||
{- Stages the content of a file into the branch's index. -}
|
||||
{- Records changed content of a file into the journal. -}
|
||||
change :: FilePath -> String -> Annex ()
|
||||
change file content = do
|
||||
g <- Annex.gitRepo
|
||||
sha <- liftIO $ Git.hashObject g content
|
||||
withIndex $ liftIO $ Git.run g "update-index"
|
||||
[ Param "--add", Param "--cacheinfo", Param "100644",
|
||||
Param sha, File file]
|
||||
setCacheChanged file content
|
||||
setJournalFile file content
|
||||
setCache file content
|
||||
|
||||
{- Gets the content of a file on the branch, or content staged in the index
|
||||
- if it's newer. Returns an empty string if the file didn't exist yet. -}
|
||||
{- Gets the content of a file on the branch, or content from the journal, or
|
||||
- staged in the index.
|
||||
-
|
||||
- Returns an empty string if the file doesn't exist yet. -}
|
||||
get :: FilePath -> Annex String
|
||||
get file = do
|
||||
cached <- getCache file
|
||||
case cached of
|
||||
Just content -> return content
|
||||
Nothing -> do
|
||||
j <- getJournalFile file
|
||||
case j of
|
||||
Just content -> do
|
||||
setCache file content
|
||||
return content
|
||||
Nothing -> withIndexUpdate $ do
|
||||
g <- Annex.gitRepo
|
||||
content <- liftIO $ catch (cat g) (const $ return "")
|
||||
|
@ -231,9 +234,93 @@ cmdOutput cmd params = do
|
|||
_ <- getProcessStatus True False pid
|
||||
return rv
|
||||
|
||||
{- Lists all files on the branch. -}
|
||||
{- Lists all files on the branch. There may be duplicates in the list. -}
|
||||
files :: Annex [FilePath]
|
||||
files = withIndexUpdate $ do
|
||||
g <- Annex.gitRepo
|
||||
liftIO $ Git.pipeNullSplit g
|
||||
bfiles <- liftIO $ Git.pipeNullSplit g
|
||||
[Params "ls-tree --name-only -r -z", Param fullname]
|
||||
jfiles <- getJournalFiles
|
||||
return $ jfiles ++ bfiles
|
||||
|
||||
{- Records content for a file in the branch to the journal.
|
||||
-
|
||||
- Using the journal, rather than immediatly staging content to the index
|
||||
- avoids git needing to rewrite the index after every change. -}
|
||||
setJournalFile :: FilePath -> String -> Annex ()
|
||||
setJournalFile file content = do
|
||||
g <- Annex.gitRepo
|
||||
liftIO $ catch (write g) $ const $ do
|
||||
createDirectoryIfMissing True $ gitAnnexJournalDir g
|
||||
createDirectoryIfMissing True $ gitAnnexTmpDir g
|
||||
write g
|
||||
where
|
||||
-- journal file is written atomically
|
||||
write g = do
|
||||
let jfile = journalFile g file
|
||||
let tmpfile = gitAnnexTmpDir g </> takeFileName jfile
|
||||
writeFile tmpfile content
|
||||
renameFile tmpfile jfile
|
||||
|
||||
{- Gets journalled content for a file in the branch. -}
|
||||
getJournalFile :: FilePath -> Annex (Maybe String)
|
||||
getJournalFile file = do
|
||||
g <- Annex.gitRepo
|
||||
liftIO $ catch (liftM Just . readFileStrict $ journalFile g file)
|
||||
(const $ return Nothing)
|
||||
|
||||
{- List of journal files. -}
|
||||
getJournalFiles :: Annex [FilePath]
|
||||
getJournalFiles = getJournalFilesRaw >>= return . map fileJournal
|
||||
|
||||
getJournalFilesRaw :: Annex [FilePath]
|
||||
getJournalFilesRaw = do
|
||||
g <- Annex.gitRepo
|
||||
fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g)
|
||||
(const $ return [])
|
||||
return $ filter (\f -> f /= "." && f /= "..") fs
|
||||
|
||||
{- Stages all journal files into the index, and returns True if the index
|
||||
- was modified. -}
|
||||
stageJournalFiles :: Annex Bool
|
||||
stageJournalFiles = do
|
||||
l <- getJournalFilesRaw
|
||||
if null l
|
||||
then return False
|
||||
else do
|
||||
g <- Annex.gitRepo
|
||||
withIndex $ liftIO $ stage g l
|
||||
return True
|
||||
where
|
||||
stage g fs = do
|
||||
let dir = gitAnnexJournalDir g
|
||||
let paths = map (dir </>) fs
|
||||
-- inject all the journal files directly into git
|
||||
-- in one quick command
|
||||
(h, s) <- Git.pipeWriteRead g [Param "hash-object",
|
||||
Param "-w", Param "--stdin-paths"] $ unlines paths
|
||||
-- update the index, also in just one command
|
||||
GitUnionMerge.update_index g $
|
||||
index_lines (lines s) $ map fileJournal fs
|
||||
forceSuccess h
|
||||
mapM_ removeFile paths
|
||||
index_lines shas fs = map genline $ zip shas fs
|
||||
genline (sha, file) = GitUnionMerge.update_index_line sha file
|
||||
|
||||
{- Produces a filename to use in the journal for a file on the branch.
|
||||
-
|
||||
- The journal typically won't have a lot of files in it, so the hashing
|
||||
- used in the branch is not necessary, and all the files are put directly
|
||||
- in the journal directory.
|
||||
-}
|
||||
journalFile :: Git.Repo -> FilePath -> FilePath
|
||||
journalFile repo file = gitAnnexJournalDir repo </> concatMap mangle file
|
||||
where
|
||||
mangle '/' = "_"
|
||||
mangle '_' = "__"
|
||||
mangle c = [c]
|
||||
|
||||
{- Converts a journal file (relative to the journal dir) back to the
|
||||
- filename on the branch. -}
|
||||
fileJournal :: FilePath -> FilePath
|
||||
fileJournal = replace "//" "_" . replace "_" "/"
|
||||
|
|
|
@ -7,7 +7,9 @@
|
|||
|
||||
module GitUnionMerge (
|
||||
merge,
|
||||
commit
|
||||
commit,
|
||||
update_index,
|
||||
update_index_line
|
||||
) where
|
||||
|
||||
import System.Cmd.Utils
|
||||
|
@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l)
|
|||
togit ps content = Git.pipeWrite g (map Param ps) content
|
||||
>>= forceSuccess
|
||||
|
||||
{- Generates a line suitable to be fed into update-index, to add
|
||||
- a given file with a given sha. -}
|
||||
update_index_line :: String -> FilePath -> String
|
||||
update_index_line sha file = "100644 blob " ++ sha ++ "\t" ++ file
|
||||
|
||||
{- Gets the contents of a tree in a format suitable for update_index. -}
|
||||
ls_tree :: Git.Repo -> String -> IO [String]
|
||||
ls_tree g x = Git.pipeNullSplit g $
|
||||
|
@ -76,14 +83,13 @@ calc_merge g differ = do
|
|||
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
|
||||
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
|
||||
[] -> return Nothing
|
||||
(sha:[]) -> return $ Just $ ls_tree_line sha
|
||||
(sha:[]) -> return $ Just $ update_index_line sha file
|
||||
shas -> do
|
||||
content <- Git.pipeRead g $ map Param ("show":shas)
|
||||
sha <- Git.hashObject g $ unionmerge content
|
||||
return $ Just $ ls_tree_line sha
|
||||
return $ Just $ update_index_line sha file
|
||||
where
|
||||
[_colonamode, _bmode, asha, bsha, _status] = words info
|
||||
ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file
|
||||
nullsha = take Git.shaSize $ repeat '0'
|
||||
unionmerge = unlines . nub . lines
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ module Locations (
|
|||
gitAnnexBadDir,
|
||||
gitAnnexBadLocation,
|
||||
gitAnnexUnusedLog,
|
||||
gitAnnexJournalDir,
|
||||
isLinkToAnnex,
|
||||
logFile,
|
||||
logFileKey,
|
||||
|
|
|
@ -9,10 +9,9 @@ module Types.BranchState where
|
|||
|
||||
data BranchState = BranchState {
|
||||
branchUpdated :: Bool,
|
||||
branchChanged :: Bool,
|
||||
cachedFile :: Maybe FilePath,
|
||||
cachedContent :: String
|
||||
}
|
||||
|
||||
startBranchState :: BranchState
|
||||
startBranchState = BranchState False False Nothing ""
|
||||
startBranchState = BranchState False Nothing ""
|
||||
|
|
|
@ -21,9 +21,13 @@ deleting or changing the file contents.
|
|||
|
||||
This branch is managed by git-annex, with the contents listed below.
|
||||
|
||||
Note that git-annex assumes only it will modify this branch. If you go in
|
||||
and make changes directly, it will probably revert your changes in its next
|
||||
commit to the branch.
|
||||
The file `.git/index.git-annex` is a separate git index file it uses
|
||||
to accumlate changes for the branch. Also, `.git/annex/journal/` is used
|
||||
to record changes before they are added to git.
|
||||
|
||||
Note that for speed reasons, git-annex assumes only it will modify this
|
||||
branch. If you go in and make changes directly, it will probably revert
|
||||
your changes in its next commit to the branch.
|
||||
|
||||
The best way to make changes to the git-annex branch is instead
|
||||
to create a branch of it, with a name like "my/git-annex", and then
|
||||
|
|
|
@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to
|
|||
upgrade every repository at once; it's sufficient to upgrade each
|
||||
repository only when you next use it.
|
||||
|
||||
This upgrade can be sped up by, before you start, making
|
||||
.git/index.git-annex into a symlink to a file on a ramdisk.
|
||||
For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex`
|
||||
but, if you do that, be sure to remove the symlink after the upgrade!
|
||||
|
||||
After the upgrade is complete, commit the changes it staged.
|
||||
|
||||
git commit -m "upgrade v2 to v3"
|
||||
|
|
|
@ -42,7 +42,7 @@ main :: IO ()
|
|||
main = do
|
||||
[aref, bref, newref] <- parseArgs
|
||||
g <- Git.configRead =<< Git.repoFromCwd
|
||||
Git.useIndex (tmpIndex g)
|
||||
_ <- Git.useIndex (tmpIndex g)
|
||||
setup g
|
||||
GitUnionMerge.merge g [aref, bref]
|
||||
GitUnionMerge.commit g "union merge" newref [aref, bref]
|
||||
|
|
Loading…
Reference in a new issue