add journaling to speed up changes to the git-annex branch
git is slow when the index file is large and has to be rewritten each time a file is changed. To speed this up, added a journal where changes are recorded before being fed into the index file and committed to the git-annex branch. The entire journal can be fed into git with just 2 commands, and only one write of the index file.
This commit is contained in:
parent
23e765b67c
commit
5f494154a3
7 changed files with 132 additions and 40 deletions
127
Branch.hs
127
Branch.hs
|
@ -33,6 +33,7 @@ import qualified Annex
|
||||||
import Utility
|
import Utility
|
||||||
import Types
|
import Types
|
||||||
import Messages
|
import Messages
|
||||||
|
import Locations
|
||||||
|
|
||||||
{- Name of the branch that is used to store git-annex's information. -}
|
{- Name of the branch that is used to store git-annex's information. -}
|
||||||
name :: String
|
name :: String
|
||||||
|
@ -42,6 +43,8 @@ name = "git-annex"
|
||||||
fullname :: String
|
fullname :: String
|
||||||
fullname = "refs/heads/" ++ name
|
fullname = "refs/heads/" ++ name
|
||||||
|
|
||||||
|
{- Converts a fully qualified git ref into a short version for human
|
||||||
|
- consumptiom. -}
|
||||||
shortref :: String -> String
|
shortref :: String -> String
|
||||||
shortref = remove "refs/heads/" . remove "refs/remotes/"
|
shortref = remove "refs/heads/" . remove "refs/remotes/"
|
||||||
where
|
where
|
||||||
|
@ -56,7 +59,8 @@ index g = Git.workTree g </> Git.gitDir g </> "index." ++ name
|
||||||
{- Populates the branch's index file with the current branch contents.
|
{- Populates the branch's index file with the current branch contents.
|
||||||
-
|
-
|
||||||
- Usually, this is only done when the index doesn't yet exist, and
|
- Usually, this is only done when the index doesn't yet exist, and
|
||||||
- the index is used to build up changes to be commited to the branch.
|
- the index is used to build up changes to be commited to the branch,
|
||||||
|
- and merge in changes from other branches.
|
||||||
-}
|
-}
|
||||||
genIndex :: Git.Repo -> IO ()
|
genIndex :: Git.Repo -> IO ()
|
||||||
genIndex g = do
|
genIndex g = do
|
||||||
|
@ -97,11 +101,6 @@ setCache file content = do
|
||||||
state <- getState
|
state <- getState
|
||||||
setState state { cachedFile = Just file, cachedContent = content }
|
setState state { cachedFile = Just file, cachedContent = content }
|
||||||
|
|
||||||
setCacheChanged :: FilePath -> String -> Annex ()
|
|
||||||
setCacheChanged file content = do
|
|
||||||
state <- getState
|
|
||||||
setState state { cachedFile = Just file, cachedContent = content, branchChanged = True }
|
|
||||||
|
|
||||||
invalidateCache :: Annex ()
|
invalidateCache :: Annex ()
|
||||||
invalidateCache = do
|
invalidateCache = do
|
||||||
state <- getState
|
state <- getState
|
||||||
|
@ -133,11 +132,11 @@ create = do
|
||||||
liftIO $ Git.runBool g "show-ref"
|
liftIO $ Git.runBool g "show-ref"
|
||||||
[Param "--verify", Param "-q", Param ref]
|
[Param "--verify", Param "-q", Param ref]
|
||||||
|
|
||||||
{- Commits any staged changes to the branch. -}
|
{- Stages the journal, and commits staged changes to the branch. -}
|
||||||
commit :: String -> Annex ()
|
commit :: String -> Annex ()
|
||||||
commit message = do
|
commit message = do
|
||||||
state <- getState
|
staged <- stageJournalFiles
|
||||||
when (branchChanged state) $ do
|
when staged $ do
|
||||||
g <- Annex.gitRepo
|
g <- Annex.gitRepo
|
||||||
withIndex $ liftIO $
|
withIndex $ liftIO $
|
||||||
GitUnionMerge.commit g message fullname [fullname]
|
GitUnionMerge.commit g message fullname [fullname]
|
||||||
|
@ -187,23 +186,27 @@ updateRef ref
|
||||||
liftIO $ GitUnionMerge.merge g [ref]
|
liftIO $ GitUnionMerge.merge g [ref]
|
||||||
return $ Just ref
|
return $ Just ref
|
||||||
|
|
||||||
{- Stages the content of a file into the branch's index. -}
|
{- Records changed content of a file into the journal. -}
|
||||||
change :: FilePath -> String -> Annex ()
|
change :: FilePath -> String -> Annex ()
|
||||||
change file content = do
|
change file content = do
|
||||||
g <- Annex.gitRepo
|
setJournalFile file content
|
||||||
sha <- liftIO $ Git.hashObject g content
|
setCache file content
|
||||||
withIndex $ liftIO $ Git.run g "update-index"
|
|
||||||
[ Param "--add", Param "--cacheinfo", Param "100644",
|
|
||||||
Param sha, File file]
|
|
||||||
setCacheChanged file content
|
|
||||||
|
|
||||||
{- Gets the content of a file on the branch, or content staged in the index
|
{- Gets the content of a file on the branch, or content from the journal, or
|
||||||
- if it's newer. Returns an empty string if the file didn't exist yet. -}
|
- staged in the index.
|
||||||
|
-
|
||||||
|
- Returns an empty string if the file doesn't exist yet. -}
|
||||||
get :: FilePath -> Annex String
|
get :: FilePath -> Annex String
|
||||||
get file = do
|
get file = do
|
||||||
cached <- getCache file
|
cached <- getCache file
|
||||||
case cached of
|
case cached of
|
||||||
Just content -> return content
|
Just content -> return content
|
||||||
|
Nothing -> do
|
||||||
|
j <- getJournalFile file
|
||||||
|
case j of
|
||||||
|
Just content -> do
|
||||||
|
setCache file content
|
||||||
|
return content
|
||||||
Nothing -> withIndexUpdate $ do
|
Nothing -> withIndexUpdate $ do
|
||||||
g <- Annex.gitRepo
|
g <- Annex.gitRepo
|
||||||
content <- liftIO $ catch (cat g) (const $ return "")
|
content <- liftIO $ catch (cat g) (const $ return "")
|
||||||
|
@ -231,9 +234,93 @@ cmdOutput cmd params = do
|
||||||
_ <- getProcessStatus True False pid
|
_ <- getProcessStatus True False pid
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
{- Lists all files on the branch. -}
|
{- Lists all files on the branch. There may be duplicates in the list. -}
|
||||||
files :: Annex [FilePath]
|
files :: Annex [FilePath]
|
||||||
files = withIndexUpdate $ do
|
files = withIndexUpdate $ do
|
||||||
g <- Annex.gitRepo
|
g <- Annex.gitRepo
|
||||||
liftIO $ Git.pipeNullSplit g
|
bfiles <- liftIO $ Git.pipeNullSplit g
|
||||||
[Params "ls-tree --name-only -r -z", Param fullname]
|
[Params "ls-tree --name-only -r -z", Param fullname]
|
||||||
|
jfiles <- getJournalFiles
|
||||||
|
return $ jfiles ++ bfiles
|
||||||
|
|
||||||
|
{- Records content for a file in the branch to the journal.
|
||||||
|
-
|
||||||
|
- Using the journal, rather than immediatly staging content to the index
|
||||||
|
- avoids git needing to rewrite the index after every change. -}
|
||||||
|
setJournalFile :: FilePath -> String -> Annex ()
|
||||||
|
setJournalFile file content = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
liftIO $ catch (write g) $ const $ do
|
||||||
|
createDirectoryIfMissing True $ gitAnnexJournalDir g
|
||||||
|
createDirectoryIfMissing True $ gitAnnexTmpDir g
|
||||||
|
write g
|
||||||
|
where
|
||||||
|
-- journal file is written atomically
|
||||||
|
write g = do
|
||||||
|
let jfile = journalFile g file
|
||||||
|
let tmpfile = gitAnnexTmpDir g </> takeFileName jfile
|
||||||
|
writeFile tmpfile content
|
||||||
|
renameFile tmpfile jfile
|
||||||
|
|
||||||
|
{- Gets journalled content for a file in the branch. -}
|
||||||
|
getJournalFile :: FilePath -> Annex (Maybe String)
|
||||||
|
getJournalFile file = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
liftIO $ catch (liftM Just . readFileStrict $ journalFile g file)
|
||||||
|
(const $ return Nothing)
|
||||||
|
|
||||||
|
{- List of journal files. -}
|
||||||
|
getJournalFiles :: Annex [FilePath]
|
||||||
|
getJournalFiles = getJournalFilesRaw >>= return . map fileJournal
|
||||||
|
|
||||||
|
getJournalFilesRaw :: Annex [FilePath]
|
||||||
|
getJournalFilesRaw = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g)
|
||||||
|
(const $ return [])
|
||||||
|
return $ filter (\f -> f /= "." && f /= "..") fs
|
||||||
|
|
||||||
|
{- Stages all journal files into the index, and returns True if the index
|
||||||
|
- was modified. -}
|
||||||
|
stageJournalFiles :: Annex Bool
|
||||||
|
stageJournalFiles = do
|
||||||
|
l <- getJournalFilesRaw
|
||||||
|
if null l
|
||||||
|
then return False
|
||||||
|
else do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
withIndex $ liftIO $ stage g l
|
||||||
|
return True
|
||||||
|
where
|
||||||
|
stage g fs = do
|
||||||
|
let dir = gitAnnexJournalDir g
|
||||||
|
let paths = map (dir </>) fs
|
||||||
|
-- inject all the journal files directly into git
|
||||||
|
-- in one quick command
|
||||||
|
(h, s) <- Git.pipeWriteRead g [Param "hash-object",
|
||||||
|
Param "-w", Param "--stdin-paths"] $ unlines paths
|
||||||
|
-- update the index, also in just one command
|
||||||
|
GitUnionMerge.update_index g $
|
||||||
|
index_lines (lines s) $ map fileJournal fs
|
||||||
|
forceSuccess h
|
||||||
|
mapM_ removeFile paths
|
||||||
|
index_lines shas fs = map genline $ zip shas fs
|
||||||
|
genline (sha, file) = GitUnionMerge.update_index_line sha file
|
||||||
|
|
||||||
|
{- Produces a filename to use in the journal for a file on the branch.
|
||||||
|
-
|
||||||
|
- The journal typically won't have a lot of files in it, so the hashing
|
||||||
|
- used in the branch is not necessary, and all the files are put directly
|
||||||
|
- in the journal directory.
|
||||||
|
-}
|
||||||
|
journalFile :: Git.Repo -> FilePath -> FilePath
|
||||||
|
journalFile repo file = gitAnnexJournalDir repo </> concatMap mangle file
|
||||||
|
where
|
||||||
|
mangle '/' = "_"
|
||||||
|
mangle '_' = "__"
|
||||||
|
mangle c = [c]
|
||||||
|
|
||||||
|
{- Converts a journal file (relative to the journal dir) back to the
|
||||||
|
- filename on the branch. -}
|
||||||
|
fileJournal :: FilePath -> FilePath
|
||||||
|
fileJournal = replace "//" "_" . replace "_" "/"
|
||||||
|
|
|
@ -7,7 +7,9 @@
|
||||||
|
|
||||||
module GitUnionMerge (
|
module GitUnionMerge (
|
||||||
merge,
|
merge,
|
||||||
commit
|
commit,
|
||||||
|
update_index,
|
||||||
|
update_index_line
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import System.Cmd.Utils
|
import System.Cmd.Utils
|
||||||
|
@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l)
|
||||||
togit ps content = Git.pipeWrite g (map Param ps) content
|
togit ps content = Git.pipeWrite g (map Param ps) content
|
||||||
>>= forceSuccess
|
>>= forceSuccess
|
||||||
|
|
||||||
|
{- Generates a line suitable to be fed into update-index, to add
|
||||||
|
- a given file with a given sha. -}
|
||||||
|
update_index_line :: String -> FilePath -> String
|
||||||
|
update_index_line sha file = "100644 blob " ++ sha ++ "\t" ++ file
|
||||||
|
|
||||||
{- Gets the contents of a tree in a format suitable for update_index. -}
|
{- Gets the contents of a tree in a format suitable for update_index. -}
|
||||||
ls_tree :: Git.Repo -> String -> IO [String]
|
ls_tree :: Git.Repo -> String -> IO [String]
|
||||||
ls_tree g x = Git.pipeNullSplit g $
|
ls_tree g x = Git.pipeNullSplit g $
|
||||||
|
@ -76,14 +83,13 @@ calc_merge g differ = do
|
||||||
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
|
mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String)
|
||||||
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
|
mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of
|
||||||
[] -> return Nothing
|
[] -> return Nothing
|
||||||
(sha:[]) -> return $ Just $ ls_tree_line sha
|
(sha:[]) -> return $ Just $ update_index_line sha file
|
||||||
shas -> do
|
shas -> do
|
||||||
content <- Git.pipeRead g $ map Param ("show":shas)
|
content <- Git.pipeRead g $ map Param ("show":shas)
|
||||||
sha <- Git.hashObject g $ unionmerge content
|
sha <- Git.hashObject g $ unionmerge content
|
||||||
return $ Just $ ls_tree_line sha
|
return $ Just $ update_index_line sha file
|
||||||
where
|
where
|
||||||
[_colonamode, _bmode, asha, bsha, _status] = words info
|
[_colonamode, _bmode, asha, bsha, _status] = words info
|
||||||
ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file
|
|
||||||
nullsha = take Git.shaSize $ repeat '0'
|
nullsha = take Git.shaSize $ repeat '0'
|
||||||
unionmerge = unlines . nub . lines
|
unionmerge = unlines . nub . lines
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ module Locations (
|
||||||
gitAnnexBadDir,
|
gitAnnexBadDir,
|
||||||
gitAnnexBadLocation,
|
gitAnnexBadLocation,
|
||||||
gitAnnexUnusedLog,
|
gitAnnexUnusedLog,
|
||||||
|
gitAnnexJournalDir,
|
||||||
isLinkToAnnex,
|
isLinkToAnnex,
|
||||||
logFile,
|
logFile,
|
||||||
logFileKey,
|
logFileKey,
|
||||||
|
|
|
@ -9,10 +9,9 @@ module Types.BranchState where
|
||||||
|
|
||||||
data BranchState = BranchState {
|
data BranchState = BranchState {
|
||||||
branchUpdated :: Bool,
|
branchUpdated :: Bool,
|
||||||
branchChanged :: Bool,
|
|
||||||
cachedFile :: Maybe FilePath,
|
cachedFile :: Maybe FilePath,
|
||||||
cachedContent :: String
|
cachedContent :: String
|
||||||
}
|
}
|
||||||
|
|
||||||
startBranchState :: BranchState
|
startBranchState :: BranchState
|
||||||
startBranchState = BranchState False False Nothing ""
|
startBranchState = BranchState False Nothing ""
|
||||||
|
|
|
@ -21,9 +21,13 @@ deleting or changing the file contents.
|
||||||
|
|
||||||
This branch is managed by git-annex, with the contents listed below.
|
This branch is managed by git-annex, with the contents listed below.
|
||||||
|
|
||||||
Note that git-annex assumes only it will modify this branch. If you go in
|
The file `.git/index.git-annex` is a separate git index file it uses
|
||||||
and make changes directly, it will probably revert your changes in its next
|
to accumlate changes for the branch. Also, `.git/annex/journal/` is used
|
||||||
commit to the branch.
|
to record changes before they are added to git.
|
||||||
|
|
||||||
|
Note that for speed reasons, git-annex assumes only it will modify this
|
||||||
|
branch. If you go in and make changes directly, it will probably revert
|
||||||
|
your changes in its next commit to the branch.
|
||||||
|
|
||||||
The best way to make changes to the git-annex branch is instead
|
The best way to make changes to the git-annex branch is instead
|
||||||
to create a branch of it, with a name like "my/git-annex", and then
|
to create a branch of it, with a name like "my/git-annex", and then
|
||||||
|
|
|
@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to
|
||||||
upgrade every repository at once; it's sufficient to upgrade each
|
upgrade every repository at once; it's sufficient to upgrade each
|
||||||
repository only when you next use it.
|
repository only when you next use it.
|
||||||
|
|
||||||
This upgrade can be sped up by, before you start, making
|
|
||||||
.git/index.git-annex into a symlink to a file on a ramdisk.
|
|
||||||
For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex`
|
|
||||||
but, if you do that, be sure to remove the symlink after the upgrade!
|
|
||||||
|
|
||||||
After the upgrade is complete, commit the changes it staged.
|
After the upgrade is complete, commit the changes it staged.
|
||||||
|
|
||||||
git commit -m "upgrade v2 to v3"
|
git commit -m "upgrade v2 to v3"
|
||||||
|
|
|
@ -42,7 +42,7 @@ main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
[aref, bref, newref] <- parseArgs
|
[aref, bref, newref] <- parseArgs
|
||||||
g <- Git.configRead =<< Git.repoFromCwd
|
g <- Git.configRead =<< Git.repoFromCwd
|
||||||
Git.useIndex (tmpIndex g)
|
_ <- Git.useIndex (tmpIndex g)
|
||||||
setup g
|
setup g
|
||||||
GitUnionMerge.merge g [aref, bref]
|
GitUnionMerge.merge g [aref, bref]
|
||||||
GitUnionMerge.commit g "union merge" newref [aref, bref]
|
GitUnionMerge.commit g "union merge" newref [aref, bref]
|
||||||
|
|
Loading…
Reference in a new issue