From 5f494154a34bad7cc54915a2a408b830e8ca77be Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Jun 2011 11:37:26 -0400 Subject: [PATCH] add journaling to speed up changes to the git-annex branch git is slow when the index file is large and has to be rewritten each time a file is changed. To speed this up, added a journal where changes are recorded before being fed into the index file and committed to the git-annex branch. The entire journal can be fed into git with just 2 commands, and only one write of the index file. --- Branch.hs | 137 +++++++++++++++++++++++++++++++++++-------- GitUnionMerge.hs | 14 +++-- Locations.hs | 1 + Types/BranchState.hs | 3 +- doc/internals.mdwn | 10 +++- doc/upgrades.mdwn | 5 -- git-union-merge.hs | 2 +- 7 files changed, 132 insertions(+), 40 deletions(-) diff --git a/Branch.hs b/Branch.hs index 00f406135e..f0d97bfc36 100644 --- a/Branch.hs +++ b/Branch.hs @@ -33,6 +33,7 @@ import qualified Annex import Utility import Types import Messages +import Locations {- Name of the branch that is used to store git-annex's information. -} name :: String @@ -42,6 +43,8 @@ name = "git-annex" fullname :: String fullname = "refs/heads/" ++ name +{- Converts a fully qualified git ref into a short version for human + - consumptiom. -} shortref :: String -> String shortref = remove "refs/heads/" . remove "refs/remotes/" where @@ -56,7 +59,8 @@ index g = Git.workTree g Git.gitDir g "index." ++ name {- Populates the branch's index file with the current branch contents. - - Usually, this is only done when the index doesn't yet exist, and - - the index is used to build up changes to be commited to the branch. + - the index is used to build up changes to be commited to the branch, + - and merge in changes from other branches. -} genIndex :: Git.Repo -> IO () genIndex g = do @@ -97,11 +101,6 @@ setCache file content = do state <- getState setState state { cachedFile = Just file, cachedContent = content } -setCacheChanged :: FilePath -> String -> Annex () -setCacheChanged file content = do - state <- getState - setState state { cachedFile = Just file, cachedContent = content, branchChanged = True } - invalidateCache :: Annex () invalidateCache = do state <- getState @@ -133,11 +132,11 @@ create = do liftIO $ Git.runBool g "show-ref" [Param "--verify", Param "-q", Param ref] -{- Commits any staged changes to the branch. -} +{- Stages the journal, and commits staged changes to the branch. -} commit :: String -> Annex () commit message = do - state <- getState - when (branchChanged state) $ do + staged <- stageJournalFiles + when staged $ do g <- Annex.gitRepo withIndex $ liftIO $ GitUnionMerge.commit g message fullname [fullname] @@ -187,28 +186,32 @@ updateRef ref liftIO $ GitUnionMerge.merge g [ref] return $ Just ref -{- Stages the content of a file into the branch's index. -} +{- Records changed content of a file into the journal. -} change :: FilePath -> String -> Annex () change file content = do - g <- Annex.gitRepo - sha <- liftIO $ Git.hashObject g content - withIndex $ liftIO $ Git.run g "update-index" - [ Param "--add", Param "--cacheinfo", Param "100644", - Param sha, File file] - setCacheChanged file content + setJournalFile file content + setCache file content -{- Gets the content of a file on the branch, or content staged in the index - - if it's newer. Returns an empty string if the file didn't exist yet. -} +{- Gets the content of a file on the branch, or content from the journal, or + - staged in the index. + - + - Returns an empty string if the file doesn't exist yet. -} get :: FilePath -> Annex String get file = do cached <- getCache file case cached of Just content -> return content - Nothing -> withIndexUpdate $ do - g <- Annex.gitRepo - content <- liftIO $ catch (cat g) (const $ return "") - setCache file content - return content + Nothing -> do + j <- getJournalFile file + case j of + Just content -> do + setCache file content + return content + Nothing -> withIndexUpdate $ do + g <- Annex.gitRepo + content <- liftIO $ catch (cat g) (const $ return "") + setCache file content + return content where cat g = cmdOutput "git" $ toCommand $ Git.gitCommandLine g [Param "cat-file", Param "blob", Param $ ':':file] @@ -231,9 +234,93 @@ cmdOutput cmd params = do _ <- getProcessStatus True False pid return rv -{- Lists all files on the branch. -} +{- Lists all files on the branch. There may be duplicates in the list. -} files :: Annex [FilePath] files = withIndexUpdate $ do g <- Annex.gitRepo - liftIO $ Git.pipeNullSplit g + bfiles <- liftIO $ Git.pipeNullSplit g [Params "ls-tree --name-only -r -z", Param fullname] + jfiles <- getJournalFiles + return $ jfiles ++ bfiles + +{- Records content for a file in the branch to the journal. + - + - Using the journal, rather than immediatly staging content to the index + - avoids git needing to rewrite the index after every change. -} +setJournalFile :: FilePath -> String -> Annex () +setJournalFile file content = do + g <- Annex.gitRepo + liftIO $ catch (write g) $ const $ do + createDirectoryIfMissing True $ gitAnnexJournalDir g + createDirectoryIfMissing True $ gitAnnexTmpDir g + write g + where + -- journal file is written atomically + write g = do + let jfile = journalFile g file + let tmpfile = gitAnnexTmpDir g takeFileName jfile + writeFile tmpfile content + renameFile tmpfile jfile + +{- Gets journalled content for a file in the branch. -} +getJournalFile :: FilePath -> Annex (Maybe String) +getJournalFile file = do + g <- Annex.gitRepo + liftIO $ catch (liftM Just . readFileStrict $ journalFile g file) + (const $ return Nothing) + +{- List of journal files. -} +getJournalFiles :: Annex [FilePath] +getJournalFiles = getJournalFilesRaw >>= return . map fileJournal + +getJournalFilesRaw :: Annex [FilePath] +getJournalFilesRaw = do + g <- Annex.gitRepo + fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g) + (const $ return []) + return $ filter (\f -> f /= "." && f /= "..") fs + +{- Stages all journal files into the index, and returns True if the index + - was modified. -} +stageJournalFiles :: Annex Bool +stageJournalFiles = do + l <- getJournalFilesRaw + if null l + then return False + else do + g <- Annex.gitRepo + withIndex $ liftIO $ stage g l + return True + where + stage g fs = do + let dir = gitAnnexJournalDir g + let paths = map (dir ) fs + -- inject all the journal files directly into git + -- in one quick command + (h, s) <- Git.pipeWriteRead g [Param "hash-object", + Param "-w", Param "--stdin-paths"] $ unlines paths + -- update the index, also in just one command + GitUnionMerge.update_index g $ + index_lines (lines s) $ map fileJournal fs + forceSuccess h + mapM_ removeFile paths + index_lines shas fs = map genline $ zip shas fs + genline (sha, file) = GitUnionMerge.update_index_line sha file + +{- Produces a filename to use in the journal for a file on the branch. + - + - The journal typically won't have a lot of files in it, so the hashing + - used in the branch is not necessary, and all the files are put directly + - in the journal directory. + -} +journalFile :: Git.Repo -> FilePath -> FilePath +journalFile repo file = gitAnnexJournalDir repo concatMap mangle file + where + mangle '/' = "_" + mangle '_' = "__" + mangle c = [c] + +{- Converts a journal file (relative to the journal dir) back to the + - filename on the branch. -} +fileJournal :: FilePath -> FilePath +fileJournal = replace "//" "_" . replace "_" "/" diff --git a/GitUnionMerge.hs b/GitUnionMerge.hs index 096a153a48..fa14a6bc3f 100644 --- a/GitUnionMerge.hs +++ b/GitUnionMerge.hs @@ -7,7 +7,9 @@ module GitUnionMerge ( merge, - commit + commit, + update_index, + update_index_line ) where import System.Cmd.Utils @@ -43,6 +45,11 @@ update_index g l = togit ["update-index", "-z", "--index-info"] (join "\0" l) togit ps content = Git.pipeWrite g (map Param ps) content >>= forceSuccess +{- Generates a line suitable to be fed into update-index, to add + - a given file with a given sha. -} +update_index_line :: String -> FilePath -> String +update_index_line sha file = "100644 blob " ++ sha ++ "\t" ++ file + {- Gets the contents of a tree in a format suitable for update_index. -} ls_tree :: Git.Repo -> String -> IO [String] ls_tree g x = Git.pipeNullSplit g $ @@ -76,14 +83,13 @@ calc_merge g differ = do mergeFile :: Git.Repo -> (String, FilePath) -> IO (Maybe String) mergeFile g (info, file) = case filter (/= nullsha) [asha, bsha] of [] -> return Nothing - (sha:[]) -> return $ Just $ ls_tree_line sha + (sha:[]) -> return $ Just $ update_index_line sha file shas -> do content <- Git.pipeRead g $ map Param ("show":shas) sha <- Git.hashObject g $ unionmerge content - return $ Just $ ls_tree_line sha + return $ Just $ update_index_line sha file where [_colonamode, _bmode, asha, bsha, _status] = words info - ls_tree_line sha = "100644 blob " ++ sha ++ "\t" ++ file nullsha = take Git.shaSize $ repeat '0' unionmerge = unlines . nub . lines diff --git a/Locations.hs b/Locations.hs index f93b0cc50d..bfb0d3af9e 100644 --- a/Locations.hs +++ b/Locations.hs @@ -17,6 +17,7 @@ module Locations ( gitAnnexBadDir, gitAnnexBadLocation, gitAnnexUnusedLog, + gitAnnexJournalDir, isLinkToAnnex, logFile, logFileKey, diff --git a/Types/BranchState.hs b/Types/BranchState.hs index 65d0642a14..40d7f5c2c7 100644 --- a/Types/BranchState.hs +++ b/Types/BranchState.hs @@ -9,10 +9,9 @@ module Types.BranchState where data BranchState = BranchState { branchUpdated :: Bool, - branchChanged :: Bool, cachedFile :: Maybe FilePath, cachedContent :: String } startBranchState :: BranchState -startBranchState = BranchState False False Nothing "" +startBranchState = BranchState False Nothing "" diff --git a/doc/internals.mdwn b/doc/internals.mdwn index aaa125599d..27b5bb1f29 100644 --- a/doc/internals.mdwn +++ b/doc/internals.mdwn @@ -21,9 +21,13 @@ deleting or changing the file contents. This branch is managed by git-annex, with the contents listed below. -Note that git-annex assumes only it will modify this branch. If you go in -and make changes directly, it will probably revert your changes in its next -commit to the branch. +The file `.git/index.git-annex` is a separate git index file it uses +to accumlate changes for the branch. Also, `.git/annex/journal/` is used +to record changes before they are added to git. + +Note that for speed reasons, git-annex assumes only it will modify this +branch. If you go in and make changes directly, it will probably revert +your changes in its next commit to the branch. The best way to make changes to the git-annex branch is instead to create a branch of it, with a name like "my/git-annex", and then diff --git a/doc/upgrades.mdwn b/doc/upgrades.mdwn index 63fbcf75b7..2e8f201fb0 100644 --- a/doc/upgrades.mdwn +++ b/doc/upgrades.mdwn @@ -29,11 +29,6 @@ This upgrade is easier than the previous upgrades. You don't need to upgrade every repository at once; it's sufficient to upgrade each repository only when you next use it. -This upgrade can be sped up by, before you start, making -.git/index.git-annex into a symlink to a file on a ramdisk. -For example: `ln -s /run/shm/index.git-annex.$(git config annex.uuid) .git/index.git-annex` -but, if you do that, be sure to remove the symlink after the upgrade! - After the upgrade is complete, commit the changes it staged. git commit -m "upgrade v2 to v3" diff --git a/git-union-merge.hs b/git-union-merge.hs index 7c0c1cd843..57232be67b 100644 --- a/git-union-merge.hs +++ b/git-union-merge.hs @@ -42,7 +42,7 @@ main :: IO () main = do [aref, bref, newref] <- parseArgs g <- Git.configRead =<< Git.repoFromCwd - Git.useIndex (tmpIndex g) + _ <- Git.useIndex (tmpIndex g) setup g GitUnionMerge.merge g [aref, bref] GitUnionMerge.commit g "union merge" newref [aref, bref]