Optimised union merging; now only runs git cat-file once.

This commit is contained in:
Joey Hess 2011-11-12 17:45:12 -04:00
parent cea65b9e5b
commit 04edae6791
7 changed files with 62 additions and 36 deletions

View file

@ -149,7 +149,8 @@ update = onceonly $ do
- documentation advises users not to directly
- modify the branch.
-}
inRepo $ \g -> Git.UnionMerge.merge_index g branches
h <- catFileHandle
inRepo $ \g -> Git.UnionMerge.merge_index h g branches
ff <- if dirty then return False else tryFastForwardTo refs
unless ff $ inRepo $
Git.commit merge_desc fullname (nub $ fullname:refs)
@ -280,7 +281,7 @@ get' staleok file = fromcache =<< getCache file
fromjournal Nothing
| staleok = withIndex frombranch
| otherwise = withIndexUpdate $ frombranch >>= cache
frombranch = catFile fullname file
frombranch = L.unpack <$> catFile fullname file
cache content = do
setCache file content
return content

View file

@ -6,18 +6,25 @@
-}
module Annex.CatFile (
catFile
catFile,
catFileHandle
) where
import qualified Data.ByteString.Lazy.Char8 as L
import Common.Annex
import qualified Git.CatFile
import qualified Annex
catFile :: String -> FilePath -> Annex String
catFile branch file = maybe startup go =<< Annex.getState Annex.catfilehandle
catFile :: String -> FilePath -> Annex L.ByteString
catFile branch file = do
h <- catFileHandle
liftIO $ Git.CatFile.catFile h branch file
catFileHandle :: Annex Git.CatFile.CatFileHandle
catFileHandle = maybe startup return =<< Annex.getState Annex.catfilehandle
where
startup = do
h <- inRepo Git.CatFile.catFileStart
Annex.changeState $ \s -> s { Annex.catfilehandle = Just h }
go h
go h = liftIO $ Git.CatFile.catFile h branch file
return h

View file

@ -197,7 +197,7 @@ getKeysReferencedInGit ref = do
findkeys c (l:ls)
| isSymLink (LsTree.mode l) = do
content <- catFile ref $ LsTree.file l
case fileKey (takeFileName content) of
case fileKey (takeFileName $ L.unpack content) of
Nothing -> findkeys c ls
Just k -> findkeys (k:c) ls
| otherwise = findkeys c ls

View file

@ -9,13 +9,15 @@ module Git.CatFile (
CatFileHandle,
catFileStart,
catFileStop,
catFile
catFile,
catObject
) where
import Control.Monad.State
import System.Cmd.Utils
import System.IO
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Char8 as S
import qualified Data.ByteString.Lazy.Char8 as L
import Git
import Utility.SafeCommand
@ -34,30 +36,38 @@ catFileStop (pid, from, to) = do
hClose from
forceSuccess pid
{- Uses a running git cat-file read the content of a file from a branch.
- Files that do not exist on the branch will have "" returned. -}
catFile :: CatFileHandle -> String -> FilePath -> IO String
catFile (_, from, to) branch file = do
hPutStrLn to want
{- Reads a file from a specified branch. -}
catFile :: CatFileHandle -> String -> FilePath -> IO L.ByteString
catFile h branch file = catObject h (branch ++ ":" ++ file)
{- Uses a running git cat-file read the content of an object.
- Objects that do not exist will have "" returned. -}
catObject :: CatFileHandle -> String -> IO L.ByteString
catObject (_, from, to) object = do
hPutStrLn to object
hFlush to
header <- hGetLine from
case words header of
[sha, blob, size]
[sha, objtype, size]
| length sha == Git.shaSize &&
blob == "blob" -> handle size
validobjtype objtype -> handle size
| otherwise -> empty
_
| header == want ++ " missing" -> empty
| header == object ++ " missing" -> empty
| otherwise -> error $ "unknown response from git cat-file " ++ header
where
want = branch ++ ":" ++ file
handle size = case reads size of
[(bytes, "")] -> readcontent bytes
_ -> empty
readcontent bytes = do
content <- B.hGet from bytes
content <- S.hGet from bytes
c <- hGetChar from
when (c /= '\n') $
error "missing newline from git cat-file"
return $ B.unpack content
empty = return ""
return $ L.fromChunks [content]
empty = return L.empty
validobjtype t
| t == "blob" = True
| t == "commit" = True
| t == "tree" = True
| otherwise = False

View file

@ -21,6 +21,7 @@ import qualified Data.ByteString.Lazy.Char8 as L
import Common
import Git
import Git.CatFile
{- Performs a union merge between two branches, staging it in the index.
- Any previously staged changes in the index will be lost.
@ -30,14 +31,16 @@ import Git
merge :: String -> String -> Repo -> IO ()
merge x y repo = do
a <- ls_tree x repo
b <- merge_trees x y repo
h <- catFileStart repo
b <- merge_trees x y h repo
catFileStop h
update_index repo (a++b)
{- Merges a list of branches into the index. Previously staged changed in
- the index are preserved (and participate in the merge). -}
merge_index :: Repo -> [String] -> IO ()
merge_index repo bs =
update_index repo =<< concat <$> mapM (`merge_tree_index` repo) bs
merge_index :: CatFileHandle -> Repo -> [String] -> IO ()
merge_index h repo bs =
update_index repo =<< concat <$> mapM (\b -> merge_tree_index b h repo) bs
{- Feeds a list into update-index. Later items in the list can override
- earlier ones, so the list can be generated from any combination of
@ -60,22 +63,22 @@ ls_tree x = pipeNullSplit params
params = map Param ["ls-tree", "-z", "-r", "--full-tree", x]
{- For merging two trees. -}
merge_trees :: String -> String -> Repo -> IO [String]
merge_trees x y = calc_merge $ "diff-tree":diff_opts ++ [x, y]
merge_trees :: String -> String -> CatFileHandle -> Repo -> IO [String]
merge_trees x y h = calc_merge h $ "diff-tree":diff_opts ++ [x, y]
{- For merging a single tree into the index. -}
merge_tree_index :: String -> Repo -> IO [String]
merge_tree_index x = calc_merge $ "diff-index":diff_opts ++ ["--cached", x]
merge_tree_index :: String -> CatFileHandle -> Repo -> IO [String]
merge_tree_index x h = calc_merge h $ "diff-index":diff_opts ++ ["--cached", x]
diff_opts :: [String]
diff_opts = ["--raw", "-z", "-r", "--no-renames", "-l0"]
{- Calculates how to perform a merge, using git to get a raw diff,
- and returning a list suitable for update_index. -}
calc_merge :: [String] -> Repo -> IO [String]
calc_merge differ repo = do
calc_merge :: CatFileHandle -> [String] -> Repo -> IO [String]
calc_merge h differ repo = do
diff <- pipeNullSplit (map Param differ) repo
l <- mapM (\p -> mergeFile p repo) (pairs diff)
l <- mapM (\p -> mergeFile p h repo) (pairs diff)
return $ catMaybes l
where
pairs [] = []
@ -97,12 +100,12 @@ hashObject content repo = getSha subcmd $ do
{- Given an info line from a git raw diff, and the filename, generates
- a line suitable for update_index that union merges the two sides of the
- diff. -}
mergeFile :: (String, FilePath) -> Repo -> IO (Maybe String)
mergeFile (info, file) repo = case filter (/= nullsha) [asha, bsha] of
mergeFile :: (String, FilePath) -> CatFileHandle -> Repo -> IO (Maybe String)
mergeFile (info, file) h repo = case filter (/= nullsha) [asha, bsha] of
[] -> return Nothing
(sha:[]) -> return $ Just $ update_index_line sha file
shas -> do
content <- pipeRead (map Param ("show":shas)) repo
content <- L.concat <$> mapM (catObject h) shas
sha <- hashObject (unionmerge content) repo
return $ Just $ update_index_line sha file
where

1
debian/changelog vendored
View file

@ -6,6 +6,7 @@ git-annex (3.20111112) UNRELEASED; urgency=low
no longer needs to auto-merge.
* init: When run in an already initalized repository, and without
a description specified, don't delete the old description.
* Optimised union merging; now only runs git cat-file once.
-- Joey Hess <joeyh@debian.org> Sat, 12 Nov 2011 14:50:21 -0400

View file

@ -8,6 +8,10 @@ Instead, I'd like a way to stream multiple objects into git using stdin.
Sometime, should look at either extending git-hash-object to support that,
or possibly look at using git-fast-import instead.
---
`git-annex merge` also runs `git show` once per file that needs to be
merged. This could be reduced to a single call to `git-cat-file --batch`,
There is already a Git.CatFile library that can do this easily. --[[Joey]]
> This is now done, part above remains todo. --[[Joey]]