git-annex/Annex/Branch.hs

385 lines
12 KiB
Haskell
Raw Normal View History

2011-06-21 20:08:09 +00:00
{- management of the git-annex branch
-
- Copyright 2011 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
2011-10-04 04:40:47 +00:00
module Annex.Branch (
create,
2011-06-21 20:08:09 +00:00
update,
disableUpdate,
get,
change,
2011-06-22 21:47:06 +00:00
commit,
files,
refExists,
hasOrigin,
hasSomeBranch,
name
2011-06-21 20:08:09 +00:00
) where
import System.IO.Binary
import System.Exit
import qualified Data.ByteString.Lazy.Char8 as L
2011-10-05 20:02:51 +00:00
import Common.Annex
import Annex.Exception
import Types.BranchState
import qualified Git
import qualified Git.UnionMerge
import qualified Annex
2011-10-04 04:40:47 +00:00
import Annex.CatFile
2011-06-28 18:14:49 +00:00
type GitRef = String
{- Name of the branch that is used to store git-annex's information. -}
2011-06-28 18:14:49 +00:00
name :: GitRef
name = "git-annex"
{- Fully qualified name of the branch. -}
2011-06-28 18:14:49 +00:00
fullname :: GitRef
fullname = "refs/heads/" ++ name
{- Branch's name in origin. -}
2011-06-28 18:14:49 +00:00
originname :: GitRef
originname = "origin/" ++ name
{- A separate index file for the branch. -}
index :: Git.Repo -> FilePath
2011-06-23 16:11:09 +00:00
index g = gitAnnexDir g </> "index"
{- Populates the branch's index file with the current branch contents.
-
- Usually, this is only done when the index doesn't yet exist, and
- the index is used to build up changes to be commited to the branch,
- and merge in changes from other branches.
-}
genIndex :: Git.Repo -> IO ()
genIndex g = Git.UnionMerge.ls_tree fullname g >>= Git.UnionMerge.update_index g
{- Runs an action using the branch's index file. -}
withIndex :: Annex a -> Annex a
withIndex = withIndex' False
withIndex' :: Bool -> Annex a -> Annex a
withIndex' bootstrapping a = do
f <- fromRepo $ index
bracketIO (Git.useIndex f) id $ do
unlessM (liftIO $ doesFileExist f) $ do
unless bootstrapping create
liftIO $ createDirectoryIfMissing True $ takeDirectory f
unless bootstrapping $ inRepo genIndex
a
withIndexUpdate :: Annex a -> Annex a
withIndexUpdate a = update >> withIndex a
getState :: Annex BranchState
getState = Annex.getState Annex.branchstate
setState :: BranchState -> Annex ()
setState state = Annex.changeState $ \s -> s { Annex.branchstate = state }
setCache :: FilePath -> String -> Annex ()
setCache file content = do
state <- getState
setState state { cachedFile = Just file, cachedContent = content }
invalidateCache :: Annex ()
invalidateCache = do
state <- getState
setState state { cachedFile = Nothing, cachedContent = "" }
getCache :: FilePath -> Annex (Maybe String)
getCache file = getState >>= go
where
go state
| cachedFile state == Just file =
return $ Just $ cachedContent state
| otherwise = return Nothing
{- Creates the branch, if it does not already exist. -}
create :: Annex ()
create = unlessM hasBranch $ do
2011-06-28 18:14:49 +00:00
e <- hasOrigin
if e
then inRepo $ Git.run "branch" [Param name, Param originname]
2011-06-28 18:14:49 +00:00
else withIndex' True $
inRepo $ Git.commit "branch created" fullname []
{- Stages the journal, and commits staged changes to the branch. -}
commit :: String -> Annex ()
commit message = whenM journalDirty $ lockJournal $ do
stageJournalFiles
withIndex $ inRepo $ Git.commit message fullname [fullname]
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
{- Ensures that the branch is up-to-date; should be called before data is
- read from it. Runs only once per git-annex run.
-
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
- Before refs are merged into the index, it's important to first stage the
- journal into the index. Otherwise, any changes in the journal would
- later get staged, and might overwrite changes made during the merge.
-
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
- It would be cleaner to handle the merge by updating the journal, not the
- index, with changes from the branches.
-
- The index is always updated using a union merge, as that's the most
- efficient way to update it. However, if the branch can be
- fast-forwarded, that is then done, rather than adding an unnecessary
- commit to it.
-}
update :: Annex ()
update = onceonly $ do
-- check what needs updating before taking the lock
dirty <- journalDirty
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
c <- filterM (changedBranch name . snd) =<< siblingBranches
let (refs, branches) = unzip c
unless (not dirty && null refs) $ withIndex $ lockJournal $ do
when dirty stageJournalFiles
unless (null branches) $ do
showSideAction $ "merging " ++
(unwords $ map Git.refDescribe branches) ++
" into " ++ name
{- Note: This merges the branches into the index.
- Any unstaged changes in the git-annex branch
- (if it's checked out) will be removed. So,
- documentation advises users not to directly
- modify the branch.
-}
inRepo $ \g -> Git.UnionMerge.merge_index g branches
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
ff <- if dirty then return False else tryFastForwardTo refs
unless ff $ inRepo $
Git.commit "update" fullname (nub $ fullname:refs)
invalidateCache
where
onceonly a = unlessM (branchUpdated <$> getState) $ do
r <- a
disableUpdate
return r
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
{- Checks if the second branch has any commits not present on the first
- branch. -}
changedBranch :: String -> String -> Annex Bool
changedBranch origbranch newbranch = not . L.null <$> diffs
where
diffs = inRepo $ Git.pipeRead
[ Param "log"
, Param (origbranch ++ ".." ++ newbranch)
, Params "--oneline -n1"
]
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
{- Given a set of refs that are all known to have commits not
- on the git-annex branch, tries to update the branch by a
- fast-forward.
-
- In order for that to be possible, one of the refs must contain
- every commit present in all the other refs, as well as in the
- git-annex branch.
-}
tryFastForwardTo :: [String] -> Annex Bool
tryFastForwardTo [] = return True
tryFastForwardTo (first:rest) = do
-- First, check that the git-annex branch does not contain any
-- new commits that are in the first other branch. If it does,
-- cannot fast-forward.
diverged <- changedBranch first fullname
if diverged
then no_ff
else maybe no_ff do_ff =<< findbest first rest
where
no_ff = return False
do_ff branch = do
inRepo $ Git.run "update-ref" [Param fullname, Param branch]
merge: Use fast-forward merges when possible. Thanks Valentin Haenel for a test case showing how non-fast-forward merges could result in an ongoing pull/merge/push cycle. While the git-annex branch is fast-forwarded, git-annex's index file is still updated using the union merge strategy as before. There's no other way to update the index that would be any faster. It is possible that a union merge and a fast-forward result in different file contents: Files should have the same lines, but a union merge may change their order. If this happens, the next commit made to the git-annex branch will have some unnecessary changes to line orders, but the consistency of data should be preserved. Note that when the journal contains changes, a fast-forward is never attempted, which is fine, because committing those changes would be vanishingly unlikely to leave the git-annex branch at a commit that already exists in one of the remotes. The real difficulty is handling the case where multiple remotes have all changed. git-annex does find the best (ie, newest) one and fast forwards to it. If the remotes are diverged, no fast-forward is done at all. It would be possible to pick one, fast forward to it, and make a merge commit to the rest, I see no benefit to adding that complexity. Determining the best of N changed remotes requires N*2+1 calls to git-log, but these are fast git-log calls, and N is typically small. Also, typically some or all of the remote refs will be the same, and git-log is not called to compare those. In the real world I expect this will almost always add only 1 git-log call to the merge process. (Which already makes N anyway.)
2011-11-06 19:18:45 +00:00
return True
findbest c [] = return $ Just c
findbest c (r:rs)
| c == r = findbest c rs
| otherwise = do
better <- changedBranch c r
worse <- changedBranch r c
case (better, worse) of
(True, True) -> return Nothing -- divergent fail
(True, False) -> findbest r rs -- better
(False, True) -> findbest c rs -- worse
(False, False) -> findbest c rs -- same
2011-10-27 22:01:46 +00:00
{- Avoids updating the branch. A useful optimisation when the branch
- is known to have not changed, or git-annex won't be relying on info
- from it. -}
disableUpdate :: Annex ()
disableUpdate = Annex.changeState setupdated
where
setupdated s = s { Annex.branchstate = new }
where
new = old { branchUpdated = True }
old = Annex.branchstate s
{- Checks if a git ref exists. -}
2011-06-28 18:14:49 +00:00
refExists :: GitRef -> Annex Bool
refExists ref = inRepo $ Git.runBool "show-ref"
[Param "--verify", Param "-q", Param ref]
{- Does the main git-annex branch exist? -}
hasBranch :: Annex Bool
hasBranch = refExists fullname
{- Does origin/git-annex exist? -}
hasOrigin :: Annex Bool
hasOrigin = refExists originname
{- Does the git-annex branch or a foo/git-annex branch exist? -}
hasSomeBranch :: Annex Bool
hasSomeBranch = not . null <$> siblingBranches
{- List of git-annex (refs, branches), including the main one and any
- from remotes. Duplicate refs are filtered out. -}
siblingBranches :: Annex [(String, String)]
siblingBranches = do
r <- inRepo $ Git.pipeRead [Param "show-ref", Param name]
return $ nubBy uref $ map (pair . words . L.unpack) (L.lines r)
where
pair l = (head l, last l)
uref (a, _) (b, _) = a == b
{- Applies a function to modifiy the content of a file. -}
change :: FilePath -> (String -> String) -> Annex ()
change file a = lockJournal $ get file >>= return . a >>= set file
{- Records new content of a file into the journal. -}
set :: FilePath -> String -> Annex ()
set file content = do
setJournalFile file content
setCache file content
{- Gets the content of a file on the branch, or content from the journal, or
- staged in the index.
-
- Returns an empty string if the file doesn't exist yet. -}
get :: FilePath -> Annex String
2011-10-10 21:37:44 +00:00
get file = fromcache =<< getCache file
where
fromcache (Just content) = return content
fromcache Nothing = fromjournal =<< getJournalFile file
fromjournal (Just content) = cache content
fromjournal Nothing = withIndexUpdate $
cache =<< catFile fullname file
cache content = do
setCache file content
return content
{- Lists all files on the branch. There may be duplicates in the list. -}
files :: Annex [FilePath]
files = withIndexUpdate $ do
bfiles <- inRepo $ Git.pipeNullSplit
[Params "ls-tree --name-only -r -z", Param fullname]
jfiles <- getJournalledFiles
return $ jfiles ++ bfiles
{- Records content for a file in the branch to the journal.
-
- Using the journal, rather than immediatly staging content to the index
- avoids git needing to rewrite the index after every change. -}
setJournalFile :: FilePath -> String -> Annex ()
setJournalFile file content = do
g <- gitRepo
liftIO $ doRedo (write g) $ do
createDirectoryIfMissing True $ gitAnnexJournalDir g
createDirectoryIfMissing True $ gitAnnexTmpDir g
where
-- journal file is written atomically
write g = do
let jfile = journalFile g file
let tmpfile = gitAnnexTmpDir g </> takeFileName jfile
writeBinaryFile tmpfile content
renameFile tmpfile jfile
{- Gets any journalled content for a file in the branch. -}
getJournalFile :: FilePath -> Annex (Maybe String)
getJournalFile file = do
g <- gitRepo
liftIO $ catch (liftM Just . readFileStrict $ journalFile g file)
(const $ return Nothing)
{- List of files that have updated content in the journal. -}
getJournalledFiles :: Annex [FilePath]
getJournalledFiles = map fileJournal <$> getJournalFiles
{- List of existing journal files. -}
getJournalFiles :: Annex [FilePath]
getJournalFiles = do
g <- gitRepo
fs <- liftIO $ catch (getDirectoryContents $ gitAnnexJournalDir g)
(const $ return [])
2011-09-21 03:26:35 +00:00
return $ filter (`notElem` [".", ".."]) fs
{- Stages the specified journalfiles. -}
stageJournalFiles :: Annex ()
stageJournalFiles = do
fs <- getJournalFiles
g <- gitRepo
withIndex $ liftIO $ do
let dir = gitAnnexJournalDir g
let paths = map (dir </>) fs
-- inject all the journal files directly into git
-- in one quick command
2011-10-10 21:37:44 +00:00
(pid, fromh, toh) <- hPipeBoth "git" $ toCommand $ git_hash_object g
_ <- forkProcess $ do
hPutStr toh $ unlines paths
hClose toh
exitSuccess
hClose toh
s <- hGetContents fromh
-- update the index, also in just one command
Git.UnionMerge.update_index g $
index_lines (lines s) $ map fileJournal fs
hClose fromh
forceSuccess pid
mapM_ removeFile paths
where
index_lines shas = map genline . zip shas
genline (sha, file) = Git.UnionMerge.update_index_line sha file
git_hash_object g = Git.gitCommandLine
[Param "hash-object", Param "-w", Param "--stdin-paths"] g
2011-10-10 21:37:44 +00:00
{- Checks if there are changes in the journal. -}
journalDirty :: Annex Bool
journalDirty = not . null <$> getJournalFiles
{- Produces a filename to use in the journal for a file on the branch.
-
- The journal typically won't have a lot of files in it, so the hashing
- used in the branch is not necessary, and all the files are put directly
- in the journal directory.
-}
journalFile :: Git.Repo -> FilePath -> FilePath
journalFile repo file = gitAnnexJournalDir repo </> concatMap mangle file
where
mangle '/' = "_"
mangle '_' = "__"
mangle c = [c]
{- Converts a journal file (relative to the journal dir) back to the
- filename on the branch. -}
fileJournal :: FilePath -> FilePath
fileJournal = replace "//" "_" . replace "_" "/"
{- Runs an action that modifies the journal, using locking to avoid
- contention with other git-annex processes. -}
lockJournal :: Annex a -> Annex a
lockJournal a = do
file <- fromRepo $ gitAnnexJournalLock
bracketIO (lock file) unlock a
2011-10-03 22:20:29 +00:00
where
lock file = do
l <- doRedo (createFile file stdFileMode) $
createDirectoryIfMissing True $ takeDirectory file
2011-10-03 22:20:29 +00:00
waitToSetLock l (WriteLock, AbsoluteSeek, 0, 0)
return l
unlock = closeFd
{- Runs an action, catching failure and running something to fix it up, and
- retrying if necessary. -}
doRedo :: IO a -> IO b -> IO a
doRedo a b = catch a $ const $ b >> a