clean up git-remote-annex git-annex branch handling

Implemented alternateJournal, which git-remote-annex
uses to avoid any writes to the git-annex branch while setting up
a special remote from an annex:: url.

That prevents the remote.log from being overwritten with the special
remote configuration from the url, which might not be 100% the same as
the existing special remote configuration.

And it prevents an overwrite from deleting other stuff that was
already in the remote.log.

Also, when the branch was created by git-remote-annex, only delete it
at the end if nothing else has been written to it by another command.
This fixes the race condition described in
797f27ab05, where git-remote-annex
set up the branch and git-annex init and other commands were
run at the same time and their writes to the branch were lost.
This commit is contained in:
Joey Hess 2024-05-15 17:33:38 -04:00
parent d24d8870c5
commit adcebbae47
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 84 additions and 66 deletions

View file

@ -727,7 +727,8 @@ stageJournal :: JournalLocked -> Annex () -> Annex ()
stageJournal jl commitindex = withIndex $ withOtherTmp $ \tmpdir -> do
prepareModifyIndex jl
g <- gitRepo
let dir = gitAnnexJournalDir g
st <- getState
let dir = gitAnnexJournalDir st g
(jlogf, jlogh) <- openjlog (fromRawFilePath tmpdir)
withHashObjectHandle $ \h ->
withJournalHandle gitAnnexJournalDir $ \jh ->

View file

@ -7,7 +7,7 @@
- All files in the journal must be a series of lines separated by
- newlines.
-
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
- Copyright 2011-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -23,6 +23,8 @@ import qualified Git
import Annex.Perms
import Annex.Tmp
import Annex.LockFile
import Annex.BranchState
import Types.BranchState
import Utility.Directory.Stream
import qualified Utility.RawFilePath as R
@ -82,9 +84,10 @@ privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
-}
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
st <- getState
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
( return (gitAnnexPrivateJournalDir st)
, return (gitAnnexJournalDir st)
)
-- journal file is written atomically
let jfile = journalFile file
@ -106,9 +109,10 @@ newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
- branch. -}
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
checkCanAppendJournalFile _jl ru file = do
st <- getState
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
( return (gitAnnexPrivateJournalDir st)
, return (gitAnnexJournalDir st)
)
let jfile = jd P.</> journalFile file
ifM (liftIO $ R.doesPathExist jfile)
@ -176,14 +180,12 @@ data GetPrivate = GetPrivate Bool
-}
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
getJournalFileStale (GetPrivate getprivate) file = do
-- Optimisation to avoid a second MVar access.
st <- Annex.getState id
let g = Annex.repo st
liftIO $
if getprivate && privateUUIDsKnown' st
then do
x <- getfrom (gitAnnexJournalDir g)
getfrom (gitAnnexPrivateJournalDir g) >>= \case
x <- getfrom (gitAnnexJournalDir (Annex.branchstate st) (Annex.repo st))
getfrom (gitAnnexPrivateJournalDir (Annex.branchstate st) (Annex.repo st)) >>= \case
Nothing -> return $ case x of
Nothing -> NoJournalledContent
Just b -> JournalledContent b
@ -193,7 +195,7 @@ getJournalFileStale (GetPrivate getprivate) file = do
-- happens in a merge of two
-- git-annex branches.
Just x' -> x' <> y
else getfrom (gitAnnexJournalDir g) >>= return . \case
else getfrom (gitAnnexJournalDir (Annex.branchstate st) (Annex.repo st)) >>= return . \case
Nothing -> NoJournalledContent
Just b -> JournalledContent b
where
@ -219,18 +221,20 @@ discardIncompleteAppend v
{- List of existing journal files in a journal directory, but without locking,
- may miss new ones just being added, or may have false positives if the
- journal is staged as it is run. -}
getJournalledFilesStale :: (Git.Repo -> RawFilePath) -> Annex [RawFilePath]
getJournalledFilesStale :: (BranchState -> Git.Repo -> RawFilePath) -> Annex [RawFilePath]
getJournalledFilesStale getjournaldir = do
g <- gitRepo
fs <- liftIO $ catchDefaultIO [] $
getDirectoryContents $ fromRawFilePath (getjournaldir g)
st <- Annex.getState id
let d = getjournaldir (Annex.branchstate st) (Annex.repo st)
fs <- liftIO $ catchDefaultIO [] $
getDirectoryContents (fromRawFilePath d)
return $ filter (`notElem` [".", ".."]) $
map (fileJournal . toRawFilePath) fs
{- Directory handle open on a journal directory. -}
withJournalHandle :: (Git.Repo -> RawFilePath) -> (DirectoryHandle -> IO a) -> Annex a
withJournalHandle :: (BranchState -> Git.Repo -> RawFilePath) -> (DirectoryHandle -> IO a) -> Annex a
withJournalHandle getjournaldir a = do
d <- fromRepo getjournaldir
st <- Annex.getState id
let d = getjournaldir (Annex.branchstate st) (Annex.repo st)
bracket (opendir d) (liftIO . closeDirectory) (liftIO . a)
where
-- avoid overhead of creating the journal directory when it already
@ -239,9 +243,10 @@ withJournalHandle getjournaldir a = do
`catchIO` (const (createAnnexDirectory d >> opendir d))
{- Checks if there are changes in the journal. -}
journalDirty :: (Git.Repo -> RawFilePath) -> Annex Bool
journalDirty :: (BranchState -> Git.Repo -> RawFilePath) -> Annex Bool
journalDirty getjournaldir = do
d <- fromRawFilePath <$> fromRepo getjournaldir
st <- getState
d <- fromRawFilePath <$> fromRepo (getjournaldir st)
liftIO $
(not <$> isDirectoryEmpty d)
`catchIO` (const $ doesDirectoryExist d)

View file

@ -118,6 +118,7 @@ import Key
import Types.UUID
import Types.GitConfig
import Types.Difference
import Types.BranchState
import qualified Git
import qualified Git.Types as Git
import Git.FilePath
@ -528,15 +529,19 @@ gitAnnexTransferDir r =
{- .git/annex/journal/ is used to journal changes made to the git-annex
- branch -}
gitAnnexJournalDir :: Git.Repo -> RawFilePath
gitAnnexJournalDir r =
P.addTrailingPathSeparator $ gitAnnexDir r P.</> "journal"
gitAnnexJournalDir :: BranchState -> Git.Repo -> RawFilePath
gitAnnexJournalDir st r = P.addTrailingPathSeparator $
case alternateJournal st of
Nothing -> gitAnnexDir r P.</> "journal"
Just d -> d
{- .git/annex/journal.private/ is used to journal changes regarding private
- repositories. -}
gitAnnexPrivateJournalDir :: Git.Repo -> RawFilePath
gitAnnexPrivateJournalDir r =
P.addTrailingPathSeparator $ gitAnnexDir r P.</> "journal-private"
gitAnnexPrivateJournalDir :: BranchState -> Git.Repo -> RawFilePath
gitAnnexPrivateJournalDir st r = P.addTrailingPathSeparator $
case alternateJournal st of
Nothing -> gitAnnexDir r P.</> "journal-private"
Just d -> d
{- Lock file for the journal. -}
gitAnnexJournalLock :: Git.Repo -> RawFilePath

View file

@ -21,6 +21,7 @@ import qualified Git.Remote
import qualified Git.Remote.Remove
import qualified Annex.SpecialRemote as SpecialRemote
import qualified Annex.Branch
import qualified Annex.BranchState
import qualified Types.Remote as Remote
import qualified Logs.Remote
import Remote.Helper.Encryptable (parseEncryptionMethod)
@ -32,6 +33,7 @@ import Types.RemoteConfig
import Types.ProposedAccepted
import Types.Export
import Types.GitConfig
import Types.BranchState
import Types.Difference
import Types.Crypto
import Git.Types
@ -44,6 +46,7 @@ import Annex.SpecialRemote.Config
import Remote.List
import Remote.List.Util
import Utility.Tmp
import Utility.Tmp.Dir
import Utility.Env
import Utility.Metered
@ -485,10 +488,9 @@ withSpecialRemote cfg@(SpecialRemoteConfig {}) sab a = case specialRemoteName cf
| otherwise -> giveup $ "The uuid in the annex:: url does not match the uuid of the remote named " ++ remotename
-- When cloning from an annex:: url,
-- this is used to set up the origin remote.
Nothing -> (initremote remotename >>= a)
`finally` cleanupInitialization sab
Nothing -> inittempremote
`finally` cleanupInitialization sab
Nothing -> specialRemoteFromUrl sab
(initremote remotename >>= a)
Nothing -> specialRemoteFromUrl sab inittempremote
where
-- Initialize a new special remote with the provided configuration
-- and name.
@ -869,27 +871,48 @@ getRepo = getEnv "GIT_WORK_TREE" >>= \case
-- Records what the git-annex branch was at the beginning of this command.
data StartAnnexBranch
= AnnexBranchExistedAlready Ref
| AnnexBranchCreatedEmpty Ref
= AnnexBranchExistedAlready Sha
| AnnexBranchCreatedEmpty Sha
{- Captures the initial state of the git-annex branch. Meant to be run
 - early in the command.
 -
 - If the branch does not exist yet, it is created here. It is hard to
 - stop this command from creating the branch, so the sha of the created
 - branch is tracked instead, which allows cleaning it up later.
 -}
startAnnexBranch :: Annex StartAnnexBranch
startAnnexBranch = do
    -- Sibling branches are checked first, before getBranch is run.
    nosiblings <- null <$> Annex.Branch.siblingBranches
    let tag = if nosiblings
            then AnnexBranchCreatedEmpty
            else AnnexBranchExistedAlready
    tag <$> Annex.Branch.getBranch
-- This is run after git has used this process to fetch or push from a
-- special remote that was specified using a git-annex url. If the git
-- repository was not initialized for use by git-annex already, it is still
-- not initialized at this point.
-- Runs an action that sets up a special remote which was specified
-- using an annex url.
--
-- Setting up a special remote needs to write its config to the git-annex
-- branch, and using a special remote may write to the branch as well.
-- Here such writes need to be avoided, so that cleanupInitialization
-- can leave things in the right state.
--
-- To that end, commits to the git-annex branch are prevented, and all
-- journal writes are redirected to a temporary directory, so everything
-- the action writes to the git-annex branch is discarded.
specialRemoteFromUrl :: StartAnnexBranch -> Annex a -> Annex a
specialRemoteFromUrl sab a = withTmpDir "journal" $ \tmpdir -> do
    Annex.overrideGitConfig noautocommit
    Annex.BranchState.changeState (redirectjournal tmpdir)
    -- cleanupInitialization runs even when the action throws.
    a `finally` cleanupInitialization sab
  where
    noautocommit c = c { annexAlwaysCommit = False }
    redirectjournal d st = st { alternateJournal = Just (toRawFilePath d) }
-- If the git-annex branch did not exist when this command started,
-- the current contents of it were created in passing by this command,
-- which is hard to avoid. But if a git-annex branch is fetched from the
-- special remote and contains Differences, it would not be possible to
-- merge it into the git-annex branch that was created while running this
-- command. To avoid that problem, when the git-annex branch was created
-- at the start of this command, it's deleted.
-- it was created empty by this command, and this command has avoided
-- making any other commits to it. If nothing else has written to the
-- branch while this command was running, the branch will be deleted.
-- That allows for the git-annex branch that is fetched from the special
-- remote to contain Differences, which would prevent it from being merged
-- with the git-annex branch created by this command.
--
-- If there is still not a sibling git-annex branch, this deletes all annex
-- objects for git bundles from the annex objects directory, and deletes
@ -905,10 +928,11 @@ cleanupInitialization :: StartAnnexBranch -> Annex ()
cleanupInitialization sab = do
case sab of
AnnexBranchExistedAlready _ -> noop
AnnexBranchCreatedEmpty _ -> do
inRepo $ Git.Branch.delete Annex.Branch.fullname
indexfile <- fromRepo gitAnnexIndex
liftIO $ removeWhenExistsWith R.removeLink indexfile
AnnexBranchCreatedEmpty r ->
whenM ((r ==) <$> Annex.Branch.getBranch) $ do
inRepo $ Git.Branch.delete Annex.Branch.fullname
indexfile <- fromRepo gitAnnexIndex
liftIO $ removeWhenExistsWith R.removeLink indexfile
ifM Annex.Branch.hasSibling
( do
autoInitialize' (pure True) remoteList

View file

@ -36,7 +36,10 @@ data BranchState = BranchState
-- process need to be noticed while the current process is running?
-- (This makes the journal always be read, and avoids using the
-- cache.)
, alternateJournal :: Maybe RawFilePath
-- ^ use this directory for all journals, rather than the
-- gitAnnexJournalDir and gitAnnexPrivateJournalDir.
}
startBranchState :: BranchState
startBranchState = BranchState False False False [] [] [] False
startBranchState = BranchState False False False [] [] [] False Nothing

View file

@ -10,26 +10,6 @@ will be available to users who don't use datalad.
This is implemented and working. Remaining todo list for it:
* Cloning writes the new special remote config into remote.log,
and *deletes* other special remote configs.
The remote config from the url may be slightly different as well
than the existing one. Cloning should not write it.
* The race condition described in
[[!commit 797f27ab0517e0021363791ff269300f2ba095a5]]
where before git-annex init is run in a repo,
using git-remote-annex and at the same time git-annex init can lose
changes that the latter command (and ones after it) write to the
git-annex branch.
This should be fixable by making git-remote-annex not write to the
git-annex branch, but to eg, a temporary journal directory.
Also, when the remote uses importtree=yes, pushing to it updates
content identifiers, which currently get recorded in the git-annex
branch. It would be good to avoid that being written as well.
* Test incremental push edge cases involving checkprereq.
* Cloning from an annex:: url with importtree=yes doesn't work