log migration trees to git-annex branch

This will allow distributed migration: Start a migration in one clone of
a repo, and then update other clones.

commitMigration is a bit of a bear.. There is some inversion of control
that needs some TMVars. Also streamLogFile's finalizer does not handle
recording the trees, so an interrupt at just the wrong time can cause
migrate.log to be emptied but the git-annex branch not updated.

Sponsored-by: Graham Spencer on Patreon
This commit is contained in:
Joey Hess 2023-12-06 15:38:01 -04:00
parent b55efc179a
commit 0bd8b17b59
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
12 changed files with 219 additions and 43 deletions

View file

@ -19,6 +19,7 @@ module Annex.Ingest (
finishIngestUnlocked,
cleanOldKeys,
addSymlink,
genSymlink,
makeLink,
addUnlocked,
CheckGitIgnore(..),
@ -38,6 +39,7 @@ import Annex.MetaData
import Annex.CurrentBranch
import Annex.CheckIgnore
import Logs.Location
import qualified Git
import qualified Annex
import qualified Database.Keys
import Config
@ -320,9 +322,12 @@ makeLink file key mcache = flip catchNonAsync (restoreFile file key) $ do
{- Creates the symlink to the annexed content, and stages it in git. -}
addSymlink :: RawFilePath -> Key -> Maybe InodeCache -> Annex ()
addSymlink file key mcache = do
addSymlink file key mcache = stageSymlink file =<< genSymlink file key mcache
genSymlink :: RawFilePath -> Key -> Maybe InodeCache -> Annex Git.Sha
genSymlink file key mcache = do
linktarget <- makeLink file key mcache
stageSymlink file =<< hashSymlink linktarget
hashSymlink linktarget
{- Parameters to pass to git add, forcing addition of ignored files.
-

View file

@ -54,6 +54,8 @@ module Annex.Locations (
gitAnnexRestageLock,
gitAnnexAdjustedBranchUpdateLog,
gitAnnexAdjustedBranchUpdateLock,
gitAnnexMigrateLog,
gitAnnexMigrateLock,
gitAnnexMoveLog,
gitAnnexMoveLock,
gitAnnexExportDir,
@ -407,6 +409,13 @@ gitAnnexAdjustedBranchUpdateLog r = gitAnnexDir r P.</> "adjust.log"
gitAnnexAdjustedBranchUpdateLock :: Git.Repo -> RawFilePath
gitAnnexAdjustedBranchUpdateLock r = gitAnnexDir r P.</> "adjust.lck"
{- .git/annex/migrate.log is used to log migrations before committing them.
 - Logs.Migrate appends a record to it for each migration that is logged,
 - and streams it back when the migration is committed. -}
gitAnnexMigrateLog :: Git.Repo -> RawFilePath
gitAnnexMigrateLog r = gitAnnexDir r P.</> "migrate.log"
{- .git/annex/migrate.lck is the lock file that Logs.Migrate passes,
 - together with migrate.log, to appendLogFile and streamLogFile. -}
gitAnnexMigrateLock :: Git.Repo -> RawFilePath
gitAnnexMigrateLock r = gitAnnexDir r P.</> "migrate.lck"
{- .git/annex/move.log is used to log moves that are in progress,
- to better support resuming an interrupted move. -}
gitAnnexMoveLog :: Git.Repo -> RawFilePath

View file

@ -198,7 +198,7 @@ batchAnnexed fmt seeker keyaction = do
Right f -> lookupKeyStaged f >>= \case
Nothing -> return Nothing
Just k -> checkpresent k $
startAction seeker si f k
startAction seeker Nothing si f k
Left k -> ifM (matcher (MatchingInfo (mkinfo k)))
( checkpresent k $
keyaction (si, k, mkActionItem k)

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2011 Joey Hess <id@joeyh.name>
- Copyright 2011-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -15,6 +15,7 @@ import Annex.Content
import qualified Command.ReKey
import qualified Command.Fsck
import qualified Annex
import Logs.Migrate
import Logs.MetaData
import Logs.Web
import Utility.Metered
@ -39,17 +40,19 @@ optParser desc = MigrateOptions
)
seek :: MigrateOptions -> CommandSeek
seek o = withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
seek o = do
withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
commitMigration
where
ww = WarnUnmatchLsFiles "migrate"
seeker = AnnexedFileSeeker
{ startAction = const $ start o
{ startAction = start o
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: MigrateOptions -> SeekInput -> RawFilePath -> Key -> CommandStart
start o si file key = do
start :: MigrateOptions -> Maybe KeySha -> SeekInput -> RawFilePath -> Key -> CommandStart
start o ksha si file key = do
forced <- Annex.getRead Annex.force
v <- Backend.getBackend (fromRawFilePath file) key
case v of
@ -63,9 +66,12 @@ start o si file key = do
then go True oldbackend oldbackend
else stop
where
go onlyremovesize oldbackend newbackend =
go onlyremovesize oldbackend newbackend = do
keyrec <- case ksha of
Just (KeySha s) -> pure (MigrationRecord s)
Nothing -> error "internal"
starting "migrate" (mkActionItem (key, file)) si $
perform onlyremovesize o file key oldbackend newbackend
perform onlyremovesize o file key keyrec oldbackend newbackend
{- Checks if a key is upgradable to a newer representation.
-
@ -87,8 +93,8 @@ upgradableKey backend key = isNothing (fromKey keySize key) || backendupgradable
- data cannot get corrupted after the fsck but before the new key is
- generated.
-}
perform :: Bool -> MigrateOptions -> RawFilePath -> Key -> Backend -> Backend -> CommandPerform
perform onlyremovesize o file oldkey oldbackend newbackend = go =<< genkey (fastMigrate oldbackend)
perform :: Bool -> MigrateOptions -> RawFilePath -> Key -> MigrationRecord -> Backend -> Backend -> CommandPerform
perform onlyremovesize o file oldkey oldkeyrec oldbackend newbackend = go =<< genkey (fastMigrate oldbackend)
where
go Nothing = stop
go (Just (newkey, knowngoodcontent))
@ -104,7 +110,8 @@ perform onlyremovesize o file oldkey oldbackend newbackend = go =<< genkey (fast
urls <- getUrls oldkey
forM_ urls $ \url ->
setUrlPresent newkey url
next $ Command.ReKey.cleanup file newkey
next $ Command.ReKey.cleanup file newkey $
logMigration oldkeyrec
, giveup "failed creating link from old to new key"
)
genkey _ | onlyremovesize = return $ Just (oldkey, False)

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2012-2016 Joey Hess <id@joeyh.name>
- Copyright 2012-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -19,6 +19,7 @@ import Annex.ReplaceFile
import Logs.Location
import Annex.InodeSentinal
import Annex.WorkTree
import Logs.Migrate
import Utility.InodeCache
import qualified Utility.RawFilePath as R
@ -88,7 +89,7 @@ perform file oldkey newkey = do
giveup $ decodeBS $ quote qp $ QuotedPath file
<> " is not available (use --force to override)"
)
next $ cleanup file newkey
next $ cleanup file newkey $ const noop
{- Make a hard link to the old key content (when supported),
- to avoid wasting disk space. -}
@ -127,18 +128,23 @@ linkKey file oldkey newkey = ifM (isJust <$> isAnnexLink file)
LinkAnnexNoop -> True
)
cleanup :: RawFilePath -> Key -> CommandCleanup
cleanup file newkey = do
ifM (isJust <$> isAnnexLink file)
cleanup :: RawFilePath -> Key -> (MigrationRecord -> Annex ()) -> CommandCleanup
cleanup file newkey a = do
newkeyrec <- ifM (isJust <$> isAnnexLink file)
( do
-- Update symlink to use the new key.
addSymlink file newkey Nothing
sha <- genSymlink file newkey Nothing
stageSymlink file sha
return (MigrationRecord sha)
, do
mode <- liftIO $ catchMaybeIO $ fileMode <$> R.getFileStatus file
liftIO $ whenM (isJust <$> isPointerFile file) $
writePointerFile file newkey mode
stagePointerFile file mode =<< hashPointerFile newkey
sha <- hashPointerFile newkey
stagePointerFile file mode sha
return (MigrationRecord sha)
)
whenM (inAnnex newkey) $
logStatus newkey InfoPresent
a newkeyrec
return True

View file

@ -23,6 +23,7 @@ module Git.Tree (
graftTree',
withMkTreeHandle,
MkTreeHandle,
mkTree,
treeMode,
) where
@ -91,28 +92,39 @@ recordTree :: Tree -> Repo -> IO Sha
recordTree t repo = withMkTreeHandle repo $ \h -> recordTree' h t
recordTree' :: MkTreeHandle -> Tree -> IO Sha
recordTree' h (Tree l) = mkTree h =<< mapM (recordSubTree h) l
recordTree' h (Tree l) = mkTree' h =<< mapM (recordSubTree h) l
{- Note that the returned RecordedSubTree does not have its [TreeContent]
- list populated. This is a memory optimisation, since the list is not
- used. -}
recordSubTree :: MkTreeHandle -> TreeContent -> IO TreeContent
recordSubTree h (NewSubTree d l) = do
sha <- mkTree h =<< mapM (recordSubTree h) l
sha <- mkTree' h =<< mapM (recordSubTree h) l
return (RecordedSubTree d sha [])
recordSubTree _ alreadyrecorded = return alreadyrecorded
mkTree :: MkTreeHandle -> [TreeContent] -> IO Sha
mkTree (MkTreeHandle cp) l = CoProcess.query cp send receive
{- Note that this creates a single tree. It cannot create a recursive tree
- with subtrees in a single call. -}
mkTree
:: (MonadIO m, MonadCatch m)
=> MkTreeHandle
-> ((FileMode -> ObjectType -> Sha -> TopFilePath -> m ()) -> m ())
-> m Sha
mkTree (MkTreeHandle cp) a = CoProcess.query cp send receive
where
send h = do
forM_ l $ \i -> hPutStr h $ case i of
TreeBlob f fm s -> mkTreeOutput fm BlobObject s f
RecordedSubTree f s _ -> mkTreeOutput treeMode TreeObject s f
a $ \fm ot s f -> liftIO $ hPutStr h (mkTreeOutput fm ot s f)
-- NUL to signal end of tree to --batch
liftIO $ hPutStr h "\NUL"
receive h = liftIO $ getSha "mktree" (S8.hGetLine h)
mkTree' :: MkTreeHandle -> [TreeContent] -> IO Sha
mkTree' h l = mkTree h $ \send ->
forM_ l $ \case
TreeBlob f fm s -> send fm BlobObject s f
RecordedSubTree f s _ -> send treeMode TreeObject s f
NewSubTree _ _ -> error "recordSubTree internal error; unexpected NewSubTree"
TreeCommit f fm s -> mkTreeOutput fm CommitObject s f
hPutStr h "\NUL" -- signal end of tree to --batch
receive h = getSha "mktree" (S8.hGetLine h)
TreeCommit f fm s -> send fm CommitObject s f
treeMode :: FileMode
treeMode = 0o040000
@ -221,7 +233,7 @@ adjustTree adjusttreeitem addtreeitems resolveaddconflict removefiles r repo =
(l, cleanup) <- liftIO $ lsTreeWithObjects LsTree.LsTreeRecursive r repo
(l', _, _) <- go h False [] 1 inTopTree l
l'' <- adjustlist h 0 inTopTree (const True) l'
sha <- liftIO $ mkTree h l''
sha <- liftIO $ mkTree' h l''
void $ liftIO cleanup
return sha
where
@ -332,7 +344,7 @@ graftTree' subtree graftloc basetree repo hdl = go basetree subdirs graftdirs
return $ RecordedSubTree tloc tsha'' []
_ -> graftin (topmostgraphdir:restgraphdirs)
return (newshas ++ rest)
mkTree hdl t'
mkTree' hdl t'
go _ _ [] = return subtree
go _ [] _ = return subtree

View file

@ -1,6 +1,6 @@
{- git-annex log file names
-
- Copyright 2013-2021 Joey Hess <id@joeyh.name>
- Copyright 2013-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -156,6 +156,11 @@ exportLog = "export.log"
exportTreeGraftPoint :: RawFilePath
exportTreeGraftPoint = "export.tree"
{- This is not a log file, it's where migration treeishes get grafted into
 - the git-annex branch. Logs.Migrate's commitMigration passes this path to
 - Annex.Branch.rememberTreeish along with the tree it recorded. -}
migrationTreeGraftPoint :: RawFilePath
migrationTreeGraftPoint = "migrate.tree"
{- The pathname of the location log file for a given key. -}
locationLogFile :: GitConfig -> Key -> RawFilePath
locationLogFile config key =

102
Logs/Migrate.hs Normal file
View file

@ -0,0 +1,102 @@
{- git-annex migration logs
-
- To record a migration in the git-annex branch as space efficiently as
- possible, it is stored as a tree which contains two subtrees 'old' and 'new'.
- The subtrees each contain the same filenames, which point to the old
- and new keys respectively.
-
- When the user commits the migrated files to their HEAD branch, that will
- store pointers to the new keys in git. And pointers to the old keys
- already exist in git. So recording the migration this way avoids
- injecting any new objects into git, besides the two trees. Note that for
- this to be the case, care has to be taken to record the migration
- using the same symlink targets or pointer file contents as are used in
- the HEAD branch.
-
- The filenames used in the trees are not the original filenames, to avoid
- running migrate in a throwaway branch unexpectedly recording that
- branch's contents.
-
- Copyright 2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings, BangPatterns #-}
module Logs.Migrate (
MigrationRecord(..),
logMigration,
commitMigration,
) where
import Annex.Common
import qualified Git
import qualified Annex
import qualified Annex.Branch
import Git.Types
import Git.Tree
import Git.FilePath
import Logs.File
import Logs
import qualified Data.ByteString.Lazy as L
import Control.Concurrent.STM
-- | What to use to record a migration. This should be the same Sha that is
-- used as the content of the annexed file in the HEAD branch, that is, the
-- hash of the symlink target or of the pointer file contents.
newtype MigrationRecord = MigrationRecord { fromMigrationRecord :: Git.Sha }
-- | Records that a file was migrated from an old key to a new key.
--
-- The record is only appended to the migrate.log file here; committing it
-- to the git-annex branch happens separately. Keeping the two steps apart
-- lets an interrupted migration be resumed later.
--
-- Each record holds the old sha and the new sha, separated by a space.
logMigration :: MigrationRecord -> MigrationRecord -> Annex ()
logMigration old new = do
    logfile <- fromRepo gitAnnexMigrateLog
    lockfile <- fromRepo gitAnnexMigrateLock
    appendLogFile logfile lockfile (L.fromStrict record)
  where
    record = shaOf old <> " " <> shaOf new
    shaOf r = Git.fromRef' (fromMigrationRecord r)
-- | Commits a migration to the git-annex branch.
--
-- Reads the records that 'logMigration' wrote to migrate.log and turns
-- them into two git trees, one listing the old shas and one listing the
-- new shas, using the same filenames in both. When at least one record
-- was processed, a tree holding them as subtrees "old" and "new" is
-- recorded and handed to 'Annex.Branch.rememberTreeish' together with
-- 'migrationTreeGraftPoint'. When no record was processed, nothing is
-- recorded.
--
-- Both trees are fed from a single pass over the log, so the mkTree call
-- for the new tree runs nested inside the send callback of the mkTree
-- call for the old tree. The new tree's sha is thus only available inside
-- that callback, and is passed back out through a TMVar.
commitMigration :: Annex ()
commitMigration = do
logf <- fromRawFilePath <$> fromRepo gitAnnexMigrateLog
lckf <- fromRepo gitAnnexMigrateLock
-- Number of log records processed so far. Also the source of the
-- filenames used inside the trees.
nv <- liftIO $ newTVarIO (0 :: Integer)
-- Carries the sha of the new tree out of the nested callback below.
newtv <- liftIO newEmptyTMVarIO
g <- Annex.gitRepo
oldt <- withMkTreeHandle g $ \oldh ->
withMkTreeHandle g $ \newh ->
mkTree oldh $ \oldsend -> do
newt <- mkTree newh $ \newsend ->
-- NOTE(review): noop appears to be what is passed as
-- streamLogFile's finalizer, so nothing records the trees if
-- this is interrupted after the log has been consumed.
-- Confirm against Logs.File.
streamLogFile logf lckf noop $
processor nv oldsend newsend
liftIO $ atomically $ writeTMVar newtv newt
newt <- liftIO $ atomically $ takeTMVar newtv
n <- liftIO $ atomically $ readTVar nv
when (n > 0) $ do
treesha <- liftIO $ flip recordTree g $ Tree
[ RecordedSubTree (asTopFilePath "old") oldt []
, RecordedSubTree (asTopFilePath "new") newt []
]
Annex.Branch.rememberTreeish treesha
(asTopFilePath migrationTreeGraftPoint)
where
-- Handles one log record, which must consist of exactly two
-- whitespace-separated words: the old sha followed by the new sha.
-- Anything else is treated as a parse error.
processor nv oldsend newsend s = case words s of
(old:new:[]) -> do
-- Increment the counter and use its new value, shown in decimal,
-- as the filename shared by the entries in both trees.
fn <- liftIO $ atomically $ do
n <- readTVar nv
let !n' = succ n
writeTVar nv n'
return (asTopFilePath (encodeBS (show n')))
-- Sends a blob entry named fn for the sha r, using the send
-- action f of one of the two trees.
let rec f r = f
(fromTreeItemType TreeFile)
BlobObject
(Git.Ref (encodeBS r))
fn
void $ rec oldsend old
void $ rec newsend new
_ -> error "migrate.log parse error"

View file

@ -19,6 +19,7 @@ module Utility.CoProcess (
import Common
import Control.Concurrent.MVar
import Control.Monad.IO.Class (MonadIO)
type CoProcessHandle = MVar CoProcessState
@ -65,11 +66,11 @@ stop ch = do
{- To handle a restartable process, any IO exception thrown by the send and
- receive actions are assumed to mean communication with the process
- failed, and the failed action is re-run with a new process. -}
query :: CoProcessHandle -> (Handle -> IO a) -> (Handle -> IO b) -> IO b
query :: (MonadIO m, MonadCatch m) => CoProcessHandle -> (Handle -> m a) -> (Handle -> m b) -> m b
query ch send receive = do
s <- readMVar ch
s <- liftIO $ readMVar ch
restartable s (send $ coProcessTo s) $ const $
restartable s (hFlush $ coProcessTo s) $ const $
restartable s (liftIO $ hFlush $ coProcessTo s) $ const $
restartable s (receive $ coProcessFrom s)
return
where
@ -78,12 +79,12 @@ query ch send receive = do
maybe restart cont =<< catchMaybeIO a
| otherwise = cont =<< a
restart = do
s <- takeMVar ch
void $ catchMaybeIO $ do
s <- liftIO $ takeMVar ch
void $ liftIO $ catchMaybeIO $ do
hClose $ coProcessTo s
hClose $ coProcessFrom s
void $ waitForProcess $ coProcessPid s
s' <- start' $ (coProcessSpec s)
void $ liftIO $ waitForProcess $ coProcessPid s
s' <- liftIO $ start' $ (coProcessSpec s)
{ coProcessNumRestarts = coProcessNumRestarts (coProcessSpec s) - 1 }
putMVar ch s'
liftIO $ putMVar ch s'
query ch send receive

View file

@ -346,3 +346,14 @@ Example:
## `multicast.log`
Records uftp public key fingerprints, for use by [[git-annex-multicast]].
## `migrate.tree/old` and `migrate.tree/new`
These are used to record migrations done by `git-annex migrate`. By diffing
between the two, the old and new keys can be determined. This lets
migrations be recorded while using a minimum of space in the git
repository. The filenames in these trees have no connection to the names
of actual annexed files.
These trees are recorded in the history of the git-annex branch, but the
head of the git-annex branch will never contain them.

View file

@ -0,0 +1,17 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2023-12-06T19:31:42Z"
content="""
On the `distributedmigration` branch I have `git-annex migrate` recording
migrations on the git-annex branch.
Its method of grafting in 2 trees, one with the old keys and one with the
new, is quite efficient. In a migration of 1000 files from SHA256E to SHA1,
the git objects only need 52kb to record the migration trees,
compared with 424kb needed to update the location logs.
The total git repo grew from 508kb to 984kb.
Next up: Make `git-annex migrate --update` find new migrations started
elsewhere and apply them to the local annex objects.
"""]]

View file

@ -821,6 +821,7 @@ Executable git-annex
Logs.MapLog
Logs.MetaData
Logs.MetaData.Pure
Logs.Migrate
Logs.Multicast
Logs.NumCopies
Logs.PreferredContent