started migrate --update

This is most of the way there, but not quite working.

The layout of migrate.tree/ needs to be changed to follow this approach.
git log will list all the files in tree order, so the new layout needs
to alternate old and new keys. Can that be done? git may not document
tree order, or may not preserve it here.

Alternatively, change to using git log --format=raw and extract
the tree header from that, then use
git diff --raw $tree:migrate.tree/old $tree:migrate.tree/new
That will be a little more expensive, but only when there are lots of
migrations.

Sponsored-by: Joshua Antonishen on Patreon
This commit is contained in:
Joey Hess 2023-12-07 15:50:52 -04:00
parent d06aee7ce0
commit f1ce15036f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 172 additions and 38 deletions

View file

@ -14,6 +14,7 @@ module Annex.Branch (
hasSibling, hasSibling,
siblingBranches, siblingBranches,
create, create,
getBranch,
UpdateMade(..), UpdateMade(..),
update, update,
forceUpdate, forceUpdate,
@ -120,7 +121,7 @@ siblingBranches = inRepo $ Git.Ref.matchingUniq [name]
create :: Annex () create :: Annex ()
create = void getBranch create = void getBranch
{- Returns the ref of the branch, creating it first if necessary. -} {- Returns the sha of the branch, creating it first if necessary. -}
getBranch :: Annex Git.Ref getBranch :: Annex Git.Ref
getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
where where
@ -920,10 +921,14 @@ getMergedRefs' = do
{- Grafts a treeish into the branch at the specified location, {- Grafts a treeish into the branch at the specified location,
- and then removes it. This ensures that the treeish won't get garbage - and then removes it. This ensures that the treeish won't get garbage
- collected, and will always be available as long as the git-annex branch - collected, and will always be available as long as the git-annex branch
- is available. -} - is available.
rememberTreeish :: Git.Ref -> TopFilePath -> Annex () -
rememberTreeish treeish graftpoint = lockJournal $ rememberTreeishLocked treeish graftpoint - Returns the sha of the git commit made to the git-annex branch.
rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex () -}
rememberTreeish :: Git.Ref -> TopFilePath -> Annex Git.Sha
rememberTreeish treeish graftpoint = lockJournal $
rememberTreeishLocked treeish graftpoint
rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex Git.Sha
rememberTreeishLocked treeish graftpoint jl = do rememberTreeishLocked treeish graftpoint jl = do
branchref <- getBranch branchref <- getBranch
updateIndex jl branchref updateIndex jl branchref
@ -940,6 +945,7 @@ rememberTreeishLocked treeish graftpoint jl = do
-- and the index was updated to that above, so it's safe to -- and the index was updated to that above, so it's safe to
-- say that the index contains c'. -- say that the index contains c'.
setIndexSha c' setIndexSha c'
return c'
{- Runs an action on the content of selected files from the branch. {- Runs an action on the content of selected files from the branch.
- This is much faster than reading the content of each file in turn, - This is much faster than reading the content of each file in turn,

View file

@ -56,6 +56,8 @@ module Annex.Locations (
gitAnnexAdjustedBranchUpdateLock, gitAnnexAdjustedBranchUpdateLock,
gitAnnexMigrateLog, gitAnnexMigrateLog,
gitAnnexMigrateLock, gitAnnexMigrateLock,
gitAnnexMigrationsLog,
gitAnnexMigrationsLock,
gitAnnexMoveLog, gitAnnexMoveLog,
gitAnnexMoveLock, gitAnnexMoveLock,
gitAnnexExportDir, gitAnnexExportDir,
@ -416,6 +418,13 @@ gitAnnexMigrateLog r = gitAnnexDir r P.</> "migrate.log"
gitAnnexMigrateLock :: Git.Repo -> RawFilePath gitAnnexMigrateLock :: Git.Repo -> RawFilePath
gitAnnexMigrateLock r = gitAnnexDir r P.</> "migrate.lck" gitAnnexMigrateLock r = gitAnnexDir r P.</> "migrate.lck"
{- .git/annex/migrations.log is used to log committed migrations. -}
gitAnnexMigrationsLog :: Git.Repo -> RawFilePath
gitAnnexMigrationsLog r = gitAnnexDir r P.</> "migrations.log"
gitAnnexMigrationsLock :: Git.Repo -> RawFilePath
gitAnnexMigrationsLock r = gitAnnexDir r P.</> "migrations.lck"
{- .git/annex/move.log is used to log moves that are in progress, {- .git/annex/move.log is used to log moves that are in progress,
- to better support resuming an interrupted move. -} - to better support resuming an interrupted move. -}
gitAnnexMoveLog :: Git.Repo -> RawFilePath gitAnnexMoveLog :: Git.Repo -> RawFilePath

View file

@ -1,5 +1,7 @@
git-annex (10.20231130) UNRELEASED; urgency=medium git-annex (10.20231130) UNRELEASED; urgency=medium
* migrate: Support distributed migrations by recording each migration,
and adding a --update option that updates the local repository.
* Make git-annex get/copy/move --from foo override configuration of * Make git-annex get/copy/move --from foo override configuration of
remote.foo.annex-ignore, as documented. remote.foo.annex-ignore, as documented.
* Support git-annex copy/move --from-anywhere --to remote. * Support git-annex copy/move --from-anywhere --to remote.

View file

@ -290,8 +290,9 @@ getKeyLog key os = do
getGitLogAnnex :: [FilePath] -> [CommandParam] -> Annex ([RefChange Key], IO Bool) getGitLogAnnex :: [FilePath] -> [CommandParam] -> Annex ([RefChange Key], IO Bool)
getGitLogAnnex fs os = do getGitLogAnnex fs os = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
let fileselector = locationLogFileKey config . toRawFilePath let fileselector = \_sha f ->
inRepo $ getGitLog Annex.Branch.fullname fs os fileselector locationLogFileKey config (toRawFilePath f)
inRepo $ getGitLog Annex.Branch.fullname Nothing fs os fileselector
showTimeStamp :: TimeZone -> String -> POSIXTime -> String showTimeStamp :: TimeZone -> String -> POSIXTime -> String
showTimeStamp zone format = formatTime defaultTimeLocale format showTimeStamp zone format = formatTime defaultTimeLocale format
@ -321,13 +322,13 @@ sizeHistoryInfo mu o = do
-- and to the trust log. -- and to the trust log.
getlog = do getlog = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
let fileselector = \f -> let f' = toRawFilePath f in let fileselector = \_sha f -> let f' = toRawFilePath f in
case locationLogFileKey config f' of case locationLogFileKey config f' of
Just k -> Just (Right k) Just k -> Just (Right k)
Nothing Nothing
| f' == trustLog -> Just (Left ()) | f' == trustLog -> Just (Left ())
| otherwise -> Nothing | otherwise -> Nothing
inRepo $ getGitLog Annex.Branch.fullname [] inRepo $ getGitLog Annex.Branch.fullname Nothing []
[ Param "--date-order" [ Param "--date-order"
, Param "--reverse" , Param "--reverse"
] ]

View file

@ -28,21 +28,31 @@ cmd = withAnnexOptions [backendOption, annexedMatchingOptions, jsonOptions] $
data MigrateOptions = MigrateOptions data MigrateOptions = MigrateOptions
{ migrateThese :: CmdParams { migrateThese :: CmdParams
, updateOption :: Bool
, removeSize :: Bool , removeSize :: Bool
} }
optParser :: CmdParamsDesc -> Parser MigrateOptions optParser :: CmdParamsDesc -> Parser MigrateOptions
optParser desc = MigrateOptions optParser desc = MigrateOptions
<$> cmdParams desc <$> cmdParams desc
<*> switch
( long "update"
<> help "update for migrations performed elsewhere"
)
<*> switch <*> switch
( long "remove-size" ( long "remove-size"
<> help "remove size field from keys" <> help "remove size field from keys"
) )
seek :: MigrateOptions -> CommandSeek seek :: MigrateOptions -> CommandSeek
seek o = do seek o
withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o) | updateOption o = do
commitMigration unless (null (migrateThese o)) $
error "Cannot combine --update with files to migrate."
commandAction update
| otherwise = do
withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
commitMigration
where where
ww = WarnUnmatchLsFiles "migrate" ww = WarnUnmatchLsFiles "migrate"
seeker = AnnexedFileSeeker seeker = AnnexedFileSeeker
@ -131,3 +141,10 @@ perform onlyremovesize o file oldkey oldkeyrec oldbackend newbackend = go =<< ge
| removeSize o = alterKey k $ \kd -> kd { keySize = Nothing } | removeSize o = alterKey k $ \kd -> kd { keySize = Nothing }
| otherwise = k | otherwise = k
afile = AssociatedFile (Just file) afile = AssociatedFile (Just file)
update :: CommandStart
update = starting "migrate" (ActionItemOther Nothing) (SeekInput []) $ do
streamNewDistributedMigrations $ \oldkey newkey -> do
liftIO $ print ("migrate", oldkey, newkey)
next $ return True

View file

@ -10,6 +10,7 @@ module Git.Log where
import Common import Common
import Git import Git
import Git.Command import Git.Command
import Git.Sha
import Data.Time import Data.Time
import Data.Time.Clock.POSIX import Data.Time.Clock.POSIX
@ -28,14 +29,15 @@ data RefChange t = RefChange
-- run after processing the returned list. -- run after processing the returned list.
getGitLog getGitLog
:: Ref :: Ref
-> Maybe Ref
-> [FilePath] -> [FilePath]
-> [CommandParam] -> [CommandParam]
-> (FilePath -> Maybe t) -> (Sha -> FilePath -> Maybe t)
-> Repo -> Repo
-> IO ([RefChange t], IO Bool) -> IO ([RefChange t], IO Bool)
getGitLog ref fs os fileselector repo = do getGitLog ref stopref fs os selector repo = do
(ls, cleanup) <- pipeNullSplit ps repo (ls, cleanup) <- pipeNullSplit ps repo
return (parseGitRawLog fileselector (map decodeBL ls), cleanup) return (parseGitRawLog selector (map decodeBL ls), cleanup)
where where
ps = ps =
[ Param "log" [ Param "log"
@ -45,14 +47,16 @@ getGitLog ref fs os fileselector repo = do
, Param "--no-abbrev" , Param "--no-abbrev"
, Param "--no-renames" , Param "--no-renames"
] ++ os ++ ] ++ os ++
[ Param (fromRef ref) [ case stopref of
Just stopref' -> Param $
fromRef stopref' <> ".." <> fromRef ref
Nothing -> Param (fromRef ref)
, Param "--" , Param "--"
] ++ map Param fs ] ++ map Param fs
-- The commitinfo is the timestamp of the commit, followed by -- The commitinfo is the commit hash followed by its timestamp.
-- the commit hash and then the commit's parents, separated by spaces.
commitinfoFormat :: String commitinfoFormat :: String
commitinfoFormat = "%ct" commitinfoFormat = "%H %ct"
-- Parses chunked git log --raw output generated by getGitLog, -- Parses chunked git log --raw output generated by getGitLog,
-- which looks something like: -- which looks something like:
@ -69,21 +73,23 @@ commitinfoFormat = "%ct"
-- --
-- The commitinfo is not included before all changelines, so -- The commitinfo is not included before all changelines, so
-- keep track of the most recently seen commitinfo. -- keep track of the most recently seen commitinfo.
parseGitRawLog :: (FilePath -> Maybe t) -> [String] -> [RefChange t] parseGitRawLog :: (Ref -> FilePath -> Maybe t) -> [String] -> [RefChange t]
parseGitRawLog fileselector = parse epoch parseGitRawLog selector = parse (deleteSha, epoch)
where where
epoch = toEnum 0 :: POSIXTime epoch = toEnum 0 :: POSIXTime
parse oldts ([]:rest) = parse oldts rest parse old ([]:rest) = parse old rest
parse oldts (c1:c2:rest) = case mrc of parse (oldcommitsha, oldts) (c1:c2:rest) = case mrc of
Just rc -> rc : parse ts rest Just rc -> rc : parse (commitsha, ts) rest
Nothing -> parse ts (c2:rest) Nothing -> parse (commitsha, ts) (c2:rest)
where where
(ts, cl) = case separate (== '\n') c1 of (commitsha, ts, cl) = case separate (== '\n') c1 of
(cl', []) -> (oldts, cl') (cl', []) -> (oldcommitsha, oldts, cl')
(tss, cl') -> (parseTimeStamp tss, cl') (ci, cl') -> case words ci of
(css:tss:[]) -> (Ref (encodeBS css), parseTimeStamp tss, cl')
_ -> (oldcommitsha, oldts, cl')
mrc = do mrc = do
(old, new) <- parseRawChangeLine cl (old, new) <- parseRawChangeLine cl
v <- fileselector c2 v <- selector commitsha c2
return $ RefChange return $ RefChange
{ changetime = ts { changetime = ts
, changed = v , changed = v

View file

@ -78,7 +78,7 @@ recordExportBeginning remoteuuid newtree = do
-- repository, the tree has to be kept available, even if it -- repository, the tree has to be kept available, even if it
-- doesn't end up being merged into the master branch. -- doesn't end up being merged into the master branch.
recordExportTreeish :: Git.Ref -> Annex () recordExportTreeish :: Git.Ref -> Annex ()
recordExportTreeish t = recordExportTreeish t = void $
Annex.Branch.rememberTreeish t (asTopFilePath exportTreeGraftPoint) Annex.Branch.rememberTreeish t (asTopFilePath exportTreeGraftPoint)
-- | Record that an export to a special remote is under way. -- | Record that an export to a special remote is under way.

View file

@ -16,7 +16,19 @@
- The filenames used in the trees are not the original filenames, to avoid - The filenames used in the trees are not the original filenames, to avoid
- running migrate in a throwaway branch unexpectedly recording that - running migrate in a throwaway branch unexpectedly recording that
- branch's contents. - branch's contents.
- -
- There are two local log files:
- * migrate.log contains pairs of old and new keys, and is used while
- performing a new migration, to build up a migration to commit.
- This allows an interrupted migration to be resumed later.
- * migrations.log has as its first line a commit to the git-annex branch
- up to which all migrations have been performed locally (including any
- migrations in parent commits). Or the first line may be a null sha when
- this has not been done yet. The rest of the lines in the file
- are commits that have been made for locally performed migrations,
- but whose parent commits have not necessarily been checked for
- migrations yet.
-
- Copyright 2023 Joey Hess <id@joeyh.name> - Copyright 2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
@ -28,6 +40,7 @@ module Logs.Migrate (
MigrationRecord(..), MigrationRecord(..),
logMigration, logMigration,
commitMigration, commitMigration,
streamNewDistributedMigrations,
) where ) where
import Annex.Common import Annex.Common
@ -39,6 +52,9 @@ import Git.Tree
import Git.FilePath import Git.FilePath
import Logs.File import Logs.File
import Logs import Logs
import Git.Log
import Git.Sha
import Annex.CatFile
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import Control.Concurrent.STM import Control.Concurrent.STM
@ -48,9 +64,6 @@ import Control.Concurrent.STM
newtype MigrationRecord = MigrationRecord { fromMigrationRecord :: Git.Sha } newtype MigrationRecord = MigrationRecord { fromMigrationRecord :: Git.Sha }
-- | Logs a migration from an old to a new key. -- | Logs a migration from an old to a new key.
--
-- This writes to a log file, which can later be committed. That allows an
-- interrupted migration to be resumed later.
logMigration :: MigrationRecord -> MigrationRecord -> Annex () logMigration :: MigrationRecord -> MigrationRecord -> Annex ()
logMigration old new = do logMigration old new = do
logf <- fromRepo gitAnnexMigrateLog logf <- fromRepo gitAnnexMigrateLog
@ -97,5 +110,77 @@ commitMigration = do
[ RecordedSubTree (asTopFilePath "old") oldt [] [ RecordedSubTree (asTopFilePath "old") oldt []
, RecordedSubTree (asTopFilePath "new") newt [] , RecordedSubTree (asTopFilePath "new") newt []
] ]
Annex.Branch.rememberTreeish treesha commitsha <- Annex.Branch.rememberTreeish treesha
(asTopFilePath migrationTreeGraftPoint) (asTopFilePath migrationTreeGraftPoint)
committedMigration commitsha
-- Streams distributed migrations from the git-annex branch that have not
-- been performed here, and runs the provided action on each old and new
-- key pair.
streamNewDistributedMigrations :: (Key -> Key -> Annex ()) -> Annex ()
streamNewDistributedMigrations a = do
void Annex.Branch.update
branchsha <- Annex.Branch.getBranch
(stoppoint, toskip) <- getPerformedMigrations
(l, cleanup) <- inRepo $ getGitLog branchsha stoppoint
[fromRawFilePath migrationTreeGraftPoint]
-- Need to follow because migrate.tree is grafted in
-- and then deleted, and normally git log stops when a file
-- gets deleted.
([Param "--reverse", Param "--follow"])
(\sha _file -> Just sha)
forM_ l $ \c ->
unless (changed c `elem` toskip) $ do
moldkey <- catKey XXX
mnewkey <- catKey YYY
case (moldkey, mnewkey) of
(Just oldkey, Just newkey) -> a oldkey newkey
_ -> return ()
liftIO $ void cleanup
recordPerformedMigrations branchsha toskip
getPerformedMigrations :: Annex (Maybe Sha, [Sha])
getPerformedMigrations = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
ls <- calcLogFile logf lckf [] (:)
return $ case reverse ls of
[] -> (Nothing, [])
(stoppoint:toskip) ->
let stoppoint' = conv stoppoint
in
( if stoppoint' `elem` nullShas
then Nothing
else Just stoppoint'
, map conv toskip
)
where
conv = Git.Ref . L.toStrict
-- Record locally that migrations have been performed up to the given
-- commit. The list is additional commits that can be removed from the
-- log file if present.
recordPerformedMigrations :: Sha -> [Sha] -> Annex ()
recordPerformedMigrations sha toremove = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
modifyLogFile logf lckf (update . drop 1)
where
update l = L.fromStrict (fromRef' sha) : filter (`notElem` toremove') l
toremove' = map (L.fromStrict . fromRef') toremove
-- Record that a migration was performed locally and committed.
-- Since committing a migration may result in parent migrations that have
-- not yet been processed locally, that commit cannot be the first line of
-- the log file, which is reserved for commits whose parents have also had
-- their migrations handled. So if the log file does not exist or is empty,
-- make the first line a null sha.
committedMigration :: Sha -> Annex ()
committedMigration commitsha = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
modifyLogFile logf lckf update
where
update [] = [conv deleteSha, conv commitsha]
update logged = logged ++ [conv commitsha]
conv = L.fromStrict . fromRef'

View file

@ -6,6 +6,8 @@ git-annex migrate - switch data to different backend
git annex migrate `[path ...]` git annex migrate `[path ...]`
git annex migrate --update
# DESCRIPTION # DESCRIPTION
Changes the specified annexed files to use the default key-value backend Changes the specified annexed files to use the default key-value backend
@ -19,11 +21,17 @@ Normally, nothing will be done to files already using the new backend.
However, if a backend changes the information it uses to construct a key, However, if a backend changes the information it uses to construct a key,
this can also be used to migrate files to use the new key format. this can also be used to migrate files to use the new key format.
When you have multiple repositories that each contain a copy of a file,
it's best to run migrate in all of them.
# OPTIONS # OPTIONS
* `--update`
This updates the local repository for migrations that were performed
elsewhere.
Note that older versions of git-annex did not record migrations in a
way that this can use. Migrations performed with those older versions
had to be manually run in each clone of the repository.
* `--backend` * `--backend`
Specify the new key-value backend to use for migrated data. Specify the new key-value backend to use for migrated data.