started migrate --update

This is most of the way there, but not quite working.

The layout of migrate.tree/ needs to be changed to follow this approach.
git log will list all the files in tree order, so the new layout needs
to alternate old and new keys. Can that be done? git may not document
tree order, or may not preserve it here.

Alternatively, change to using git log --format=raw and extract
the tree header from that, then use
git diff --raw $tree:migrate.tree/old $tree:migrate.tree/new
That will be a little more expensive, but only when there are lots of
migrations.

Sponsored-by: Joshua Antonishen on Patreon
This commit is contained in:
Joey Hess 2023-12-07 15:50:52 -04:00
parent d06aee7ce0
commit f1ce15036f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 172 additions and 38 deletions

View file

@ -14,6 +14,7 @@ module Annex.Branch (
hasSibling,
siblingBranches,
create,
getBranch,
UpdateMade(..),
update,
forceUpdate,
@ -120,7 +121,7 @@ siblingBranches = inRepo $ Git.Ref.matchingUniq [name]
create :: Annex ()
create = void getBranch
{- Returns the ref of the branch, creating it first if necessary. -}
{- Returns the sha of the branch, creating it first if necessary. -}
getBranch :: Annex Git.Ref
getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
where
@ -920,10 +921,14 @@ getMergedRefs' = do
{- Grafts a treeish into the branch at the specified location,
- and then removes it. This ensures that the treeish won't get garbage
- collected, and will always be available as long as the git-annex branch
- is available. -}
rememberTreeish :: Git.Ref -> TopFilePath -> Annex ()
rememberTreeish treeish graftpoint = lockJournal $ rememberTreeishLocked treeish graftpoint
rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex ()
- is available.
-
- Returns the sha of the git commit made to the git-annex branch.
-}
rememberTreeish :: Git.Ref -> TopFilePath -> Annex Git.Sha
rememberTreeish treeish graftpoint = lockJournal $
rememberTreeishLocked treeish graftpoint
rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex Git.Sha
rememberTreeishLocked treeish graftpoint jl = do
branchref <- getBranch
updateIndex jl branchref
@ -940,6 +945,7 @@ rememberTreeishLocked treeish graftpoint jl = do
-- and the index was updated to that above, so it's safe to
-- say that the index contains c'.
setIndexSha c'
return c'
{- Runs an action on the content of selected files from the branch.
- This is much faster than reading the content of each file in turn,

View file

@ -56,6 +56,8 @@ module Annex.Locations (
gitAnnexAdjustedBranchUpdateLock,
gitAnnexMigrateLog,
gitAnnexMigrateLock,
gitAnnexMigrationsLog,
gitAnnexMigrationsLock,
gitAnnexMoveLog,
gitAnnexMoveLock,
gitAnnexExportDir,
@ -416,6 +418,13 @@ gitAnnexMigrateLog r = gitAnnexDir r P.</> "migrate.log"
gitAnnexMigrateLock :: Git.Repo -> RawFilePath
gitAnnexMigrateLock r = gitAnnexDir r P.</> "migrate.lck"
{- .git/annex/migrations.log is used to log committed migrations. -}
gitAnnexMigrationsLog :: Git.Repo -> RawFilePath
gitAnnexMigrationsLog r = gitAnnexDir r P.</> "migrations.log"
gitAnnexMigrationsLock :: Git.Repo -> RawFilePath
gitAnnexMigrationsLock r = gitAnnexDir r P.</> "migrations.lck"
{- .git/annex/move.log is used to log moves that are in progress,
- to better support resuming an interrupted move. -}
gitAnnexMoveLog :: Git.Repo -> RawFilePath

View file

@ -1,5 +1,7 @@
git-annex (10.20231130) UNRELEASED; urgency=medium
* migrate: Support distributed migrations by recording each migration,
and adding a --update option that updates the local repository.
* Make git-annex get/copy/move --from foo override configuration of
remote.foo.annex-ignore, as documented.
* Support git-annex copy/move --from-anywhere --to remote.

View file

@ -290,8 +290,9 @@ getKeyLog key os = do
getGitLogAnnex :: [FilePath] -> [CommandParam] -> Annex ([RefChange Key], IO Bool)
getGitLogAnnex fs os = do
config <- Annex.getGitConfig
let fileselector = locationLogFileKey config . toRawFilePath
inRepo $ getGitLog Annex.Branch.fullname fs os fileselector
let fileselector = \_sha f ->
locationLogFileKey config (toRawFilePath f)
inRepo $ getGitLog Annex.Branch.fullname Nothing fs os fileselector
showTimeStamp :: TimeZone -> String -> POSIXTime -> String
showTimeStamp zone format = formatTime defaultTimeLocale format
@ -321,13 +322,13 @@ sizeHistoryInfo mu o = do
-- and to the trust log.
getlog = do
config <- Annex.getGitConfig
let fileselector = \f -> let f' = toRawFilePath f in
let fileselector = \_sha f -> let f' = toRawFilePath f in
case locationLogFileKey config f' of
Just k -> Just (Right k)
Nothing
| f' == trustLog -> Just (Left ())
| otherwise -> Nothing
inRepo $ getGitLog Annex.Branch.fullname []
inRepo $ getGitLog Annex.Branch.fullname Nothing []
[ Param "--date-order"
, Param "--reverse"
]

View file

@ -28,21 +28,31 @@ cmd = withAnnexOptions [backendOption, annexedMatchingOptions, jsonOptions] $
data MigrateOptions = MigrateOptions
{ migrateThese :: CmdParams
, updateOption :: Bool
, removeSize :: Bool
}
optParser :: CmdParamsDesc -> Parser MigrateOptions
optParser desc = MigrateOptions
<$> cmdParams desc
<*> switch
( long "update"
<> help "update for migrations performed elsewhere"
)
<*> switch
( long "remove-size"
<> help "remove size field from keys"
)
seek :: MigrateOptions -> CommandSeek
seek o = do
withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
commitMigration
seek o
| updateOption o = do
unless (null (migrateThese o)) $
error "Cannot combine --update with files to migrate."
commandAction update
| otherwise = do
withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
commitMigration
where
ww = WarnUnmatchLsFiles "migrate"
seeker = AnnexedFileSeeker
@ -131,3 +141,10 @@ perform onlyremovesize o file oldkey oldkeyrec oldbackend newbackend = go =<< ge
| removeSize o = alterKey k $ \kd -> kd { keySize = Nothing }
| otherwise = k
afile = AssociatedFile (Just file)
update :: CommandStart
update = starting "migrate" (ActionItemOther Nothing) (SeekInput []) $ do
streamNewDistributedMigrations $ \oldkey newkey -> do
liftIO $ print ("migrate", oldkey, newkey)
next $ return True

View file

@ -10,6 +10,7 @@ module Git.Log where
import Common
import Git
import Git.Command
import Git.Sha
import Data.Time
import Data.Time.Clock.POSIX
@ -28,14 +29,15 @@ data RefChange t = RefChange
-- run after processing the returned list.
getGitLog
:: Ref
-> Maybe Ref
-> [FilePath]
-> [CommandParam]
-> (FilePath -> Maybe t)
-> (Sha -> FilePath -> Maybe t)
-> Repo
-> IO ([RefChange t], IO Bool)
getGitLog ref fs os fileselector repo = do
getGitLog ref stopref fs os selector repo = do
(ls, cleanup) <- pipeNullSplit ps repo
return (parseGitRawLog fileselector (map decodeBL ls), cleanup)
return (parseGitRawLog selector (map decodeBL ls), cleanup)
where
ps =
[ Param "log"
@ -45,14 +47,16 @@ getGitLog ref fs os fileselector repo = do
, Param "--no-abbrev"
, Param "--no-renames"
] ++ os ++
[ Param (fromRef ref)
[ case stopref of
Just stopref' -> Param $
fromRef stopref' <> ".." <> fromRef ref
Nothing -> Param (fromRef ref)
, Param "--"
] ++ map Param fs
-- The commitinfo is the timestamp of the commit, followed by
-- the commit hash and then the commit's parents, separated by spaces.
-- The commitinfo is the commit hash followed by its timestamp.
commitinfoFormat :: String
commitinfoFormat = "%ct"
commitinfoFormat = "%H %ct"
-- Parses chunked git log --raw output generated by getGitLog,
-- which looks something like:
@ -69,21 +73,23 @@ commitinfoFormat = "%ct"
--
-- The commitinfo is not included before all changelines, so
-- keep track of the most recently seen commitinfo.
parseGitRawLog :: (FilePath -> Maybe t) -> [String] -> [RefChange t]
parseGitRawLog fileselector = parse epoch
parseGitRawLog :: (Ref -> FilePath -> Maybe t) -> [String] -> [RefChange t]
parseGitRawLog selector = parse (deleteSha, epoch)
where
epoch = toEnum 0 :: POSIXTime
parse oldts ([]:rest) = parse oldts rest
parse oldts (c1:c2:rest) = case mrc of
Just rc -> rc : parse ts rest
Nothing -> parse ts (c2:rest)
parse old ([]:rest) = parse old rest
parse (oldcommitsha, oldts) (c1:c2:rest) = case mrc of
Just rc -> rc : parse (commitsha, ts) rest
Nothing -> parse (commitsha, ts) (c2:rest)
where
(ts, cl) = case separate (== '\n') c1 of
(cl', []) -> (oldts, cl')
(tss, cl') -> (parseTimeStamp tss, cl')
(commitsha, ts, cl) = case separate (== '\n') c1 of
(cl', []) -> (oldcommitsha, oldts, cl')
(ci, cl') -> case words ci of
(css:tss:[]) -> (Ref (encodeBS css), parseTimeStamp tss, cl')
_ -> (oldcommitsha, oldts, cl')
mrc = do
(old, new) <- parseRawChangeLine cl
v <- fileselector c2
v <- selector commitsha c2
return $ RefChange
{ changetime = ts
, changed = v

View file

@ -78,7 +78,7 @@ recordExportBeginning remoteuuid newtree = do
-- repository, the tree has to be kept available, even if it
-- doesn't end up being merged into the master branch.
recordExportTreeish :: Git.Ref -> Annex ()
recordExportTreeish t =
recordExportTreeish t = void $
Annex.Branch.rememberTreeish t (asTopFilePath exportTreeGraftPoint)
-- | Record that an export to a special remote is under way.

View file

@ -16,7 +16,19 @@
- The filenames used in the trees are not the original filenames, to avoid
- running migrate in a throwaway branch unexpectedly recording that
- branch's contents.
-
-
- There are two local log files:
- * migrate.log contains pairs of old and new keys, and is used while
- performing a new migration, to build up a migration to commit.
- This allows an interrupted migration to be resumed later.
- * migrations.log has as its first line a commit to the git-annex branch
- up to which all migrations have been performed locally (including any
- migrations in parent commits). Or the first line may be a null sha when
- this has not been done yet. The rest of the lines in the file
- are commits that have been made for locally performed migrations,
- but whose parent commits have not necessarily been checked for
- migrations yet.
-
- Copyright 2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
@ -28,6 +40,7 @@ module Logs.Migrate (
MigrationRecord(..),
logMigration,
commitMigration,
streamNewDistributedMigrations,
) where
import Annex.Common
@ -39,6 +52,9 @@ import Git.Tree
import Git.FilePath
import Logs.File
import Logs
import Git.Log
import Git.Sha
import Annex.CatFile
import qualified Data.ByteString.Lazy as L
import Control.Concurrent.STM
@ -48,9 +64,6 @@ import Control.Concurrent.STM
newtype MigrationRecord = MigrationRecord { fromMigrationRecord :: Git.Sha }
-- | Logs a migration from an old to a new key.
--
-- This writes to a log file, which can later be committed. That allows an
-- interrupted migration to be resumed later.
logMigration :: MigrationRecord -> MigrationRecord -> Annex ()
logMigration old new = do
logf <- fromRepo gitAnnexMigrateLog
@ -97,5 +110,77 @@ commitMigration = do
[ RecordedSubTree (asTopFilePath "old") oldt []
, RecordedSubTree (asTopFilePath "new") newt []
]
Annex.Branch.rememberTreeish treesha
commitsha <- Annex.Branch.rememberTreeish treesha
(asTopFilePath migrationTreeGraftPoint)
committedMigration commitsha
-- Streams distributed migrations from the git-annex branch that have not
-- been performed here, and runs the provided action on each old and new
-- key pair.
streamNewDistributedMigrations :: (Key -> Key -> Annex ()) -> Annex ()
streamNewDistributedMigrations a = do
void Annex.Branch.update
branchsha <- Annex.Branch.getBranch
(stoppoint, toskip) <- getPerformedMigrations
(l, cleanup) <- inRepo $ getGitLog branchsha stoppoint
[fromRawFilePath migrationTreeGraftPoint]
-- Need to follow because migrate.tree is grafted in
-- and then deleted, and normally git log stops when a file
-- gets deleted.
([Param "--reverse", Param "--follow"])
(\sha _file -> Just sha)
forM_ l $ \c ->
unless (changed c `elem` toskip) $ do
moldkey <- catKey XXX
mnewkey <- catKey YYY
case (moldkey, mnewkey) of
(Just oldkey, Just newkey) -> a oldkey newkey
_ -> return ()
liftIO $ void cleanup
recordPerformedMigrations branchsha toskip
getPerformedMigrations :: Annex (Maybe Sha, [Sha])
getPerformedMigrations = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
ls <- calcLogFile logf lckf [] (:)
return $ case reverse ls of
[] -> (Nothing, [])
(stoppoint:toskip) ->
let stoppoint' = conv stoppoint
in
( if stoppoint' `elem` nullShas
then Nothing
else Just stoppoint'
, map conv toskip
)
where
conv = Git.Ref . L.toStrict
-- Record locally that migrations have been performed up to the given
-- commit. The list is additional commits that can be removed from the
-- log file if present.
recordPerformedMigrations :: Sha -> [Sha] -> Annex ()
recordPerformedMigrations sha toremove = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
modifyLogFile logf lckf (update . drop 1)
where
update l = L.fromStrict (fromRef' sha) : filter (`notElem` toremove') l
toremove' = map (L.fromStrict . fromRef') toremove
-- Record that a migration was performed locally and committed.
-- Since committing a migration may result in parent migrations that have
-- not yet been processed locally, that commit cannot be the first line of
-- the log file, which is reserved for commits whose parents have also had
-- their migrations handled. So if the log file does not exist or is empty,
-- make the first line a null sha.
committedMigration :: Sha -> Annex ()
committedMigration commitsha = do
logf <- fromRepo gitAnnexMigrationsLog
lckf <- fromRepo gitAnnexMigrationsLock
modifyLogFile logf lckf update
where
update [] = [conv deleteSha, conv commitsha]
update logged = logged ++ [conv commitsha]
conv = L.fromStrict . fromRef'

View file

@ -6,6 +6,8 @@ git-annex migrate - switch data to different backend
git annex migrate `[path ...]`
git annex migrate --update
# DESCRIPTION
Changes the specified annexed files to use the default key-value backend
@ -19,11 +21,17 @@ Normally, nothing will be done to files already using the new backend.
However, if a backend changes the information it uses to construct a key,
this can also be used to migrate files to use the new key format.
When you have multiple repositories that each contain a copy of a file,
it's best to run migrate in all of them.
# OPTIONS
* `--update`
This updates the local repository for migrations that were performed
elsewhere.
Note that older versions of git-annex did not record migrations in a
way that this can use. Migrations performed with those older versions
had to be manually run in each clone of the repository.
* `--backend`
Specify the new key-value backend to use for migrated data.