invalidate recorded content identifier tree when export changes

Fix bug that made changes to a special remote sometimes be missed when
importing a tree from it. The diff import would miss when a change was
exported, then manually undone on the special remote (eg deleting a newly
exported file). A full import is needed to catch such changes.

After upgrading, any such missed changes will be included in the next
tree imported from a special remote. This happens because the previously
recorded content identifier tree does not have export information included,
so it is treated as invalid, and a full import is done.

Fixes reversion introduced in version 10.20230626, commit
40017089f2

Unfortunately, this does mean that after each export, the next import will
be a full import. Which can take significantly longer than the diff import
does, when there are a lot of files in the tree.

It would be better if exporting also update the content identifier tree.
However, I don't know if that can be done inexpensively. It would be future
optimisation work, in any case.

(That could only be done for an export that is run in the same
repository as the import. When an export is run in a different repository,
the export.log gets updated, and that propagates to the repository where
import is later run. At that point, a full import is done.)

Sponsored-by: Luke T. Shumaker
This commit is contained in:
Joey Hess 2025-09-23 12:52:55 -04:00
commit ff65cd6954
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 111 additions and 41 deletions

View file

@ -21,6 +21,7 @@ module Annex.Import (
importKeys, importKeys,
makeImportMatcher, makeImportMatcher,
getImportableContents, getImportableContents,
PostExportLogUpdate,
) where ) where
import Annex.Common import Annex.Common
@ -74,6 +75,8 @@ import qualified Data.ByteArray.Encoding as BA
#ifdef mingw32_HOST_OS #ifdef mingw32_HOST_OS
import qualified System.FilePath.Posix as Posix import qualified System.FilePath.Posix as Posix
#endif #endif
import qualified Data.Semigroup as Sem
import Prelude
{- Configures how to build an import tree. -} {- Configures how to build an import tree. -}
data ImportTreeConfig data ImportTreeConfig
@ -112,8 +115,9 @@ buildImportCommit
-> ImportCommitConfig -> ImportCommitConfig
-> AddUnlockedMatcher -> AddUnlockedMatcher
-> Imported -> Imported
-> PostExportLogUpdate
-> Annex (Maybe Ref) -> Annex (Maybe Ref)
buildImportCommit remote importtreeconfig importcommitconfig addunlockedmatcher imported = buildImportCommit remote importtreeconfig importcommitconfig addunlockedmatcher imported postexportlogupdate =
case importCommitTracking importcommitconfig of case importCommitTracking importcommitconfig of
Nothing -> go Nothing Nothing -> go Nothing
Just trackingcommit -> inRepo (Git.Ref.tree trackingcommit) >>= \case Just trackingcommit -> inRepo (Git.Ref.tree trackingcommit) >>= \case
@ -121,12 +125,14 @@ buildImportCommit remote importtreeconfig importcommitconfig addunlockedmatcher
Just _ -> go (Just trackingcommit) Just _ -> go (Just trackingcommit)
where where
go trackingcommit = do go trackingcommit = do
(importedtree, updatestate) <- recordImportTree remote importtreeconfig (Just addunlockedmatcher) imported (importedtree, updatestate) <- recordImportTree remote importtreeconfig (Just addunlockedmatcher) imported postexportlogupdate
buildImportCommit' remote importcommitconfig trackingcommit importedtree >>= \case buildImportCommit' remote importcommitconfig trackingcommit importedtree >>= \case
Just finalcommit -> do Just finalcommit -> do
updatestate updatestate
return (Just finalcommit) return (Just finalcommit)
Nothing -> return Nothing Nothing -> do
postExportLogUpdate postexportlogupdate
return Nothing
{- Builds a tree for an import from a special remote. {- Builds a tree for an import from a special remote.
- -
@ -138,8 +144,9 @@ recordImportTree
-> ImportTreeConfig -> ImportTreeConfig
-> Maybe AddUnlockedMatcher -> Maybe AddUnlockedMatcher
-> Imported -> Imported
-> PostExportLogUpdate
-> Annex (History Sha, Annex ()) -> Annex (History Sha, Annex ())
recordImportTree remote importtreeconfig addunlockedmatcher imported = do recordImportTree remote importtreeconfig addunlockedmatcher imported postexportlogupdate = do
importedtree@(History finaltree _) <- buildImportTrees basetree subdir addunlockedmatcher imported importedtree@(History finaltree _) <- buildImportTrees basetree subdir addunlockedmatcher imported
return (importedtree, updatestate finaltree) return (importedtree, updatestate finaltree)
where where
@ -180,6 +187,7 @@ recordImportTree remote importtreeconfig addunlockedmatcher imported = do
{ oldTreeish = exportedTreeishes oldexport { oldTreeish = exportedTreeishes oldexport
, newTreeish = importedtree , newTreeish = importedtree
} }
postExportLogUpdate postexportlogupdate
return oldexport return oldexport
-- importKeys takes care of updating the location log -- importKeys takes care of updating the location log
@ -498,11 +506,26 @@ canImportKeys remote importcontent =
where where
ia = Remote.importActions remote ia = Remote.importActions remote
-- Result of an import. ImportUnfinished indicates that some file failed to -- Result of an import.
-- be imported. Running again should resume where it left off.
data ImportResult t data ImportResult t
= ImportFinished t = ImportFinished PostExportLogUpdate t
| ImportUnfinished | ImportUnfinished
-- ^ ImportUnfinished indicates that some file failed to
-- be imported. Running again should resume where it left off.
-- An action to run after the export log has been updated to reflect an
-- import.
newtype PostExportLogUpdate = PostExportLogUpdate (Annex ())
instance Sem.Semigroup PostExportLogUpdate where
PostExportLogUpdate a <> PostExportLogUpdate b =
PostExportLogUpdate (a >> b)
noPostExportLogUpdate :: PostExportLogUpdate
noPostExportLogUpdate = PostExportLogUpdate (return ())
postExportLogUpdate :: PostExportLogUpdate -> Annex ()
postExportLogUpdate (PostExportLogUpdate a) = a
data Diffed t data Diffed t
= DiffChanged t = DiffChanged t
@ -546,7 +569,10 @@ importChanges remote importtreeconfig importcontent thirdpartypopulated importab
Nothing -> fullimport currcidtree Nothing -> fullimport currcidtree
Just lastimportedtree -> diffimport cidtreemap prevcidtree currcidtree lastimportedtree Just lastimportedtree -> diffimport cidtreemap prevcidtree currcidtree lastimportedtree
where where
remember = recordContentIdentifierTree (Remote.uuid remote) -- Record the content identifier tree after the export log is
-- updated for the import.
remember = PostExportLogUpdate .
recordContentIdentifierTree (Remote.uuid remote)
-- In order to use a diff, the previous ContentIdentifier tree must -- In order to use a diff, the previous ContentIdentifier tree must
-- not have been garbage collected. Which can happen since there -- not have been garbage collected. Which can happen since there
@ -567,11 +593,11 @@ importChanges remote importtreeconfig importcontent thirdpartypopulated importab
) )
fullimport currcidtree = fullimport currcidtree =
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= return . \case
ImportUnfinished -> return ImportUnfinished ImportUnfinished -> ImportUnfinished
ImportFinished r -> do ImportFinished a r ->
remember currcidtree ImportFinished (a <> remember currcidtree) $
return $ ImportFinished $ ImportedFull r ImportedFull r
diffimport cidtreemap prevcidtree currcidtree lastimportedtree = do diffimport cidtreemap prevcidtree currcidtree lastimportedtree = do
(diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive (diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive
@ -589,17 +615,15 @@ importChanges remote importtreeconfig importcontent thirdpartypopulated importab
ImportUnfinished -> do ImportUnfinished -> do
void $ liftIO cleanup void $ liftIO cleanup
return ImportUnfinished return ImportUnfinished
ImportFinished (ImportableContentsComplete ic') -> ImportFinished a (ImportableContentsComplete ic') ->
liftIO cleanup >>= \case liftIO cleanup >>= return . \case
False -> return ImportUnfinished False -> ImportUnfinished
True -> do True -> ImportFinished (a <> remember currcidtree) $
remember currcidtree ImportedDiff lastimportedtree
return $ ImportFinished $ (mkdiff ic' removed)
ImportedDiff lastimportedtree
(mkdiff ic' removed)
-- importKeys is not passed ImportableContentsChunked -- importKeys is not passed ImportableContentsChunked
-- above, so it cannot return it -- above, so it cannot return it
ImportFinished (ImportableContentsChunked {}) -> error "internal" ImportFinished _ (ImportableContentsChunked {}) -> error "internal"
isremoval ti = Git.DiffTree.dstsha ti `elem` nullShas isremoval ti = Git.DiffTree.dstsha ti `elem` nullShas
@ -685,12 +709,12 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
ImportableContentsComplete ic -> ImportableContentsComplete ic ->
go False largematcher cidmap importing db ic >>= return . \case go False largematcher cidmap importing db ic >>= return . \case
Nothing -> ImportUnfinished Nothing -> ImportUnfinished
Just v -> ImportFinished $ ImportableContentsComplete v Just v -> ImportFinished noPostExportLogUpdate $ ImportableContentsComplete v
ImportableContentsChunked {} -> do ImportableContentsChunked {} -> do
c <- gochunked db (importableContentsChunk importablecontents) c <- gochunked db (importableContentsChunk importablecontents)
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
Nothing -> ImportUnfinished Nothing -> ImportUnfinished
Just h -> ImportFinished $ ImportableContentsChunked Just h -> ImportFinished noPostExportLogUpdate $ ImportableContentsChunked
{ importableContentsChunk = c { importableContentsChunk = c
, importableHistoryComplete = h , importableHistoryComplete = h
} }

View file

@ -23,6 +23,10 @@ git-annex (10.20250829) UNRELEASED; urgency=medium
existing remote. existing remote.
* Fix hang that could occur when using git-annex adjust on a branch with * Fix hang that could occur when using git-annex adjust on a branch with
a number of files greater than annex.queuesize. a number of files greater than annex.queuesize.
* Fix bug that made changes to a special remote sometimes be missed when
importing a tree from it. After upgrading, any such missed changes
will be included in the next tree imported from a special remote.
Fixes reversion introduced in version 10.20230626.
-- Joey Hess <id@joeyh.name> Fri, 29 Aug 2025 12:34:06 -0400 -- Joey Hess <id@joeyh.name> Fri, 29 Aug 2025 12:34:06 -0400

View file

@ -349,9 +349,9 @@ seekRemote remote branch msubdir importcontent ci addunlockedmatcher importmessa
, Remote.name remote , Remote.name remote
, ". Re-run command to resume import." , ". Re-run command to resume import."
] ]
ImportFinished imported -> void $ ImportFinished postexportlogupdate imported ->
includeCommandAction $ void $ includeCommandAction $
commitimport imported commitimport imported postexportlogupdate
where where
importmessages' importmessages'
| null importmessages = ["import from " ++ Remote.name remote] | null importmessages = ["import from " ++ Remote.name remote]
@ -383,10 +383,10 @@ listContents' remote importtreeconfig ci a =
, err , err
] ]
commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> AddUnlockedMatcher -> Imported -> CommandStart commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> AddUnlockedMatcher -> Imported -> PostExportLogUpdate -> CommandStart
commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig addunlockedmatcher imported = commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig addunlockedmatcher imported postexportlogupdate =
starting "update" ai si $ do starting "update" ai si $ do
importcommit <- buildImportCommit remote importtreeconfig importcommitconfig addunlockedmatcher imported importcommit <- buildImportCommit remote importtreeconfig importcommitconfig addunlockedmatcher imported postexportlogupdate
next $ updateremotetrackingbranch importcommit next $ updateremotetrackingbranch importcommit
where where
ai = ActionItemOther (Just $ UnquotedString $ fromRef $ fromRemoteTrackingBranch tb) ai = ActionItemOther (Just $ UnquotedString $ fromRef $ fromRemoteTrackingBranch tb)

View file

@ -606,8 +606,8 @@ pullThirdPartyPopulated o remote
Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go
where where
go (Just importable) = importChanges remote ImportTree False True importable >>= \case go (Just importable) = importChanges remote ImportTree False True importable >>= \case
ImportFinished imported -> do ImportFinished postexportlogupdate imported -> do
(_t, updatestate) <- recordImportTree remote ImportTree Nothing imported (_t, updatestate) <- recordImportTree remote ImportTree Nothing imported postexportlogupdate
next $ do next $ do
updatestate updatestate
return True return True

View file

@ -1,6 +1,6 @@
{- git-annex import logs {- git-annex import logs
- -
- Copyright 2023 Joey Hess <id@joeyh.name> - Copyright 2023-2025 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -14,24 +14,60 @@ import Annex.Common
import Git.Types import Git.Types
import Git.Sha import Git.Sha
import Logs.File import Logs.File
import Logs.Export
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as S8
import qualified Data.Set as S
{- Records the sha of a tree that contains hashes of ContentIdentifiers {- Records the sha of a tree that contains hashes of ContentIdentifiers
- that were imported from a remote. -} - that were imported from a remote.
-
- The sha is on the first line of the log file, and following it
- is a line with the the currently exported treeishs, and then a line with
- the incomplete exported treeishes.
-}
recordContentIdentifierTree :: UUID -> Sha -> Annex () recordContentIdentifierTree :: UUID -> Sha -> Annex ()
recordContentIdentifierTree u t = do recordContentIdentifierTree u t = do
l <- calcRepo' (gitAnnexImportLog u) l <- calcRepo' (gitAnnexImportLog u)
writeLogFile l (fromRef t) exported <- getExport u
writeLogFile l $ unlines
[ fromRef t
, unwords $ map fromRef $ exportedTreeishes exported
, unwords $ map fromRef $ incompleteExportedTreeishes exported
]
{- Gets the tree last recorded for a remote. -} {- Gets the ContentIdentifier tree last recorded for a remote.
-
- This returns Nothing if no tree was recorded yet.
-
- It also returns Nothing when there have been changes to what is exported
- to the remote since the tree was recorded. That avoids a problem where
- diffing from the current Contentidentifier tree to the previous tree
- would miss changes that were made to a remote by an export, but were
- later undone manually. For example, if a file was exported to the remote,
- and then the file was manually removed from the remote, the current tree
- would not contain the file, and neither would the previous tree.
- So diffing between the trees would miss that removal. The removed
- file would then remain in the imported tree.
-}
getContentIdentifierTree :: UUID -> Annex (Maybe Sha) getContentIdentifierTree :: UUID -> Annex (Maybe Sha)
getContentIdentifierTree u = do getContentIdentifierTree u = do
l <- calcRepo' (gitAnnexImportLog u) l <- calcRepo' (gitAnnexImportLog u)
-- This is safe because the log file is written atomically. -- This is safe because the log file is written atomically.
calcLogFileUnsafe l Nothing update ls <- calcLogFileUnsafe l [] (\v ls -> L.toStrict v : ls)
exported <- getExport u
return $ case reverse ls of
-- Subsequent lines are ignored. This leaves room for future
-- expansion of what is logged.
(a:b:c:_) -> do
t <- extractSha a
exportedtreeishs <- mapM extractSha (S8.words b)
incompleteexportedtreeishs <- mapM extractSha (S8.words c)
if same exportedtreeishs (exportedTreeishes exported) &&
same incompleteexportedtreeishs (incompleteExportedTreeishes exported)
then Just t
else Nothing
_ -> Nothing
where where
update l Nothing = extractSha (L.toStrict l) same l1 l2 = S.fromList l1 == S.fromList l2
-- Subsequent lines are ignored. This leaves room for future
-- expansion of what is logged.
update _l (Just l) = Just l

View file

@ -475,3 +475,9 @@ one three
$ ls ../directory/ $ ls ../directory/
one one
``` ```
> [[fixed|done]]
>
> When this has happened in a repository, after upgrading git-annex,
> the next git-annex import will be a full import, so it will notice
> whatever changes it failed to notice before. --[[Joey]]