implement importChanges optimisaton (not used yet)

For simplicity, I've not tried to make it handle History yet, so when
there is a history, a full import will still be done. Probably the right
way to handle history is to first diff from the current tree to the last
imported tree. Then, diff from the current tree to each of the
historical trees, and recurse through the history diffing from child tree
to parent tree.

I don't think that will need a record of the previously imported
historical trees, and so Logs.Import doesn't store them. Although I did
leave room for future expansion in that log just in case.

Next step will be to change importTree to importChanges and modify
recordImportTree et all to handle it, by using adjustTree.

Sponsored-by: Brett Eisenberg on Patreon
This commit is contained in:
Joey Hess 2023-05-31 15:45:23 -04:00
parent 7298123520
commit c6acf574c7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 169 additions and 26 deletions

View file

@ -14,6 +14,8 @@ module Annex.Import (
buildImportTrees, buildImportTrees,
recordImportTree, recordImportTree,
canImportKeys, canImportKeys,
ImportResult(..),
importChanges,
importKeys, importKeys,
makeImportMatcher, makeImportMatcher,
getImportableContents, getImportableContents,
@ -27,6 +29,7 @@ import Git.Tree
import Git.Sha import Git.Sha
import Git.FilePath import Git.FilePath
import Git.History import Git.History
import qualified Git.DiffTree
import qualified Git.Ref import qualified Git.Ref
import qualified Git.Branch import qualified Git.Branch
import qualified Annex import qualified Annex
@ -47,6 +50,7 @@ import Messages.Progress
import Utility.DataUnits import Utility.DataUnits
import Utility.Metered import Utility.Metered
import Utility.Hash (sha1s) import Utility.Hash (sha1s)
import Logs.Import
import Logs.Export import Logs.Export
import Logs.Location import Logs.Location
import Logs.PreferredContent import Logs.PreferredContent
@ -300,22 +304,30 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
-} -}
buildContentIdentifierTree buildContentIdentifierTree
:: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize) :: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (History Sha) -> Annex (History Sha, M.Map Sha (ContentIdentifier, ByteSize))
buildContentIdentifierTree = buildContentIdentifierTree importable = do
buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing mv <- liftIO $ newTVarIO M.empty
r <- buildImportTreesGeneric (convertContentIdentifierTree mv) emptyTree Nothing importable
m <- liftIO $ atomically $ readTVar mv
return (r, m)
{- For speed, and to avoid bloating the repository, the ContentIdentifiers {- For speed, and to avoid bloating the repository, the ContentIdentifiers
- are not actually checked into git, instead a sha1 hash is calculated - are not actually checked into git, instead a sha1 hash is calculated
- internally. - internally.
-} -}
convertContentIdentifierTree convertContentIdentifierTree
:: Maybe TopFilePath :: TVar (M.Map Sha (ContentIdentifier, ByteSize))
-> Maybe TopFilePath
-> [(ImportLocation, (ContentIdentifier, ByteSize))] -> [(ImportLocation, (ContentIdentifier, ByteSize))]
-> Annex Tree -> Annex Tree
convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls convertContentIdentifierTree mv _ ls = do
let (tis, ml) = unzip (map mktreeitem ls)
liftIO $ atomically $ modifyTVar' mv $
M.union (M.fromList ml)
return (treeItemsToTree tis)
where where
mktreeitem (loc, ((ContentIdentifier cid), _sz)) = mktreeitem (loc, v@((ContentIdentifier cid), _sz)) =
TreeItem p mode sha1 (TreeItem p mode sha1, (sha1, v))
where where
p = asTopFilePath (fromImportLocation loc) p = asTopFilePath (fromImportLocation loc)
mode = fromTreeItemType TreeFile mode = fromTreeItemType TreeFile
@ -414,6 +426,90 @@ canImportKeys remote importcontent =
where where
ia = Remote.importActions remote ia = Remote.importActions remote
data Diffed t
= DiffChanged t
| DiffRemoved
{- Diffs between the current and previous ContentIdentifier trees, and
- runs importKeys on only the changed files.
-
- This will download the same content as if importKeys were run on all
- files, but this speeds it up significantly when there are a lot of files
- and only a few have changed. importKeys has to look up each
- ContentIdentifier to see if a Key is known for it. This avoids doing
- that lookup on files that have not changed.
-
- Diffing is not currently implemented when there is a History.
-}
importChanges
:: Remote
-> ImportTreeConfig
-> Bool
-> Bool
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (ImportResult (Either
(ImportableContentsChunkable Annex (Either Sha Key))
(ImportableContentsChunkable Annex (Diffed (Either Sha Key)))))
importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
-- diffimport below does not handle history, so when there is
-- history, do a full import.
if not (S.null currhistory)
then fullimport currcidtree
else do
getContentIdentifierTree (Remote.uuid remote) >>= \case
Nothing -> fullimport currcidtree
Just prevcidtree -> diffimport cidtreemap prevcidtree currcidtree
where
remember = recordContentIdentifierTree (Remote.uuid remote)
fullimport currcidtree =
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
ImportUnfinished -> return ImportUnfinished
ImportFinished r -> do
remember currcidtree
return $ ImportFinished $ Left r
diffimport cidtreemap prevcidtree currcidtree = do
(diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive currcidtree prevcidtree
let (removed, changed) = partition (\ti -> Git.DiffTree.dstsha ti `elem` nullShas) diff
let mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
let mkicchanged ti = do
v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
return (mkloc ti, v)
let ic = ImportableContentsComplete $ ImportableContents
{ importableContents = mapMaybe mkicchanged changed
, importableHistory = []
}
importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
ImportUnfinished -> do
void $ liftIO cleanup
return ImportUnfinished
ImportFinished (ImportableContentsComplete ic') -> liftIO cleanup >>= \case
False -> return ImportUnfinished
True -> do
remember currcidtree
let diffchanged = map
(\(loc, v) -> (loc, DiffChanged v))
(importableContents ic')
let diffremoved = map
(\ti -> (mkloc ti, DiffRemoved))
removed
let ic'' = ImportableContentsComplete $ ImportableContents
{ importableContents = diffremoved ++ diffchanged
, importableHistory = []
}
return $ ImportFinished $ Right ic''
-- importKeys is not passed ImportableContentsChunked
-- above, so it cannot return it
ImportFinished (ImportableContentsChunked {}) -> error "internal"
-- Result of an import. ImportUnfinished indicates that some file failed to
-- be imported. Running again should resume where it left off.
data ImportResult t
= ImportFinished t
| ImportUnfinished
{- Downloads all new ContentIdentifiers, or when importcontent is False, {- Downloads all new ContentIdentifiers, or when importcontent is False,
- generates Keys without downloading. - generates Keys without downloading.
- -
@ -423,9 +519,6 @@ canImportKeys remote importcontent =
- -
- Supports concurrency when enabled. - Supports concurrency when enabled.
- -
- If it fails on any file, the whole thing fails with Nothing,
- but it will resume where it left off.
-
- Note that, when a ContentIdentifier has been imported before, - Note that, when a ContentIdentifier has been imported before,
- generates the same thing that was imported before, so annex.largefiles - generates the same thing that was imported before, so annex.largefiles
- is not reapplied. - is not reapplied.
@ -436,10 +529,8 @@ importKeys
-> Bool -> Bool
-> Bool -> Bool
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize) -> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key))) -> Annex (ImportResult (ImportableContentsChunkable Annex (Either Sha Key)))
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
_ts <- buildContentIdentifierTree importablecontents
-- TODO use above
unless (canImportKeys remote importcontent) $ unless (canImportKeys remote importcontent) $
giveup "This remote does not support importing without downloading content." giveup "This remote does not support importing without downloading content."
-- This map is used to remember content identifiers that -- This map is used to remember content identifiers that
@ -476,13 +567,13 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
case importablecontents of case importablecontents of
ImportableContentsComplete ic -> ImportableContentsComplete ic ->
go False largematcher cidmap importing db ic >>= return . \case go False largematcher cidmap importing db ic >>= return . \case
Nothing -> Nothing Nothing -> ImportUnfinished
Just v -> Just $ ImportableContentsComplete v Just v -> ImportFinished $ ImportableContentsComplete v
ImportableContentsChunked {} -> do ImportableContentsChunked {} -> do
c <- gochunked db (importableContentsChunk importablecontents) c <- gochunked db (importableContentsChunk importablecontents)
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
Nothing -> Nothing Nothing -> ImportUnfinished
Just h -> Just $ ImportableContentsChunked Just h -> ImportFinished $ ImportableContentsChunked
{ importableContentsChunk = c { importableContentsChunk = c
, importableHistoryComplete = h , importableHistoryComplete = h
} }

View file

@ -1,6 +1,6 @@
{- git-annex file locations {- git-annex file locations
- -
- Copyright 2010-2022 Joey Hess <id@joeyh.name> - Copyright 2010-2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -59,6 +59,8 @@ module Annex.Locations (
gitAnnexExportLock, gitAnnexExportLock,
gitAnnexExportUpdateLock, gitAnnexExportUpdateLock,
gitAnnexExportExcludeLog, gitAnnexExportExcludeLog,
gitAnnexImportDir,
gitAnnexImportLog,
gitAnnexContentIdentifierDbDir, gitAnnexContentIdentifierDbDir,
gitAnnexContentIdentifierLock, gitAnnexContentIdentifierLock,
gitAnnexScheduleState, gitAnnexScheduleState,
@ -438,6 +440,16 @@ gitAnnexContentIdentifierDbDir r c =
gitAnnexContentIdentifierLock :: Git.Repo -> GitConfig -> RawFilePath gitAnnexContentIdentifierLock :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexContentIdentifierLock r c = gitAnnexContentIdentifierDbDir r c <> ".lck" gitAnnexContentIdentifierLock r c = gitAnnexContentIdentifierDbDir r c <> ".lck"
{- .git/annex/import/ is used to store information about
- imports from special remotes. -}
gitAnnexImportDir :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexImportDir r c = fromMaybe (gitAnnexDir r) (annexDbDir c) P.</> "import"
{- File containing state about the last import done from a remote. -}
gitAnnexImportLog :: UUID -> Git.Repo -> GitConfig -> RawFilePath
gitAnnexImportLog u r c =
gitAnnexImportDir r c P.</> fromUUID u P.</> "log"
{- .git/annex/schedulestate is used to store information about when {- .git/annex/schedulestate is used to store information about when
- scheduled jobs were last run. -} - scheduled jobs were last run. -}
gitAnnexScheduleState :: Git.Repo -> RawFilePath gitAnnexScheduleState :: Git.Repo -> RawFilePath

View file

@ -336,12 +336,12 @@ seekRemote remote branch msubdir importcontent ci = do
liftIO (atomically (readTVar importabletvar)) >>= \case liftIO (atomically (readTVar importabletvar)) >>= \case
Nothing -> return () Nothing -> return ()
Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case
Nothing -> warning $ UnquotedString $ concat ImportUnfinished -> warning $ UnquotedString $ concat
[ "Failed to import some files from " [ "Failed to import some files from "
, Remote.name remote , Remote.name remote
, ". Re-run command to resume import." , ". Re-run command to resume import."
] ]
Just imported -> void $ ImportFinished imported -> void $
includeCommandAction $ includeCommandAction $
commitimport imported commitimport imported
where where

View file

@ -587,12 +587,12 @@ pullThirdPartyPopulated o remote
Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go
where where
go (Just importable) = importKeys remote ImportTree False True importable >>= \case go (Just importable) = importKeys remote ImportTree False True importable >>= \case
Just importablekeys -> do ImportFinished importablekeys -> do
(_imported, updatestate) <- recordImportTree remote ImportTree importablekeys (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys
next $ do next $ do
updatestate updatestate
return True return True
Nothing -> next $ return False ImportUnfinished -> next $ return False
go Nothing = next $ return True -- unchanged from before go Nothing = next $ return True -- unchanged from before
ai = ActionItemOther (Just (UnquotedString (Remote.name remote))) ai = ActionItemOther (Just (UnquotedString (Remote.name remote)))

View file

@ -46,7 +46,7 @@ instance NFData TopFilePath
{- A file in a branch or other treeish. -} {- A file in a branch or other treeish. -}
data BranchFilePath = BranchFilePath Ref TopFilePath data BranchFilePath = BranchFilePath Ref TopFilePath
deriving (Show, Eq, Ord) deriving (Show, Eq, Ord)
{- Git uses the branch:file form to refer to a BranchFilePath -} {- Git uses the branch:file form to refer to a BranchFilePath -}
descBranchFilePath :: BranchFilePath -> StringContainingQuotedPath descBranchFilePath :: BranchFilePath -> StringContainingQuotedPath
descBranchFilePath (BranchFilePath b f) = descBranchFilePath (BranchFilePath b f) =

37
Logs/Import.hs Normal file
View file

@ -0,0 +1,37 @@
{- git-annex import logs
-
- Copyright 2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.Import (
recordContentIdentifierTree,
getContentIdentifierTree
) where
import Annex.Common
import Git.Types
import Git.Sha
import Logs.File
import qualified Data.ByteString.Lazy as L
{- Records the sha of a tree that contains hashes of ContentIdentifiers
- that were imported from a remote. -}
recordContentIdentifierTree :: UUID -> Sha -> Annex ()
recordContentIdentifierTree u t = do
l <- calcRepo' (gitAnnexImportLog u)
writeLogFile l (fromRef t)
{- Gets the tree last recorded for a remote. -}
getContentIdentifierTree :: UUID -> Annex (Maybe Sha)
getContentIdentifierTree u = do
l <- calcRepo' (gitAnnexImportLog u)
-- This is safe because the log file is written atomically.
calcLogFileUnsafe l Nothing update
where
update l Nothing = extractSha (L.toStrict l)
-- Subsequent lines are ignored. This leaves room for future
-- expansion of what is logged.
update _l (Just l) = Just l

View file

@ -1,11 +1,11 @@
{- git-annex import types {- git-annex import types
- -
- Copyright 2019-2021 Joey Hess <id@joeyh.name> - Copyright 2019-2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric, DeriveFunctor #-}
module Types.Import where module Types.Import where
@ -67,7 +67,7 @@ data ImportableContents info = ImportableContents
-- locations. So, if a remote does not support Key/Value access, -- locations. So, if a remote does not support Key/Value access,
-- it should not populate the importableHistory. -- it should not populate the importableHistory.
} }
deriving (Show, Generic) deriving (Show, Generic, Functor)
instance NFData info => NFData (ImportableContents info) instance NFData info => NFData (ImportableContents info)
@ -81,6 +81,7 @@ data ImportableContentsChunkable m info
, importableHistoryComplete :: [ImportableContents info] , importableHistoryComplete :: [ImportableContents info]
-- ^ Chunking the history is not supported -- ^ Chunking the history is not supported
} }
deriving (Functor)
{- A chunk of ImportableContents, which is the entire content of a subtree {- A chunk of ImportableContents, which is the entire content of a subtree
- of the main tree. Nested subtrees are not allowed. -} - of the main tree. Nested subtrees are not allowed. -}
@ -92,6 +93,7 @@ data ImportableContentsChunk m info = ImportableContentsChunk
-- ^ Continuation to get the next chunk. -- ^ Continuation to get the next chunk.
-- Returns Nothing when there are no more chunks. -- Returns Nothing when there are no more chunks.
} }
deriving (Functor)
newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath } newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }

View file

@ -910,6 +910,7 @@ Executable git-annex
Logs.File Logs.File
Logs.FsckResults Logs.FsckResults
Logs.Group Logs.Group
Logs.Import
Logs.Line Logs.Line
Logs.Location Logs.Location
Logs.MapLog Logs.MapLog