implement importChanges optimisaton (not used yet)

For simplicity, I've not tried to make it handle History yet, so when
there is a history, a full import will still be done. Probably the right
way to handle history is to first diff from the current tree to the last
imported tree. Then, diff from the current tree to each of the
historical trees, and recurse through the history diffing from child tree
to parent tree.

I don't think that will need a record of the previously imported
historical trees, and so Logs.Import doesn't store them. Although I did
leave room for future expansion in that log just in case.

Next step will be to change importTree to importChanges and modify
recordImportTree et all to handle it, by using adjustTree.

Sponsored-by: Brett Eisenberg on Patreon
This commit is contained in:
Joey Hess 2023-05-31 15:45:23 -04:00
parent 7298123520
commit c6acf574c7
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 169 additions and 26 deletions

View file

@ -14,6 +14,8 @@ module Annex.Import (
buildImportTrees,
recordImportTree,
canImportKeys,
ImportResult(..),
importChanges,
importKeys,
makeImportMatcher,
getImportableContents,
@ -27,6 +29,7 @@ import Git.Tree
import Git.Sha
import Git.FilePath
import Git.History
import qualified Git.DiffTree
import qualified Git.Ref
import qualified Git.Branch
import qualified Annex
@ -47,6 +50,7 @@ import Messages.Progress
import Utility.DataUnits
import Utility.Metered
import Utility.Hash (sha1s)
import Logs.Import
import Logs.Export
import Logs.Location
import Logs.PreferredContent
@ -300,22 +304,30 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
-}
buildContentIdentifierTree
:: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (History Sha)
buildContentIdentifierTree =
buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing
-> Annex (History Sha, M.Map Sha (ContentIdentifier, ByteSize))
buildContentIdentifierTree importable = do
mv <- liftIO $ newTVarIO M.empty
r <- buildImportTreesGeneric (convertContentIdentifierTree mv) emptyTree Nothing importable
m <- liftIO $ atomically $ readTVar mv
return (r, m)
{- For speed, and to avoid bloating the repository, the ContentIdentifiers
- are not actually checked into git, instead a sha1 hash is calculated
- internally.
-}
convertContentIdentifierTree
:: Maybe TopFilePath
:: TVar (M.Map Sha (ContentIdentifier, ByteSize))
-> Maybe TopFilePath
-> [(ImportLocation, (ContentIdentifier, ByteSize))]
-> Annex Tree
convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls
convertContentIdentifierTree mv _ ls = do
let (tis, ml) = unzip (map mktreeitem ls)
liftIO $ atomically $ modifyTVar' mv $
M.union (M.fromList ml)
return (treeItemsToTree tis)
where
mktreeitem (loc, ((ContentIdentifier cid), _sz)) =
TreeItem p mode sha1
mktreeitem (loc, v@((ContentIdentifier cid), _sz)) =
(TreeItem p mode sha1, (sha1, v))
where
p = asTopFilePath (fromImportLocation loc)
mode = fromTreeItemType TreeFile
@ -414,6 +426,90 @@ canImportKeys remote importcontent =
where
ia = Remote.importActions remote
data Diffed t
= DiffChanged t
| DiffRemoved
{- Diffs between the current and previous ContentIdentifier trees, and
- runs importKeys on only the changed files.
-
- This will download the same content as if importKeys were run on all
- files, but this speeds it up significantly when there are a lot of files
- and only a few have changed. importKeys has to look up each
- ContentIdentifier to see if a Key is known for it. This avoids doing
- that lookup on files that have not changed.
-
- Diffing is not currently implemented when there is a History.
-}
importChanges
:: Remote
-> ImportTreeConfig
-> Bool
-> Bool
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (ImportResult (Either
(ImportableContentsChunkable Annex (Either Sha Key))
(ImportableContentsChunkable Annex (Diffed (Either Sha Key)))))
importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
-- diffimport below does not handle history, so when there is
-- history, do a full import.
if not (S.null currhistory)
then fullimport currcidtree
else do
getContentIdentifierTree (Remote.uuid remote) >>= \case
Nothing -> fullimport currcidtree
Just prevcidtree -> diffimport cidtreemap prevcidtree currcidtree
where
remember = recordContentIdentifierTree (Remote.uuid remote)
fullimport currcidtree =
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
ImportUnfinished -> return ImportUnfinished
ImportFinished r -> do
remember currcidtree
return $ ImportFinished $ Left r
diffimport cidtreemap prevcidtree currcidtree = do
(diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive currcidtree prevcidtree
let (removed, changed) = partition (\ti -> Git.DiffTree.dstsha ti `elem` nullShas) diff
let mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
let mkicchanged ti = do
v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
return (mkloc ti, v)
let ic = ImportableContentsComplete $ ImportableContents
{ importableContents = mapMaybe mkicchanged changed
, importableHistory = []
}
importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
ImportUnfinished -> do
void $ liftIO cleanup
return ImportUnfinished
ImportFinished (ImportableContentsComplete ic') -> liftIO cleanup >>= \case
False -> return ImportUnfinished
True -> do
remember currcidtree
let diffchanged = map
(\(loc, v) -> (loc, DiffChanged v))
(importableContents ic')
let diffremoved = map
(\ti -> (mkloc ti, DiffRemoved))
removed
let ic'' = ImportableContentsComplete $ ImportableContents
{ importableContents = diffremoved ++ diffchanged
, importableHistory = []
}
return $ ImportFinished $ Right ic''
-- importKeys is not passed ImportableContentsChunked
-- above, so it cannot return it
ImportFinished (ImportableContentsChunked {}) -> error "internal"
-- Result of an import. ImportUnfinished indicates that some file failed to
-- be imported. Running again should resume where it left off.
data ImportResult t
= ImportFinished t
| ImportUnfinished
{- Downloads all new ContentIdentifiers, or when importcontent is False,
- generates Keys without downloading.
-
@ -423,9 +519,6 @@ canImportKeys remote importcontent =
-
- Supports concurrency when enabled.
-
- If it fails on any file, the whole thing fails with Nothing,
- but it will resume where it left off.
-
- Note that, when a ContentIdentifier has been imported before,
- generates the same thing that was imported before, so annex.largefiles
- is not reapplied.
@ -436,10 +529,8 @@ importKeys
-> Bool
-> Bool
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
-> Annex (ImportResult (ImportableContentsChunkable Annex (Either Sha Key)))
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
_ts <- buildContentIdentifierTree importablecontents
-- TODO use above
unless (canImportKeys remote importcontent) $
giveup "This remote does not support importing without downloading content."
-- This map is used to remember content identifiers that
@ -476,13 +567,13 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
case importablecontents of
ImportableContentsComplete ic ->
go False largematcher cidmap importing db ic >>= return . \case
Nothing -> Nothing
Just v -> Just $ ImportableContentsComplete v
Nothing -> ImportUnfinished
Just v -> ImportFinished $ ImportableContentsComplete v
ImportableContentsChunked {} -> do
c <- gochunked db (importableContentsChunk importablecontents)
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
Nothing -> Nothing
Just h -> Just $ ImportableContentsChunked
Nothing -> ImportUnfinished
Just h -> ImportFinished $ ImportableContentsChunked
{ importableContentsChunk = c
, importableHistoryComplete = h
}

View file

@ -1,6 +1,6 @@
{- git-annex file locations
-
- Copyright 2010-2022 Joey Hess <id@joeyh.name>
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -59,6 +59,8 @@ module Annex.Locations (
gitAnnexExportLock,
gitAnnexExportUpdateLock,
gitAnnexExportExcludeLog,
gitAnnexImportDir,
gitAnnexImportLog,
gitAnnexContentIdentifierDbDir,
gitAnnexContentIdentifierLock,
gitAnnexScheduleState,
@ -438,6 +440,16 @@ gitAnnexContentIdentifierDbDir r c =
gitAnnexContentIdentifierLock :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexContentIdentifierLock r c = gitAnnexContentIdentifierDbDir r c <> ".lck"
{- .git/annex/import/ is used to store information about
- imports from special remotes. -}
gitAnnexImportDir :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexImportDir r c = fromMaybe (gitAnnexDir r) (annexDbDir c) P.</> "import"
{- File containing state about the last import done from a remote. -}
gitAnnexImportLog :: UUID -> Git.Repo -> GitConfig -> RawFilePath
gitAnnexImportLog u r c =
gitAnnexImportDir r c P.</> fromUUID u P.</> "log"
{- .git/annex/schedulestate is used to store information about when
- scheduled jobs were last run. -}
gitAnnexScheduleState :: Git.Repo -> RawFilePath

View file

@ -336,12 +336,12 @@ seekRemote remote branch msubdir importcontent ci = do
liftIO (atomically (readTVar importabletvar)) >>= \case
Nothing -> return ()
Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case
Nothing -> warning $ UnquotedString $ concat
ImportUnfinished -> warning $ UnquotedString $ concat
[ "Failed to import some files from "
, Remote.name remote
, ". Re-run command to resume import."
]
Just imported -> void $
ImportFinished imported -> void $
includeCommandAction $
commitimport imported
where

View file

@ -587,12 +587,12 @@ pullThirdPartyPopulated o remote
Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go
where
go (Just importable) = importKeys remote ImportTree False True importable >>= \case
Just importablekeys -> do
ImportFinished importablekeys -> do
(_imported, updatestate) <- recordImportTree remote ImportTree importablekeys
next $ do
updatestate
return True
Nothing -> next $ return False
ImportUnfinished -> next $ return False
go Nothing = next $ return True -- unchanged from before
ai = ActionItemOther (Just (UnquotedString (Remote.name remote)))

View file

@ -46,7 +46,7 @@ instance NFData TopFilePath
{- A file in a branch or other treeish. -}
data BranchFilePath = BranchFilePath Ref TopFilePath
deriving (Show, Eq, Ord)
{- Git uses the branch:file form to refer to a BranchFilePath -}
descBranchFilePath :: BranchFilePath -> StringContainingQuotedPath
descBranchFilePath (BranchFilePath b f) =

37
Logs/Import.hs Normal file
View file

@ -0,0 +1,37 @@
{- git-annex import logs
-
- Copyright 2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.Import (
recordContentIdentifierTree,
getContentIdentifierTree
) where
import Annex.Common
import Git.Types
import Git.Sha
import Logs.File
import qualified Data.ByteString.Lazy as L
{- Records the sha of a tree that contains hashes of ContentIdentifiers
- that were imported from a remote. -}
recordContentIdentifierTree :: UUID -> Sha -> Annex ()
recordContentIdentifierTree u t = do
l <- calcRepo' (gitAnnexImportLog u)
writeLogFile l (fromRef t)
{- Gets the tree last recorded for a remote. -}
getContentIdentifierTree :: UUID -> Annex (Maybe Sha)
getContentIdentifierTree u = do
l <- calcRepo' (gitAnnexImportLog u)
-- This is safe because the log file is written atomically.
calcLogFileUnsafe l Nothing update
where
update l Nothing = extractSha (L.toStrict l)
-- Subsequent lines are ignored. This leaves room for future
-- expansion of what is logged.
update _l (Just l) = Just l

View file

@ -1,11 +1,11 @@
{- git-annex import types
-
- Copyright 2019-2021 Joey Hess <id@joeyh.name>
- Copyright 2019-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DeriveGeneric, DeriveFunctor #-}
module Types.Import where
@ -67,7 +67,7 @@ data ImportableContents info = ImportableContents
-- locations. So, if a remote does not support Key/Value access,
-- it should not populate the importableHistory.
}
deriving (Show, Generic)
deriving (Show, Generic, Functor)
instance NFData info => NFData (ImportableContents info)
@ -81,6 +81,7 @@ data ImportableContentsChunkable m info
, importableHistoryComplete :: [ImportableContents info]
-- ^ Chunking the history is not supported
}
deriving (Functor)
{- A chunk of ImportableContents, which is the entire content of a subtree
- of the main tree. Nested subtrees are not allowed. -}
@ -92,6 +93,7 @@ data ImportableContentsChunk m info = ImportableContentsChunk
-- ^ Continuation to get the next chunk.
-- Returns Nothing when there are no more chunks.
}
deriving (Functor)
newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }

View file

@ -910,6 +910,7 @@ Executable git-annex
Logs.File
Logs.FsckResults
Logs.Group
Logs.Import
Logs.Line
Logs.Location
Logs.MapLog