implement importChanges optimisaton (not used yet)
For simplicity, I've not tried to make it handle History yet, so when there is a history, a full import will still be done. Probably the right way to handle history is to first diff from the current tree to the last imported tree. Then, diff from the current tree to each of the historical trees, and recurse through the history diffing from child tree to parent tree. I don't think that will need a record of the previously imported historical trees, and so Logs.Import doesn't store them. Although I did leave room for future expansion in that log just in case. Next step will be to change importTree to importChanges and modify recordImportTree et all to handle it, by using adjustTree. Sponsored-by: Brett Eisenberg on Patreon
This commit is contained in:
parent
7298123520
commit
c6acf574c7
8 changed files with 169 additions and 26 deletions
125
Annex/Import.hs
125
Annex/Import.hs
|
@ -14,6 +14,8 @@ module Annex.Import (
|
|||
buildImportTrees,
|
||||
recordImportTree,
|
||||
canImportKeys,
|
||||
ImportResult(..),
|
||||
importChanges,
|
||||
importKeys,
|
||||
makeImportMatcher,
|
||||
getImportableContents,
|
||||
|
@ -27,6 +29,7 @@ import Git.Tree
|
|||
import Git.Sha
|
||||
import Git.FilePath
|
||||
import Git.History
|
||||
import qualified Git.DiffTree
|
||||
import qualified Git.Ref
|
||||
import qualified Git.Branch
|
||||
import qualified Annex
|
||||
|
@ -47,6 +50,7 @@ import Messages.Progress
|
|||
import Utility.DataUnits
|
||||
import Utility.Metered
|
||||
import Utility.Hash (sha1s)
|
||||
import Logs.Import
|
||||
import Logs.Export
|
||||
import Logs.Location
|
||||
import Logs.PreferredContent
|
||||
|
@ -300,22 +304,30 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
|
|||
-}
|
||||
buildContentIdentifierTree
|
||||
:: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
|
||||
-> Annex (History Sha)
|
||||
buildContentIdentifierTree =
|
||||
buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing
|
||||
-> Annex (History Sha, M.Map Sha (ContentIdentifier, ByteSize))
|
||||
buildContentIdentifierTree importable = do
|
||||
mv <- liftIO $ newTVarIO M.empty
|
||||
r <- buildImportTreesGeneric (convertContentIdentifierTree mv) emptyTree Nothing importable
|
||||
m <- liftIO $ atomically $ readTVar mv
|
||||
return (r, m)
|
||||
|
||||
{- For speed, and to avoid bloating the repository, the ContentIdentifiers
|
||||
- are not actually checked into git, instead a sha1 hash is calculated
|
||||
- internally.
|
||||
-}
|
||||
convertContentIdentifierTree
|
||||
:: Maybe TopFilePath
|
||||
:: TVar (M.Map Sha (ContentIdentifier, ByteSize))
|
||||
-> Maybe TopFilePath
|
||||
-> [(ImportLocation, (ContentIdentifier, ByteSize))]
|
||||
-> Annex Tree
|
||||
convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls
|
||||
convertContentIdentifierTree mv _ ls = do
|
||||
let (tis, ml) = unzip (map mktreeitem ls)
|
||||
liftIO $ atomically $ modifyTVar' mv $
|
||||
M.union (M.fromList ml)
|
||||
return (treeItemsToTree tis)
|
||||
where
|
||||
mktreeitem (loc, ((ContentIdentifier cid), _sz)) =
|
||||
TreeItem p mode sha1
|
||||
mktreeitem (loc, v@((ContentIdentifier cid), _sz)) =
|
||||
(TreeItem p mode sha1, (sha1, v))
|
||||
where
|
||||
p = asTopFilePath (fromImportLocation loc)
|
||||
mode = fromTreeItemType TreeFile
|
||||
|
@ -414,6 +426,90 @@ canImportKeys remote importcontent =
|
|||
where
|
||||
ia = Remote.importActions remote
|
||||
|
||||
data Diffed t
|
||||
= DiffChanged t
|
||||
| DiffRemoved
|
||||
|
||||
{- Diffs between the current and previous ContentIdentifier trees, and
|
||||
- runs importKeys on only the changed files.
|
||||
-
|
||||
- This will download the same content as if importKeys were run on all
|
||||
- files, but this speeds it up significantly when there are a lot of files
|
||||
- and only a few have changed. importKeys has to look up each
|
||||
- ContentIdentifier to see if a Key is known for it. This avoids doing
|
||||
- that lookup on files that have not changed.
|
||||
-
|
||||
- Diffing is not currently implemented when there is a History.
|
||||
-}
|
||||
importChanges
|
||||
:: Remote
|
||||
-> ImportTreeConfig
|
||||
-> Bool
|
||||
-> Bool
|
||||
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
|
||||
-> Annex (ImportResult (Either
|
||||
(ImportableContentsChunkable Annex (Either Sha Key))
|
||||
(ImportableContentsChunkable Annex (Diffed (Either Sha Key)))))
|
||||
importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
|
||||
((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
|
||||
-- diffimport below does not handle history, so when there is
|
||||
-- history, do a full import.
|
||||
if not (S.null currhistory)
|
||||
then fullimport currcidtree
|
||||
else do
|
||||
getContentIdentifierTree (Remote.uuid remote) >>= \case
|
||||
Nothing -> fullimport currcidtree
|
||||
Just prevcidtree -> diffimport cidtreemap prevcidtree currcidtree
|
||||
where
|
||||
remember = recordContentIdentifierTree (Remote.uuid remote)
|
||||
|
||||
fullimport currcidtree =
|
||||
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
|
||||
ImportUnfinished -> return ImportUnfinished
|
||||
ImportFinished r -> do
|
||||
remember currcidtree
|
||||
return $ ImportFinished $ Left r
|
||||
|
||||
diffimport cidtreemap prevcidtree currcidtree = do
|
||||
(diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive currcidtree prevcidtree
|
||||
let (removed, changed) = partition (\ti -> Git.DiffTree.dstsha ti `elem` nullShas) diff
|
||||
let mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
|
||||
let mkicchanged ti = do
|
||||
v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
|
||||
return (mkloc ti, v)
|
||||
let ic = ImportableContentsComplete $ ImportableContents
|
||||
{ importableContents = mapMaybe mkicchanged changed
|
||||
, importableHistory = []
|
||||
}
|
||||
importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
|
||||
ImportUnfinished -> do
|
||||
void $ liftIO cleanup
|
||||
return ImportUnfinished
|
||||
ImportFinished (ImportableContentsComplete ic') -> liftIO cleanup >>= \case
|
||||
False -> return ImportUnfinished
|
||||
True -> do
|
||||
remember currcidtree
|
||||
let diffchanged = map
|
||||
(\(loc, v) -> (loc, DiffChanged v))
|
||||
(importableContents ic')
|
||||
let diffremoved = map
|
||||
(\ti -> (mkloc ti, DiffRemoved))
|
||||
removed
|
||||
let ic'' = ImportableContentsComplete $ ImportableContents
|
||||
{ importableContents = diffremoved ++ diffchanged
|
||||
, importableHistory = []
|
||||
}
|
||||
return $ ImportFinished $ Right ic''
|
||||
-- importKeys is not passed ImportableContentsChunked
|
||||
-- above, so it cannot return it
|
||||
ImportFinished (ImportableContentsChunked {}) -> error "internal"
|
||||
|
||||
-- Result of an import. ImportUnfinished indicates that some file failed to
|
||||
-- be imported. Running again should resume where it left off.
|
||||
data ImportResult t
|
||||
= ImportFinished t
|
||||
| ImportUnfinished
|
||||
|
||||
{- Downloads all new ContentIdentifiers, or when importcontent is False,
|
||||
- generates Keys without downloading.
|
||||
-
|
||||
|
@ -423,9 +519,6 @@ canImportKeys remote importcontent =
|
|||
-
|
||||
- Supports concurrency when enabled.
|
||||
-
|
||||
- If it fails on any file, the whole thing fails with Nothing,
|
||||
- but it will resume where it left off.
|
||||
-
|
||||
- Note that, when a ContentIdentifier has been imported before,
|
||||
- generates the same thing that was imported before, so annex.largefiles
|
||||
- is not reapplied.
|
||||
|
@ -436,10 +529,8 @@ importKeys
|
|||
-> Bool
|
||||
-> Bool
|
||||
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
|
||||
-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
|
||||
-> Annex (ImportResult (ImportableContentsChunkable Annex (Either Sha Key)))
|
||||
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
|
||||
_ts <- buildContentIdentifierTree importablecontents
|
||||
-- TODO use above
|
||||
unless (canImportKeys remote importcontent) $
|
||||
giveup "This remote does not support importing without downloading content."
|
||||
-- This map is used to remember content identifiers that
|
||||
|
@ -476,13 +567,13 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
|
|||
case importablecontents of
|
||||
ImportableContentsComplete ic ->
|
||||
go False largematcher cidmap importing db ic >>= return . \case
|
||||
Nothing -> Nothing
|
||||
Just v -> Just $ ImportableContentsComplete v
|
||||
Nothing -> ImportUnfinished
|
||||
Just v -> ImportFinished $ ImportableContentsComplete v
|
||||
ImportableContentsChunked {} -> do
|
||||
c <- gochunked db (importableContentsChunk importablecontents)
|
||||
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
|
||||
Nothing -> Nothing
|
||||
Just h -> Just $ ImportableContentsChunked
|
||||
Nothing -> ImportUnfinished
|
||||
Just h -> ImportFinished $ ImportableContentsChunked
|
||||
{ importableContentsChunk = c
|
||||
, importableHistoryComplete = h
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex file locations
|
||||
-
|
||||
- Copyright 2010-2022 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2010-2023 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -59,6 +59,8 @@ module Annex.Locations (
|
|||
gitAnnexExportLock,
|
||||
gitAnnexExportUpdateLock,
|
||||
gitAnnexExportExcludeLog,
|
||||
gitAnnexImportDir,
|
||||
gitAnnexImportLog,
|
||||
gitAnnexContentIdentifierDbDir,
|
||||
gitAnnexContentIdentifierLock,
|
||||
gitAnnexScheduleState,
|
||||
|
@ -438,6 +440,16 @@ gitAnnexContentIdentifierDbDir r c =
|
|||
gitAnnexContentIdentifierLock :: Git.Repo -> GitConfig -> RawFilePath
|
||||
gitAnnexContentIdentifierLock r c = gitAnnexContentIdentifierDbDir r c <> ".lck"
|
||||
|
||||
{- .git/annex/import/ is used to store information about
|
||||
- imports from special remotes. -}
|
||||
gitAnnexImportDir :: Git.Repo -> GitConfig -> RawFilePath
|
||||
gitAnnexImportDir r c = fromMaybe (gitAnnexDir r) (annexDbDir c) P.</> "import"
|
||||
|
||||
{- File containing state about the last import done from a remote. -}
|
||||
gitAnnexImportLog :: UUID -> Git.Repo -> GitConfig -> RawFilePath
|
||||
gitAnnexImportLog u r c =
|
||||
gitAnnexImportDir r c P.</> fromUUID u P.</> "log"
|
||||
|
||||
{- .git/annex/schedulestate is used to store information about when
|
||||
- scheduled jobs were last run. -}
|
||||
gitAnnexScheduleState :: Git.Repo -> RawFilePath
|
||||
|
|
|
@ -336,12 +336,12 @@ seekRemote remote branch msubdir importcontent ci = do
|
|||
liftIO (atomically (readTVar importabletvar)) >>= \case
|
||||
Nothing -> return ()
|
||||
Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case
|
||||
Nothing -> warning $ UnquotedString $ concat
|
||||
ImportUnfinished -> warning $ UnquotedString $ concat
|
||||
[ "Failed to import some files from "
|
||||
, Remote.name remote
|
||||
, ". Re-run command to resume import."
|
||||
]
|
||||
Just imported -> void $
|
||||
ImportFinished imported -> void $
|
||||
includeCommandAction $
|
||||
commitimport imported
|
||||
where
|
||||
|
|
|
@ -587,12 +587,12 @@ pullThirdPartyPopulated o remote
|
|||
Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go
|
||||
where
|
||||
go (Just importable) = importKeys remote ImportTree False True importable >>= \case
|
||||
Just importablekeys -> do
|
||||
ImportFinished importablekeys -> do
|
||||
(_imported, updatestate) <- recordImportTree remote ImportTree importablekeys
|
||||
next $ do
|
||||
updatestate
|
||||
return True
|
||||
Nothing -> next $ return False
|
||||
ImportUnfinished -> next $ return False
|
||||
go Nothing = next $ return True -- unchanged from before
|
||||
|
||||
ai = ActionItemOther (Just (UnquotedString (Remote.name remote)))
|
||||
|
|
|
@ -46,7 +46,7 @@ instance NFData TopFilePath
|
|||
{- A file in a branch or other treeish. -}
|
||||
data BranchFilePath = BranchFilePath Ref TopFilePath
|
||||
deriving (Show, Eq, Ord)
|
||||
|
||||
|
||||
{- Git uses the branch:file form to refer to a BranchFilePath -}
|
||||
descBranchFilePath :: BranchFilePath -> StringContainingQuotedPath
|
||||
descBranchFilePath (BranchFilePath b f) =
|
||||
|
|
37
Logs/Import.hs
Normal file
37
Logs/Import.hs
Normal file
|
@ -0,0 +1,37 @@
|
|||
{- git-annex import logs
|
||||
-
|
||||
- Copyright 2023 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Logs.Import (
|
||||
recordContentIdentifierTree,
|
||||
getContentIdentifierTree
|
||||
) where
|
||||
|
||||
import Annex.Common
|
||||
import Git.Types
|
||||
import Git.Sha
|
||||
import Logs.File
|
||||
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
||||
{- Records the sha of a tree that contains hashes of ContentIdentifiers
|
||||
- that were imported from a remote. -}
|
||||
recordContentIdentifierTree :: UUID -> Sha -> Annex ()
|
||||
recordContentIdentifierTree u t = do
|
||||
l <- calcRepo' (gitAnnexImportLog u)
|
||||
writeLogFile l (fromRef t)
|
||||
|
||||
{- Gets the tree last recorded for a remote. -}
|
||||
getContentIdentifierTree :: UUID -> Annex (Maybe Sha)
|
||||
getContentIdentifierTree u = do
|
||||
l <- calcRepo' (gitAnnexImportLog u)
|
||||
-- This is safe because the log file is written atomically.
|
||||
calcLogFileUnsafe l Nothing update
|
||||
where
|
||||
update l Nothing = extractSha (L.toStrict l)
|
||||
-- Subsequent lines are ignored. This leaves room for future
|
||||
-- expansion of what is logged.
|
||||
update _l (Just l) = Just l
|
|
@ -1,11 +1,11 @@
|
|||
{- git-annex import types
|
||||
-
|
||||
- Copyright 2019-2021 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2019-2023 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE DeriveGeneric #-}
|
||||
{-# LANGUAGE DeriveGeneric, DeriveFunctor #-}
|
||||
|
||||
module Types.Import where
|
||||
|
||||
|
@ -67,7 +67,7 @@ data ImportableContents info = ImportableContents
|
|||
-- locations. So, if a remote does not support Key/Value access,
|
||||
-- it should not populate the importableHistory.
|
||||
}
|
||||
deriving (Show, Generic)
|
||||
deriving (Show, Generic, Functor)
|
||||
|
||||
instance NFData info => NFData (ImportableContents info)
|
||||
|
||||
|
@ -81,6 +81,7 @@ data ImportableContentsChunkable m info
|
|||
, importableHistoryComplete :: [ImportableContents info]
|
||||
-- ^ Chunking the history is not supported
|
||||
}
|
||||
deriving (Functor)
|
||||
|
||||
{- A chunk of ImportableContents, which is the entire content of a subtree
|
||||
- of the main tree. Nested subtrees are not allowed. -}
|
||||
|
@ -92,6 +93,7 @@ data ImportableContentsChunk m info = ImportableContentsChunk
|
|||
-- ^ Continuation to get the next chunk.
|
||||
-- Returns Nothing when there are no more chunks.
|
||||
}
|
||||
deriving (Functor)
|
||||
|
||||
newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }
|
||||
|
||||
|
|
|
@ -910,6 +910,7 @@ Executable git-annex
|
|||
Logs.File
|
||||
Logs.FsckResults
|
||||
Logs.Group
|
||||
Logs.Import
|
||||
Logs.Line
|
||||
Logs.Location
|
||||
Logs.MapLog
|
||||
|
|
Loading…
Add table
Reference in a new issue