build git trees using ContentIdentifier to speed up import

This gets the trees built, but it does not use them. Next step will be
to remember the tree for next time an import is done, and diff between
old and new trees to find the files that have changed.

Added --missing to the mktree parameters. That only disables a check, so
it's ok to do everywhere mktree is used. It probably also speeds up
mktree to disable the check.

Note that git fsck does not complain about the resulting tree objects
that point to shas that are not in the repository. Even with --strict.

A quick benchmark, importing 10000 files, this slowed it down
from 2:04.06 to 2:04.28. So it will more than pay for itself.

Sponsored-by: Luke Shumaker on Patreon
This commit is contained in:
Joey Hess 2023-05-31 12:31:14 -04:00
parent 51319f8558
commit 7298123520
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 124 additions and 69 deletions

View file

@ -1,6 +1,6 @@
{- git-annex import from remotes
-
- Copyright 2019-2021 Joey Hess <id@joeyh.name>
- Copyright 2019-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -46,6 +46,7 @@ import Types.KeySource
import Messages.Progress
import Utility.DataUnits
import Utility.Metered
import Utility.Hash (sha1s)
import Logs.Export
import Logs.Location
import Logs.PreferredContent
@ -62,6 +63,7 @@ import qualified Data.Map.Strict as M
import qualified Data.Set as S
import qualified System.FilePath.Posix.ByteString as Posix
import qualified System.FilePath.ByteString as P
import qualified Data.ByteArray.Encoding as BA
{- Configures how to build an import tree. -}
data ImportTreeConfig
@ -269,69 +271,7 @@ buildImportTrees
-> Maybe TopFilePath
-> ImportableContentsChunkable Annex (Either Sha Key)
-> Annex (History Sha)
buildImportTrees basetree msubdir (ImportableContentsComplete importable) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ buildImportTrees' basetree msubdir importable
buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ \hdl ->
History
<$> go hdl
<*> buildImportTreesHistory basetree msubdir
(importableHistoryComplete importable) hdl
where
go hdl = do
tree <- gochunks [] (importableContentsChunk importable) hdl
importtree <- liftIO $ recordTree' hdl tree
graftImportTree basetree msubdir importtree hdl
gochunks l c hdl = do
let subdir = importChunkSubDir $ importableContentsSubDir c
-- Full directory prefix where the sub tree is located.
let fullprefix = asTopFilePath $ case msubdir of
Nothing -> subdir
Just d -> getTopFilePath d Posix.</> subdir
Tree ts <- convertImportTree (Just fullprefix) $
map (\(p, i) -> (mkImportLocation p, i))
(importableContentsSubTree c)
-- Record this subtree before getting next chunk, this
-- avoids buffering all the chunks into memory.
tc <- liftIO $ recordSubTree hdl $
NewSubTree (asTopFilePath subdir) ts
importableContentsNextChunk c >>= \case
Nothing -> return (Tree (tc:l))
Just c' -> gochunks (tc:l) c' hdl
buildImportTrees'
:: Ref
-> Maybe TopFilePath
-> ImportableContents (Either Sha Key)
-> MkTreeHandle
-> Annex (History Sha)
buildImportTrees' basetree msubdir importable hdl = History
<$> buildImportTree basetree msubdir (importableContents importable) hdl
<*> buildImportTreesHistory basetree msubdir (importableHistory importable) hdl
buildImportTree
:: Ref
-> Maybe TopFilePath
-> [(ImportLocation, Either Sha Key)]
-> MkTreeHandle
-> Annex Sha
buildImportTree basetree msubdir ls hdl = do
importtree <- liftIO . recordTree' hdl =<< convertImportTree msubdir ls
graftImportTree basetree msubdir importtree hdl
graftImportTree
:: Ref
-> Maybe TopFilePath
-> Sha
-> MkTreeHandle
-> Annex Sha
graftImportTree basetree msubdir tree hdl = case msubdir of
Nothing -> return tree
Just subdir -> inRepo $ \repo ->
graftTree' tree subdir basetree repo hdl
buildImportTrees = buildImportTreesGeneric convertImportTree
convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
@ -350,14 +290,123 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
topf = asTopFilePath $
maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
buildImportTreesHistory
{- Builds a history of git trees using ContentIdentifiers.
-
- These are not the final trees that are generated by the import, which
- use Keys. The purpose of these trees is to allow quickly determining
- which files in the import have changed, and which are unchanged, to
- avoid needing to look up the Keys for unchanged ContentIdentifiers.
- When the import has a large number of files, that can be slow.
-}
buildContentIdentifierTree
:: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (History Sha)
buildContentIdentifierTree =
buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing
{- For speed, and to avoid bloating the repository, the ContentIdentifiers
- are not actually checked into git, instead a sha1 hash is calculated
- internally.
-}
convertContentIdentifierTree
:: Maybe TopFilePath
-> [(ImportLocation, (ContentIdentifier, ByteSize))]
-> Annex Tree
convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls
where
mktreeitem (loc, ((ContentIdentifier cid), _sz)) =
TreeItem p mode sha1
where
p = asTopFilePath (fromImportLocation loc)
mode = fromTreeItemType TreeFile
-- Note that this hardcodes sha1, even if git has started
-- defaulting to some other checksum method. That should be
-- ok, hopefully. This checksum never needs to be verified
-- by git, which is why this does not bother to prefix the
-- cid with its length, like git would.
sha1 = Ref $ BA.convertToBase BA.Base16 $ sha1s cid
buildImportTreesGeneric
:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
-> Ref
-> Maybe TopFilePath
-> ImportableContentsChunkable Annex v
-> Annex (History Sha)
buildImportTreesGeneric converttree basetree msubdir (ImportableContentsComplete importable) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ buildImportTreesGeneric' converttree basetree msubdir importable
buildImportTreesGeneric converttree basetree msubdir importable@(ImportableContentsChunked {}) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ \hdl ->
History
<$> go hdl
<*> buildImportTreesHistory converttree basetree msubdir
(importableHistoryComplete importable) hdl
where
go hdl = do
tree <- gochunks [] (importableContentsChunk importable) hdl
importtree <- liftIO $ recordTree' hdl tree
graftImportTree basetree msubdir importtree hdl
gochunks l c hdl = do
let subdir = importChunkSubDir $ importableContentsSubDir c
-- Full directory prefix where the sub tree is located.
let fullprefix = asTopFilePath $ case msubdir of
Nothing -> subdir
Just d -> getTopFilePath d Posix.</> subdir
Tree ts <- converttree (Just fullprefix) $
map (\(p, i) -> (mkImportLocation p, i))
(importableContentsSubTree c)
-- Record this subtree before getting next chunk, this
-- avoids buffering all the chunks into memory.
tc <- liftIO $ recordSubTree hdl $
NewSubTree (asTopFilePath subdir) ts
importableContentsNextChunk c >>= \case
Nothing -> return (Tree (tc:l))
Just c' -> gochunks (tc:l) c' hdl
buildImportTreesGeneric'
:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
-> Ref
-> Maybe TopFilePath
-> ImportableContents v
-> MkTreeHandle
-> Annex (History Sha)
buildImportTreesGeneric' converttree basetree msubdir importable hdl = History
<$> buildImportTree converttree basetree msubdir (importableContents importable) hdl
<*> buildImportTreesHistory converttree basetree msubdir (importableHistory importable) hdl
buildImportTree
:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
-> Ref
-> Maybe TopFilePath
-> [(ImportLocation, v)]
-> MkTreeHandle
-> Annex Sha
buildImportTree converttree basetree msubdir ls hdl = do
importtree <- liftIO . recordTree' hdl =<< converttree msubdir ls
graftImportTree basetree msubdir importtree hdl
graftImportTree
:: Ref
-> Maybe TopFilePath
-> [ImportableContents (Either Sha Key)]
-> Sha
-> MkTreeHandle
-> Annex Sha
graftImportTree basetree msubdir tree hdl = case msubdir of
Nothing -> return tree
Just subdir -> inRepo $ \repo ->
graftTree' tree subdir basetree repo hdl
buildImportTreesHistory
:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
-> Ref
-> Maybe TopFilePath
-> [ImportableContents v]
-> MkTreeHandle
-> Annex (S.Set (History Sha))
buildImportTreesHistory basetree msubdir history hdl = S.fromList
<$> mapM (\ic -> buildImportTrees' basetree msubdir ic hdl) history
buildImportTreesHistory converttree basetree msubdir history hdl = S.fromList
<$> mapM (\ic -> buildImportTreesGeneric' converttree basetree msubdir ic hdl) history
canImportKeys :: Remote -> Bool -> Bool
canImportKeys remote importcontent =
@ -389,6 +438,8 @@ importKeys
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
_ts <- buildContentIdentifierTree importablecontents
-- TODO use above
unless (canImportKeys remote importcontent) $
giveup "This remote does not support importing without downloading content."
-- This map is used to remember content identifiers that

View file

@ -77,7 +77,7 @@ withMkTreeHandle :: (MonadIO m, MonadMask m) => Repo -> (MkTreeHandle -> m a) ->
withMkTreeHandle repo a = bracketIO setup cleanup (a . MkTreeHandle)
where
setup = gitCoProcessStart False ps repo
ps = [Param "mktree", Param "--batch", Param "-z"]
ps = [Param "mktree", Param "--missing", Param "--batch", Param "-z"]
cleanup = CoProcess.stop
{- Records a Tree in the Repo, returning its Sha.

View file

@ -10,6 +10,7 @@
module Utility.Hash (
sha1,
sha1_context,
sha1s,
sha2_224,
sha2_224_context,
sha2_256,
@ -84,6 +85,9 @@ sha1 = hashlazy
sha1_context :: Context SHA1
sha1_context = hashInit
sha1s :: S.ByteString -> Digest SHA1
sha1s = hash
sha2_224 :: L.ByteString -> Digest SHA224
sha2_224 = hashlazy