build git trees using ContentIdentifier to speed up import

This gets the trees built, but it does not use them. Next step will be to remember the tree for next time an import is done, and diff between old and new trees to find the files that have changed. Added --missing to the mktree parameters. That only disables a check, so it's ok to do everywhere mktree is used. It probably also speeds up mktree to disable the check. Note that git fsck does not complain about the resulting tree objects that point to shas that are not in the repository. Even with --strict. A quick benchmark, importing 10000 files, this slowed it down from 2:04.06 to 2:04.28. So it will more than pay for itself. Sponsored-by: Luke Shumaker on Patreon
2023-05-31 12:31:14 -04:00 · 2023-05-31 12:31:14 -04:00 · 7298123520
commit 7298123520
parent 51319f8558
3 changed files with 124 additions and 69 deletions
--- a/Annex/Import.hs
+++ b/Annex/Import.hs
@ -1,6 +1,6 @@
 {- git-annex import from remotes
 -
- - Copyright 2019-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2019-2023 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@ -46,6 +46,7 @@ import Types.KeySource
 import Messages.Progress
 import Utility.DataUnits
 import Utility.Metered
+import Utility.Hash (sha1s)
 import Logs.Export
 import Logs.Location
 import Logs.PreferredContent
@ -62,6 +63,7 @@ import qualified Data.Map.Strict as M
 import qualified Data.Set as S
 import qualified System.FilePath.Posix.ByteString as Posix
 import qualified System.FilePath.ByteString as P
+import qualified Data.ByteArray.Encoding as BA

 {- Configures how to build an import tree. -}
 data ImportTreeConfig
@ -269,69 +271,7 @@ buildImportTrees
 	-> Maybe TopFilePath
 	-> ImportableContentsChunkable Annex (Either Sha Key)
 	-> Annex (History Sha)
-buildImportTrees basetree msubdir (ImportableContentsComplete importable) = do
-	repo <- Annex.gitRepo
-	withMkTreeHandle repo $ buildImportTrees' basetree msubdir importable
-buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do
-	repo <- Annex.gitRepo
-	withMkTreeHandle repo $ \hdl ->
-		History
-			<$> go hdl
-			<*> buildImportTreesHistory basetree msubdir
-				(importableHistoryComplete importable) hdl
-  where
-	go hdl = do
-		tree <- gochunks [] (importableContentsChunk importable) hdl
-		importtree <- liftIO $ recordTree' hdl tree
-		graftImportTree basetree msubdir importtree hdl
-
-	gochunks l c hdl = do
-		let subdir = importChunkSubDir $ importableContentsSubDir c
-		-- Full directory prefix where the sub tree is located.
-		let fullprefix = asTopFilePath $ case msubdir of
-			Nothing -> subdir
-			Just d -> getTopFilePath d Posix.</> subdir
-		Tree ts <- convertImportTree (Just fullprefix) $
-			map (\(p, i) -> (mkImportLocation p, i))
-				(importableContentsSubTree c)
-		-- Record this subtree before getting next chunk, this
-		-- avoids buffering all the chunks into memory.
-		tc <- liftIO $ recordSubTree hdl $
-			NewSubTree (asTopFilePath subdir) ts
-		importableContentsNextChunk c >>= \case
-			Nothing -> return (Tree (tc:l))
-			Just c' -> gochunks (tc:l) c' hdl
-
-buildImportTrees'
-	:: Ref
-	-> Maybe TopFilePath
-	-> ImportableContents (Either Sha Key)
-	-> MkTreeHandle
-	-> Annex (History Sha)
-buildImportTrees' basetree msubdir importable hdl = History
-	<$> buildImportTree basetree msubdir (importableContents importable) hdl
-	<*> buildImportTreesHistory basetree msubdir (importableHistory importable) hdl
-
-buildImportTree
-	:: Ref
-	-> Maybe TopFilePath
-	-> [(ImportLocation, Either Sha Key)]
-	-> MkTreeHandle
-	-> Annex Sha
-buildImportTree basetree msubdir ls hdl = do
-	importtree <- liftIO . recordTree' hdl =<< convertImportTree msubdir ls
-	graftImportTree basetree msubdir importtree hdl
-
-graftImportTree
-	:: Ref
-	-> Maybe TopFilePath
-	-> Sha
-	-> MkTreeHandle
-	-> Annex Sha
-graftImportTree basetree msubdir tree hdl = case msubdir of
-	Nothing -> return tree
-	Just subdir -> inRepo $ \repo ->
-		graftTree' tree subdir basetree repo hdl
+buildImportTrees = buildImportTreesGeneric convertImportTree

 convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
 convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
@ -350,14 +290,123 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
 		topf = asTopFilePath $
 			maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir

-buildImportTreesHistory
+{- Builds a history of git trees using ContentIdentifiers.
+ -
+ - These are not the final trees that are generated by the import, which
+ - use Keys. The purpose of these trees is to allow quickly determining
+ - which files in the import have changed, and which are unchanged, to
+ - avoid needing to look up the Keys for unchanged ContentIdentifiers.
+ - When the import has a large number of files, that can be slow.
+ -}
+buildContentIdentifierTree
+	:: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
+	-> Annex (History Sha)
+buildContentIdentifierTree = 
+	buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing
+
+{- For speed, and to avoid bloating the repository, the ContentIdentifiers
+ - are not actually checked into git, instead a sha1 hash is calculated
+ - internally.
+ -}
+convertContentIdentifierTree
+	:: Maybe TopFilePath
+	-> [(ImportLocation, (ContentIdentifier, ByteSize))]
+	-> Annex Tree
+convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls
+  where
+	mktreeitem (loc, ((ContentIdentifier cid), _sz)) = 
+		TreeItem p mode sha1
+	  where
+		p = asTopFilePath (fromImportLocation loc)
+		mode = fromTreeItemType TreeFile
+		-- Note that this hardcodes sha1, even if git has started
+		-- defaulting to some other checksum method. That should be
+		-- ok, hopefully. This checksum never needs to be verified
+		-- by git, which is why this does not bother to prefix the
+		-- cid with its length, like git would.
+		sha1 = Ref $ BA.convertToBase BA.Base16 $ sha1s cid
+
+buildImportTreesGeneric
+	:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
+	-> Ref
+	-> Maybe TopFilePath
+	-> ImportableContentsChunkable Annex v
+	-> Annex (History Sha)
+buildImportTreesGeneric converttree basetree msubdir (ImportableContentsComplete importable) = do
+	repo <- Annex.gitRepo
+	withMkTreeHandle repo $ buildImportTreesGeneric' converttree basetree msubdir importable
+buildImportTreesGeneric converttree basetree msubdir importable@(ImportableContentsChunked {}) = do
+	repo <- Annex.gitRepo
+	withMkTreeHandle repo $ \hdl ->
+		History
+			<$> go hdl
+			<*> buildImportTreesHistory converttree basetree msubdir
+				(importableHistoryComplete importable) hdl
+  where
+	go hdl = do
+		tree <- gochunks [] (importableContentsChunk importable) hdl
+		importtree <- liftIO $ recordTree' hdl tree
+		graftImportTree basetree msubdir importtree hdl
+
+	gochunks l c hdl = do
+		let subdir = importChunkSubDir $ importableContentsSubDir c
+		-- Full directory prefix where the sub tree is located.
+		let fullprefix = asTopFilePath $ case msubdir of
+			Nothing -> subdir
+			Just d -> getTopFilePath d Posix.</> subdir
+		Tree ts <- converttree (Just fullprefix) $
+			map (\(p, i) -> (mkImportLocation p, i))
+				(importableContentsSubTree c)
+		-- Record this subtree before getting next chunk, this
+		-- avoids buffering all the chunks into memory.
+		tc <- liftIO $ recordSubTree hdl $
+			NewSubTree (asTopFilePath subdir) ts
+		importableContentsNextChunk c >>= \case
+			Nothing -> return (Tree (tc:l))
+			Just c' -> gochunks (tc:l) c' hdl
+
+buildImportTreesGeneric'
+	:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
+	-> Ref
+	-> Maybe TopFilePath
+	-> ImportableContents v
+	-> MkTreeHandle
+	-> Annex (History Sha)
+buildImportTreesGeneric' converttree basetree msubdir importable hdl = History
+	<$> buildImportTree converttree basetree msubdir (importableContents importable) hdl
+	<*> buildImportTreesHistory converttree basetree msubdir (importableHistory importable) hdl
+
+buildImportTree
+	:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
+	-> Ref
+	-> Maybe TopFilePath
+	-> [(ImportLocation, v)]
+	-> MkTreeHandle
+	-> Annex Sha
+buildImportTree converttree basetree msubdir ls hdl = do
+	importtree <- liftIO . recordTree' hdl =<< converttree msubdir ls
+	graftImportTree basetree msubdir importtree hdl
+
+graftImportTree
 	:: Ref
 	-> Maybe TopFilePath
-	-> [ImportableContents (Either Sha Key)]
+	-> Sha
+	-> MkTreeHandle
+	-> Annex Sha
+graftImportTree basetree msubdir tree hdl = case msubdir of
+	Nothing -> return tree
+	Just subdir -> inRepo $ \repo ->
+		graftTree' tree subdir basetree repo hdl
+
+buildImportTreesHistory
+	:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
+	-> Ref
+	-> Maybe TopFilePath
+	-> [ImportableContents v]
 	-> MkTreeHandle
 	-> Annex (S.Set (History Sha))
-buildImportTreesHistory basetree msubdir history hdl = S.fromList
-	<$> mapM (\ic -> buildImportTrees' basetree msubdir ic hdl) history
+buildImportTreesHistory converttree basetree msubdir history hdl = S.fromList
+	<$> mapM (\ic -> buildImportTreesGeneric' converttree basetree msubdir ic hdl) history

 canImportKeys :: Remote -> Bool -> Bool
 canImportKeys remote importcontent =
@ -389,6 +438,8 @@ importKeys
 	-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
 	-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
 importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
+	_ts <- buildContentIdentifierTree importablecontents
+	-- TODO use above
 	unless (canImportKeys remote importcontent) $
 		giveup "This remote does not support importing without downloading content."
 	-- This map is used to remember content identifiers that
--- a/Git/Tree.hs
+++ b/Git/Tree.hs
@ -77,7 +77,7 @@ withMkTreeHandle :: (MonadIO m, MonadMask m) => Repo -> (MkTreeHandle -> m a) ->
 withMkTreeHandle repo a = bracketIO setup cleanup (a . MkTreeHandle)
  where
 	setup = gitCoProcessStart False ps repo
-	ps = [Param "mktree", Param "--batch", Param "-z"]
+	ps = [Param "mktree", Param "--missing", Param "--batch", Param "-z"]
 	cleanup = CoProcess.stop

 {- Records a Tree in the Repo, returning its Sha.
--- a/Utility/Hash.hs
+++ b/Utility/Hash.hs
@ -10,6 +10,7 @@
 module Utility.Hash (
 	sha1,
 	sha1_context,
+	sha1s,
 	sha2_224,
 	sha2_224_context,
 	sha2_256,
@ -84,6 +85,9 @@ sha1 = hashlazy
 sha1_context :: Context SHA1
 sha1_context = hashInit

+sha1s :: S.ByteString -> Digest SHA1
+sha1s = hash
+
 sha2_224 :: L.ByteString -> Digest SHA224
 sha2_224 = hashlazy