optimise adjustTree when adding many TreeItems (take 2)

The old code traversed the list of addtreeitems once per subdirectory in
the tree, so could get quite slow. Converting to Map lookups sped it up
significantly.

In my test case, git-annex import used to take about 2 minutes, when
calling adjustTree to add back excluded files to the imported tree. This
dropped it down to 6 seconds. Of which 4 seconds are the actual
enumeration of the contents of the remote, so really only 2 seconds for
this.

The path prefix map is a bit suboptimal memory-wise, since items get
stored in the map once per subdirectory on the path to the item. It
would perhaps be better to use a tree data structure.

Also it's suboptimal memory-wise that it builds two maps, as well
as retaining a reference to addtreeitems. I could not see a way around
that though.

This is a fixed version of commit 2c86651180.
It fixes a test suite reversion.

Sponsored-by: Jack Hill on Patreon
This commit is contained in:
Joey Hess 2024-01-16 11:52:45 -04:00
parent 3fbc5d9c7e
commit c15fa17635
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38

View file

@ -1,11 +1,12 @@
{- git trees {- git trees
- -
- Copyright 2016-2021 Joey Hess <id@joeyh.name> - Copyright 2016-2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
{-# LANGUAGE BangPatterns, TypeSynonymInstances, FlexibleInstances #-} {-# LANGUAGE BangPatterns, TypeSynonymInstances, FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
module Git.Tree ( module Git.Tree (
Tree(..), Tree(..),
@ -231,7 +232,7 @@ adjustTree adjusttreeitem addtreeitems resolveaddconflict removefiles r repo =
withMkTreeHandle repo $ \h -> do withMkTreeHandle repo $ \h -> do
(l, cleanup) <- liftIO $ lsTreeWithObjects LsTree.LsTreeRecursive r repo (l, cleanup) <- liftIO $ lsTreeWithObjects LsTree.LsTreeRecursive r repo
(l', _, _) <- go h False [] 1 inTopTree l (l', _, _) <- go h False [] 1 inTopTree l
l'' <- adjustlist h 0 inTopTree (const True) l' l'' <- adjustlist h 0 inTopTree topTreePath l'
sha <- liftIO $ mkTree h l'' sha <- liftIO $ mkTree h l''
void $ liftIO cleanup void $ liftIO cleanup
return sha return sha
@ -250,7 +251,7 @@ adjustTree adjusttreeitem addtreeitems resolveaddconflict removefiles r repo =
in go h modified (blob:c) depth intree is in go h modified (blob:c) depth intree is
Just TreeObject -> do Just TreeObject -> do
(sl, modified, is') <- go h False [] (depth+1) (beneathSubTree i) is (sl, modified, is') <- go h False [] (depth+1) (beneathSubTree i) is
sl' <- adjustlist h depth (inTree i) (beneathSubTree i) sl sl' <- adjustlist h depth (inTree i) (gitPath i) sl
let slmodified = sl' /= sl let slmodified = sl' /= sl
subtree <- if modified || slmodified subtree <- if modified || slmodified
then liftIO $ recordSubTree h $ NewSubTree (LsTree.file i) sl' then liftIO $ recordSubTree h $ NewSubTree (LsTree.file i) sl'
@ -268,16 +269,22 @@ adjustTree adjusttreeitem addtreeitems resolveaddconflict removefiles r repo =
_ -> giveup ("unexpected object type \"" ++ decodeBS (LsTree.typeobj i) ++ "\"") _ -> giveup ("unexpected object type \"" ++ decodeBS (LsTree.typeobj i) ++ "\"")
| otherwise = return (c, wasmodified, i:is) | otherwise = return (c, wasmodified, i:is)
adjustlist h depth ishere underhere l = do adjustlist h depth ishere herepath l = do
let (addhere, rest) = partition ishere addtreeitems let addhere = fromMaybe [] $ M.lookup herepath addtreeitempathmap
let l' = filter (not . removed) $ let l' = filter (not . removed) $
addoldnew l (map treeItemToTreeContent addhere) addoldnew l (map treeItemToTreeContent addhere)
let inl i = any (\t -> beneathSubTree t i) l' let inl i = any (\t -> beneathSubTree t i) l'
let (Tree addunderhere) = flattenTree depth $ treeItemsToTree $ let (Tree addunderhere) = flattenTree depth $ treeItemsToTree $
filter (\i -> underhere i && not (inl i)) rest filter (not . inl) $ if herepath == topTreePath
then filter (not . ishere) addtreeitems
else fromMaybe [] $
M.lookup (subTreePrefix herepath) addtreeitemprefixmap
addunderhere' <- liftIO $ mapM (recordSubTree h) addunderhere addunderhere' <- liftIO $ mapM (recordSubTree h) addunderhere
return (addoldnew l' addunderhere') return (addoldnew l' addunderhere')
addtreeitempathmap = mkPathMap addtreeitems
addtreeitemprefixmap = mkSubTreePathPrefixMap addtreeitems
removeset = S.fromList $ map (P.normalise . gitPath) removefiles removeset = S.fromList $ map (P.normalise . gitPath) removefiles
removed (TreeBlob f _ _) = S.member (P.normalise (gitPath f)) removeset removed (TreeBlob f _ _) = S.member (P.normalise (gitPath f)) removeset
removed (TreeCommit f _ _) = S.member (P.normalise (gitPath f)) removeset removed (TreeCommit f _ _) = S.member (P.normalise (gitPath f)) removeset
@ -355,12 +362,8 @@ graftTree' subtree graftloc basetree repo hdl = go basetree subdirs graftdirs
subdirs = P.splitDirectories $ gitPath graftloc subdirs = P.splitDirectories $ gitPath graftloc
-- For a graftloc of "foo/bar/baz", this generates
-- ["foo", "foo/bar", "foo/bar/baz"]
graftdirs = map (asTopFilePath . toInternalGitPath) $ graftdirs = map (asTopFilePath . toInternalGitPath) $
mkpaths [] subdirs pathPrefixes subdirs
mkpaths _ [] = []
mkpaths base (d:rest) = (P.joinPath base P.</> d) : mkpaths (base ++ [d]) rest
{- Assumes the list is ordered, with tree objects coming right before their {- Assumes the list is ordered, with tree objects coming right before their
- contents. -} - contents. -}
@ -413,13 +416,50 @@ instance GitPath TreeContent where
gitPath (TreeCommit f _ _) = gitPath f gitPath (TreeCommit f _ _) = gitPath f
inTopTree :: GitPath t => t -> Bool inTopTree :: GitPath t => t -> Bool
inTopTree = inTree "." inTopTree = inTree topTreePath
topTreePath :: RawFilePath
topTreePath = "."
inTree :: (GitPath t, GitPath f) => t -> f -> Bool inTree :: (GitPath t, GitPath f) => t -> f -> Bool
inTree t f = gitPath t == P.takeDirectory (gitPath f) inTree t f = gitPath t == P.takeDirectory (gitPath f)
beneathSubTree :: (GitPath t, GitPath f) => t -> f -> Bool beneathSubTree :: (GitPath t, GitPath f) => t -> f -> Bool
beneathSubTree t f = prefix `B.isPrefixOf` P.normalise (gitPath f) beneathSubTree t f = subTreePrefix t `B.isPrefixOf` subTreePath f
subTreePath :: GitPath t => t -> RawFilePath
subTreePath = P.normalise . gitPath
subTreePrefix :: GitPath t => t -> RawFilePath
subTreePrefix t
| B.null tp = tp
| otherwise = P.addTrailingPathSeparator (P.normalise tp)
where where
tp = gitPath t tp = gitPath t
prefix = if B.null tp then tp else P.addTrailingPathSeparator (P.normalise tp)
{- Makes a Map where the keys are directories, and the values
- are the items located in that directory.
-
- Values that are not in any subdirectory are placed in
- the topTreePath key.
-}
mkPathMap :: GitPath t => [t] -> M.Map RawFilePath [t]
mkPathMap l = M.fromListWith (++) $
map (\ti -> (P.takeDirectory (gitPath ti), [ti])) l
{- Input is eg splitDirectories "foo/bar/baz",
- for which it will output ["foo", "foo/bar", "foo/bar/baz"] -}
pathPrefixes :: [RawFilePath] -> [RawFilePath]
pathPrefixes = go []
where
go _ [] = []
go base (d:rest) = (P.joinPath base P.</> d) : go (base ++ [d]) rest
{- Makes a Map where the keys are all subtree path prefixes,
- and the values are items with that subtree path prefix.
-}
mkSubTreePathPrefixMap :: GitPath t => [t] -> M.Map RawFilePath [t]
mkSubTreePathPrefixMap l = M.fromListWith (++) $ concatMap go l
where
go ti = map (\p -> (p, [ti]))
(map subTreePrefix $ pathPrefixes $ P.splitDirectories $ subTreePath ti)