Sped up view branch construction by 50%

A benchmark in my sound repository with `git-annex view feedtitle=*`
took 2:52 wall clock time before and 1:58 after. Though it still only used
130% of CPU.

This is the same kind of optimisation that is in seekFilteredKeys, though
that precaches location logs, while this streams the metadata logs
directly into the parser.

seekFilteredKeys contains more streaming, to find the annexed files, and
this could be further sped up with similar streaming.

Sponsored-by: Nicholas Golder-Manning on Patreon
This commit is contained in:
Joey Hess 2023-02-13 13:29:57 -04:00
parent c733ccdf21
commit 826b225ca8
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 63 additions and 23 deletions

View file

@ -15,12 +15,14 @@ import Types.View
import Types.MetaData import Types.MetaData
import Annex.MetaData import Annex.MetaData
import qualified Annex import qualified Annex
import qualified Annex.Branch
import qualified Git import qualified Git
import qualified Git.DiffTree as DiffTree import qualified Git.DiffTree as DiffTree
import qualified Git.Branch import qualified Git.Branch
import qualified Git.LsFiles import qualified Git.LsFiles
import qualified Git.LsTree import qualified Git.LsTree
import qualified Git.Ref import qualified Git.Ref
import Git.CatFile
import Git.UpdateIndex import Git.UpdateIndex
import Git.Sha import Git.Sha
import Git.Types import Git.Types
@ -29,6 +31,8 @@ import Annex.WorkTree
import Annex.GitOverlay import Annex.GitOverlay
import Annex.Link import Annex.Link
import Annex.CatFile import Annex.CatFile
import Annex.Concurrent
import Logs
import Logs.MetaData import Logs.MetaData
import Logs.View import Logs.View
import Utility.Glob import Utility.Glob
@ -41,6 +45,7 @@ import qualified Data.ByteString as B
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.Map as M import qualified Data.Map as M
import qualified System.FilePath.ByteString as P import qualified System.FilePath.ByteString as P
import Control.Concurrent.Async
import "mtl" Control.Monad.Writer import "mtl" Control.Monad.Writer
{- Each visible ViewFilter in a view results in another level of {- Each visible ViewFilter in a view results in another level of
@ -435,9 +440,10 @@ applyView' mkviewedfile getfilemetadata view = do
top <- fromRepo Git.repoPath top <- fromRepo Git.repoPath
(l, clean) <- inRepo $ Git.LsFiles.inRepoDetails [] [top] (l, clean) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
applyView'' mkviewedfile getfilemetadata view l clean $ applyView'' mkviewedfile getfilemetadata view l clean $
\go (f, sha, mode) -> do \(f, sha, mode) -> do
topf <- inRepo (toTopFilePath f) topf <- inRepo (toTopFilePath f)
go topf sha (toTreeItemType mode) =<< lookupKey f k <- lookupKey f
return (topf, sha, toTreeItemType mode, k)
genViewBranch view genViewBranch view
applyView'' applyView''
@ -446,29 +452,58 @@ applyView''
-> View -> View
-> [t] -> [t]
-> IO Bool -> IO Bool
-> ((TopFilePath -> Sha -> Maybe TreeItemType -> Maybe Key -> Annex ()) -> t -> Annex ()) -> (t -> Annex (TopFilePath, Sha, Maybe TreeItemType, Maybe Key))
-> Annex () -> Annex ()
applyView'' mkviewedfile getfilemetadata view l clean a = do applyView'' mkviewedfile getfilemetadata view l clean conv = do
viewg <- withNewViewIndex gitRepo viewg <- withNewViewIndex gitRepo
withUpdateIndex viewg $ \uh -> do withUpdateIndex viewg $ \uh -> do
forM_ l $ a (go uh) g <- Annex.gitRepo
liftIO $ void clean gc <- Annex.getGitConfig
-- Streaming the metadata like this is an optimisation.
catObjectStream g $ \mdfeeder mdcloser mdreader -> do
tid <- liftIO . async =<< forkState
(getmetadata gc mdfeeder mdcloser l)
process uh mdreader
join (liftIO (wait tid))
liftIO $ void clean
where where
genviewedfiles = viewedFiles view mkviewedfile -- enables memoization genviewedfiles = viewedFiles view mkviewedfile -- enables memoization
go uh topf _sha _mode (Just k) = do getmetadata _ _ mdcloser [] = liftIO mdcloser
metadata <- getCurrentMetaData k getmetadata gc mdfeeder mdcloser (t:ts) = do
let f = fromRawFilePath $ getTopFilePath topf v@(topf, _sha, _treeitemtype, mkey) <- conv t
let metadata' = getfilemetadata f `unionMetaData` metadata let feed mdlogf = liftIO $ mdfeeder
forM_ (genviewedfiles f metadata') $ \fv -> do (v, Git.Ref.branchFileRef Annex.Branch.fullname mdlogf)
f' <- fromRepo (fromTopFilePath $ asTopFilePath $ toRawFilePath fv) case mkey of
stagesymlink uh f' =<< calcRepo (gitAnnexLink f' k) Just key -> feed (metaDataLogFile gc key)
go uh topf sha (Just treeitemtype) Nothing Nothing
| "." `B.isPrefixOf` getTopFilePath topf = -- Handle toplevel dotfiles that are not
-- annexed files by feeding through a query
-- for dummy metadata. Calling
-- Git.UpdateIndex.streamUpdateIndex'
-- here would race with process's calls
-- to it.
| "." `B.isPrefixOf` getTopFilePath topf ->
feed "dummy"
| otherwise -> noop
getmetadata gc mdfeeder mdcloser ts
process uh mdreader = liftIO mdreader >>= \case
Just ((topf, _, _, Just k), Just mdlog) -> do
let metadata = parseCurrentMetaData mdlog
let f = fromRawFilePath $ getTopFilePath topf
let metadata' = getfilemetadata f `unionMetaData` metadata
forM_ (genviewedfiles f metadata') $ \fv -> do
f' <- fromRepo (fromTopFilePath $ asTopFilePath $ toRawFilePath fv)
stagesymlink uh f' =<< calcRepo (gitAnnexLink f' k)
process uh mdreader
Just ((topf, sha, Just treeitemtype, Nothing), _) -> do
liftIO $ Git.UpdateIndex.streamUpdateIndex' uh $ liftIO $ Git.UpdateIndex.streamUpdateIndex' uh $
pureStreamer $ updateIndexLine sha treeitemtype topf pureStreamer $ updateIndexLine sha treeitemtype topf
go _ _ _ _ _ = noop process uh mdreader
Just _ -> process uh mdreader
Nothing -> return ()
stagesymlink uh f linktarget = do stagesymlink uh f linktarget = do
sha <- hashSymlink linktarget sha <- hashSymlink linktarget
liftIO . Git.UpdateIndex.streamUpdateIndex' uh liftIO . Git.UpdateIndex.streamUpdateIndex' uh
@ -490,17 +525,18 @@ updateView view = do
(Git.LsTree.LsTreeLong True) (Git.LsTree.LsTreeLong True)
(viewParentBranch view) (viewParentBranch view)
applyView'' viewedFileFromReference getWorkTreeMetaData view l clean $ applyView'' viewedFileFromReference getWorkTreeMetaData view l clean $
\go ti -> do \ti -> do
let ref = Git.Ref.branchFileRef (viewParentBranch view) let ref = Git.Ref.branchFileRef (viewParentBranch view)
(getTopFilePath (Git.LsTree.file ti)) (getTopFilePath (Git.LsTree.file ti))
k <- case Git.LsTree.size ti of k <- case Git.LsTree.size ti of
Nothing -> catKey ref Nothing -> catKey ref
Just sz -> catKey' ref sz Just sz -> catKey' ref sz
go return
(Git.LsTree.file ti) ( (Git.LsTree.file ti)
(Git.LsTree.sha ti) , (Git.LsTree.sha ti)
(toTreeItemType (Git.LsTree.mode ti)) , (toTreeItemType (Git.LsTree.mode ti))
k , k
)
oldcommit <- inRepo $ Git.Ref.sha (branchView view) oldcommit <- inRepo $ Git.Ref.sha (branchView view)
oldtree <- maybe (pure Nothing) (inRepo . Git.Ref.tree) oldcommit oldtree <- maybe (pure Nothing) (inRepo . Git.Ref.tree) oldcommit
newtree <- withViewIndex $ inRepo Git.Branch.writeTree newtree <- withViewIndex $ inRepo Git.Branch.writeTree

View file

@ -20,6 +20,7 @@ git-annex (10.20230127) UNRELEASED; urgency=medium
number, the same as git does. number, the same as git does.
* sync: Warn when the adjusted basis ref cannot be found, as happens eg when * sync: Warn when the adjusted basis ref cannot be found, as happens eg when
the user has renamed branches. the user has renamed branches.
* Sped up view branch construction by 50%.
-- Joey Hess <id@joeyh.name> Mon, 06 Feb 2023 13:39:18 -0400 -- Joey Hess <id@joeyh.name> Mon, 06 Feb 2023 13:39:18 -0400

View file

@ -18,3 +18,6 @@ It may also be that just improving the precaching of metadata logs
would improve the speed a lot. The streaming precaching of location logs would improve the speed a lot. The streaming precaching of location logs
sped up some commands around 2x before IIRC. sped up some commands around 2x before IIRC.
> catObjectStream of metadata logs sped view construction up by ~50%.
> More streaming should speed it up more; it still uses
> lookupKey/catKey once per file. --[[Joey]]