speed up initial scanning for annexed files
Streaming through git this way speeds it up by around 25%. This is similar to the optimisations of seeking annexed files. Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
aa00e171cb
commit
0f54e5e0ae
2 changed files with 46 additions and 18 deletions
|
@ -1,6 +1,6 @@
|
||||||
{- git-annex worktree files
|
{- git-annex worktree files
|
||||||
-
|
-
|
||||||
- Copyright 2013-2020 Joey Hess <id@joeyh.name>
|
- Copyright 2013-2021 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -8,6 +8,7 @@
|
||||||
module Annex.WorkTree where
|
module Annex.WorkTree where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
|
import qualified Annex
|
||||||
import Annex.Link
|
import Annex.Link
|
||||||
import Annex.CatFile
|
import Annex.CatFile
|
||||||
import Annex.Content
|
import Annex.Content
|
||||||
|
@ -16,6 +17,7 @@ import Annex.CurrentBranch
|
||||||
import Annex.InodeSentinal
|
import Annex.InodeSentinal
|
||||||
import Utility.InodeCache
|
import Utility.InodeCache
|
||||||
import Git.FilePath
|
import Git.FilePath
|
||||||
|
import Git.CatFile
|
||||||
import qualified Git.Ref
|
import qualified Git.Ref
|
||||||
import qualified Git.LsTree
|
import qualified Git.LsTree
|
||||||
import qualified Git.Types
|
import qualified Git.Types
|
||||||
|
@ -24,7 +26,7 @@ import qualified Database.Keys.SQL
|
||||||
import Config
|
import Config
|
||||||
import qualified Utility.RawFilePath as R
|
import qualified Utility.RawFilePath as R
|
||||||
|
|
||||||
import Control.Concurrent
|
import qualified Data.ByteString.Lazy as L
|
||||||
|
|
||||||
{- Looks up the key corresponding to an annexed file in the work tree,
|
{- Looks up the key corresponding to an annexed file in the work tree,
|
||||||
- by examining what the file links to.
|
- by examining what the file links to.
|
||||||
|
@ -79,30 +81,40 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
|
||||||
-}
|
-}
|
||||||
scanAnnexedFiles :: Annex ()
|
scanAnnexedFiles :: Annex ()
|
||||||
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
|
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
|
||||||
dropold <- liftIO $ newMVar $
|
g <- Annex.gitRepo
|
||||||
Database.Keys.runWriter $
|
Database.Keys.runWriter $
|
||||||
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
|
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
|
||||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||||
Git.LsTree.LsTreeRecursive
|
Git.LsTree.LsTreeRecursive
|
||||||
(Git.LsTree.LsTreeLong False)
|
(Git.LsTree.LsTreeLong True)
|
||||||
Git.Ref.headRef
|
Git.Ref.headRef
|
||||||
forM_ l $ \i ->
|
catObjectStreamLsTree l want g go
|
||||||
maybe noop (add dropold i)
|
|
||||||
=<< catKey'
|
|
||||||
(Git.LsTree.sha i)
|
|
||||||
(fromMaybe 0 (Git.LsTree.size i))
|
|
||||||
liftIO $ void cleanup
|
liftIO $ void cleanup
|
||||||
where
|
where
|
||||||
isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
|
-- Want to process symlinks, and regular files.
|
||||||
Just Git.Types.TreeFile -> True
|
want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of
|
||||||
Just Git.Types.TreeExecutable -> True
|
Just Git.Types.TreeSymlink -> Just (i, False)
|
||||||
_ -> False
|
Just Git.Types.TreeFile -> checkfilesize i
|
||||||
add dropold i k = do
|
Just Git.Types.TreeExecutable -> checkfilesize i
|
||||||
join $ fromMaybe noop <$> liftIO (tryTakeMVar dropold)
|
_ -> Nothing
|
||||||
|
|
||||||
|
-- Avoid processing files that are too large to be pointer files.
|
||||||
|
checkfilesize i = case Git.LsTree.size i of
|
||||||
|
Just n | n < maxPointerSz -> Just (i, True)
|
||||||
|
_ -> Nothing
|
||||||
|
|
||||||
|
go getnext = liftIO getnext >>= \case
|
||||||
|
Just ((i, isregfile), Just c) -> do
|
||||||
|
maybe noop (add i isregfile)
|
||||||
|
(parseLinkTargetOrPointer (L.toStrict c))
|
||||||
|
go getnext
|
||||||
|
_ -> return ()
|
||||||
|
|
||||||
|
add i isregfile k = do
|
||||||
let tf = Git.LsTree.file i
|
let tf = Git.LsTree.file i
|
||||||
Database.Keys.runWriter $
|
Database.Keys.runWriter $
|
||||||
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
|
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
|
||||||
whenM (pure (isregfile i) <&&> inAnnex k) $ do
|
whenM (pure isregfile <&&> inAnnex k) $ do
|
||||||
f <- fromRepo $ fromTopFilePath tf
|
f <- fromRepo $ fromTopFilePath tf
|
||||||
liftIO (isPointerFile f) >>= \case
|
liftIO (isPointerFile f) >>= \case
|
||||||
Just k' | k' == k -> do
|
Just k' | k' == k -> do
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 12"""
|
||||||
|
date="2021-05-31T16:30:11Z"
|
||||||
|
content="""
|
||||||
|
Implemented streaming through git. In a repo with 100000 unlocked files,
|
||||||
|
version 8.20210429 took 46 seconds, now reduced to 36 seconds.
|
||||||
|
|
||||||
|
When the files are locked, of course the old version was faster
|
||||||
|
due to being able to skip all symlinks, 2 seconds. The new version takes
|
||||||
|
slightly less time than it does for unlocked files, 35 seconds.
|
||||||
|
|
||||||
|
Now the git query and processing is only a few seconds of the total run time,
|
||||||
|
writing information about all the files to sqlite is most of the rest,
|
||||||
|
and may also be possible to speed up.
|
||||||
|
"""]]
|
Loading…
Add table
Add a link
Reference in a new issue