avoid double work in git-annex init, second try

reconcileStaged populates the db, so scanAnnexedFiles does not need to
do it again. It still makes a pass over the HEAD tree, but populating
the db was most of the expensive part.

Benchmarking with 100,000 files, git-annex init now takes 40 seconds,
vs 37 seconds with the old, buggy version of this fix. It should be
possible to win those 3 precious seconds per 100k files back, in the
case when annex.thin is not set, with improvements to reconcileStaged
that avoid needing this second pass.

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-06-08 09:27:53 -04:00
parent 22185b4a4e
commit c941ab6f5b
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 10 additions and 19 deletions

View file

@ -22,7 +22,6 @@ import qualified Git.Ref
import qualified Git.LsTree
import qualified Git.Types
import qualified Database.Keys
import qualified Database.Keys.SQL
import Config
import qualified Utility.RawFilePath as R
@ -81,9 +80,15 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
-}
scanAnnexedFiles :: Annex ()
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
-- This gets the keys database populated with all annexed files,
-- by running Database.Keys.reconcileStaged.
Database.Keys.runWriter (const noop)
-- The above tries to populate pointer files, but one thing it
-- is not able to handle is populating a pointer file when the
-- annex object file already exists, but its inode is not yet
-- cached. So, the rest of this makes another pass over the
-- tree to do that.
g <- Annex.gitRepo
Database.Keys.runWriter $
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
Git.LsTree.LsTreeRecursive
(Git.LsTree.LsTreeLong True)
@ -112,8 +117,6 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
add i isregfile k = do
let tf = Git.LsTree.file i
Database.Keys.runWriter $
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
whenM (pure isregfile <&&> inAnnex k) $ do
f <- fromRepo $ fromTopFilePath tf
liftIO (isPointerFile f) >>= \case

View file

@ -339,7 +339,7 @@ reconcileStaged qh = do
(asTopFilePath file)
(SQL.WriteHandle qh)
when (dstmode /= fmtTreeItemType TreeSymlink) $
reconcilerace (asTopFilePath file) key
reconcilepointerfile (asTopFilePath file) key
return True
Nothing -> return False
procdiff mdfeeder rest
@ -367,7 +367,7 @@ reconcileStaged qh = do
_ -> return conflicted -- parse failed
procmergeconflictdiff _ _ conflicted = return conflicted
reconcilerace file key = do
reconcilepointerfile file key = do
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
keyloc <- calcRepo (gitAnnexLocation key)
keypopulated <- sameInodeCache keyloc caches

View file

@ -88,18 +88,6 @@ addAssociatedFile k f = queueDb $
where
af = SFilePath (getTopFilePath f)
-- Does not remove any old association for a file, but less expensive
-- than addAssociatedFile. Calling dropAllAssociatedFiles first and then
-- this is an efficient way to update all associated files.
addAssociatedFileFast :: Key -> TopFilePath -> WriteHandle -> IO ()
addAssociatedFileFast k f = queueDb $ void $ insertUnique $ Associated k af
where
af = SFilePath (getTopFilePath f)
dropAllAssociatedFiles :: WriteHandle -> IO ()
dropAllAssociatedFiles = queueDb $
deleteWhere ([] :: [Filter Associated])
{- Note that the files returned were once associated with the key, but
- some of them may not be any longer. -}
getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]