avoid double work in git-annex init, second try

reconcileStaged populates the db, so scanAnnexedFiles does not need to
do it again. It still makes a pass over the HEAD tree, but populating
the db was most of the expensive part.

Benchmarking with 100,000 files, git-annex init now takes 40 seconds,
vs 37 seconds with the old, buggy version of this fix. It should be
possible to win those 3 precious seconds per 100k files back, in the
case when annex.thin is not set, with improvements to reconcileStaged
that avoid needing this second pass.

Sponsored-by: Dartmouth College's Datalad project
Joey Hess 2021-06-08 09:27:53 -04:00
parent 22185b4a4e
commit c941ab6f5b
3 changed files with 10 additions and 19 deletions
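
To make the new ordering concrete, here is a minimal, self-contained Haskell sketch of the flow described in the commit message. The names below (withKeysDbWriter, scanHeadForPointerFiles) are illustrative stand-ins, not git-annex's actual API.

-- Illustrative stand-ins only, not git-annex's real functions.
module Main where

-- Stand-in for Database.Keys.runWriter: opening the keys database
-- writer runs reconcileStaged, which records all annexed files in the
-- associated-files table as a side effect.
withKeysDbWriter :: (String -> IO ()) -> IO ()
withKeysDbWriter action = do
    putStrLn "reconcileStaged: diffing the index and recording annexed files"
    action "keys-db-handle"

-- Stand-in for the remaining pass over the HEAD tree, which now only
-- has to populate pointer files whose annex object is already present.
scanHeadForPointerFiles :: IO ()
scanHeadForPointerFiles =
    putStrLn "second pass: populating pointer files from present objects"

main :: IO ()
main = do
    -- Opening the writer, even with a no-op action, is what populates
    -- the database; the scan below no longer needs to do that work.
    withKeysDbWriter (const (pure ()))
    scanHeadForPointerFiles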

View file

@@ -22,7 +22,6 @@ import qualified Git.Ref
 import qualified Git.LsTree
 import qualified Git.Types
 import qualified Database.Keys
-import qualified Database.Keys.SQL
 import Config
 import qualified Utility.RawFilePath as R
@@ -81,9 +80,15 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
 -}
 scanAnnexedFiles :: Annex ()
 scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
+	-- This gets the keys database populated with all annexed files,
+	-- by running Database.Keys.reconcileStaged.
+	Database.Keys.runWriter (const noop)
+	-- The above tries to populate pointer files, but one thing it
+	-- is not able to handle is populating a pointer file when the
+	-- annex object file already exists, but its inode is not yet
+	-- cached. So, the rest of this makes another pass over the
+	-- tree to do that.
 	g <- Annex.gitRepo
-	Database.Keys.runWriter $
-		liftIO . Database.Keys.SQL.dropAllAssociatedFiles
 	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
 		Git.LsTree.LsTreeRecursive
 		(Git.LsTree.LsTreeLong True)
@@ -112,8 +117,6 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
 	add i isregfile k = do
 		let tf = Git.LsTree.file i
-		Database.Keys.runWriter $
-			liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
 		whenM (pure isregfile <&&> inAnnex k) $ do
			f <- fromRepo $ fromTopFilePath tf
			liftIO (isPointerFile f) >>= \case
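
The comments added above describe the one case reconcileStaged cannot handle on its own: a pointer file whose annex object is already present but whose inode has not yet been cached. As a rough illustration of what the second pass does for such a file, here is a self-contained sketch; looksLikePointerFile and populateFromObject are hypothetical simplifications of the real machinery, and the plain copy stands in for the case where annex.thin is not set.

-- Hypothetical simplification of one step of the second pass.
module Main where

import Control.Monad (when)
import System.Directory (doesFileExist, copyFile, getFileSize)
import qualified Data.ByteString.Char8 as B

-- Crude stand-in for a pointer-file check: pointer files are tiny and
-- their content refers to a path under annex/objects.
looksLikePointerFile :: FilePath -> IO Bool
looksLikePointerFile f = do
    sz <- getFileSize f
    if sz > 32768
        then return False
        else B.isInfixOf (B.pack "annex/objects/") <$> B.readFile f

-- If the annex object already exists and the worktree file is still a
-- pointer, fill the worktree file in from the object (a plain copy when
-- annex.thin is not set). The real code also records an inode cache for
-- the result, which is the part reconcileStaged could not yet do itself.
populateFromObject :: FilePath -> FilePath -> IO ()
populateFromObject objectpath worktreefile = do
    haveobject <- doesFileExist objectpath
    stillpointer <- if haveobject
        then looksLikePointerFile worktreefile
        else return False
    when (haveobject && stillpointer) $
        copyFile objectpath worktreefile

main :: IO ()
main = populateFromObject
    ".git/annex/objects/xx/yy/KEY/KEY" -- hypothetical object path
    "bigfile"                          -- worktree pointer file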

View file

@@ -339,7 +339,7 @@ reconcileStaged qh = do
 				(asTopFilePath file)
 				(SQL.WriteHandle qh)
 			when (dstmode /= fmtTreeItemType TreeSymlink) $
-				reconcilerace (asTopFilePath file) key
+				reconcilepointerfile (asTopFilePath file) key
 			return True
 		Nothing -> return False
 	procdiff mdfeeder rest
@@ -367,7 +367,7 @@ reconcileStaged qh = do
 		_ -> return conflicted -- parse failed
 	procmergeconflictdiff _ _ conflicted = return conflicted
 
-	reconcilerace file key = do
+	reconcilepointerfile file key = do
 		caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
 		keyloc <- calcRepo (gitAnnexLocation key)
 		keypopulated <- sameInodeCache keyloc caches
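
The reconcilepointerfile check above hinges on inode caches: the keys database remembers the inode, size, and mtime a file had when its content was last put in place, and comparing the current values against that record is how git-annex decides whether the annex object or the worktree file is populated. Below is a simplified, self-contained version of that comparison (the real InodeCache also tolerates reduced mtime precision and other variations); it uses the POSIX-only unix package.

-- Simplified sketch of an inode-cache comparison, not git-annex's code.
module Main where

import System.Posix.Files (getFileStatus, fileID, fileSize, modificationTimeHiRes)
import System.Posix.Types (FileID, FileOffset)
import Data.Time.Clock.POSIX (POSIXTime)

data InodeCache = InodeCache
    { cacheInode :: FileID
    , cacheSize :: FileOffset
    , cacheMtime :: POSIXTime
    }
    deriving (Eq, Show)

-- Record the current inode cache of a file.
genInodeCache :: FilePath -> IO InodeCache
genInodeCache f = do
    st <- getFileStatus f
    return (InodeCache (fileID st) (fileSize st) (modificationTimeHiRes st))

-- A file counts as "populated" when its current inode cache matches one
-- recorded when the content was last put in place.
sameInodeCache :: FilePath -> [InodeCache] -> IO Bool
sameInodeCache f caches = do
    cur <- genInodeCache f
    return (cur `elem` caches)

main :: IO ()
main = do
    -- Example: a path trivially matches a cache just taken of it.
    c <- genInodeCache "."
    print =<< sameInodeCache "." [c]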

View file

@@ -88,18 +88,6 @@ addAssociatedFile k f = queueDb $
   where
 	af = SFilePath (getTopFilePath f)
 
--- Does not remove any old association for a file, but less expensive
--- than addAssociatedFile. Calling dropAllAssociatedFiles first and then
--- this is an efficient way to update all associated files.
-addAssociatedFileFast :: Key -> TopFilePath -> WriteHandle -> IO ()
-addAssociatedFileFast k f = queueDb $ void $ insertUnique $ Associated k af
-  where
-	af = SFilePath (getTopFilePath f)
-
-dropAllAssociatedFiles :: WriteHandle -> IO ()
-dropAllAssociatedFiles = queueDb $
-	deleteWhere ([] :: [Filter Associated])
-
 {- Note that the files returned were once associated with the key, but
  - some of them may not be any longer. -}
 getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]
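
For context on what was removed: addAssociatedFileFast and dropAllAssociatedFiles implemented a bulk-refresh pattern for the associated-files table, clear it entirely, then re-insert every row with an insert that merely skips duplicates rather than looking up and replacing an old association. reconcileStaged now keeps the table up to date incrementally, so the pattern is no longer needed. Here is a sketch of the pattern itself, written against sqlite-simple rather than the persistent layer the real database code uses, with made-up example rows:

{-# LANGUAGE OverloadedStrings #-}
module Main where

import Database.SQLite.Simple

main :: IO ()
main = do
    conn <- open ":memory:"
    execute_ conn
        "CREATE TABLE associated (key TEXT, file TEXT, UNIQUE (key, file))"
    -- dropAllAssociatedFiles: cheaper than deleting stale rows one by one.
    execute_ conn "DELETE FROM associated"
    -- addAssociatedFileFast: INSERT OR IGNORE mirrors insertUnique; it does
    -- not look for and remove any old association for the file first.
    executeMany conn
        "INSERT OR IGNORE INTO associated (key, file) VALUES (?, ?)"
        ([("KEY1", "foo.dat"), ("KEY2", "bar.dat")] :: [(String, String)])
    rows <- query_ conn "SELECT key, file FROM associated" :: IO [(String, String)]
    mapM_ print rows
    close conn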