avoid double work in git-annex init, second try
reconcileStaged populates the db, so scanAnnexedFiles does not need to do it again. It still makes a pass over the HEAD tree, but populating the db was most of the expensive part. Benchmarking with 100,000 files, git-annex init now takes 40 seconds, vs 37 seconds with the old, buggy version of this fix. It should be possible to win those 3 precious seconds per 100k files back, in the case when annex.thin is not set, with improvements to reconcileStaged that avoid needing this second pass. Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
22185b4a4e
commit
c941ab6f5b
3 changed files with 10 additions and 19 deletions
|
@ -22,7 +22,6 @@ import qualified Git.Ref
|
|||
import qualified Git.LsTree
|
||||
import qualified Git.Types
|
||||
import qualified Database.Keys
|
||||
import qualified Database.Keys.SQL
|
||||
import Config
|
||||
import qualified Utility.RawFilePath as R
|
||||
|
||||
|
@ -81,9 +80,15 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file
|
|||
-}
|
||||
scanAnnexedFiles :: Annex ()
|
||||
scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do
|
||||
-- This gets the keys database populated with all annexed files,
|
||||
-- by running Database.Keys.reconcileStaged.
|
||||
Database.Keys.runWriter (const noop)
|
||||
-- The above tries to populate pointer files, but one thing it
|
||||
-- is not able to handle is populating a pointer file when the
|
||||
-- annex object file already exists, but its inode is not yet
|
||||
-- cached. So, the rest of this makes another pass over the
|
||||
-- tree to do that.
|
||||
g <- Annex.gitRepo
|
||||
Database.Keys.runWriter $
|
||||
liftIO . Database.Keys.SQL.dropAllAssociatedFiles
|
||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||
Git.LsTree.LsTreeRecursive
|
||||
(Git.LsTree.LsTreeLong True)
|
||||
|
@ -112,8 +117,6 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d
|
|||
|
||||
add i isregfile k = do
|
||||
let tf = Git.LsTree.file i
|
||||
Database.Keys.runWriter $
|
||||
liftIO . Database.Keys.SQL.addAssociatedFileFast k tf
|
||||
whenM (pure isregfile <&&> inAnnex k) $ do
|
||||
f <- fromRepo $ fromTopFilePath tf
|
||||
liftIO (isPointerFile f) >>= \case
|
||||
|
|
|
@ -339,7 +339,7 @@ reconcileStaged qh = do
|
|||
(asTopFilePath file)
|
||||
(SQL.WriteHandle qh)
|
||||
when (dstmode /= fmtTreeItemType TreeSymlink) $
|
||||
reconcilerace (asTopFilePath file) key
|
||||
reconcilepointerfile (asTopFilePath file) key
|
||||
return True
|
||||
Nothing -> return False
|
||||
procdiff mdfeeder rest
|
||||
|
@ -367,7 +367,7 @@ reconcileStaged qh = do
|
|||
_ -> return conflicted -- parse failed
|
||||
procmergeconflictdiff _ _ conflicted = return conflicted
|
||||
|
||||
reconcilerace file key = do
|
||||
reconcilepointerfile file key = do
|
||||
caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh)
|
||||
keyloc <- calcRepo (gitAnnexLocation key)
|
||||
keypopulated <- sameInodeCache keyloc caches
|
||||
|
|
|
@ -88,18 +88,6 @@ addAssociatedFile k f = queueDb $
|
|||
where
|
||||
af = SFilePath (getTopFilePath f)
|
||||
|
||||
-- Does not remove any old association for a file, but less expensive
|
||||
-- than addAssociatedFile. Calling dropAllAssociatedFiles first and then
|
||||
-- this is an efficient way to update all associated files.
|
||||
addAssociatedFileFast :: Key -> TopFilePath -> WriteHandle -> IO ()
|
||||
addAssociatedFileFast k f = queueDb $ void $ insertUnique $ Associated k af
|
||||
where
|
||||
af = SFilePath (getTopFilePath f)
|
||||
|
||||
dropAllAssociatedFiles :: WriteHandle -> IO ()
|
||||
dropAllAssociatedFiles = queueDb $
|
||||
deleteWhere ([] :: [Filter Associated])
|
||||
|
||||
{- Note that the files returned were once associated with the key, but
|
||||
- some of them may not be any longer. -}
|
||||
getAssociatedFiles :: Key -> ReadHandle -> IO [TopFilePath]
|
||||
|
|
Loading…
Reference in a new issue