From 0f10f208a7c76061b7d48aaa7ea0fa8e641cace3 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 7 Jun 2021 16:50:14 -0400 Subject: [PATCH] avoid double work in git-annex init reconcileStaged was doing a redundant scan to scannAnnexedFiles. It would probably make sense to move the body of scannAnnexedFiles into reconcileStaged, the separation does not really serve any purpose. Sponsored-by: Dartmouth College's Datalad project --- Annex/WorkTree.hs | 41 +++++++-------- Database/Keys.hs | 51 ++++++++++++------- ..._85d1031d2b51c0fc1271c283d8ee7888._comment | 14 +++++ ..._b0f1e94e96cf8fe992425cf81f8d1105._comment | 9 ++++ ..._e9a36e9600561201969c4d21499833af._comment | 16 ++++++ 5 files changed, 93 insertions(+), 38 deletions(-) create mode 100644 doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment create mode 100644 doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment create mode 100644 doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_20_e9a36e9600561201969c4d21499833af._comment diff --git a/Annex/WorkTree.hs b/Annex/WorkTree.hs index b9d45b483d..8d09adf7a8 100644 --- a/Annex/WorkTree.hs +++ b/Annex/WorkTree.hs @@ -70,27 +70,29 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file {- Find all annexed files and update the keys database for them. - - - This is expensive, and so normally the associated files are updated - - incrementally when changes are noticed. So, this only needs to be done - - when initializing/upgrading a repository. - - - Also, the content for an unlocked file may already be present as - an annex object. If so, populate the pointer file with it. - But if worktree file does not have a pointer file's content, it is left - as-is. + - + - Normally the keys database is updated incrementally when changes are + - noticed. For an initial scan, this is faster than that incremental + - update. -} scanAnnexedFiles :: Annex () -scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do - g <- Annex.gitRepo - Database.Keys.runWriter $ - liftIO . Database.Keys.SQL.dropAllAssociatedFiles - (l, cleanup) <- inRepo $ Git.LsTree.lsTree - Git.LsTree.LsTreeRecursive - (Git.LsTree.LsTreeLong True) - Git.Ref.headRef - catObjectStreamLsTree l want g go - liftIO $ void cleanup +scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ + Database.Keys.runWriter' (Just reconciler) (const noop) where + reconciler dbh = do + g <- Annex.gitRepo + liftIO $ Database.Keys.SQL.dropAllAssociatedFiles dbh + (l, cleanup) <- inRepo $ Git.LsTree.lsTree + Git.LsTree.LsTreeRecursive + (Git.LsTree.LsTreeLong True) + Git.Ref.headRef + catObjectStreamLsTree l want g (go dbh) + liftIO $ void cleanup + -- Want to process symlinks, and regular files. want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of Just Git.Types.TreeSymlink -> Just (i, False) @@ -103,17 +105,16 @@ scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ d Just n | n < maxPointerSz -> Just (i, True) _ -> Nothing - go getnext = liftIO getnext >>= \case + go dbh getnext = liftIO getnext >>= \case Just ((i, isregfile), Just c) -> do - maybe noop (add i isregfile) + maybe noop (add i isregfile dbh) (parseLinkTargetOrPointer (L.toStrict c)) - go getnext + go dbh getnext _ -> return () - add i isregfile k = do + add i isregfile dbh k = do let tf = Git.LsTree.file i - Database.Keys.runWriter $ - liftIO . Database.Keys.SQL.addAssociatedFileFast k tf + liftIO $ Database.Keys.SQL.addAssociatedFileFast k tf dbh whenM (pure isregfile <&&> inAnnex k) $ do f <- fromRepo $ fromTopFilePath tf liftIO (isPointerFile f) >>= \case diff --git a/Database/Keys.hs b/Database/Keys.hs index 8b6edd80fe..778a0af1ac 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -23,6 +23,7 @@ module Database.Keys ( removeInodeCache, isInodeKnown, runWriter, + runWriter', ) where import qualified Database.Keys.SQL as SQL @@ -73,7 +74,7 @@ runReader a = do v <- a (SQL.ReadHandle qh) return (v, st) go DbClosed = do - st' <- openDb True DbClosed + st' <- openDb True Nothing DbClosed v <- case st' of (DbOpen qh) -> a (SQL.ReadHandle qh) _ -> return mempty @@ -87,7 +88,15 @@ runReaderIO a = runReader (liftIO . a) - - The database is created if it doesn't exist yet. -} runWriter :: (SQL.WriteHandle -> Annex ()) -> Annex () -runWriter a = do +runWriter = runWriter' Nothing + +runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () +runWriterIO a = runWriter (liftIO . a) + +{- When a reconcile action is passed, it is run by reconcileStaged instead + - of its usual scan, and must update the database in the same way. -} +runWriter' :: Maybe (SQL.WriteHandle -> Annex ()) -> (SQL.WriteHandle -> Annex ()) -> Annex () +runWriter' reconciler a = do h <- Annex.getRead Annex.keysdbhandle withDbState h go where @@ -95,15 +104,12 @@ runWriter a = do v <- a (SQL.WriteHandle qh) return (v, st) go st = do - st' <- openDb False st + st' <- openDb False reconciler st v <- case st' of DbOpen qh -> a (SQL.WriteHandle qh) _ -> error "internal" return (v, st') -runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () -runWriterIO a = runWriter (liftIO . a) - {- Opens the database, creating it if it doesn't exist yet. - - Multiple readers and writers can have the database open at the same @@ -112,10 +118,10 @@ runWriterIO a = runWriter (liftIO . a) - the database doesn't exist yet, one caller wins the lock and - can create it undisturbed. -} -openDb :: Bool -> DbState -> Annex DbState -openDb _ st@(DbOpen _) = return st -openDb False DbUnavailable = return DbUnavailable -openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do +openDb :: Bool -> (Maybe (SQL.WriteHandle -> Annex ())) -> DbState -> Annex DbState +openDb _ _ st@(DbOpen _) = return st +openDb False _ DbUnavailable = return DbUnavailable +openDb forwrite reconciler _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do dbdir <- fromRepo gitAnnexKeysDb let db = dbdir P. "db" dbexists <- liftIO $ R.doesPathExist db @@ -133,7 +139,7 @@ openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKe open db = do qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable - reconcileStaged qh + reconcileStaged qh reconciler return $ DbOpen qh {- Closes the database if it was open. Any writes will be flushed to it. @@ -223,8 +229,8 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) - So when using getAssociatedFiles, have to make sure the file still - is an associated file. -} -reconcileStaged :: H.DbQueue -> Annex () -reconcileStaged qh = do +reconcileStaged :: H.DbQueue -> Maybe (SQL.WriteHandle -> Annex ()) -> Annex () +reconcileStaged qh mreconciler = do gitindex <- inRepo currentIndexFile indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache withTSDelta (liftIO . genInodeCache gitindex) >>= \case @@ -246,12 +252,21 @@ reconcileStaged qh = do go cur indexcache (Just newtree) = do oldtree <- getoldtree when (oldtree /= newtree) $ do + case mreconciler of + Just reconciler -> + reconciler (SQL.WriteHandle qh) + Nothing -> noop g <- Annex.gitRepo - void $ catstream $ \mdfeeder -> - void $ updatetodiff g - (Just (fromRef oldtree)) - (fromRef newtree) - (procdiff mdfeeder) + void $ catstream $ \mdfeeder -> do + case mreconciler of + Nothing -> void $ updatetodiff g + (Just (fromRef oldtree)) + (fromRef newtree) + (procdiff mdfeeder) + Just _ -> void $ updatetodiff g + Nothing "--staged" + (procdiff mdfeeder) + liftIO $ writeFile indexcache $ showInodeCache cur -- Storing the tree in a ref makes sure it does not -- get garbage collected, and is available to diff diff --git a/doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment b/doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment new file mode 100644 index 0000000000..27c306ba7a --- /dev/null +++ b/doc/bugs/significant_performance_regression_impacting_datal/comment_4_85d1031d2b51c0fc1271c283d8ee7888._comment @@ -0,0 +1,14 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2021-06-07T20:22:24Z" + content=""" +Turns out `git-annex init` got a lot slower than it had to, it was doing +the same kind of scan twice. I think that probably explains much of the +slowdown you saw. (Although it still needs to do somewhat more work than +before, but also does it more efficiently than before.) + +Also I've optimised the update scan's use of git, around a 20% speedup, +although I don't know if your test case speed was impacted much by that +scan anyway. +"""]] diff --git a/doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment b/doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment new file mode 100644 index 0000000000..d6ea9e9b20 --- /dev/null +++ b/doc/bugs/significant_performance_regression_impacting_datal/comment_5_b0f1e94e96cf8fe992425cf81f8d1105._comment @@ -0,0 +1,9 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2021-06-07T20:42:34Z" + content=""" +Looking at the CI log, I don't see any past runs that took 1h 46min. +A month ago they were taking 2h 6min. Let's see if the changes I'm pushing +now drop it back to that. +"""]] diff --git a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_20_e9a36e9600561201969c4d21499833af._comment b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_20_e9a36e9600561201969c4d21499833af._comment new file mode 100644 index 0000000000..8b8ffef1fc --- /dev/null +++ b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_20_e9a36e9600561201969c4d21499833af._comment @@ -0,0 +1,16 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 20""" + date="2021-06-07T19:22:03Z" + content=""" +Turns out git-annex init was running both scanAnnexedFiles and +reconcileStaged, which after recent changes to the latter, both do +close to the same scan when run in a fresh clone. So double work! + +Benchmarking with 100,000 files, git-annex init took 88 seconds. +Fixed not to use reconcileStaged it took 37 seconds. + +(Keeping reconcileStaged and removing scanAnnexedFiles it took 47 seconds. +That makes sense; reconcileStaged is an incremental updater and is not +able to use SQL as efficiently as scanAnnexedFiles.) +"""]]