diff --git a/Annex/WorkTree.hs b/Annex/WorkTree.hs index 226a00d1df..b9d45b483d 100644 --- a/Annex/WorkTree.hs +++ b/Annex/WorkTree.hs @@ -1,6 +1,6 @@ {- git-annex worktree files - - - Copyright 2013-2020 Joey Hess + - Copyright 2013-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -8,6 +8,7 @@ module Annex.WorkTree where import Annex.Common +import qualified Annex import Annex.Link import Annex.CatFile import Annex.Content @@ -16,6 +17,7 @@ import Annex.CurrentBranch import Annex.InodeSentinal import Utility.InodeCache import Git.FilePath +import Git.CatFile import qualified Git.Ref import qualified Git.LsTree import qualified Git.Types @@ -24,7 +26,7 @@ import qualified Database.Keys.SQL import Config import qualified Utility.RawFilePath as R -import Control.Concurrent +import qualified Data.ByteString.Lazy as L {- Looks up the key corresponding to an annexed file in the work tree, - by examining what the file links to. @@ -79,30 +81,40 @@ ifAnnexed file yes no = maybe no yes =<< lookupKey file -} scanAnnexedFiles :: Annex () scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do - dropold <- liftIO $ newMVar $ - Database.Keys.runWriter $ - liftIO . Database.Keys.SQL.dropAllAssociatedFiles + g <- Annex.gitRepo + Database.Keys.runWriter $ + liftIO . Database.Keys.SQL.dropAllAssociatedFiles (l, cleanup) <- inRepo $ Git.LsTree.lsTree Git.LsTree.LsTreeRecursive - (Git.LsTree.LsTreeLong False) + (Git.LsTree.LsTreeLong True) Git.Ref.headRef - forM_ l $ \i -> - maybe noop (add dropold i) - =<< catKey' - (Git.LsTree.sha i) - (fromMaybe 0 (Git.LsTree.size i)) + catObjectStreamLsTree l want g go liftIO $ void cleanup where - isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of - Just Git.Types.TreeFile -> True - Just Git.Types.TreeExecutable -> True - _ -> False - add dropold i k = do - join $ fromMaybe noop <$> liftIO (tryTakeMVar dropold) + -- Want to process symlinks, and regular files. + want i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of + Just Git.Types.TreeSymlink -> Just (i, False) + Just Git.Types.TreeFile -> checkfilesize i + Just Git.Types.TreeExecutable -> checkfilesize i + _ -> Nothing + + -- Avoid processing files that are too large to be pointer files. + checkfilesize i = case Git.LsTree.size i of + Just n | n < maxPointerSz -> Just (i, True) + _ -> Nothing + + go getnext = liftIO getnext >>= \case + Just ((i, isregfile), Just c) -> do + maybe noop (add i isregfile) + (parseLinkTargetOrPointer (L.toStrict c)) + go getnext + _ -> return () + + add i isregfile k = do let tf = Git.LsTree.file i Database.Keys.runWriter $ liftIO . Database.Keys.SQL.addAssociatedFileFast k tf - whenM (pure (isregfile i) <&&> inAnnex k) $ do + whenM (pure isregfile <&&> inAnnex k) $ do f <- fromRepo $ fromTopFilePath tf liftIO (isPointerFile f) >>= \case Just k' | k' == k -> do diff --git a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_12_70c4c9f6c35acd7ca1134ac74356e5be._comment b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_12_70c4c9f6c35acd7ca1134ac74356e5be._comment new file mode 100644 index 0000000000..26af7699a3 --- /dev/null +++ b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_12_70c4c9f6c35acd7ca1134ac74356e5be._comment @@ -0,0 +1,16 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 12""" + date="2021-05-31T16:30:11Z" + content=""" +Implemented streaming through git. In a repo with 100000 unlocked files, +version 8.20210429 took 46 seconds, now reduced to 36 seconds. + +When the files are locked, of course the old version was faster +due to being able to skip all symlinks, 2 seconds. The new version takes +slightly less time than it does for unlocked files, 35 seconds. + +Now the git query and processing is only a few seconds of the total run time, +writing information about all the files to sqlite is most of the rest, +and may also be possible to speed up. +"""]]