diff --git a/Annex/Init.hs b/Annex/Init.hs index cea69f8237..c1834085d1 100644 --- a/Annex/Init.hs +++ b/Annex/Init.hs @@ -134,8 +134,8 @@ initialize' mversion = checkInitializeAllowed $ do else deconfigureSmudgeFilter unlessM isBareRepo $ do when supportunlocked $ do - showSideAction "scanning for unlocked files" - scanUnlockedFiles + showSideAction "scanning for annexed files" + scanAnnexedFiles hookWrite postCheckoutHook hookWrite postMergeHook AdjustedBranch.checkAdjustedClone >>= \case diff --git a/Annex/WorkTree.hs b/Annex/WorkTree.hs index 42abde34aa..226a00d1df 100644 --- a/Annex/WorkTree.hs +++ b/Annex/WorkTree.hs @@ -66,19 +66,19 @@ whenAnnexed a file = ifAnnexed file (a file) (return Nothing) ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a ifAnnexed file yes no = maybe no yes =<< lookupKey file -{- Find all unlocked files and update the keys database for them. +{- Find all annexed files and update the keys database for them. - - This is expensive, and so normally the associated files are updated - incrementally when changes are noticed. So, this only needs to be done - - when initializing/upgrading repository. + - when initializing/upgrading a repository. - - - Also, the content for the unlocked file may already be present as + - Also, the content for an unlocked file may already be present as - an annex object. If so, populate the pointer file with it. - But if worktree file does not have a pointer file's content, it is left - as-is. -} -scanUnlockedFiles :: Annex () -scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do +scanAnnexedFiles :: Annex () +scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do dropold <- liftIO $ newMVar $ Database.Keys.runWriter $ liftIO . 
Database.Keys.SQL.dropAllAssociatedFiles @@ -87,9 +87,10 @@ scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ (Git.LsTree.LsTreeLong False) Git.Ref.headRef forM_ l $ \i -> - when (isregfile i) $ - maybe noop (add dropold i) - =<< catKey (Git.LsTree.sha i) + maybe noop (add dropold i) + =<< catKey' + (Git.LsTree.sha i) + (fromMaybe 0 (Git.LsTree.size i)) liftIO $ void cleanup where isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of @@ -101,7 +102,7 @@ scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ let tf = Git.LsTree.file i Database.Keys.runWriter $ liftIO . Database.Keys.SQL.addAssociatedFileFast k tf - whenM (inAnnex k) $ do + whenM (pure (isregfile i) <&&> inAnnex k) $ do f <- fromRepo $ fromTopFilePath tf liftIO (isPointerFile f) >>= \case Just k' | k' == k -> do diff --git a/Command/Lock.hs b/Command/Lock.hs index e7af74ca9f..3d085393a8 100644 --- a/Command/Lock.hs +++ b/Command/Lock.hs @@ -62,7 +62,7 @@ perform file key = do lockdown =<< calcRepo (gitAnnexLocation key) addLink (CheckGitIgnore False) file key =<< withTSDelta (liftIO . genInodeCache file) - next $ cleanup file key + next $ return True where lockdown obj = do ifM (isUnmodified key obj) @@ -97,10 +97,5 @@ perform file key = do lostcontent = logStatus key InfoMissing -cleanup :: RawFilePath -> Key -> CommandCleanup -cleanup file key = do - Database.Keys.removeAssociatedFile key =<< inRepo (toTopFilePath file) - return True - errorModified :: a errorModified = giveup "Locking this file would discard any changes you have made to it. Use 'git annex add' to stage your changes. 
(Or, use --force to override)" diff --git a/Command/Migrate.hs b/Command/Migrate.hs index 669be7f56c..9a0d69f35a 100644 --- a/Command/Migrate.hs +++ b/Command/Migrate.hs @@ -86,7 +86,7 @@ perform file oldkey oldbackend newbackend = go =<< genkey (fastMigrate oldbacken urls <- getUrls oldkey forM_ urls $ \url -> setUrlPresent newkey url - next $ Command.ReKey.cleanup file oldkey newkey + next $ Command.ReKey.cleanup file newkey , giveup "failed creating link from old to new key" ) genkey Nothing = do diff --git a/Command/ReKey.hs b/Command/ReKey.hs index 368fb42ef2..077dd5a628 100644 --- a/Command/ReKey.hs +++ b/Command/ReKey.hs @@ -15,8 +15,6 @@ import Annex.Link import Annex.Perms import Annex.ReplaceFile import Logs.Location -import Git.FilePath -import qualified Database.Keys import Annex.InodeSentinal import Utility.InodeCache import qualified Utility.RawFilePath as R @@ -79,7 +77,7 @@ perform file oldkey newkey = do , unlessM (Annex.getState Annex.force) $ giveup $ fromRawFilePath file ++ " is not available (use --force to override)" ) - next $ cleanup file oldkey newkey + next $ cleanup file newkey {- Make a hard link to the old key content (when supported), - to avoid wasting disk space. -} @@ -119,8 +117,8 @@ linkKey file oldkey newkey = ifM (isJust <$> isAnnexLink file) LinkAnnexNoop -> True ) -cleanup :: RawFilePath -> Key -> Key -> CommandCleanup -cleanup file oldkey newkey = do +cleanup :: RawFilePath -> Key -> CommandCleanup +cleanup file newkey = do ifM (isJust <$> isAnnexLink file) ( do -- Update symlink to use the new key. 
@@ -131,8 +129,6 @@ cleanup file oldkey newkey = do liftIO $ whenM (isJust <$> isPointerFile file) $ writePointerFile file newkey mode stagePointerFile file mode =<< hashPointerFile newkey - Database.Keys.removeAssociatedFile oldkey - =<< inRepo (toTopFilePath file) ) whenM (inAnnex newkey) $ logStatus newkey InfoPresent diff --git a/Database/Keys.hs b/Database/Keys.hs index c050002617..984dac1958 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -1,6 +1,6 @@ {- Sqlite database of information about Keys - - - Copyright 2015-2019 Joey Hess + - Copyright 2015-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -44,6 +44,9 @@ import Git.FilePath import Git.Command import Git.Types import Git.Index +import Git.Sha +import Git.Branch (writeTree, update') +import qualified Git.Ref import Config.Smudge import qualified Utility.RawFilePath as R @@ -191,20 +194,17 @@ removeInodeCache = runWriterIO . SQL.removeInodeCache isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) -{- Looks at staged changes to find when unlocked files are copied/moved, - - and updates associated files in the keys database. +{- Looks at staged changes to annexed files, and updates the keys database, + - so that its information is consistent with the state of the repository. - - - Since staged changes can be dropped later, does not remove any - - associated files; only adds new associated files. - - - - This needs to be run before querying the keys database so that - - information is consistent with the state of the repository. + - This is run with a lock held, so only one process can be running this at + - a time. - - To avoid unncessary work, the index file is statted, and if it's not - changed since last time this was run, nothing is done. - - - Note that this is run with a lock held, so only one process can be - - running this at a time. 
+ - A tree is generated from the index, and the diff between that tree + - and the last processed tree is examined for changes. - - This also cleans up after a race between eg a git mv and git-annex - get/drop/similar. If git moves the file between this being run and the @@ -233,17 +233,28 @@ reconcileStaged qh = do ) Nothing -> noop where + lastindexref = Ref "refs/annex/last-index" + go cur indexcache = do - (l, cleanup) <- inRepo $ pipeNullSplit' diff - changed <- procdiff l False - void $ liftIO cleanup - -- Flush database changes immediately - -- so other processes can see them. - when changed $ - liftIO $ H.flushDbQueue qh - liftIO $ writeFile indexcache $ showInodeCache cur + oldtree <- fromMaybe emptyTree + <$> inRepo (Git.Ref.sha lastindexref) + newtree <- inRepo writeTree + when (oldtree /= newtree) $ do + (l, cleanup) <- inRepo $ pipeNullSplit' $ + diff oldtree newtree + changed <- procdiff l False + void $ liftIO cleanup + -- Flush database changes immediately + -- so other processes can see them. + when changed $ + liftIO $ H.flushDbQueue qh + liftIO $ writeFile indexcache $ showInodeCache cur + -- Storing the tree in a ref makes sure it does not + -- get garbage collected, and is available to diff + -- against next time. + inRepo $ update' lastindexref newtree - diff = + diff oldtree newtree = -- Avoid running smudge or clean filters, since we want the -- raw output, and they would block trying to access the -- locked database. The --raw normally avoids git diff @@ -253,43 +264,49 @@ reconcileStaged qh = do -- (The -G option may make it be used otherwise.) [ Param "-c", Param "diff.external=" , Param "diff" - , Param "--cached" , Param "--raw" , Param "-z" , Param "--no-abbrev" - -- Optimization: Only find pointer files. This is not - -- perfect. A file could start with this and not be a - -- pointer file. And a pointer file that is replaced with - -- a non-pointer file will match this. 
- , Param $ "-G^" ++ fromRawFilePath (toInternalGitPath $ + -- Optimization: Limit to pointer files and annex symlinks. + -- This is not perfect. A file could contain this and not + -- be a pointer file. And a pointer file that is replaced with + -- a non-pointer file will match this. This is only a + -- prefilter so that's ok. + , Param $ "-G" ++ fromRawFilePath (toInternalGitPath $ P.pathSeparator `S.cons` objectDir') - -- Don't include files that were deleted, because this only - -- wants to update information for files that are present - -- in the index. - , Param "--diff-filter=AMUT" -- Disable rename detection. , Param "--no-renames" -- Avoid other complications. , Param "--ignore-submodules=all" , Param "--no-ext-diff" + , Param (fromRef oldtree) + , Param (fromRef newtree) ] procdiff (info:file:rest) changed | ":" `S.isPrefixOf` info = case S8.words info of - (_colonsrcmode:dstmode:_srcsha:dstsha:_change:[]) - -- Only want files, not symlinks - | dstmode /= fmtTreeItemType TreeSymlink -> do - maybe noop (reconcile (asTopFilePath file)) - =<< catKey (Ref dstsha) - procdiff rest True - | otherwise -> procdiff rest changed + (_colonsrcmode:dstmode:srcsha:dstsha:_change:[]) -> do + removed <- catKey (Ref srcsha) >>= \case + Just oldkey -> do + liftIO $ SQL.removeAssociatedFile oldkey + (asTopFilePath file) + (SQL.WriteHandle qh) + return True + Nothing -> return False + added <- catKey (Ref dstsha) >>= \case + Just key -> do + liftIO $ SQL.addAssociatedFile key + (asTopFilePath file) + (SQL.WriteHandle qh) + when (dstmode /= fmtTreeItemType TreeSymlink) $ + reconcilerace (asTopFilePath file) key + return True + Nothing -> return False + procdiff rest (changed || removed || added) _ -> return changed -- parse failed procdiff _ changed = return changed - -- Note that database writes done in here will not necessarily - -- be visible to database reads also done in here.
- reconcile file key = do - liftIO $ SQL.addAssociatedFileFast key file (SQL.WriteHandle qh) + reconcilerace file key = do caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh) keyloc <- calcRepo (gitAnnexLocation key) keypopulated <- sameInodeCache keyloc caches diff --git a/Upgrade/V5.hs b/Upgrade/V5.hs index bed8a6d801..2db92d57f9 100644 --- a/Upgrade/V5.hs +++ b/Upgrade/V5.hs @@ -47,7 +47,7 @@ upgrade automatic = flip catchNonAsync onexception $ do , do checkGitVersionForIndirectUpgrade ) - scanUnlockedFiles + scanAnnexedFiles configureSmudgeFilter -- Inode sentinal file was only used in direct mode and when -- locking down files as they were added. In v6, it's used more diff --git a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment index b37477e6e4..b525275114 100644 --- a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment +++ b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment @@ -9,7 +9,12 @@ If most of the files are locked, that would actually make the scan somewhere around twice as slow as it currently is. So not a worthwhile optimisation. -And I don't see much else there that could be optimised. Possibly the +Update: Now that the scan also scans for locked files to make the +associated files include information about them, the catKey optimisation +did make sense. Unfortunately, that does mean this scan got a little bit +slower still, since it has to use git ls-tree --long. + +I don't see much else there that could be optimised. Possibly the ls-tree parser could be made faster but it's already using attoparsec so unlikely to be many gains. """]]