From 428c91606b434512d1986622e751c795edf4df44 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 21 May 2021 15:47:37 -0400 Subject: [PATCH 01/11] include locked files in the keys database associated files Before only unlocked files were included. The initial scan now scans for locked as well as unlocked files. This does mean it gets a little bit slower, although I optimised it as well as I think it can be. reconcileStaged changed to diff from the current index to the tree of the previous index. This lets it handle deletions as well, removing associated files for both locked and unlocked files, which did not always happen before. On upgrade, there will be no recorded previous tree, so it will diff from the empty tree to current index, and so will fully populate the associated files, as well as removing any stale associated files that were present due to them not being removed before. reconcileStaged now does a bit more work. Most of the time, this will just be due to running more often, after some change is made to the index, and since there will be few changes since the last time, it will not be a noticable overhead. What may turn out to be a noticable slowdown is after changing to a branch, it has to go through the diff from the previous index to the new one, and if there are lots of changes, that could take a long time. Also, after adding a lot of files, or deleting a lot of files, or moving a large subdirectory, etc. Command.Lock used removeAssociatedFile, but now that's wrong because a newly locked file still needs to have its associated file tracked. Command.Rekey used removeAssociatedFile when the file was unlocked. It could remove it also when it's locked, but it is not really necessary, because it changes the index, and so the next time git-annex run and accesses the keys db, reconcileStaged will run and update it. There are probably several other places that use addAssociatedFile and don't need to any more for similar reasons. But there's no harm in keeping them, and it probably is a good idea to, if only to support mixing this with older versions of git-annex. However, mixing this and older versions does risk reconcileStaged not running, if the older version already ran it on a given index state. So it's not a good idea to mix versions. This problem could be dealt with by changing the name of the gitAnnexKeysDbIndexCache, but that would leave the old file dangling, or it would need to keep trying to remove it. --- Annex/Init.hs | 4 +- Annex/WorkTree.hs | 19 ++-- Command/Lock.hs | 7 +- Command/Migrate.hs | 2 +- Command/ReKey.hs | 10 +- Database/Keys.hs | 97 +++++++++++-------- Upgrade/V5.hs | 2 +- ..._2cb31617bb7003c5bf0e5def358da0e4._comment | 7 +- 8 files changed, 81 insertions(+), 67 deletions(-) diff --git a/Annex/Init.hs b/Annex/Init.hs index cea69f8237..c1834085d1 100644 --- a/Annex/Init.hs +++ b/Annex/Init.hs @@ -134,8 +134,8 @@ initialize' mversion = checkInitializeAllowed $ do else deconfigureSmudgeFilter unlessM isBareRepo $ do when supportunlocked $ do - showSideAction "scanning for unlocked files" - scanUnlockedFiles + showSideAction "scanning for annexed files" + scanAnnexedFiles hookWrite postCheckoutHook hookWrite postMergeHook AdjustedBranch.checkAdjustedClone >>= \case diff --git a/Annex/WorkTree.hs b/Annex/WorkTree.hs index 42abde34aa..226a00d1df 100644 --- a/Annex/WorkTree.hs +++ b/Annex/WorkTree.hs @@ -66,19 +66,19 @@ whenAnnexed a file = ifAnnexed file (a file) (return Nothing) ifAnnexed :: RawFilePath -> (Key -> Annex a) -> Annex a -> Annex a ifAnnexed file yes no = maybe no yes =<< lookupKey file -{- Find all unlocked files and update the keys database for them. +{- Find all annexed files and update the keys database for them. - - This is expensive, and so normally the associated files are updated - incrementally when changes are noticed. So, this only needs to be done - - when initializing/upgrading repository. + - when initializing/upgrading a repository. - - - Also, the content for the unlocked file may already be present as + - Also, the content for an unlocked file may already be present as - an annex object. If so, populate the pointer file with it. - But if worktree file does not have a pointer file's content, it is left - as-is. -} -scanUnlockedFiles :: Annex () -scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do +scanAnnexedFiles :: Annex () +scanAnnexedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ do dropold <- liftIO $ newMVar $ Database.Keys.runWriter $ liftIO . Database.Keys.SQL.dropAllAssociatedFiles @@ -87,9 +87,10 @@ scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ (Git.LsTree.LsTreeLong False) Git.Ref.headRef forM_ l $ \i -> - when (isregfile i) $ - maybe noop (add dropold i) - =<< catKey (Git.LsTree.sha i) + maybe noop (add dropold i) + =<< catKey' + (Git.LsTree.sha i) + (fromMaybe 0 (Git.LsTree.size i)) liftIO $ void cleanup where isregfile i = case Git.Types.toTreeItemType (Git.LsTree.mode i) of @@ -101,7 +102,7 @@ scanUnlockedFiles = whenM (inRepo Git.Ref.headExists <&&> not <$> isBareRepo) $ let tf = Git.LsTree.file i Database.Keys.runWriter $ liftIO . Database.Keys.SQL.addAssociatedFileFast k tf - whenM (inAnnex k) $ do + whenM (pure (isregfile i) <&&> inAnnex k) $ do f <- fromRepo $ fromTopFilePath tf liftIO (isPointerFile f) >>= \case Just k' | k' == k -> do diff --git a/Command/Lock.hs b/Command/Lock.hs index e7af74ca9f..3d085393a8 100644 --- a/Command/Lock.hs +++ b/Command/Lock.hs @@ -62,7 +62,7 @@ perform file key = do lockdown =<< calcRepo (gitAnnexLocation key) addLink (CheckGitIgnore False) file key =<< withTSDelta (liftIO . genInodeCache file) - next $ cleanup file key + next $ return True where lockdown obj = do ifM (isUnmodified key obj) @@ -97,10 +97,5 @@ perform file key = do lostcontent = logStatus key InfoMissing -cleanup :: RawFilePath -> Key -> CommandCleanup -cleanup file key = do - Database.Keys.removeAssociatedFile key =<< inRepo (toTopFilePath file) - return True - errorModified :: a errorModified = giveup "Locking this file would discard any changes you have made to it. Use 'git annex add' to stage your changes. (Or, use --force to override)" diff --git a/Command/Migrate.hs b/Command/Migrate.hs index 669be7f56c..9a0d69f35a 100644 --- a/Command/Migrate.hs +++ b/Command/Migrate.hs @@ -86,7 +86,7 @@ perform file oldkey oldbackend newbackend = go =<< genkey (fastMigrate oldbacken urls <- getUrls oldkey forM_ urls $ \url -> setUrlPresent newkey url - next $ Command.ReKey.cleanup file oldkey newkey + next $ Command.ReKey.cleanup file newkey , giveup "failed creating link from old to new key" ) genkey Nothing = do diff --git a/Command/ReKey.hs b/Command/ReKey.hs index 368fb42ef2..077dd5a628 100644 --- a/Command/ReKey.hs +++ b/Command/ReKey.hs @@ -15,8 +15,6 @@ import Annex.Link import Annex.Perms import Annex.ReplaceFile import Logs.Location -import Git.FilePath -import qualified Database.Keys import Annex.InodeSentinal import Utility.InodeCache import qualified Utility.RawFilePath as R @@ -79,7 +77,7 @@ perform file oldkey newkey = do , unlessM (Annex.getState Annex.force) $ giveup $ fromRawFilePath file ++ " is not available (use --force to override)" ) - next $ cleanup file oldkey newkey + next $ cleanup file newkey {- Make a hard link to the old key content (when supported), - to avoid wasting disk space. -} @@ -119,8 +117,8 @@ linkKey file oldkey newkey = ifM (isJust <$> isAnnexLink file) LinkAnnexNoop -> True ) -cleanup :: RawFilePath -> Key -> Key -> CommandCleanup -cleanup file oldkey newkey = do +cleanup :: RawFilePath -> Key -> CommandCleanup +cleanup file newkey = do ifM (isJust <$> isAnnexLink file) ( do -- Update symlink to use the new key. @@ -131,8 +129,6 @@ cleanup file oldkey newkey = do liftIO $ whenM (isJust <$> isPointerFile file) $ writePointerFile file newkey mode stagePointerFile file mode =<< hashPointerFile newkey - Database.Keys.removeAssociatedFile oldkey - =<< inRepo (toTopFilePath file) ) whenM (inAnnex newkey) $ logStatus newkey InfoPresent diff --git a/Database/Keys.hs b/Database/Keys.hs index c050002617..984dac1958 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -1,6 +1,6 @@ {- Sqlite database of information about Keys - - - Copyright 2015-2019 Joey Hess + - Copyright 2015-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -44,6 +44,9 @@ import Git.FilePath import Git.Command import Git.Types import Git.Index +import Git.Sha +import Git.Branch (writeTree, update') +import qualified Git.Ref import Config.Smudge import qualified Utility.RawFilePath as R @@ -191,20 +194,17 @@ removeInodeCache = runWriterIO . SQL.removeInodeCache isInodeKnown :: InodeCache -> SentinalStatus -> Annex Bool isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) -{- Looks at staged changes to find when unlocked files are copied/moved, - - and updates associated files in the keys database. +{- Looks at staged changes to annexed files, and updates the keys database, + - so that its information is consistent with the state of the repository. - - - Since staged changes can be dropped later, does not remove any - - associated files; only adds new associated files. - - - - This needs to be run before querying the keys database so that - - information is consistent with the state of the repository. + - This is run with a lock held, so only one process can be running this at + - a time. - - To avoid unncessary work, the index file is statted, and if it's not - changed since last time this was run, nothing is done. - - - Note that this is run with a lock held, so only one process can be - - running this at a time. + - A tree is generated from the index, and the diff between that tree + - and the last processed tree is examined for changes. - - This also cleans up after a race between eg a git mv and git-annex - get/drop/similar. If git moves the file between this being run and the @@ -233,17 +233,28 @@ reconcileStaged qh = do ) Nothing -> noop where + lastindexref = Ref "refs/annex/last-index" + go cur indexcache = do - (l, cleanup) <- inRepo $ pipeNullSplit' diff - changed <- procdiff l False - void $ liftIO cleanup - -- Flush database changes immediately - -- so other processes can see them. - when changed $ - liftIO $ H.flushDbQueue qh - liftIO $ writeFile indexcache $ showInodeCache cur + oldtree <- fromMaybe emptyTree + <$> inRepo (Git.Ref.sha lastindexref) + newtree <- inRepo writeTree + when (oldtree /= newtree) $ do + (l, cleanup) <- inRepo $ pipeNullSplit' $ + diff oldtree newtree + changed <- procdiff l False + void $ liftIO cleanup + -- Flush database changes immediately + -- so other processes can see them. + when changed $ + liftIO $ H.flushDbQueue qh + liftIO $ writeFile indexcache $ showInodeCache cur + -- Storing the tree in a ref makes sure it does not + -- get garbage collected, and is available to diff + -- against next time. + inRepo $ update' lastindexref newtree - diff = + diff oldtree newtree = -- Avoid running smudge or clean filters, since we want the -- raw output, and they would block trying to access the -- locked database. The --raw normally avoids git diff @@ -253,43 +264,49 @@ reconcileStaged qh = do -- (The -G option may make it be used otherwise.) [ Param "-c", Param "diff.external=" , Param "diff" - , Param "--cached" , Param "--raw" , Param "-z" , Param "--no-abbrev" - -- Optimization: Only find pointer files. This is not - -- perfect. A file could start with this and not be a - -- pointer file. And a pointer file that is replaced with - -- a non-pointer file will match this. - , Param $ "-G^" ++ fromRawFilePath (toInternalGitPath $ + -- Optimization: Limit to pointer files and annex symlinks. + -- This is not perfect. A file could contain with this and not + -- be a pointer file. And a pointer file that is replaced with + -- a non-pointer file will match this. This is only a + -- prefilter so that's ok. + , Param $ "-G" ++ fromRawFilePath (toInternalGitPath $ P.pathSeparator `S.cons` objectDir') - -- Don't include files that were deleted, because this only - -- wants to update information for files that are present - -- in the index. - , Param "--diff-filter=AMUT" -- Disable rename detection. , Param "--no-renames" -- Avoid other complications. , Param "--ignore-submodules=all" , Param "--no-ext-diff" + , Param (fromRef oldtree) + , Param (fromRef newtree) ] procdiff (info:file:rest) changed | ":" `S.isPrefixOf` info = case S8.words info of - (_colonsrcmode:dstmode:_srcsha:dstsha:_change:[]) - -- Only want files, not symlinks - | dstmode /= fmtTreeItemType TreeSymlink -> do - maybe noop (reconcile (asTopFilePath file)) - =<< catKey (Ref dstsha) - procdiff rest True - | otherwise -> procdiff rest changed + (_colonsrcmode:dstmode:srcsha:dstsha:_change:[]) -> do + removed <- catKey (Ref srcsha) >>= \case + Just oldkey -> do + liftIO $ SQL.removeAssociatedFile oldkey + (asTopFilePath file) + (SQL.WriteHandle qh) + return True + Nothing -> return False + added <- catKey (Ref dstsha) >>= \case + Just key -> do + liftIO $ SQL.addAssociatedFile key + (asTopFilePath file) + (SQL.WriteHandle qh) + when (dstmode /= fmtTreeItemType TreeSymlink) $ + reconcilerace (asTopFilePath file) key + return True + Nothing -> return False + procdiff rest (changed || removed || added) _ -> return changed -- parse failed procdiff _ changed = return changed - -- Note that database writes done in here will not necessarily - -- be visible to database reads also done in here. - reconcile file key = do - liftIO $ SQL.addAssociatedFileFast key file (SQL.WriteHandle qh) + reconcilerace file key = do caches <- liftIO $ SQL.getInodeCaches key (SQL.ReadHandle qh) keyloc <- calcRepo (gitAnnexLocation key) keypopulated <- sameInodeCache keyloc caches diff --git a/Upgrade/V5.hs b/Upgrade/V5.hs index bed8a6d801..2db92d57f9 100644 --- a/Upgrade/V5.hs +++ b/Upgrade/V5.hs @@ -47,7 +47,7 @@ upgrade automatic = flip catchNonAsync onexception $ do , do checkGitVersionForIndirectUpgrade ) - scanUnlockedFiles + scanAnnexedFiles configureSmudgeFilter -- Inode sentinal file was only used in direct mode and when -- locking down files as they were added. In v6, it's used more diff --git a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment index b37477e6e4..b525275114 100644 --- a/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment +++ b/doc/todo/Avoid_lengthy___34__Scanning_for_unlocked_files_...__34__/comment_4_2cb31617bb7003c5bf0e5def358da0e4._comment @@ -9,7 +9,12 @@ If most of the files are locked, that would actually make the scan somewhere around twice as slow as it currently is. So not a worthwhile optimisation. -And I don't see much else there that could be optimised. Possibly the +Update: Now that the scan also scans for locked files to make the +associated files include information about them, the catKey optimisation +did make sense. Unfortunately, that does mean this scan got a little bit +slower still, since it has to use git ls-tree --long. + +I don't see much else there that could be optimised. Possibly the ls-tree parser could be made faster but it's already using attoparsec so unlikely to be many gains. """]] From efae085272125411a2df98ce694c1b3bbd5d39af Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 11:33:23 -0400 Subject: [PATCH 02/11] fixed reconcileStaged crash when index is locked or in conflict Eg, when git commit runs the smudge filter. Commit 428c91606b434512d1986622e751c795edf4df44 introduced the crash, as write-tree fails in those situations. Now it will work, and git-annex always gets up-to-date information even in those situations. It does need to do a bit more work, each time git-annex is run with the index locked. Although if the index is unmodified from the last time write-tree succeeded, that work is avoided. --- Database/Keys.hs | 49 ++++++++++++++++++++++++++++++------------------ Git/Branch.hs | 12 +++++++++++- Git/Command.hs | 12 +++++------- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/Database/Keys.hs b/Database/Keys.hs index 984dac1958..9392b232f3 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -45,7 +45,7 @@ import Git.Command import Git.Types import Git.Index import Git.Sha -import Git.Branch (writeTree, update') +import Git.Branch (writeTreeQuiet, update') import qualified Git.Ref import Config.Smudge import qualified Utility.RawFilePath as R @@ -226,35 +226,48 @@ reconcileStaged qh = do withTSDelta (liftIO . genInodeCache gitindex) >>= \case Just cur -> liftIO (maybe Nothing readInodeCache <$> catchMaybeIO (readFile indexcache)) >>= \case - Nothing -> go cur indexcache + Nothing -> go cur indexcache =<< getindextree Just prev -> ifM (compareInodeCaches prev cur) ( noop - , go cur indexcache + , go cur indexcache =<< getindextree ) Nothing -> noop where lastindexref = Ref "refs/annex/last-index" - go cur indexcache = do - oldtree <- fromMaybe emptyTree - <$> inRepo (Git.Ref.sha lastindexref) - newtree <- inRepo writeTree + getindextree = inRepo writeTreeQuiet + + getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref) + + go cur indexcache (Just newtree) = do + oldtree <- getoldtree when (oldtree /= newtree) $ do - (l, cleanup) <- inRepo $ pipeNullSplit' $ - diff oldtree newtree - changed <- procdiff l False - void $ liftIO cleanup - -- Flush database changes immediately - -- so other processes can see them. - when changed $ - liftIO $ H.flushDbQueue qh + updatetodiff (fromRef oldtree) (fromRef newtree) liftIO $ writeFile indexcache $ showInodeCache cur -- Storing the tree in a ref makes sure it does not -- get garbage collected, and is available to diff -- against next time. inRepo $ update' lastindexref newtree + -- git write-tree will fail if the index is locked or when there is + -- a merge conflict. To get up-to-date with the current index, + -- diff --cached with the old index tree. The current index tree + -- is not known, so not recorded, and the inode cache is not updated, + -- so the next time git-annex runs, it will diff again, even + -- if the index is unchanged. + go _ _ Nothing = do + oldtree <- getoldtree + updatetodiff (fromRef oldtree) "--cached" + + updatetodiff old new = do + (l, cleanup) <- inRepo $ pipeNullSplit' $ diff old new + changed <- procdiff l False + void $ liftIO cleanup + -- Flush database changes immediately + -- so other processes can see them. + when changed $ + liftIO $ H.flushDbQueue qh - diff oldtree newtree = + diff old new = -- Avoid running smudge or clean filters, since we want the -- raw output, and they would block trying to access the -- locked database. The --raw normally avoids git diff @@ -264,6 +277,8 @@ reconcileStaged qh = do -- (The -G option may make it be used otherwise.) [ Param "-c", Param "diff.external=" , Param "diff" + , Param old + , Param new , Param "--raw" , Param "-z" , Param "--no-abbrev" @@ -279,8 +294,6 @@ reconcileStaged qh = do -- Avoid other complications. , Param "--ignore-submodules=all" , Param "--no-ext-diff" - , Param (fromRef oldtree) - , Param (fromRef newtree) ] procdiff (info:file:rest) changed diff --git a/Git/Branch.hs b/Git/Branch.hs index 70faca335f..54af1018ba 100644 --- a/Git/Branch.hs +++ b/Git/Branch.hs @@ -185,8 +185,18 @@ commitAlways :: CommitMode -> String -> Branch -> [Ref] -> Repo -> IO Sha commitAlways commitmode message branch parentrefs repo = fromJust <$> commit commitmode True message branch parentrefs repo +-- Throws exception if the index is locked, with an error message output by +-- git on stderr. writeTree :: Repo -> IO Sha -writeTree repo = getSha "write-tree" $ pipeReadStrict [Param "write-tree"] repo +writeTree repo = getSha "write-tree" $ + pipeReadStrict [Param "write-tree"] repo + +-- Avoids error output if the command fails due to eg, the index being locked. +writeTreeQuiet :: Repo -> IO (Maybe Sha) +writeTreeQuiet repo = extractSha <$> withNullHandle go + where + go nullh = pipeReadStrict' (\p -> p { std_err = UseHandle nullh }) + [Param "write-tree"] repo commitTree :: CommitMode -> String -> [Ref] -> Ref -> Repo -> IO Sha commitTree commitmode message parentrefs tree repo = diff --git a/Git/Command.hs b/Git/Command.hs index fef7eb91ad..2358b17361 100644 --- a/Git/Command.hs +++ b/Git/Command.hs @@ -70,17 +70,15 @@ pipeReadLazy params repo = assertLocal repo $ do - Nonzero exit status is ignored. -} pipeReadStrict :: [CommandParam] -> Repo -> IO S.ByteString -pipeReadStrict = pipeReadStrict' S.hGetContents +pipeReadStrict = pipeReadStrict' id -{- The reader action must be strict. -} -pipeReadStrict' :: (Handle -> IO a) -> [CommandParam] -> Repo -> IO a -pipeReadStrict' reader params repo = assertLocal repo $ withCreateProcess p go +pipeReadStrict' :: (CreateProcess -> CreateProcess) -> [CommandParam] -> Repo -> IO S.ByteString +pipeReadStrict' fp params repo = assertLocal repo $ withCreateProcess p go where - p = (gitCreateProcess params repo) - { std_out = CreatePipe } + p = fp (gitCreateProcess params repo) { std_out = CreatePipe } go _ (Just outh) _ pid = do - output <- reader outh + output <- S.hGetContents outh hClose outh void $ waitForProcess pid return output From 13423f337c7fb7814f95d775e65d12e16875c091 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 11:38:22 -0400 Subject: [PATCH 03/11] refactoring --- Database/Keys.hs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Database/Keys.hs b/Database/Keys.hs index 9392b232f3..7f80049157 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -224,19 +224,21 @@ reconcileStaged qh = do gitindex <- inRepo currentIndexFile indexcache <- fromRawFilePath <$> fromRepo gitAnnexKeysDbIndexCache withTSDelta (liftIO . genInodeCache gitindex) >>= \case - Just cur -> - liftIO (maybe Nothing readInodeCache <$> catchMaybeIO (readFile indexcache)) >>= \case - Nothing -> go cur indexcache =<< getindextree - Just prev -> ifM (compareInodeCaches prev cur) - ( noop - , go cur indexcache =<< getindextree - ) + Just cur -> readindexcache indexcache >>= \case + Nothing -> go cur indexcache =<< getindextree + Just prev -> ifM (compareInodeCaches prev cur) + ( noop + , go cur indexcache =<< getindextree + ) Nothing -> noop where lastindexref = Ref "refs/annex/last-index" getindextree = inRepo writeTreeQuiet + readindexcache indexcache = liftIO $ maybe Nothing readInodeCache + <$> catchMaybeIO (readFile indexcache) + getoldtree = fromMaybe emptyTree <$> inRepo (Git.Ref.sha lastindexref) go cur indexcache (Just newtree) = do From d62d6e2fcf0dba8f6029755c24a446890ac05ed6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 12:05:35 -0400 Subject: [PATCH 04/11] note about a wart All code that uses associated files already deals with this problem, which used to be worse. Unfortunately I was not able to entirely eliminate it, although it happens in fewer cases now. --- Database/Keys.hs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Database/Keys.hs b/Database/Keys.hs index 7f80049157..a7ef6c7933 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -218,6 +218,14 @@ isInodeKnown i s = or <$> runReaderIO ((:[]) <$$> SQL.isInodeKnown i s) - filter. If a drop missed the file then the file is added back into the - annex. If a get missed the file then the clean filter populates the - file. + - + - There is a situation where, after this has run, the database can still + - contain associated files that have been deleted from the index. + - That happens when addAssociatedFile is used to record a newly + - added file, but that file then gets removed from the index before + - this is run. Eg, "git-annex add foo; git rm foo" + - So when using getAssociatedFiles, have to make sure the file still + - is an associated file. -} reconcileStaged :: H.DbQueue -> Annex () reconcileStaged qh = do From 78be7cf73fba9cdb3dbb87ee1629e4eef03d86b1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 13:53:28 -0400 Subject: [PATCH 05/11] remove warning about combining options the option parser no longer allows combining --want-get/--want-drop with options like --all --- doc/git-annex-matching-options.mdwn | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doc/git-annex-matching-options.mdwn b/doc/git-annex-matching-options.mdwn index bc8cad5d93..d2104c7ccf 100644 --- a/doc/git-annex-matching-options.mdwn +++ b/doc/git-annex-matching-options.mdwn @@ -134,16 +134,12 @@ in either of two repositories. Matches files that the preferred content settings for the repository make it want to get. Note that this will match even files that are already present, unless limited with e.g., `--not --in .` - - Note that this will not match anything when using --all or --unused. * `--want-drop` Matches files that the preferred content settings for the repository make it want to drop. Note that this will match even files that have already been dropped, unless limited with e.g., `--in .` - - Note that this will not match anything when using --all or --unused. * `--accessedwithin=interval` From a56b151f9009590c97da8dbf66c1b138455657cb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 14:02:50 -0400 Subject: [PATCH 06/11] fix longstanding indeterminite preferred content for duplicated file problem * drop: When two files have the same content, and a preferred content expression matches one but not the other, do not drop the file. * sync --content, assistant: Fix an edge case where a file that is not preferred content did not get dropped. The sync --content edge case is that handleDropsFrom loaded associated files and used them without verifying that the information from the database was not stale. It seemed best to avoid changing --want-drop's behavior, this way when debugging a preferred content expression with it, the files matched will still reflect the expression. So added a note to the --want-drop documentation, to make clear it may not behave identically to git-annex drop --auto. While it would be possible to introspect the preferred content expression to see if it matches on filenames, and only look up the associated files when it does, it's generally fairly rare for 2 files to have the same content, and the database lookup is already avoided when there's only 1 file, so I did not implement that further optimisation. Note that there are still some situations where the associated files database does not get locked files recorded in it, which will prevent this fix from working. Sponsored-by: Dartmouth College's Datalad project --- Annex/Drop.hs | 16 +++---- Annex/Wanted.hs | 47 ++++++++++++++++--- CHANGELOG | 4 ++ Command/Drop.hs | 6 +-- Limit/Wanted.hs | 2 +- ...red_content_state_for_duplicated_file.mdwn | 2 + doc/git-annex-matching-options.mdwn | 5 ++ 7 files changed, 62 insertions(+), 20 deletions(-) diff --git a/Annex/Drop.hs b/Annex/Drop.hs index b7543ec794..6f55378719 100644 --- a/Annex/Drop.hs +++ b/Annex/Drop.hs @@ -103,16 +103,12 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do dropr fs r n >>= go fs rest | otherwise = pure n - checkdrop fs n u a - | null fs = check $ -- no associated files; unused content - wantDrop True u (Just key) (AssociatedFile Nothing) - | otherwise = check $ - allM (wantDrop True u (Just key) . AssociatedFile . Just) fs - where - check c = ifM c - ( dodrop n u a - , return n - ) + checkdrop fs n u a = + let afs = map (AssociatedFile . Just) fs + in ifM (wantDrop True u (Just key) afile (Just afs)) + ( dodrop n u a + , return n + ) dodrop n@(have, numcopies, mincopies, _untrusted) u a = ifM (safely $ runner $ a numcopies mincopies) diff --git a/Annex/Wanted.hs b/Annex/Wanted.hs index 021fe5cafb..a1ac7b20c4 100644 --- a/Annex/Wanted.hs +++ b/Annex/Wanted.hs @@ -1,6 +1,6 @@ {- git-annex checking whether content is wanted - - - Copyright 2012 Joey Hess + - Copyright 2012-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -10,6 +10,9 @@ module Annex.Wanted where import Annex.Common import Logs.PreferredContent import Annex.UUID +import Annex.CatFile +import Git.FilePath +import qualified Database.Keys import qualified Data.Set as S @@ -22,8 +25,40 @@ wantSend :: Bool -> Maybe Key -> AssociatedFile -> UUID -> Annex Bool wantSend d key file to = isPreferredContent (Just to) S.empty key file d {- Check if a file can be dropped, maybe from a remote. - - Don't drop files that are preferred content. -} -wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> Annex Bool -wantDrop d from key file = do - u <- maybe getUUID (return . id) from - not <$> isPreferredContent (Just u) (S.singleton u) key file d + - Don't drop files that are preferred content. + - + - The AssociatedFile is the one that the user requested to drop. + - There may be other files that use the same key, and preferred content + - may match some of those and not others. If any are preferred content, + - that will prevent dropping. When the other associated files are known, + - they can be provided, otherwise this looks them up. + -} +wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> (Maybe [AssociatedFile]) -> Annex Bool +wantDrop d from key file others = do + u <- maybe getUUID (pure . id) from + let s = S.singleton u + let checkwant f = isPreferredContent (Just u) s key f d + ifM (checkwant file) + ( return False + , do + others' <- case others of + Just afs -> pure (filter (/= file) afs) + Nothing -> case key of + Just k -> mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f)) + =<< Database.Keys.getAssociatedFiles k + Nothing -> pure [] + l <- filterM checkwant others' + if null l + then return True + else checkassociated l + ) + where + -- Some associated files that are in the keys database may no + -- longer correspond to files in the repository, and should + -- not prevent dropping. + checkassociated [] = return True + checkassociated (AssociatedFile (Just af):fs) = + catKeyFile af >>= \case + Just k | Just k == key -> return False + _ -> checkassociated fs + checkassociated (AssociatedFile Nothing:fs) = checkassociated fs diff --git a/CHANGELOG b/CHANGELOG index b00b6ae1a6..4fc140d15c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,9 @@ git-annex (8.20210429) UNRELEASED; urgency=medium + * drop: When two files have the same content, and a preferred content + expression matches one but not the other, do not drop the file. + * sync --content, assistant: Fix an edge case where a file that is not + preferred content did not get dropped. * filter-branch: New command, useful to produce a filtered version of the git-annex branch, eg when splitting a repository. * fromkey: Create an unlocked file when used in an adjusted branch diff --git a/Command/Drop.hs b/Command/Drop.hs index 26a16fdd37..6422fc26d5 100644 --- a/Command/Drop.hs +++ b/Command/Drop.hs @@ -86,13 +86,13 @@ start o from si file key = start' o from key afile ai si start' :: DropOptions -> Maybe Remote -> Key -> AssociatedFile -> ActionItem -> SeekInput -> CommandStart start' o from key afile ai si = checkDropAuto (autoMode o) from afile key $ \numcopies mincopies -> - stopUnless want $ + stopUnless wantdrop $ case from of Nothing -> startLocal afile ai si numcopies mincopies key [] Just remote -> startRemote afile ai si numcopies mincopies key remote where - want - | autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile + wantdrop + | autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile Nothing | otherwise = return True startKeys :: DropOptions -> Maybe Remote -> (SeekInput, Key, ActionItem) -> CommandStart diff --git a/Limit/Wanted.hs b/Limit/Wanted.hs index 5518fa0bc9..9188594b90 100644 --- a/Limit/Wanted.hs +++ b/Limit/Wanted.hs @@ -19,7 +19,7 @@ addWantGet = addPreferredContentLimit $ addWantDrop :: Annex () addWantDrop = addPreferredContentLimit $ - checkWant $ wantDrop False Nothing Nothing + checkWant $ \af -> wantDrop False Nothing Nothing af (Just []) addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex () addPreferredContentLimit a = do diff --git a/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn b/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn index 0ca978dd3b..ee0d5a3930 100644 --- a/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn +++ b/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn @@ -19,3 +19,5 @@ So, this seems solvable in v7 repositories, but not in v5. Also, the associated files map may not be accurate at all times, so that's a wrinkle to using it for this. Also, only unlocked files get into the associated files map. --[[Joey]] + +> [[fixed|done]] --[[Joey]] diff --git a/doc/git-annex-matching-options.mdwn b/doc/git-annex-matching-options.mdwn index d2104c7ccf..fc7b794f4c 100644 --- a/doc/git-annex-matching-options.mdwn +++ b/doc/git-annex-matching-options.mdwn @@ -141,6 +141,11 @@ in either of two repositories. make it want to drop. Note that this will match even files that have already been dropped, unless limited with e.g., `--in .` + Files that this matches will not necessarily be dropped by + `git-annex drop --auto`. This does not check that there are enough copies + to drop. Also the same content may be used by a file that is not wanted + to be dropped. + * `--accessedwithin=interval` Matches files that were accessed recently, within the specified time From f46e4c9b7c90267c38d44354bf690ce07f761182 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 14:46:59 -0400 Subject: [PATCH 07/11] fix case where keys db was not initialized in time When the keys db is opened for read, and did not exist yet, it used to skip creating it, and return mempty values. But that prevents reconcileStaged from populating associated files information in time for the read. This fixes the one remaining case I know of where the fix in a56b151f9009590c97da8dbf66c1b138455657cb didn't work. Note that, when there is a permissions error, it still avoids creating the db and returns mempty for all queries. This does mean that reconcileStaged does not run and so it may want to drop files that it should not. However, presumably a permissions error on the keys database also means that the user does not have permission to delete annex objects, so they won't be able to drop the files anyway. Sponsored-by: Dartmouth College's Datalad project --- Annex/Wanted.hs | 5 +++-- Database/Keys.hs | 29 ++++++++++++----------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/Annex/Wanted.hs b/Annex/Wanted.hs index a1ac7b20c4..3ce7fe328f 100644 --- a/Annex/Wanted.hs +++ b/Annex/Wanted.hs @@ -44,8 +44,9 @@ wantDrop d from key file others = do others' <- case others of Just afs -> pure (filter (/= file) afs) Nothing -> case key of - Just k -> mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f)) - =<< Database.Keys.getAssociatedFiles k + Just k -> + mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f)) + =<< Database.Keys.getAssociatedFiles k Nothing -> pure [] l <- filterM checkwant others' if null l diff --git a/Database/Keys.hs b/Database/Keys.hs index a7ef6c7933..6a3b43cb54 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -55,10 +55,6 @@ import qualified Data.ByteString.Char8 as S8 import qualified System.FilePath.ByteString as P {- Runs an action that reads from the database. - - - - If the database doesn't already exist, it's not created; mempty is - - returned instead. This way, when the keys database is not in use, - - there's minimal overhead in checking it. - - If the database is already open, any writes are flushed to it, to ensure - consistency. @@ -76,7 +72,7 @@ runReader a = do v <- a (SQL.ReadHandle qh) return (v, st) go DbClosed = do - st' <- openDb False DbClosed + st' <- openDb True DbClosed v <- case st' of (DbOpen qh) -> a (SQL.ReadHandle qh) _ -> return mempty @@ -98,7 +94,7 @@ runWriter a = do v <- a (SQL.WriteHandle qh) return (v, st) go st = do - st' <- openDb True st + st' <- openDb False st v <- case st' of DbOpen qh -> a (SQL.WriteHandle qh) _ -> error "internal" @@ -107,7 +103,7 @@ runWriter a = do runWriterIO :: (SQL.WriteHandle -> IO ()) -> Annex () runWriterIO a = runWriter (liftIO . a) -{- Opens the database, perhaps creating it if it doesn't exist yet. +{- Opens the database, creating it if it doesn't exist yet. - - Multiple readers and writers can have the database open at the same - time. Database.Handle deals with the concurrency issues. @@ -118,22 +114,21 @@ runWriterIO a = runWriter (liftIO . a) openDb :: Bool -> DbState -> Annex DbState openDb _ st@(DbOpen _) = return st openDb False DbUnavailable = return DbUnavailable -openDb createdb _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do +openDb forwrite _ = catchPermissionDenied permerr $ withExclusiveLock gitAnnexKeysDbLock $ do dbdir <- fromRepo gitAnnexKeysDb let db = dbdir P. "db" dbexists <- liftIO $ R.doesPathExist db - case (dbexists, createdb) of - (True, _) -> open db - (False, True) -> do + case dbexists of + True -> open db + False -> do initDb db SQL.createTables open db - (False, False) -> return DbUnavailable where - -- If permissions don't allow opening the database, treat it as if - -- it does not exist. - permerr e = case createdb of - False -> return DbUnavailable - True -> throwM e + -- If permissions don't allow opening the database, and it's being + -- opened for read, treat it as if it does not exist. + permerr e + | forwrite = throwM e + | otherwise = return DbUnavailable open db = do qh <- liftIO $ H.openDbQueue H.MultiWriter db SQL.containedTable From 5d189947363401d0fa199d10fe44fb5a1d2717d0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 14:54:51 -0400 Subject: [PATCH 08/11] clearer language --- CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4fc140d15c..417847ed47 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ git-annex (8.20210429) UNRELEASED; urgency=medium - * drop: When two files have the same content, and a preferred content - expression matches one but not the other, do not drop the file. + * drop --auto: When two files have the same content, and a preferred content + expression matches one but not the other, do not drop the content. * sync --content, assistant: Fix an edge case where a file that is not preferred content did not get dropped. * filter-branch: New command, useful to produce a filtered version of the From 73e1507c72f656fb009b4c372f489698d5206bb6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 15:31:06 -0400 Subject: [PATCH 09/11] fix deadlock git-annex test hung, at varying points depending on when git decided to run the smudge clean filter. Recent changes to reconcileStaged caused a deadlock, when git write-tree for some reason decides to run the smudge clean filter. Which tries to open the keys db, and blocks waiting for the lock file that its grandparent has locked. I don't know why git write-tree does that. It's supposed to only write a tree from the index which needs no smudge/clean filtering. I've verified that, in a situation where git write-tree runs the clean filter, disabling the filter results in a tree being written that contains the annex link, not eg, the worktree file content. So it seems safe to disable the clean filter, but also this seems likely to be working around a bug in git because it seems it is running the clean filter in a situation where the object has already been cleaned. Sponsored-by: Dartmouth College's Datalad project --- Database/Keys.hs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Database/Keys.hs b/Database/Keys.hs index 6a3b43cb54..842b53f1d1 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -237,8 +237,6 @@ reconcileStaged qh = do where lastindexref = Ref "refs/annex/last-index" - getindextree = inRepo writeTreeQuiet - readindexcache indexcache = liftIO $ maybe Nothing readInodeCache <$> catchMaybeIO (readFile indexcache) @@ -272,9 +270,17 @@ reconcileStaged qh = do when changed $ liftIO $ H.flushDbQueue qh + -- Avoid running smudge clean filter, which would block trying to + -- access the locked database. git write-tree sometimes calls it, + -- even though it is not adding work tree files to the index, + -- and so the filter cannot have an effect on the contents of the + -- index or on the tree that gets written from it. + getindextree = inRepo $ \r -> writeTreeQuiet $ r + { gitGlobalOpts = gitGlobalOpts r ++ bypassSmudgeConfig } + diff old new = - -- Avoid running smudge or clean filters, since we want the - -- raw output, and they would block trying to access the + -- Avoid running smudge clean filter, since we want the + -- raw output, and it would block trying to access the -- locked database. The --raw normally avoids git diff -- running them, but older versions of git need this. bypassSmudgeConfig ++ From 7029ef1c3d4e3b5ce31548ab591d06294607d244 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 25 May 2021 10:08:29 -0400 Subject: [PATCH 10/11] improve changelog --- CHANGELOG | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 417847ed47..2296541109 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,10 @@ git-annex (8.20210429) UNRELEASED; urgency=medium * drop --auto: When two files have the same content, and a preferred content expression matches one but not the other, do not drop the content. + * sync --content, assistant: When two unlocked files have the same + content, and a preferred content expression matches one but not the + other, do not drop the content. (This was already the case for locked + files.) * sync --content, assistant: Fix an edge case where a file that is not preferred content did not get dropped. * filter-branch: New command, useful to produce a filtered version of the From cedc28a78347b942034559a374e524e99376a14c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 25 May 2021 10:57:06 -0400 Subject: [PATCH 11/11] prevent dropping required content of other file using same content When two files have the same content, and a required content expression matches one but not the other, dropping the latter file will fail as it would also remove the content of the required file. This will slow down drop (w/o --auto), dropunused, mirror, and move, by one keys db lookup per file. But I did include an optimisation to avoid a double db lookup in the drop --auto / sync --content case. I suspect that dropunused could also use PreferredContentChecked True, but haven't entirely thought it through and it's rarely used with enough files for the optimisation to matter. Sponsored-by: Dartmouth College's Datalad project --- Annex/Drop.hs | 15 +++++---- Annex/Wanted.hs | 36 ++++++++++++-------- CHANGELOG | 3 ++ Command/Drop.hs | 71 ++++++++++++++++++++++------------------ Command/DropUnused.hs | 5 +-- Command/Mirror.hs | 5 +-- Command/Move.hs | 4 +-- Logs/PreferredContent.hs | 4 +-- Test.hs | 22 +++++++++++++ 9 files changed, 105 insertions(+), 60 deletions(-) diff --git a/Annex/Drop.hs b/Annex/Drop.hs index 6f55378719..416ce49e94 100644 --- a/Annex/Drop.hs +++ b/Annex/Drop.hs @@ -27,8 +27,8 @@ import qualified Data.Set as S type Reason = String -{- Drop a key from local and/or remote when allowed by the preferred content - - and numcopies settings. +{- Drop a key from local and/or remote when allowed by the preferred content, + - required content, and numcopies settings. - - Skips trying to drop from remotes that are appendonly, since those drops - would presumably fail. Also skips dropping from exporttree/importtree remotes, @@ -105,8 +105,9 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do checkdrop fs n u a = let afs = map (AssociatedFile . Just) fs + pcc = Command.Drop.PreferredContentChecked True in ifM (wantDrop True u (Just key) afile (Just afs)) - ( dodrop n u a + ( dodrop n u (a pcc) , return n ) @@ -126,12 +127,12 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do , return n ) - dropl fs n = checkdrop fs n Nothing $ \numcopies mincopies -> + dropl fs n = checkdrop fs n Nothing $ \pcc numcopies mincopies -> stopUnless (inAnnex key) $ - Command.Drop.startLocal afile ai si numcopies mincopies key preverified + Command.Drop.startLocal pcc afile ai si numcopies mincopies key preverified - dropr fs r n = checkdrop fs n (Just $ Remote.uuid r) $ \numcopies mincopies -> - Command.Drop.startRemote afile ai si numcopies mincopies key r + dropr fs r n = checkdrop fs n (Just $ Remote.uuid r) $ \pcc numcopies mincopies -> + Command.Drop.startRemote pcc afile ai si numcopies mincopies key r ai = mkActionItem (key, afile) diff --git a/Annex/Wanted.hs b/Annex/Wanted.hs index 3ce7fe328f..cb2a8e3401 100644 --- a/Annex/Wanted.hs +++ b/Annex/Wanted.hs @@ -13,6 +13,7 @@ import Annex.UUID import Annex.CatFile import Git.FilePath import qualified Database.Keys +import Types.FileMatcher import qualified Data.Set as S @@ -20,12 +21,12 @@ import qualified Data.Set as S wantGet :: Bool -> Maybe Key -> AssociatedFile -> Annex Bool wantGet d key file = isPreferredContent Nothing S.empty key file d -{- Check if a file is preferred content for a remote. -} +{- Check if a file is preferred content for a repository. -} wantSend :: Bool -> Maybe Key -> AssociatedFile -> UUID -> Annex Bool wantSend d key file to = isPreferredContent (Just to) S.empty key file d -{- Check if a file can be dropped, maybe from a remote. - - Don't drop files that are preferred content. +{- Check if a file is not preferred or required content, and can be + - dropped. When a UUID is provided, checks for that repository. - - The AssociatedFile is the one that the user requested to drop. - There may be other files that use the same key, and preferred content @@ -34,12 +35,21 @@ wantSend d key file to = isPreferredContent (Just to) S.empty key file d - they can be provided, otherwise this looks them up. -} wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> (Maybe [AssociatedFile]) -> Annex Bool -wantDrop d from key file others = do +wantDrop d from key file others = + isNothing <$> checkDrop isPreferredContent d from key file others + +{- Generalization of wantDrop that can also be used with isRequiredContent. + - + - When the content should not be dropped, returns Just the file that + - the checker matches. + -} +checkDrop :: (Maybe UUID -> AssumeNotPresent -> Maybe Key -> AssociatedFile -> Bool -> Annex Bool) -> Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> (Maybe [AssociatedFile]) -> Annex (Maybe AssociatedFile) +checkDrop checker d from key file others = do u <- maybe getUUID (pure . id) from let s = S.singleton u - let checkwant f = isPreferredContent (Just u) s key f d - ifM (checkwant file) - ( return False + let checker' f = checker (Just u) s key f d + ifM (checker' file) + ( return (Just file) , do others' <- case others of Just afs -> pure (filter (/= file) afs) @@ -48,18 +58,18 @@ wantDrop d from key file others = do mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f)) =<< Database.Keys.getAssociatedFiles k Nothing -> pure [] - l <- filterM checkwant others' + l <- filterM checker' others' if null l - then return True + then return Nothing else checkassociated l ) where -- Some associated files that are in the keys database may no -- longer correspond to files in the repository, and should -- not prevent dropping. - checkassociated [] = return True - checkassociated (AssociatedFile (Just af):fs) = - catKeyFile af >>= \case - Just k | Just k == key -> return False + checkassociated [] = return Nothing + checkassociated (af@(AssociatedFile (Just f)):fs) = + catKeyFile f >>= \case + Just k | Just k == key -> return (Just af) _ -> checkassociated fs checkassociated (AssociatedFile Nothing:fs) = checkassociated fs diff --git a/CHANGELOG b/CHANGELOG index 2296541109..1e5b360e07 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,8 @@ git-annex (8.20210429) UNRELEASED; urgency=medium + * When two files have the same content, and a required content expression + matches one but not the other, dropping the latter file will fail as it + would also remove the content of the required file. * drop --auto: When two files have the same content, and a preferred content expression matches one but not the other, do not drop the content. * sync --content, assistant: When two unlocked files have the same diff --git a/Command/Drop.hs b/Command/Drop.hs index 6422fc26d5..f30b6f4c08 100644 --- a/Command/Drop.hs +++ b/Command/Drop.hs @@ -21,8 +21,6 @@ import Annex.Content import Annex.Wanted import Annex.Notification -import qualified Data.Set as S - cmd :: Command cmd = withGlobalOptions [jobsOption, jsonOptions, annexedMatchingOptions] $ command "drop" SectionCommon @@ -88,31 +86,32 @@ start' o from key afile ai si = checkDropAuto (autoMode o) from afile key $ \numcopies mincopies -> stopUnless wantdrop $ case from of - Nothing -> startLocal afile ai si numcopies mincopies key [] - Just remote -> startRemote afile ai si numcopies mincopies key remote + Nothing -> startLocal pcc afile ai si numcopies mincopies key [] + Just remote -> startRemote pcc afile ai si numcopies mincopies key remote where wantdrop | autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile Nothing | otherwise = return True + pcc = PreferredContentChecked (autoMode o) startKeys :: DropOptions -> Maybe Remote -> (SeekInput, Key, ActionItem) -> CommandStart startKeys o from (si, key, ai) = start' o from key (AssociatedFile Nothing) ai si -startLocal :: AssociatedFile -> ActionItem -> SeekInput -> NumCopies -> MinCopies -> Key -> [VerifiedCopy] -> CommandStart -startLocal afile ai si numcopies mincopies key preverified = +startLocal :: PreferredContentChecked -> AssociatedFile -> ActionItem -> SeekInput -> NumCopies -> MinCopies -> Key -> [VerifiedCopy] -> CommandStart +startLocal pcc afile ai si numcopies mincopies key preverified = starting "drop" (OnlyActionOn key ai) si $ - performLocal key afile numcopies mincopies preverified + performLocal pcc key afile numcopies mincopies preverified -startRemote :: AssociatedFile -> ActionItem -> SeekInput -> NumCopies -> MinCopies -> Key -> Remote -> CommandStart -startRemote afile ai si numcopies mincopies key remote = +startRemote :: PreferredContentChecked -> AssociatedFile -> ActionItem -> SeekInput -> NumCopies -> MinCopies -> Key -> Remote -> CommandStart +startRemote pcc afile ai si numcopies mincopies key remote = starting ("drop " ++ Remote.name remote) (OnlyActionOn key ai) si $ - performRemote key afile numcopies mincopies remote + performRemote pcc key afile numcopies mincopies remote -performLocal :: Key -> AssociatedFile -> NumCopies -> MinCopies -> [VerifiedCopy] -> CommandPerform -performLocal key afile numcopies mincopies preverified = lockContentForRemoval key fallback $ \contentlock -> do +performLocal :: PreferredContentChecked -> Key -> AssociatedFile -> NumCopies -> MinCopies -> [VerifiedCopy] -> CommandPerform +performLocal pcc key afile numcopies mincopies preverified = lockContentForRemoval key fallback $ \contentlock -> do u <- getUUID (tocheck, verified) <- verifiableCopies key [u] - doDrop u (Just contentlock) key afile numcopies mincopies [] (preverified ++ verified) tocheck + doDrop pcc u (Just contentlock) key afile numcopies mincopies [] (preverified ++ verified) tocheck ( \proof -> do fastDebug "Command.Drop" $ unwords [ "Dropping from here" @@ -134,12 +133,12 @@ performLocal key afile numcopies mincopies preverified = lockContentForRemoval k -- to be done except for cleaning up. fallback = next $ cleanupLocal key -performRemote :: Key -> AssociatedFile -> NumCopies -> MinCopies -> Remote -> CommandPerform -performRemote key afile numcopies mincopies remote = do +performRemote :: PreferredContentChecked -> Key -> AssociatedFile -> NumCopies -> MinCopies -> Remote -> CommandPerform +performRemote pcc key afile numcopies mincopies remote = do -- Filter the uuid it's being dropped from out of the lists of -- places assumed to have the key, and places to check. (tocheck, verified) <- verifiableCopies key [uuid] - doDrop uuid Nothing key afile numcopies mincopies [uuid] verified tocheck + doDrop pcc uuid Nothing key afile numcopies mincopies [uuid] verified tocheck ( \proof -> do fastDebug "Command.Drop" $ unwords [ "Dropping from remote" @@ -169,12 +168,11 @@ cleanupRemote key remote ok = do - verify that enough copies of a key exist to allow it to be - safely removed (with no data loss). - - - Also checks if it's required content, and refuses to drop if so. - - - --force overrides and always allows dropping. -} doDrop - :: UUID + :: PreferredContentChecked + -> UUID -> Maybe ContentRemovalLock -> Key -> AssociatedFile @@ -185,10 +183,10 @@ doDrop -> [UnVerifiedCopy] -> (Maybe SafeDropProof -> CommandPerform, CommandPerform) -> CommandPerform -doDrop dropfrom contentlock key afile numcopies mincopies skip preverified check (dropaction, nodropaction) = +doDrop pcc dropfrom contentlock key afile numcopies mincopies skip preverified check (dropaction, nodropaction) = ifM (Annex.getState Annex.force) ( dropaction Nothing - , ifM (checkRequiredContent dropfrom key afile) + , ifM (checkRequiredContent pcc dropfrom key afile) ( verifyEnoughCopiesToDrop nolocmsg key contentlock numcopies mincopies skip preverified check @@ -203,18 +201,27 @@ doDrop dropfrom contentlock key afile numcopies mincopies skip preverified check showLongNote "(Use --force to override this check, or adjust numcopies.)" a -checkRequiredContent :: UUID -> Key -> AssociatedFile -> Annex Bool -checkRequiredContent u k afile = - ifM (isRequiredContent (Just u) S.empty (Just k) afile False) - ( requiredContent - , return True - ) +{- Checking preferred content also checks required content, so when + - auto mode causes preferred content to be checked, it's redundant + - for checkRequiredContent to separately check required content, and + - providing this avoids that extra work. -} +newtype PreferredContentChecked = PreferredContentChecked Bool -requiredContent :: Annex Bool -requiredContent = do - showLongNote "That file is required content, it cannot be dropped!" - showLongNote "(Use --force to override this check, or adjust required content configuration.)" - return False +checkRequiredContent :: PreferredContentChecked -> UUID -> Key -> AssociatedFile -> Annex Bool +checkRequiredContent (PreferredContentChecked True) _ _ _ = return True +checkRequiredContent (PreferredContentChecked False) u k afile = + checkDrop isRequiredContent False (Just u) (Just k) afile Nothing >>= \case + Nothing -> return True + Just afile' -> do + if afile == afile' + then showLongNote "That file is required content. It cannot be dropped!" + else showLongNote $ "That file has the same content as another file" + ++ case afile' of + AssociatedFile (Just f) -> " (" ++ fromRawFilePath f ++ ")," + AssociatedFile Nothing -> "" + ++ " which is required content. It cannot be dropped!" + showLongNote "(Use --force to override this check, or adjust required content configuration.)" + return False {- In auto mode, only runs the action if there are enough - copies on other semitrusted repositories. -} diff --git a/Command/DropUnused.hs b/Command/DropUnused.hs index 6c7ca34d41..0b81a9e467 100644 --- a/Command/DropUnused.hs +++ b/Command/DropUnused.hs @@ -49,7 +49,7 @@ perform :: Maybe Remote -> NumCopies -> MinCopies -> Key -> CommandPerform perform from numcopies mincopies key = case from of Just r -> do showAction $ "from " ++ Remote.name r - Command.Drop.performRemote key (AssociatedFile Nothing) numcopies mincopies r + Command.Drop.performRemote pcc key (AssociatedFile Nothing) numcopies mincopies r Nothing -> ifM (inAnnex key) ( droplocal , ifM (objectFileExists key) @@ -63,7 +63,8 @@ perform from numcopies mincopies key = case from of ) ) where - droplocal = Command.Drop.performLocal key (AssociatedFile Nothing) numcopies mincopies [] + droplocal = Command.Drop.performLocal pcc key (AssociatedFile Nothing) numcopies mincopies [] + pcc = Command.Drop.PreferredContentChecked False performOther :: (Key -> Git.Repo -> RawFilePath) -> Key -> CommandPerform performOther filespec key = do diff --git a/Command/Mirror.hs b/Command/Mirror.hs index 61c9a1f888..b5f49b2373 100644 --- a/Command/Mirror.hs +++ b/Command/Mirror.hs @@ -69,7 +69,7 @@ startKey o afile (si, key, ai) = case fromToOptions o of ( Command.Move.toStart Command.Move.RemoveNever afile key ai si =<< getParsed r , do (numcopies, mincopies) <- getnummincopies - Command.Drop.startRemote afile ai si numcopies mincopies key =<< getParsed r + Command.Drop.startRemote pcc afile ai si numcopies mincopies key =<< getParsed r ) FromRemote r -> checkFailedTransferDirection ai Download $ do haskey <- flip Remote.hasKey key =<< getParsed r @@ -82,10 +82,11 @@ startKey o afile (si, key, ai) = case fromToOptions o of Right False -> ifM (inAnnex key) ( do (numcopies, mincopies) <- getnummincopies - Command.Drop.startLocal afile ai si numcopies mincopies key [] + Command.Drop.startLocal pcc afile ai si numcopies mincopies key [] , stop ) where getnummincopies = case afile of AssociatedFile Nothing -> (,) <$> getNumCopies <*> getMinCopies AssociatedFile (Just af) -> getFileNumMinCopies af + pcc = Command.Drop.PreferredContentChecked False diff --git a/Command/Move.hs b/Command/Move.hs index fcb7390a6a..6d2cc50c30 100644 --- a/Command/Move.hs +++ b/Command/Move.hs @@ -293,7 +293,7 @@ toHereStart removewhen afile key ai si = - repository reduces the number of copies, and should fail if - that would violate numcopies settings. - - - On the other hand, when the destiation repository does not already + - On the other hand, when the destination repository does not already - have a copy of a file, it can be dropped without making numcopies - worse, so the move is allowed even if numcopies is not met. - @@ -311,7 +311,7 @@ toHereStart removewhen afile key ai si = -} willDropMakeItWorse :: UUID -> UUID -> DestStartedWithCopy -> Key -> AssociatedFile -> Annex DropCheck willDropMakeItWorse srcuuid destuuid (DestStartedWithCopy deststartedwithcopy) key afile = - ifM (Command.Drop.checkRequiredContent srcuuid key afile) + ifM (Command.Drop.checkRequiredContent (Command.Drop.PreferredContentChecked False) srcuuid key afile) ( if deststartedwithcopy then unlessforced DropCheckNumCopies else ifM checktrustlevel diff --git a/Logs/PreferredContent.hs b/Logs/PreferredContent.hs index adb0189ec3..1391bde4ce 100644 --- a/Logs/PreferredContent.hs +++ b/Logs/PreferredContent.hs @@ -46,8 +46,8 @@ import Logs.Remote import Types.StandardGroups import Limit -{- Checks if a file is preferred content for the specified repository - - (or the current repository if none is specified). -} +{- Checks if a file is preferred content (or required content) for the + - specified repository (or the current repository if none is specified). -} isPreferredContent :: Maybe UUID -> AssumeNotPresent -> Maybe Key -> AssociatedFile -> Bool -> Annex Bool isPreferredContent = checkMap preferredContentMap diff --git a/Test.hs b/Test.hs index 300ec157ff..c5d419074a 100644 --- a/Test.hs +++ b/Test.hs @@ -376,6 +376,7 @@ unitTests note = testGroup ("Unit Tests " ++ note) , testCase "bup remote" test_bup_remote , testCase "crypto" test_crypto , testCase "preferred content" test_preferred_content + , testCase "required_content" test_required_content , testCase "add subdirs" test_add_subdirs , testCase "addurl" test_addurl ] @@ -749,6 +750,27 @@ test_preferred_content = intmpclonerepo $ do git_annex "get" ["--auto", annexedfile] "get --auto of file with exclude=*" annexed_notpresent annexedfile +test_required_content :: Assertion +test_required_content = intmpclonerepo $ do + git_annex "get" [annexedfile] "get" + annexed_present annexedfile + git_annex "required" [".", "include=" ++ annexedfile] "annexedfile required" + + git_annex_shouldfail "drop" [annexedfile] "drop of required content should fail" + annexed_present annexedfile + + git_annex "drop" ["--auto", annexedfile] "drop --auto of required content skips it" + annexed_present annexedfile + + writecontent annexedfiledup $ content annexedfiledup + add_annex annexedfiledup "add of second file with same content failed" + annexed_present annexedfiledup + annexed_present annexedfile + + git_annex_shouldfail "drop" [annexedfiledup] "drop of file sharing required content should fail" + annexed_present annexedfiledup + annexed_present annexedfile + test_lock :: Assertion test_lock = intmpclonerepo $ do annexed_notpresent annexedfile