From a56b151f9009590c97da8dbf66c1b138455657cb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 24 May 2021 14:02:50 -0400 Subject: [PATCH] fix longstanding indeterminite preferred content for duplicated file problem * drop: When two files have the same content, and a preferred content expression matches one but not the other, do not drop the file. * sync --content, assistant: Fix an edge case where a file that is not preferred content did not get dropped. The sync --content edge case is that handleDropsFrom loaded associated files and used them without verifying that the information from the database was not stale. It seemed best to avoid changing --want-drop's behavior, this way when debugging a preferred content expression with it, the files matched will still reflect the expression. So added a note to the --want-drop documentation, to make clear it may not behave identically to git-annex drop --auto. While it would be possible to introspect the preferred content expression to see if it matches on filenames, and only look up the associated files when it does, it's generally fairly rare for 2 files to have the same content, and the database lookup is already avoided when there's only 1 file, so I did not implement that further optimisation. Note that there are still some situations where the associated files database does not get locked files recorded in it, which will prevent this fix from working. Sponsored-by: Dartmouth College's Datalad project --- Annex/Drop.hs | 16 +++---- Annex/Wanted.hs | 47 ++++++++++++++++--- CHANGELOG | 4 ++ Command/Drop.hs | 6 +-- Limit/Wanted.hs | 2 +- ...red_content_state_for_duplicated_file.mdwn | 2 + doc/git-annex-matching-options.mdwn | 5 ++ 7 files changed, 62 insertions(+), 20 deletions(-) diff --git a/Annex/Drop.hs b/Annex/Drop.hs index b7543ec794..6f55378719 100644 --- a/Annex/Drop.hs +++ b/Annex/Drop.hs @@ -103,16 +103,12 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do dropr fs r n >>= go fs rest | otherwise = pure n - checkdrop fs n u a - | null fs = check $ -- no associated files; unused content - wantDrop True u (Just key) (AssociatedFile Nothing) - | otherwise = check $ - allM (wantDrop True u (Just key) . AssociatedFile . Just) fs - where - check c = ifM c - ( dodrop n u a - , return n - ) + checkdrop fs n u a = + let afs = map (AssociatedFile . Just) fs + in ifM (wantDrop True u (Just key) afile (Just afs)) + ( dodrop n u a + , return n + ) dodrop n@(have, numcopies, mincopies, _untrusted) u a = ifM (safely $ runner $ a numcopies mincopies) diff --git a/Annex/Wanted.hs b/Annex/Wanted.hs index 021fe5cafb..a1ac7b20c4 100644 --- a/Annex/Wanted.hs +++ b/Annex/Wanted.hs @@ -1,6 +1,6 @@ {- git-annex checking whether content is wanted - - - Copyright 2012 Joey Hess + - Copyright 2012-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -10,6 +10,9 @@ module Annex.Wanted where import Annex.Common import Logs.PreferredContent import Annex.UUID +import Annex.CatFile +import Git.FilePath +import qualified Database.Keys import qualified Data.Set as S @@ -22,8 +25,40 @@ wantSend :: Bool -> Maybe Key -> AssociatedFile -> UUID -> Annex Bool wantSend d key file to = isPreferredContent (Just to) S.empty key file d {- Check if a file can be dropped, maybe from a remote. - - Don't drop files that are preferred content. -} -wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> Annex Bool -wantDrop d from key file = do - u <- maybe getUUID (return . id) from - not <$> isPreferredContent (Just u) (S.singleton u) key file d + - Don't drop files that are preferred content. + - + - The AssociatedFile is the one that the user requested to drop. + - There may be other files that use the same key, and preferred content + - may match some of those and not others. If any are preferred content, + - that will prevent dropping. When the other associated files are known, + - they can be provided, otherwise this looks them up. + -} +wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> (Maybe [AssociatedFile]) -> Annex Bool +wantDrop d from key file others = do + u <- maybe getUUID (pure . id) from + let s = S.singleton u + let checkwant f = isPreferredContent (Just u) s key f d + ifM (checkwant file) + ( return False + , do + others' <- case others of + Just afs -> pure (filter (/= file) afs) + Nothing -> case key of + Just k -> mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f)) + =<< Database.Keys.getAssociatedFiles k + Nothing -> pure [] + l <- filterM checkwant others' + if null l + then return True + else checkassociated l + ) + where + -- Some associated files that are in the keys database may no + -- longer correspond to files in the repository, and should + -- not prevent dropping. + checkassociated [] = return True + checkassociated (AssociatedFile (Just af):fs) = + catKeyFile af >>= \case + Just k | Just k == key -> return False + _ -> checkassociated fs + checkassociated (AssociatedFile Nothing:fs) = checkassociated fs diff --git a/CHANGELOG b/CHANGELOG index b00b6ae1a6..4fc140d15c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,9 @@ git-annex (8.20210429) UNRELEASED; urgency=medium + * drop: When two files have the same content, and a preferred content + expression matches one but not the other, do not drop the file. + * sync --content, assistant: Fix an edge case where a file that is not + preferred content did not get dropped. * filter-branch: New command, useful to produce a filtered version of the git-annex branch, eg when splitting a repository. * fromkey: Create an unlocked file when used in an adjusted branch diff --git a/Command/Drop.hs b/Command/Drop.hs index 26a16fdd37..6422fc26d5 100644 --- a/Command/Drop.hs +++ b/Command/Drop.hs @@ -86,13 +86,13 @@ start o from si file key = start' o from key afile ai si start' :: DropOptions -> Maybe Remote -> Key -> AssociatedFile -> ActionItem -> SeekInput -> CommandStart start' o from key afile ai si = checkDropAuto (autoMode o) from afile key $ \numcopies mincopies -> - stopUnless want $ + stopUnless wantdrop $ case from of Nothing -> startLocal afile ai si numcopies mincopies key [] Just remote -> startRemote afile ai si numcopies mincopies key remote where - want - | autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile + wantdrop + | autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile Nothing | otherwise = return True startKeys :: DropOptions -> Maybe Remote -> (SeekInput, Key, ActionItem) -> CommandStart diff --git a/Limit/Wanted.hs b/Limit/Wanted.hs index 5518fa0bc9..9188594b90 100644 --- a/Limit/Wanted.hs +++ b/Limit/Wanted.hs @@ -19,7 +19,7 @@ addWantGet = addPreferredContentLimit $ addWantDrop :: Annex () addWantDrop = addPreferredContentLimit $ - checkWant $ wantDrop False Nothing Nothing + checkWant $ \af -> wantDrop False Nothing Nothing af (Just []) addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex () addPreferredContentLimit a = do diff --git a/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn b/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn index 0ca978dd3b..ee0d5a3930 100644 --- a/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn +++ b/doc/bugs/indeterminite_preferred_content_state_for_duplicated_file.mdwn @@ -19,3 +19,5 @@ So, this seems solvable in v7 repositories, but not in v5. Also, the associated files map may not be accurate at all times, so that's a wrinkle to using it for this. Also, only unlocked files get into the associated files map. --[[Joey]] + +> [[fixed|done]] --[[Joey]] diff --git a/doc/git-annex-matching-options.mdwn b/doc/git-annex-matching-options.mdwn index d2104c7ccf..fc7b794f4c 100644 --- a/doc/git-annex-matching-options.mdwn +++ b/doc/git-annex-matching-options.mdwn @@ -141,6 +141,11 @@ in either of two repositories. make it want to drop. Note that this will match even files that have already been dropped, unless limited with e.g., `--in .` + Files that this matches will not necessarily be dropped by + `git-annex drop --auto`. This does not check that there are enough copies + to drop. Also the same content may be used by a file that is not wanted + to be dropped. + * `--accessedwithin=interval` Matches files that were accessed recently, within the specified time