fix longstanding indeterminite preferred content for duplicated file problem

* drop: When two files have the same content, and a preferred content
  expression matches one but not the other, do not drop the file.
* sync --content, assistant: Fix an edge case where a file that is not
  preferred content did not get dropped.

The sync --content edge case is that handleDropsFrom loaded associated files
and used them without verifying that the information from the database was
not stale.

It seemed best to avoid changing --want-drop's behavior, this way when
debugging a preferred content expression with it, the files matched will
still reflect the expression. So added a note to the --want-drop documentation,
to make clear it may not behave identically to git-annex drop --auto.

While it would be possible to introspect the preferred content
expression to see if it matches on filenames, and only look up the
associated files when it does, it's generally fairly rare for 2 files to
have the same content, and the database lookup is already avoided when
there's only 1 file, so I did not implement that further optimisation.

Note that there are still some situations where the associated files
database does not get locked files recorded in it, which will prevent
this fix from working.

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-05-24 14:02:50 -04:00
parent 78be7cf73f
commit a56b151f90
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 62 additions and 20 deletions

View file

@ -103,16 +103,12 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do
dropr fs r n >>= go fs rest
| otherwise = pure n
checkdrop fs n u a
| null fs = check $ -- no associated files; unused content
wantDrop True u (Just key) (AssociatedFile Nothing)
| otherwise = check $
allM (wantDrop True u (Just key) . AssociatedFile . Just) fs
where
check c = ifM c
( dodrop n u a
, return n
)
checkdrop fs n u a =
let afs = map (AssociatedFile . Just) fs
in ifM (wantDrop True u (Just key) afile (Just afs))
( dodrop n u a
, return n
)
dodrop n@(have, numcopies, mincopies, _untrusted) u a =
ifM (safely $ runner $ a numcopies mincopies)

View file

@ -1,6 +1,6 @@
{- git-annex checking whether content is wanted
-
- Copyright 2012 Joey Hess <id@joeyh.name>
- Copyright 2012-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -10,6 +10,9 @@ module Annex.Wanted where
import Annex.Common
import Logs.PreferredContent
import Annex.UUID
import Annex.CatFile
import Git.FilePath
import qualified Database.Keys
import qualified Data.Set as S
@ -22,8 +25,40 @@ wantSend :: Bool -> Maybe Key -> AssociatedFile -> UUID -> Annex Bool
wantSend d key file to = isPreferredContent (Just to) S.empty key file d
{- Check if a file can be dropped, maybe from a remote.
- Don't drop files that are preferred content. -}
wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> Annex Bool
wantDrop d from key file = do
u <- maybe getUUID (return . id) from
not <$> isPreferredContent (Just u) (S.singleton u) key file d
- Don't drop files that are preferred content.
-
- The AssociatedFile is the one that the user requested to drop.
- There may be other files that use the same key, and preferred content
- may match some of those and not others. If any are preferred content,
- that will prevent dropping. When the other associated files are known,
- they can be provided, otherwise this looks them up.
-}
wantDrop :: Bool -> Maybe UUID -> Maybe Key -> AssociatedFile -> (Maybe [AssociatedFile]) -> Annex Bool
wantDrop d from key file others = do
u <- maybe getUUID (pure . id) from
let s = S.singleton u
let checkwant f = isPreferredContent (Just u) s key f d
ifM (checkwant file)
( return False
, do
others' <- case others of
Just afs -> pure (filter (/= file) afs)
Nothing -> case key of
Just k -> mapM (\f -> AssociatedFile . Just <$> fromRepo (fromTopFilePath f))
=<< Database.Keys.getAssociatedFiles k
Nothing -> pure []
l <- filterM checkwant others'
if null l
then return True
else checkassociated l
)
where
-- Some associated files that are in the keys database may no
-- longer correspond to files in the repository, and should
-- not prevent dropping.
checkassociated [] = return True
checkassociated (AssociatedFile (Just af):fs) =
catKeyFile af >>= \case
Just k | Just k == key -> return False
_ -> checkassociated fs
checkassociated (AssociatedFile Nothing:fs) = checkassociated fs

View file

@ -1,5 +1,9 @@
git-annex (8.20210429) UNRELEASED; urgency=medium
* drop: When two files have the same content, and a preferred content
expression matches one but not the other, do not drop the file.
* sync --content, assistant: Fix an edge case where a file that is not
preferred content did not get dropped.
* filter-branch: New command, useful to produce a filtered version of the
git-annex branch, eg when splitting a repository.
* fromkey: Create an unlocked file when used in an adjusted branch

View file

@ -86,13 +86,13 @@ start o from si file key = start' o from key afile ai si
start' :: DropOptions -> Maybe Remote -> Key -> AssociatedFile -> ActionItem -> SeekInput -> CommandStart
start' o from key afile ai si =
checkDropAuto (autoMode o) from afile key $ \numcopies mincopies ->
stopUnless want $
stopUnless wantdrop $
case from of
Nothing -> startLocal afile ai si numcopies mincopies key []
Just remote -> startRemote afile ai si numcopies mincopies key remote
where
want
| autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile
wantdrop
| autoMode o = wantDrop False (Remote.uuid <$> from) (Just key) afile Nothing
| otherwise = return True
startKeys :: DropOptions -> Maybe Remote -> (SeekInput, Key, ActionItem) -> CommandStart

View file

@ -19,7 +19,7 @@ addWantGet = addPreferredContentLimit $
addWantDrop :: Annex ()
addWantDrop = addPreferredContentLimit $
checkWant $ wantDrop False Nothing Nothing
checkWant $ \af -> wantDrop False Nothing Nothing af (Just [])
addPreferredContentLimit :: (MatchInfo -> Annex Bool) -> Annex ()
addPreferredContentLimit a = do

View file

@ -19,3 +19,5 @@ So, this seems solvable in v7 repositories, but not in v5.
Also, the associated files map may not be accurate at all times, so that's
a wrinkle to using it for this. Also, only unlocked files get into the
associated files map. --[[Joey]]
> [[fixed|done]] --[[Joey]]

View file

@ -141,6 +141,11 @@ in either of two repositories.
make it want to drop. Note that this will match even files that have
already been dropped, unless limited with e.g., `--in .`
Files that this matches will not necessarily be dropped by
`git-annex drop --auto`. This does not check that there are enough copies
to drop. Also the same content may be used by a file that is not wanted
to be dropped.
* `--accessedwithin=interval`
Matches files that were accessed recently, within the specified time