From 9a67ed0f10706141ff21b9c4ea9c148b667c1ffe Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 18 Dec 2023 16:27:26 -0400 Subject: [PATCH] importtree: support preferred content expressions needing keys When importing from a special remote, support preferred content expressions that use terms that match on keys (eg "present", "copies=1"). Such terms are ignored when importing, since the key is not known yet. When "standard" or "groupwanted" is used, the terms in those expressions also get pruned accordingly. This does allow setting preferred content to "not (copies=1)" to make a special remote into a "source" type of repository. Importing from it will import all files. Then exporting to it will drop all files from it. In the case of setting preferred content to "present", it's pruned on import, so everything gets imported from it. Then on export, it's applied, and everything in it is left on it, and no new content is exported to it. Since the old behavior on these preferred content expressions was for importtree to error out, there's no backwards compatibility to worry about. Except that sync/pull/etc will now import where before it errored out. 
--- Annex/FileMatcher.hs | 44 +++++-------------- Annex/Import.hs | 16 ++++--- CHANGELOG | 4 ++ Logs/PreferredContent.hs | 13 +++--- Utility/Matcher.hs | 23 ++++++++++ doc/git-annex-import.mdwn | 17 +++---- ...ort_tree_preferred_content_expansions.mdwn | 2 + ..._df8ca8665e1dfc530a832b6d24d60ea4._comment | 29 ++++++++++++ 8 files changed, 94 insertions(+), 54 deletions(-) create mode 100644 doc/todo/import_tree_preferred_content_expansions/comment_1_df8ca8665e1dfc530a832b6d24d60ea4._comment diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs index 6c049dfce4..e48931f360 100644 --- a/Annex/FileMatcher.hs +++ b/Annex/FileMatcher.hs @@ -16,7 +16,6 @@ module Annex.FileMatcher ( matchAll, PreferredContentData(..), preferredContentTokens, - preferredContentKeylessTokens, preferredContentParser, ParseToken, parsedToMatcher, @@ -139,19 +138,15 @@ tokenizeMatcher = filter (not . null) . concatMap splitparens . words where splitparens = segmentDelim (`elem` "()") -commonKeylessTokens :: LimitBy -> [ParseToken (MatchFiles Annex)] -commonKeylessTokens lb = +commonTokens :: LimitBy -> [ParseToken (MatchFiles Annex)] +commonTokens lb = [ SimpleToken "anything" (simply limitAnything) , SimpleToken "nothing" (simply limitNothing) , ValueToken "include" (usev limitInclude) , ValueToken "exclude" (usev limitExclude) , ValueToken "largerthan" (usev $ limitSize lb "largerthan" (>)) , ValueToken "smallerthan" (usev $ limitSize lb "smallerthan" (<)) - ] - -commonKeyedTokens :: [ParseToken (MatchFiles Annex)] -commonKeyedTokens = - [ SimpleToken "unused" (simply limitUnused) + , SimpleToken "unused" (simply limitUnused) ] data PreferredContentData = PCD @@ -162,25 +157,12 @@ data PreferredContentData = PCD , repoUUID :: Maybe UUID } --- Tokens of preferred content expressions that do not need a Key to be --- known. --- --- When importing from a special remote, this is used to match --- some preferred content expressions before the content is downloaded, --- so the Key is not known. 
-preferredContentKeylessTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)] -preferredContentKeylessTokens pcd = +preferredContentTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)] +preferredContentTokens pcd = [ SimpleToken "standard" (call "standard" $ matchStandard pcd) , SimpleToken "groupwanted" (call "groupwanted" $ matchGroupWanted pcd) , SimpleToken "inpreferreddir" (simply $ limitInDir preferreddir "inpreferreddir") - ] ++ commonKeylessTokens LimitAnnexFiles - where - preferreddir = maybe "public" fromProposedAccepted $ - M.lookup preferreddirField =<< (`M.lookup` configMap pcd) =<< repoUUID pcd - -preferredContentKeyedTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)] -preferredContentKeyedTokens pcd = - [ SimpleToken "present" (simply $ limitPresent $ repoUUID pcd) + , SimpleToken "present" (simply $ limitPresent $ repoUUID pcd) , SimpleToken "securehash" (simply limitSecureHash) , ValueToken "copies" (usev limitCopies) , ValueToken "lackingcopies" (usev $ limitLackingCopies "lackingcopies" False) @@ -189,13 +171,10 @@ preferredContentKeyedTokens pcd = , ValueToken "metadata" (usev limitMetaData) , ValueToken "inallgroup" (usev $ limitInAllGroup $ getGroupMap pcd) , ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd) - ] ++ commonKeyedTokens - -preferredContentTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)] -preferredContentTokens pcd = concat - [ preferredContentKeylessTokens pcd - , preferredContentKeyedTokens pcd - ] + ] ++ commonTokens LimitAnnexFiles + where + preferreddir = maybe "public" fromProposedAccepted $ + M.lookup preferreddirField =<< (`M.lookup` configMap pcd) =<< repoUUID pcd preferredContentParser :: [ParseToken (MatchFiles Annex)] -> String -> [ParseResult (MatchFiles Annex)] preferredContentParser tokens = map (parseToken tokens) . 
tokenizeMatcher @@ -210,8 +189,7 @@ mkMatchExpressionParser = do const $ Left $ "\""++n++"\" not supported; not built with MagicMime support" #endif let parse = parseToken $ - commonKeyedTokens ++ - commonKeylessTokens LimitDiskFiles ++ + commonTokens LimitDiskFiles ++ #ifdef WITH_MAGICMIME [ mimer "mimetype" $ matchMagic "mimetype" getMagicMimeType providedMimeType userProvidedMimeType diff --git a/Annex/Import.hs b/Annex/Import.hs index 1dbb50d4aa..023d7df1a9 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -990,20 +990,22 @@ addBackExportExcluded remote importtree = - - Only keyless tokens are supported, because the keys are not known - until an imported file is downloaded, which is too late to bother - - excluding it from an import. + - excluding it from an import. So prunes any tokens in the preferred + - content expression that need keys. -} makeImportMatcher :: Remote -> Annex (Either String (FileMatcher Annex)) -makeImportMatcher r = load preferredContentKeylessTokens >>= \case +makeImportMatcher r = load preferredContentTokens >>= \case Nothing -> return $ Right (matchAll, matcherdesc) Just (Right v) -> return $ Right (v, matcherdesc) - Just (Left err) -> load preferredContentTokens >>= \case - Just (Left err') -> return $ Left err' - _ -> return $ Left $ - "The preferred content expression contains terms that cannot be checked when importing: " ++ err + Just (Left err) -> return $ Left err where - load t = M.lookup (Remote.uuid r) . fst <$> preferredRequiredMapsLoad' t + load t = M.lookup (Remote.uuid r) . fst + <$> preferredRequiredMapsLoad' pruneImportMatcher t matcherdesc = MatcherDesc "preferred content" +pruneImportMatcher :: Utility.Matcher.Matcher (MatchFiles a) -> Utility.Matcher.Matcher (MatchFiles a) +pruneImportMatcher = Utility.Matcher.pruneMatcher matchNeedsKey + {- Gets the ImportableContents from the remote. 
- - Filters out any paths that include a ".git" component, because git does diff --git a/CHANGELOG b/CHANGELOG index 0a350cd892..55778eb153 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -19,6 +19,10 @@ git-annex (10.20231130) UNRELEASED; urgency=medium filesystems that have problems with such filenames. * Lower precision of timestamps in git-annex branch, which can reduce the size of the branch by up to 8%. + * When importing from a special remote, support preferred content + expressions that use terms that match on keys (eg "present", "copies=1"). + Such terms are ignored when importing, since the key is not known yet. + Before, such expressions caused the import to fail. -- Joey Hess Thu, 30 Nov 2023 14:48:12 -0400 diff --git a/Logs/PreferredContent.hs b/Logs/PreferredContent.hs index 9d2b30a907..c0f30c3c8f 100644 --- a/Logs/PreferredContent.hs +++ b/Logs/PreferredContent.hs @@ -82,7 +82,7 @@ requiredContentMap = maybe (snd <$> preferredRequiredMapsLoad preferredContentTo preferredRequiredMapsLoad :: (PreferredContentData -> [ParseToken (MatchFiles Annex)]) -> Annex (FileMatcherMap Annex, FileMatcherMap Annex) preferredRequiredMapsLoad mktokens = do - (pc, rc) <- preferredRequiredMapsLoad' mktokens + (pc, rc) <- preferredRequiredMapsLoad' id mktokens let pc' = handleunknown (MatcherDesc "preferred content") pc let rc' = handleunknown (MatcherDesc "required content") rc Annex.changeState $ \s -> s @@ -94,12 +94,12 @@ preferredRequiredMapsLoad mktokens = do handleunknown matcherdesc = M.mapWithKey $ \u v -> (either (const $ unknownMatcher u) id v, matcherdesc) -preferredRequiredMapsLoad' :: (PreferredContentData -> [ParseToken (MatchFiles Annex)]) -> Annex (M.Map UUID (Either String (Matcher (MatchFiles Annex))), M.Map UUID (Either String (Matcher (MatchFiles Annex)))) -preferredRequiredMapsLoad' mktokens = do +preferredRequiredMapsLoad' :: (Matcher (MatchFiles Annex) -> Matcher (MatchFiles Annex)) -> (PreferredContentData -> [ParseToken (MatchFiles Annex)]) -> Annex 
(M.Map UUID (Either String (Matcher (MatchFiles Annex))), M.Map UUID (Either String (Matcher (MatchFiles Annex)))) +preferredRequiredMapsLoad' matcherf mktokens = do groupmap <- groupMap configmap <- remoteConfigMap let genmap l gm = - let mk u = makeMatcher groupmap configmap gm u mktokens + let mk u = makeMatcher groupmap configmap gm u matcherf mktokens in simpleMap . parseLogOldWithUUID (\u -> mk u . decodeBS <$> A.takeByteString) <$> Annex.Branch.get l @@ -123,13 +123,14 @@ makeMatcher -> M.Map UUID RemoteConfig -> M.Map Group PreferredContentExpression -> UUID + -> (Matcher (MatchFiles Annex) -> Matcher (MatchFiles Annex)) -> (PreferredContentData -> [ParseToken (MatchFiles Annex)]) -> PreferredContentExpression -> Either String (Matcher (MatchFiles Annex)) -makeMatcher groupmap configmap groupwantedmap u mktokens = go True True +makeMatcher groupmap configmap groupwantedmap u matcherf mktokens = go True True where go expandstandard expandgroupwanted expr - | null (lefts tokens) = Right $ generate $ rights tokens + | null (lefts tokens) = Right $ matcherf $ generate $ rights tokens | otherwise = Left $ unwords $ lefts tokens where tokens = preferredContentParser (mktokens pcd) expr diff --git a/Utility/Matcher.hs b/Utility/Matcher.hs index e252eae538..c5eec539fb 100644 --- a/Utility/Matcher.hs +++ b/Utility/Matcher.hs @@ -24,6 +24,7 @@ module Utility.Matcher ( MatchResult(..), syntaxToken, generate, + pruneMatcher, match, match', matchM, @@ -99,6 +100,28 @@ generate = simplify . process MAny . implicitAnd . tokenGroups simplify (MNot x) = MNot (simplify x) simplify x = x +{- Prunes selected ops from the Matcher. -} +pruneMatcher :: (op -> Bool) -> Matcher op -> Matcher op +pruneMatcher f = fst . 
go + where + go MAny = (MAny, False) + go (MAnd a b) = case (go a, go b) of + ((_, True), (_, True)) -> (MAny, True) + ((a', False), (b', False)) -> (MAnd a' b', False) + ((_, True), (b', False)) -> (b', False) + ((a', False), (_, True)) -> (a', False) + go (MOr a b) = case (go a, go b) of + ((_, True), (_, True)) -> (MAny, True) + ((a', False), (b', False)) -> (MOr a' b', False) + ((_, True), (b', False)) -> (b', False) + ((a', False), (_, True)) -> (a', False) + go (MNot a) = case go a of + (_, True) -> (MAny, True) + (a', False) -> (MNot a', False) + go (MOp op) + | f op = (MAny, True) + | otherwise = (MOp op, False) + data TokenGroup op = One (Token op) | Group [TokenGroup op] deriving (Show, Eq) diff --git a/doc/git-annex-import.mdwn b/doc/git-annex-import.mdwn index ee9b4937c1..9844e239ee 100644 --- a/doc/git-annex-import.mdwn +++ b/doc/git-annex-import.mdwn @@ -74,16 +74,17 @@ Any files that are gitignored will not be included in the import, but will be left on the remote. When the special remote has a preferred content expression set by -[[git-annex-wanted]](1), it will be honored when importing from it. -Files that are not preferred content of the remote will not be +[[git-annex-wanted]](1), that is used to pick which files to import from +it. Files that are not preferred content of the remote will not be imported from it, but will be left on the remote. -However, preferred content expressions that relate to the key -can't be matched when importing, because the content of the file is not -known. Importing will fail when such a preferred content expression is -set. This includes expressions containing "copies=", "metadata=", and other -things that depend on the key. Preferred content expressions containing -"include=", "exclude=" "smallerthan=", "largerthan=" will work. +So for example, a preferred content expression like +`"include=*.jpeg or largerthan=100mb"` will make only jpegs and +large files be imported. 
+ +Parts of a preferred content expression that relate to the key, +such as "copies=" are ignored when importing, because the key +is not known before importing. Things in the expression like "include=" match relative to the top of the tree of files on the remote, even when importing into a subdirectory. diff --git a/doc/todo/import_tree_preferred_content_expansions.mdwn b/doc/todo/import_tree_preferred_content_expansions.mdwn index f7f6cd0190..4ff04ffc82 100644 --- a/doc/todo/import_tree_preferred_content_expansions.mdwn +++ b/doc/todo/import_tree_preferred_content_expansions.mdwn @@ -13,3 +13,5 @@ key evaluate to true. Then it would import all files turn out that the special remote doesn't want to contain particular content that was imported from it, and it would make sense that an export to the special remote would remove those files. --[[Joey]] + +> [[done]] --[[Joey]] diff --git a/doc/todo/import_tree_preferred_content_expansions/comment_1_df8ca8665e1dfc530a832b6d24d60ea4._comment b/doc/todo/import_tree_preferred_content_expansions/comment_1_df8ca8665e1dfc530a832b6d24d60ea4._comment new file mode 100644 index 0000000000..7e9b4507a4 --- /dev/null +++ b/doc/todo/import_tree_preferred_content_expansions/comment_1_df8ca8665e1dfc530a832b6d24d60ea4._comment @@ -0,0 +1,29 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2023-12-18T18:02:40Z" + content=""" +Hmm, consider for example a camera. If the user wants to import all jpeg +files from it, and not export to it jpeg files that have a copy in an +archive, they might use: + + not copies=archive:1 and include=*.jpeg + +But on import, if "copies=archive:1" were made to evaluate to true +as suggested here, then this expression would not match, and so +nothing would be imported. + +Seems that the approach needs to be instead to traverse the expression +and prune terms that operate on keys. So convert the example +above to "include=*.jpeg". 
+ +How to prune in some other cases: + + not (copies=archive:1 and copies=backup:1) => anything + + not (copies=archive:1 and include=*.jpeg) => not (include=*.jpeg) + + not (copies=archive:1 or include=*.jpeg) => not (include=*.jpeg) + + not ((not copies=archive:1) or include=*.jpeg) => not (include=*.jpeg) +"""]]