From d364e434c8a99931c6e865799e0737d630126d4b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 21 Jul 2025 12:13:40 -0400 Subject: [PATCH] Add --url option and url= preferred content expression To match content that is recorded as present in an url. Note that this cannot ask remotes to provide an url using whereisKey, like whereis does, because preferred content expressions need to match the same from multiple perspectives, and the remote would not always be available. That's why the docs say "recorded as present", but this may still surprise some who see an url in whereis output and then find they cannot match on it. The use of getDownloader is to strip the downloader prefix from urls like "yt:". Note that when OtherDownloader is used, this strips the ":" prefix, and allows matching on those urls too. --- Annex/FileMatcher.hs | 3 ++- CHANGELOG | 2 ++ CmdLine/GitAnnex/Options.hs | 5 +++++ Limit.hs | 21 +++++++++++++++++++++ doc/git-annex-matching-options.mdwn | 5 +++++ doc/git-annex-preferred-content.mdwn | 5 +++++ doc/todo/match_on_url.mdwn | 2 ++ 7 files changed, 42 insertions(+), 1 deletion(-) diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs index 6157efa3f0..385e23a16e 100644 --- a/Annex/FileMatcher.hs +++ b/Annex/FileMatcher.hs @@ -1,6 +1,6 @@ {- git-annex file matching - - - Copyright 2012-2024 Joey Hess + - Copyright 2012-2025 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. 
-} @@ -194,6 +194,7 @@ preferredContentTokens pcd = , ValueToken "approxlackingcopies" (usev $ limitLackingCopies "approxlackingcopies" True) , ValueToken "inbackend" (usev limitInBackend) , ValueToken "metadata" (usev limitMetaData) + , ValueToken "url" (usev limitUrl) , ValueToken "inallgroup" (usev $ limitInAllGroup $ getGroupMap pcd) , ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd) , ValueToken "balanced" (usev $ limitBalanced (repoUUID pcd) (getGroupMap pcd)) diff --git a/CHANGELOG b/CHANGELOG index 7216b21fbb..e0407b3919 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -10,6 +10,8 @@ git-annex (10.20250631) UNRELEASED; urgency=medium that have experienced the above bug. * Fix symlinks generated to annexed content when in adjusted unlocked branch in a linked worktree on a filesystem not supporting symlinks. + * Add --url option and url= preferred content expression, to match + content that is recorded as present in an url. -- Joey Hess Mon, 07 Jul 2025 15:59:42 -0400 diff --git a/CmdLine/GitAnnex/Options.hs b/CmdLine/GitAnnex/Options.hs index 890f9654de..4b44edda56 100644 --- a/CmdLine/GitAnnex/Options.hs +++ b/CmdLine/GitAnnex/Options.hs @@ -348,6 +348,11 @@ keyMatchingOptions' = <> help "match files with attached metadata" <> hidden ) + , annexOption (setAnnexState . Limit.addUrl) $ strOption + ( long "url" <> metavar paramGlob + <> help "match files by url" + <> hidden + ) , annexFlag (setAnnexState Limit.Wanted.addWantGet) ( long "want-get" <> help "match files the local repository wants to get" diff --git a/Limit.hs b/Limit.hs index d090e09d88..1916a606d5 100644 --- a/Limit.hs +++ b/Limit.hs @@ -31,6 +31,7 @@ import Types.FileMatcher import Types.MetaData import Annex.MetaData import Logs.MetaData +import Logs.Web import Logs.Group import Logs.Unused import Logs.Location @@ -867,6 +868,26 @@ limitMetaData s = case parseMetaDataMatcher s of . S.filter matching . 
metaDataValues f <$> getCurrentMetaData k +addUrl :: String -> Annex () +addUrl = addLimit . limitUrl + +limitUrl :: MkLimit Annex +limitUrl glob = Right $ MatchFiles + { matchAction = const $ const $ checkKey check + , matchNeedsFileName = False + , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False + , matchNeedsLiveRepoSize = False + , matchNegationUnstable = False + , matchDesc = "url" =? glob + } + where + check k = any (matchGlob cglob) + . map (fst . getDownloader) + <$> getUrls k + cglob = compileGlob glob CaseSensitive (GlobFilePath False) -- memoized + addAccessedWithin :: Duration -> Annex () addAccessedWithin duration = do now <- liftIO getPOSIXTime diff --git a/doc/git-annex-matching-options.mdwn b/doc/git-annex-matching-options.mdwn index ea29f98848..cf964cc71d 100644 --- a/doc/git-annex-matching-options.mdwn +++ b/doc/git-annex-matching-options.mdwn @@ -178,6 +178,11 @@ in either of two repositories. (Note that you will need to quote the second parameter to avoid the shell doing redirection.) +* `--url=glob` + + Matches when the content is recorded as being present in an url that + matches the glob. + * `--want-get` Matches only when the preferred content settings for the local repository diff --git a/doc/git-annex-preferred-content.mdwn b/doc/git-annex-preferred-content.mdwn index 52c6ff225e..6b9fc521ac 100644 --- a/doc/git-annex-preferred-content.mdwn +++ b/doc/git-annex-preferred-content.mdwn @@ -166,6 +166,11 @@ content not being configured. To match PDFs with between 100 and 200 pages (assuming something has set that metadata), use `metadata=pagecount>=100 and metadata=pagecount<=200` +* `url=glob` + + Matches when the content is recorded as being present in an url that + matches the glob. + * `present` Makes content be wanted if it's present, but not otherwise. 
diff --git a/doc/todo/match_on_url.mdwn b/doc/todo/match_on_url.mdwn index 6623debbed..5ef885e02d 100644 --- a/doc/todo/match_on_url.mdwn +++ b/doc/todo/match_on_url.mdwn @@ -10,3 +10,5 @@ expression if adding that. An alternative way could be to populate a metadata field with the url, if that were done without increasing the size of the git repository. --[[Joey]] + +> [[done]] --[[Joey]]