From 8a305e5fa3b3fc5a2f776f2c87aa320fcd2ae980 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 10 Jan 2023 14:58:53 -0400 Subject: [PATCH] respect urlinclude/urlexclude of other web special remotes When a web special remote does not have urlinclude/urlexclude configured, make it respect the configuration of other web special remotes and avoid using urls that match the config of another. Note that the other web special remote does not have to be enabled. That seems ok, it would have been extra work to check for only ones that are enabled. The implementation does mean that the web special remote re-parses its own config once at startup, as well as re-parsing the configs of any other web special remotes. This should be a very small slowdown unless there are lots of web special remotes. Sponsored-by: Dartmouth College's DANDI project --- Remote/Web.hs | 85 +++++++++++++------ doc/special_remotes/web.mdwn | 19 ++++- .../using_the_web_as_a_special_remote.mdwn | 23 +++-- ..._ff4c23b4ccaf649a9914f4aa4435968d._comment | 22 +++++ ..._4029b264ea2b0d47e6949f832af69282._comment | 16 ++++ 5 files changed, 134 insertions(+), 31 deletions(-) create mode 100644 doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_10_ff4c23b4ccaf649a9914f4aa4435968d._comment create mode 100644 doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_9_4029b264ea2b0d47e6949f832af69282._comment diff --git a/Remote/Web.hs b/Remote/Web.hs index 2fca038c99..7d3d00622f 100644 --- a/Remote/Web.hs +++ b/Remote/Web.hs @@ -26,6 +26,9 @@ import Utility.Glob import qualified Annex.Url as Url import Annex.YoutubeDl import Annex.SpecialRemote.Config +import Logs.Remote + +import qualified Data.Map as M remote :: RemoteType remote = RemoteType @@ -44,14 +47,6 @@ remote = RemoteType , thirdPartyPopulated = False } -urlincludeField :: RemoteConfigField -urlincludeField = Accepted "urlinclude" - -urlexcludeField :: RemoteConfigField -urlexcludeField = Accepted 
"urlexclude" - -data UrlIncludeExclude = UrlIncludeExclude (Maybe Glob) (Maybe Glob) - -- The web remote always exists. -- (If the web should cease to exist, remove this module and redistribute -- a new release to the survivors by carrier pigeon.) @@ -71,9 +66,7 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle gen r u rc gc rs = do c <- parsedRemoteConfig remote rc cst <- remoteCost gc expensiveRemoteCost - let urlincludeexclude = UrlIncludeExclude - (getglob c urlincludeField) - ( getglob c urlexcludeField) + urlincludeexclude <- mkUrlIncludeExclude c return $ Just Remote { uuid = if u == NoUUID then webUUID else u , cost = cst @@ -111,10 +104,6 @@ gen r u rc gc rs = do , checkUrl = Nothing , remoteStateHandle = rs } - where - getglob c f = do - glob <- getRemoteConfigValue f c - Just $ compileGlob glob CaseInsensative (GlobFilePath False) setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) setupInstance _ mu _ c _ = do @@ -180,18 +169,66 @@ checkKey' key us = firsthit us (Right False) $ \u -> do _ -> firsthit rest r a getWebUrls :: Key -> Annex [URLString] -getWebUrls key = getWebUrls' (UrlIncludeExclude Nothing Nothing) key +getWebUrls key = getWebUrls' alwaysInclude key getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString] -getWebUrls' (UrlIncludeExclude minclude mexclude) key = +getWebUrls' urlincludeexclude key = filter supported <$> getUrls key where - supported u = supporteddownloader u && isincluded u && notexcluded u + supported u = supporteddownloader u + && checkUrlIncludeExclude urlincludeexclude u supporteddownloader u = snd (getDownloader u) `elem` [WebDownloader, YoutubeDownloader] - isincluded u = case minclude of - Nothing -> True - Just glob -> matchGlob glob u - notexcluded u = case mexclude of - Nothing -> True - Just glob -> not (matchGlob glob u) + +urlincludeField :: RemoteConfigField +urlincludeField = Accepted "urlinclude" + 
+urlexcludeField :: RemoteConfigField +urlexcludeField = Accepted "urlexclude" + +data UrlIncludeExclude = UrlIncludeExclude + { checkUrlIncludeExclude :: URLString -> Bool + } + +alwaysInclude :: UrlIncludeExclude +alwaysInclude = UrlIncludeExclude { checkUrlIncludeExclude = const True } + +mkUrlIncludeExclude :: ParsedRemoteConfig -> Annex UrlIncludeExclude +mkUrlIncludeExclude = go fallback + where + go b pc = case (getglob urlincludeField pc, getglob urlexcludeField pc) of + (Nothing, Nothing) -> b + (minclude, mexclude) -> mk minclude mexclude + + getglob f pc = do + glob <- getRemoteConfigValue f pc + Just $ compileGlob glob CaseInsensative (GlobFilePath False) + + mk minclude mexclude = pure $ UrlIncludeExclude + { checkUrlIncludeExclude = \u -> and + [ case minclude of + Just glob -> matchGlob glob u + Nothing -> True + , case mexclude of + Nothing -> True + Just glob -> not (matchGlob glob u) + ] + } + + -- When nothing to include or exclude is specified, only include + -- urls that are not explicitly included by other web special remotes. + fallback = do + rcs <- M.elems . M.filter iswebremote <$> remoteConfigMap + l <- forM rcs $ \rc -> + parsedRemoteConfig remote rc + >>= go (pure neverinclude) + -- Reject any url that is accepted by the config of some web remote. + pure $ UrlIncludeExclude + { checkUrlIncludeExclude = \u -> + not (any (\c -> checkUrlIncludeExclude c u) l) + } + + iswebremote rc = (fromProposedAccepted <$> M.lookup typeField rc) + == Just (typename remote) + + neverinclude = UrlIncludeExclude { checkUrlIncludeExclude = const False } diff --git a/doc/special_remotes/web.mdwn b/doc/special_remotes/web.mdwn index 05e43c773f..134d547065 100644 --- a/doc/special_remotes/web.mdwn +++ b/doc/special_remotes/web.mdwn @@ -20,8 +20,23 @@ These parameters can be passed to `git annex initremote` or * `urlinclude` - Only use urls that match the specified glob. For example, `urlinclude="https://s3.amazonaws.com/*"` - Note: Globs are matched case-insensitively. 
* `urlexclude` - Don't use urls that match the specified glob. For example, to prohibit http urls, but allow https, use `urlexclude="http:*"` - Note: Globs are matched case-insensitively. + +Globs are matched case-insensitively. + +When there are multiple special remotes of type web, and some are +configured with neither `urlinclude` nor `urlexclude`, those will avoid using +urls that are matched by the configuration of other web remotes. + +For example, this creates a second web special remote named "slowweb" that +is only used for urls on one host, and that has a higher cost than the +"web" special remote. With this configuration, `git-annex get` will first +try to get the file from the "web" special remote, which will avoid +using any urls that match slowweb's urlinclude. Only if the content +can't be downloaded from "web" (or some other remote) will it fall back +to downloading from slowweb. + + git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*' + git config remote.slowweb.annex-cost 300 diff --git a/doc/tips/using_the_web_as_a_special_remote.mdwn b/doc/tips/using_the_web_as_a_special_remote.mdwn index 2b8851f5eb..f29fc70b12 100644 --- a/doc/tips/using_the_web_as_a_special_remote.mdwn +++ b/doc/tips/using_the_web_as_a_special_remote.mdwn @@ -125,11 +125,11 @@ This is done using `git annex importfeed`. See [[downloading podcasts]]. An annexed file can have content at multiple urls that git-annex knows about, and git-annex may use any of those urls for downloading a file. -If some urls are especially fast, you might want to configure -which urls git-annex prefers to use first. To accomplish that, -you can create additional remotes, that are web special remotes, and are -configured to only use the fast urls. Then it's simply a matter of -configuring the cost of those remotes. +If some urls are especially fast, or especially slow, you might want to +configure which urls git-annex prefers to use first, or should only use as +a last resort. 
To accomplish that, you can create additional remotes, that +are web special remotes, and are configured to only be used for some urls. +Then it's simply a matter of configuring the cost of those remotes. For example, suppose that you want to prioritize using urls on "fasthost.com". @@ -141,3 +141,16 @@ will prefer to use the fasthost special remote, rather than the web special remote (which has a higher cost of 200), and so will use the fasthost.com url. If that url is not available, it will fall back to the web special remote, and use the other url. + +Suppose that you want to avoid using urls on "slowhost.com", except +as a last resort. + + git-annex initremote --sameas=web slowhost type=web urlinclude='*//slowhost.com/*' + git config remote.slowhost.annex-cost 300 + +Now, `git-annex get` of a file that is on both slowhost.com and another url +will first try the fasthost remote. If fasthost does not support the url, +it will next try the regular "web" remote, which will avoid using +urls that are matched by the configuration of either fasthost or slowhost. +Finally, if it's unable to get the file from some other url, it will +use the slowhost remote to get it from the slow url. diff --git a/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_10_ff4c23b4ccaf649a9914f4aa4435968d._comment b/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_10_ff4c23b4ccaf649a9914f4aa4435968d._comment new file mode 100644 index 0000000000..3d6f075d43 --- /dev/null +++ b/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_10_ff4c23b4ccaf649a9914f4aa4435968d._comment @@ -0,0 +1,22 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 10""" + date="2023-01-10T18:43:38Z" + content=""" +I think this todo is not fully done yet, because every web special +remote that is split out this way still needs to have its cost +configured in each clone. 
+ +So this seems to be pointing to needing a global way to configure +the default cost of a special remote, similar to `git-annex config`. +Local configs would of course need to override that. + +One way that might make sense is to add a cost=N setting to all special +remotes. Then when generating the Remote, it can just look at the value +set there, and use that for Remote.cost. Simple and efficient too. + +(That assumes that only special remotes should have their default cost +be configurable, not git repositories. Which seems right, since +the same git repo can have different costs depending on whether it's +accessed locally or remotely, etc.) +"""]] diff --git a/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_9_4029b264ea2b0d47e6949f832af69282._comment b/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_9_4029b264ea2b0d47e6949f832af69282._comment new file mode 100644 index 0000000000..4c71e2526d --- /dev/null +++ b/doc/todo/Allow_for_URLs_prioritization_WITHIN___40__web__41___remote/comment_9_4029b264ea2b0d47e6949f832af69282._comment @@ -0,0 +1,16 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 9""" + date="2023-01-10T18:41:09Z" + content=""" +100-some lines of code later, I got deprioritizing urls working well. +Eg: + + git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*' + git config remote.slowweb.annex-cost 300 + +Now when the regular "web" special remote is asked to get a file, +it will skip any urls that match the urlinclude of other web remotes. +So, it won't use the slowhost.com urls, leaving those for slowweb to later +use if necessary. +"""]]