respect urlinclude/urlexclude of other web special remotes
When a web special remote does not have urlinclude/urlexclude configured, make it respect the configuration of other web special remotes and avoid using urls that match the config of another. Note that the other web special remote does not have to be enabled. That seems ok, it would have been extra work to check for only ones that are enabled. The implementation does mean that the web special remote re-parses its own config once at startup, as well as re-parsing the configs of any other web special remotes. This should be a very small slowdown unless there are lots of web special remotes. Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
parent
0fc476f16e
commit
8a305e5fa3
5 changed files with 134 additions and 31 deletions
|
@ -26,6 +26,9 @@ import Utility.Glob
|
||||||
import qualified Annex.Url as Url
|
import qualified Annex.Url as Url
|
||||||
import Annex.YoutubeDl
|
import Annex.YoutubeDl
|
||||||
import Annex.SpecialRemote.Config
|
import Annex.SpecialRemote.Config
|
||||||
|
import Logs.Remote
|
||||||
|
|
||||||
|
import qualified Data.Map as M
|
||||||
|
|
||||||
remote :: RemoteType
|
remote :: RemoteType
|
||||||
remote = RemoteType
|
remote = RemoteType
|
||||||
|
@ -44,14 +47,6 @@ remote = RemoteType
|
||||||
, thirdPartyPopulated = False
|
, thirdPartyPopulated = False
|
||||||
}
|
}
|
||||||
|
|
||||||
urlincludeField :: RemoteConfigField
|
|
||||||
urlincludeField = Accepted "urlinclude"
|
|
||||||
|
|
||||||
urlexcludeField :: RemoteConfigField
|
|
||||||
urlexcludeField = Accepted "urlexclude"
|
|
||||||
|
|
||||||
data UrlIncludeExclude = UrlIncludeExclude (Maybe Glob) (Maybe Glob)
|
|
||||||
|
|
||||||
-- The web remote always exists.
|
-- The web remote always exists.
|
||||||
-- (If the web should cease to exist, remove this module and redistribute
|
-- (If the web should cease to exist, remove this module and redistribute
|
||||||
-- a new release to the survivors by carrier pigeon.)
|
-- a new release to the survivors by carrier pigeon.)
|
||||||
|
@ -71,9 +66,7 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle
|
||||||
gen r u rc gc rs = do
|
gen r u rc gc rs = do
|
||||||
c <- parsedRemoteConfig remote rc
|
c <- parsedRemoteConfig remote rc
|
||||||
cst <- remoteCost gc expensiveRemoteCost
|
cst <- remoteCost gc expensiveRemoteCost
|
||||||
let urlincludeexclude = UrlIncludeExclude
|
urlincludeexclude <- mkUrlIncludeExclude c
|
||||||
(getglob c urlincludeField)
|
|
||||||
( getglob c urlexcludeField)
|
|
||||||
return $ Just Remote
|
return $ Just Remote
|
||||||
{ uuid = if u == NoUUID then webUUID else u
|
{ uuid = if u == NoUUID then webUUID else u
|
||||||
, cost = cst
|
, cost = cst
|
||||||
|
@ -111,10 +104,6 @@ gen r u rc gc rs = do
|
||||||
, checkUrl = Nothing
|
, checkUrl = Nothing
|
||||||
, remoteStateHandle = rs
|
, remoteStateHandle = rs
|
||||||
}
|
}
|
||||||
where
|
|
||||||
getglob c f = do
|
|
||||||
glob <- getRemoteConfigValue f c
|
|
||||||
Just $ compileGlob glob CaseInsensative (GlobFilePath False)
|
|
||||||
|
|
||||||
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
|
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
|
||||||
setupInstance _ mu _ c _ = do
|
setupInstance _ mu _ c _ = do
|
||||||
|
@ -180,18 +169,66 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
|
||||||
_ -> firsthit rest r a
|
_ -> firsthit rest r a
|
||||||
|
|
||||||
getWebUrls :: Key -> Annex [URLString]
|
getWebUrls :: Key -> Annex [URLString]
|
||||||
getWebUrls key = getWebUrls' (UrlIncludeExclude Nothing Nothing) key
|
getWebUrls key = getWebUrls' alwaysInclude key
|
||||||
|
|
||||||
getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString]
|
getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString]
|
||||||
getWebUrls' (UrlIncludeExclude minclude mexclude) key =
|
getWebUrls' urlincludeexclude key =
|
||||||
filter supported <$> getUrls key
|
filter supported <$> getUrls key
|
||||||
where
|
where
|
||||||
supported u = supporteddownloader u && isincluded u && notexcluded u
|
supported u = supporteddownloader u
|
||||||
|
&& checkUrlIncludeExclude urlincludeexclude u
|
||||||
supporteddownloader u = snd (getDownloader u)
|
supporteddownloader u = snd (getDownloader u)
|
||||||
`elem` [WebDownloader, YoutubeDownloader]
|
`elem` [WebDownloader, YoutubeDownloader]
|
||||||
isincluded u = case minclude of
|
|
||||||
Nothing -> True
|
urlincludeField :: RemoteConfigField
|
||||||
Just glob -> matchGlob glob u
|
urlincludeField = Accepted "urlinclude"
|
||||||
notexcluded u = case mexclude of
|
|
||||||
Nothing -> True
|
urlexcludeField :: RemoteConfigField
|
||||||
Just glob -> not (matchGlob glob u)
|
urlexcludeField = Accepted "urlexclude"
|
||||||
|
|
||||||
|
-- | Decides which urls a web special remote is allowed to use.
-- The wrapped predicate returns True for urls that may be used.
data UrlIncludeExclude = UrlIncludeExclude { checkUrlIncludeExclude :: URLString -> Bool }
|
||||||
|
|
||||||
|
-- | A filter that accepts every url.
alwaysInclude :: UrlIncludeExclude
alwaysInclude = UrlIncludeExclude (const True)
|
||||||
|
|
||||||
|
-- | Build the url filter for a web special remote from its parsed config.
--
-- When the config sets urlinclude and/or urlexclude, the filter accepts a
-- url only if it matches the include glob (when one is set) and does not
-- match the exclude glob (when one is set). Globs are compiled
-- case-insensitively.
--
-- When neither is set, the filter is derived from the configs of all web
-- special remotes found in the remote config map: it accepts only urls
-- that none of those configs accept.
mkUrlIncludeExclude :: ParsedRemoteConfig -> Annex UrlIncludeExclude
mkUrlIncludeExclude = go fallback
  where
    -- Use the globs from the config when at least one is set; otherwise
    -- run the supplied default action.
    go b pc = case (getglob urlincludeField pc, getglob urlexcludeField pc) of
        (Nothing, Nothing) -> b
        (minclude, mexclude) -> mk minclude mexclude

    -- Nothing when the field is not set in the config.
    getglob f pc = do
        glob <- getRemoteConfigValue f pc
        Just $ compileGlob glob CaseInsensative (GlobFilePath False)

    mk minclude mexclude = pure $ UrlIncludeExclude
        { checkUrlIncludeExclude = \u -> and
            [ case minclude of
                Just glob -> matchGlob glob u
                Nothing -> True
            , case mexclude of
                Nothing -> True
                Just glob -> not (matchGlob glob u)
            ]
        }

    -- When nothing to include or exclude is specified, only include
    -- urls that are not matched by the config of other web special remotes.
    fallback = do
        rcs <- M.elems . M.filter iswebremote <$> remoteConfigMap
        -- A web remote config with no globs of its own yields neverinclude
        -- here, so it claims no urls and cannot recurse into fallback.
        l <- forM rcs $ \rc ->
            parsedRemoteConfig remote rc
                >>= go (pure neverinclude)
        pure $ UrlIncludeExclude
            { checkUrlIncludeExclude = \u ->
                not (any (\c -> checkUrlIncludeExclude c u) l)
            }

    iswebremote rc = (fromProposedAccepted <$> M.lookup typeField rc)
        == Just (typename remote)

    neverinclude = UrlIncludeExclude { checkUrlIncludeExclude = const False }
|
||||||
|
|
|
@ -20,8 +20,23 @@ These parameters can be passed to `git annex initremote` or
|
||||||
|
|
||||||
* `urlinclude` - Only use urls that match the specified glob.
|
* `urlinclude` - Only use urls that match the specified glob.
|
||||||
For example, `urlinclude="https://s3.amazonaws.com/*"`
|
For example, `urlinclude="https://s3.amazonaws.com/*"`
|
||||||
Note: Globs are matched case-insensitively.
|
|
||||||
* `urlexclude` - Don't use urls that match the specified glob.
|
* `urlexclude` - Don't use urls that match the specified glob.
|
||||||
For example, to prohibit http urls, but allow https,
|
For example, to prohibit http urls, but allow https,
|
||||||
use `urlexclude="http:*"`
|
use `urlexclude="http:*"`
|
||||||
Note: Globs are matched case-insensitively.
|
|
||||||
|
Globs are matched case-insensitively.
|
||||||
|
|
||||||
|
When there are multiple special remotes of type web, and some are not
|
||||||
|
configured with `urlinclude` and/or `urlexclude`, those will avoid using
|
||||||
|
urls that are matched by the configuration of other web remotes.
|
||||||
|
|
||||||
|
For example, this creates a second web special remote named "slowweb" that
|
||||||
|
is only used for urls on one host, and that has a higher cost than the
|
||||||
|
"web" special remote. With this configuration, `git-annex get` will first
|
||||||
|
try to get the file from the "web" special remote, which will avoid
|
||||||
|
using any urls that match slowweb's urlinclude. Only if the content
|
||||||
|
can't be downloaded from "web" (or some other remote) will it fall back
|
||||||
|
to downloading from slowweb.
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*'
|
||||||
|
git config remote.slowweb.annex-cost 300
|
||||||
|
|
|
@ -125,11 +125,11 @@ This is done using `git annex importfeed`. See [[downloading podcasts]].
|
||||||
An annexed file can have content at multiple urls that git-annex knows
|
An annexed file can have content at multiple urls that git-annex knows
|
||||||
about, and git-annex may use any of those urls for downloading a file.
|
about, and git-annex may use any of those urls for downloading a file.
|
||||||
|
|
||||||
If some urls are especially fast, you might want to configure
|
If some urls are especially fast, or especially slow, you might want to
|
||||||
which urls git-annex prefers to use first. To accomplish that,
|
configure which urls git-annex prefers to use first, or should only use as
|
||||||
you can create additional remotes, that are web special remotes, and are
|
a last resort. To accomplish that, you can create additional remotes, that
|
||||||
configured to only use the fast urls. Then it's simply a matter of
|
are web special remotes, and are configured to only be used for some urls.
|
||||||
configuring the cost of those remotes.
|
Then it's simply a matter of configuring the cost of those remotes.
|
||||||
|
|
||||||
For example, suppose that you want to prioritize using urls on "fasthost.com".
|
For example, suppose that you want to prioritize using urls on "fasthost.com".
|
||||||
|
|
||||||
|
@ -141,3 +141,16 @@ will prefer to use the fasthost special remote, rather than the web special
|
||||||
remote (which has a higher cost of 200), and so will use the fasthost.com
|
remote (which has a higher cost of 200), and so will use the fasthost.com
|
||||||
url. If that url is not available, it will fall back to the web special
|
url. If that url is not available, it will fall back to the web special
|
||||||
remote, and use the other url.
|
remote, and use the other url.
|
||||||
|
|
||||||
|
Suppose that you want to avoid using urls on "slowhost.com", except
|
||||||
|
as a last resort.
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web slowhost type=web urlinclude='*//slowhost.com/*'
|
||||||
|
git config remote.slowhost.annex-cost 300
|
||||||
|
|
||||||
|
Now, `git-annex get` of a file that is on both slowhost.com and another url
|
||||||
|
will first try the fasthost remote. If fasthost does not support the url,
|
||||||
|
it will next try the regular "web" remote, which will avoid using
|
||||||
|
urls that are matched by the configuration of either fasthost or slowhost.
|
||||||
|
Finally, if it's unable to get the file from some other url, it will
|
||||||
|
use the slowhost remote to get it from the slow url.
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 10"""
|
||||||
|
date="2023-01-10T18:43:38Z"
|
||||||
|
content="""
|
||||||
|
I think this todo is not fully done yet, because every web special
|
||||||
|
remote that is split out this way still needs to have its cost
|
||||||
|
configured in each clone.
|
||||||
|
|
||||||
|
So this seems to be pointing to needing a global way to configure
|
||||||
|
the default cost of a special remote, similar to `git-annex config`.
|
||||||
|
Local configs would of course need to override that.
|
||||||
|
|
||||||
|
One way that might make sense is to add a cost=N setting to all special
|
||||||
|
remotes. Then when generating the Remote, it can just look at the value
|
||||||
|
set there, and use that for Remote.cost. Simple and efficient too.
|
||||||
|
|
||||||
|
(That assumes that only special remotes should have their default cost
|
||||||
|
be configurable, not git repositories. Which seems right, since
|
||||||
|
the same git repo can have different costs depending on whether it's
|
||||||
|
accessed locally or remotely, etc.)
|
||||||
|
"""]]
|
|
@ -0,0 +1,16 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 9"""
|
||||||
|
date="2023-01-10T18:41:09Z"
|
||||||
|
content="""
|
||||||
|
100-some lines of code later, I got deprioritizing urls working well.
|
||||||
|
Eg:
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*'
|
||||||
|
git config remote.slowweb.annex-cost 300
|
||||||
|
|
||||||
|
Now when the regular "web" special remote is asked to get a file,
|
||||||
|
it will skip any urls that match the urlinclude of other web remotes.
|
||||||
|
So, it won't use the slowhost.com urls, leaving those for slowweb to later
|
||||||
|
use if necessary.
|
||||||
|
"""]]
|
Loading…
Reference in a new issue