web: Add urlinclude and urlexclude configuration settings

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2023-01-09 17:16:53 -04:00
parent 8d06930c88
commit 6fa166e1fc
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 139 additions and 16 deletions

View file

@ -19,6 +19,7 @@ git-annex (10.20221213) UNRELEASED; urgency=medium
in addition to the default web special remote. When --sameas=web is used,
these provide additional names for the web special remote, and may
also have their own additional configuration and cost.
* web: Add urlinclude and urlexclude configuration settings.
-- Joey Hess <id@joeyh.name> Mon, 12 Dec 2022 13:04:54 -0400

View file

@ -9,6 +9,8 @@ module Remote.Web (remote, getWebUrls) where
import Annex.Common
import Types.Remote
import Types.ProposedAccepted
import Types.Creds
import Remote.Helper.Special
import Remote.Helper.ExportImport
import qualified Git
@ -20,23 +22,36 @@ import Config
import Logs.Web
import Annex.UUID
import Utility.Metered
import Utility.Glob
import qualified Annex.Url as Url
import Annex.YoutubeDl
import Annex.SpecialRemote.Config
import Types.Creds
-- | The 'RemoteType' describing the web special remote.
--
-- Its config parser accepts two optional string fields, 'urlincludeField'
-- and 'urlexcludeField'. Each holds a glob that limits which urls an
-- instance of this remote will use.
remote :: RemoteType
remote = RemoteType
{ typename = "web"
, enumerate = list
, generate = gen
, configParser = mkRemoteConfigParser []
, configParser = mkRemoteConfigParser
[ optionalStringParser urlincludeField
(FieldDesc "only use urls matching this glob")
, optionalStringParser urlexcludeField
(FieldDesc "don't use urls that match this glob")
]
, setup = setupInstance
-- Neither exporting to nor importing from this remote type is supported.
, exportSupported = exportUnsupported
, importSupported = importUnsupported
, thirdPartyPopulated = False
}
-- | Name of the remote config field holding the glob that a url has to
-- match in order to be used by a web remote.
urlincludeField :: RemoteConfigField
urlincludeField = Accepted "urlinclude"
-- | Name of the remote config field holding the glob that prevents a
-- matching url from being used by a web remote.
urlexcludeField :: RemoteConfigField
urlexcludeField = Accepted "urlexclude"
-- | The url filters of one web remote instance.
--
-- The first glob, when present, has to match a url for it to be used.
-- The second glob, when present, must not match it. A filter with both
-- set to Nothing accepts every url.
data UrlIncludeExclude = UrlIncludeExclude (Maybe Glob) (Maybe Glob)
-- The web remote always exists.
-- (If the web should cease to exist, remove this module and redistribute
-- a new release to the survivors by carrier pigeon.)
@ -56,19 +71,22 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle
gen r u rc gc rs = do
c <- parsedRemoteConfig remote rc
cst <- remoteCost gc expensiveRemoteCost
let urlincludeexclude = UrlIncludeExclude
(getglob c urlincludeField)
( getglob c urlexcludeField)
return $ Just Remote
{ uuid = if u == NoUUID then webUUID else u
, cost = cst
, name = Git.repoDescribe r
, storeKey = uploadKey
, retrieveKeyFile = downloadKey
, retrieveKeyFile = downloadKey urlincludeexclude
, retrieveKeyFileCheap = Nothing
-- HttpManagerRestricted is used here, so this is
-- secure.
, retrievalSecurityPolicy = RetrievalAllKeysSecure
, removeKey = dropKey
, removeKey = dropKey urlincludeexclude
, lockContent = Nothing
, checkPresent = checkKey
, checkPresent = checkKey urlincludeexclude
, checkPresentCheap = False
, exportActions = exportUnsupported
, importActions = importUnsupported
@ -93,15 +111,20 @@ gen r u rc gc rs = do
, checkUrl = Nothing
, remoteStateHandle = rs
}
where
-- Runs in the Maybe monad: when field f is not set in the parsed
-- config c the result is Nothing, otherwise the configured value is
-- compiled to a case-insensitive glob.
-- NOTE(review): GlobFilePath False presumably disables file path
-- specific matching rules, since urls are matched here and not
-- paths -- confirm against Utility.Glob.
getglob c f = do
glob <- getRemoteConfigValue f c
Just $ compileGlob glob CaseInsensative (GlobFilePath False)
-- | Sets up an instance of the web special remote.
--
-- Uses the provided UUID when there is one, and otherwise generates a
-- fresh one with genUUID. The UUID and config are passed to
-- gitConfigSpecialRemote together with the ("web", "true") setting.
-- The config is returned unchanged, paired with the UUID.
--
-- The setup stage, the credentials and the git config of the remote are
-- not used.
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
setupInstance ss mu _ c gc = do
setupInstance _ mu _ c _ = do
u <- maybe (liftIO genUUID) return mu
gitConfigSpecialRemote u c [("web", "true")]
return (c, u)
downloadKey :: Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
downloadKey key _af dest p vc = go =<< getWebUrls key
downloadKey :: UrlIncludeExclude -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
downloadKey urlincludeexclude key _af dest p vc =
go =<< getWebUrls' urlincludeexclude key
where
go [] = giveup "no known url"
go urls = dl (partition (not . isyoutube) (map getDownloader urls)) >>= \case
@ -132,12 +155,12 @@ downloadKey key _af dest p vc = go =<< getWebUrls key
-- | Storing content on the web remote is not supported, so every attempt
-- gives up with an error message, whatever the arguments are.
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
uploadKey _key _file _meter = giveup "upload to web not supported"
-- | Drops a key from a web remote by calling setUrlMissing on each of
-- the key's web urls that passes the remote's include/exclude filter.
-- Urls rejected by the filter are not touched.
dropKey :: Key -> Annex ()
dropKey k = mapM_ (setUrlMissing k) =<< getWebUrls k
dropKey :: UrlIncludeExclude -> Key -> Annex ()
dropKey urlincludeexclude k = mapM_ (setUrlMissing k) =<< getWebUrls' urlincludeexclude k
-- | Checks whether a key is present on a web remote.
--
-- Only the key's web urls that pass the remote's include/exclude filter
-- are considered. When there are none, the key is reported as not
-- present without checking any url. Otherwise the urls are handed to
-- checkKey'. A Left result from it is turned into a giveup, and a Right
-- result is returned as is.
checkKey :: Key -> Annex Bool
checkKey key = do
us <- getWebUrls key
checkKey :: UrlIncludeExclude -> Key -> Annex Bool
checkKey urlincludeexclude key = do
us <- getWebUrls' urlincludeexclude key
if null us
then return False
else either giveup return =<< checkKey' key us
@ -157,7 +180,18 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
_ -> firsthit rest r a
-- | The urls of a key whose downloader is WebDownloader or
-- YoutubeDownloader, with no include/exclude filtering applied.
getWebUrls :: Key -> Annex [URLString]
getWebUrls key = filter supported <$> getUrls key
getWebUrls key = getWebUrls' (UrlIncludeExclude Nothing Nothing) key
-- | The urls of a key that a web remote with the given include/exclude
-- filter will use.
--
-- A url is kept only when all three of these hold: its downloader is
-- WebDownloader or YoutubeDownloader, it matches the include glob (if
-- there is one), and it does not match the exclude glob (if there is one).
getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString]
getWebUrls' (UrlIncludeExclude minclude mexclude) key =
filter supported <$> getUrls key
where
supported u = snd (getDownloader u)
supported u = supporteddownloader u && isincluded u && notexcluded u
-- Urls recorded for other downloaders are not handled by this remote.
supporteddownloader u = snd (getDownloader u)
`elem` [WebDownloader, YoutubeDownloader]
-- With no include glob configured, every url counts as included.
isincluded u = case minclude of
Nothing -> True
Just glob -> matchGlob glob u
-- With no exclude glob configured, no url is excluded.
notexcluded u = case mexclude of
Nothing -> True
Just glob -> not (matchGlob glob u)

View file

@ -12,3 +12,16 @@ This special remote uses urls on the web as the source for content.
There are several other ways http can be used to download annexed objects,
including a git remote accessible by http, S3 with a `publicurl` configured,
and the [[httpalso]] special remote.
## configuration
These parameters can be passed to `git-annex initremote` or
`git-annex enableremote` to configure a web remote:
* `urlinclude` - Only use urls that match the specified glob.
For example, `urlinclude="https://s3.amazonaws.com/*"`
Note: Globs are matched case-insensitively.
* `urlexclude` - Don't use urls that match the specified glob.
For example, to prohibit http urls, but allow https,
use `urlexclude="http:*"`
Note: Globs are matched case-insensitively.

View file

@ -1,3 +1,7 @@
[[!toc ]]
## basic use
The web can be used as a [[special_remote|special_remotes]] too.
# git annex addurl http://example.com/video.mpeg
@ -48,7 +52,7 @@ You can also attach urls to any file already in the annex:
00000000-0000-0000-0000-000000000001 -- web
27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here
## configuring filenames
## configuring addurl filenames
By default, `addurl` will generate a filename for you. You can use
`--file=` to specify the filename to use.
@ -115,3 +119,25 @@ to work.
## podcasts
This is done using `git annex importfeed`. See [[downloading podcasts]].
## configuring which url is used when there are several
An annexed file can have content at multiple urls that git-annex knows
about, and git-annex may use any of those urls for downloading a file.
If some urls are especially fast, you might want to configure
which urls git-annex prefers to use first. To accomplish that,
you can create additional web special remotes that are
configured to only use the fast urls. Then it's simply a matter of
configuring the cost of those remotes.
For example, suppose that you want to prioritize using urls on "fasthost.com".
git-annex initremote --sameas=web fasthost type=web urlinclude='*//fasthost.com/*'
git config remote.fasthost.annex-cost 150
Now, `git-annex get` of a file that is on both fasthost.com and another url
will prefer to use the fasthost special remote, rather than the web special
remote (which has a higher cost of 200), and so will use the fasthost.com
url. If that url is not available, it will fall back to the web special
remote, and use the other url.

View file

@ -0,0 +1,49 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2023-01-09T20:48:18Z"
content="""
I've implemented support for multiple web special remotes,
and have added configurations urlinclude= and urlexclude=
(both case-insensitive globs).
Example use:
git-annex initremote --sameas=web fastweb type=web urlinclude='*//fasthost.com/*' autoenable=true
git config remote.fastweb.annex-cost 150
And then `git-annex get --from fastweb` will only use urls on that host,
not any other urls. `git-annex get --from web` will still use any urls.
The cost of 150 makes `git-annex get` use fastweb before web.
That's enough to handle the example you gave, just use
`urlinclude='*//dandiarchive.s3.amazonaws.com/*'`
---
But, I don't think this is quite sufficient. Because it should also be
possible to deprioritize urls. And there's not a good way to do that yet.
In particular, this doesn't work:
git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*' autoenable=true
git config remote.slowweb.annex-cost 300
Because when getting a file, the main web special remote is tried before
this high-cost slowweb one, and will use any url, including
slowhost.com urls.
Now you can instead do this:
git-annex initremote --sameas=web fastweb type=web urlexclude='*//slowhost.com/*' autoenable=true
git config remote.fastweb.annex-cost 150
But when there's a second slow host, that approach falls down, because you
can't specify urlexclude= twice. And even if you could, there would be a
distributed configs merging issue same as discussed in comment #3.
I think what's needed is for the main web special remote to notice that a
web remote such as fastweb or slowweb exists, and automatically avoid
using the urls that the other web remote is configured to use. Which
will be a little bit tricky to implement, but seems doable.
"""]]