web: Add urlinclude and urlexclude configuration settings
Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
parent
8d06930c88
commit
6fa166e1fc
5 changed files with 139 additions and 16 deletions
|
@ -19,6 +19,7 @@ git-annex (10.20221213) UNRELEASED; urgency=medium
|
|||
in addition to the default web special remote. When --sameas=web is used,
|
||||
these provide additional names for the web special remote, and may
|
||||
also have their own additional configuration and cost.
|
||||
* web: Add urlinclude and urlexclude configuration settings.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 12 Dec 2022 13:04:54 -0400
|
||||
|
||||
|
|
|
@ -9,6 +9,8 @@ module Remote.Web (remote, getWebUrls) where
|
|||
|
||||
import Annex.Common
|
||||
import Types.Remote
|
||||
import Types.ProposedAccepted
|
||||
import Types.Creds
|
||||
import Remote.Helper.Special
|
||||
import Remote.Helper.ExportImport
|
||||
import qualified Git
|
||||
|
@ -20,23 +22,36 @@ import Config
|
|||
import Logs.Web
|
||||
import Annex.UUID
|
||||
import Utility.Metered
|
||||
import Utility.Glob
|
||||
import qualified Annex.Url as Url
|
||||
import Annex.YoutubeDl
|
||||
import Annex.SpecialRemote.Config
|
||||
import Types.Creds
|
||||
|
||||
remote :: RemoteType
|
||||
remote = RemoteType
|
||||
{ typename = "web"
|
||||
, enumerate = list
|
||||
, generate = gen
|
||||
, configParser = mkRemoteConfigParser []
|
||||
, configParser = mkRemoteConfigParser
|
||||
[ optionalStringParser urlincludeField
|
||||
(FieldDesc "only use urls matching this glob")
|
||||
, optionalStringParser urlexcludeField
|
||||
(FieldDesc "don't use urls that match this glob")
|
||||
]
|
||||
, setup = setupInstance
|
||||
, exportSupported = exportUnsupported
|
||||
, importSupported = importUnsupported
|
||||
, thirdPartyPopulated = False
|
||||
}
|
||||
|
||||
urlincludeField :: RemoteConfigField
|
||||
urlincludeField = Accepted "urlinclude"
|
||||
|
||||
urlexcludeField :: RemoteConfigField
|
||||
urlexcludeField = Accepted "urlexclude"
|
||||
|
||||
-- Optional url filters for a web remote: the first glob is the
-- urlinclude setting and the second is the urlexclude setting.
-- Nothing means that the corresponding setting is not configured,
-- so no filtering is done for it.
data UrlIncludeExclude = UrlIncludeExclude (Maybe Glob) (Maybe Glob)
|
||||
|
||||
-- The web remote always exists.
|
||||
-- (If the web should cease to exist, remove this module and redistribute
|
||||
-- a new release to the survivors by carrier pigeon.)
|
||||
|
@ -56,19 +71,22 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle
|
|||
gen r u rc gc rs = do
|
||||
c <- parsedRemoteConfig remote rc
|
||||
cst <- remoteCost gc expensiveRemoteCost
|
||||
let urlincludeexclude = UrlIncludeExclude
|
||||
(getglob c urlincludeField)
|
||||
( getglob c urlexcludeField)
|
||||
return $ Just Remote
|
||||
{ uuid = if u == NoUUID then webUUID else u
|
||||
, cost = cst
|
||||
, name = Git.repoDescribe r
|
||||
, storeKey = uploadKey
|
||||
, retrieveKeyFile = downloadKey
|
||||
, retrieveKeyFile = downloadKey urlincludeexclude
|
||||
, retrieveKeyFileCheap = Nothing
|
||||
-- HttpManagerRestricted is used here, so this is
|
||||
-- secure.
|
||||
, retrievalSecurityPolicy = RetrievalAllKeysSecure
|
||||
, removeKey = dropKey
|
||||
, removeKey = dropKey urlincludeexclude
|
||||
, lockContent = Nothing
|
||||
, checkPresent = checkKey
|
||||
, checkPresent = checkKey urlincludeexclude
|
||||
, checkPresentCheap = False
|
||||
, exportActions = exportUnsupported
|
||||
, importActions = importUnsupported
|
||||
|
@ -93,15 +111,20 @@ gen r u rc gc rs = do
|
|||
, checkUrl = Nothing
|
||||
, remoteStateHandle = rs
|
||||
}
|
||||
where
|
||||
getglob c f = do
|
||||
glob <- getRemoteConfigValue f c
|
||||
Just $ compileGlob glob CaseInsensative (GlobFilePath False)
|
||||
|
||||
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
|
||||
setupInstance ss mu _ c gc = do
|
||||
setupInstance _ mu _ c _ = do
|
||||
u <- maybe (liftIO genUUID) return mu
|
||||
gitConfigSpecialRemote u c [("web", "true")]
|
||||
return (c, u)
|
||||
|
||||
downloadKey :: Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
||||
downloadKey key _af dest p vc = go =<< getWebUrls key
|
||||
downloadKey :: UrlIncludeExclude -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
||||
downloadKey urlincludeexclude key _af dest p vc =
|
||||
go =<< getWebUrls' urlincludeexclude key
|
||||
where
|
||||
go [] = giveup "no known url"
|
||||
go urls = dl (partition (not . isyoutube) (map getDownloader urls)) >>= \case
|
||||
|
@ -132,12 +155,12 @@ downloadKey key _af dest p vc = go =<< getWebUrls key
|
|||
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
|
||||
uploadKey _ _ _ = giveup "upload to web not supported"
|
||||
|
||||
dropKey :: Key -> Annex ()
|
||||
dropKey k = mapM_ (setUrlMissing k) =<< getWebUrls k
|
||||
dropKey :: UrlIncludeExclude -> Key -> Annex ()
|
||||
dropKey urlincludeexclude k = mapM_ (setUrlMissing k) =<< getWebUrls' urlincludeexclude k
|
||||
|
||||
checkKey :: Key -> Annex Bool
|
||||
checkKey key = do
|
||||
us <- getWebUrls key
|
||||
checkKey :: UrlIncludeExclude -> Key -> Annex Bool
|
||||
checkKey urlincludeexclude key = do
|
||||
us <- getWebUrls' urlincludeexclude key
|
||||
if null us
|
||||
then return False
|
||||
else either giveup return =<< checkKey' key us
|
||||
|
@ -157,7 +180,18 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
|
|||
_ -> firsthit rest r a
|
||||
|
||||
getWebUrls :: Key -> Annex [URLString]
|
||||
getWebUrls key = filter supported <$> getUrls key
|
||||
getWebUrls key = getWebUrls' (UrlIncludeExclude Nothing Nothing) key
|
||||
|
||||
getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString]
|
||||
getWebUrls' (UrlIncludeExclude minclude mexclude) key =
|
||||
filter supported <$> getUrls key
|
||||
where
|
||||
supported u = snd (getDownloader u)
|
||||
supported u = supporteddownloader u && isincluded u && notexcluded u
|
||||
supporteddownloader u = snd (getDownloader u)
|
||||
`elem` [WebDownloader, YoutubeDownloader]
|
||||
isincluded u = case minclude of
|
||||
Nothing -> True
|
||||
Just glob -> matchGlob glob u
|
||||
notexcluded u = case mexclude of
|
||||
Nothing -> True
|
||||
Just glob -> not (matchGlob glob u)
|
||||
|
|
|
@ -12,3 +12,16 @@ This special remote uses urls on the web as the source for content.
|
|||
There are several other ways http can be used to download annexed objects,
|
||||
including a git remote accessible by http, S3 with a `publicurl` configured,
|
||||
and the [[httpalso]] special remote.
|
||||
|
||||
## configuration
|
||||
|
||||
These parameters can be passed to `git-annex initremote` or
|
||||
`git-annex enableremote` to configure a web remote:
|
||||
|
||||
* `urlinclude` - Only use urls that match the specified glob.
|
||||
For example, `urlinclude="https://s3.amazonaws.com/*"`
|
||||
Note: Globs are matched case-insensitively.
|
||||
* `urlexclude` - Don't use urls that match the specified glob.
|
||||
For example, to prohibit http urls, but allow https,
|
||||
use `urlexclude="http:*"`
|
||||
Note: Globs are matched case-insensitively.
|
||||
|
|
|
@ -1,3 +1,7 @@
|
|||
[[!toc ]]
|
||||
|
||||
## basic use
|
||||
|
||||
The web can be used as a [[special_remote|special_remotes]] too.
|
||||
|
||||
# git annex addurl http://example.com/video.mpeg
|
||||
|
@ -48,7 +52,7 @@ You can also attach urls to any file already in the annex:
|
|||
00000000-0000-0000-0000-000000000001 -- web
|
||||
27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here
|
||||
|
||||
## configuring filenames
|
||||
## configuring addurl filenames
|
||||
|
||||
By default, `addurl` will generate a filename for you. You can use
|
||||
`--file=` to specify the filename to use.
|
||||
|
@ -115,3 +119,25 @@ to work.
|
|||
## podcasts
|
||||
|
||||
This is done using `git annex importfeed`. See [[downloading podcasts]].
|
||||
|
||||
## configuring which url is used when there are several
|
||||
|
||||
An annexed file can have content at multiple urls that git-annex knows
|
||||
about, and git-annex may use any of those urls for downloading a file.
|
||||
|
||||
If some urls are especially fast, you might want to configure
|
||||
which urls git-annex prefers to use first. To accomplish that,
|
||||
you can create additional remotes, that are web special remotes, and are
|
||||
configured to only use the fast urls. Then it's simply a matter of
|
||||
configuring the cost of those remotes.
|
||||
|
||||
For example, suppose that you want to prioritize using urls on "fasthost.com".
|
||||
|
||||
git-annex initremote --sameas=web fasthost type=web urlinclude='*//fasthost.com/*'
|
||||
git config remote.fasthost.annex-cost 150
|
||||
|
||||
Now, `git-annex get` of a file that is on both fasthost.com and another url
|
||||
will prefer to use the fasthost special remote, rather than the web special
|
||||
remote (which has a higher cost of 200), and so will use the fasthost.com
|
||||
url. If that url is not available, it will fall back to the web special
|
||||
remote, and use the other url.
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 4"""
|
||||
date="2023-01-09T20:48:18Z"
|
||||
content="""
|
||||
I've implemented support for multiple web special remotes,
|
||||
and have added configurations urlinclude= and urlexclude=
|
||||
(both case-insensitive globs).
|
||||
|
||||
Example use:
|
||||
|
||||
git-annex initremote --sameas=web fastweb type=web urlinclude='*//fasthost.com/*' autoenable=true
|
||||
git config remote.fastweb.annex-cost 150
|
||||
|
||||
And then `git-annex get --from fastweb` will only use urls on that host,
|
||||
not any other urls. `git-annex get --from web` will still use any urls.
|
||||
The cost of 150 makes `git-annex get` use fastweb before web.
|
||||
|
||||
That's enough to handle the example you gave, just use
|
||||
`urlinclude='*//dandiarchive.s3.amazonaws.com/*'`
|
||||
|
||||
---
|
||||
|
||||
But, I don't think this is quite sufficient. Because it should also be
|
||||
possible to deprioritize urls. And there's not a good way to do that yet.
|
||||
|
||||
In particular, this doesn't work:
|
||||
|
||||
git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*' autoenable=true
|
||||
git config remote.slowweb.annex-cost 300
|
||||
|
||||
Because when getting a file, the main web special remote is tried before
|
||||
this high-cost slowweb one, and will use any url, including
|
||||
slowhost.com urls.
|
||||
|
||||
Now you can instead do this:
|
||||
|
||||
git-annex initremote --sameas=web fastweb type=web urlexclude='*//slowhost.com/*' autoenable=true
|
||||
git config remote.fastweb.annex-cost 150
|
||||
|
||||
But when there's a second slow host, that approach falls down, because you
|
||||
can't specify urlexclude= twice. And even if you could, there would be a
|
||||
distributed configs merging issue same as discussed in comment #3.
|
||||
|
||||
I think what's needed is for the main web special remote to notice that a
|
||||
web remote such as fastweb or slowweb exists, and automatically exclude
|
||||
from using the urls that other web remote is configured to use. Which
|
||||
will be a little bit tricky to implement, but seems doable.
|
||||
"""]]
|
Loading…
Add table
Reference in a new issue