web: Add urlinclude and urlexclude configuration settings
Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
parent
8d06930c88
commit
6fa166e1fc
5 changed files with 139 additions and 16 deletions
|
@ -19,6 +19,7 @@ git-annex (10.20221213) UNRELEASED; urgency=medium
|
||||||
in addition to the default web special remote. When --sameas=web is used,
|
in addition to the default web special remote. When --sameas=web is used,
|
||||||
these provide additional names for the web special remote, and may
|
these provide additional names for the web special remote, and may
|
||||||
also have their own additional configuration and cost.
|
also have their own additional configuration and cost.
|
||||||
|
* web: Add urlinclude and urlexclude configuration settings.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Mon, 12 Dec 2022 13:04:54 -0400
|
-- Joey Hess <id@joeyh.name> Mon, 12 Dec 2022 13:04:54 -0400
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,8 @@ module Remote.Web (remote, getWebUrls) where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
import Types.Remote
|
import Types.Remote
|
||||||
|
import Types.ProposedAccepted
|
||||||
|
import Types.Creds
|
||||||
import Remote.Helper.Special
|
import Remote.Helper.Special
|
||||||
import Remote.Helper.ExportImport
|
import Remote.Helper.ExportImport
|
||||||
import qualified Git
|
import qualified Git
|
||||||
|
@ -20,23 +22,36 @@ import Config
|
||||||
import Logs.Web
|
import Logs.Web
|
||||||
import Annex.UUID
|
import Annex.UUID
|
||||||
import Utility.Metered
|
import Utility.Metered
|
||||||
|
import Utility.Glob
|
||||||
import qualified Annex.Url as Url
|
import qualified Annex.Url as Url
|
||||||
import Annex.YoutubeDl
|
import Annex.YoutubeDl
|
||||||
import Annex.SpecialRemote.Config
|
import Annex.SpecialRemote.Config
|
||||||
import Types.Creds
|
|
||||||
|
|
||||||
remote :: RemoteType
|
remote :: RemoteType
|
||||||
remote = RemoteType
|
remote = RemoteType
|
||||||
{ typename = "web"
|
{ typename = "web"
|
||||||
, enumerate = list
|
, enumerate = list
|
||||||
, generate = gen
|
, generate = gen
|
||||||
, configParser = mkRemoteConfigParser []
|
, configParser = mkRemoteConfigParser
|
||||||
|
[ optionalStringParser urlincludeField
|
||||||
|
(FieldDesc "only use urls matching this glob")
|
||||||
|
, optionalStringParser urlexcludeField
|
||||||
|
(FieldDesc "don't use urls that match this glob")
|
||||||
|
]
|
||||||
, setup = setupInstance
|
, setup = setupInstance
|
||||||
, exportSupported = exportUnsupported
|
, exportSupported = exportUnsupported
|
||||||
, importSupported = importUnsupported
|
, importSupported = importUnsupported
|
||||||
, thirdPartyPopulated = False
|
, thirdPartyPopulated = False
|
||||||
}
|
}
|
||||||
|
|
||||||
|
urlincludeField :: RemoteConfigField
|
||||||
|
urlincludeField = Accepted "urlinclude"
|
||||||
|
|
||||||
|
urlexcludeField :: RemoteConfigField
|
||||||
|
urlexcludeField = Accepted "urlexclude"
|
||||||
|
|
||||||
|
data UrlIncludeExclude = UrlIncludeExclude (Maybe Glob) (Maybe Glob)
|
||||||
|
|
||||||
-- The web remote always exists.
|
-- The web remote always exists.
|
||||||
-- (If the web should cease to exist, remove this module and redistribute
|
-- (If the web should cease to exist, remove this module and redistribute
|
||||||
-- a new release to the survivors by carrier pigeon.)
|
-- a new release to the survivors by carrier pigeon.)
|
||||||
|
@ -56,19 +71,22 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle
|
||||||
gen r u rc gc rs = do
|
gen r u rc gc rs = do
|
||||||
c <- parsedRemoteConfig remote rc
|
c <- parsedRemoteConfig remote rc
|
||||||
cst <- remoteCost gc expensiveRemoteCost
|
cst <- remoteCost gc expensiveRemoteCost
|
||||||
|
let urlincludeexclude = UrlIncludeExclude
|
||||||
|
(getglob c urlincludeField)
|
||||||
|
( getglob c urlexcludeField)
|
||||||
return $ Just Remote
|
return $ Just Remote
|
||||||
{ uuid = if u == NoUUID then webUUID else u
|
{ uuid = if u == NoUUID then webUUID else u
|
||||||
, cost = cst
|
, cost = cst
|
||||||
, name = Git.repoDescribe r
|
, name = Git.repoDescribe r
|
||||||
, storeKey = uploadKey
|
, storeKey = uploadKey
|
||||||
, retrieveKeyFile = downloadKey
|
, retrieveKeyFile = downloadKey urlincludeexclude
|
||||||
, retrieveKeyFileCheap = Nothing
|
, retrieveKeyFileCheap = Nothing
|
||||||
-- HttpManagerRestricted is used here, so this is
|
-- HttpManagerRestricted is used here, so this is
|
||||||
-- secure.
|
-- secure.
|
||||||
, retrievalSecurityPolicy = RetrievalAllKeysSecure
|
, retrievalSecurityPolicy = RetrievalAllKeysSecure
|
||||||
, removeKey = dropKey
|
, removeKey = dropKey urlincludeexclude
|
||||||
, lockContent = Nothing
|
, lockContent = Nothing
|
||||||
, checkPresent = checkKey
|
, checkPresent = checkKey urlincludeexclude
|
||||||
, checkPresentCheap = False
|
, checkPresentCheap = False
|
||||||
, exportActions = exportUnsupported
|
, exportActions = exportUnsupported
|
||||||
, importActions = importUnsupported
|
, importActions = importUnsupported
|
||||||
|
@ -93,15 +111,20 @@ gen r u rc gc rs = do
|
||||||
, checkUrl = Nothing
|
, checkUrl = Nothing
|
||||||
, remoteStateHandle = rs
|
, remoteStateHandle = rs
|
||||||
}
|
}
|
||||||
|
where
|
||||||
|
getglob c f = do
|
||||||
|
glob <- getRemoteConfigValue f c
|
||||||
|
Just $ compileGlob glob CaseInsensative (GlobFilePath False)
|
||||||
|
|
||||||
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
|
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
|
||||||
setupInstance ss mu _ c gc = do
|
setupInstance _ mu _ c _ = do
|
||||||
u <- maybe (liftIO genUUID) return mu
|
u <- maybe (liftIO genUUID) return mu
|
||||||
gitConfigSpecialRemote u c [("web", "true")]
|
gitConfigSpecialRemote u c [("web", "true")]
|
||||||
return (c, u)
|
return (c, u)
|
||||||
|
|
||||||
downloadKey :: Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
downloadKey :: UrlIncludeExclude -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
||||||
downloadKey key _af dest p vc = go =<< getWebUrls key
|
downloadKey urlincludeexclude key _af dest p vc =
|
||||||
|
go =<< getWebUrls' urlincludeexclude key
|
||||||
where
|
where
|
||||||
go [] = giveup "no known url"
|
go [] = giveup "no known url"
|
||||||
go urls = dl (partition (not . isyoutube) (map getDownloader urls)) >>= \case
|
go urls = dl (partition (not . isyoutube) (map getDownloader urls)) >>= \case
|
||||||
|
@ -132,12 +155,12 @@ downloadKey key _af dest p vc = go =<< getWebUrls key
|
||||||
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
|
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
|
||||||
uploadKey _ _ _ = giveup "upload to web not supported"
|
uploadKey _ _ _ = giveup "upload to web not supported"
|
||||||
|
|
||||||
dropKey :: Key -> Annex ()
|
dropKey :: UrlIncludeExclude -> Key -> Annex ()
|
||||||
dropKey k = mapM_ (setUrlMissing k) =<< getWebUrls k
|
dropKey urlincludeexclude k = mapM_ (setUrlMissing k) =<< getWebUrls' urlincludeexclude k
|
||||||
|
|
||||||
checkKey :: Key -> Annex Bool
|
checkKey :: UrlIncludeExclude -> Key -> Annex Bool
|
||||||
checkKey key = do
|
checkKey urlincludeexclude key = do
|
||||||
us <- getWebUrls key
|
us <- getWebUrls' urlincludeexclude key
|
||||||
if null us
|
if null us
|
||||||
then return False
|
then return False
|
||||||
else either giveup return =<< checkKey' key us
|
else either giveup return =<< checkKey' key us
|
||||||
|
@ -157,7 +180,18 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
|
||||||
_ -> firsthit rest r a
|
_ -> firsthit rest r a
|
||||||
|
|
||||||
getWebUrls :: Key -> Annex [URLString]
|
getWebUrls :: Key -> Annex [URLString]
|
||||||
getWebUrls key = filter supported <$> getUrls key
|
getWebUrls key = getWebUrls' (UrlIncludeExclude Nothing Nothing) key
|
||||||
|
|
||||||
|
getWebUrls' :: UrlIncludeExclude -> Key -> Annex [URLString]
|
||||||
|
getWebUrls' (UrlIncludeExclude minclude mexclude) key =
|
||||||
|
filter supported <$> getUrls key
|
||||||
where
|
where
|
||||||
supported u = snd (getDownloader u)
|
supported u = supporteddownloader u && isincluded u && notexcluded u
|
||||||
|
supporteddownloader u = snd (getDownloader u)
|
||||||
`elem` [WebDownloader, YoutubeDownloader]
|
`elem` [WebDownloader, YoutubeDownloader]
|
||||||
|
isincluded u = case minclude of
|
||||||
|
Nothing -> True
|
||||||
|
Just glob -> matchGlob glob u
|
||||||
|
notexcluded u = case mexclude of
|
||||||
|
Nothing -> True
|
||||||
|
Just glob -> not (matchGlob glob u)
|
||||||
|
|
|
@ -12,3 +12,16 @@ This special remote uses urls on the web as the source for content.
|
||||||
There are several other ways http can be used to download annexed objects,
|
There are several other ways http can be used to download annexed objects,
|
||||||
including a git remote accessible by http, S3 with a `publicurl` configured,
|
including a git remote accessible by http, S3 with a `publicurl` configured,
|
||||||
and the [[httpalso]] special remote.
|
and the [[httpalso]] special remote.
|
||||||
|
|
||||||
|
## configuration
|
||||||
|
|
||||||
|
These parameters can be passed to `git-annex initremote` or
|
||||||
|
`git-annex enableremote` to configure a web remote:
|
||||||
|
|
||||||
|
* `urlinclude` - Only use urls that match the specified glob.
|
||||||
|
For example, `urlinclude="https://s3.amazonaws.com/*"`
|
||||||
|
Note: Globs are matched case-insensitively.
|
||||||
|
* `urlexclude` - Don't use urls that match the specified glob.
|
||||||
|
For example, to prohibit http urls, but allow https,
|
||||||
|
use `urlexclude="http:*"`
|
||||||
|
Note: Globs are matched case-insensitively.
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
[[!toc ]]
|
||||||
|
|
||||||
|
## basic use
|
||||||
|
|
||||||
The web can be used as a [[special_remote|special_remotes]] too.
|
The web can be used as a [[special_remote|special_remotes]] too.
|
||||||
|
|
||||||
# git annex addurl http://example.com/video.mpeg
|
# git annex addurl http://example.com/video.mpeg
|
||||||
|
@ -48,7 +52,7 @@ You can also attach urls to any file already in the annex:
|
||||||
00000000-0000-0000-0000-000000000001 -- web
|
00000000-0000-0000-0000-000000000001 -- web
|
||||||
27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here
|
27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here
|
||||||
|
|
||||||
## configuring filenames
|
## configuring addurl filenames
|
||||||
|
|
||||||
By default, `addurl` will generate a filename for you. You can use
|
By default, `addurl` will generate a filename for you. You can use
|
||||||
`--file=` to specify the filename to use.
|
`--file=` to specify the filename to use.
|
||||||
|
@ -115,3 +119,25 @@ to work.
|
||||||
## podcasts
|
## podcasts
|
||||||
|
|
||||||
This is done using `git annex importfeed`. See [[downloading podcasts]].
|
This is done using `git annex importfeed`. See [[downloading podcasts]].
|
||||||
|
|
||||||
|
## configuring which url is used when there are several
|
||||||
|
|
||||||
|
An annexed file can have content at multiple urls that git-annex knows
|
||||||
|
about, and git-annex may use any of those urls for downloading a file.
|
||||||
|
|
||||||
|
If some urls are especially fast, you might want to configure
|
||||||
|
which urls git-annex prefers to use first. To accomplish that,
|
||||||
|
you can create additional remotes, that are web special remotes, and are
|
||||||
|
configured to only use the fast urls. Then it's simply a matter of
|
||||||
|
configuring the cost of those remotes.
|
||||||
|
|
||||||
|
For example, suppose that you want to prioritize using urls on "fasthost.com".
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web fasthost type=web urlinclude='*//fasthost.com/*'
|
||||||
|
git config remote.fasthost.annex-cost 150
|
||||||
|
|
||||||
|
Now, `git-annex get` of a file that is on both fasthost.com and another url
|
||||||
|
will prefer to use the fasthost special remote, rather than the web special
|
||||||
|
remote (which has a higher cost of 200), and so will use the fasthost.com
|
||||||
|
url. If that url is not available, it will fall back to the web special
|
||||||
|
remote, and use the other url.
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 4"""
|
||||||
|
date="2023-01-09T20:48:18Z"
|
||||||
|
content="""
|
||||||
|
I've implemented support for multiple web special remotes,
|
||||||
|
and have added configurations urlinclude= and urlexclude=
|
||||||
|
(both case-insensitive globs).
|
||||||
|
|
||||||
|
Example use:
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web fastweb type=web urlinclude='*//fasthost.com/*' autoenable=true
|
||||||
|
git config remote.fastweb.annex-cost 150
|
||||||
|
|
||||||
|
And then `git-annex get --from fastweb` will only use urls on that host,
|
||||||
|
not any other urls. `git-annex get --from web` will still use any urls.
|
||||||
|
The cost of 150 makes `git-annex get` use fastweb before web.
|
||||||
|
|
||||||
|
That's enough to handle the example you gave, just use
|
||||||
|
`urlinclude='*//dandiarchive.s3.amazonaws.com/*'`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
But, I don't think this is quite sufficient. Because it should also be
|
||||||
|
possible to deprioritize urls. And there's not a good way to do that yet.
|
||||||
|
|
||||||
|
In particular, this doesn't work:
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web slowweb type=web urlinclude='*//slowhost.com/*' autoenable=true
|
||||||
|
git config remote.slowweb.annex-cost 300
|
||||||
|
|
||||||
|
Because when getting a file, the main web special remote is tried before
|
||||||
|
this high-cost slowweb one, and will use any url, including
|
||||||
|
slowhost.com urls.
|
||||||
|
|
||||||
|
Now you can instead do this:
|
||||||
|
|
||||||
|
git-annex initremote --sameas=web fastweb type=web urlexclude='*//slowhost.com/*' autoenable=true
|
||||||
|
git config remote.fastweb.annex-cost 150
|
||||||
|
|
||||||
|
But when there's a second slow host, that approach falls down, because you
|
||||||
|
can't specify urlexclude= twice. And even if you could, there would be a
|
||||||
|
distributed configs merging issue same as discussed in comment #3.
|
||||||
|
|
||||||
|
I think what's needed is for the main web special remote to notice that a
|
||||||
|
web remote such as fastweb or slowweb exists, and automatically exclude
|
||||||
|
from using the urls that other web remote is configured to use. Which
|
||||||
|
will be a little bit tricky to implement, but seems doable.
|
||||||
|
"""]]
|
Loading…
Add table
Add a link
Reference in a new issue