Added http special remote, which is useful for accessing other remotes that publish content stored in them via http/https.
With automatic layout learning!
This commit is contained in:
parent
fccc9ab442
commit
571ec900ac
15 changed files with 287 additions and 5 deletions
Annex
CHANGELOGRemote
doc
git-annex.mdwnspecial_remotes.mdwn
git-annex.cabalspecial_remotes
tips
todo
generic_readonly_http_remote.mdwn
generic_readonly_http_remote
make_http_special_remote_support_exporttree_remotes.mdwnpublicurl_config_for_all_special_remotes.mdwnpublicurl_config_for_all_special_remotes
|
@ -59,7 +59,7 @@ branchHashDir = hashDirLower . branchHashLevels
|
|||
- which do not allow using a directory "XX" when "xx" already exists.
|
||||
- To support that, some git-annex repositories use the lower case-hash.
|
||||
- All special remotes use the lower-case hash for new data, but old data
|
||||
- may still used the mixed case hash. -}
|
||||
- may still use the mixed case hash. -}
|
||||
dirHashes :: [HashLevels -> Hasher]
|
||||
dirHashes = [hashDirLower, hashDirMixed]
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
git-annex (8.20200815) UNRELEASED; urgency=medium
|
||||
|
||||
* Added http special remote, which is useful for accessing other remotes
|
||||
that publish content stored in them via http/https.
|
||||
* The external special remote protocol got an ASYNC extension.
|
||||
This can be used by an external special remote to let a single process
|
||||
perform concurrent actions, rather than multiple processes being
|
||||
|
|
181
Remote/Http.hs
Normal file
181
Remote/Http.hs
Normal file
|
@ -0,0 +1,181 @@
|
|||
{- Http remote (readonly).
|
||||
-
|
||||
- Copyright 2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Remote.Http (remote) where
|
||||
|
||||
import Annex.Common
|
||||
import Types.Remote
|
||||
import Types.ProposedAccepted
|
||||
import Remote.Helper.Messages
|
||||
import Remote.Helper.ExportImport
|
||||
import Remote.Helper.Special
|
||||
import qualified Git
|
||||
import Annex.Content
|
||||
import Config.Cost
|
||||
import Config
|
||||
import Logs.Web
|
||||
import Creds
|
||||
import Utility.Metered
|
||||
import qualified Annex.Url as Url
|
||||
import Annex.SpecialRemote.Config
|
||||
|
||||
import qualified Data.Map as M
|
||||
import System.FilePath.Posix as P
|
||||
import Control.Concurrent.STM
|
||||
|
||||
-- The "http" special remote type.
--
-- Its only config field is the url; neither export nor import is
-- supported.
remote :: RemoteType
remote = RemoteType
	{ typename = "http"
	, enumerate = \_ -> findSpecialRemotes "http"
	, generate = gen
	, configParser = mkRemoteConfigParser fieldparsers
	, setup = httpSetup
	, exportSupported = exportUnsupported
	, importSupported = importUnsupported
	}
  where
	-- The url is parsed as an optional field here; httpSetup is what
	-- rejects a config that lacks it.
	fieldparsers =
		[ optionalStringParser urlField $
			FieldDesc "(required) url to the remote content"
		]
|
||||
|
||||
-- Name of the remote config field ("url") that holds the base url
-- the content is published under.
urlField :: RemoteConfigField
urlField = Accepted "url"
|
||||
|
||||
-- Generates the Remote for a http special remote.
--
-- The base url is looked up in the parsed remote config, and a fresh
-- LearnedLayout, with nothing learned yet, is created for the Remote.
-- The Remote is marked readonly; its storeKey and removeKey actions
-- always fail.
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = do
	c <- parsedRemoteConfig remote rc
	cst <- remoteCost gc expensiveRemoteCost
	ll <- liftIO newLearnedLayout
	let baseurl = getRemoteConfigValue urlField c
	return $ Just $ mkremote baseurl ll c cst
  where
	-- checkPresent needs the Remote itself (it is passed to
	-- checkKey), so the record refers back to itself.
	mkremote baseurl ll c cst =
		let self = Remote
			{ uuid = u
			, cost = cst
			, name = Git.repoDescribe r
			, storeKey = uploadKey
			, retrieveKeyFile = downloadKey baseurl ll
			, retrieveKeyFileCheap = Nothing
			-- HttpManagerRestricted is used here, so this is
			-- secure.
			, retrievalSecurityPolicy = RetrievalAllKeysSecure
			, removeKey = dropKey
			, lockContent = Nothing
			, checkPresent = checkKey baseurl ll self
			, checkPresentCheap = False
			, exportActions = exportUnsupported
			, importActions = importUnsupported
			, whereisKey = Nothing
			, remoteFsck = Nothing
			, repairRepo = Nothing
			, config = c
			, gitconfig = gc
			, localpath = Nothing
			, getRepo = return r
			, readonly = True
			, appendonly = False
			, availability = GloballyAvailable
			, remotetype = remote
			, mkUnavailable = return Nothing
			, getInfo = return []
			, claimUrl = Nothing
			, checkUrl = Nothing
			, remoteStateHandle = rs
			}
		in self
|
||||
|
||||
-- Sets up a http special remote.
--
-- A http remote can only be initialized with --sameas, so a UUID must
-- be provided, and the config must contain the url field. Both problems
-- are user errors, and are reported with giveup, like the other failures
-- in this module.
--
-- On success, encryption is set up from the config, the remote is
-- recorded in the git config with http set to "true", and the resulting
-- config is returned along with the UUID.
httpSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
httpSetup _ Nothing _ _ _ =
	giveup "Must use --sameas when initializing a http remote."
httpSetup _ (Just u) _ c gc = do
	-- Only the presence of the url is checked here; its value is read
	-- back from the config when the remote is generated.
	when (M.notMember urlField c) $
		giveup "Specify url="
	(c', _encsetup) <- encryptionSetup c gc
	gitConfigSpecialRemote u c' [("http", "true")]
	return (c', u)
|
||||
|
||||
-- Downloads the content of a key to the destination file, trying the
-- urls that urlAction generates for the key.
--
-- The result is always UnVerified. Gives up with "download failed" when
-- no url could be downloaded.
downloadKey :: Maybe URLString -> LearnedLayout -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Verification
downloadKey baseurl ll key _af dest p =
	ifM (urlAction baseurl ll key fetch)
		( return UnVerified
		, giveup "download failed"
		)
  where
	fetch url = Url.withUrlOptions (downloadUrl key p [url] dest)
|
||||
|
||||
-- Storing content is not possible; this always gives up.
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
uploadKey _key _af _meter =
	giveup "upload to http special remote not supported"
|
||||
|
||||
-- Removing content is not possible; this always gives up.
dropKey :: Key -> Annex ()
dropKey _key =
	giveup "removal from http special remote not supported"
|
||||
|
||||
-- Checks if a key is present in the remote, by checking the urls that
-- urlAction generates for the key against the size recorded in the key.
checkKey :: Maybe URLString -> LearnedLayout -> Remote -> Key -> Annex Bool
checkKey baseurl ll r key = showChecking r >> urlAction baseurl ll key probe
  where
	probe url = Url.withUrlOptions (Url.checkBoth url expectedsize)
	expectedsize = fromKey keySize key
|
||||
|
||||
-- Shared state recording which layouts are known to work.
--
-- Holds Nothing until urlAction has learned a layout. After that it
-- holds the group of layouts that worked, with the layout that succeeded
-- first. Each layout maps a key to the url of its content.
type LearnedLayout = TVar (Maybe [Key -> URLString])
|
||||
|
||||
-- Creates a LearnedLayout in which no layout has been learned yet.
newLearnedLayout :: IO LearnedLayout
newLearnedLayout = newTVarIO Nothing
|
||||
|
||||
-- Runs an action on the urls that a key may be stored at, until the
-- action succeeds on one of them. Returns False if it succeeds on none.
--
-- This learns which layout the special remote uses, so that once any
-- action on an url succeeds, subsequent calls will continue to use that
-- layout (or related layouts from the same group), rather than trying
-- all the supported layouts again.
urlAction :: Maybe URLString -> LearnedLayout -> Key -> (URLString -> Annex Bool) -> Annex Bool
-- cannot normally happen
urlAction Nothing _ _ _ = giveup "no url configured for http special remote"
urlAction (Just baseurl) ll key a = liftIO (readTVarIO ll) >>= \case
	Nothing -> trygroups True (supportedLayouts baseurl)
	Just learned -> trygroups False [learned]
  where
	-- Tries each group of layouts in turn, stopping at the first
	-- group that contains a layout the action succeeds on.
	trygroups _ [] = return False
	trygroups learn (grp:others) =
		trygroup learn [] grp <||> trygroups learn others

	-- Tries the layouts of one group. The layouts already tried are
	-- kept, so that when learning, the whole group can be remembered,
	-- with the layout that worked moved to the front.
	trygroup _ _ [] = return False
	trygroup learn tried (layout:untried) = do
		ok <- a (layout key)
		if ok
			then do
				when learn $ liftIO $ atomically $
					writeTVar ll (Just (layout : tried ++ untried))
				return True
			else trygroup learn (layout:tried) untried
|
||||
|
||||
-- The layouts that keys may be stored under in the special remote,
-- with the more common ones first.
--
-- The layouts are grouped in a nested list, because a single remote may
-- use more than one layout. In particular, old versions of git-annex used
-- hashDirMixed for some special remotes, before switching to hashDirLower
-- for new data. So, when learning the layout, both need to be tried.
supportedLayouts :: URLString -> [[Key -> URLString]]
supportedLayouts baseurl =
	[ keydirlayouts
	, [ unhashed ]
	, hashedlayouts def
	, hashedlayouts (HashLevels 1)
	]
  where
	-- Two levels of hash directory, containing a key directory.
	-- The lower case form is used for bare git-annex repos, and for
	-- many special remotes like directory. The mixed case form is
	-- used for non-bare git-annex repos, and for some old special
	-- remotes.
	keydirlayouts =
		[ \k -> hashed hashDirLower (HashLevels 2) k P.</> keyfile k
		, \k -> hashed hashDirMixed (HashLevels 2) k P.</> keyfile k
		]

	-- Special remotes that do not need hash directories.
	unhashed k = baseurl P.</> keyfile k

	-- Layouts without a key directory, used by some special remotes.
	-- With HashLevels 1 there is only 1 level of hash directory,
	-- rather than the default 2.
	hashedlayouts levels =
		[ hashed hashDirLower levels
		, hashed hashDirMixed levels
		]

	-- Url of the key's file inside its hash directory.
	hashed hasher levels k =
		baseurl P.</> fromRawFilePath (hasher levels k) P.</> keyfile k

	keyfile = fromRawFilePath . keyFile
|
|
@ -41,6 +41,7 @@ import qualified Remote.Tahoe
|
|||
import qualified Remote.Glacier
|
||||
import qualified Remote.Ddar
|
||||
import qualified Remote.GitLFS
|
||||
import qualified Remote.Http
|
||||
import qualified Remote.Hook
|
||||
import qualified Remote.External
|
||||
|
||||
|
@ -65,6 +66,7 @@ remoteTypes = map adjustExportImportRemoteType
|
|||
, Remote.Glacier.remote
|
||||
, Remote.Ddar.remote
|
||||
, Remote.GitLFS.remote
|
||||
, Remote.Http.remote
|
||||
, Remote.Hook.remote
|
||||
, Remote.External.remote
|
||||
]
|
||||
|
|
|
@ -1547,6 +1547,11 @@ Remotes are configured using these settings in `.git/config`.
|
|||
|
||||
It is set to "true" if this is a git-lfs remote.
|
||||
|
||||
* `remote.<name>.annex-http`
|
||||
|
||||
Used to identify http special remotes.
|
||||
Normally this is automatically set up by `git annex initremote`.
|
||||
|
||||
* `remote.<name>.annex-externaltype`
|
||||
|
||||
Used by external special remotes to record the type of the remote.
|
||||
|
|
|
@ -21,7 +21,7 @@ the git history is not stored in them.
|
|||
* [[S3]] (Amazon S3, and other compatible services)
|
||||
* [[tahoe]]
|
||||
* [[tor]]
|
||||
* [[web]]
|
||||
* [[web]] and [[http]]
|
||||
* [[webdav]]
|
||||
* [[git]]
|
||||
* [[xmpp]]
|
||||
|
|
28
doc/special_remotes/http.mdwn
Normal file
28
doc/special_remotes/http.mdwn
Normal file
|
@ -0,0 +1,28 @@
|
|||
This special remote allows downloading annexed objects from other remotes
|
||||
that expose their content by http. Not to be confused with the [[web]]
|
||||
special remote, this one is only useful in combination with some other
|
||||
special remote.
|
||||
|
||||
Suppose, for example, that you have a [[directory]] special remote. And the
|
||||
directory happens to be published by a web server. (Or it could be a
|
||||
[[rsync]] special remote, or many other kinds.) To let git-annex know that
|
||||
the content of this special remote can also be accessed over http, set up
|
||||
a http special remote.
|
||||
|
||||
git annex initremote --sameas=foo foo-http type=http url=http://example.com/foo
|
||||
|
||||
The --sameas parameter tells git-annex what other special remote this http
|
||||
remote is accessing. (See [[tips/multiple_remotes_accessing_the_same_data_store]].)
|
||||
Since the http remote is read-only, it can only be used to download content
|
||||
that is stored in that other remote.
|
||||
|
||||
This special remote is compatible with many, but not all, other special
|
||||
remotes. If the special remote does something unusual with the name
|
||||
a file is stored under, or with how the data is stored, it might not work.
|
||||
See [[tips/multiple_remotes_accessing_the_same_data_store]]
|
||||
for a list of known working combinations.
|
||||
|
||||
## configuration
|
||||
|
||||
* `url` - The http or https url to where the content is stored by the
|
||||
other special remote.
|
|
@ -1,4 +1,5 @@
|
|||
git-annex can use the WWW as a special remote, downloading urls to files.
|
||||
git-annex can use the WWW as a special remote, associating an url with an
|
||||
annexed file, and downloading the file content from the web.
|
||||
See [[tips/using_the_web_as_a_special_remote]] for usage examples.
|
||||
|
||||
## notes
|
||||
|
@ -7,5 +8,6 @@ Currently git-annex only supports downloading content from the web;
|
|||
it cannot upload to it or remove content.
|
||||
|
||||
This special remote uses urls on the web as the source for content.
|
||||
git-annex can also download content from a normal git remote, accessible by
|
||||
http.
|
||||
There are several other ways http can be used to download annexed objects,
|
||||
including a git remote accessible by http, S3 with a `publicurl` configured,
|
||||
and the [[http]] special remote.
|
||||
|
|
|
@ -55,4 +55,7 @@ If you find combinations that work, please edit this page to list them.
|
|||
## known working combinations
|
||||
|
||||
* directory and rsync
|
||||
* http and directory
|
||||
* http and rsync
|
||||
* http and rclone (any layout except for frankencase)
|
||||
|
||||
|
|
|
@ -15,3 +15,5 @@ access of other special remotes whose data stores are exposed via http.
|
|||
|
||||
Call it "http" maybe. (There may be some confusion between this and the web
|
||||
special remote by users looking for such a thing.) --[[Joey]]
|
||||
|
||||
> http special remote implemented, [[done]] --[[Joey]]
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 5"""
|
||||
date="2020-09-01T19:00:05Z"
|
||||
content="""
|
||||
I'm implementing this with an automatic learning of the layout that a
|
||||
special remote uses.
|
||||
|
||||
It looks like it will probably be sufficiently fast
|
||||
for it to remember only for the duration of the command, although it would
|
||||
be possible to cache what it's learned in .git/config or something.
|
||||
|
||||
At worst, the learning will add 6 extra http requests to learn
|
||||
the most unlikely layout (hashDirMixed with 1 hash level, which I doubt
|
||||
anything actually uses). Since the first file it tries to access is almost
|
||||
certainly present in the special remote, it will then learn the layout and
|
||||
keep using it with no added overhead.
|
||||
|
||||
The unusual case would be if a lot of files are not present in the
|
||||
remote any longer. Then it will fail to learn, on each file, and so will
|
||||
always make 6 extra http requests per file processed. (Reusing the same
|
||||
http connection at least.) This seems unusual enough to not worry about
|
||||
remembering what it's learned for longer than a single run, or making the
|
||||
layout explicitly configurable. Content would have to be dropped from the
|
||||
other special remote and the git-annex branch not be synced up for it to
|
||||
happen.
|
||||
"""]]
|
|
@ -0,0 +1,4 @@
|
|||
The http special remote doesn't currently support being used with a
|
||||
--sameas remote that uses exporttree=yes.
|
||||
|
||||
It seems like this should be fairly easy to implement. --[[Joey]]
|
|
@ -3,3 +3,6 @@
|
|||
[[!meta author=yoh]]
|
||||
[[!tag projects/datalad]]
|
||||
[[!tag needsthought]]
|
||||
|
||||
> [[done]] by implementing another design, not the one suggested here
|
||||
> --[[Joey]]
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 6"""
|
||||
date="2020-09-01T19:09:30Z"
|
||||
content="""
|
||||
I've implemented the http special remote, that can be combined with other
|
||||
special remotes to access them using anonymous http.
|
||||
|
||||
I think that probably addresses this todo well enough to close it.
|
||||
(Although I didn't get around to
|
||||
[[todo/make_http_special_remote_support_exporttree_remotes]] yet, and this
|
||||
todo mentions supporting exporttree. Should be easy to add later though.)
|
||||
|
||||
There are probably some special remotes that are unusual enough that the
|
||||
http special remote can't support them, which it would make sense to add a
|
||||
publicurl= config to, like S3 has. (Although I think S3 itself could now be
|
||||
used with the http special remote so its option is vestigial now.)
|
||||
|
||||
I guess that publicurl= config would best be added to the individual
|
||||
special remote, so it doesn't need any particular support in git-annex to
|
||||
add it.
|
||||
"""]]
|
|
@ -980,6 +980,7 @@ Executable git-annex
|
|||
Remote.Helper.ReadOnly
|
||||
Remote.Helper.Special
|
||||
Remote.Helper.Ssh
|
||||
Remote.Http
|
||||
Remote.Hook
|
||||
Remote.List
|
||||
Remote.List.Util
|
||||
|
|
Loading…
Add table
Reference in a new issue