Added http special remote, which is useful for accessing other remotes that publish content stored in them via http/https.

With automatic layout learning!
This commit is contained in:
Joey Hess 2020-09-01 15:16:35 -04:00
parent fccc9ab442
commit 571ec900ac
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
15 changed files with 287 additions and 5 deletions

View file

@ -59,7 +59,7 @@ branchHashDir = hashDirLower . branchHashLevels
- which do not allow using a directory "XX" when "xx" already exists.
- To support that, some git-annex repositories use the lower case-hash.
- All special remotes use the lower-case hash for new data, but old data
- may still used the mixed case hash. -}
- may still use the mixed case hash. -}
dirHashes :: [HashLevels -> Hasher]
dirHashes = [hashDirLower, hashDirMixed]

View file

@ -1,5 +1,7 @@
git-annex (8.20200815) UNRELEASED; urgency=medium
* Added http special remote, which is useful for accessing other remotes
that publish content stored in them via http/https.
* The external special remote protocol got an ASYNC extension.
This can be used by an external special remote to let a single process
perform concurrent actions, rather than multiple processes being

181
Remote/Http.hs Normal file
View file

@ -0,0 +1,181 @@
{- Http remote (readonly).
-
- Copyright 2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Remote.Http (remote) where
import Annex.Common
import Types.Remote
import Types.ProposedAccepted
import Remote.Helper.Messages
import Remote.Helper.ExportImport
import Remote.Helper.Special
import qualified Git
import Annex.Content
import Config.Cost
import Config
import Logs.Web
import Creds
import Utility.Metered
import qualified Annex.Url as Url
import Annex.SpecialRemote.Config
import qualified Data.Map as M
import System.FilePath.Posix as P
import Control.Concurrent.STM
-- | The "http" special remote type.
--
-- It accepts a single "url" configuration field and supports neither
-- export nor import.
remote :: RemoteType
remote = RemoteType
    { typename = "http"
    , enumerate = \_ -> findSpecialRemotes "http"
    , generate = gen
    , configParser = mkRemoteConfigParser fieldparsers
    , setup = httpSetup
    , exportSupported = exportUnsupported
    , importSupported = importUnsupported
    }
  where
    -- The url is parsed as an optional string here; httpSetup is what
    -- rejects a configuration that lacks it.
    fieldparsers =
        [ optionalStringParser urlField $
            FieldDesc "(required) url to the remote content"
        ]
-- | Remote config field holding the url that the remote's content is
-- published at.
urlField :: RemoteConfigField
urlField = Accepted "url"
-- | Builds the Remote value for a configured http special remote.
--
-- The remote is read-only: storing and removing keys always fail, and
-- export and import are unsupported. A fresh LearnedLayout is created
-- for the remote and shared by its retrieve and presence-check actions.
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = do
    parsed <- parsedRemoteConfig remote rc
    remotecost <- remoteCost gc expensiveRemoteCost
    let baseurl = getRemoteConfigValue urlField parsed
    layout <- liftIO newLearnedLayout
    return $ Just $ mkRemote baseurl layout parsed remotecost
  where
    mkRemote baseurl layout parsed remotecost = self
      where
        -- checkPresent needs the Remote itself (to display what is
        -- being checked), so the record refers back to itself.
        self = Remote
            { uuid = u
            , cost = remotecost
            , name = Git.repoDescribe r
            , storeKey = uploadKey
            , retrieveKeyFile = downloadKey baseurl layout
            , retrieveKeyFileCheap = Nothing
            -- HttpManagerRestricted is used here, so this is
            -- secure.
            , retrievalSecurityPolicy = RetrievalAllKeysSecure
            , removeKey = dropKey
            , lockContent = Nothing
            , checkPresent = checkKey baseurl layout self
            , checkPresentCheap = False
            , exportActions = exportUnsupported
            , importActions = importUnsupported
            , whereisKey = Nothing
            , remoteFsck = Nothing
            , repairRepo = Nothing
            , config = parsed
            , gitconfig = gc
            , localpath = Nothing
            , getRepo = return r
            , readonly = True
            , appendonly = False
            , availability = GloballyAvailable
            , remotetype = remote
            , mkUnavailable = return Nothing
            , getInfo = return []
            , claimUrl = Nothing
            , checkUrl = Nothing
            , remoteStateHandle = rs
            }
-- | Sets up a http special remote.
--
-- A http remote only makes sense as another way to access an existing
-- remote, so setup fails unless a UUID is supplied (via --sameas). It
-- also fails when no url= is configured. On success, the remote is
-- marked in git config with http=true, and the (possibly updated by
-- encryptionSetup) config is returned along with the UUID.
httpSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
httpSetup _ Nothing _ _ _ =
    -- Reported with giveup, like every other user-facing failure in
    -- this module, rather than with error.
    giveup "Must use --sameas when initializing a http remote."
httpSetup _ (Just u) _ c gc = do
    -- The url is only checked for presence here; its value is not
    -- otherwise used during setup.
    _url <- maybe (giveup "Specify url=")
        (return . fromProposedAccepted)
        (M.lookup urlField c)
    (c', _encsetup) <- encryptionSetup c gc
    gitConfigSpecialRemote u c' [("http", "true")]
    return (c', u)
-- | Downloads the content of a key to dest, trying the urls that
-- urlAction generates for the key.
--
-- Returns UnVerified when a download succeeds, and gives up with
-- "download failed" when none does.
downloadKey :: Maybe URLString -> LearnedLayout -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Verification
downloadKey baseurl ll key _af dest p =
    ifM (urlAction baseurl ll key fetch)
        ( return UnVerified
        , giveup "download failed"
        )
  where
    fetch url = Url.withUrlOptions (downloadUrl key p [url] dest)
-- | Storing a key always fails; uploading is not supported by the http
-- special remote.
uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex ()
uploadKey _ _ _ = giveup "upload to http special remote not supported"
-- | Removing a key always fails; removal is not supported by the http
-- special remote.
dropKey :: Key -> Annex ()
dropKey _ = giveup "removal from http special remote not supported"
-- | Checks whether a key is present, by probing the urls that urlAction
-- generates for it. Each url is checked against the key's recorded
-- size (when it has one) with Url.checkBoth.
--
-- The Remote is only used to display what is being checked.
checkKey :: Maybe URLString -> LearnedLayout -> Remote -> Key -> Annex Bool
checkKey baseurl ll r key = showChecking r >> urlAction baseurl ll key probe
  where
    probe url = Url.withUrlOptions $ Url.checkBoth url expectedsize
    expectedsize = fromKey keySize key
-- | Mutable record of the layouts that have been learned to work for a
-- remote. Nothing until some url action has succeeded; after that, the
-- list of layouts to keep using.
type LearnedLayout = TVar (Maybe [Key -> URLString])
-- | Creates a LearnedLayout with nothing learned yet.
newLearnedLayout :: IO LearnedLayout
newLearnedLayout = newTVarIO Nothing
-- Runs an action on the urls that a key may be found at, stopping at
-- the first url the action succeeds on. The result is False when the
-- action did not succeed on any of the urls tried.
--
-- Learns which layout the special remote uses, so that once any
-- action on an url succeeds, subsequent calls will continue to use that
-- layout (or related layouts).
urlAction :: Maybe URLString -> LearnedLayout -> Key -> (URLString -> Annex Bool) -> Annex Bool
-- cannot normally happen
urlAction Nothing _ _ _ = giveup "no url configured for http special remote"
urlAction (Just baseurl) ll key a = do
    known <- liftIO (readTVarIO ll)
    case known of
        -- Already learned: only try that group, and do not relearn.
        Just learned -> tryGroups False [learned]
        Nothing -> tryGroups True (supportedLayouts baseurl)
  where
    -- Tries each group of related layouts in turn.
    tryGroups _ [] = return False
    tryGroups learn (grp:others) =
        tryLayouts learn [] grp <||> tryGroups learn others

    -- Tries each layout of one group, keeping track of the ones that
    -- have already failed.
    tryLayouts _ _ [] = return False
    tryLayouts learn tried (layout:untried) = do
        ok <- a (layout key)
        if ok
            then do
                -- Remember the whole group, with the layout that
                -- worked moved to the front.
                when learn $ liftIO $ atomically $
                    writeTVar ll (Just (layout : tried ++ untried))
                return True
            else tryLayouts learn (layout:tried) untried
-- The ways that keys can be laid out under the base url of the special
-- remote, ordered with the more common layouts first.
--
-- The result is a list of groups, because a single remote may use more
-- than one layout. In particular, old versions of git-annex used
-- hashDirMixed for some special remotes, before switching to
-- hashDirLower for new data. So each group pairs the lower case and
-- mixed case variants, and both need to be tried when learning.
supportedLayouts :: URLString -> [[Key -> URLString]]
supportedLayouts baseurl =
    -- Hash directories, then a key directory containing the key file.
    -- The lower case variant is used for bare git-annex repos, and for
    -- many special remotes like directory; the mixed case variant for
    -- non-bare git-annex repos, and for some old special remotes.
    [ map withKeyDir (hashers (HashLevels 2))
    -- Special remotes that do not need hash directories.
    , [ \k -> baseurl P.</> kf k ]
    -- Layouts without a key directory, used by some special remotes.
    , map hashedOnly (hashers def)
    -- Layouts with only 1 level of hash directory,
    -- rather than the default 2.
    , map hashedOnly (hashers (HashLevels 1))
    ]
  where
    -- Lower case hasher first, then the mixed case one.
    hashers levels = [hashDirLower levels, hashDirMixed levels]
    -- baseurl/<hash dirs>/<key file>
    hashedOnly hasher k = baseurl P.</> fromRawFilePath (hasher k) P.</> kf k
    -- baseurl/<hash dirs>/<key file>/<key file>
    withKeyDir hasher k = hashedOnly hasher k P.</> kf k
    kf k = fromRawFilePath (keyFile k)

View file

@ -41,6 +41,7 @@ import qualified Remote.Tahoe
import qualified Remote.Glacier
import qualified Remote.Ddar
import qualified Remote.GitLFS
import qualified Remote.Http
import qualified Remote.Hook
import qualified Remote.External
@ -65,6 +66,7 @@ remoteTypes = map adjustExportImportRemoteType
, Remote.Glacier.remote
, Remote.Ddar.remote
, Remote.GitLFS.remote
, Remote.Http.remote
, Remote.Hook.remote
, Remote.External.remote
]

View file

@ -1547,6 +1547,11 @@ Remotes are configured using these settings in `.git/config`.
It is set to "true" if this is a git-lfs remote.
* `remote.<name>.annex-http`
Used to identify http special remotes.
Normally this is automatically set up by `git annex initremote`.
* `remote.<name>.annex-externaltype`
Used by external special remotes to record the type of the remote.

View file

@ -21,7 +21,7 @@ the git history is not stored in them.
* [[S3]] (Amazon S3, and other compatible services)
* [[tahoe]]
* [[tor]]
* [[web]]
* [[web]] and [[http]]
* [[webdav]]
* [[git]]
* [[xmpp]]

View file

@ -0,0 +1,28 @@
This special remote allows downloading annexed objects from other remotes
that expose their content by http. Not to be confused with the [[web]]
special remote, this one is only useful in combination with some other
special remote.
Suppose, for example, that you have a [[directory]] special remote. And the
directory happens to be published by a web server. (Or it could be a
[[rsync]] special remote, or many other kinds.) To let git-annex know that
the content of this special remote can also be accessed over http, set up
a http special remote.
git annex initremote --sameas=foo foo-http type=http url=http://example.com/foo
The --sameas parameter tells git-annex what other special remote this http
remote is accessing. (See [[tips/multiple_remotes_accessing_the_same_data_store]].)
Since the http remote is read-only, it can only be used to download content
that is stored in that other remote.
This special remote is compatible with many, but not all, other special
remotes. If the special remote does something unusual with the name
a file is stored under, or with how the data is stored, it might not work.
See [[tips/multiple_remotes_accessing_the_same_data_store]]
for a list of known working combinations.
## configuration
* `url` - The http or https url to where the content is stored by the
other special remote.

View file

@ -1,4 +1,5 @@
git-annex can use the WWW as a special remote, downloading urls to files.
git-annex can use the WWW as a special remote, associating an url with an
annexed file, and downloading the file content from the web.
See [[tips/using_the_web_as_a_special_remote]] for usage examples.
## notes
@ -7,5 +8,6 @@ Currently git-annex only supports downloading content from the web;
it cannot upload to it or remove content.
This special remote uses urls on the web as the source for content.
git-annex can also download content from a normal git remote, accessible by
http.
There are several other ways http can be used to download annexed objects,
including a git remote accessible by http, S3 with a `publicurl` configured,
and the [[http]] special remote.

View file

@ -55,4 +55,7 @@ If you find combinations that work, please edit this page to list them.
## known working combinations
* directory and rsync
* http and directory
* http and rsync
* http and rclone (any layout except for frankencase)

View file

@ -15,3 +15,5 @@ access of other special remotes whose data stores are exposed via http.
Call it "http" maybe. (There may be some confusion between this and the web
special remote by users looking for such a thing.) --[[Joey]]
> http special remote implemented, [[done]] --[[Joey]]

View file

@ -0,0 +1,27 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2020-09-01T19:00:05Z"
content="""
I'm implementing this with an automatic learning of the layout that a
special remote uses.
It looks like it will probably be sufficiently fast
for it to remember only for the duration of the command, although it would
be possible to cache what it's learned in .git/config or something.
Usually, the learning will add 6 extra http requests to learn
the most unlikely layout (hashDirMixed with 1 hash level, which I doubt
anything actually uses). Since the first file it tries to access is almost
certainly present in the special remote, it will then learn the layout and
keep using it with no added overhead.
The unusual case would be if a lot of files are not present in the
remote any longer. Then it will fail to learn, on each file, and so will
always make 6 extra http requests per file processed. (Reusing the same
http connection at least.) This seems unusual enough to not worry about
remembering what it's learned for longer than a single run, or making the
layout explicitly configurable. Content would have to be dropped from the
other special remote and the git-annex branch not be synced up for it to
happen.
"""]]

View file

@ -0,0 +1,4 @@
The http special remote doesn't currently support being used with a
--sameas remote that uses exporttree=yes.
It seems like this should be fairly easy to implement. --[[Joey]]

View file

@ -3,3 +3,6 @@
[[!meta author=yoh]]
[[!tag projects/datalad]]
[[!tag needsthought]]
> [[done]] by implementing another design, not the one suggested here
> --[[Joey]]

View file

@ -0,0 +1,22 @@
[[!comment format=mdwn
username="joey"
subject="""comment 6"""
date="2020-09-01T19:09:30Z"
content="""
I've implemented the http special remote, that can be combined with other
special remotes to access them using anonymous http.
I think that probably addresses this todo well enough to close it.
(Although I didn't get around to
[[todo/make_http_special_remote_support_exporttree_remotes]] yet, and this
todo mentions supporting exporttree. Should be easy to add later though.)
There are probably some special remotes that are unusual enough that the
http special remote can't support them, which it would make sense to add a
publicurl= config to, like S3 has. (Although I think S3 itself could now be
used with the http special remote so its option is vestigial now.)
I guess that publicurl= config would best be added to the individual
special remote, so it doesn't need any particular support in git-annex to
add it.
"""]]

View file

@ -980,6 +980,7 @@ Executable git-annex
Remote.Helper.ReadOnly
Remote.Helper.Special
Remote.Helper.Ssh
Remote.Http
Remote.Hook
Remote.List
Remote.List.Util