diff --git a/Annex/DirHashes.hs b/Annex/DirHashes.hs index 237345feb1..7311acf3e6 100644 --- a/Annex/DirHashes.hs +++ b/Annex/DirHashes.hs @@ -59,7 +59,7 @@ branchHashDir = hashDirLower . branchHashLevels - which do not allow using a directory "XX" when "xx" already exists. - To support that, some git-annex repositories use the lower case-hash. - All special remotes use the lower-case hash for new data, but old data - - may still used the mixed case hash. -} + - may still use the mixed case hash. -} dirHashes :: [HashLevels -> Hasher] dirHashes = [hashDirLower, hashDirMixed] diff --git a/CHANGELOG b/CHANGELOG index 7ac371a783..73cd7ad525 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,7 @@ git-annex (8.20200815) UNRELEASED; urgency=medium + * Added http special remote, which is useful for accessing other remotes + that publish content stored in them via http/https. * The external special remote protocol got an ASYNC extension. This can be used by an external special remote to let a single process perform concurrent actions, rather than multiple processes being diff --git a/Remote/Http.hs b/Remote/Http.hs new file mode 100644 index 0000000000..0566e81812 --- /dev/null +++ b/Remote/Http.hs @@ -0,0 +1,181 @@ +{- Http remote (readonly). + - + - Copyright 2020 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. 
+ -} + +module Remote.Http (remote) where + +import Annex.Common +import Types.Remote +import Types.ProposedAccepted +import Remote.Helper.Messages +import Remote.Helper.ExportImport +import Remote.Helper.Special +import qualified Git +import Annex.Content +import Config.Cost +import Config +import Logs.Web +import Creds +import Utility.Metered +import qualified Annex.Url as Url +import Annex.SpecialRemote.Config + +import qualified Data.Map as M +import System.FilePath.Posix as P +import Control.Concurrent.STM + +remote :: RemoteType +remote = RemoteType + { typename = "http" + , enumerate = const (findSpecialRemotes "http") + , generate = gen + , configParser = mkRemoteConfigParser + [ optionalStringParser urlField + (FieldDesc "(required) url to the remote content") + ] + , setup = httpSetup + , exportSupported = exportUnsupported + , importSupported = importUnsupported + } + +urlField :: RemoteConfigField +urlField = Accepted "url" + +gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) +gen r u rc gc rs = do + c <- parsedRemoteConfig remote rc + cst <- remoteCost gc expensiveRemoteCost + let url = getRemoteConfigValue urlField c + ll <- liftIO newLearnedLayout + return $ Just $ this url ll c cst + where + this url ll c cst = Remote + { uuid = u + , cost = cst + , name = Git.repoDescribe r + , storeKey = uploadKey + , retrieveKeyFile = downloadKey url ll + , retrieveKeyFileCheap = Nothing + -- HttpManagerRestricted is used here, so this is + -- secure. 
+ , retrievalSecurityPolicy = RetrievalAllKeysSecure + , removeKey = dropKey + , lockContent = Nothing + , checkPresent = checkKey url ll (this url ll c cst) + , checkPresentCheap = False + , exportActions = exportUnsupported + , importActions = importUnsupported + , whereisKey = Nothing + , remoteFsck = Nothing + , repairRepo = Nothing + , config = c + , gitconfig = gc + , localpath = Nothing + , getRepo = return r + , readonly = True + , appendonly = False + , availability = GloballyAvailable + , remotetype = remote + , mkUnavailable = return Nothing + , getInfo = return [] + , claimUrl = Nothing + , checkUrl = Nothing + , remoteStateHandle = rs + } + +httpSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) +httpSetup _ Nothing _ _ _ = + giveup "Must use --sameas when initializing a http remote." +httpSetup _ (Just u) _ c gc = do + _url <- maybe (giveup "Specify url=") + (return . fromProposedAccepted) + (M.lookup urlField c) + (c', _encsetup) <- encryptionSetup c gc + gitConfigSpecialRemote u c' [("http", "true")] + return (c', u) + +downloadKey :: Maybe URLString -> LearnedLayout -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Verification +downloadKey baseurl ll key _af dest p = do + unlessM (urlAction baseurl ll key go) $ + giveup "download failed" + return UnVerified + where + go url = Url.withUrlOptions $ downloadUrl key p [url] dest + +uploadKey :: Key -> AssociatedFile -> MeterUpdate -> Annex () +uploadKey _ _ _ = giveup "upload to http special remote not supported" + +dropKey :: Key -> Annex () +dropKey _ = giveup "removal from http special remote not supported" + +checkKey :: Maybe URLString -> LearnedLayout -> Remote -> Key -> Annex Bool +checkKey baseurl ll r key = do + showChecking r + urlAction baseurl ll key $ \url -> + Url.withUrlOptions $ Url.checkBoth url (fromKey keySize key) + +type LearnedLayout = TVar (Maybe [Key -> URLString]) + +newLearnedLayout :: IO 
LearnedLayout +newLearnedLayout = newTVarIO Nothing + +-- Learns which layout the special remote uses, so that once any +-- action on an url succeeds, subsequent calls will continue to use that +-- layout (or related layouts). +urlAction :: Maybe URLString -> LearnedLayout -> Key -> (URLString -> Annex Bool) -> Annex Bool +urlAction (Just baseurl) ll key a = liftIO (readTVarIO ll) >>= \case + Just learned -> go False [learned] + Nothing -> go True (supportedLayouts baseurl) + where + go _learn [] = return False + go learn (layouts:rest) = go' learn layouts [] <||> go learn rest + + go' _ [] _ = return False + go' learn (layout:rest) prevs = + ifM (a (layout key)) + ( do + when learn $ do + let learned = layout:prevs++rest + liftIO $ atomically $ + writeTVar ll (Just learned) + return True + , go' learn rest (layout:prevs) + ) + +-- cannot normally happen +urlAction Nothing _ _ _ = giveup "no url configured for http special remote" + +-- Different ways that keys can be laid out in the special remote, +-- with the more common first. +-- +-- This is a nested list, because a single remote may use more than one +-- layout. In particular, old versions of git-annex used hashDirMixed +-- for some special remotes, before switching to hashDirLower for new data. +-- So, when learning the layout, both need to be tried. +supportedLayouts :: URLString -> [[Key -> URLString]] +supportedLayouts baseurl = + -- Layout used for bare git-annex repos, and for many + -- special remotes like directory. + [ [ \k -> mkurl k (hashDirLower (HashLevels 2)) P.</> kf k + -- Layout used for non-bare git-annex repos, and for some old + -- special remotes. + , \k -> mkurl k (hashDirMixed (HashLevels 2)) P.</> kf k + ] + -- Special remotes that do not need hash directories. + , [ \k -> baseurl P.</> kf k ] + -- Layouts without a key directory, used by some special remotes. 
+ , [ \k -> mkurl k (hashDirLower def) + , \k -> mkurl k (hashDirMixed def) + ] + -- Layouts with only 1 level of hash directory, + -- rather than the default 2. + , [ \k -> mkurl k (hashDirLower (HashLevels 1)) + , \k -> mkurl k (hashDirMixed (HashLevels 1)) + ] + ] + where + mkurl k hasher = baseurl P.</> fromRawFilePath (hasher k) P.</> kf k + kf k = fromRawFilePath (keyFile k) diff --git a/Remote/List.hs b/Remote/List.hs index db8c53feb5..f158141ca1 100644 --- a/Remote/List.hs +++ b/Remote/List.hs @@ -41,6 +41,7 @@ import qualified Remote.Tahoe import qualified Remote.Glacier import qualified Remote.Ddar import qualified Remote.GitLFS +import qualified Remote.Http import qualified Remote.Hook import qualified Remote.External @@ -65,6 +66,7 @@ remoteTypes = map adjustExportImportRemoteType , Remote.Glacier.remote , Remote.Ddar.remote , Remote.GitLFS.remote + , Remote.Http.remote , Remote.Hook.remote , Remote.External.remote ] diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 175fe12d77..306d7e552e 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1547,6 +1547,11 @@ Remotes are configured using these settings in `.git/config`. It is set to "true" if this is a git-lfs remote. +* `remote.<name>.annex-http` + + Used to identify http special remotes. + Normally this is automatically set up by `git annex initremote`. + * `remote.<name>.annex-externaltype` Used external special remotes to record the type of the remote. diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn index 91f86c93fb..3471fc76c6 100644 --- a/doc/special_remotes.mdwn +++ b/doc/special_remotes.mdwn @@ -21,7 +21,7 @@ the git history is not stored in them. 
* [[S3]] (Amazon S3, and other compatible services) * [[tahoe]] * [[tor]] -* [[web]] +* [[web]] and [[http]] * [[webdav]] * [[git]] * [[xmpp]] diff --git a/doc/special_remotes/http.mdwn b/doc/special_remotes/http.mdwn new file mode 100644 index 0000000000..70373b6483 --- /dev/null +++ b/doc/special_remotes/http.mdwn @@ -0,0 +1,28 @@ +This special remote allows downloading annexed objects from other remotes +that expose their content by http. Not to be confused with the [[web]] +special remote, this one is only useful in combination with some other +special remote. + +Suppose, for example, that you have a [[directory]] special remote. And the +directory happens to be published by a web server. (Or it could be a +[[rsync]] special remote, or many other kinds.) To let git-annex know that +the content of this special remote can also be accessed over http, set up +a http special remote. + + git annex initremote --sameas=foo foo-http type=http url=http://example.com/foo + +The --sameas parameter tells git-annex what other special remote this http +remote is accessing. (See [[tips/multiple_remotes_accessing_the_same_data_store]].) +Since the http remote is read-only, it can only be used to download content +that is stored in that other remote. + +This special remote is compatible with many, but not all, other special +remotes. If the special remote does something unusual with the name +a file is stored under, or with how the data is stored, it might not work. +See [[tips/multiple_remotes_accessing_the_same_data_store]] +for a list of known working combinations. + +## configuration + +* `url` - The http or https url to where the content is stored by the + other special remote. diff --git a/doc/special_remotes/web.mdwn b/doc/special_remotes/web.mdwn index 8bb4091704..43b282bb7f 100644 --- a/doc/special_remotes/web.mdwn +++ b/doc/special_remotes/web.mdwn @@ -1,4 +1,5 @@ -git-annex can use the WWW as a special remote, downloading urls to files. 
+git-annex can use the WWW as a special remote, associating an url with an +annexed file, and downloading the file content from the web. See [[tips/using_the_web_as_a_special_remote]] for usage examples. ## notes @@ -7,5 +8,6 @@ Currently git-annex only supports downloading content from the web; it cannot upload to it or remove content. This special remote uses urls on the web as the source for content. -git-annex can also download content from a normal git remote, accessible by -http. +There are several other ways http can be used to download annexed objects, +including a git remote accessible by http, S3 with a `publicurl` configured, +and the [[http]] special remote. diff --git a/doc/tips/multiple_remotes_accessing_the_same_data_store.mdwn b/doc/tips/multiple_remotes_accessing_the_same_data_store.mdwn index e3cda30e28..d0924871e0 100644 --- a/doc/tips/multiple_remotes_accessing_the_same_data_store.mdwn +++ b/doc/tips/multiple_remotes_accessing_the_same_data_store.mdwn @@ -55,4 +55,7 @@ If you find combinations that work, please edit this page to list them. ## known working combinations * directory and rsync +* http and directory +* http and rsync +* http and rclone (any layout except for frankencase) diff --git a/doc/todo/generic_readonly_http_remote.mdwn b/doc/todo/generic_readonly_http_remote.mdwn index e19351d56f..ba3a4c4acd 100644 --- a/doc/todo/generic_readonly_http_remote.mdwn +++ b/doc/todo/generic_readonly_http_remote.mdwn @@ -15,3 +15,5 @@ access of other special remotes whose data stores are exposed via http. Call it "http" maybe. (There may be some confusion between this and the web special remote by users looking for such a thing.) 
--[[Joey]] + +> http special remote implemented, [[done]] --[[Joey]] diff --git a/doc/todo/generic_readonly_http_remote/comment_5_bfbec043d803ff1aa649f2661f6918f4._comment b/doc/todo/generic_readonly_http_remote/comment_5_bfbec043d803ff1aa649f2661f6918f4._comment new file mode 100644 index 0000000000..7a56cba643 --- /dev/null +++ b/doc/todo/generic_readonly_http_remote/comment_5_bfbec043d803ff1aa649f2661f6918f4._comment @@ -0,0 +1,27 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2020-09-01T19:00:05Z" + content=""" +I'm implementing this with an automatic learning of the layout that a +special remote uses. + +It looks like it will probably be sufficiently fast +for it to remember only for the duration of the command, although it would +be possible to cache what it's learned in .git/config or something. + +Usually, the learning will add 6 extra http requests to learn +the most unlikely layout (hashDirMixed with 1 hash level, which I doubt +anything actually uses). Since the first file it tries to access is almost +certainly present in the special remote, it will then learn the layout and +keep using it with no added overhead. + +The unusual case would be if a lot of files are not present in the +remote any longer. Then it will fail to learn, on each file, and so will +always make 6 extra http requests per file processed. (Reusing the same +http connection at least.) This seems unusual enough to not worry about +remembering what it's learned for longer than a single run, or making the +layout explicitly configurable. Content would have to be dropped from the +other special remote and the git-annex branch not be synced up for it to +happen. 
+"""]] diff --git a/doc/todo/make_http_special_remote_support_exporttree_remotes.mdwn b/doc/todo/make_http_special_remote_support_exporttree_remotes.mdwn new file mode 100644 index 0000000000..3f09ad0dce --- /dev/null +++ b/doc/todo/make_http_special_remote_support_exporttree_remotes.mdwn @@ -0,0 +1,4 @@ +The http special remote doesn't currently support being used with a +--sameas remote that uses exporttree=yes. + +It seems like this should be fairly easy to implement. --[[Joey]] diff --git a/doc/todo/publicurl_config_for_all_special_remotes.mdwn b/doc/todo/publicurl_config_for_all_special_remotes.mdwn index 992c20fc30..7d275330c5 100644 --- a/doc/todo/publicurl_config_for_all_special_remotes.mdwn +++ b/doc/todo/publicurl_config_for_all_special_remotes.mdwn @@ -3,3 +3,6 @@ [[!meta author=yoh]] [[!tag projects/datalad]] [[!tag needsthought]] + +> [[done]] by implementing another design, not the one suggested here +> --[[Joey]] diff --git a/doc/todo/publicurl_config_for_all_special_remotes/comment_6_7f027aa4b5b495a93aee3cd3417862a8._comment b/doc/todo/publicurl_config_for_all_special_remotes/comment_6_7f027aa4b5b495a93aee3cd3417862a8._comment new file mode 100644 index 0000000000..a38c50478f --- /dev/null +++ b/doc/todo/publicurl_config_for_all_special_remotes/comment_6_7f027aa4b5b495a93aee3cd3417862a8._comment @@ -0,0 +1,22 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 6""" + date="2020-09-01T19:09:30Z" + content=""" +I've implemented the http special remote, that can be combined with other +special remotes to access them using anonymous http. + +I think that probably addresses this todo well enough to close it. +(Although I didn't get around to +[[todo/make_http_special_remote_support_exporttree_remotes]] yet, and this +todo mentions supporting exporttree. Should be easy to add later though.) 
+ +There are probably some special remotes that are unusual enough that the +http special remote can't support them, which it would make sense to add a +publicurl= config to, like S3 has. (Although I think S3 itself could now be +used with the http special remote so its option is vestigial now.) + +I guess that publicurl= config would best be added to the individual +special remote, so it doesn't need any particular support in git-annex to +add it. +"""]] diff --git a/git-annex.cabal b/git-annex.cabal index e22b875999..f81fcb4ff9 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -980,6 +980,7 @@ Executable git-annex Remote.Helper.ReadOnly Remote.Helper.Special Remote.Helper.Ssh + Remote.Http Remote.Hook Remote.List Remote.List.Util