From 5ef3f1e7039c11191efbb00545555fc46e38cd69 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sun, 28 Jul 2024 21:11:23 -0400 Subject: [PATCH 1/9] remove unused imports --- P2P/Http/Url.hs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/P2P/Http/Url.hs b/P2P/Http/Url.hs index b4d0a86264..09a8e56f9a 100644 --- a/P2P/Http/Url.hs +++ b/P2P/Http/Url.hs @@ -9,14 +9,9 @@ module P2P.Http.Url where -import Types.UUID -import Utility.FileSystemEncoding -import Utility.PartialPrelude - import Data.List import Network.URI import System.FilePath.Posix as P -import qualified Data.UUID as UUID #ifdef WITH_SERVANT import Servant.Client (BaseUrl(..), Scheme(..)) import Text.Read From 7ac8d36f388f8395d15dff94b23c95aecc3142cb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 09:11:27 -0400 Subject: [PATCH 2/9] idea --- doc/todo/git-annex_proxies.mdwn | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index fcec9843fe..54a37ca651 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -60,7 +60,15 @@ Planned schedule of work: time (not as proxied remotes), so that eg, every git-annex repository on a server can be served on the same port. -* Support proxying to git remotes that use annex+http urls. +* Support proxying to git remotes that use annex+http urls. This needs a + translation from P2P protocol to servant-client to P2P protocol. + +* Should be possible to use a git-remote-annex annex::$uuid url as + remote.foo.url with remote.foo.annexUrl using annex+http, and so + not need a separate web server to serve the git repository. Doesn't work + currently because git-remote-annex urls only support special remotes. + It would need a new form of git-remote-annex url, eg: + annex::$uuid?annex+http://example.com/git-annex/ * `git-annex p2phttp` could support systemd socket activation. This would allow making a systemd unit that listens on port 80. From c8e7231f48695a3ab9485a056e1a1386b642ded5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 09:48:06 -0400 Subject: [PATCH 3/9] add debugging of opening and closing connections to proxies --- P2P/Http/State.hs | 21 +++++++++++++++------ doc/todo/git-annex_proxies.mdwn | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/P2P/Http/State.hs b/P2P/Http/State.hs index 36f6d6ce69..0617064947 100644 --- a/P2P/Http/State.hs +++ b/P2P/Http/State.hs @@ -539,13 +539,20 @@ instance Show ProxyConnection where openedProxyConnection :: UUID + -> String -> Proxy.ProxySelector -> Annex () -> Proxy.ConcurrencyConfig - -> IO ProxyConnection -openedProxyConnection u selector closer concurrency = do - now <- getPOSIXTime - return $ ProxyConnection u selector closer concurrency now + -> Annex ProxyConnection +openedProxyConnection u desc selector closer concurrency = do + now <- liftIO getPOSIXTime + fastDebug "P2P.Http" ("Opened proxy connection to " ++ desc) + return $ ProxyConnection u selector closer' concurrency now + where + closer' = do + fastDebug "P2P.Http" ("Closing proxy connection to " ++ desc) + closer + fastDebug "P2P.Http" ("Closed proxy connection to " ++ desc) openProxyConnectionToRemote :: AnnexWorkerPool @@ -557,7 +564,8 @@ openProxyConnectionToRemote workerpool clientmaxversion bypass remote = inAnnexWorker' workerpool $ do remoteside <- proxyRemoteSide clientmaxversion bypass remote concurrencyconfig <- Proxy.noConcurrencyConfig - liftIO $ openedProxyConnection (Remote.uuid remote) + openedProxyConnection (Remote.uuid remote) + ("remote " ++ Remote.name remote) (Proxy.singleProxySelector remoteside) (Proxy.closeRemoteSide remoteside) concurrencyconfig @@ -576,7 +584,8 @@ openProxyConnectionToCluster workerpool clientmaxversion bypass clusteruuid conc (proxyselector, closenodes) <- clusterProxySelector clusteruuid clientmaxversion bypass concurrencyconfig <- Proxy.mkConcurrencyConfig concurrency - liftIO $ openedProxyConnection (fromClusterUUID clusteruuid) + openedProxyConnection (fromClusterUUID clusteruuid) + ("cluster " ++ fromUUID (fromClusterUUID clusteruuid)) proxyselector closenodes concurrencyconfig type ProxyConnectionPool = (Integer, M.Map ProxyConnectionPoolKey [ProxyConnection]) diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 54a37ca651..83bc358a16 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -32,7 +32,7 @@ Planned schedule of work: over http leaves open the connection to the cluster, so the next request opens another one. - So does an interrupted PUT directly to the proxied ; + So does an interrupted PUT directly to the proxied special remote over http. * When part of a file has been sent to a cluster via the http server, From 4f3ae966662c38080fcafb9d5bbee4dedce566b8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 10:33:26 -0400 Subject: [PATCH 4/9] cleanly close proxy connection on interrupted PUT An interrupted PUT to cluster that has a node that is a special remote over http left open the connection to the cluster, so the next request opens another one. So did an interrupted PUT directly to the proxied special remote over http. proxySpecialRemote was stuck waiting for all the DATA. Its connection remained open so it kept waiting. In servePut, checktooshort handles closing the P2P connection when too short a data is received from PUT. But, checktooshort was only called after the protoaction, which is what runs the proxy, which is what was getting stuck. Modified it to run as a background thread, which waits for the tooshortv to be written to, which gather always does once it gets to the end of the data received from the http client. That makes proxyConnection's releaseconn run once all data is received from the http client. Made it close the connection handles before waiting on the asyncworker thread. This lets proxySpecialRemote finish processing any data from the handle, and then it will give up, more or less cleanly, if it didn't receive enough data. I say "more or less cleanly" because with both sides of the P2P connection taken down, some protocol unhappyness results. Which can lead to some ugly debug messages. But also can cause the asyncworker thread to throw an exception. So made withP2PConnections not crash when it receives an exception from releaseconn. This did have a small change to the behavior of an interrupted PUT when proxying to a regular remote. proxyConnection has a protoerrorhandler that closes the proxy connection on a protocol error. But the proxy connection is also closed by checktooshort when it closes the P2P connection. Closing the same proxy connection twice is not a problem, it just results in duplicated debug messages about it. --- P2P/Http/Server.hs | 8 ++++---- P2P/Http/State.hs | 6 +++--- doc/todo/git-annex_proxies.mdwn | 7 ------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/P2P/Http/Server.hs b/P2P/Http/Server.hs index 044dd4f950..8bc0284db4 100644 --- a/P2P/Http/Server.hs +++ b/P2P/Http/Server.hs @@ -314,9 +314,9 @@ servePut st resultmangle su apiver (DataLength len) (B64Key k) cu bypass baf mof tooshortv <- liftIO newEmptyTMVarIO content <- liftIO $ S.unSourceT stream (gather validityv tooshortv) res <- withP2PConnection' apiver st cu su bypass sec auth WriteAction - (\cst -> cst { connectionWaitVar = False }) $ \conn -> + (\cst -> cst { connectionWaitVar = False }) $ \conn -> do + liftIO $ void $ async $ checktooshort conn tooshortv liftIO (protoaction conn content validitycheck) - `finally` checktooshort conn tooshortv case res of Right (Right (Just plusuuids)) -> return $ resultmangle $ PutResultPlus True (map B64UUID plusuuids) @@ -385,8 +385,8 @@ servePut st resultmangle su apiver (DataLength len) (B64Key k) cu bypass baf mof -- The connection can no longer be used when too short a DATA has -- been written to it. - checktooshort conn tooshortv = - liftIO $ whenM (atomically $ fromMaybe True <$> tryTakeTMVar tooshortv) $ + checktooshort conn tooshortv = do + liftIO $ whenM (atomically $ takeTMVar tooshortv) $ closeP2PConnection conn servePutOffset diff --git a/P2P/Http/State.hs b/P2P/Http/State.hs index 0617064947..f52614b6c8 100644 --- a/P2P/Http/State.hs +++ b/P2P/Http/State.hs @@ -220,7 +220,7 @@ withP2PConnections workerpool proxyconnectionpoolsize clusterconcurrency a = do >>= atomically . putTMVar respvar servicer myuuid myproxies proxypool reqv relv endv Left (Right releaseconn) -> do - releaseconn + void $ tryNonAsync releaseconn servicer myuuid myproxies proxypool reqv relv endv Left (Left ()) -> return () @@ -378,11 +378,11 @@ proxyConnection proxyconnectionpoolsize relv connparams workerpool proxypool pro liftIO $ runNetProto proxyfromclientrunst proxyfromclientconn $ P2P.net P2P.receiveMessage - let releaseconn returntopool = + let releaseconn returntopool = do atomically $ void $ tryPutTMVar relv $ do - r <- liftIO $ wait asyncworker liftIO $ closeConnection proxyfromclientconn liftIO $ closeConnection clientconn + r <- liftIO $ wait asyncworker if returntopool then liftIO $ do now <- getPOSIXTime diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 83bc358a16..2e679f1a9d 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -28,13 +28,6 @@ Planned schedule of work: ## work notes -* An interrupted PUT to cluster that has a node that is a special remote - over http leaves open the connection to the cluster, so the next request - opens another one. - - So does an interrupted PUT directly to the proxied - special remote over http. - * When part of a file has been sent to a cluster via the http server, the transfer interrupted, and another node is added to the cluster, and the transfer of the file performed again, there is a failure From 6af44b9de6168cf50128dfaf2c8400df961d2b70 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 10:52:37 -0400 Subject: [PATCH 5/9] p2phttp remotes are not readonly That prevented testremote from working when remote.name.url = http://.. --- Remote/Git.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/Git.hs b/Remote/Git.hs index 90fb301c34..f59ef528b6 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -227,7 +227,7 @@ gen r u rc gc rs , localpath = localpathCalc r , getRepo = getRepoFromState st , gitconfig = gc - , readonly = Git.repoIsHttp r + , readonly = Git.repoIsHttp r && not (isP2PHttp' gc) , appendonly = False , untrustworthy = False , availability = repoAvail r From 7402ae61d9cb0fc5b09ea05a9cad8ed7a886490b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 11:03:18 -0400 Subject: [PATCH 6/9] fix reversion in GET from proxy over http 4f3ae966662c38080fcafb9d5bbee4dedce566b8 caused a hang in GET, which git-annex testremote could reliably cause. The problem is that closing both P2P handles before waiting on the asyncworker prevents all the DATA from getting sent. The solution is to only close the P2P handles early when the P2PConnection is being closed. When it's being released, let the asyncworker finish. closeP2PConnection is called in GET when it was unable to send all data, and in PUT when it did not receive all the data, and in both cases closing the P2P handles early is ok. --- P2P/Http/State.hs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/P2P/Http/State.hs b/P2P/Http/State.hs index f52614b6c8..7e43df81de 100644 --- a/P2P/Http/State.hs +++ b/P2P/Http/State.hs @@ -378,12 +378,18 @@ proxyConnection proxyconnectionpoolsize relv connparams workerpool proxypool pro liftIO $ runNetProto proxyfromclientrunst proxyfromclientconn $ P2P.net P2P.receiveMessage - let releaseconn returntopool = do + let closebothsides = do + liftIO $ closeConnection proxyfromclientconn + liftIO $ closeConnection clientconn + + let releaseconn connstillusable = do atomically $ void $ tryPutTMVar relv $ do - liftIO $ closeConnection proxyfromclientconn - liftIO $ closeConnection clientconn + unless connstillusable + closebothsides r <- liftIO $ wait asyncworker - if returntopool + when connstillusable + closebothsides + if connstillusable then liftIO $ do now <- getPOSIXTime evicted <- atomically $ putProxyConnectionPool proxypool proxyconnectionpoolsize connparams $ From 0dc064a9ad6e1d6abbc84e71d8e7007d042b1f4d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 11:18:03 -0400 Subject: [PATCH 7/9] When proxying for a special remote, avoid unncessary hashing Like the comment says, the client will do its own verification. But it was calling verifyKeyContentPostRetrieval, which was hashing the file. --- Annex/Proxy.hs | 8 +------- CHANGELOG | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Annex/Proxy.hs b/Annex/Proxy.hs index 854ce289e2..16d4fc2cdd 100644 --- a/Annex/Proxy.hs +++ b/Annex/Proxy.hs @@ -16,7 +16,6 @@ import P2P.Proxy import P2P.Protocol import P2P.IO import Remote.Helper.Ssh (openP2PShellConnection', closeP2PShellConnection) -import Annex.Content import Annex.Concurrent import Annex.Tmp import Logs.Proxy @@ -209,12 +208,7 @@ proxySpecialRemote protoversion r ihdl ohdl owaitv oclosedv = go -- because the client will do its own verification. let vc = Remote.NoVerify tryNonAsync (Remote.retrieveKeyFile r k af (fromRawFilePath tmpfile) nullMeterUpdate vc) >>= \case - Right v -> - ifM (verifyKeyContentPostRetrieval Remote.RetrievalVerifiableKeysSecure vc v k tmpfile) - ( liftIO $ senddata offset tmpfile - , liftIO $ sendmessage $ - ERROR "verification of content failed" - ) + Right _ -> liftIO $ senddata offset tmpfile Left err -> liftIO $ propagateerror err senddata (Offset offset) f = do diff --git a/CHANGELOG b/CHANGELOG index db33d0ba37..b84c2e9f29 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -13,6 +13,7 @@ git-annex (10.20240702) UNRELEASED; urgency=medium * Avoid potential data loss in situations where git-annex-shell or git-annex remotedaemon is killed while locking a key to prevent its removal. + * When proxying for a special remote, avoid unncessary hashing. * Added a dependency on clock. * Propagate --force to git-annex transferrer. From 60b1c53df56d94a0d8df7204c170911363c818f9 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 11:22:27 -0400 Subject: [PATCH 8/9] preparing to merge --- CHANGELOG | 2 +- doc/tips/smart_http_server.mdwn | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b84c2e9f29..36397dbfa9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -git-annex (10.20240702) UNRELEASED; urgency=medium +git-annex (10.20240731) UNRELEASED; urgency=medium * New HTTP API that is equivilant to the P2P protocol. * annex+http and annex+https urls can be configured for diff --git a/doc/tips/smart_http_server.mdwn b/doc/tips/smart_http_server.mdwn index d7a56d0596..54264e3499 100644 --- a/doc/tips/smart_http_server.mdwn +++ b/doc/tips/smart_http_server.mdwn @@ -3,8 +3,8 @@ as shown in the tip [[setup_a_public_repository_on_a_web_site]]. That's limited to basic read-only repository access though. Git has [smart HTTP](https://git-scm.com/book/en/v2/Git-on-the-Server-Smart-HTTP) -that can be used to allow pushes over http. And git-annex has an -equivilant, the [[git annex-p2phttp command|/git-annex-p2phttp]]. +that can be used to allow pushes over http. And git-annex +has the [[git annex-p2phttp command|/git-annex-p2phttp]]. As well as allowing write access to authorized users over http, `git-annex p2phttp` also allows accessing [[clusters]], and other proxied @@ -37,3 +37,7 @@ In the git config file of the repository, set `annex.url` to the "annex+http" (or "annex+https") url. The first time it uses a http remote, git-annex downloads its git config file, and sets `remote.name.annexUrl` to the value of the remote's `annex.url`. + +Support for this first appeared in git-annex version 10.20240731. Users of +older git-annex won't be able to use the smart http server, but can still +clone the repository from the dumb http server. From 6f20085a601a5ece5a308ad5c7567f1be8cde1cc Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 29 Jul 2024 11:25:07 -0400 Subject: [PATCH 9/9] update --- doc/todo/git-annex_proxies.mdwn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 2e679f1a9d..7d2af32c0f 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -45,7 +45,10 @@ Planned schedule of work: special remote. This violates a usual invariant that any data being received into a repository gets verified in passing. Although on the other hand, when sending data to a special remote normally, there is also - no verification. + no verification. On the third hand, a p2p http proxy (or for that matter + a ssh server) may have users who are allowed to store objects, but are + not really trusted, and if they can upload garbage without verification, + that could be bad. ## items deferred until later for p2p protocol over http