add --clusterjobs option and default to 1

The default of 1 is not ideal at all, but it avoids an accidental M*N (--jobs times --clusterjobs) causing so much concurrency that it becomes unusable.
2024-07-28 10:36:22 -04:00 · 2024-07-28 10:36:22 -04:00 · fbbedae497
commit fbbedae497
parent 1259ad89b6
8 changed files with 60 additions and 30 deletions
--- a/Annex/Cluster.hs
+++ b/Annex/Cluster.hs
@ -58,10 +58,11 @@ proxyCluster clusteruuid proxydone servermode clientside protoerrhandler = do
 			(withclientbypass protocolversion) (protoerrhandler noop)
 	withclientbypass protocolversion (bypassuuids, othermsg) = do
-		(selectnode, closenodes, concurrencyconfig) <-
+		(selectnode, closenodes) <-
 			clusterProxySelector clusteruuid
 				protocolversion bypassuuids
 		proxystate <- liftIO mkProxyState
 		concurrencyconfig <- concurrencyConfigJobs
 		let proxyparams = ProxyParams
 			{ proxyMethods = mkProxyMethods
 			, proxyState = proxystate
@ -79,7 +80,7 @@ clusterProxySelector
 	:: ClusterUUID
 	-> ProtocolVersion
 	-> Bypass
-	-> Annex (ProxySelector, Annex (), ConcurrencyConfig)
+	-> Annex (ProxySelector, Annex ())
 clusterProxySelector clusteruuid protocolversion (Bypass bypass) = do
 	nodeuuids <- (fromMaybe S.empty . M.lookup clusteruuid . clusterUUIDs)
 		<$> getClusters
@ -120,8 +121,7 @@ clusterProxySelector clusteruuid protocolversion (Bypass bypass) = do
 		-- proxied to the client.
 		, proxyLOCKCONTENT = const (pure Nothing)
 		}
-	concurrencyconfig <- getConcurrencyConfig
+	return (proxyselector, closenodes)
 	return (proxyselector, closenodes, concurrencyconfig)
  where
 	-- Nodes of the cluster have remote.name.annex-cluster-node
 	-- containing its name. 
--- a/Command/P2PHttp.hs
+++ b/Command/P2PHttp.hs
@ -41,6 +41,7 @@ data Options = Options
 	, unauthAppendOnlyOption :: Bool
 	, wideOpenOption :: Bool
 	, proxyConnectionsOption :: Maybe Integer
 	, clusterJobsOption :: Maybe Int
 	}
 optParser :: CmdParamsDesc -> Parser Options
@ -89,10 +90,16 @@ optParser _ = Options
 		( long "proxyconnections" <> metavar paramNumber
 		<> help "maximum number of idle connections when proxying"
 		))
 	<*> optional (option auto
 		( long "clusterjobs" <> metavar paramNumber
 		<> help "number of concurrent node accesses per connection"
 		))
 seek :: Options -> CommandSeek
 seek o = getAnnexWorkerPool $ \workerpool ->
-	withP2PConnections workerpool (fromMaybe 1 $ proxyConnectionsOption o)
+	withP2PConnections workerpool
 		(fromMaybe 1 $ proxyConnectionsOption o)
 		(fromMaybe 1 $ clusterJobsOption o)
 		(go workerpool)
  where
 	go workerpool acquireconn = liftIO $ do
--- a/P2P/Http/State.hs
+++ b/P2P/Http/State.hs
@ -181,9 +181,10 @@ type AcquireP2PConnection
 withP2PConnections
 	:: AnnexWorkerPool
 	-> ProxyConnectionPoolSize
 	-> ClusterConcurrency
 	-> (AcquireP2PConnection -> Annex a)
 	-> Annex a
-withP2PConnections workerpool proxyconnectionpoolsize a = do
+withP2PConnections workerpool proxyconnectionpoolsize clusterconcurrency a = do
 	myuuid <- getUUID
 	reqv <- liftIO newEmptyTMVarIO
 	relv <- liftIO newEmptyTMVarIO
@ -241,7 +242,7 @@ withP2PConnections workerpool proxyconnectionpoolsize a = do
 			Right (Right (Left clusteruuid)) -> proxyconnection $
 				openProxyConnectionToCluster workerpool
 					(connectionProtocolVersion connparams)
-					bypass clusteruuid
+					bypass clusteruuid clusterconcurrency
 			Left ex -> return $ Left $
 				ConnectionFailed $ show ex
 	  where
@ -557,16 +558,20 @@ openProxyConnectionToRemote workerpool clientmaxversion bypass remote =
 			(Proxy.closeRemoteSide remoteside)
 			concurrencyconfig
 type ClusterConcurrency = Int
 openProxyConnectionToCluster
 	:: AnnexWorkerPool
 	-> P2P.ProtocolVersion
 	-> P2P.Bypass
 	-> ClusterUUID
 	-> ClusterConcurrency
 	-> IO (Either SomeException ProxyConnection)
-openProxyConnectionToCluster workerpool clientmaxversion bypass clusteruuid =
+openProxyConnectionToCluster workerpool clientmaxversion bypass clusteruuid concurrency =
 	inAnnexWorker' workerpool $ do
-		(proxyselector, closenodes, concurrencyconfig) <-
+		(proxyselector, closenodes) <-
 			clusterProxySelector clusteruuid clientmaxversion bypass
 		concurrencyconfig <- Proxy.mkConcurrencyConfig concurrency
 		liftIO $ openedProxyConnection (fromClusterUUID clusteruuid)
 			proxyselector closenodes concurrencyconfig
--- a/P2P/Proxy.hs
+++ b/P2P/Proxy.hs
@ -659,10 +659,13 @@ proxyRequest proxydone proxyparams requestcomplete requestmessage protoerrhandle
 data ConcurrencyConfig = ConcurrencyConfig Int (MSem.MSem Int)
 noConcurrencyConfig :: Annex ConcurrencyConfig
-noConcurrencyConfig = liftIO $ ConcurrencyConfig 1 <$> MSem.new 1
+noConcurrencyConfig = mkConcurrencyConfig 1
-getConcurrencyConfig :: Annex ConcurrencyConfig
+mkConcurrencyConfig :: Int -> Annex ConcurrencyConfig
-getConcurrencyConfig = (annexJobs <$> Annex.getGitConfig) >>= \case
+mkConcurrencyConfig n = liftIO $ ConcurrencyConfig n <$> MSem.new n
 concurrencyConfigJobs :: Annex ConcurrencyConfig
 concurrencyConfigJobs = (annexJobs <$> Annex.getGitConfig) >>= \case
 	NonConcurrent -> noConcurrencyConfig
 	Concurrent n -> go n
 	ConcurrentPerCpu -> go =<< liftIO getNumProcessors
@ -672,8 +675,7 @@ getConcurrencyConfig = (annexJobs <$> Annex.getGitConfig) >>= \case
 		when (n > c) $
 			liftIO $ setNumCapabilities n
 		setConcurrency (ConcurrencyGitConfig (Concurrent n))
-		msem <- liftIO $ MSem.new n
+		mkConcurrencyConfig n
 		return (ConcurrencyConfig n msem)
 forMC :: ConcurrencyConfig -> [a] -> (a -> Annex b) -> Annex [b]
 forMC _ (x:[]) a = do
--- a/doc/git-annex-p2phttp.mdwn
+++ b/doc/git-annex-p2phttp.mdwn
@ -32,21 +32,37 @@ convenient way to download the content of any key, by using the path
 * `--jobs=N` `-JN`
  This or annex.jobs must be set to configure the number of worker
-  threads.
+  threads that serve connections to the webserver.
-  Since the webserver itself uses one thread, this needs to be set to
+  Since the webserver itself also uses one of these threads, 
-  2 or more.
+  this needs to be set to 2 or more.
-  A good choice is one worker per CPU core: `--jobs=cpus`
+  A good choice is often one worker per CPU core: `--jobs=cpus`
 * `--proxyconnections=N`
-  When is command is run in a repository that is configured to act as a
+  When this command is run in a repository that is configured to act as a
  proxy for some of its remotes, this is the maximum number of idle
  connections to keep open to proxied remotes.
  The default is 1.
 * `--clusterjobs=N`
  When this command is run in a repository that is a gateway for a cluster,
  this is the number of concurrent jobs to use to access nodes of the
  cluster, per connection to the webserver.
  The default is 1.
  A good choice for this will be a balance between the number of nodes
  in the cluster and the value of `--jobs`.
  For example, if the cluster has 4 nodes, and `--jobs=4`, using
  `--clusterjobs=4` will make all nodes in the cluster be accessed
 concurrently, which is often optimal. But since that concurrency applies to
 each connection, around 20 cores can be needed when the webserver is busy.
 * `--port=N`
  Port to listen on. The default is port 9417, which is the default
@ -122,6 +138,10 @@ git-http-backend(1)
 [[git-annex-updateproxy]](1)
 [[git-annex-initcluster]](1)
 [[git-annex-updatecluster]](1)
 <https://git-annex.branchable.com/design/p2p_protocol_over_http/>
 # AUTHOR
--- a/doc/git-annex-updateproxy.mdwn
+++ b/doc/git-annex-updateproxy.mdwn
@ -26,7 +26,7 @@ it. Then after pulling from "work", git-annex will know about an
 additional remote, "work-foo". That remote will be accessed using "work" as
 a proxy.
-Proxies can only be accessed via ssh.
+Proxies can only be accessed via ssh or by an annex+http url.
 # OPTIONS
--- a/doc/tips/clusters.mdwn
+++ b/doc/tips/clusters.mdwn
@ -12,8 +12,8 @@ special remotes.
 ## using a cluster
 To use a cluster, your repository needs to have its gateway configured as a
-remote. Clusters can currently only be accessed via ssh. This gateway
+remote. Clusters can currently only be accessed via ssh or by an annex+http
-remote is added the same as any other git remote:
+url. This gateway remote is added the same as any other git remote:
    $ git remote add bigserver me@bigserver:annex
--- a/doc/todo/git-annex_proxies.mdwn
+++ b/doc/todo/git-annex_proxies.mdwn
@ -32,11 +32,6 @@ Planned schedule of work:
 * git-annex testremote cluster
 * Support proxying to git remotes using annex+http urls.
  (Current documentation says proxying only works with ssh remotes,
  so current state is not confusing, but this still needs to be done
  eventually.)
 ## completed items for July's work on p2p protocol over http
 * HTTP P2P protocol design [[design/p2p_protocol_over_http]].
@ -53,14 +48,15 @@ Planned schedule of work:
 * Make http server support proxying.
 * Make http server support serving a cluster.
 ## items deferred until later for p2p protocol over http
 * Support proxying to git remotes that use annex+http urls.
 * `git-annex p2phttp` could support systemd socket activation. This would
  allow making a systemd unit that listens on port 80.
 * `git-annex p2phttp` could serve `.git/annex/p2phttp/.well-known/`, 
  allowing it to be used by an ACME client to get certificates.
 ## items deferred until later for [[design/passthrough_proxy]]
 * Check annex.diskreserve when proxying for special remotes