From fb0fd7848589016a840d56be560f6580b7155d17 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 18 Jun 2024 11:37:38 -0400 Subject: [PATCH] only use a remote as a node when git configuration is set Avoids someone writing to cluster.log and nominating remotes of someone else's repository as a cluster. --- Annex/Cluster.hs | 20 +++++++- doc/design/passthrough_proxy.mdwn | 76 +++++++++++++++++-------------- 2 files changed, 59 insertions(+), 37 deletions(-) diff --git a/Annex/Cluster.hs b/Annex/Cluster.hs index b832675aa0..4d9552ac77 100644 --- a/Annex/Cluster.hs +++ b/Annex/Cluster.hs @@ -10,6 +10,7 @@ module Annex.Cluster where import Annex.Common +import qualified Annex import Types.Cluster import Logs.Cluster import P2P.Proxy @@ -20,6 +21,7 @@ import Logs.Location import Types.Command import Remote.List import qualified Remote +import qualified Types.Remote as Remote import qualified Data.Map as M import qualified Data.Set as S @@ -56,8 +58,8 @@ clusterProxySelector :: ClusterUUID -> ProtocolVersion -> Annex ProxySelector clusterProxySelector clusteruuid protocolversion = do nodes <- (fromMaybe S.empty . M.lookup clusteruuid . clusterUUIDs) <$> getClusters - remotes <- filter (flip S.member nodes . ClusterNodeUUID . Remote.uuid) - <$> remoteList + clusternames <- annexClusters <$> Annex.getGitConfig + remotes <- filter (isnode nodes clusternames) <$> remoteList remotesides <- mapM (proxySshRemoteSide protocolversion) remotes return $ ProxySelector { proxyCHECKPRESENT = nodecontaining remotesides @@ -71,6 +73,20 @@ clusterProxySelector clusteruuid protocolversion = do , proxyUNLOCKCONTENT = pure Nothing } where + -- Nodes of the cluster have remote.name.annex-cluster-node + -- containing its name. 
+ isnode nodes clusternames r = + case remoteAnnexClusterNode (Remote.gitconfig r) of + Nothing -> False + Just names + | any (isclustername clusternames) names -> + flip S.member nodes $ + ClusterNodeUUID $ Remote.uuid r + | otherwise -> False + + isclustername clusternames name = + M.lookup name clusternames == Just clusteruuid + nodecontaining remotesides k = do locs <- S.fromList <$> loggedLocations k case filter (flip S.member locs . remoteUUID) remotesides of diff --git a/doc/design/passthrough_proxy.mdwn b/doc/design/passthrough_proxy.mdwn index 87c065957c..3c5c357997 100644 --- a/doc/design/passthrough_proxy.mdwn +++ b/doc/design/passthrough_proxy.mdwn @@ -151,41 +151,6 @@ for any number of git remotes. Which might be obnoxious. Ah, instead git-annex's tab completion can be made to include instantiated remotes, no need to list them in git config. -## single upload with fanout - -If we want to send a file to multiple repositories that are behind the same -proxy, it would be wasteful to upload it through the proxy repeatedly. - -Perhaps a good user interface to this is `git-annex copy --to proxy`. -The proxy could fan out the upload and store it in one or more nodes behind -it. Using preferred content to select which nodes to use. -This would need `storeKey` to be changed to allow returning a UUID (or UUIDs) -where the content was actually stored. - -Alternatively, `git-annex copy --to proxy-foo` could notice that proxy-bar -also wants the content, and fan out a copy to there. Then it could -record in its git-annex branch that the content is present in proxy-bar. -If the user later does `git-annex copy --to proxy-bar`, it would avoid -another upload (and the user would learn at that point that it was in -proxy-bar). This avoids needing to change the `storeKey` interface. - -Should a proxy always fanout? if `git-annex copy --to proxy` is what does -fanout, and `git-annex copy --to proxy-foo` doesn't, then the user has -content. 
But if the latter does fanout, that might be annoying to users who -want to use proxies, but want full control over what lands where, and don't -want to use preferred content to do it. So probably fanout should be -configurable. But it can't be configured client side, because the fanout -happens on the proxy. Seems like remote.name.annex-fanout could be set to -false to prevent fanout to a specific remote. (This is analagous to a -remote having `git-annex assistant` running on it, it might fan out uploads -to it to other repos, and only the owner of that repo can control it.) - -A command like `git-annex push` would see all the instantiated remotes and -would pick ones to send content to. If the proxy does fanout, this would -lead to `git-annex push` doing extra work iterating over instantiated -remotes that have already received content via fanout. Could this extra -work be avoided? - ## clusters One way to use a proxy is just as a convenient way to access a group of @@ -281,6 +246,43 @@ cluster UUIDs. No other protocol extensions or special cases should be needed. +## single upload with fanout + +If we want to send a file to multiple repositories that are behind the same +proxy, it would be wasteful to upload it through the proxy repeatedly. + +Perhaps a good user interface to this is `git-annex copy --to proxy`. +The proxy could fan out the upload and store it in one or more nodes behind +it. Using preferred content to select which nodes to use. +This would need `storeKey` to be changed to allow returning a UUID (or UUIDs) +where the content was actually stored. + +Alternatively, `git-annex copy --to proxy-foo` could notice that proxy-bar +also wants the content, and fan out a copy to there. Then it could +record in its git-annex branch that the content is present in proxy-bar. +If the user later does `git-annex copy --to proxy-bar`, it would avoid +another upload (and the user would learn at that point that it was in +proxy-bar). 
This avoids needing to change the `storeKey` interface. + +Should a proxy always fanout? if `git-annex copy --to proxy` is what does +fanout, and `git-annex copy --to proxy-foo` doesn't, then the user has +content. But if the latter does fanout, that might be annoying to users who +want to use proxies, but want full control over what lands where, and don't +want to use preferred content to do it. So probably fanout should be +configurable. But it can't be configured client side, because the fanout +happens on the proxy. Seems like remote.name.annex-fanout could be set to +false to prevent fanout to a specific remote. (This is analogous to a +remote having `git-annex assistant` running on it, it might fan out uploads +to it to other repos, and only the owner of that repo can control it.) + +Alternatively, fanout could be limited to clusters. + +A command like `git-annex push` would see all the instantiated remotes and +would pick ones to send content to. If fanout is done, this would +lead to `git-annex push` doing extra work iterating over instantiated +remotes that have already received content via fanout. Could this extra +work be avoided? + ## cluster configuration lockdown If some organization is running a cluster, and giving others access to it, @@ -302,6 +304,10 @@ to lock down the proxy configuration. Of course, someone with access to a cluster can also drop all data from it! Unless git-annex-shell is run with `GIT_ANNEX_SHELL_APPENDONLY` set. +A remote will only be treated as a node of a cluster when the git +configuration remote.name.annex-cluster-node is set, which will prevent +creating clusters in places where they are not intended to be. + ## speed A passthrough proxy should be as fast as possible so as not to add overhead