git-annex/Annex/Cluster.hs

159 lines
5.5 KiB
Haskell
Raw Normal View History

2024-06-18 14:36:04 +00:00
{- clusters
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes, OverloadedStrings #-}
2024-06-18 14:36:04 +00:00
module Annex.Cluster where
import Annex.Common
import qualified Annex
2024-06-18 14:36:04 +00:00
import Types.Cluster
import Logs.Cluster
import P2P.Proxy
import P2P.Protocol
import P2P.IO
import Annex.Proxy
import Annex.UUID
2024-06-18 14:36:04 +00:00
import Logs.Location
import Logs.PreferredContent
2024-06-18 14:36:04 +00:00
import Types.Command
import Remote.List
import qualified Remote
import qualified Types.Remote as Remote
2024-06-18 14:36:04 +00:00
import qualified Data.Map as M
import qualified Data.Set as S
{- Proxy to a cluster. -}
proxyCluster
:: ClusterUUID
-> CommandPerform
-> ServerMode
-> ClientSide
-> (forall a. ((a -> CommandPerform) -> Annex (Either ProtoFailure a) -> CommandPerform))
-> CommandPerform
proxyCluster clusteruuid proxydone servermode clientside protoerrhandler = do
getClientProtocolVersion (fromClusterUUID clusteruuid) clientside
withclientversion protoerrhandler
where
proxymethods = ProxyMethods
{ removedContent = \u k -> logChange k u InfoMissing
, addedContent = \u k -> logChange k u InfoPresent
}
withclientversion (Just (clientmaxversion, othermsg)) = do
-- The protocol versions supported by the nodes are not
-- known at this point, and would be too expensive to
-- determine. Instead, pick the newest protocol version
don't sync with cluster nodes by default Avoid `git-annex sync --content` etc from operating on cluster nodes by default since syncing with a cluster implicitly syncs with its nodes. This avoids a lot of unncessary work when a cluster has a lot of nodes just in checking if each node's preferred content is satisfied. And it avoids content being sent to nodes individually, so instead syncing with clusters always fanout uploads to nodes. The downside is that there are situations where a cluster's preferred content settings can be met, but those of its nodes are not. Or where a node does not contain a key, but the cluster does, and there are not enough copies of the key yet, so it would be desirable the send it there. I think that's an acceptable tradeoff. These kind of situations are ones where the cluster itself should probably be responsible for copying content to the node. Which it can do much less expensively than a client can. Part of the balanced preferred content design that I will be working on in a couple of months involves rebalancing clusters, so I expect to revisit this. The use of annex-sync config does allow running git-annex sync with a specific node, or nodes, and it will sync with it. And it's also possible to set annex-sync git configs to make it sync with a node by default. (Although that will require setting up an explicit git remote for the node rather than relying on the proxied remote.) Logs.Cluster.Basic is needed because Remote.Git cannot import Logs.Cluster due to a cycle. And the Annex.Startup load of clusters happens too late for Remote.Git to use that. This does mean one redundant load of the cluster log, though only when there is a proxy.
2024-06-25 14:06:28 +00:00
-- that we and the client both speak. The proxy code
-- checks protocol versions when operating on multiple
-- nodes, and allows nodes to have different protocol
-- versions.
2024-06-18 14:36:04 +00:00
let protocolversion = min maxProtocolVersion clientmaxversion
sendClientProtocolVersion clientside othermsg protocolversion
(getclientbypass protocolversion) protoerrhandler
withclientversion Nothing = proxydone
getclientbypass protocolversion othermsg =
getClientBypass clientside protocolversion othermsg
(withclientbypass protocolversion) protoerrhandler
withclientbypass protocolversion (bypassuuids, othermsg) = do
selectnode <- clusterProxySelector clusteruuid protocolversion bypassuuids
2024-06-25 18:52:47 +00:00
concurrencyconfig <- getConcurrencyConfig
proxy proxydone proxymethods servermode clientside
(fromClusterUUID clusteruuid)
2024-06-25 18:52:47 +00:00
selectnode concurrencyconfig protocolversion
othermsg protoerrhandler
2024-06-18 14:36:04 +00:00
clusterProxySelector :: ClusterUUID -> ProtocolVersion -> Bypass -> Annex ProxySelector
clusterProxySelector clusteruuid protocolversion (Bypass bypass) = do
nodeuuids <- (fromMaybe S.empty . M.lookup clusteruuid . clusterUUIDs)
2024-06-18 14:36:04 +00:00
<$> getClusters
myclusters <- annexClusters <$> Annex.getGitConfig
allremotes <- remoteList
hereu <- getUUID
let bypass' = S.insert hereu bypass
let clusterremotes = filter (isnode bypass' allremotes nodeuuids myclusters) allremotes
fastDebug "Annex.Cluster" $ unwords
[ "cluster gateway at", fromUUID hereu
, "connecting to", show (map Remote.name clusterremotes)
, "bypass", show (S.toList bypass)
]
nodes <- mapM (proxySshRemoteSide protocolversion (Bypass bypass')) clusterremotes
2024-06-18 14:36:04 +00:00
return $ ProxySelector
{ proxyCHECKPRESENT = nodecontaining nodes
, proxyGET = nodecontaining nodes
-- The key is sent to multiple nodes at the same time,
-- skipping nodes where it's known/expected to already be
-- present to avoid needing to connect to those, and
-- skipping nodes where it's not preferred content.
, proxyPUT = \af k -> do
locs <- S.fromList <$> loggedLocations k
let l = filter (flip S.notMember locs . remoteUUID) nodes
l' <- filterM (\n -> isPreferredContent (Just (remoteUUID n)) mempty (Just k) af True) l
-- PUT to no nodes doesn't work, so fall
-- back to all nodes.
return $ nonempty [l', l] nodes
-- Remove the key from every node that contains it.
-- But, since it's possible the location log for some nodes
-- could be out of date, actually try to remove from every
-- node.
, proxyREMOVE = const (pure nodes)
-- Content is not locked on the cluster as a whole,
-- instead it can be locked on individual nodes that are
-- proxied to the client.
, proxyLOCKCONTENT = const (pure Nothing)
, proxyUNLOCKCONTENT = pure Nothing
2024-06-18 14:36:04 +00:00
}
where
-- Nodes of the cluster have remote.name.annex-cluster-node
-- containing its name.
--
-- Or, a node can be the cluster proxied by another gateway.
isnode bypass' rs nodeuuids myclusters r =
case remoteAnnexClusterNode (Remote.gitconfig r) of
Just names
| any (isclustername myclusters) names ->
flip S.member nodeuuids $
ClusterNodeUUID $ Remote.uuid r
| otherwise -> False
Nothing -> isclusterviagateway bypass' rs r
-- Is this remote the same cluster, proxied via another gateway?
--
-- Must avoid bypassed gateways to prevent cycles.
isclusterviagateway bypass' rs r =
case mkClusterUUID (Remote.uuid r) of
Just cu | cu == clusteruuid ->
case remoteAnnexProxiedBy (Remote.gitconfig r) of
Just proxyuuid | proxyuuid `S.notMember` bypass' ->
not $ null $
filter isclustergateway $
filter (\p -> Remote.uuid p == proxyuuid) rs
_ -> False
_ -> False
isclustergateway r = any (== clusteruuid) $
remoteAnnexClusterGateway $ Remote.gitconfig r
isclustername myclusters name =
M.lookup name myclusters == Just clusteruuid
nodecontaining nodes k = do
locs <- S.fromList <$> loggedLocations k
case filter (flip S.member locs . remoteUUID) nodes of
-- For now, pick the first node that has the
-- content. Load balancing would be nice..
(r:_) -> return (Just r)
[] -> return Nothing
nonempty (l:ls) fallback
| null l = nonempty ls fallback
| otherwise = l
nonempty [] fallback = fallback