2024-06-18 14:36:04 +00:00
|
|
|
{- clusters
|
|
|
|
-
|
|
|
|
- Copyright 2024 Joey Hess <id@joeyh.name>
|
|
|
|
-
|
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
2024-06-27 16:20:22 +00:00
|
|
|
{-# LANGUAGE RankNTypes, OverloadedStrings #-}
|
2024-06-18 14:36:04 +00:00
|
|
|
|
|
|
|
module Annex.Cluster where
|
|
|
|
|
|
|
|
import Annex.Common
|
2024-06-18 15:37:38 +00:00
|
|
|
import qualified Annex
|
2024-06-18 14:36:04 +00:00
|
|
|
import Types.Cluster
|
|
|
|
import Logs.Cluster
|
|
|
|
import P2P.Proxy
|
|
|
|
import P2P.Protocol
|
|
|
|
import P2P.IO
|
2024-06-18 15:01:10 +00:00
|
|
|
import Annex.Proxy
|
2024-06-27 16:20:22 +00:00
|
|
|
import Annex.UUID
|
2024-07-28 16:39:42 +00:00
|
|
|
import Annex.BranchState
|
2024-06-18 14:36:04 +00:00
|
|
|
import Logs.Location
|
2024-06-25 15:35:41 +00:00
|
|
|
import Logs.PreferredContent
|
2024-06-18 14:36:04 +00:00
|
|
|
import Types.Command
|
|
|
|
import Remote.List
|
|
|
|
import qualified Remote
|
2024-06-18 15:37:38 +00:00
|
|
|
import qualified Types.Remote as Remote
|
2024-06-18 14:36:04 +00:00
|
|
|
|
|
|
|
import qualified Data.Map as M
|
|
|
|
import qualified Data.Set as S
|
2024-06-27 18:36:55 +00:00
|
|
|
import System.Random
|
2024-06-18 14:36:04 +00:00
|
|
|
|
|
|
|
{- Proxy to a cluster. -}
|
|
|
|
proxyCluster
|
|
|
|
:: ClusterUUID
|
|
|
|
-> CommandPerform
|
|
|
|
-> ServerMode
|
|
|
|
-> ClientSide
|
2024-06-28 17:19:57 +00:00
|
|
|
-> (forall a. Annex () -> ((a -> CommandPerform) -> Annex (Either ProtoFailure a) -> CommandPerform))
|
2024-06-18 14:36:04 +00:00
|
|
|
-> CommandPerform
|
|
|
|
proxyCluster clusteruuid proxydone servermode clientside protoerrhandler = do
|
2024-07-28 16:39:42 +00:00
|
|
|
enableInteractiveBranchAccess
|
2024-06-18 14:36:04 +00:00
|
|
|
getClientProtocolVersion (fromClusterUUID clusteruuid) clientside
|
2024-06-28 17:19:57 +00:00
|
|
|
withclientversion (protoerrhandler noop)
|
2024-06-18 14:36:04 +00:00
|
|
|
where
|
|
|
|
withclientversion (Just (clientmaxversion, othermsg)) = do
|
|
|
|
-- The protocol versions supported by the nodes are not
|
|
|
|
-- known at this point, and would be too expensive to
|
|
|
|
-- determine. Instead, pick the newest protocol version
|
2024-06-25 14:06:28 +00:00
|
|
|
-- that we and the client both speak. The proxy code
|
2024-07-26 17:39:43 +00:00
|
|
|
-- checks protocol versions of remotes, so nodes can
|
|
|
|
-- have different protocol versions.
|
2024-06-18 14:36:04 +00:00
|
|
|
let protocolversion = min maxProtocolVersion clientmaxversion
|
2024-06-27 16:20:22 +00:00
|
|
|
sendClientProtocolVersion clientside othermsg protocolversion
|
2024-06-28 17:19:57 +00:00
|
|
|
(getclientbypass protocolversion) (protoerrhandler noop)
|
2024-06-27 16:20:22 +00:00
|
|
|
withclientversion Nothing = proxydone
|
|
|
|
|
|
|
|
getclientbypass protocolversion othermsg =
|
|
|
|
getClientBypass clientside protocolversion othermsg
|
2024-06-28 17:19:57 +00:00
|
|
|
(withclientbypass protocolversion) (protoerrhandler noop)
|
2024-06-27 16:20:22 +00:00
|
|
|
|
|
|
|
withclientbypass protocolversion (bypassuuids, othermsg) = do
|
2024-07-28 14:36:22 +00:00
|
|
|
(selectnode, closenodes) <-
|
2024-07-28 14:16:35 +00:00
|
|
|
clusterProxySelector clusteruuid
|
|
|
|
protocolversion bypassuuids
|
REMOVE-BEFORE and GETTIMESTAMP proxying
For clusters, the timestamps have to be translated, since each node can
have its own idea about what time it is. To translate a timestamp, the
proxy remembers what time it asked the node for a timestamp in
GETTIMESTAMP, and applies the delta as an offset in REMOVE-BEFORE.
This does mean that a remove from a cluster has to call GETTIMESTAMP on
every node before dropping from nodes. Not very efficient. Although
currently it tries to drop from every single node anyway, which is also
not very efficient.
I thought about caching the GETTIMESTAMP from the nodes on the first
call. That would improve efficiency. But, since monotonic clocks on
!Linux don't advance when the computer is suspended, consider what might
happen if one node was suspended for a while, then came back. Its
monotonic timestamp would end up behind where the proxying expects it to
be. Would that result in removing when it shouldn't, or refusing to
remove when it should? Have not thought it through. Either way, a
cluster behaving strangly for an extended period of time because one
of its nodes was briefly asleep doesn't seem like good behavior.
2024-07-04 19:08:33 +00:00
|
|
|
proxystate <- liftIO mkProxyState
|
2024-07-28 14:36:22 +00:00
|
|
|
concurrencyconfig <- concurrencyConfigJobs
|
2024-07-25 19:18:06 +00:00
|
|
|
let proxyparams = ProxyParams
|
2024-07-28 13:35:09 +00:00
|
|
|
{ proxyMethods = mkProxyMethods
|
2024-07-25 19:18:06 +00:00
|
|
|
, proxyState = proxystate
|
|
|
|
, proxyServerMode = servermode
|
|
|
|
, proxyClientSide = clientside
|
|
|
|
, proxyUUID = fromClusterUUID clusteruuid
|
|
|
|
, proxySelector = selectnode
|
|
|
|
, proxyConcurrencyConfig = concurrencyconfig
|
2024-07-26 17:39:43 +00:00
|
|
|
, proxyClientProtocolVersion = protocolversion
|
2024-07-25 19:18:06 +00:00
|
|
|
}
|
|
|
|
proxy proxydone proxyparams othermsg
|
|
|
|
(protoerrhandler closenodes)
|
2024-06-18 14:36:04 +00:00
|
|
|
|
2024-07-28 14:16:35 +00:00
|
|
|
clusterProxySelector
|
|
|
|
:: ClusterUUID
|
|
|
|
-> ProtocolVersion
|
|
|
|
-> Bypass
|
2024-07-28 14:36:22 +00:00
|
|
|
-> Annex (ProxySelector, Annex ())
|
2024-06-27 16:20:22 +00:00
|
|
|
clusterProxySelector clusteruuid protocolversion (Bypass bypass) = do
|
2024-06-25 15:35:41 +00:00
|
|
|
nodeuuids <- (fromMaybe S.empty . M.lookup clusteruuid . clusterUUIDs)
|
2024-06-18 14:36:04 +00:00
|
|
|
<$> getClusters
|
2024-06-27 16:20:22 +00:00
|
|
|
myclusters <- annexClusters <$> Annex.getGitConfig
|
2024-06-27 18:36:55 +00:00
|
|
|
allremotes <- concat . Remote.byCost <$> remoteList
|
2024-06-27 16:20:22 +00:00
|
|
|
hereu <- getUUID
|
|
|
|
let bypass' = S.insert hereu bypass
|
|
|
|
let clusterremotes = filter (isnode bypass' allremotes nodeuuids myclusters) allremotes
|
|
|
|
fastDebug "Annex.Cluster" $ unwords
|
|
|
|
[ "cluster gateway at", fromUUID hereu
|
|
|
|
, "connecting to", show (map Remote.name clusterremotes)
|
|
|
|
, "bypass", show (S.toList bypass)
|
|
|
|
]
|
2024-06-28 17:19:57 +00:00
|
|
|
nodes <- mapM (proxyRemoteSide protocolversion (Bypass bypass')) clusterremotes
|
|
|
|
let closenodes = mapM_ closeRemoteSide nodes
|
|
|
|
let proxyselector = ProxySelector
|
2024-06-25 15:35:41 +00:00
|
|
|
{ proxyCHECKPRESENT = nodecontaining nodes
|
|
|
|
, proxyGET = nodecontaining nodes
|
2024-06-25 14:32:34 +00:00
|
|
|
-- The key is sent to multiple nodes at the same time,
|
|
|
|
-- skipping nodes where it's known/expected to already be
|
2024-06-25 15:35:41 +00:00
|
|
|
-- present to avoid needing to connect to those, and
|
|
|
|
-- skipping nodes where it's not preferred content.
|
|
|
|
, proxyPUT = \af k -> do
|
2024-06-18 16:07:01 +00:00
|
|
|
locs <- S.fromList <$> loggedLocations k
|
2024-06-27 18:36:55 +00:00
|
|
|
let l = filter (flip S.notMember locs . Remote.uuid . remote) nodes
|
|
|
|
l' <- filterM (\n -> isPreferredContent (Just (Remote.uuid (remote n))) mempty (Just k) af True) l
|
2024-06-25 15:35:41 +00:00
|
|
|
-- PUT to no nodes doesn't work, so fall
|
|
|
|
-- back to all nodes.
|
|
|
|
return $ nonempty [l', l] nodes
|
2024-06-23 13:28:18 +00:00
|
|
|
-- Remove the key from every node that contains it.
|
|
|
|
-- But, since it's possible the location log for some nodes
|
|
|
|
-- could be out of date, actually try to remove from every
|
|
|
|
-- node.
|
2024-06-25 15:35:41 +00:00
|
|
|
, proxyREMOVE = const (pure nodes)
|
REMOVE-BEFORE and GETTIMESTAMP proxying
For clusters, the timestamps have to be translated, since each node can
have its own idea about what time it is. To translate a timestamp, the
proxy remembers what time it asked the node for a timestamp in
GETTIMESTAMP, and applies the delta as an offset in REMOVE-BEFORE.
This does mean that a remove from a cluster has to call GETTIMESTAMP on
every node before dropping from nodes. Not very efficient. Although
currently it tries to drop from every single node anyway, which is also
not very efficient.
I thought about caching the GETTIMESTAMP from the nodes on the first
call. That would improve efficiency. But, since monotonic clocks on
!Linux don't advance when the computer is suspended, consider what might
happen if one node was suspended for a while, then came back. Its
monotonic timestamp would end up behind where the proxying expects it to
be. Would that result in removing when it shouldn't, or refusing to
remove when it should? Have not thought it through. Either way, a
cluster behaving strangly for an extended period of time because one
of its nodes was briefly asleep doesn't seem like good behavior.
2024-07-04 19:08:33 +00:00
|
|
|
, proxyGETTIMESTAMP = pure nodes
|
2024-06-18 15:01:10 +00:00
|
|
|
-- Content is not locked on the cluster as a whole,
|
|
|
|
-- instead it can be locked on individual nodes that are
|
|
|
|
-- proxied to the client.
|
|
|
|
, proxyLOCKCONTENT = const (pure Nothing)
|
2024-06-18 14:36:04 +00:00
|
|
|
}
|
2024-07-28 14:36:22 +00:00
|
|
|
return (proxyselector, closenodes)
|
checkpresent support for clusters
This assumes that the proxy for a cluster has up-to-date location
logs. If it didn't, it might proxy the checkpresent to a node that no
longer has the content, while some other node still does, and so
it would incorrectly appear that the cluster no longer contains the
content.
Since cluster UUIDs are not stored to location logs,
git-annex fsck --fast when claiming to fix a location log when
that occurred would not cause any problems. And presumably the location
tracking would later get sorted out.
At least usually, changes to the content of nodes goes via the proxy,
and it will update its location logs, so they will be accurate. However,
if there were multiple proxies to the same cluster, or nodes were
accessed directly (or via proxy to the node and not the cluster),
the proxy's location log could certainly be wrong.
(The location log access for GET has the same issues.)
2024-06-18 15:10:48 +00:00
|
|
|
where
|
2024-06-18 15:37:38 +00:00
|
|
|
-- Nodes of the cluster have remote.name.annex-cluster-node
|
2024-06-27 16:20:22 +00:00
|
|
|
-- containing its name.
|
|
|
|
--
|
|
|
|
-- Or, a node can be the cluster proxied by another gateway.
|
|
|
|
isnode bypass' rs nodeuuids myclusters r =
|
2024-06-18 15:37:38 +00:00
|
|
|
case remoteAnnexClusterNode (Remote.gitconfig r) of
|
|
|
|
Just names
|
2024-06-27 16:20:22 +00:00
|
|
|
| any (isclustername myclusters) names ->
|
2024-06-25 15:35:41 +00:00
|
|
|
flip S.member nodeuuids $
|
2024-06-18 15:37:38 +00:00
|
|
|
ClusterNodeUUID $ Remote.uuid r
|
|
|
|
| otherwise -> False
|
2024-06-27 16:20:22 +00:00
|
|
|
Nothing -> isclusterviagateway bypass' rs r
|
|
|
|
|
|
|
|
-- Is this remote the same cluster, proxied via another gateway?
|
|
|
|
--
|
|
|
|
-- Must avoid bypassed gateways to prevent cycles.
|
|
|
|
isclusterviagateway bypass' rs r =
|
|
|
|
case mkClusterUUID (Remote.uuid r) of
|
|
|
|
Just cu | cu == clusteruuid ->
|
|
|
|
case remoteAnnexProxiedBy (Remote.gitconfig r) of
|
|
|
|
Just proxyuuid | proxyuuid `S.notMember` bypass' ->
|
2024-06-26 19:27:16 +00:00
|
|
|
not $ null $
|
2024-06-27 16:20:22 +00:00
|
|
|
filter isclustergateway $
|
2024-06-26 19:27:16 +00:00
|
|
|
filter (\p -> Remote.uuid p == proxyuuid) rs
|
2024-06-27 16:20:22 +00:00
|
|
|
_ -> False
|
|
|
|
_ -> False
|
2024-06-18 15:37:38 +00:00
|
|
|
|
2024-06-27 16:20:22 +00:00
|
|
|
isclustergateway r = any (== clusteruuid) $
|
|
|
|
remoteAnnexClusterGateway $ Remote.gitconfig r
|
|
|
|
|
|
|
|
isclustername myclusters name =
|
|
|
|
M.lookup name myclusters == Just clusteruuid
|
2024-06-18 15:37:38 +00:00
|
|
|
|
2024-06-25 15:35:41 +00:00
|
|
|
nodecontaining nodes k = do
|
checkpresent support for clusters
This assumes that the proxy for a cluster has up-to-date location
logs. If it didn't, it might proxy the checkpresent to a node that no
longer has the content, while some other node still does, and so
it would incorrectly appear that the cluster no longer contains the
content.
Since cluster UUIDs are not stored to location logs,
git-annex fsck --fast when claiming to fix a location log when
that occurred would not cause any problems. And presumably the location
tracking would later get sorted out.
At least usually, changes to the content of nodes goes via the proxy,
and it will update its location logs, so they will be accurate. However,
if there were multiple proxies to the same cluster, or nodes were
accessed directly (or via proxy to the node and not the cluster),
the proxy's location log could certainly be wrong.
(The location log access for GET has the same issues.)
2024-06-18 15:10:48 +00:00
|
|
|
locs <- S.fromList <$> loggedLocations k
|
2024-06-27 18:36:55 +00:00
|
|
|
case filter (flip S.member locs . Remote.uuid . remote) nodes of
|
checkpresent support for clusters
This assumes that the proxy for a cluster has up-to-date location
logs. If it didn't, it might proxy the checkpresent to a node that no
longer has the content, while some other node still does, and so
it would incorrectly appear that the cluster no longer contains the
content.
Since cluster UUIDs are not stored to location logs,
git-annex fsck --fast when claiming to fix a location log when
that occurred would not cause any problems. And presumably the location
tracking would later get sorted out.
At least usually, changes to the content of nodes goes via the proxy,
and it will update its location logs, so they will be accurate. However,
if there were multiple proxies to the same cluster, or nodes were
accessed directly (or via proxy to the node and not the cluster),
the proxy's location log could certainly be wrong.
(The location log access for GET has the same issues.)
2024-06-18 15:10:48 +00:00
|
|
|
[] -> return Nothing
|
2024-06-27 18:36:55 +00:00
|
|
|
(node:[]) -> return (Just node)
|
|
|
|
(node:rest) ->
|
|
|
|
-- The list of nodes is ordered by cost.
|
|
|
|
-- Use any of the ones with equally low
|
|
|
|
-- cost.
|
|
|
|
let lowestcost = Remote.cost (remote node)
|
|
|
|
samecost = node : takeWhile (\n -> Remote.cost (remote n) == lowestcost) rest
|
|
|
|
in do
|
2024-07-02 16:27:14 +00:00
|
|
|
n <- liftIO $ getStdRandom $
|
2024-06-27 18:36:55 +00:00
|
|
|
randomR (0, length samecost - 1)
|
|
|
|
return (Just (samecost !! n))
|
checkpresent support for clusters
This assumes that the proxy for a cluster has up-to-date location
logs. If it didn't, it might proxy the checkpresent to a node that no
longer has the content, while some other node still does, and so
it would incorrectly appear that the cluster no longer contains the
content.
Since cluster UUIDs are not stored to location logs,
git-annex fsck --fast when claiming to fix a location log when
that occurred would not cause any problems. And presumably the location
tracking would later get sorted out.
At least usually, changes to the content of nodes goes via the proxy,
and it will update its location logs, so they will be accurate. However,
if there were multiple proxies to the same cluster, or nodes were
accessed directly (or via proxy to the node and not the cluster),
the proxy's location log could certainly be wrong.
(The location log access for GET has the same issues.)
2024-06-18 15:10:48 +00:00
|
|
|
|
2024-06-25 15:35:41 +00:00
|
|
|
nonempty (l:ls) fallback
|
|
|
|
| null l = nonempty ls fallback
|
|
|
|
| otherwise = l
|
|
|
|
nonempty [] fallback = fallback
|