support simulating clusters

This does not actually simulate the cluster implementation at all. Instead,
it simulates only the essential fact that cluster gateways know what changes
they have made to each node of a cluster. That is enough for sims like
sizebalanced_cluster.
This commit is contained in:
Joey Hess 2024-09-25 14:06:41 -04:00
parent 61c95f4d29
commit 8e94b75a61
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 84 additions and 112 deletions

View file

@ -60,6 +60,7 @@ data SimState t = SimState
{ simRepos :: M.Map RepoName UUID
, simRepoState :: M.Map UUID (SimRepoState t)
, simConnections :: M.Map UUID (S.Set RemoteName)
, simClusterNodes :: M.Map RepoName UUID
, simFiles :: M.Map RawFilePath Key
, simRng :: Int
, simTrustLevels :: M.Map UUID TrustLevel
@ -83,6 +84,7 @@ emptySimState rngseed rootdir = SimState
{ simRepos = mempty
, simRepoState = mempty
, simConnections = mempty
, simClusterNodes = mempty
, simFiles = mempty
, simRng = rngseed
, simTrustLevels = mempty
@ -121,15 +123,36 @@ newerLocationState l1@(LocationState vc1 _) l2@(LocationState vc2 _)
| vc1 > vc2 = l1
| otherwise = l2
{- Updates the state of stu to indicate that a key is present or not in u. -}
setPresentKey :: Bool -> UUID -> Key -> UUID -> SimState SimRepo -> SimState SimRepo
setPresentKey present u k stu st = st
{- Updates the state of stu to indicate that a key is present or not in u.
-
- Also, when the reponame is the name of a cluster node, updates
- the state of every other repository that has a connection to that
- same cluster node.
-
- The pair (u, reponame) is the repository whose content changed,
- together with the name it was referred to by. stu is the repository
- whose recorded state gets updated. Calls error when stu has no entry
- in simRepoState.
-}
setPresentKey :: Bool -> (UUID, RepoName) -> Key -> UUID -> SimState SimRepo -> SimState SimRepo
setPresentKey present (u, reponame) k stu st = handleclusters $ st
{ simRepoState = case M.lookup stu (simRepoState st) of
Just rst -> M.insert stu
(setPresentKey' present (simVectorClock st) u k rst)
(simRepoState st)
Nothing -> error "no simRepoState in setPresentKey"
}
where
-- Only acts when reponame is registered in simClusterNodes and maps
-- to u. In that case, collects every repository other than stu whose
-- simConnections entry contains the remote name derived from
-- reponame, and applies the same change to each of them.
handleclusters st' = case M.lookup reponame (simClusterNodes st') of
Just u' | u' == u -> handleclusters' st' $
filter (/= stu) $ M.keys $
M.filter (S.member (repoNameToRemoteName reponame))
(simConnections st')
_ -> st'
-- Walks the list of connected repositories, threading the state
-- through each update.
handleclusters' st' [] = st'
handleclusters' st' (cu:cus) =
flip handleclusters' cus $ st'
{ simRepoState = case M.lookup cu (simRepoState st') of
Just rst -> M.insert cu
(setPresentKey' present (simVectorClock st') u k rst)
(simRepoState st')
-- Unlike for stu above, a connected repository without
-- any simRepoState is left alone rather than being an error.
Nothing -> simRepoState st'
}
setPresentKey' :: Bool -> VectorClock -> UUID -> Key -> SimRepoState t -> SimRepoState t
setPresentKey' present vc u k rst = rememberLiveSizeChanges present u k rst $ rst
@ -230,6 +253,7 @@ data SimCommand
| CommandRandomGroupWanted Group [PreferredContentExpression]
| CommandMaxSize RepoName MaxSize
| CommandRebalance Bool
| CommandClusterNode RepoName RepoName
| CommandVisit RepoName [String]
| CommandComment String
| CommandBlank
@ -401,7 +425,7 @@ applySimCommand' (CommandAddTree repo expr) st _ =
afile <- AssociatedFile . Just . getTopFilePath
<$> inRepo (toTopFilePath f)
ifM (checkMatcher matcher (Just k) afile NoLiveUpdate mempty (pure False) (pure False))
( let st'' = setPresentKey True u k u $ st'
( let st'' = setPresentKey True (u, repo) k u $ st'
{ simFiles = M.insert f k (simFiles st')
}
in go matcher u st'' fs
@ -426,7 +450,7 @@ applySimCommand' (CommandAdd file sz repos) st _ =
where
go _k st' [] = Right $ Right st'
go k st' (repo:rest) = checkKnownRepo repo st' $ \u ->
let st'' = setPresentKey True u k u $ st'
let st'' = setPresentKey True (u, repo) k u $ st'
{ simFiles = M.insert file k (simFiles st')
}
in go k st'' rest
@ -520,6 +544,13 @@ applySimCommand' (CommandMaxSize repo sz) st _ =
Right $ Right $ st
{ simMaxSize = M.insert u sz (simMaxSize st)
}
{- Registers nodename as a cluster node name for the repository named
- repo. The repo name must pass checkKnownRepo, and nodename must pass
- checkNonexistantRepo. On success, simClusterNodes maps nodename to the
- UUID that checkKnownRepo supplied for repo.
-}
applySimCommand' (CommandClusterNode nodename repo) st _ =
checkKnownRepo repo st $ \u ->
checkNonexistantRepo nodename st $
Right $ Right $ st
{ simClusterNodes = M.insert nodename u
(simClusterNodes st)
}
applySimCommand' (CommandRebalance b) st _ =
Right $ Right $ st
{ simRebalance = b
@ -606,7 +637,7 @@ getSimActionComponents
-> Either String (Either (SimState SimRepo, [SimState SimRepo -> Annex (SimState SimRepo, Bool)]) (SimState SimRepo))
getSimActionComponents (ActionGetWanted repo remote) st =
checkKnownRepoNotSpecialRemote repo st $ \u ->
let go _remoteu f k _r st' = setPresentKey True u k u $
let go _remoteu f k _r st' = setPresentKey True (u, repo) k u $
addHistory st' $ CommandPresent repo f
in overFilesRemote repo u remote S.member S.notMember wanted go st
where
@ -619,9 +650,11 @@ getSimActionComponents (ActionSendWanted repo remote) st =
go u remoteu f k _r st' =
-- Sending to a remote updates the location log
-- of both the repository sending and the remote.
setPresentKey True remoteu k remoteu $
setPresentKey True remoteu k u $
setpresent remoteu $
setpresent u $
addHistory st' $ CommandPresent (remoteNameToRepoName remote) f
where
setpresent = setPresentKey True (remoteu, remoteNameToRepoName remote) k
getSimActionComponents (ActionDropUnwanted repo Nothing) st =
checkKnownRepoNotSpecialRemote repo st $ \u ->
simulateDropUnwanted st u repo u
@ -785,8 +818,8 @@ simulateDropUnwanted st u dropfromname dropfrom =
SafeDropCheckTime -> (dodrop k f st', True)
dodrop k f st' =
setPresentKey False dropfrom k u $
setPresentKey False dropfrom k dropfrom $
setPresentKey False (dropfrom, dropfromname) k u $
setPresentKey False (dropfrom, dropfromname) k dropfrom $
addHistory st' $ CommandNotPresent dropfromname f
remotes = S.fromList $ mapMaybe
@ -808,16 +841,21 @@ simulateDropUnwanted st u dropfromname dropfrom =
{- Returns a only when reponame is not already in use in the simulation.
- A name found in simRepos yields a Left saying a repository with that
- name exists; a name found in simClusterNodes yields a Left saying a
- cluster node with that name exists.
-
- NOTE(review): two "Nothing ->" alternatives appear for the outer case
- here. The description above follows the second one, which goes on to
- consult simClusterNodes.
-}
checkNonexistantRepo :: RepoName -> SimState SimRepo -> Either String a -> Either String a
checkNonexistantRepo reponame st a = case M.lookup reponame (simRepos st) of
Nothing -> a
Nothing -> case M.lookup reponame (simClusterNodes st) of
Just _ -> Left $ "There is already a cluster node in the simulation named \""
++ fromRepoName reponame ++ "\"."
Nothing -> a
Just _ -> Left $ "There is already a repository in the simulation named \""
++ fromRepoName reponame ++ "\"."
{- Resolves reponame to a UUID and passes it to a. The name is looked up
- in simRepos first, and then in simClusterNodes, so a cluster node name
- resolves to the UUID stored for it there. When neither map contains
- the name, returns a Left with an error message.
-
- NOTE(review): the "Choose from:" list in the error message is built
- from the keys of simRepos only, so cluster node names are not listed
- even though they are accepted.
-
- NOTE(review): two "Nothing ->" alternatives appear for the outer case
- here. The description above follows the second one, which goes on to
- consult simClusterNodes.
-}
checkKnownRepo :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a
checkKnownRepo reponame st a = case M.lookup reponame (simRepos st) of
Just u -> a u
Nothing -> Left $ "No repository in the simulation is named \""
++ fromRepoName reponame ++ "\". Choose from: "
++ unwords (map fromRepoName $ M.keys (simRepos st))
Nothing -> case M.lookup reponame (simClusterNodes st) of
Just u -> a u
Nothing -> Left $ "No repository in the simulation is named \""
++ fromRepoName reponame ++ "\". Choose from: "
++ unwords (map fromRepoName $ M.keys (simRepos st))
checkKnownRepoNotSpecialRemote :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a
checkKnownRepoNotSpecialRemote reponame st a =

View file

@ -88,6 +88,8 @@ generateSimFile = unlines . map unwords . go
["maxsize", repo, showsize (fromMaxSize maxsize)] : go rest
go (CommandRebalance b : rest) =
["rebalance", if b then "on" else "off"] : go rest
go (CommandClusterNode (RepoName nodename) (RepoName repo) : rest) =
["clusternode", nodename, repo] : go rest
go (CommandVisit (RepoName repo) cmdparams : rest) =
(["visit", repo] ++ cmdparams) : go rest
go (CommandComment s : rest) =
@ -201,6 +203,8 @@ parseSimCommand ("maxsize":repo:size:[]) =
case readSize dataUnits size of
Just sz -> Right $ CommandMaxSize (RepoName repo) (MaxSize sz)
Nothing -> Left $ "Unable to parse maxsize \"" ++ size ++ "\""
-- "clusternode" takes exactly two words: the new node name, followed by
-- the name of the repository it stands for. Both are wrapped as
-- RepoName without further checking here.
parseSimCommand ("clusternode":nodename:repo:[]) =
Right $ CommandClusterNode (RepoName nodename) (RepoName repo)
-- "rebalance" takes exactly one word, which is interpreted by
-- isTrueFalse; a word it does not accept is reported as a parse error.
parseSimCommand ("rebalance":onoff:[]) = case isTrueFalse onoff of
Just b -> Right $ CommandRebalance b
Nothing -> Left $ "Unable to parse rebalance value \"" ++ onoff ++ "\""

View file

@ -398,6 +398,8 @@ as passed to "git annex sim" while a simulation is running.
group node2 cluster
wanted node1 sizebalanced=cluster
wanted node2 sizebalanced=cluster
maxsize node1 100gb
maxsize node2 100gb
connect cluster-node2 <- foo -> cluster-node1
connect cluster-node2 <- bar -> cluster-node1
addmulti 10 foo 1gb 2gb foo
@ -405,9 +407,9 @@ as passed to "git annex sim" while a simulation is running.
action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2
In the above example, while foo and bar are both concurrently sending
wanted files to both nodes, each will know immediately which files have
been sent by the other, and so the files will be sizebalanced between
them optimally.
wanted files to both cluster nodes, each will know immediately which
files have been sent by the other, and so the files will be sizebalanced
between them optimally.
# OPTIONS

View file

@ -0,0 +1,23 @@
# Size balanced preferred content sim with multiple repositories sending
# concurrently to the same repositories, in a cluster.
#
# This demonstrates that size balanced preferred content does not get out
# of balance when used with cluster nodes.
init foo
init bar
init node1
init node2
clusternode cluster-node1 node1
clusternode cluster-node2 node2
group node1 cluster
group node2 cluster
wanted node1 sizebalanced=cluster
wanted node2 sizebalanced=cluster
maxsize node1 100gb
maxsize node2 100gb
connect cluster-node2 <- foo -> cluster-node1
connect cluster-node2 <- bar -> cluster-node1
addmulti 10 foo 1gb 2gb foo
addmulti 10 bar 1gb 2gb bar
action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2
visit foo git-annex maxsize

View file

@ -30,101 +30,6 @@ Planned schedule of work:
* Currently working in [[todo/proving_preferred_content_behavior]]
* sim: Can a cluster using size balanced preferred content be simulated?
May need the sim to get the concept of a cluster gateway, since the
gateway is what picks among the nodes on the basis of size. On the other
hand, it may suffice to connect the client repo directly to each node of
the cluster, and let that repo pick which nodes to send to.
The difference between having a cluster gateway and direct connections to
the nodes is when there are multiple clients. The cluster gateway updates
its location logs to reflect changes in the nodes that get proxied via
it. So it will pick a node that is not full when using size balanced
preferred content. If two clients are accessing a node directly without a
cluster gateway, that doesn't happen.
So, for a cluster accessed via a single client, direct connections to the
nodes are ok for the sim. But for multiple clients, the sim would need to
support clusters.
Would it suffice, if a repo is a node in a cluster, for every change to
its location log to be immediately propagated to every other repo in the
sim that has a connection to it? That simulates the centralized view that
the cluster gateway has, without the complication of actually simulating
a cluster gateway.
That would not allow simulating a cluster node that is
also accessed directly via another repository. But cluster nodes
generally should not be accessed except via the gateway. Still, to allow
simulating that, it would be possible to have a new type of connection,
which is via a gateway. Use eg "-g->" for it. Then to simulate a cluster,
which foo is accessing via a gateway:
connect node1 <-g- foo -g-> node2
connect node1 <-g- bar -g-> node2
What that would do is, for every change in foo's location log for node1
or node2, immediately propagate it to bar's location log.
Or an alternative syntax:
cluster g node1 node2
connect g-node1 <- foo -> g-node2
connect g-node1 <- bar -> g-node2
The only thing that this does not allow simulating is 2 cluster gateways
that each proxy for some of the same nodes. In that situation, there
are two views of the contents of the nodes, which is similar to two
clients having direct connections to the nodes, but not the same when
there are more than 2 clients connected to the 2 gateways. Simulating
that would require a first-class gateway simulation with its own location
log and node selection.
Alternative approach: Let a cluster node be initialized, which is an
overlay over a repository which shares all of its configuration
except for its uuid. Every change to the location log of a cluster
node is immediately propagated to every repository that has a connection
to it. It is also propagated to the underlying repository. This lets
more than one cluster node be initialized for the same repository, for
when it is in multiple clusters or behind multiple gateways in the same
cluster.
clusternode mycluster-foo foo
clusternode othercluster-foo foo
Implementation plan for this:
* clusternode initializes a new cluster node UUID, and adds to
simRepos.
* add `simClusterNodes :: M.Map UUID (UUID, RemoteName)`,
which maps from the cluster node UUID to the UUID of the underlying
repo, and its node name.
* clusternode also adds to simClusterNodes.
* setPresentKey checks if the UUID is in simClusterNodes.
* If it is, it makes the key present/missing in the underlying repo
UUID as well.
* And, it looks through simConnections to find any other repos that
also have a connection to the cluster node with that name.
Each of those repos also gets its simLocations updated.
But: The cluster node UUID would need to have the same preferred content
etc as the underlying repo. And, it would need to be in the same groups.
And it would be counted as another copy. Could use a cluster UUID to
avoid the numcopies count. But can adding a separate UUID be avoided?
Implementation plan for this without separate UUID:
* add `simClusterNodes :: M.Map RepoName UUID`,
* clusternode adds to simClusterNodes.
* checkKnownRemote needs to check simClusterNodes as well as
simRepos so that cluster nodes can be used as remotes.
* Plumb repo name through to setPresentKey.
* setPresentKey checks if repo name is in simClusterNodes.
* If it is, it looks through simConnections to find any other
repos that also have a connection to the cluster node with
that name. Each of those repos also gets its simLocations updated
for the change being logged.
* sim: Add support for metadata, so preferred content that matches on it
will work