support simulating clusters
This does not actually simulate the cluster implementation at all. Instead, it simulates only the essential fact that cluster gateways know what changes they have made to each node of a cluster. That is enough for sims like sizebalanced_cluster.
This commit is contained in:
parent
61c95f4d29
commit
8e94b75a61
5 changed files with 84 additions and 112 deletions
66
Annex/Sim.hs
66
Annex/Sim.hs
|
@ -60,6 +60,7 @@ data SimState t = SimState
|
|||
{ simRepos :: M.Map RepoName UUID
|
||||
, simRepoState :: M.Map UUID (SimRepoState t)
|
||||
, simConnections :: M.Map UUID (S.Set RemoteName)
|
||||
, simClusterNodes :: M.Map RepoName UUID
|
||||
, simFiles :: M.Map RawFilePath Key
|
||||
, simRng :: Int
|
||||
, simTrustLevels :: M.Map UUID TrustLevel
|
||||
|
@ -83,6 +84,7 @@ emptySimState rngseed rootdir = SimState
|
|||
{ simRepos = mempty
|
||||
, simRepoState = mempty
|
||||
, simConnections = mempty
|
||||
, simClusterNodes = mempty
|
||||
, simFiles = mempty
|
||||
, simRng = rngseed
|
||||
, simTrustLevels = mempty
|
||||
|
@ -121,15 +123,36 @@ newerLocationState l1@(LocationState vc1 _) l2@(LocationState vc2 _)
|
|||
| vc1 > vc2 = l1
|
||||
| otherwise = l2
|
||||
|
||||
{- Updates the state of stu to indicate that a key is present or not in u. -}
|
||||
setPresentKey :: Bool -> UUID -> Key -> UUID -> SimState SimRepo -> SimState SimRepo
|
||||
setPresentKey present u k stu st = st
|
||||
{- Updates the state of stu to indicate that a key is present or not in u.
|
||||
-
|
||||
- Also, when the reponame is the name of a cluster node, updates
|
||||
- the state of every other repository that has a connection to that
|
||||
- same cluster node.
|
||||
-}
|
||||
setPresentKey :: Bool -> (UUID, RepoName) -> Key -> UUID -> SimState SimRepo -> SimState SimRepo
|
||||
setPresentKey present (u, reponame) k stu st = handleclusters $ st
|
||||
{ simRepoState = case M.lookup stu (simRepoState st) of
|
||||
Just rst -> M.insert stu
|
||||
(setPresentKey' present (simVectorClock st) u k rst)
|
||||
(simRepoState st)
|
||||
Nothing -> error "no simRepoState in setPresentKey"
|
||||
}
|
||||
where
|
||||
handleclusters st' = case M.lookup reponame (simClusterNodes st') of
|
||||
Just u' | u' == u -> handleclusters' st' $
|
||||
filter (/= stu) $ M.keys $
|
||||
M.filter (S.member (repoNameToRemoteName reponame))
|
||||
(simConnections st')
|
||||
_ -> st'
|
||||
handleclusters' st' [] = st'
|
||||
handleclusters' st' (cu:cus) =
|
||||
flip handleclusters' cus $ st'
|
||||
{ simRepoState = case M.lookup cu (simRepoState st') of
|
||||
Just rst -> M.insert cu
|
||||
(setPresentKey' present (simVectorClock st') u k rst)
|
||||
(simRepoState st')
|
||||
Nothing -> simRepoState st'
|
||||
}
|
||||
|
||||
setPresentKey' :: Bool -> VectorClock -> UUID -> Key -> SimRepoState t -> SimRepoState t
|
||||
setPresentKey' present vc u k rst = rememberLiveSizeChanges present u k rst $ rst
|
||||
|
@ -230,6 +253,7 @@ data SimCommand
|
|||
| CommandRandomGroupWanted Group [PreferredContentExpression]
|
||||
| CommandMaxSize RepoName MaxSize
|
||||
| CommandRebalance Bool
|
||||
| CommandClusterNode RepoName RepoName
|
||||
| CommandVisit RepoName [String]
|
||||
| CommandComment String
|
||||
| CommandBlank
|
||||
|
@ -401,7 +425,7 @@ applySimCommand' (CommandAddTree repo expr) st _ =
|
|||
afile <- AssociatedFile . Just . getTopFilePath
|
||||
<$> inRepo (toTopFilePath f)
|
||||
ifM (checkMatcher matcher (Just k) afile NoLiveUpdate mempty (pure False) (pure False))
|
||||
( let st'' = setPresentKey True u k u $ st'
|
||||
( let st'' = setPresentKey True (u, repo) k u $ st'
|
||||
{ simFiles = M.insert f k (simFiles st')
|
||||
}
|
||||
in go matcher u st'' fs
|
||||
|
@ -426,7 +450,7 @@ applySimCommand' (CommandAdd file sz repos) st _ =
|
|||
where
|
||||
go _k st' [] = Right $ Right st'
|
||||
go k st' (repo:rest) = checkKnownRepo repo st' $ \u ->
|
||||
let st'' = setPresentKey True u k u $ st'
|
||||
let st'' = setPresentKey True (u, repo) k u $ st'
|
||||
{ simFiles = M.insert file k (simFiles st')
|
||||
}
|
||||
in go k st'' rest
|
||||
|
@ -520,6 +544,13 @@ applySimCommand' (CommandMaxSize repo sz) st _ =
|
|||
Right $ Right $ st
|
||||
{ simMaxSize = M.insert u sz (simMaxSize st)
|
||||
}
|
||||
applySimCommand' (CommandClusterNode nodename repo) st _ =
|
||||
checkKnownRepo repo st $ \u ->
|
||||
checkNonexistantRepo nodename st $
|
||||
Right $ Right $ st
|
||||
{ simClusterNodes = M.insert nodename u
|
||||
(simClusterNodes st)
|
||||
}
|
||||
applySimCommand' (CommandRebalance b) st _ =
|
||||
Right $ Right $ st
|
||||
{ simRebalance = b
|
||||
|
@ -606,7 +637,7 @@ getSimActionComponents
|
|||
-> Either String (Either (SimState SimRepo, [SimState SimRepo -> Annex (SimState SimRepo, Bool)]) (SimState SimRepo))
|
||||
getSimActionComponents (ActionGetWanted repo remote) st =
|
||||
checkKnownRepoNotSpecialRemote repo st $ \u ->
|
||||
let go _remoteu f k _r st' = setPresentKey True u k u $
|
||||
let go _remoteu f k _r st' = setPresentKey True (u, repo) k u $
|
||||
addHistory st' $ CommandPresent repo f
|
||||
in overFilesRemote repo u remote S.member S.notMember wanted go st
|
||||
where
|
||||
|
@ -619,9 +650,11 @@ getSimActionComponents (ActionSendWanted repo remote) st =
|
|||
go u remoteu f k _r st' =
|
||||
-- Sending to a remote updates the location log
|
||||
-- of both the repository sending and the remote.
|
||||
setPresentKey True remoteu k remoteu $
|
||||
setPresentKey True remoteu k u $
|
||||
setpresent remoteu $
|
||||
setpresent u $
|
||||
addHistory st' $ CommandPresent (remoteNameToRepoName remote) f
|
||||
where
|
||||
setpresent = setPresentKey True (remoteu, remoteNameToRepoName remote) k
|
||||
getSimActionComponents (ActionDropUnwanted repo Nothing) st =
|
||||
checkKnownRepoNotSpecialRemote repo st $ \u ->
|
||||
simulateDropUnwanted st u repo u
|
||||
|
@ -785,8 +818,8 @@ simulateDropUnwanted st u dropfromname dropfrom =
|
|||
SafeDropCheckTime -> (dodrop k f st', True)
|
||||
|
||||
dodrop k f st' =
|
||||
setPresentKey False dropfrom k u $
|
||||
setPresentKey False dropfrom k dropfrom $
|
||||
setPresentKey False (dropfrom, dropfromname) k u $
|
||||
setPresentKey False (dropfrom, dropfromname) k dropfrom $
|
||||
addHistory st' $ CommandNotPresent dropfromname f
|
||||
|
||||
remotes = S.fromList $ mapMaybe
|
||||
|
@ -808,16 +841,21 @@ simulateDropUnwanted st u dropfromname dropfrom =
|
|||
|
||||
checkNonexistantRepo :: RepoName -> SimState SimRepo -> Either String a -> Either String a
|
||||
checkNonexistantRepo reponame st a = case M.lookup reponame (simRepos st) of
|
||||
Nothing -> a
|
||||
Nothing -> case M.lookup reponame (simClusterNodes st) of
|
||||
Just _ -> Left $ "There is already a cluster node in the simulation named \""
|
||||
++ fromRepoName reponame ++ "\"."
|
||||
Nothing -> a
|
||||
Just _ -> Left $ "There is already a repository in the simulation named \""
|
||||
++ fromRepoName reponame ++ "\"."
|
||||
|
||||
checkKnownRepo :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a
|
||||
checkKnownRepo reponame st a = case M.lookup reponame (simRepos st) of
|
||||
Just u -> a u
|
||||
Nothing -> Left $ "No repository in the simulation is named \""
|
||||
++ fromRepoName reponame ++ "\". Choose from: "
|
||||
++ unwords (map fromRepoName $ M.keys (simRepos st))
|
||||
Nothing -> case M.lookup reponame (simClusterNodes st) of
|
||||
Just u -> a u
|
||||
Nothing -> Left $ "No repository in the simulation is named \""
|
||||
++ fromRepoName reponame ++ "\". Choose from: "
|
||||
++ unwords (map fromRepoName $ M.keys (simRepos st))
|
||||
|
||||
checkKnownRepoNotSpecialRemote :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a
|
||||
checkKnownRepoNotSpecialRemote reponame st a =
|
||||
|
|
|
@ -88,6 +88,8 @@ generateSimFile = unlines . map unwords . go
|
|||
["maxsize", repo, showsize (fromMaxSize maxsize)] : go rest
|
||||
go (CommandRebalance b : rest) =
|
||||
["rebalance", if b then "on" else "off"] : go rest
|
||||
go (CommandClusterNode (RepoName nodename) (RepoName repo) : rest) =
|
||||
["clusternode", nodename, repo] : go rest
|
||||
go (CommandVisit (RepoName repo) cmdparams : rest) =
|
||||
(["visit", repo] ++ cmdparams) : go rest
|
||||
go (CommandComment s : rest) =
|
||||
|
@ -201,6 +203,8 @@ parseSimCommand ("maxsize":repo:size:[]) =
|
|||
case readSize dataUnits size of
|
||||
Just sz -> Right $ CommandMaxSize (RepoName repo) (MaxSize sz)
|
||||
Nothing -> Left $ "Unable to parse maxsize \"" ++ size ++ "\""
|
||||
parseSimCommand ("clusternode":nodename:repo:[]) =
|
||||
Right $ CommandClusterNode (RepoName nodename) (RepoName repo)
|
||||
parseSimCommand ("rebalance":onoff:[]) = case isTrueFalse onoff of
|
||||
Just b -> Right $ CommandRebalance b
|
||||
Nothing -> Left $ "Unable to parse rebalance value \"" ++ onoff ++ "\""
|
||||
|
|
|
@ -398,6 +398,8 @@ as passed to "git annex sim" while a simulation is running.
|
|||
group node2 cluster
|
||||
wanted node1 sizebalanced=cluster
|
||||
wanted node2 sizebalanced=cluster
|
||||
maxsize node1 100gb
|
||||
maxsize node2 100gb
|
||||
connect cluster-node2 <- foo -> cluster-node1
|
||||
connect cluster-node2 <- bar -> cluster-node1
|
||||
addmulti 10 foo 1gb 2gb foo
|
||||
|
@ -405,9 +407,9 @@ as passed to "git annex sim" while a simulation is running.
|
|||
action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2
|
||||
|
||||
In the above example, while foo and bar are both concurrently sending
|
||||
wanted files to both nodes, each will know immediately which files have
|
||||
been sent by the other, and so the files will be sizebalanced between
|
||||
them optimally.
|
||||
wanted files to both cluster nodes, each will know immediately which
|
||||
files have been sent by the other, and so the files will be sizebalanced
|
||||
between them optimally.
|
||||
|
||||
# OPTIONS
|
||||
|
||||
|
|
23
doc/sims/sizebalanced_cluster.mdwn
Normal file
23
doc/sims/sizebalanced_cluster.mdwn
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Size balanced preferred content sim with multiple repositories sending
|
||||
# concurrently to the same repositories, in a cluster.
|
||||
#
|
||||
# This demonstrates that size balanced preferred content does not get out
|
||||
# of balance when used with cluster nodes.
|
||||
init foo
|
||||
init bar
|
||||
init node1
|
||||
init node2
|
||||
clusternode cluster-node1 node1
|
||||
clusternode cluster-node2 node2
|
||||
group node1 cluster
|
||||
group node2 cluster
|
||||
wanted node1 sizebalanced=cluster
|
||||
wanted node2 sizebalanced=cluster
|
||||
maxsize node1 100gb
|
||||
maxsize node2 100gb
|
||||
connect cluster-node2 <- foo -> cluster-node1
|
||||
connect cluster-node2 <- bar -> cluster-node1
|
||||
addmulti 10 foo 1gb 2gb foo
|
||||
addmulti 10 bar 1gb 2gb bar
|
||||
action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2
|
||||
visit foo git-annex maxsize
|
|
@ -30,101 +30,6 @@ Planned schedule of work:
|
|||
|
||||
* Currently working in [[todo/proving_preferred_content_behavior]]
|
||||
|
||||
* sim: Can a cluster using size balanced preferred content be simulated?
|
||||
May need the sim to get the concept of a cluster gateway, since the
|
||||
gateway is what picks among the nodes on the basis of size. On the other
|
||||
hand, it may suffice to connect the client repo directly to each node of
|
||||
the cluster, and let that repo pick which nodes to send to.
|
||||
|
||||
The difference between having a cluster gateway and direct connections to
|
||||
the nodes is when there are multiple clients. The cluster gateway updates
|
||||
its location logs to reflect changes in the nodes that get proxied via
|
||||
it. So it will pick a node that is not full when using size balanced
|
||||
preferred content. If two clients are accessing a node directly without a
|
||||
cluster gateway, that doesn't happen.
|
||||
|
||||
So, for a cluster accessed via a single client, direct connections to the
|
||||
nodes are ok for the sim. But for multiple clients, the sim would need to
|
||||
support clusters.
|
||||
|
||||
Would it suffice, if a repo is a node in a cluster, for every change to
|
||||
its location log to be immediately propagated to every other repo in the
|
||||
sim that has a connection to it? That simulates the centralized view that
|
||||
the cluster gateway has, without the complication of actually simulating
|
||||
a cluster gateway.
|
||||
|
||||
That would not allow simulating a cluster node that is
|
||||
also accessed directly via another repository. But cluster nodes
|
||||
generally should not be accessed except via the gateway. Still, to allow
|
||||
simulating that, it would be possible to have a new type of connection,
|
||||
which is via a gateway. Use eg "-g->" for it. Then to simulate a cluster,
|
||||
which foo is accessing via a gateway:
|
||||
|
||||
connect node1 <-g- foo -g-> node2
|
||||
connect node1 <-g- bar -g-> node2
|
||||
|
||||
What that would do is, for every change in foo's location log for node1
|
||||
or node2, immediately propagate it to bar's location log.
|
||||
|
||||
Or an alternative syntax:
|
||||
|
||||
cluster g node1 node2
|
||||
connect g-node1 <- foo -> g-node2
|
||||
connect g-node1 <- bar -> g-node2
|
||||
|
||||
The only thing this does not allow simulating is 2 cluster gateways
|
||||
that each proxy for some of the same nodes. In that situation, there
|
||||
are two views of the contents of the nodes, which is similar to two
|
||||
clients having direct connections to the nodes, but not the same when
|
||||
there are more than 2 clients connected to the 2 gateways. Simulating
|
||||
that would require a first-class gateway simulation with its own location
|
||||
log and node selection.
|
||||
|
||||
Alternative approach: Let a cluster node be initialized, which is an
|
||||
overlay over a repository which shares all of its configuration
|
||||
except for its uuid. Every change to the location log of a cluster
|
||||
node is immediately propagated to every repository that has a connection
|
||||
to it. It is also propagated to the underlying repository. This lets
|
||||
more than one cluster node be initialized for the same repository, for
|
||||
when it is in multiple clusters or behind multiple gateways in the same
|
||||
cluster.
|
||||
|
||||
clusternode mycluster-foo foo
|
||||
clusternode othercluster-foo foo
|
||||
|
||||
Implementation plan for this:
|
||||
|
||||
* clusternode initializes a new cluster node UUID, and adds to
|
||||
simRepos.
|
||||
* add `simClusterNodes :: M.Map UUID (UUID, RemoteName)`,
|
||||
which maps from the cluster node UUID to the UUID of the underlying
|
||||
repo, and its node name.
|
||||
* clusternode also adds to simClusterNodes.
|
||||
* setPresentKey checks if the UUID is in simClusterNodes.
|
||||
* If it is, it makes the key present/missing in the underlying repo
|
||||
UUID as well.
|
||||
* And, it looks through simConnections to find any other repos that
|
||||
also have a connection to the cluster node with that name.
|
||||
Each of those repos also gets its simLocations updated.
|
||||
|
||||
But: The cluster node UUID would need to have the same preferred content
|
||||
etc as the underlying repo. And, it would need to be in the same groups.
|
||||
And it would be counted as another copy. Could use a cluster UUID to
|
||||
avoid the numcopies count. But can adding a separate UUID be avoided?
|
||||
|
||||
Implementation plan for this without separate UUID:
|
||||
|
||||
* add `simClusterNodes :: M.Map RepoName UUID`,
|
||||
* clusternode adds to simClusterNodes.
|
||||
* checkKnownRemote needs to check simClusterNodes as well as
|
||||
simRepos so that cluster nodes can be used as remotes.
|
||||
* Plumb repo name through to setPresentKey.
|
||||
* setPresentKey checks if repo name is in simClusterNodes.
|
||||
* If it is, it looks through simConnections to find any other
|
||||
repos that also have a connection to the cluster node with
|
||||
that name. Each of those repos also gets its simLocations updated
|
||||
for the change being logged.
|
||||
|
||||
* sim: Add support for metadata, so preferred content that matches on it
|
||||
will work
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue