From 8e94b75a6153ff019a879b299dadaa379f572869 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 25 Sep 2024 14:06:41 -0400 Subject: [PATCH] support simulating clusters Without actually simulating cluster implementation at all. Instead, only the essential fact that cluster gateways know what changes they have made to each node of a cluster. That is enough for sims like sizebalanced_cluster. --- Annex/Sim.hs | 66 ++++++++++++++++----- Annex/Sim/File.hs | 4 ++ doc/git-annex-sim.mdwn | 8 ++- doc/sims/sizebalanced_cluster.mdwn | 23 ++++++++ doc/todo/git-annex_proxies.mdwn | 95 ------------------------------ 5 files changed, 84 insertions(+), 112 deletions(-) create mode 100644 doc/sims/sizebalanced_cluster.mdwn diff --git a/Annex/Sim.hs b/Annex/Sim.hs index dfae56949b..f35f1b85c7 100644 --- a/Annex/Sim.hs +++ b/Annex/Sim.hs @@ -60,6 +60,7 @@ data SimState t = SimState { simRepos :: M.Map RepoName UUID , simRepoState :: M.Map UUID (SimRepoState t) , simConnections :: M.Map UUID (S.Set RemoteName) + , simClusterNodes :: M.Map RepoName UUID , simFiles :: M.Map RawFilePath Key , simRng :: Int , simTrustLevels :: M.Map UUID TrustLevel @@ -83,6 +84,7 @@ emptySimState rngseed rootdir = SimState { simRepos = mempty , simRepoState = mempty , simConnections = mempty + , simClusterNodes = mempty , simFiles = mempty , simRng = rngseed , simTrustLevels = mempty @@ -121,15 +123,36 @@ newerLocationState l1@(LocationState vc1 _) l2@(LocationState vc2 _) | vc1 > vc2 = l1 | otherwise = l2 -{- Updates the state of stu to indicate that a key is present or not in u. -} -setPresentKey :: Bool -> UUID -> Key -> UUID -> SimState SimRepo -> SimState SimRepo -setPresentKey present u k stu st = st +{- Updates the state of stu to indicate that a key is present or not in u. + - + - Also, when the reponame is the name of a cluster node, updates + - the state of every other repository that has a connection to that + - same cluster node. + -} +setPresentKey :: Bool -> (UUID, RepoName) -> Key -> UUID -> SimState SimRepo -> SimState SimRepo +setPresentKey present (u, reponame) k stu st = handleclusters $ st { simRepoState = case M.lookup stu (simRepoState st) of Just rst -> M.insert stu (setPresentKey' present (simVectorClock st) u k rst) (simRepoState st) Nothing -> error "no simRepoState in setPresentKey" } + where + handleclusters st' = case M.lookup reponame (simClusterNodes st') of + Just u' | u' == u -> handleclusters' st' $ + filter (/= stu) $ M.keys $ + M.filter (S.member (repoNameToRemoteName reponame)) + (simConnections st') + _ -> st' + handleclusters' st' [] = st' + handleclusters' st' (cu:cus) = + flip handleclusters' cus $ st' + { simRepoState = case M.lookup cu (simRepoState st') of + Just rst -> M.insert cu + (setPresentKey' present (simVectorClock st') u k rst) + (simRepoState st') + Nothing -> simRepoState st' + } setPresentKey' :: Bool -> VectorClock -> UUID -> Key -> SimRepoState t -> SimRepoState t setPresentKey' present vc u k rst = rememberLiveSizeChanges present u k rst $ rst @@ -230,6 +253,7 @@ data SimCommand | CommandRandomGroupWanted Group [PreferredContentExpression] | CommandMaxSize RepoName MaxSize | CommandRebalance Bool + | CommandClusterNode RepoName RepoName | CommandVisit RepoName [String] | CommandComment String | CommandBlank @@ -401,7 +425,7 @@ applySimCommand' (CommandAddTree repo expr) st _ = afile <- AssociatedFile . Just . getTopFilePath <$> inRepo (toTopFilePath f) ifM (checkMatcher matcher (Just k) afile NoLiveUpdate mempty (pure False) (pure False)) - ( let st'' = setPresentKey True u k u $ st' + ( let st'' = setPresentKey True (u, repo) k u $ st' { simFiles = M.insert f k (simFiles st') } in go matcher u st'' fs @@ -426,7 +450,7 @@ applySimCommand' (CommandAdd file sz repos) st _ = where go _k st' [] = Right $ Right st' go k st' (repo:rest) = checkKnownRepo repo st' $ \u -> - let st'' = setPresentKey True u k u $ st' + let st'' = setPresentKey True (u, repo) k u $ st' { simFiles = M.insert file k (simFiles st') } in go k st'' rest @@ -520,6 +544,13 @@ applySimCommand' (CommandMaxSize repo sz) st _ = Right $ Right $ st { simMaxSize = M.insert u sz (simMaxSize st) } +applySimCommand' (CommandClusterNode nodename repo) st _ = + checkKnownRepo repo st $ \u -> + checkNonexistantRepo nodename st $ + Right $ Right $ st + { simClusterNodes = M.insert nodename u + (simClusterNodes st) + } applySimCommand' (CommandRebalance b) st _ = Right $ Right $ st { simRebalance = b @@ -606,7 +637,7 @@ getSimActionComponents -> Either String (Either (SimState SimRepo, [SimState SimRepo -> Annex (SimState SimRepo, Bool)]) (SimState SimRepo)) getSimActionComponents (ActionGetWanted repo remote) st = checkKnownRepoNotSpecialRemote repo st $ \u -> - let go _remoteu f k _r st' = setPresentKey True u k u $ + let go _remoteu f k _r st' = setPresentKey True (u, repo) k u $ addHistory st' $ CommandPresent repo f in overFilesRemote repo u remote S.member S.notMember wanted go st where @@ -619,9 +650,11 @@ getSimActionComponents (ActionSendWanted repo remote) st = go u remoteu f k _r st' = -- Sending to a remote updates the location log -- of both the repository sending and the remote. - setPresentKey True remoteu k remoteu $ - setPresentKey True remoteu k u $ + setpresent remoteu $ + setpresent u $ addHistory st' $ CommandPresent (remoteNameToRepoName remote) f + where + setpresent = setPresentKey True (remoteu, remoteNameToRepoName remote) k getSimActionComponents (ActionDropUnwanted repo Nothing) st = checkKnownRepoNotSpecialRemote repo st $ \u -> simulateDropUnwanted st u repo u @@ -785,8 +818,8 @@ simulateDropUnwanted st u dropfromname dropfrom = SafeDropCheckTime -> (dodrop k f st', True) dodrop k f st' = - setPresentKey False dropfrom k u $ - setPresentKey False dropfrom k dropfrom $ + setPresentKey False (dropfrom, dropfromname) k u $ + setPresentKey False (dropfrom, dropfromname) k dropfrom $ addHistory st' $ CommandNotPresent dropfromname f remotes = S.fromList $ mapMaybe @@ -808,16 +841,21 @@ simulateDropUnwanted st u dropfromname dropfrom = checkNonexistantRepo :: RepoName -> SimState SimRepo -> Either String a -> Either String a checkNonexistantRepo reponame st a = case M.lookup reponame (simRepos st) of - Nothing -> a + Nothing -> case M.lookup reponame (simClusterNodes st) of + Just _ -> Left $ "There is already a cluster node in the simulation named \"" + ++ fromRepoName reponame ++ "\"." + Nothing -> a Just _ -> Left $ "There is already a repository in the simulation named \"" ++ fromRepoName reponame ++ "\"." checkKnownRepo :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a checkKnownRepo reponame st a = case M.lookup reponame (simRepos st) of Just u -> a u - Nothing -> Left $ "No repository in the simulation is named \"" - ++ fromRepoName reponame ++ "\". Choose from: " - ++ unwords (map fromRepoName $ M.keys (simRepos st)) + Nothing -> case M.lookup reponame (simClusterNodes st) of + Just u -> a u + Nothing -> Left $ "No repository in the simulation is named \"" + ++ fromRepoName reponame ++ "\". Choose from: " + ++ unwords (map fromRepoName $ M.keys (simRepos st)) checkKnownRepoNotSpecialRemote :: RepoName -> SimState SimRepo -> (UUID -> Either String a) -> Either String a checkKnownRepoNotSpecialRemote reponame st a = diff --git a/Annex/Sim/File.hs b/Annex/Sim/File.hs index d4bc59c75c..4c3d2652d4 100644 --- a/Annex/Sim/File.hs +++ b/Annex/Sim/File.hs @@ -88,6 +88,8 @@ generateSimFile = unlines . map unwords . go ["maxsize", repo, showsize (fromMaxSize maxsize)] : go rest go (CommandRebalance b : rest) = ["rebalance", if b then "on" else "off"] : go rest + go (CommandClusterNode (RepoName nodename) (RepoName repo) : rest) = + ["clusternode", nodename, repo] : go rest go (CommandVisit (RepoName repo) cmdparams : rest) = (["visit", repo] ++ cmdparams) : go rest go (CommandComment s : rest) = @@ -201,6 +203,8 @@ parseSimCommand ("maxsize":repo:size:[]) = case readSize dataUnits size of Just sz -> Right $ CommandMaxSize (RepoName repo) (MaxSize sz) Nothing -> Left $ "Unable to parse maxsize \"" ++ size ++ "\"" +parseSimCommand ("clusternode":nodename:repo:[]) = + Right $ CommandClusterNode (RepoName nodename) (RepoName repo) parseSimCommand ("rebalance":onoff:[]) = case isTrueFalse onoff of Just b -> Right $ CommandRebalance b Nothing -> Left $ "Unable to parse rebalance value \"" ++ onoff ++ "\"" diff --git a/doc/git-annex-sim.mdwn b/doc/git-annex-sim.mdwn index 20e739aba0..1b77e92c47 100644 --- a/doc/git-annex-sim.mdwn +++ b/doc/git-annex-sim.mdwn @@ -398,6 +398,8 @@ as passed to "git annex sim" while a simulation is running. group node2 cluster wanted node1 sizebalanced=cluster wanted node2 sizebalanced=cluster + maxsize node1 100gb + maxsize node2 100gb connect cluster-node2 <- foo -> cluster-node1 connect cluster-node2 <- bar -> cluster-node1 addmulti 10 foo 1gb 2gb foo @@ -405,9 +407,9 @@ as passed to "git annex sim" while a simulation is running. action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2 In the above example, while foo and bar are both concurrently sending - wanted files to both nodes, each will know immediately which files have - been sent by the other, and so the files will be sizebalanced between - them optimally. + wanted files to both cluster nodes, each will know immediately which + files have been sent by the other, and so the files will be sizebalanced + between them optimally. # OPTIONS diff --git a/doc/sims/sizebalanced_cluster.mdwn b/doc/sims/sizebalanced_cluster.mdwn new file mode 100644 index 0000000000..d649dc2915 --- /dev/null +++ b/doc/sims/sizebalanced_cluster.mdwn @@ -0,0 +1,23 @@ +# Size balanced preferred content sim with multiple repositories sending +# concurrently to the same repositories, in a cluster. +# +# This demonstrates that size balanced preferred content does not get out +# of balance when used with cluster nodes. +init foo +init bar +init node1 +init node2 +clusternode cluster-node1 node1 +clusternode cluster-node2 node2 +group node1 cluster +group node2 cluster +wanted node1 sizebalanced=cluster +wanted node2 sizebalanced=cluster +maxsize node1 100gb +maxsize node2 100gb +connect cluster-node2 <- foo -> cluster-node1 +connect cluster-node2 <- bar -> cluster-node1 +addmulti 10 foo 1gb 2gb foo +addmulti 10 bar 1gb 2gb bar +action foo sendwanted cluster-node1 while action foo sendwanted cluster-node2 while action bar sendwanted cluster-node1 while action bar sendwanted cluster-node2 +visit foo git-annex maxsize diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index fdae563424..d2684fe314 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -30,101 +30,6 @@ Planned schedule of work: * Currently working in [[todo/proving_preferred_content_behavior]] -* sim: Can a cluster using size balanced preferred content be simulated? - May need the sim to get the concept of a cluster gateway, since the - gateway is what picks amoung the nodes on the basis of size. On the other - hand, it may suffice to connect the client repo directly to each node of - the cluster, and let that repo pick which nodes to send to. - - The difference between having a cluster gateway and direct connections to - the nodes is when there are multiple clients. The cluster gateway updates - its location logs to reflect changes in the nodes that get proxied via - it. So it will pick a node that is not full when using size balanced - preferred content. If two clients are accessing a node directly without a - cluster gateway, that doesn't happen. - - So, for a cluster accessed via a single client, direct connections to the - nodes are ok for the sim. But for multiple clients, the sim would need to - support clusters. - - Would it suffice, if a repo is a node in a cluster, for every change to - its location log to be immediately propagated to every other repo in the - sim that has a connection to it? That simulates the centralized view that - the cluster gateway has, without the complication of actually simulating - a cluster gateway. - - That would not allow simulating a cluster node that is - also accessed directly via another repository. But cluster nodes - generally should not be accessed except via the gateway. Still, to allow - simulating that, it would be possible to have a new type of connection, - which is via a gateway. Use eg "-g->" for it. Then to simulate a cluster, - which foo is accessing via a gateway: - - connect node1 <-g- foo -g-> node2 - connect node1 <-g- bar -g-> node2 - - What that would do is, for every change in foo's location log for node1 - or node2, immediately propagate it to bar's location log. - - Or an alternative syntax: - - cluster g node1 node2 - connect g-node1 <- foo -> g-node2 - connect g-node1 <- bar -> g-node2 - - The only thing that does not allow simulating is 2 cluster gateways - that each proxy for some of the same nodes. In that situation, there - are two views of the contents of the nodes, which is similar to two - clients having direct connections to the nodes, but not the same when - there are more than 2 clients connected to the 2 gateways. Simulating - that would require a first-class gateway simulation with its own location - log and node selection. - - Alternative approach: Let a cluster node be initialized, which is an - overlay over a repository which shares all of its configuration - except for its uuid. Every change to the location log of a cluster - node is immediately propigated to every repository that has a connection - to it. It is also propigated to the underlaying repository. This lets - more than one cluster node be initialized for the same repository, for - when it is in multiple clusters or behind multiple gateways in the same - cluster. - - clusternode mycluster-foo foo - clusternode othercluster-foo foo - - Implementation plan for this: - - * clusternode initializes a new cluster node UUID, and adds to - simRepos. - * add `simClusterNodes :: M.Map UUID (UUID, RemoteName)`, - which maps from the cluster node UUID to the UUID of the underlying - repo, and its node name. - * clusternode also adds to simClusterNodes. - * setPresentKey checks if the UUID is in simClusterNodes. - * If it is, it makes the key present/missing in the underlying repo - UUID as well. - * And, it looks through simConnections to find any other repos that - also have a connection to the cluster node with that name. - Each of those repos also gets its simLocations updated. - - But: The cluster node UUID would need to have the same preferred content - etc as the underlying repo. And, it would need to be in the same groups. - And it would be counted as another copy. Could use a cluster UUID to - avoid the numcopies count. But can adding a separate UUID be avoided? - - Implementation plan for this without separate UUID: - - * add `simClusterNodes :: M.Map RepoName UUID`, - * clusternode adds to simClusterNodes. - * checkKnownRemote needs to check simClusterNodes as well as - simRepos so that cluster nodes can be used as remotes. - * Plumb repo name through to setPresentKey. - * setPresentKey checks if repo name is in simClusterNodes. - * If it is, it looks through simConnections to find any other - repos that also have a connection to the cluster node with - that name. Each of those repos also gets its simLocations updated - for the change being logged. - * sim: Add support for metadata, so preferred content that matches on it will work