From 202ea3ff2a066ad4960929b3f09b12705f69d1d4 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 25 Jun 2024 10:06:28 -0400 Subject: [PATCH] don't sync with cluster nodes by default Avoid `git-annex sync --content` etc from operating on cluster nodes by default since syncing with a cluster implicitly syncs with its nodes. This avoids a lot of unncessary work when a cluster has a lot of nodes just in checking if each node's preferred content is satisfied. And it avoids content being sent to nodes individually, so instead syncing with clusters always fanout uploads to nodes. The downside is that there are situations where a cluster's preferred content settings can be met, but those of its nodes are not. Or where a node does not contain a key, but the cluster does, and there are not enough copies of the key yet, so it would be desirable the send it there. I think that's an acceptable tradeoff. These kind of situations are ones where the cluster itself should probably be responsible for copying content to the node. Which it can do much less expensively than a client can. Part of the balanced preferred content design that I will be working on in a couple of months involves rebalancing clusters, so I expect to revisit this. The use of annex-sync config does allow running git-annex sync with a specific node, or nodes, and it will sync with it. And it's also possible to set annex-sync git configs to make it sync with a node by default. (Although that will require setting up an explicit git remote for the node rather than relying on the proxied remote.) Logs.Cluster.Basic is needed because Remote.Git cannot import Logs.Cluster due to a cycle. And the Annex.Startup load of clusters happens too late for Remote.Git to use that. This does mean one redundant load of the cluster log, though only when there is a proxy. --- Annex/Cluster.hs | 4 +- Logs/Cluster.hs | 79 +++---------------- Logs/Cluster/Basic.hs | 91 ++++++++++++++++++++++ Remote/Git.hs | 37 ++++++--- Types/GitConfig.hs | 4 +- doc/design/balanced_preferred_content.mdwn | 13 +++- doc/todo/git-annex_proxies.mdwn | 16 ++-- git-annex.cabal | 1 + 8 files changed, 152 insertions(+), 93 deletions(-) create mode 100644 Logs/Cluster/Basic.hs diff --git a/Annex/Cluster.hs b/Annex/Cluster.hs index cfb580dea5..10125e9c5f 100644 --- a/Annex/Cluster.hs +++ b/Annex/Cluster.hs @@ -47,7 +47,9 @@ proxyCluster clusteruuid proxydone servermode clientside protoerrhandler = do -- The protocol versions supported by the nodes are not -- known at this point, and would be too expensive to -- determine. Instead, pick the newest protocol version - -- that we and the client both speak. + -- that we and the client both speak. The proxy code + -- checks protocol versions when operating on multiple + -- nodes. let protocolversion = min maxProtocolVersion clientmaxversion selectnode <- clusterProxySelector clusteruuid protocolversion proxy proxydone proxymethods servermode clientside diff --git a/Logs/Cluster.hs b/Logs/Cluster.hs index 584dcb6867..8ee0dc975a 100644 --- a/Logs/Cluster.hs +++ b/Logs/Cluster.hs @@ -16,83 +16,26 @@ module Logs.Cluster ( import qualified Annex import Annex.Common -import qualified Annex.Branch import Types.Cluster -import Logs -import Logs.UUIDBased -import Logs.MapLog +import Logs.Cluster.Basic import Logs.Trust -import qualified Data.Set as S import qualified Data.Map as M -import Data.ByteString.Builder -import qualified Data.Attoparsec.ByteString as A -import qualified Data.Attoparsec.ByteString.Char8 as A8 -import qualified Data.ByteString.Lazy as L +import qualified Data.Set as S getClusters :: Annex Clusters getClusters = maybe loadClusters return =<< Annex.getState Annex.clusters -{- Loads the clusters and caches it for later. -} +{- Loads the clusters and caches it for later. + - + - This takes care of removing dead nodes from clusters, + - to avoid inserting the cluster uuid into the location + - log when only dead nodes contain the content of a key. + -} loadClusters :: Annex Clusters loadClusters = do - m <- convclusteruuids . M.map value . fromMapLog . parseClusterLog - <$> Annex.Branch.get clusterLog - m' <- removedeadnodes m - let clusters = Clusters - { clusterUUIDs = m' - , clusterNodeUUIDs = M.foldlWithKey inverter mempty m' - } + dead <- (S.fromList . map ClusterNodeUUID) + <$> trustGet DeadTrusted + clusters <- getClustersWith (M.map (`S.difference` dead)) Annex.changeState $ \s -> s { Annex.clusters = Just clusters } return clusters - where - convclusteruuids :: M.Map UUID (S.Set ClusterNodeUUID) -> M.Map ClusterUUID (S.Set ClusterNodeUUID) - convclusteruuids = M.fromList - . mapMaybe (\(mk, v) -> (, v) <$> mk) - . M.toList . M.mapKeys mkClusterUUID - inverter m k v = M.unionWith (<>) m - (M.fromList (map (, S.singleton k) (S.toList v))) - - -- Dead nodes are removed from clusters to avoid inserting the - -- cluster uuid into the location log when only dead nodes contain - -- the content of a key. - removedeadnodes m = do - dead <- (S.fromList . map ClusterNodeUUID) - <$> trustGet DeadTrusted - return $ M.map (`S.difference` dead) m - -recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex () -recordCluster clusteruuid nodeuuids = do - -- If a private UUID has been configured as a cluster node, - -- avoid leaking it into the git-annex log. - privateuuids <- annexPrivateRepos <$> Annex.getGitConfig - let nodeuuids' = S.filter - (\(ClusterNodeUUID n) -> S.notMember n privateuuids) - nodeuuids - - c <- currentVectorClock - Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $ - (buildLogNew buildClusterNodeList) - . changeLog c (fromClusterUUID clusteruuid) nodeuuids' - . parseClusterLog - -buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder -buildClusterNodeList = assemble - . map (buildUUID . fromClusterNodeUUID) - . S.toList - where - assemble [] = mempty - assemble (x:[]) = x - assemble (x:y:l) = x <> " " <> assemble (y:l) - -parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID) -parseClusterLog = parseLogNew parseClusterNodeList - -parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID) -parseClusterNodeList = S.fromList <$> many parseword - where - parseword = parsenode - <* ((const () <$> A8.char ' ') <|> A.endOfInput) - parsenode = ClusterNodeUUID - <$> (toUUID <$> A8.takeWhile1 (/= ' ')) - diff --git a/Logs/Cluster/Basic.hs b/Logs/Cluster/Basic.hs new file mode 100644 index 0000000000..f1b5e87649 --- /dev/null +++ b/Logs/Cluster/Basic.hs @@ -0,0 +1,91 @@ +{- git-annex cluster log, basics + - + - Copyright 2024 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +{-# LANGUAGE OverloadedStrings, TupleSections #-} + +module Logs.Cluster.Basic ( + module Types.Cluster, + getClustersWith, + recordCluster, +) where + +import qualified Annex +import Annex.Common +import qualified Annex.Branch +import Types.Cluster +import Logs +import Logs.UUIDBased +import Logs.MapLog + +import qualified Data.Set as S +import qualified Data.Map as M +import Data.ByteString.Builder +import qualified Data.Attoparsec.ByteString as A +import qualified Data.Attoparsec.ByteString.Char8 as A8 +import qualified Data.ByteString.Lazy as L + +{- Gets the clusters. Note that this includes any dead nodes, + - unless a function is provided to remove them. + -} +getClustersWith + :: (M.Map ClusterUUID (S.Set ClusterNodeUUID) + -> M.Map ClusterUUID (S.Set ClusterNodeUUID)) + -> Annex Clusters +getClustersWith removedeadnodes = do + m <- removedeadnodes + . convclusteruuids + . M.map value + . fromMapLog + . parseClusterLog + <$> Annex.Branch.get clusterLog + return $ Clusters + { clusterUUIDs = m + , clusterNodeUUIDs = M.foldlWithKey inverter mempty m + } + where + convclusteruuids :: M.Map UUID (S.Set ClusterNodeUUID) -> M.Map ClusterUUID (S.Set ClusterNodeUUID) + convclusteruuids = M.fromList + . mapMaybe (\(mk, v) -> (, v) <$> mk) + . M.toList . M.mapKeys mkClusterUUID + inverter m k v = M.unionWith (<>) m + (M.fromList (map (, S.singleton k) (S.toList v))) + +recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex () +recordCluster clusteruuid nodeuuids = do + -- If a private UUID has been configured as a cluster node, + -- avoid leaking it into the git-annex log. + privateuuids <- annexPrivateRepos <$> Annex.getGitConfig + let nodeuuids' = S.filter + (\(ClusterNodeUUID n) -> S.notMember n privateuuids) + nodeuuids + + c <- currentVectorClock + Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $ + (buildLogNew buildClusterNodeList) + . changeLog c (fromClusterUUID clusteruuid) nodeuuids' + . parseClusterLog + +buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder +buildClusterNodeList = assemble + . map (buildUUID . fromClusterNodeUUID) + . S.toList + where + assemble [] = mempty + assemble (x:[]) = x + assemble (x:y:l) = x <> " " <> assemble (y:l) + +parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID) +parseClusterLog = parseLogNew parseClusterNodeList + +parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID) +parseClusterNodeList = S.fromList <$> many parseword + where + parseword = parsenode + <* ((const () <$> A8.char ' ') <|> A.endOfInput) + parsenode = ClusterNodeUUID + <$> (toUUID <$> A8.takeWhile1 (/= ' ')) + diff --git a/Remote/Git.hs b/Remote/Git.hs index 6e3906892f..6c29c28cfe 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -45,6 +45,7 @@ import Types.CleanupActions import qualified CmdLine.GitAnnexShell.Fields as Fields import Logs.Location import Logs.Proxy +import Logs.Cluster.Basic import Utility.Metered import Utility.Env import Utility.Batch @@ -794,26 +795,27 @@ listProxied proxies rs = concat <$> mapM go rs else case M.lookup cu proxies of Nothing -> pure [] Just s -> catMaybes - <$> mapM (mkproxied g r) (S.toList s) + <$> mapM (mkproxied g r s) (S.toList s) proxiedremotename r p = do n <- Git.remoteName r pure $ n ++ "-" ++ proxyRemoteName p - mkproxied g r p = case proxiedremotename r p of + mkproxied g r proxied p = case proxiedremotename r p of Nothing -> pure Nothing - Just proxyname -> mkproxied' g r p proxyname + Just proxyname -> mkproxied' g r proxied p proxyname -- The proxied remote is constructed by renaming the proxy remote, -- changing its uuid, and setting the proxied remote's inherited -- configs and uuid in Annex state. - mkproxied' g r p proxyname + mkproxied' g r proxied p proxyname | any isconfig (M.keys (Git.config g)) = pure Nothing | otherwise = do + clusters <- getClustersWith id -- Not using addGitConfigOverride for inherited -- configs, because child git processes do not -- need them to be provided with -c. - Annex.adjustGitRepo (pure . annexconfigadjuster) + Annex.adjustGitRepo (pure . annexconfigadjuster clusters) return $ Just $ renamedr where renamedr = @@ -825,10 +827,11 @@ listProxied proxies rs = concat <$> mapM go rs , Git.fullconfig = c } - annexconfigadjuster r' = + annexconfigadjuster clusters r' = let c = adduuid (configRepoUUID renamedr) $ addurl $ addproxied $ + adjustclusternode clusters $ inheritconfigs $ Git.fullconfig r' in r' { Git.config = M.map Prelude.head c @@ -838,11 +841,27 @@ listProxied proxies rs = concat <$> mapM go rs adduuid ck = M.insert ck [Git.ConfigValue $ fromUUID $ proxyRemoteUUID p] - addurl = M.insert (remoteConfig renamedr "url") + addurl = M.insert (remoteConfig renamedr (remoteGitConfigKey UrlField)) [Git.ConfigValue $ encodeBS $ Git.repoLocation r] - addproxied = M.insert (remoteAnnexConfig renamedr "proxied") - [Git.ConfigValue $ Git.Config.boolConfig' True] + addproxied = addremoteannexfield ProxiedField True + + -- A node of a cluster that is being proxied along with + -- that cluster does not need to be synced with + -- by default, because syncing with the cluster will + -- effectively sync with all of its nodes. + adjustclusternode clusters = + case M.lookup (ClusterNodeUUID (proxyRemoteUUID p)) (clusterNodeUUIDs clusters) of + Just cs + | any (\c -> S.member (fromClusterUUID c) proxieduuids) (S.toList cs) -> + addremoteannexfield SyncField False + _ -> id + + proxieduuids = S.map proxyRemoteUUID proxied + + addremoteannexfield f b = M.insert + (remoteAnnexConfig renamedr (remoteGitConfigKey f)) + [Git.ConfigValue $ Git.Config.boolConfig' b] inheritconfigs c = foldl' inheritconfig c proxyInheritedFields diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 56eb25713b..4b1827306c 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -22,7 +22,9 @@ module Types.GitConfig ( RemoteNameable(..), remoteAnnexConfig, remoteConfig, - proxyInheritedFields + RemoteGitConfigField(..), + remoteGitConfigKey, + proxyInheritedFields, ) where import Common diff --git a/doc/design/balanced_preferred_content.mdwn b/doc/design/balanced_preferred_content.mdwn index cd93d89313..a41483f329 100644 --- a/doc/design/balanced_preferred_content.mdwn +++ b/doc/design/balanced_preferred_content.mdwn @@ -124,9 +124,16 @@ See [[todo/proving_preferred_content_behavior]]. ## rebalancing In both the 3 of 5 use case and a split brain situation, it's possible for -content to end up not optimally balanced between repositories. git-annex -can be made to operate in a mode where it does additional work to rebalance -repositories. +content to end up not optimally balanced between repositories. + +(There are also situations where a cluster node ends up without a copy +of a file that is preferred content, or where adding a copy to a node +would satisfy numcopies. This can happen eg, when a client sends a file +to a single node rather than to the cluster. Rebalancing also will deal +with those.) + +git-annex can be made to operate in a mode where it does additional work +to rebalance repositories. This can be an option like --rebalance, that changes how the preferred content expression is evaluated. The user can choose where and when to run that. diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 15637fe78a..99800b7c37 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -26,17 +26,8 @@ In development on the `proxy` branch. For June's work on [[design/passthrough_proxy]], remaining todos: -* On upload to cluster, send to nodes where it's preferred content, and not - to other nodes. - -* `git-annex sync --content` etc, when operating on clusters, should first - operate on the cluster as a whole, to take advantages of fanout on upload - and mass drop. Only operate on individual cluster nodes afterwards, - to handle cases such as a cluster containing a key, but some node - wanting and lacking the key. Perhaps just setting cost for nodes slightly - higher than the cluster cost will be enough? Or should it even send a key - to a cluster node if the cluster contains the key? Perhaps that is - unnecessary work, the cluster should be able to rebalance itself. +* On upload to cluster, send to nodes where its preferred content, and not + to other nodes. Unless no nodes prefer it, then what? * Getting a key from a cluster currently always selects the lowest cost remote, and always the same remote if cost is the same. Should @@ -116,3 +107,6 @@ For June's work on [[design/passthrough_proxy]], remaining todos: which UUIDs it was dropped from. (done) * `git-annex testremote` works against proxied remote and cluster. (done) + +* Avoid `git-annex sync --content` etc from operating on cluster nodes by + default since syncing with a cluster implicitly syncs with its nodes. (done) diff --git a/git-annex.cabal b/git-annex.cabal index 51ef9a0f44..c9c2397d03 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -821,6 +821,7 @@ Executable git-annex Logs.Chunk Logs.Chunk.Pure Logs.Cluster + Logs.Cluster.Basic Logs.Config Logs.ContentIdentifier Logs.ContentIdentifier.Pure