diff --git a/Annex/Cluster.hs b/Annex/Cluster.hs index cfb580dea5..10125e9c5f 100644 --- a/Annex/Cluster.hs +++ b/Annex/Cluster.hs @@ -47,7 +47,9 @@ proxyCluster clusteruuid proxydone servermode clientside protoerrhandler = do -- The protocol versions supported by the nodes are not -- known at this point, and would be too expensive to -- determine. Instead, pick the newest protocol version - -- that we and the client both speak. + -- that we and the client both speak. The proxy code + -- checks protocol versions when operating on multiple + -- nodes. let protocolversion = min maxProtocolVersion clientmaxversion selectnode <- clusterProxySelector clusteruuid protocolversion proxy proxydone proxymethods servermode clientside diff --git a/Logs/Cluster.hs b/Logs/Cluster.hs index 584dcb6867..8ee0dc975a 100644 --- a/Logs/Cluster.hs +++ b/Logs/Cluster.hs @@ -16,83 +16,26 @@ module Logs.Cluster ( import qualified Annex import Annex.Common -import qualified Annex.Branch import Types.Cluster -import Logs -import Logs.UUIDBased -import Logs.MapLog +import Logs.Cluster.Basic import Logs.Trust -import qualified Data.Set as S import qualified Data.Map as M -import Data.ByteString.Builder -import qualified Data.Attoparsec.ByteString as A -import qualified Data.Attoparsec.ByteString.Char8 as A8 -import qualified Data.ByteString.Lazy as L +import qualified Data.Set as S getClusters :: Annex Clusters getClusters = maybe loadClusters return =<< Annex.getState Annex.clusters -{- Loads the clusters and caches it for later. -} +{- Loads the clusters and caches it for later. + - + - This takes care of removing dead nodes from clusters, + - to avoid inserting the cluster uuid into the location + - log when only dead nodes contain the content of a key. + -} loadClusters :: Annex Clusters loadClusters = do - m <- convclusteruuids . M.map value . fromMapLog . parseClusterLog - <$> Annex.Branch.get clusterLog - m' <- removedeadnodes m - let clusters = Clusters - { clusterUUIDs = m' - , clusterNodeUUIDs = M.foldlWithKey inverter mempty m' - } + dead <- (S.fromList . map ClusterNodeUUID) + <$> trustGet DeadTrusted + clusters <- getClustersWith (M.map (`S.difference` dead)) Annex.changeState $ \s -> s { Annex.clusters = Just clusters } return clusters - where - convclusteruuids :: M.Map UUID (S.Set ClusterNodeUUID) -> M.Map ClusterUUID (S.Set ClusterNodeUUID) - convclusteruuids = M.fromList - . mapMaybe (\(mk, v) -> (, v) <$> mk) - . M.toList . M.mapKeys mkClusterUUID - inverter m k v = M.unionWith (<>) m - (M.fromList (map (, S.singleton k) (S.toList v))) - - -- Dead nodes are removed from clusters to avoid inserting the - -- cluster uuid into the location log when only dead nodes contain - -- the content of a key. - removedeadnodes m = do - dead <- (S.fromList . map ClusterNodeUUID) - <$> trustGet DeadTrusted - return $ M.map (`S.difference` dead) m - -recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex () -recordCluster clusteruuid nodeuuids = do - -- If a private UUID has been configured as a cluster node, - -- avoid leaking it into the git-annex log. - privateuuids <- annexPrivateRepos <$> Annex.getGitConfig - let nodeuuids' = S.filter - (\(ClusterNodeUUID n) -> S.notMember n privateuuids) - nodeuuids - - c <- currentVectorClock - Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $ - (buildLogNew buildClusterNodeList) - . changeLog c (fromClusterUUID clusteruuid) nodeuuids' - . parseClusterLog - -buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder -buildClusterNodeList = assemble - . map (buildUUID . fromClusterNodeUUID) - . S.toList - where - assemble [] = mempty - assemble (x:[]) = x - assemble (x:y:l) = x <> " " <> assemble (y:l) - -parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID) -parseClusterLog = parseLogNew parseClusterNodeList - -parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID) -parseClusterNodeList = S.fromList <$> many parseword - where - parseword = parsenode - <* ((const () <$> A8.char ' ') <|> A.endOfInput) - parsenode = ClusterNodeUUID - <$> (toUUID <$> A8.takeWhile1 (/= ' ')) - diff --git a/Logs/Cluster/Basic.hs b/Logs/Cluster/Basic.hs new file mode 100644 index 0000000000..f1b5e87649 --- /dev/null +++ b/Logs/Cluster/Basic.hs @@ -0,0 +1,91 @@ +{- git-annex cluster log, basics + - + - Copyright 2024 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +{-# LANGUAGE OverloadedStrings, TupleSections #-} + +module Logs.Cluster.Basic ( + module Types.Cluster, + getClustersWith, + recordCluster, +) where + +import qualified Annex +import Annex.Common +import qualified Annex.Branch +import Types.Cluster +import Logs +import Logs.UUIDBased +import Logs.MapLog + +import qualified Data.Set as S +import qualified Data.Map as M +import Data.ByteString.Builder +import qualified Data.Attoparsec.ByteString as A +import qualified Data.Attoparsec.ByteString.Char8 as A8 +import qualified Data.ByteString.Lazy as L + +{- Gets the clusters. Note that this includes any dead nodes, + - unless a function is provided to remove them. + -} +getClustersWith + :: (M.Map ClusterUUID (S.Set ClusterNodeUUID) + -> M.Map ClusterUUID (S.Set ClusterNodeUUID)) + -> Annex Clusters +getClustersWith removedeadnodes = do + m <- removedeadnodes + . convclusteruuids + . M.map value + . fromMapLog + . parseClusterLog + <$> Annex.Branch.get clusterLog + return $ Clusters + { clusterUUIDs = m + , clusterNodeUUIDs = M.foldlWithKey inverter mempty m + } + where + convclusteruuids :: M.Map UUID (S.Set ClusterNodeUUID) -> M.Map ClusterUUID (S.Set ClusterNodeUUID) + convclusteruuids = M.fromList + . mapMaybe (\(mk, v) -> (, v) <$> mk) + . M.toList . M.mapKeys mkClusterUUID + inverter m k v = M.unionWith (<>) m + (M.fromList (map (, S.singleton k) (S.toList v))) + +recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex () +recordCluster clusteruuid nodeuuids = do + -- If a private UUID has been configured as a cluster node, + -- avoid leaking it into the git-annex log. + privateuuids <- annexPrivateRepos <$> Annex.getGitConfig + let nodeuuids' = S.filter + (\(ClusterNodeUUID n) -> S.notMember n privateuuids) + nodeuuids + + c <- currentVectorClock + Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $ + (buildLogNew buildClusterNodeList) + . changeLog c (fromClusterUUID clusteruuid) nodeuuids' + . parseClusterLog + +buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder +buildClusterNodeList = assemble + . map (buildUUID . fromClusterNodeUUID) + . S.toList + where + assemble [] = mempty + assemble (x:[]) = x + assemble (x:y:l) = x <> " " <> assemble (y:l) + +parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID) +parseClusterLog = parseLogNew parseClusterNodeList + +parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID) +parseClusterNodeList = S.fromList <$> many parseword + where + parseword = parsenode + <* ((const () <$> A8.char ' ') <|> A.endOfInput) + parsenode = ClusterNodeUUID + <$> (toUUID <$> A8.takeWhile1 (/= ' ')) + diff --git a/Remote/Git.hs b/Remote/Git.hs index 6e3906892f..6c29c28cfe 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -45,6 +45,7 @@ import Types.CleanupActions import qualified CmdLine.GitAnnexShell.Fields as Fields import Logs.Location import Logs.Proxy +import Logs.Cluster.Basic import Utility.Metered import Utility.Env import Utility.Batch @@ -794,26 +795,27 @@ listProxied proxies rs = concat <$> mapM go rs else case M.lookup cu proxies of Nothing -> pure [] Just s -> catMaybes - <$> mapM (mkproxied g r) (S.toList s) + <$> mapM (mkproxied g r s) (S.toList s) proxiedremotename r p = do n <- Git.remoteName r pure $ n ++ "-" ++ proxyRemoteName p - mkproxied g r p = case proxiedremotename r p of + mkproxied g r proxied p = case proxiedremotename r p of Nothing -> pure Nothing - Just proxyname -> mkproxied' g r p proxyname + Just proxyname -> mkproxied' g r proxied p proxyname -- The proxied remote is constructed by renaming the proxy remote, -- changing its uuid, and setting the proxied remote's inherited -- configs and uuid in Annex state. - mkproxied' g r p proxyname + mkproxied' g r proxied p proxyname | any isconfig (M.keys (Git.config g)) = pure Nothing | otherwise = do + clusters <- getClustersWith id -- Not using addGitConfigOverride for inherited -- configs, because child git processes do not -- need them to be provided with -c. - Annex.adjustGitRepo (pure . annexconfigadjuster) + Annex.adjustGitRepo (pure . annexconfigadjuster clusters) return $ Just $ renamedr where renamedr = @@ -825,10 +827,11 @@ listProxied proxies rs = concat <$> mapM go rs , Git.fullconfig = c } - annexconfigadjuster r' = + annexconfigadjuster clusters r' = let c = adduuid (configRepoUUID renamedr) $ addurl $ addproxied $ + adjustclusternode clusters $ inheritconfigs $ Git.fullconfig r' in r' { Git.config = M.map Prelude.head c @@ -838,11 +841,27 @@ listProxied proxies rs = concat <$> mapM go rs adduuid ck = M.insert ck [Git.ConfigValue $ fromUUID $ proxyRemoteUUID p] - addurl = M.insert (remoteConfig renamedr "url") + addurl = M.insert (remoteConfig renamedr (remoteGitConfigKey UrlField)) [Git.ConfigValue $ encodeBS $ Git.repoLocation r] - addproxied = M.insert (remoteAnnexConfig renamedr "proxied") - [Git.ConfigValue $ Git.Config.boolConfig' True] + addproxied = addremoteannexfield ProxiedField True + + -- A node of a cluster that is being proxied along with + -- that cluster does not need to be synced with + -- by default, because syncing with the cluster will + -- effectively sync with all of its nodes. + adjustclusternode clusters = + case M.lookup (ClusterNodeUUID (proxyRemoteUUID p)) (clusterNodeUUIDs clusters) of + Just cs + | any (\c -> S.member (fromClusterUUID c) proxieduuids) (S.toList cs) -> + addremoteannexfield SyncField False + _ -> id + + proxieduuids = S.map proxyRemoteUUID proxied + + addremoteannexfield f b = M.insert + (remoteAnnexConfig renamedr (remoteGitConfigKey f)) + [Git.ConfigValue $ Git.Config.boolConfig' b] inheritconfigs c = foldl' inheritconfig c proxyInheritedFields diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 56eb25713b..4b1827306c 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -22,7 +22,9 @@ module Types.GitConfig ( RemoteNameable(..), remoteAnnexConfig, remoteConfig, - proxyInheritedFields + RemoteGitConfigField(..), + remoteGitConfigKey, + proxyInheritedFields, ) where import Common diff --git a/doc/design/balanced_preferred_content.mdwn b/doc/design/balanced_preferred_content.mdwn index cd93d89313..a41483f329 100644 --- a/doc/design/balanced_preferred_content.mdwn +++ b/doc/design/balanced_preferred_content.mdwn @@ -124,9 +124,16 @@ See [[todo/proving_preferred_content_behavior]]. ## rebalancing In both the 3 of 5 use case and a split brain situation, it's possible for -content to end up not optimally balanced between repositories. git-annex -can be made to operate in a mode where it does additional work to rebalance -repositories. +content to end up not optimally balanced between repositories. + +(There are also situations where a cluster node ends up without a copy +of a file that is preferred content, or where adding a copy to a node +would satisfy numcopies. This can happen eg, when a client sends a file +to a single node rather than to the cluster. Rebalancing also will deal +with those.) + +git-annex can be made to operate in a mode where it does additional work +to rebalance repositories. This can be an option like --rebalance, that changes how the preferred content expression is evaluated. The user can choose where and when to run that. diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 15637fe78a..99800b7c37 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -26,17 +26,8 @@ In development on the `proxy` branch. For June's work on [[design/passthrough_proxy]], remaining todos: -* On upload to cluster, send to nodes where it's preferred content, and not - to other nodes. - -* `git-annex sync --content` etc, when operating on clusters, should first - operate on the cluster as a whole, to take advantages of fanout on upload - and mass drop. Only operate on individual cluster nodes afterwards, - to handle cases such as a cluster containing a key, but some node - wanting and lacking the key. Perhaps just setting cost for nodes slightly - higher than the cluster cost will be enough? Or should it even send a key - to a cluster node if the cluster contains the key? Perhaps that is - unnecessary work, the cluster should be able to rebalance itself. +* On upload to cluster, send to nodes where its preferred content, and not + to other nodes. Unless no nodes prefer it, then what? * Getting a key from a cluster currently always selects the lowest cost remote, and always the same remote if cost is the same. Should @@ -116,3 +107,6 @@ For June's work on [[design/passthrough_proxy]], remaining todos: which UUIDs it was dropped from. (done) * `git-annex testremote` works against proxied remote and cluster. (done) + +* Avoid `git-annex sync --content` etc from operating on cluster nodes by + default since syncing with a cluster implicitly syncs with its nodes. (done) diff --git a/git-annex.cabal b/git-annex.cabal index 51ef9a0f44..c9c2397d03 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -821,6 +821,7 @@ Executable git-annex Logs.Chunk Logs.Chunk.Pure Logs.Cluster + Logs.Cluster.Basic Logs.Config Logs.ContentIdentifier Logs.ContentIdentifier.Pure