git-annex/Logs/Cluster.hs

99 lines
3 KiB
Haskell
Raw Normal View History

{- git-annex cluster log
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings, TupleSections #-}
module Logs.Cluster (
module Types.Cluster,
getClusters,
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
loadClusters,
recordCluster,
) where
import qualified Annex
import Annex.Common
import qualified Annex.Branch
import Types.Cluster
import Logs
import Logs.UUIDBased
import Logs.MapLog
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
import Logs.Trust
import qualified Data.Set as S
import qualified Data.Map as M
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8
import qualified Data.ByteString.Lazy as L
getClusters :: Annex Clusters
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
getClusters = maybe loadClusters return =<< Annex.getState Annex.clusters
2024-06-14 15:16:01 +00:00
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
{- Loads the clusters and caches it for later. -}
2024-06-14 15:16:01 +00:00
loadClusters :: Annex Clusters
loadClusters = do
m <- convclusteruuids . M.map value . fromMapLog . parseClusterLog
<$> Annex.Branch.get clusterLog
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
m' <- removedeadnodes m
2024-06-14 15:16:01 +00:00
let clusters = Clusters
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
{ clusterUUIDs = m'
, clusterNodeUUIDs = M.foldlWithKey inverter mempty m'
}
2024-06-14 15:16:01 +00:00
Annex.changeState $ \s -> s { Annex.clusters = Just clusters }
return clusters
where
convclusteruuids :: M.Map UUID (S.Set ClusterNodeUUID) -> M.Map ClusterUUID (S.Set ClusterNodeUUID)
convclusteruuids = M.fromList
. mapMaybe (\(mk, v) -> (, v) <$> mk)
. M.toList . M.mapKeys mkClusterUUID
inverter m k v = M.unionWith (<>) m
(M.fromList (map (, S.singleton k) (S.toList v)))
remove dead nodes when loading the cluster log This is to avoid inserting a cluster uuid into the location log when only dead nodes in the cluster contain the content of a key. One reason why this is necessary is Remote.keyLocations, which excludes dead repositories from the list. But there are probably many more. Implementing this was challenging, because Logs.Location importing Logs.Cluster which imports Logs.Trust which imports Remote.List resulted in an import cycle through several other modules. Resorted to making Logs.Location not import Logs.Cluster, and instead it assumes that Annex.clusters gets populated when necessary before it's called. That's done in Annex.Startup, which is run by the git-annex command (but not other commands) at early startup in initialized repos. Or, is run after initialization. Note that is Remote.Git, it is unable to import Annex.Startup, because Remote.Git importing Logs.Cluster leads the the same import cycle. So ensureInitialized is not passed annexStartup in there. Other commands, like git-annex-shell currently don't run annexStartup either. So there are cases where Logs.Location will not see clusters. So it won't add any cluster UUIDs when loading the log. That's ok, the only reason to do that is to make display of where objects are located include clusters, and to make commands like git-annex get --from treat keys as being located in a cluster. git-annex-shell certainly does not do anything like that, and I'm pretty sure Remote.Git (and callers to Remote.Git.onLocalRepo) don't either.
2024-06-16 18:35:07 +00:00
-- Dead nodes are removed from clusters to avoid inserting the
-- cluster uuid into the location log when only dead nodes contain
-- the content of a key.
removedeadnodes m = do
dead <- (S.fromList . map ClusterNodeUUID)
<$> trustGet DeadTrusted
return $ M.map (`S.difference` dead) m
recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex ()
recordCluster clusteruuid nodeuuids = do
-- If a private UUID has been configured as a cluster node,
-- avoid leaking it into the git-annex log.
privateuuids <- annexPrivateRepos <$> Annex.getGitConfig
let nodeuuids' = S.filter
(\(ClusterNodeUUID n) -> S.notMember n privateuuids)
nodeuuids
c <- currentVectorClock
Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $
(buildLogNew buildClusterNodeList)
. changeLog c (fromClusterUUID clusteruuid) nodeuuids'
. parseClusterLog
buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder
buildClusterNodeList = assemble
. map (buildUUID . fromClusterNodeUUID)
. S.toList
where
assemble [] = mempty
assemble (x:[]) = x
assemble (x:y:l) = x <> " " <> assemble (y:l)
parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID)
parseClusterLog = parseLogNew parseClusterNodeList
parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID)
parseClusterNodeList = S.fromList <$> many parseword
where
parseword = parsenode
<* ((const () <$> A8.char ' ') <|> A.endOfInput)
parsenode = ClusterNodeUUID
<$> (toUUID <$> A8.takeWhile1 (/= ' '))