implement cluster.log

Not used yet. (Or tested.)

I did consider making the log start with the uuid of the node, followed
by the cluster uuid (or uuids). That would perhaps mean a smaller write
to the git-annex branch when adding a node, but overall the log file
would be larger, and it will be read and cached near to startup on most
git-annex runs.
This commit is contained in:
Joey Hess 2024-06-13 16:00:58 -04:00
parent 01f5015f30
commit aa56d433d5
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 135 additions and 7 deletions

View file

@ -99,6 +99,7 @@ topLevelNewUUIDBasedLogs :: [RawFilePath]
topLevelNewUUIDBasedLogs =
[ exportLog
, proxyLog
, clusterLog
]
{- Other top-level logs. -}
@ -158,6 +159,9 @@ exportLog = "export.log"
proxyLog :: RawFilePath
proxyLog = "proxy.log"
clusterLog :: RawFilePath
clusterLog = "cluster.log"
{- This is not a log file, it's where exported treeishes get grafted into
- the git-annex branch. -}
exportTreeGraftPoint :: RawFilePath

82
Logs/Cluster.hs Normal file
View file

@ -0,0 +1,82 @@
{- git-annex cluster log
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings, TupleSections #-}
module Logs.Cluster (
ClusterUUID(..),
ClusterNodeUUID(..),
getClusters,
recordCluster,
) where
import qualified Annex
import Annex.Common
import qualified Annex.Branch
import Types.Cluster
import Logs
import Logs.UUIDBased
import Logs.MapLog
import Annex.UUID
import qualified Data.Set as S
import qualified Data.Map as M
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8
import qualified Data.ByteString.Lazy as L
-- TODO caching
getClusters :: Annex Clusters
getClusters = do
m <- M.mapKeys ClusterUUID . M.map value
. fromMapLog . parseClusterLog
<$> Annex.Branch.get clusterLog
return $ Clusters
{ clusterUUIDs = m
, clusterNodeUUIDs = M.foldlWithKey inverter mempty m
}
where
inverter m k v = M.unionWith (<>) m
(M.fromList (map (, S.singleton k) (S.toList v)))
recordCluster :: ClusterUUID -> S.Set ClusterNodeUUID -> Annex ()
recordCluster clusteruuid nodeuuids = do
-- If a private UUID has been configured as a cluster node,
-- avoid leaking it into the git-annex log.
privateuuids <- annexPrivateRepos <$> Annex.getGitConfig
let nodeuuids' = S.filter
(\(ClusterNodeUUID n) -> S.notMember n privateuuids)
nodeuuids
c <- currentVectorClock
u <- getUUID
Annex.Branch.change (Annex.Branch.RegardingUUID [fromClusterUUID clusteruuid]) clusterLog $
(buildLogNew buildClusterNodeList)
. changeLog c u nodeuuids'
. parseClusterLog
buildClusterNodeList :: S.Set ClusterNodeUUID -> Builder
buildClusterNodeList = assemble
. map (buildUUID . fromClusterNodeUUID)
. S.toList
where
assemble [] = mempty
assemble (x:[]) = x
assemble (x:y:l) = x <> " " <> assemble (y:l)
parseClusterLog :: L.ByteString -> Log (S.Set ClusterNodeUUID)
parseClusterLog = parseLogNew parseClusterNodeList
parseClusterNodeList :: A.Parser (S.Set ClusterNodeUUID)
parseClusterNodeList = S.fromList <$> many parseword
where
parseword = parsenode
<* ((const () <$> A8.char ' ') <|> A.endOfInput)
parsenode = ClusterNodeUUID
<$> (toUUID <$> A8.takeWhile1 (/= ' '))

View file

@ -13,8 +13,6 @@ module Logs.Proxy (
recordProxies,
) where
import qualified Data.Map as M
import qualified Annex
import Annex.Common
import qualified Annex.Branch
@ -26,6 +24,7 @@ import Logs.MapLog
import Annex.UUID
import qualified Data.Set as S
import qualified Data.Map as M
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8

29
Types/Cluster.hs Normal file
View file

@ -0,0 +1,29 @@
{- git-annex cluster types
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.Cluster where
import Types.UUID
import qualified Data.Set as S
import qualified Data.Map as M
-- The UUID of a cluster as a whole.
newtype ClusterUUID = ClusterUUID { fromClusterUUID :: UUID }
deriving (Show, Eq, Ord)
-- The UUID of a node in a cluster. The UUID can be either the UUID of a
-- repository, or of another cluster.
newtype ClusterNodeUUID = ClusterNodeUUID { fromClusterNodeUUID :: UUID }
deriving (Show, Eq, Ord)
-- The same information is indexed two ways to allow fast lookups in either
-- direction.
data Clusters = Clusters
{ clusterUUIDs :: M.Map ClusterUUID (S.Set ClusterNodeUUID)
, clusterNodeUUIDs :: M.Map ClusterNodeUUID (S.Set ClusterUUID)
}

View file

@ -288,7 +288,7 @@ For example:
These log files store per-remote content identifiers for keys.
A given key may have any number of content identifiers.
The format is a timestamp, followed by the uuid of the remote,
The format is a timestamp, followed by the UUID of the remote,
followed by the content identifiers which are separated by colons.
If a content identifier contains a colon or \r or \n, it will be base64
encoded. Base64 encoded values are indicated by prefixing them with "!".
@ -312,17 +312,29 @@ For example, this logs that a remote has an object stored using both
Used to record what repositories are accessible via a proxy.
Each line starts with a timestamp, then the uuid of the repository
Each line starts with a timestamp, then the UUID of the repository
that can serve as a proxy, and then a list of the remotes that it can
proxy to, separated by spaces.
Each remote in the list consists of a uuid, followed by a colon (`:`)
and then a remote name.
Each remote in the list consists of a repository's UUID,
followed by a colon (`:`) and then a remote name.
For example:
1317929100.012345s e605dca6-446a-11e0-8b2a-002170d25c55 26339d22-446b-11e0-9101-002170d25c55:foo c076460c-2290-11ef-be53-b7f0d194c863:bar
## `cluster.log`
Used to record the UUIDs of clusters, and the UUIDs of the nodes
comprising each cluster.
Each line starts with a timestamp, then the UUID the cluster,
followed by a list of the UUIDs of its nodes, separated by spaces.
For example:
1317929100.012345s 5b070cc8-29b8-11ef-80e1-0fd524be241b 5c0c97d2-29b8-11ef-b1d2-5f3d1c80940d 5c40375e-29b8-11ef-814d-872959d2c013
## `schedule.log`
Used to record scheduled events, such as periodic fscks.

View file

@ -815,6 +815,7 @@ Executable git-annex
Logs.AdjustedBranchUpdate
Logs.Chunk
Logs.Chunk.Pure
Logs.Cluster
Logs.Config
Logs.ContentIdentifier
Logs.ContentIdentifier.Pure
@ -933,6 +934,7 @@ Executable git-annex
Types.BranchState
Types.CatFileHandles
Types.CleanupActions
Types.Cluster
Types.Command
Types.Concurrency
Types.Creds