From 5c99f6247e758d2f57ac81deee2e45917d6ca352 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 31 Aug 2018 12:23:22 -0400 Subject: [PATCH] per-remote metadata storage Actually very straightforward reuse of the metadata log file code. Although I had to add a todo item as git-annex forget won't clean up dead remote's metadata yet. This would be worth adding to the external special remote interface sometime. Have not opened a todo though, guess I'll wait until something needs it. This commit was supported by the NSF-funded DataLad project. --- Command/MetaData.hs | 2 +- Logs.hs | 14 ++++- Logs/MetaData.hs | 51 ++++++++++++++----- Types/MetaData.hs | 31 ++++++++++- doc/internals.mdwn | 18 +++++++ ...adata_needs_to_be_cleaned_in_dropdead.mdwn | 2 + 6 files changed, 100 insertions(+), 18 deletions(-) create mode 100644 doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn diff --git a/Command/MetaData.hs b/Command/MetaData.hs index 23f16a53a0..1e9e43423f 100644 --- a/Command/MetaData.hs +++ b/Command/MetaData.hs @@ -112,7 +112,7 @@ perform c o k = case getSet o of Set ms -> do oldm <- getCurrentMetaData k let m = combineMetaData $ map (modMeta oldm) ms - addMetaData' k m c + addMetaDataClocked k m c next $ cleanup k _ -> next $ cleanup k diff --git a/Logs.hs b/Logs.hs index 7b6c7dd20d..db865716b9 100644 --- a/Logs.hs +++ b/Logs.hs @@ -1,6 +1,6 @@ {- git-annex log file names - - - Copyright 2013-2015 Joey Hess + - Copyright 2013-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -26,7 +26,7 @@ getLogVariety f | f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog | isRemoteStateLog f = Just NewUUIDBasedLog | isChunkLog f = ChunkLog <$> chunkLogFileKey f - | isMetaDataLog f || f `elem` otherLogs = Just OtherLog + | isMetaDataLog f || isRemoteMetaDataLog f || f `elem` otherLogs = Just OtherLog | otherwise = PresenceLog <$> firstJust (presenceLogs f) {- All the uuid-based logs stored in the top of the git-annex branch. 
-} @@ -185,3 +185,13 @@ metaDataLogExt = ".log.met" isMetaDataLog :: FilePath -> Bool isMetaDataLog path = metaDataLogExt `isSuffixOf` path + +{- The filename of the remote metadata log for a given key. -} +remoteMetaDataLogFile :: GitConfig -> Key -> FilePath +remoteMetaDataLogFile config key = branchHashDir config key </> keyFile key ++ remoteMetaDataLogExt + +remoteMetaDataLogExt :: String +remoteMetaDataLogExt = ".log.rmet" + +isRemoteMetaDataLog :: FilePath -> Bool +isRemoteMetaDataLog path = remoteMetaDataLogExt `isSuffixOf` path diff --git a/Logs/MetaData.hs b/Logs/MetaData.hs index 0393702bc1..4610ef481f 100644 --- a/Logs/MetaData.hs +++ b/Logs/MetaData.hs @@ -1,7 +1,9 @@ -{- git-annex general metadata storage log +{- git-annex general metadata storage log and per-remote metadata storage log. - - A line of the log will look like "timestamp field [+-]value [...]" - + - (In the per-remote log, each field is prefixed with "uuid:") + - - Note that unset values are preserved. Consider this case: - - We have: @@ -18,7 +20,7 @@ - and so foo currently has no value. - - - - Copyright 2014 Joey Hess + - Copyright 2014-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -27,8 +29,10 @@ module Logs.MetaData ( getCurrentMetaData, + getCurrentRemoteMetaData, addMetaData, - addMetaData', + addRemoteMetaData, + addMetaDataClocked, currentMetaData, copyMetaData, ) where @@ -50,11 +54,6 @@ instance SingleValueSerializable MetaData where serialize = Types.MetaData.serialize deserialize = Types.MetaData.deserialize -getMetaDataLog :: Key -> Annex (Log MetaData) -getMetaDataLog key = do - config <- Annex.getGitConfig - readLog $ metaDataLogFile config key - logToCurrentMetaData :: [LogEntry MetaData] -> MetaData logToCurrentMetaData = currentMetaData . combineMetaData . map value @@ -65,8 +64,12 @@ logToCurrentMetaData = currentMetaData . combineMetaData . map value - currently set, based on timestamps in the log. 
-} getCurrentMetaData :: Key -> Annex MetaData -getCurrentMetaData k = do - ls <- S.toAscList <$> getMetaDataLog k +getCurrentMetaData = getCurrentMetaData' metaDataLogFile + +getCurrentMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> Annex MetaData +getCurrentMetaData' getlogfile k = do + config <- Annex.getGitConfig + ls <- S.toAscList <$> readLog (getlogfile config k) let loggedmeta = logToCurrentMetaData ls return $ currentMetaData $ unionMetaData loggedmeta (lastchanged ls loggedmeta) @@ -92,27 +95,42 @@ getCurrentMetaData k = do Unknown -> 0 showts = formatPOSIXTime "%F@%H-%M-%S" +getCurrentRemoteMetaData :: UUID -> Key -> Annex RemoteMetaData +getCurrentRemoteMetaData u k = mkRemoteMetaData u <$> + getCurrentMetaData' remoteMetaDataLogFile k + {- Adds in some metadata, which can override existing values, or unset - them, but otherwise leaves any existing metadata as-is. -} addMetaData :: Key -> MetaData -> Annex () -addMetaData k metadata = addMetaData' k metadata =<< liftIO currentVectorClock +addMetaData = addMetaData' metaDataLogFile + +addMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> Annex () +addMetaData' getlogfile k metadata = + addMetaDataClocked' getlogfile k metadata =<< liftIO currentVectorClock {- Reusing the same VectorClock when making changes to the metadata - of multiple keys is a nice optimisation. The same metadata lines - will tend to be generated across the different log files, and so - git will be able to pack the data more efficiently. 
-} -addMetaData' :: Key -> MetaData -> VectorClock -> Annex () -addMetaData' k d@(MetaData m) c +addMetaDataClocked :: Key -> MetaData -> VectorClock -> Annex () +addMetaDataClocked = addMetaDataClocked' metaDataLogFile + +addMetaDataClocked' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> VectorClock -> Annex () +addMetaDataClocked' getlogfile k d@(MetaData m) c | d == emptyMetaData = noop | otherwise = do config <- Annex.getGitConfig - Annex.Branch.change (metaDataLogFile config k) $ + Annex.Branch.change (getlogfile config k) $ showLog . simplifyLog . S.insert (LogEntry c metadata) . parseLog where metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m +addRemoteMetaData :: Key -> RemoteMetaData -> Annex () +addRemoteMetaData k m = do + addMetaData' remoteMetaDataLogFile k (fromRemoteMetaData m) + {- Simplify a log, removing historical values that are no longer - needed. - @@ -173,6 +191,11 @@ simplifyLog s = case sl of older = value l unique = older `differenceMetaData` newer +getMetaDataLog :: Key -> Annex (Log MetaData) +getMetaDataLog key = do + config <- Annex.getGitConfig + readLog $ metaDataLogFile config key + {- Copies the metadata from the old key to the new key. - - The exact content of the metadata file is copied, so that the timestamps diff --git a/Types/MetaData.hs b/Types/MetaData.hs index e0be811df2..f0dd833d6d 100644 --- a/Types/MetaData.hs +++ b/Types/MetaData.hs @@ -1,6 +1,6 @@ {- git-annex general metadata - - - Copyright 2014 Joey Hess + - Copyright 2014-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. 
-} @@ -36,6 +36,9 @@ module Types.MetaData ( metaDataValues, ModMeta(..), modMeta, + RemoteMetaData(..), + mkRemoteMetaData, + fromRemoteMetaData, prop_metadata_sane, prop_metadata_serialize ) where @@ -44,6 +47,7 @@ import Common import Utility.Base64 import Utility.QuickCheck import Utility.Aeson +import Types.UUID import qualified Data.Text as T import qualified Data.Set as S @@ -282,6 +286,31 @@ modMeta m (MaybeSetMeta f v) | otherwise = emptyMetaData modMeta m (ComposeModMeta a b) = unionMetaData (modMeta m a) (modMeta m b) +data RemoteMetaData = RemoteMetaData UUID MetaData + deriving (Show, Eq, Ord) + +{- Extracts only the fields prefixed with "uuid:", which belong to that + - remote. -} +mkRemoteMetaData :: UUID -> MetaData -> RemoteMetaData +mkRemoteMetaData u (MetaData m) = RemoteMetaData u $ MetaData $ + M.mapKeys removeprefix $ M.filterWithKey belongsremote m + where + belongsremote (MetaField f) _v = prefix `isPrefixOf` CI.original f + removeprefix (MetaField f) = MetaField $ + CI.mk $ drop prefixlen $ CI.original f + prefix = remoteMetaDataPrefix u + prefixlen = length prefix + +remoteMetaDataPrefix :: UUID -> String +remoteMetaDataPrefix u = fromUUID u ++ ":" + +fromRemoteMetaData :: RemoteMetaData -> MetaData +fromRemoteMetaData (RemoteMetaData u (MetaData m)) = MetaData $ + M.mapKeys addprefix m + where + addprefix (MetaField f) = MetaField $ CI.mk $ (prefix ++) $ CI.original f + prefix = remoteMetaDataPrefix u + {- Avoid putting too many fields in the map; extremely large maps make - the seriaization test slow due to the sheer amount of data. - It's unlikely that more than 100 fields of metadata will be used. -} diff --git a/doc/internals.mdwn b/doc/internals.mdwn index 5abe7aa070..2074d52e95 100644 --- a/doc/internals.mdwn +++ b/doc/internals.mdwn @@ -236,6 +236,10 @@ These log files are used by remotes that need to record their own state about keys. Each remote can store one line of data about a key, in its own format. 
+Note that only the most recently set state about a key is seen +by remotes using this. The `log.rmet` documented below does not have this +limitation. + Example: 1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55 blah blah @@ -262,6 +266,20 @@ reasonably short. If the value contains any whitespace (including \r or \n), it will be base64 encoded. Base64 encoded values are indicated by prefixing them with "!". +## `aaa/bbb/*.log.rmet` + +These log files store per-remote metadata about keys. This metadata +is only used by the remote. + +Format is the same as the metadata log files above, but each metadata key +is prefixed with "uuid:" to indicate the remote it belongs to. + +For example: + + 1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55:foo +bar + 1287290776.765152s 26339d22-446b-11e0-9101-002170d25c55:x +1 + 1291237510.141453s 26339d22-446b-11e0-9101-002170d25c55:x -1 26339d22-446b-11e0-9101-002170d25c55:x +2 + ## `aaa/bbb/*.log.cnk` These log files are used when objects are stored in chunked form on diff --git a/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn b/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn new file mode 100644 index 0000000000..28d67c5d92 --- /dev/null +++ b/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn @@ -0,0 +1,2 @@ +The newly added per-remote metadata log files need to be scrubbed clean of +dead remotes during a transition. --[[Joey]]