per-remote metadata storage

Actually very straightforward reuse of the metadata log file code.
Although I had to add a todo item, as git-annex forget won't clean up
dead remotes' metadata yet.

This would be worth adding to the external special remote interface
sometime. Have not opened a todo though, guess I'll wait until something
needs it.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2018-08-31 12:23:22 -04:00
parent 9d78a4387f
commit 5c99f6247e
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 100 additions and 18 deletions

View file

@ -112,7 +112,7 @@ perform c o k = case getSet o of
Set ms -> do Set ms -> do
oldm <- getCurrentMetaData k oldm <- getCurrentMetaData k
let m = combineMetaData $ map (modMeta oldm) ms let m = combineMetaData $ map (modMeta oldm) ms
addMetaData' k m c addMetaDataClocked k m c
next $ cleanup k next $ cleanup k
_ -> next $ cleanup k _ -> next $ cleanup k

14
Logs.hs
View file

@ -1,6 +1,6 @@
{- git-annex log file names {- git-annex log file names
- -
- Copyright 2013-2015 Joey Hess <id@joeyh.name> - Copyright 2013-2018 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -26,7 +26,7 @@ getLogVariety f
| f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog | f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog
| isRemoteStateLog f = Just NewUUIDBasedLog | isRemoteStateLog f = Just NewUUIDBasedLog
| isChunkLog f = ChunkLog <$> chunkLogFileKey f | isChunkLog f = ChunkLog <$> chunkLogFileKey f
| isMetaDataLog f || f `elem` otherLogs = Just OtherLog | isMetaDataLog f || isRemoteMetaDataLog f || f `elem` otherLogs = Just OtherLog
| otherwise = PresenceLog <$> firstJust (presenceLogs f) | otherwise = PresenceLog <$> firstJust (presenceLogs f)
{- All the uuid-based logs stored in the top of the git-annex branch. -} {- All the uuid-based logs stored in the top of the git-annex branch. -}
@ -185,3 +185,13 @@ metaDataLogExt = ".log.met"
isMetaDataLog :: FilePath -> Bool isMetaDataLog :: FilePath -> Bool
isMetaDataLog path = metaDataLogExt `isSuffixOf` path isMetaDataLog path = metaDataLogExt `isSuffixOf` path
{- The per-remote metadata log file for a key: the directory that
 - branchHashDir gives for the key, joined with the key's file name plus
 - remoteMetaDataLogExt. -}
remoteMetaDataLogFile :: GitConfig -> Key -> FilePath
remoteMetaDataLogFile config key = hashdir </> logname
  where
    hashdir = branchHashDir config key
    logname = keyFile key ++ remoteMetaDataLogExt
{- File name extension of per-remote metadata logs. -}
remoteMetaDataLogExt :: String
remoteMetaDataLogExt = ".log.rmet"
{- Checks whether a path names a per-remote metadata log, going by its
 - extension. -}
isRemoteMetaDataLog :: FilePath -> Bool
isRemoteMetaDataLog = isSuffixOf remoteMetaDataLogExt

View file

@ -1,7 +1,9 @@
{- git-annex general metadata storage log {- git-annex general metadata storage log and per-remote metadata storage log.
- -
- A line of the log will look like "timestamp field [+-]value [...]" - A line of the log will look like "timestamp field [+-]value [...]"
- -
- (In the per-remote log, each field is prefixed with "uuid:")
-
- Note that unset values are preserved. Consider this case: - Note that unset values are preserved. Consider this case:
- -
- We have: - We have:
@ -18,7 +20,7 @@
- and so foo currently has no value. - and so foo currently has no value.
- -
- -
- Copyright 2014 Joey Hess <id@joeyh.name> - Copyright 2014-2018 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -27,8 +29,10 @@
module Logs.MetaData ( module Logs.MetaData (
getCurrentMetaData, getCurrentMetaData,
getCurrentRemoteMetaData,
addMetaData, addMetaData,
addMetaData', addRemoteMetaData,
addMetaDataClocked,
currentMetaData, currentMetaData,
copyMetaData, copyMetaData,
) where ) where
@ -50,11 +54,6 @@ instance SingleValueSerializable MetaData where
serialize = Types.MetaData.serialize serialize = Types.MetaData.serialize
deserialize = Types.MetaData.deserialize deserialize = Types.MetaData.deserialize
getMetaDataLog :: Key -> Annex (Log MetaData)
getMetaDataLog key = do
config <- Annex.getGitConfig
readLog $ metaDataLogFile config key
logToCurrentMetaData :: [LogEntry MetaData] -> MetaData logToCurrentMetaData :: [LogEntry MetaData] -> MetaData
logToCurrentMetaData = currentMetaData . combineMetaData . map value logToCurrentMetaData = currentMetaData . combineMetaData . map value
@ -65,8 +64,12 @@ logToCurrentMetaData = currentMetaData . combineMetaData . map value
- currently set, based on timestamps in the log. - currently set, based on timestamps in the log.
-} -}
getCurrentMetaData :: Key -> Annex MetaData getCurrentMetaData :: Key -> Annex MetaData
getCurrentMetaData k = do getCurrentMetaData = getCurrentMetaData' metaDataLogFile
ls <- S.toAscList <$> getMetaDataLog k
getCurrentMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> Annex MetaData
getCurrentMetaData' getlogfile k = do
config <- Annex.getGitConfig
ls <- S.toAscList <$> readLog (getlogfile config k)
let loggedmeta = logToCurrentMetaData ls let loggedmeta = logToCurrentMetaData ls
return $ currentMetaData $ unionMetaData loggedmeta return $ currentMetaData $ unionMetaData loggedmeta
(lastchanged ls loggedmeta) (lastchanged ls loggedmeta)
@ -92,27 +95,42 @@ getCurrentMetaData k = do
Unknown -> 0 Unknown -> 0
showts = formatPOSIXTime "%F@%H-%M-%S" showts = formatPOSIXTime "%F@%H-%M-%S"
{- Gets the current metadata from the per-remote metadata log of a key,
 - and passes it through mkRemoteMetaData for the remote with the given
 - UUID. -}
getCurrentRemoteMetaData :: UUID -> Key -> Annex RemoteMetaData
getCurrentRemoteMetaData u k = do
    allmeta <- getCurrentMetaData' remoteMetaDataLogFile k
    return (mkRemoteMetaData u allmeta)
{- Adds in some metadata, which can override existing values, or unset {- Adds in some metadata, which can override existing values, or unset
- them, but otherwise leaves any existing metadata as-is. -} - them, but otherwise leaves any existing metadata as-is. -}
addMetaData :: Key -> MetaData -> Annex () addMetaData :: Key -> MetaData -> Annex ()
addMetaData k metadata = addMetaData' k metadata =<< liftIO currentVectorClock addMetaData = addMetaData' metaDataLogFile
{- Like addMetaDataClocked', but stamps the change with the current
 - vector clock. The log file to change is picked by the supplied
 - function. -}
addMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> Annex ()
addMetaData' getlogfile k metadata = do
    c <- liftIO currentVectorClock
    addMetaDataClocked' getlogfile k metadata c
{- Reusing the same VectorClock when making changes to the metadata {- Reusing the same VectorClock when making changes to the metadata
- of multiple keys is a nice optimisation. The same metadata lines - of multiple keys is a nice optimisation. The same metadata lines
- will tend to be generated across the different log files, and so - will tend to be generated across the different log files, and so
- git will be able to pack the data more efficiently. -} - git will be able to pack the data more efficiently. -}
addMetaData' :: Key -> MetaData -> VectorClock -> Annex () addMetaDataClocked :: Key -> MetaData -> VectorClock -> Annex ()
addMetaData' k d@(MetaData m) c addMetaDataClocked = addMetaDataClocked' metaDataLogFile
addMetaDataClocked' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> VectorClock -> Annex ()
addMetaDataClocked' getlogfile k d@(MetaData m) c
| d == emptyMetaData = noop | d == emptyMetaData = noop
| otherwise = do | otherwise = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
Annex.Branch.change (metaDataLogFile config k) $ Annex.Branch.change (getlogfile config k) $
showLog . simplifyLog showLog . simplifyLog
. S.insert (LogEntry c metadata) . S.insert (LogEntry c metadata)
. parseLog . parseLog
where where
metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m
{- Adds a remote's metadata to the per-remote metadata log of a key,
 - after converting it with fromRemoteMetaData. -}
addRemoteMetaData :: Key -> RemoteMetaData -> Annex ()
addRemoteMetaData k = addMetaData' remoteMetaDataLogFile k . fromRemoteMetaData
{- Simplify a log, removing historical values that are no longer {- Simplify a log, removing historical values that are no longer
- needed. - needed.
- -
@ -173,6 +191,11 @@ simplifyLog s = case sl of
older = value l older = value l
unique = older `differenceMetaData` newer unique = older `differenceMetaData` newer
{- Reads the general metadata log of a key. -}
getMetaDataLog :: Key -> Annex (Log MetaData)
getMetaDataLog key = readLog . logfile =<< Annex.getGitConfig
  where
    logfile config = metaDataLogFile config key
{- Copies the metadata from the old key to the new key. {- Copies the metadata from the old key to the new key.
- -
- The exact content of the metadata file is copied, so that the timestamps - The exact content of the metadata file is copied, so that the timestamps

View file

@ -1,6 +1,6 @@
{- git-annex general metadata {- git-annex general metadata
- -
- Copyright 2014 Joey Hess <id@joeyh.name> - Copyright 2014-2018 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -36,6 +36,9 @@ module Types.MetaData (
metaDataValues, metaDataValues,
ModMeta(..), ModMeta(..),
modMeta, modMeta,
RemoteMetaData(..),
mkRemoteMetaData,
fromRemoteMetaData,
prop_metadata_sane, prop_metadata_sane,
prop_metadata_serialize prop_metadata_serialize
) where ) where
@ -44,6 +47,7 @@ import Common
import Utility.Base64 import Utility.Base64
import Utility.QuickCheck import Utility.QuickCheck
import Utility.Aeson import Utility.Aeson
import Types.UUID
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Set as S import qualified Data.Set as S
@ -282,6 +286,31 @@ modMeta m (MaybeSetMeta f v)
| otherwise = emptyMetaData | otherwise = emptyMetaData
modMeta m (ComposeModMeta a b) = unionMetaData (modMeta m a) (modMeta m b) modMeta m (ComposeModMeta a b) = unionMetaData (modMeta m a) (modMeta m b)
{- Metadata that belongs to one remote, paired with that remote's UUID.
 - mkRemoteMetaData and fromRemoteMetaData convert from and to MetaData
 - whose field names carry the remote's "uuid:" prefix. -}
data RemoteMetaData = RemoteMetaData UUID MetaData
deriving (Show, Eq, Ord)
{- Builds the RemoteMetaData of a remote out of metadata whose field
 - names may carry a "uuid:" prefix. Only the fields prefixed with the
 - remote's own uuid are kept, and the prefix is removed from their
 - names. -}
mkRemoteMetaData :: UUID -> MetaData -> RemoteMetaData
mkRemoteMetaData u (MetaData m) = RemoteMetaData u (MetaData stripped)
  where
    uuidprefix = remoteMetaDataPrefix u
    owned (MetaField f) _ = uuidprefix `isPrefixOf` CI.original f
    unprefixed (MetaField f) =
        MetaField (CI.mk (drop (length uuidprefix) (CI.original f)))
    stripped = M.mapKeys unprefixed (M.filterWithKey owned m)
{- The prefix put on the names of fields that belong to a remote:
 - its uuid followed by a colon. -}
remoteMetaDataPrefix :: UUID -> String
remoteMetaDataPrefix = (++ ":") . fromUUID
{- Converts a remote's metadata into the form stored in the per-remote
 - metadata log, where each field name starts with the remote's "uuid:"
 - prefix.
 -
 - The prefix has to go in front of the field name: mkRemoteMetaData
 - selects a remote's fields with isPrefixOf, so a field written with the
 - prefix anywhere else would never be read back. -}
fromRemoteMetaData :: RemoteMetaData -> MetaData
fromRemoteMetaData (RemoteMetaData u (MetaData m)) = MetaData $
    M.mapKeys addprefix m
  where
    addprefix (MetaField f) = MetaField $ CI.mk $ (prefix ++) $ CI.original f
    prefix = remoteMetaDataPrefix u
{- Avoid putting too many fields in the map; extremely large maps make {- Avoid putting too many fields in the map; extremely large maps make
- the serialization test slow due to the sheer amount of data. - the serialization test slow due to the sheer amount of data.
- It's unlikely that more than 100 fields of metadata will be used. -} - It's unlikely that more than 100 fields of metadata will be used. -}

View file

@ -236,6 +236,10 @@ These log files are used by remotes that need to record their own state
about keys. Each remote can store one line of data about a key, in about keys. Each remote can store one line of data about a key, in
its own format. its own format.
Note that only the most recently set state about a key is seen
by remotes using this. The `log.rmet` documented below does not have this
limitation.
Example: Example:
1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55 blah blah 1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55 blah blah
@ -262,6 +266,20 @@ reasonably short. If the value contains any whitespace
(including \r or \n), it will be base64 encoded. Base64 encoded values (including \r or \n), it will be base64 encoded. Base64 encoded values
are indicated by prefixing them with "!". are indicated by prefixing them with "!".
## `aaa/bbb/*.log.rmet`
These log files store per-remote metadata about keys. This metadata
is only used by the remote.
Format is the same as the metadata log files above, but each metadata key
is prefixed with "uuid:" to indicate the remote it belongs to.
For example:
1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55:foo +bar
1287290776.765152s 26339d22-446b-11e0-9101-002170d25c55:x +1
1291237510.141453s 26339d22-446b-11e0-9101-002170d25c55:x -1 26339d22-446b-11e0-9101-002170d25c55:x +2
## `aaa/bbb/*.log.cnk` ## `aaa/bbb/*.log.cnk`
These log files are used when objects are stored in chunked form on These log files are used when objects are stored in chunked form on

View file

@ -0,0 +1,2 @@
The newly added per-remote metadata log files need to be scrubbed clean of
dead remotes during a transition. --[[Joey]]