per-remote metadata storage

Actually very straightforward reuse of the metadata log file code.
Although I had to add a todo item as git-annex forget won't clean up
dead remote's metadata yet.

This would be worth adding to the external special remote interface
sometime. Have not opened a todo though, guess I'll wait until something
needs it.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2018-08-31 12:23:22 -04:00
parent 9d78a4387f
commit 5c99f6247e
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 100 additions and 18 deletions

View file

@ -112,7 +112,7 @@ perform c o k = case getSet o of
Set ms -> do
oldm <- getCurrentMetaData k
let m = combineMetaData $ map (modMeta oldm) ms
addMetaData' k m c
addMetaDataClocked k m c
next $ cleanup k
_ -> next $ cleanup k

14
Logs.hs
View file

@ -1,6 +1,6 @@
{- git-annex log file names
-
- Copyright 2013-2015 Joey Hess <id@joeyh.name>
- Copyright 2013-2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -26,7 +26,7 @@ getLogVariety f
| f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog
| isRemoteStateLog f = Just NewUUIDBasedLog
| isChunkLog f = ChunkLog <$> chunkLogFileKey f
| isMetaDataLog f || f `elem` otherLogs = Just OtherLog
| isMetaDataLog f || isRemoteMetaDataLog f || f `elem` otherLogs = Just OtherLog
| otherwise = PresenceLog <$> firstJust (presenceLogs f)
{- All the uuid-based logs stored in the top of the git-annex branch. -}
@ -185,3 +185,13 @@ metaDataLogExt = ".log.met"
{- True when the path ends with the metadata log extension. -}
isMetaDataLog :: FilePath -> Bool
isMetaDataLog = isSuffixOf metaDataLogExt
{- The path of the per-remote metadata log for a given key: the key's
 - file name with the remote metadata log extension appended, inside the
 - key's branch hash directory. -}
remoteMetaDataLogFile :: GitConfig -> Key -> FilePath
remoteMetaDataLogFile config key = hashdir </> logname
  where
    hashdir = branchHashDir config key
    logname = keyFile key ++ remoteMetaDataLogExt
{- Extension appended to a key's file name to form the name of its
 - per-remote metadata log. -}
remoteMetaDataLogExt :: String
remoteMetaDataLogExt = ".log.rmet"
{- True when the path ends with the per-remote metadata log extension. -}
isRemoteMetaDataLog :: FilePath -> Bool
isRemoteMetaDataLog = isSuffixOf remoteMetaDataLogExt

View file

@ -1,7 +1,9 @@
{- git-annex general metadata storage log
{- git-annex general metadata storage log and per-remote metadata storage log.
-
- A line of the log will look like "timestamp field [+-]value [...]"
-
- (In the per-remote log, each field is prefixed with "uuid:")
-
- Note that unset values are preserved. Consider this case:
-
- We have:
@ -18,7 +20,7 @@
- and so foo currently has no value.
-
-
- Copyright 2014 Joey Hess <id@joeyh.name>
- Copyright 2014-2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -27,8 +29,10 @@
module Logs.MetaData (
getCurrentMetaData,
getCurrentRemoteMetaData,
addMetaData,
addMetaData',
addRemoteMetaData,
addMetaDataClocked,
currentMetaData,
copyMetaData,
) where
@ -50,11 +54,6 @@ instance SingleValueSerializable MetaData where
serialize = Types.MetaData.serialize
deserialize = Types.MetaData.deserialize
getMetaDataLog :: Key -> Annex (Log MetaData)
getMetaDataLog key = do
config <- Annex.getGitConfig
readLog $ metaDataLogFile config key
{- Combines the values of all the log entries, and reduces the result
 - with currentMetaData. -}
logToCurrentMetaData :: [LogEntry MetaData] -> MetaData
logToCurrentMetaData entries =
    currentMetaData (combineMetaData (map value entries))
@ -65,8 +64,12 @@ logToCurrentMetaData = currentMetaData . combineMetaData . map value
- currently set, based on timestamps in the log.
-}
getCurrentMetaData :: Key -> Annex MetaData
getCurrentMetaData k = do
ls <- S.toAscList <$> getMetaDataLog k
getCurrentMetaData = getCurrentMetaData' metaDataLogFile
getCurrentMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> Annex MetaData
getCurrentMetaData' getlogfile k = do
config <- Annex.getGitConfig
ls <- S.toAscList <$> readLog (getlogfile config k)
let loggedmeta = logToCurrentMetaData ls
return $ currentMetaData $ unionMetaData loggedmeta
(lastchanged ls loggedmeta)
@ -92,27 +95,42 @@ getCurrentMetaData k = do
Unknown -> 0
showts = formatPOSIXTime "%F@%H-%M-%S"
{- Reads the current metadata from the per-remote metadata log of a key,
 - and keeps only the fields that belong to the given remote. -}
getCurrentRemoteMetaData :: UUID -> Key -> Annex RemoteMetaData
getCurrentRemoteMetaData u k = do
    m <- getCurrentMetaData' remoteMetaDataLogFile k
    return (mkRemoteMetaData u m)
{- Adds in some metadata, which can override existing values, or unset
- them, but otherwise leaves any existing metadata as-is. -}
addMetaData :: Key -> MetaData -> Annex ()
addMetaData k metadata = addMetaData' k metadata =<< liftIO currentVectorClock
addMetaData = addMetaData' metaDataLogFile
addMetaData' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> Annex ()
addMetaData' getlogfile k metadata =
addMetaDataClocked' getlogfile k metadata =<< liftIO currentVectorClock
{- Reusing the same VectorClock when making changes to the metadata
- of multiple keys is a nice optimisation. The same metadata lines
- will tend to be generated across the different log files, and so
- git will be able to pack the data more efficiently. -}
addMetaData' :: Key -> MetaData -> VectorClock -> Annex ()
addMetaData' k d@(MetaData m) c
addMetaDataClocked :: Key -> MetaData -> VectorClock -> Annex ()
addMetaDataClocked = addMetaDataClocked' metaDataLogFile
addMetaDataClocked' :: (GitConfig -> Key -> FilePath) -> Key -> MetaData -> VectorClock -> Annex ()
addMetaDataClocked' getlogfile k d@(MetaData m) c
| d == emptyMetaData = noop
| otherwise = do
config <- Annex.getGitConfig
Annex.Branch.change (metaDataLogFile config k) $
Annex.Branch.change (getlogfile config k) $
showLog . simplifyLog
. S.insert (LogEntry c metadata)
. parseLog
where
metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m
{- Adds metadata belonging to a remote to the per-remote metadata log
 - of a key, using the current vector clock. -}
addRemoteMetaData :: Key -> RemoteMetaData -> Annex ()
addRemoteMetaData k = addMetaData' remoteMetaDataLogFile k . fromRemoteMetaData
{- Simplify a log, removing historical values that are no longer
- needed.
-
@ -173,6 +191,11 @@ simplifyLog s = case sl of
older = value l
unique = older `differenceMetaData` newer
{- Reads the general metadata log of a key, using readLog on the log
 - file name derived from the git config and the key. -}
getMetaDataLog :: Key -> Annex (Log MetaData)
getMetaDataLog key = Annex.getGitConfig >>= \config ->
    readLog (metaDataLogFile config key)
{- Copies the metadata from the old key to the new key.
-
- The exact content of the metadata file is copied, so that the timestamps

View file

@ -1,6 +1,6 @@
{- git-annex general metadata
-
- Copyright 2014 Joey Hess <id@joeyh.name>
- Copyright 2014-2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -36,6 +36,9 @@ module Types.MetaData (
metaDataValues,
ModMeta(..),
modMeta,
RemoteMetaData(..),
mkRemoteMetaData,
fromRemoteMetaData,
prop_metadata_sane,
prop_metadata_serialize
) where
@ -44,6 +47,7 @@ import Common
import Utility.Base64
import Utility.QuickCheck
import Utility.Aeson
import Types.UUID
import qualified Data.Text as T
import qualified Data.Set as S
@ -282,6 +286,31 @@ modMeta m (MaybeSetMeta f v)
| otherwise = emptyMetaData
modMeta m (ComposeModMeta a b) = unionMetaData (modMeta m a) (modMeta m b)
{- Metadata that belongs to one remote, paired with the UUID of that
 - remote. The field names held here do not carry the "uuid:" prefix;
 - mkRemoteMetaData strips it and fromRemoteMetaData puts it back. -}
data RemoteMetaData = RemoteMetaData UUID MetaData
deriving (Show, Eq, Ord)
{- Builds the RemoteMetaData of a remote out of MetaData: only the
 - fields whose name starts with the remote's "uuid:" prefix are kept,
 - and that prefix is removed from their names. -}
mkRemoteMetaData :: UUID -> MetaData -> RemoteMetaData
mkRemoteMetaData u (MetaData m) = RemoteMetaData u (MetaData stripped)
  where
    pfx = remoteMetaDataPrefix u
    -- Fields of other remotes are dropped before the names are rewritten.
    owned = M.filterWithKey (\(MetaField f) _ -> pfx `isPrefixOf` CI.original f) m
    stripped = M.mapKeys unprefixed owned
    unprefixed (MetaField f) =
        MetaField (CI.mk (drop (length pfx) (CI.original f)))
{- The string that marks a field name as belonging to a remote:
 - the remote's UUID followed by a colon. -}
remoteMetaDataPrefix :: UUID -> String
remoteMetaDataPrefix = (++ ":") . fromUUID
{- Converts RemoteMetaData to MetaData suitable for storing in the
 - per-remote metadata log, by putting the remote's "uuid:" prefix in
 - front of every field name.
 -
 - mkRemoteMetaData only recognises fields whose name starts with the
 - prefix, so the prefix must be prepended to the field name, not
 - appended to it; otherwise metadata written for a remote could never
 - be read back. -}
fromRemoteMetaData :: RemoteMetaData -> MetaData
fromRemoteMetaData (RemoteMetaData u (MetaData m)) = MetaData $
    M.mapKeys addprefix m
  where
    addprefix (MetaField f) = MetaField $ CI.mk $ prefix ++ CI.original f
    prefix = remoteMetaDataPrefix u
{- Avoid putting too many fields in the map; extremely large maps make
- the serialization test slow due to the sheer amount of data.
- It's unlikely that more than 100 fields of metadata will be used. -}

View file

@ -236,6 +236,10 @@ These log files are used by remotes that need to record their own state
about keys. Each remote can store one line of data about a key, in
its own format.
Note that only the most recently set state about a key is seen
by remotes using this. The `.log.rmet` files documented below do not have this
limitation.
Example:
1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55 blah blah
@ -262,6 +266,20 @@ reasonably short. If the value contains any whitespace
(including \r or \n), it will be base64 encoded. Base64 encoded values
are indicated by prefixing them with "!".
## `aaa/bbb/*.log.rmet`
These log files store per-remote metadata about keys. This metadata
is only used by the remote.
Format is the same as the metadata log files above, but each metadata field
name is prefixed with the UUID of the remote it belongs to, followed by a
colon ("uuid:").
For example:
1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55:foo +bar
1287290776.765152s 26339d22-446b-11e0-9101-002170d25c55:x +1
1291237510.141453s 26339d22-446b-11e0-9101-002170d25c55:x -1 26339d22-446b-11e0-9101-002170d25c55:x +2
## `aaa/bbb/*.log.cnk`
These log files are used when objects are stored in chunked form on

View file

@ -0,0 +1,2 @@
The newly added per-remote metadata log files need to be scrubbed clean of
dead remotes during a transition. --[[Joey]]