From 9f7e76130ef9cf2ee3c3c0123a14814fe4952c7d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 12 Feb 2014 21:12:22 -0400 Subject: [PATCH] add metadata command to get/set metadata Adds metadata log, and command. Note that unsetting field values seems to currently be broken. And in general this has had all of 2 minutes worth of testing. This commit was sponsored by Julien Lefrique. --- Annex/Branch/Transitions.hs | 2 +- CmdLine/GitAnnex.hs | 2 + Command/MetaData.hs | 73 +++++++++++++++++++ Logs.hs | 23 ++++-- Logs/MetaData.hs | 135 ++++++++++++++++++++++++++++++++++++ Types/MetaData.hs | 57 ++++++++++++--- doc/design/metadata.mdwn | 14 ---- doc/git-annex.mdwn | 17 +++++ doc/internals.mdwn | 21 ++++++ 9 files changed, 312 insertions(+), 32 deletions(-) create mode 100644 Command/MetaData.hs create mode 100644 Logs/MetaData.hs diff --git a/Annex/Branch/Transitions.hs b/Annex/Branch/Transitions.hs index 95d47257a3..42c61d96a6 100644 --- a/Annex/Branch/Transitions.hs +++ b/Annex/Branch/Transitions.hs @@ -41,7 +41,7 @@ dropDead f content trustmap = case getLogVariety f of in if null newlog then RemoveFile else ChangeFile $ Presence.showLog newlog - Just SingleValueLog -> PreserveFile + Just OtherLog -> PreserveFile Nothing -> PreserveFile dropDeadFromUUIDBasedLog :: TrustMap -> UUIDBased.Log String -> UUIDBased.Log String diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index b25082963b..a67c6be291 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -26,6 +26,7 @@ import qualified Command.DropKey import qualified Command.TransferKey import qualified Command.TransferKeys import qualified Command.ReKey +import qualified Command.MetaData import qualified Command.Reinject import qualified Command.Fix import qualified Command.Init @@ -134,6 +135,7 @@ cmds = concat , Command.TransferKey.def , Command.TransferKeys.def , Command.ReKey.def + , Command.MetaData.def , Command.Fix.def , Command.Fsck.def , Command.Repair.def diff --git a/Command/MetaData.hs b/Command/MetaData.hs new file mode 100644 index 0000000000..f2c4abcead --- /dev/null +++ b/Command/MetaData.hs @@ -0,0 +1,73 @@ +{- git-annex command + - + - Copyright 2014 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Command.MetaData where + +import Common.Annex +import Command +import Logs.MetaData +import Types.MetaData + +import qualified Data.Set as S + +def :: [Command] +def = [command "metadata" (paramPair paramFile (paramRepeating "FIELD[+-]=VALUE")) seek + SectionUtility "sets metadata of a file"] + +seek :: CommandSeek +seek = withWords start + +start :: [String] -> CommandStart +start (file:settings) = ifAnnexed file + go + (error $ "not an annexed file, so cannot add metadata: " ++ file) + where + go (k, _b) = do + showStart "metadata" file + next $ perform k (map parse settings) +start _ = error "specify a file and the metadata to set" + +perform :: Key -> [Action] -> CommandPerform +perform k actions = do + m <- getCurrentMetaData k + if null actions + then next $ cleanup m + else do + let m' = foldr apply m actions + addMetaData k m' + next $ cleanup m' + +cleanup :: MetaData -> CommandCleanup +cleanup m = do + showLongNote $ unlines $ concatMap showmeta $ fromMetaData $ currentMetaData m + return True + where + showmeta (f, vs) = map (\v -> fromMetaField f ++ "=" ++ fromMetaValue v) $ S.toList vs + +data Action + = AddMeta MetaField MetaValue + | DelMeta MetaField MetaValue + | SetMeta MetaField MetaValue + +parse :: String -> Action +parse p = case lastMaybe f of + Just '+' -> AddMeta (mkf f') v + Just '-' -> DelMeta (mkf f') v + _ -> SetMeta (mkf f) v + where + (f, sv) = separate (== '=') p + f' = beginning f + v = toMetaValue sv + mkf fld = fromMaybe (badfield fld) (toMetaField fld) + badfield fld = error $ "Illegal metadata field name, \"" ++ fld ++ "\"" + +apply :: Action -> MetaData -> MetaData +apply (AddMeta f v) m = updateMetaData f v m +apply (DelMeta f oldv) m = updateMetaData f (unsetMetaValue oldv) m +apply (SetMeta f v) m = updateMetaData f v $ + foldr (updateMetaData f) m $ + map unsetMetaValue $ S.toList $ currentMetaDataValues f m diff --git a/Logs.hs b/Logs.hs index 1e7a8e8c4e..21908a9cf2 100644 --- a/Logs.hs +++ b/Logs.hs @@ -1,6 +1,6 @@ {- git-annex log file names - - - Copyright 2013 Joey Hess + - Copyright 2013-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -15,7 +15,7 @@ data LogVariety = UUIDBasedLog | NewUUIDBasedLog | PresenceLog Key - | SingleValueLog + | OtherLog deriving (Show) {- Converts a path from the git-annex branch into one of the varieties @@ -24,7 +24,7 @@ getLogVariety :: FilePath -> Maybe LogVariety getLogVariety f | f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog | isRemoteStateLog f = Just NewUUIDBasedLog - | f == numcopiesLog = Just SingleValueLog + | isMetaDataLog f || f == numcopiesLog = Just OtherLog | otherwise = PresenceLog <$> firstJust (presenceLogs f) {- All the uuid-based logs stored in the top of the git-annex branch. -} @@ -119,6 +119,16 @@ remoteStateLogExt = ".log.rmt" isRemoteStateLog :: FilePath -> Bool isRemoteStateLog path = remoteStateLogExt `isSuffixOf` path +{- The filename of the metadata log for a given key. -} +metaDataLogFile :: Key -> FilePath +metaDataLogFile key = hashDirLower key keyFile key ++ metaDataLogExt + +metaDataLogExt :: String +metaDataLogExt = ".log.met" + +isMetaDataLog :: FilePath -> Bool +isMetaDataLog path = metaDataLogExt `isSuffixOf` path + prop_logs_sane :: Key -> Bool prop_logs_sane dummykey = and [ isNothing (getLogVariety "unknown") @@ -126,7 +136,8 @@ prop_logs_sane dummykey = and , expect isPresenceLog (getLogVariety $ locationLogFile dummykey) , expect isPresenceLog (getLogVariety $ urlLogFile dummykey) , expect isNewUUIDBasedLog (getLogVariety $ remoteStateLogFile dummykey) - , expect isSingleValueLog (getLogVariety $ numcopiesLog) + , expect isOtherLog (getLogVariety $ metaDataLogFile dummykey) + , expect isOtherLog (getLogVariety $ numcopiesLog) ] where expect = maybe False @@ -136,5 +147,5 @@ prop_logs_sane dummykey = and isNewUUIDBasedLog _ = False isPresenceLog (PresenceLog k) = k == dummykey isPresenceLog _ = False - isSingleValueLog SingleValueLog = True - isSingleValueLog _ = False + isOtherLog OtherLog = True + isOtherLog _ = False diff --git a/Logs/MetaData.hs b/Logs/MetaData.hs new file mode 100644 index 0000000000..6f7f4154a2 --- /dev/null +++ b/Logs/MetaData.hs @@ -0,0 +1,135 @@ +{- git-annex general metadata storage log + - + - A line of the log will look like "timestamp field [+-]value [...]" + - + - Note that unset values are preserved. Consider this case: + - + - We have: + - + - 100 foo +x + - 200 foo -x + - + - An unmerged remote has: + - + - 150 foo +x + - + - After union merge, because the foo -x was preserved, we know that + - after the other remote redundantly set foo +x, it was unset, + - and so foo currently has no value. + - + - + - Copyright 2014 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module Logs.MetaData ( + getCurrentMetaData, + getMetaData, + setMetaData, + unsetMetaData, + addMetaData, + currentMetaData, +) where + +import Common.Annex +import Types.MetaData +import qualified Annex.Branch +import Logs +import Logs.SingleValue + +import qualified Data.Set as S +import Data.Time.Clock.POSIX + +instance SingleValueSerializable MetaData where + serialize = Types.MetaData.serialize + deserialize = Types.MetaData.deserialize + +getMetaData :: Key -> Annex (Log MetaData) +getMetaData = readLog . metaDataLogFile + +{- Go through the log from oldest to newest, and combine it all + - into a single MetaData representing the current state. -} +getCurrentMetaData :: Key -> Annex MetaData +getCurrentMetaData = currentMetaData . collect <$$> getMetaData + where + collect = foldl' unionMetaData newMetaData . map value . S.toAscList + +setMetaData :: Key -> MetaField -> String -> Annex () +setMetaData = setMetaData' True + +unsetMetaData :: Key -> MetaField -> String -> Annex () +unsetMetaData = setMetaData' False + +setMetaData' :: Bool -> Key -> MetaField -> String -> Annex () +setMetaData' isset k field s = addMetaData k $ + updateMetaData field (mkMetaValue (CurrentlySet isset) s) newMetaData + +{- Adds in some metadata, which can override existing values, or unset + - them, but otherwise leaves any existing metadata as-is. -} +addMetaData :: Key -> MetaData -> Annex () +addMetaData k metadata = do + now <- liftIO getPOSIXTime + Annex.Branch.change (metaDataLogFile k) $ + showLog . simplifyLog + . S.insert (LogEntry now metadata) + . parseLog + +{- Simplify a log, removing historical values that are no longer + - needed. + - + - This is not as simple as just making a single log line with the newest + - state of all metadata. Consider this case: + - + - We have: + - + - 100 foo +x bar +y + - 200 foo -x + - + - An unmerged remote has: + - + - 150 bar +z baz +w + - + - If what we have were simplified to "200 foo -x bar +y" then when the line + - from the remote became available, it would be older than the simplified + - line, and its change to bar would not take effect. That is wrong. + - + - Instead, simplify it to: (this simpliciation is optional) + - + - 100 bar +y (100 foo +x bar +y) + - 200 foo -x + - + - Now merging with the remote yields: + - + - 100 bar +y (100 foo +x bar +y) + - 150 bar +z baz +w + - 200 foo -x + - + - Simplifying again: + - + - 150 bar +z baz +w + - 200 foo -x + - + - In practice, there is little benefit to making simplications to lines + - that only remove some values, while leaving others on the line. + - Since lines are kept in git, that likely increases the size of the + - git repo (depending on compression), rather than saving any space. + - + - So, the only simplication that is actually done is to throw out an + - old line when all the values in it have been overridden by lines that + - came before + -} +simplifyLog :: Log MetaData -> Log MetaData +simplifyLog s = case S.toDescList s of + (newest:rest) -> S.fromList $ go [newest] (value newest) rest + _ -> s + where + go c _ [] = c + go c newer (l:ls) + | older `hasUniqueMetaData` newer = + go (l:c) (unionMetaData older newer) ls + | otherwise = go c newer ls + where + older = value l diff --git a/Types/MetaData.hs b/Types/MetaData.hs index ee6ba66a02..a8be9231d0 100644 --- a/Types/MetaData.hs +++ b/Types/MetaData.hs @@ -16,11 +16,16 @@ module Types.MetaData ( toMetaField, fromMetaField, toMetaValue, - toMetaValue', + mkMetaValue, + unsetMetaValue, fromMetaValue, + fromMetaData, newMetaData, updateMetaData, - getCurrentMetaData, + unionMetaData, + hasUniqueMetaData, + currentMetaData, + currentMetaDataValues, getAllMetaData, serialize, deserialize, @@ -37,7 +42,7 @@ import qualified Data.Map as M import Data.Char newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue)) - deriving (Show, Eq) + deriving (Show, Eq, Ord) {- A metadata value can be currently be set (True), or may have been - set before and we're remembering it no longer is (False). -} @@ -118,8 +123,11 @@ legalField f toMetaValue :: String -> MetaValue toMetaValue = MetaValue (CurrentlySet True) -toMetaValue' :: CurrentlySet -> String -> MetaValue -toMetaValue' = MetaValue +mkMetaValue :: CurrentlySet -> String -> MetaValue +mkMetaValue = MetaValue + +unsetMetaValue :: MetaValue -> MetaValue +unsetMetaValue (MetaValue _ s) = MetaValue (CurrentlySet False) s fromMetaField :: MetaField -> String fromMetaField (MetaField f) = f @@ -127,6 +135,9 @@ fromMetaField (MetaField f) = f fromMetaValue :: MetaValue -> String fromMetaValue (MetaValue _ f) = f +fromMetaData :: MetaData -> [(MetaField, S.Set MetaValue)] +fromMetaData (MetaData m) = M.toList m + newMetaData :: MetaData newMetaData = MetaData M.empty @@ -136,13 +147,38 @@ updateMetaData :: MetaField -> MetaValue -> MetaData -> MetaData updateMetaData f v (MetaData m) = MetaData $ M.insertWith' S.union f (S.singleton v) m -{- Gets only currently set values -} -getCurrentMetaData :: MetaField -> MetaData -> S.Set MetaValue -getCurrentMetaData f m = S.filter isSet (getAllMetaData f m) +{- New metadata overrides old._-} +unionMetaData :: MetaData -> MetaData -> MetaData +unionMetaData (MetaData old) (MetaData new) = MetaData $ + M.unionWith S.union new old + +{- Checks if m contains any fields with values that are not + - the same in comparewith. Note that unset and set values are + - considered to be the same, so if m sets a value and comparewith + - unsets it, m is not unique. However, if m changes the value, + - or adds a new value, it is unique. -} +hasUniqueMetaData :: MetaData -> MetaData -> Bool +hasUniqueMetaData (MetaData comparewith) (MetaData m) = + any uniquefield (M.toList m) + where + uniquefield :: (MetaField, S.Set MetaValue) -> Bool + uniquefield (f, v) = maybe True (uniquevalue v) (M.lookup f comparewith) + uniquevalue v1 v2 = not $ S.null $ S.difference v1 v2 isSet :: MetaValue -> Bool isSet (MetaValue (CurrentlySet isset) _) = isset +{- Gets only currently set values -} +currentMetaDataValues :: MetaField -> MetaData -> S.Set MetaValue +currentMetaDataValues f m = S.filter isSet (getAllMetaData f m) + +currentMetaData :: MetaData -> MetaData +currentMetaData (MetaData m) = removeEmptyFields $ MetaData $ + M.map (S.filter isSet) m + +removeEmptyFields :: MetaData -> MetaData +removeEmptyFields (MetaData m) = MetaData $ M.filter (not . S.null) m + {- Gets currently set values, but also values that have been unset. -} getAllMetaData :: MetaField -> MetaData -> S.Set MetaValue getAllMetaData f (MetaData m) = fromMaybe S.empty (M.lookup f m) @@ -164,7 +200,7 @@ instance Arbitrary MetaField where prop_updateMetaData_sane :: MetaData -> MetaField -> MetaValue -> Bool prop_updateMetaData_sane m f v = and [ S.member v $ getAllMetaData f m' - , not (isSet v) || S.member v (getCurrentMetaData f m') + , not (isSet v) || S.member v (currentMetaDataValues f m') ] where m' = updateMetaData f v m @@ -176,5 +212,4 @@ prop_metadata_serialize f v m = and , deserialize (serialize m') == Just m' ] where - m' = removeemptyfields m - removeemptyfields (MetaData x) = MetaData $ M.filter (not . S.null) x + m' = removeEmptyFields m diff --git a/doc/design/metadata.mdwn b/doc/design/metadata.mdwn index 40c085cdc0..3e2d4bf089 100644 --- a/doc/design/metadata.mdwn +++ b/doc/design/metadata.mdwn @@ -145,20 +145,6 @@ a tag was removed: 1287290991.152124s tag +baz 1291237510.141453s tag -bar -The end result is that tags foo and baz are set. This can be simplified: - - 1291237510.141453s tag +foo +baz -bar - -Note the reuse of the most recent timestamp in the simplified version, -rather than putting in the timestamp when the simplification was done. -This ensures that is some other repo is making changes, they won't get -trampled over. For example: - - 1291237510.141453s tag +foo +baz -bar - 1291239999.000000s tag +bar -foo - -Now tags bar and baz are set. - # efficient metadata lookup Looking up metadata for filtering so far requires traversing all keys in diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 4e672f6089..17d78c555d 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -695,6 +695,23 @@ subdirectories). # UTILITY COMMANDS +* `metadata file [field=value field+=value field-=value ...]` + + Each file can have any number of metadata fields attached to it, + which each in turn have any number of values. This sets metadata + for a file, or if run without any values, shows its current metadata. + + To set a field's value, removing any old value(s), use field=value. + + To add an additional value, use field+=value. + + To remove a value, use field-=value. + + For example, to set some tags on a file: + + git annex metadata annexscreencast.ogv tag+=video tag+=screencast + + * `migrate [path ...]` Changes the specified annexed files to use the default key-value backend diff --git a/doc/internals.mdwn b/doc/internals.mdwn index 1cf0cf5051..970e88ba02 100644 --- a/doc/internals.mdwn +++ b/doc/internals.mdwn @@ -146,6 +146,27 @@ Example: 1287290776.765152s e605dca6-446a-11e0-8b2a-002170d25c55 blah blah 1287290767.478634s 26339d22-446b-11e0-9101-002170d25c55 foo=bar +## `aaa/bbb/*.log.met` + +These log files are used to store arbitrary [[design/metadata]] about keys. +Each key can have any number of metadata fields. Each field has a set of +values. + +Lines are timestamped, and record when values are added (`field +value`), +but also when values are removed (`field -value`). Removed values +are retained in the log so that when merging an old line that sets a value +that was later unset, the value is not accidentially added back. + +For example: + + 1287290776.765152s tag +foo +bar author +joey + 1291237510.141453s tag -bar +baz + +The value can be completely arbitrary data, although it's typically +reasonably short. If the value contains any whitespace +(including \r or \r), it will be base64 encoded. Base64 encoded values +are indicated by prefixing them with "!" + ## `schedule.log` Used to record scheduled events, such as periodic fscks.