From 1b79d18a407feacbf45f4d3464e835ef0c106a7f Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 12 Feb 2014 17:54:28 -0400 Subject: [PATCH] data types and serialization for metadata A very haskell commit! Just data types, instances to serialize the metadata to a nice format, and QuickCheck tests. This commit was sponsored by Andreas Leha. --- Test.hs | 3 + Types/MetaData.hs | 180 +++++++++++++++++++++++++++++++++++++++ Utility/QuickCheck.hs | 6 +- doc/design/metadata.mdwn | 33 +++++++ 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 Types/MetaData.hs diff --git a/Test.hs b/Test.hs index a783071278..43f5939ee9 100644 --- a/Test.hs +++ b/Test.hs @@ -45,6 +45,7 @@ import qualified Logs.Remote import qualified Logs.Unused import qualified Logs.Transfer import qualified Logs.Presence +import qualified Types.MetaData import qualified Remote import qualified Types.Key import qualified Types.Messages @@ -144,6 +145,8 @@ properties = localOption (QuickCheckTests 1000) $ testGroup "QuickCheck" , testProperty "prop_hashes_stable" Utility.Hash.prop_hashes_stable , testProperty "prop_schedule_roundtrips" Utility.Scheduled.prop_schedule_roundtrips , testProperty "prop_duration_roundtrips" Utility.HumanTime.prop_duration_roundtrips + , testProperty "prop_updateMetaData_sane" Types.MetaData.prop_updateMetaData_sane + , testProperty "prop_metadata_serialize" Types.MetaData.prop_metadata_serialize ] {- These tests set up the test environment, but also test some basic parts diff --git a/Types/MetaData.hs b/Types/MetaData.hs new file mode 100644 index 0000000000..ee6ba66a02 --- /dev/null +++ b/Types/MetaData.hs @@ -0,0 +1,180 @@ +{- git-annex general metadata + - + - Copyright 2014 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +module Types.MetaData ( + MetaData, + MetaField, + MetaValue, + CurrentlySet(..), + MetaSerializable, + toMetaField, + fromMetaField, + toMetaValue, + toMetaValue', + fromMetaValue, + newMetaData, + updateMetaData, + getCurrentMetaData, + getAllMetaData, + serialize, + deserialize, + prop_updateMetaData_sane, + prop_metadata_serialize +) where + +import Common +import Utility.Base64 +import Utility.QuickCheck + +import qualified Data.Set as S +import qualified Data.Map as M +import Data.Char + +newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue)) + deriving (Show, Eq) + +{- A metadata value can be currently be set (True), or may have been + - set before and we're remembering it no longer is (False). -} +newtype CurrentlySet = CurrentlySet Bool + deriving (Show, Eq, Ord, Arbitrary) + +newtype MetaField = MetaField String + deriving (Show, Eq, Ord) + +data MetaValue = MetaValue CurrentlySet String + deriving (Show, Ord) + +{- Metadata values are compared equal whether currently set or not. -} +instance Eq MetaValue where + MetaValue _ a == MetaValue _ b = a == b + +{- MetaData is serialized to a format like: + - + - field1 +val1 +val2 -val3 field2 +val4 +val5 + -} +class MetaSerializable v where + serialize :: v -> String + deserialize :: String -> Maybe v + +instance MetaSerializable MetaData where + serialize (MetaData m) = unwords $ concatMap go $ M.toList m + where + go (f, vs) = serialize f : map serialize (S.toList vs) + deserialize = Just . getfield newMetaData . words + where + getfield m [] = m + getfield m (w:ws) = maybe m (getvalues m ws) (deserialize w) + getvalues m [] _ = m + getvalues m l@(w:ws) f = case deserialize w of + Just v -> getvalues (updateMetaData f v m) ws f + Nothing -> getfield m l + +instance MetaSerializable MetaField where + serialize (MetaField f) = f + deserialize = Just . MetaField + +{- Base64 problimatic values. -} +instance MetaSerializable MetaValue where + serialize (MetaValue isset v) = + serialize isset ++ + if any isSpace v || "!" `isPrefixOf` v + then '!' : toB64 v + else v + deserialize (isset:'!':v) = MetaValue + <$> deserialize [isset] + <*> fromB64Maybe v + deserialize (isset:v) = MetaValue + <$> deserialize [isset] + <*> pure v + deserialize [] = Nothing + +instance MetaSerializable CurrentlySet where + serialize (CurrentlySet True) = "+" + serialize (CurrentlySet False) = "-" + deserialize "+" = Just (CurrentlySet True) + deserialize "-" = Just (CurrentlySet False) + deserialize _ = Nothing + +{- Fields cannot be empty, contain whitespace, or start with "+-" as + - that would break the serialization. -} +toMetaField :: String -> Maybe MetaField +toMetaField f + | legalField f = Just $ MetaField f + | otherwise = Nothing + +legalField :: String -> Bool +legalField f + | null f = False + | any isSpace f = False + | any (`isPrefixOf` f) ["+", "-"] = False + | otherwise = True + +toMetaValue :: String -> MetaValue +toMetaValue = MetaValue (CurrentlySet True) + +toMetaValue' :: CurrentlySet -> String -> MetaValue +toMetaValue' = MetaValue + +fromMetaField :: MetaField -> String +fromMetaField (MetaField f) = f + +fromMetaValue :: MetaValue -> String +fromMetaValue (MetaValue _ f) = f + +newMetaData :: MetaData +newMetaData = MetaData M.empty + +{- Can be used to set a value, or to unset it, depending on whether + - the MetaValue has CurrentlySet or not. -} +updateMetaData :: MetaField -> MetaValue -> MetaData -> MetaData +updateMetaData f v (MetaData m) = MetaData $ + M.insertWith' S.union f (S.singleton v) m + +{- Gets only currently set values -} +getCurrentMetaData :: MetaField -> MetaData -> S.Set MetaValue +getCurrentMetaData f m = S.filter isSet (getAllMetaData f m) + +isSet :: MetaValue -> Bool +isSet (MetaValue (CurrentlySet isset) _) = isset + +{- Gets currently set values, but also values that have been unset. -} +getAllMetaData :: MetaField -> MetaData -> S.Set MetaValue +getAllMetaData f (MetaData m) = fromMaybe S.empty (M.lookup f m) + +{- Avoid putting too many fields in the map; extremely large maps make + - the seriaization test slow due to the sheer amount of data. + - It's unlikely that more than 100 fields of metadata will be used. -} +instance Arbitrary MetaData where + arbitrary = do + size <- arbitrarySizedBoundedIntegral `suchThat` (< 500) + MetaData . M.fromList <$> vector size + +instance Arbitrary MetaValue where + arbitrary = MetaValue <$> arbitrary <*> arbitrary + +instance Arbitrary MetaField where + arbitrary = MetaField <$> arbitrary `suchThat` legalField + +prop_updateMetaData_sane :: MetaData -> MetaField -> MetaValue -> Bool +prop_updateMetaData_sane m f v = and + [ S.member v $ getAllMetaData f m' + , not (isSet v) || S.member v (getCurrentMetaData f m') + ] + where + m' = updateMetaData f v m + +prop_metadata_serialize :: MetaField -> MetaValue -> MetaData -> Bool +prop_metadata_serialize f v m = and + [ deserialize (serialize f) == Just f + , deserialize (serialize v) == Just v + , deserialize (serialize m') == Just m' + ] + where + m' = removeemptyfields m + removeemptyfields (MetaData x) = MetaData $ M.filter (not . S.null) x diff --git a/Utility/QuickCheck.hs b/Utility/QuickCheck.hs index 82af09f3dc..e2539f3d69 100644 --- a/Utility/QuickCheck.hs +++ b/Utility/QuickCheck.hs @@ -1,6 +1,6 @@ {- QuickCheck with additional instances - - - Copyright 2012 Joey Hess + - Copyright 2012-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -17,11 +17,15 @@ import Test.QuickCheck as X import Data.Time.Clock.POSIX import System.Posix.Types import qualified Data.Map as M +import qualified Data.Set as S import Control.Applicative instance (Arbitrary k, Arbitrary v, Eq k, Ord k) => Arbitrary (M.Map k v) where arbitrary = M.fromList <$> arbitrary +instance (Arbitrary v, Eq v, Ord v) => Arbitrary (S.Set v) where + arbitrary = S.fromList <$> arbitrary + {- Times before the epoch are excluded. -} instance Arbitrary POSIXTime where arbitrary = nonNegative arbitrarySizedIntegral diff --git a/doc/design/metadata.mdwn b/doc/design/metadata.mdwn index 6b8b5bdc0a..40c085cdc0 100644 --- a/doc/design/metadata.mdwn +++ b/doc/design/metadata.mdwn @@ -126,6 +126,39 @@ Note that any of these filenames can in theory conflict. May need to use `.variant-*` like sync does on conflict to allow 2 files with same name in same filtered branch. +## union merge properties + +While the storage could just list all the current values of a field on a +line with a timestamp, that's not good enough. Two disconnected +repositories can make changes to the values of a field (setting and +unsetting tags for example) and when this is union merged back together, +the changes need to be able to be replayed in order to determine which +values we end up with. + +To make that work, we log not only when a field is set to a value, +but when a value is unset as well. + +For example, here two different remotes added tags, and then later +a tag was removed: + + 1287290776.765152s tag +foo +bar + 1287290991.152124s tag +baz + 1291237510.141453s tag -bar + +The end result is that tags foo and baz are set. This can be simplified: + + 1291237510.141453s tag +foo +baz -bar + +Note the reuse of the most recent timestamp in the simplified version, +rather than putting in the timestamp when the simplification was done. +This ensures that is some other repo is making changes, they won't get +trampled over. For example: + + 1291237510.141453s tag +foo +baz -bar + 1291239999.000000s tag +bar -foo + +Now tags bar and baz are set. + # efficient metadata lookup Looking up metadata for filtering so far requires traversing all keys in