data types and serialization for metadata

A very haskell commit! Just data types, instances to serialize the metadata
to a nice format, and QuickCheck tests.

This commit was sponsored by Andreas Leha.
This commit is contained in:
Joey Hess 2014-02-12 17:54:28 -04:00
parent 655549af91
commit 1b79d18a40
4 changed files with 221 additions and 1 deletions

View file

@ -45,6 +45,7 @@ import qualified Logs.Remote
import qualified Logs.Unused import qualified Logs.Unused
import qualified Logs.Transfer import qualified Logs.Transfer
import qualified Logs.Presence import qualified Logs.Presence
import qualified Types.MetaData
import qualified Remote import qualified Remote
import qualified Types.Key import qualified Types.Key
import qualified Types.Messages import qualified Types.Messages
@ -144,6 +145,8 @@ properties = localOption (QuickCheckTests 1000) $ testGroup "QuickCheck"
, testProperty "prop_hashes_stable" Utility.Hash.prop_hashes_stable , testProperty "prop_hashes_stable" Utility.Hash.prop_hashes_stable
, testProperty "prop_schedule_roundtrips" Utility.Scheduled.prop_schedule_roundtrips , testProperty "prop_schedule_roundtrips" Utility.Scheduled.prop_schedule_roundtrips
, testProperty "prop_duration_roundtrips" Utility.HumanTime.prop_duration_roundtrips , testProperty "prop_duration_roundtrips" Utility.HumanTime.prop_duration_roundtrips
, testProperty "prop_updateMetaData_sane" Types.MetaData.prop_updateMetaData_sane
, testProperty "prop_metadata_serialize" Types.MetaData.prop_metadata_serialize
] ]
{- These tests set up the test environment, but also test some basic parts {- These tests set up the test environment, but also test some basic parts

180
Types/MetaData.hs Normal file
View file

@ -0,0 +1,180 @@
{- git-annex general metadata
-
- Copyright 2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module Types.MetaData (
MetaData,
MetaField,
MetaValue,
CurrentlySet(..),
MetaSerializable,
toMetaField,
fromMetaField,
toMetaValue,
toMetaValue',
fromMetaValue,
newMetaData,
updateMetaData,
getCurrentMetaData,
getAllMetaData,
serialize,
deserialize,
prop_updateMetaData_sane,
prop_metadata_serialize
) where
import Common
import Utility.Base64
import Utility.QuickCheck
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Char
newtype MetaData = MetaData (M.Map MetaField (S.Set MetaValue))
deriving (Show, Eq)
{- A metadata value can be currently be set (True), or may have been
- set before and we're remembering it no longer is (False). -}
newtype CurrentlySet = CurrentlySet Bool
deriving (Show, Eq, Ord, Arbitrary)
newtype MetaField = MetaField String
deriving (Show, Eq, Ord)
data MetaValue = MetaValue CurrentlySet String
deriving (Show, Ord)
{- Metadata values are compared equal whether currently set or not. -}
instance Eq MetaValue where
MetaValue _ a == MetaValue _ b = a == b
{- MetaData is serialized to a format like:
-
- field1 +val1 +val2 -val3 field2 +val4 +val5
-}
class MetaSerializable v where
serialize :: v -> String
deserialize :: String -> Maybe v
instance MetaSerializable MetaData where
serialize (MetaData m) = unwords $ concatMap go $ M.toList m
where
go (f, vs) = serialize f : map serialize (S.toList vs)
deserialize = Just . getfield newMetaData . words
where
getfield m [] = m
getfield m (w:ws) = maybe m (getvalues m ws) (deserialize w)
getvalues m [] _ = m
getvalues m l@(w:ws) f = case deserialize w of
Just v -> getvalues (updateMetaData f v m) ws f
Nothing -> getfield m l
instance MetaSerializable MetaField where
serialize (MetaField f) = f
deserialize = Just . MetaField
{- Base64 problimatic values. -}
instance MetaSerializable MetaValue where
serialize (MetaValue isset v) =
serialize isset ++
if any isSpace v || "!" `isPrefixOf` v
then '!' : toB64 v
else v
deserialize (isset:'!':v) = MetaValue
<$> deserialize [isset]
<*> fromB64Maybe v
deserialize (isset:v) = MetaValue
<$> deserialize [isset]
<*> pure v
deserialize [] = Nothing
instance MetaSerializable CurrentlySet where
serialize (CurrentlySet True) = "+"
serialize (CurrentlySet False) = "-"
deserialize "+" = Just (CurrentlySet True)
deserialize "-" = Just (CurrentlySet False)
deserialize _ = Nothing
{- Fields cannot be empty, contain whitespace, or start with "+-" as
- that would break the serialization. -}
toMetaField :: String -> Maybe MetaField
toMetaField f
| legalField f = Just $ MetaField f
| otherwise = Nothing
legalField :: String -> Bool
legalField f
| null f = False
| any isSpace f = False
| any (`isPrefixOf` f) ["+", "-"] = False
| otherwise = True
toMetaValue :: String -> MetaValue
toMetaValue = MetaValue (CurrentlySet True)
toMetaValue' :: CurrentlySet -> String -> MetaValue
toMetaValue' = MetaValue
fromMetaField :: MetaField -> String
fromMetaField (MetaField f) = f
fromMetaValue :: MetaValue -> String
fromMetaValue (MetaValue _ f) = f
newMetaData :: MetaData
newMetaData = MetaData M.empty
{- Can be used to set a value, or to unset it, depending on whether
- the MetaValue has CurrentlySet or not. -}
updateMetaData :: MetaField -> MetaValue -> MetaData -> MetaData
updateMetaData f v (MetaData m) = MetaData $
M.insertWith' S.union f (S.singleton v) m
{- Gets only currently set values -}
getCurrentMetaData :: MetaField -> MetaData -> S.Set MetaValue
getCurrentMetaData f m = S.filter isSet (getAllMetaData f m)
isSet :: MetaValue -> Bool
isSet (MetaValue (CurrentlySet isset) _) = isset
{- Gets currently set values, but also values that have been unset. -}
getAllMetaData :: MetaField -> MetaData -> S.Set MetaValue
getAllMetaData f (MetaData m) = fromMaybe S.empty (M.lookup f m)
{- Avoid putting too many fields in the map; extremely large maps make
- the seriaization test slow due to the sheer amount of data.
- It's unlikely that more than 100 fields of metadata will be used. -}
instance Arbitrary MetaData where
arbitrary = do
size <- arbitrarySizedBoundedIntegral `suchThat` (< 500)
MetaData . M.fromList <$> vector size
instance Arbitrary MetaValue where
arbitrary = MetaValue <$> arbitrary <*> arbitrary
instance Arbitrary MetaField where
arbitrary = MetaField <$> arbitrary `suchThat` legalField
prop_updateMetaData_sane :: MetaData -> MetaField -> MetaValue -> Bool
prop_updateMetaData_sane m f v = and
[ S.member v $ getAllMetaData f m'
, not (isSet v) || S.member v (getCurrentMetaData f m')
]
where
m' = updateMetaData f v m
prop_metadata_serialize :: MetaField -> MetaValue -> MetaData -> Bool
prop_metadata_serialize f v m = and
[ deserialize (serialize f) == Just f
, deserialize (serialize v) == Just v
, deserialize (serialize m') == Just m'
]
where
m' = removeemptyfields m
removeemptyfields (MetaData x) = MetaData $ M.filter (not . S.null) x

View file

@ -1,6 +1,6 @@
{- QuickCheck with additional instances {- QuickCheck with additional instances
- -
- Copyright 2012 Joey Hess <joey@kitenet.net> - Copyright 2012-2014 Joey Hess <joey@kitenet.net>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -17,11 +17,15 @@ import Test.QuickCheck as X
import Data.Time.Clock.POSIX import Data.Time.Clock.POSIX
import System.Posix.Types import System.Posix.Types
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as S
import Control.Applicative import Control.Applicative
instance (Arbitrary k, Arbitrary v, Eq k, Ord k) => Arbitrary (M.Map k v) where instance (Arbitrary k, Arbitrary v, Eq k, Ord k) => Arbitrary (M.Map k v) where
arbitrary = M.fromList <$> arbitrary arbitrary = M.fromList <$> arbitrary
instance (Arbitrary v, Eq v, Ord v) => Arbitrary (S.Set v) where
arbitrary = S.fromList <$> arbitrary
{- Times before the epoch are excluded. -} {- Times before the epoch are excluded. -}
instance Arbitrary POSIXTime where instance Arbitrary POSIXTime where
arbitrary = nonNegative arbitrarySizedIntegral arbitrary = nonNegative arbitrarySizedIntegral

View file

@ -126,6 +126,39 @@ Note that any of these filenames can in theory conflict. May need to use
`.variant-*` like sync does on conflict to allow 2 files with same name in `.variant-*` like sync does on conflict to allow 2 files with same name in
same filtered branch. same filtered branch.
## union merge properties
While the storage could just list all the current values of a field on a
line with a timestamp, that's not good enough. Two disconnected
repositories can make changes to the values of a field (setting and
unsetting tags for example) and when this is union merged back together,
the changes need to be able to be replayed in order to determine which
values we end up with.
To make that work, we log not only when a field is set to a value,
but when a value is unset as well.
For example, here two different remotes added tags, and then later
a tag was removed:
1287290776.765152s tag +foo +bar
1287290991.152124s tag +baz
1291237510.141453s tag -bar
The end result is that tags foo and baz are set. This can be simplified:
1291237510.141453s tag +foo +baz -bar
Note the reuse of the most recent timestamp in the simplified version,
rather than putting in the timestamp when the simplification was done.
This ensures that is some other repo is making changes, they won't get
trampled over. For example:
1291237510.141453s tag +foo +baz -bar
1291239999.000000s tag +bar -foo
Now tags bar and baz are set.
# efficient metadata lookup # efficient metadata lookup
Looking up metadata for filtering so far requires traversing all keys in Looking up metadata for filtering so far requires traversing all keys in