From 0a7c5a9982bc80a141db372c619638898929ec9c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 5 Sep 2018 13:20:10 -0400 Subject: [PATCH] dropdead per-remote metadata Had to refactor pure code into separate modules so it is accessible inside Annex.Branch.Transitions. This commit was sponsored by Peter on Patreon. --- Annex/Branch/Transitions.hs | 19 ++- Logs.hs | 4 +- Logs/MetaData.hs | 76 +----------- Logs/MetaData/Pure.hs | 111 ++++++++++++++++++ Logs/SingleValue.hs | 40 ++----- Logs/SingleValue/Pure.hs | 45 +++++++ Types/MetaData.hs | 7 ++ ...adata_needs_to_be_cleaned_in_dropdead.mdwn | 2 + git-annex.cabal | 2 + 9 files changed, 197 insertions(+), 109 deletions(-) create mode 100644 Logs/MetaData/Pure.hs create mode 100644 Logs/SingleValue/Pure.hs diff --git a/Annex/Branch/Transitions.hs b/Annex/Branch/Transitions.hs index 7b1f32b317..d682c8fd13 100644 --- a/Annex/Branch/Transitions.hs +++ b/Annex/Branch/Transitions.hs @@ -1,6 +1,6 @@ {- git-annex branch transitions - - - Copyright 2013 Joey Hess + - Copyright 2013-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -15,8 +15,10 @@ import Logs.Transitions import qualified Logs.UUIDBased as UUIDBased import qualified Logs.Presence.Pure as Presence import qualified Logs.Chunk.Pure as Chunk +import qualified Logs.MetaData.Pure as MetaData import Types.TrustLevel import Types.UUID +import Types.MetaData import qualified Data.Map as M import Data.Default @@ -49,16 +51,27 @@ dropDead f content trustmap = case getLogVariety f of in if null newlog then RemoveFile else ChangeFile $ Presence.showLog newlog + Just RemoteMetaDataLog -> + let newlog = dropDeadFromRemoteMetaDataLog trustmap $ MetaData.simplifyLog $ MetaData.parseLog content + in if null newlog + then RemoveFile + else ChangeFile $ MetaData.showLog newlog Just OtherLog -> PreserveFile Nothing -> PreserveFile dropDeadFromMapLog :: TrustMap -> (k -> UUID) -> M.Map k v -> M.Map k v -dropDeadFromMapLog trustmap getuuid = M.filterWithKey $ \k _v -> notDead trustmap getuuid k +dropDeadFromMapLog trustmap getuuid = + M.filterWithKey $ \k _v -> notDead trustmap getuuid k {- Presence logs can contain UUIDs or other values. Any line that matches - a dead uuid is dropped; any other values are passed through. -} dropDeadFromPresenceLog :: TrustMap -> [Presence.LogLine] -> [Presence.LogLine] -dropDeadFromPresenceLog trustmap = filter $ notDead trustmap (toUUID . Presence.info) +dropDeadFromPresenceLog trustmap = + filter $ notDead trustmap (toUUID . Presence.info) + +dropDeadFromRemoteMetaDataLog :: TrustMap -> MetaData.Log MetaData -> MetaData.Log MetaData +dropDeadFromRemoteMetaDataLog trustmap = + MetaData.filterOutEmpty . MetaData.filterRemoteMetaData (notDead trustmap id) notDead :: TrustMap -> (v -> UUID) -> v -> Bool notDead trustmap a v = M.findWithDefault def (a v) trustmap /= DeadTrusted diff --git a/Logs.hs b/Logs.hs index db865716b9..0af14eb26c 100644 --- a/Logs.hs +++ b/Logs.hs @@ -16,6 +16,7 @@ data LogVariety | NewUUIDBasedLog | ChunkLog Key | PresenceLog Key + | RemoteMetaDataLog | OtherLog deriving (Show) @@ -26,7 +27,8 @@ getLogVariety f | f `elem` topLevelUUIDBasedLogs = Just UUIDBasedLog | isRemoteStateLog f = Just NewUUIDBasedLog | isChunkLog f = ChunkLog <$> chunkLogFileKey f - | isMetaDataLog f || isRemoteMetaDataLog f || f `elem` otherLogs = Just OtherLog + | isRemoteMetaDataLog f = Just RemoteMetaDataLog + | isMetaDataLog f || f `elem` otherLogs = Just OtherLog | otherwise = PresenceLog <$> firstJust (presenceLogs f) {- All the uuid-based logs stored in the top of the git-annex branch. -} diff --git a/Logs/MetaData.hs b/Logs/MetaData.hs index 5527ea7604..09e429cd23 100644 --- a/Logs/MetaData.hs +++ b/Logs/MetaData.hs @@ -20,13 +20,9 @@ - and so foo currently has no value. - - - - Copyright 2014-2018 Joey Hess - - - Licensed under the GNU GPL version 3 or higher. -} -{-# OPTIONS_GHC -fno-warn-orphans #-} - module Logs.MetaData ( getCurrentMetaData, getCurrentRemoteMetaData, @@ -44,19 +40,12 @@ import Annex.VectorClock import qualified Annex.Branch import qualified Annex import Logs -import Logs.SingleValue import Logs.TimeStamp +import Logs.MetaData.Pure import qualified Data.Set as S import qualified Data.Map as M -instance SingleValueSerializable MetaData where - serialize = Types.MetaData.serialize - deserialize = Types.MetaData.deserialize - -logToCurrentMetaData :: [LogEntry MetaData] -> MetaData -logToCurrentMetaData = currentMetaData . combineMetaData . map value - {- Go through the log from oldest to newest, and combine it all - into a single MetaData representing the current state. - @@ -131,66 +120,6 @@ addRemoteMetaData :: Key -> RemoteMetaData -> Annex () addRemoteMetaData k m = do addMetaData' remoteMetaDataLogFile k (fromRemoteMetaData m) -{- Simplify a log, removing historical values that are no longer - - needed. - - - - This is not as simple as just making a single log line with the newest - - state of all metadata. Consider this case: - - - - We have: - - - - 100 foo +x bar +y - - 200 foo -x - - - - An unmerged remote has: - - - - 150 bar -y baz +w - - - - If what we have were simplified to "200 foo -x bar +y" then when the line - - from the remote became available, it would be older than the simplified - - line, and its change to bar would not take effect. That is wrong. - - - - Instead, simplify it to: - - - - 100 bar +y - - 200 foo -x - - - - (Note that this ends up with the same number of lines as the - - unsimplified version, so there's really no point in updating - - the log to this version. Doing so would only add data to git, - - with little benefit.) - - - - Now merging with the remote yields: - - - - 100 bar +y - - 150 bar -y baz +w - - 200 foo -x - - - - Simplifying again: - - - - 150 bar +z baz +w - - 200 foo -x - -} -simplifyLog :: Log MetaData -> Log MetaData -simplifyLog s = case sl of - (newest:rest) -> - let sl' = go [newest] (value newest) rest - in if length sl' < length sl - then S.fromList sl' - else s - _ -> s - where - sl = S.toDescList s - - go c _ [] = c - go c newer (l:ls) - | unique == emptyMetaData = go c newer ls - | otherwise = go (l { value = unique } : c) - (unionMetaData unique newer) ls - where - older = value l - unique = older `differenceMetaData` newer - getMetaDataLog :: Key -> Annex (Log MetaData) getMetaDataLog key = do config <- Annex.getGitConfig @@ -218,3 +147,6 @@ copyMetaData oldkey newkey Annex.Branch.change (metaDataLogFile config newkey) $ const $ showLog l return True + +readLog :: FilePath -> Annex (Log MetaData) +readLog = parseLog <$$> Annex.Branch.get diff --git a/Logs/MetaData/Pure.hs b/Logs/MetaData/Pure.hs new file mode 100644 index 0000000000..6cfdf19cd8 --- /dev/null +++ b/Logs/MetaData/Pure.hs @@ -0,0 +1,111 @@ +{- git-annex metadata log, pure operations + - + - Copyright 2014-2018 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +{-# OPTIONS_GHC -fno-warn-orphans #-} + +module Logs.MetaData.Pure ( + Log, + LogEntry(..), + parseLog, + showLog, + logToCurrentMetaData, + simplifyLog, + filterRemoteMetaData, + filterOutEmpty, +) where + +import Types.MetaData +import Logs.SingleValue.Pure +import Types.UUID + +import qualified Data.Set as S +import qualified Data.Map.Strict as M + +instance SingleValueSerializable MetaData where + serialize = Types.MetaData.serialize + deserialize = Types.MetaData.deserialize + +logToCurrentMetaData :: [LogEntry MetaData] -> MetaData +logToCurrentMetaData = currentMetaData . combineMetaData . map value + +{- Simplify a log, removing historical values that are no longer + - needed. + - + - This is not as simple as just making a single log line with the newest + - state of all metadata. Consider this case: + - + - We have: + - + - 100 foo +x bar +y + - 200 foo -x + - + - An unmerged remote has: + - + - 150 bar -y baz +w + - + - If what we have were simplified to "200 foo -x bar +y" then when the line + - from the remote became available, it would be older than the simplified + - line, and its change to bar would not take effect. That is wrong. + - + - Instead, simplify it to: + - + - 100 bar +y + - 200 foo -x + - + - (Note that this ends up with the same number of lines as the + - unsimplified version, so there's really no point in updating + - the log to this version. Doing so would only add data to git, + - with little benefit.) + - + - Now merging with the remote yields: + - + - 100 bar +y + - 150 bar -y baz +w + - 200 foo -x + - + - Simplifying again: + - + - 150 bar +z baz +w + - 200 foo -x + -} +simplifyLog :: Log MetaData -> Log MetaData +simplifyLog s = case sl of + (newest:rest) -> + let sl' = go [newest] (value newest) rest + in if length sl' < length sl + then S.fromList sl' + else s + _ -> s + where + sl = S.toDescList s + + go c _ [] = c + go c newer (l:ls) + | unique == emptyMetaData = go c newer ls + | otherwise = go (l { value = unique } : c) + (unionMetaData unique newer) ls + where + older = value l + unique = older `differenceMetaData` newer + +{- Filters per-remote metadata on the basis of UUID. + - + - Note that the LogEntry's clock is left the same, so this should not be + - used except for in a transition. + -} +filterRemoteMetaData :: (UUID -> Bool) -> Log MetaData -> Log MetaData +filterRemoteMetaData p = S.map go + where + go l@(LogEntry { value = MetaData m }) = + l { value = MetaData $ M.filterWithKey fil m } + fil f _v = case splitRemoteMetaDataField f of + Just (u, _) -> p u + Nothing -> True + +{- Filters out log lines that are empty. -} +filterOutEmpty :: Log MetaData -> Log MetaData +filterOutEmpty = S.filter $ \l -> value l /= emptyMetaData diff --git a/Logs/SingleValue.hs b/Logs/SingleValue.hs index 1a6181f56a..8e648a6289 100644 --- a/Logs/SingleValue.hs +++ b/Logs/SingleValue.hs @@ -11,46 +11,20 @@ - Licensed under the GNU GPL version 3 or higher. -} -module Logs.SingleValue where +module Logs.SingleValue ( + module Logs.SingleValue.Pure, + readLog, + getLog, + setLog, +) where import Annex.Common import qualified Annex.Branch -import Logs.Line +import Logs.SingleValue.Pure import Annex.VectorClock import qualified Data.Set as S -class SingleValueSerializable v where - serialize :: v -> String - deserialize :: String -> Maybe v - -data LogEntry v = LogEntry - { changed :: VectorClock - , value :: v - } deriving (Eq, Ord) - -type Log v = S.Set (LogEntry v) - -showLog :: (SingleValueSerializable v) => Log v -> String -showLog = unlines . map showline . S.toList - where - showline (LogEntry c v) = unwords [formatVectorClock c, serialize v] - -parseLog :: (Ord v, SingleValueSerializable v) => String -> Log v -parseLog = S.fromList . mapMaybe parse . splitLines - where - parse line = do - let (sc, s) = splitword line - c <- parseVectorClock sc - v <- deserialize s - Just (LogEntry c v) - splitword = separate (== ' ') - -newestValue :: Log v -> Maybe v -newestValue s - | S.null s = Nothing - | otherwise = Just (value $ S.findMax s) - readLog :: (Ord v, SingleValueSerializable v) => FilePath -> Annex (Log v) readLog = parseLog <$$> Annex.Branch.get diff --git a/Logs/SingleValue/Pure.hs b/Logs/SingleValue/Pure.hs new file mode 100644 index 0000000000..de3ceb14a3 --- /dev/null +++ b/Logs/SingleValue/Pure.hs @@ -0,0 +1,45 @@ +{- git-annex single-value log, pure operations + - + - Copyright 2014 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Logs.SingleValue.Pure where + +import Annex.Common +import Logs.Line +import Annex.VectorClock + +import qualified Data.Set as S + +class SingleValueSerializable v where + serialize :: v -> String + deserialize :: String -> Maybe v + +data LogEntry v = LogEntry + { changed :: VectorClock + , value :: v + } deriving (Eq, Ord) + +type Log v = S.Set (LogEntry v) + +showLog :: (SingleValueSerializable v) => Log v -> String +showLog = unlines . map showline . S.toList + where + showline (LogEntry c v) = unwords [formatVectorClock c, serialize v] + +parseLog :: (Ord v, SingleValueSerializable v) => String -> Log v +parseLog = S.fromList . mapMaybe parse . splitLines + where + parse line = do + let (sc, s) = splitword line + c <- parseVectorClock sc + v <- deserialize s + Just (LogEntry c v) + splitword = separate (== ' ') + +newestValue :: Log v -> Maybe v +newestValue s + | S.null s = Nothing + | otherwise = Just (value $ S.findMax s) diff --git a/Types/MetaData.hs b/Types/MetaData.hs index 8359ace350..95b7dbb78a 100644 --- a/Types/MetaData.hs +++ b/Types/MetaData.hs @@ -38,6 +38,7 @@ module Types.MetaData ( modMeta, RemoteMetaData(..), extractRemoteMetaData, + splitRemoteMetaDataField, fromRemoteMetaData, prop_metadata_sane, prop_metadata_serialize @@ -301,6 +302,12 @@ extractRemoteMetaData u (MetaData m) = RemoteMetaData u $ MetaData $ prefix = remoteMetaDataPrefix u prefixlen = length prefix +splitRemoteMetaDataField :: MetaField -> Maybe (UUID, MetaField) +splitRemoteMetaDataField (MetaField f) = do + let (su, sf) = separate (== ':') (CI.original f) + f' <- toMetaField sf + return $ (toUUID su, f') + remoteMetaDataPrefix :: UUID -> String remoteMetaDataPrefix u = fromUUID u ++ ":" diff --git a/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn b/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn index 28d67c5d92..394f2bafd7 100644 --- a/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn +++ b/doc/todo/per-remote_metadata_needs_to_be_cleaned_in_dropdead.mdwn @@ -1,2 +1,4 @@ The newly added per-remote metadata log files need to be scrubbed clean of dead remotes during a transition. --[[Joey]] + +> [[done]] --[[Joey]] diff --git a/git-annex.cabal b/git-annex.cabal index 16d504861d..06ef9cfd82 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -889,6 +889,7 @@ Executable git-annex Logs.Location Logs.MapLog Logs.MetaData + Logs.MetaData.Pure Logs.Multicast Logs.NumCopies Logs.PreferredContent @@ -899,6 +900,7 @@ Executable git-annex Logs.RemoteState Logs.Schedule Logs.SingleValue + Logs.SingleValue.Pure Logs.TimeStamp Logs.Transfer Logs.Transitions