Merge branch 'vectorclock'

This commit is contained in:
Joey Hess 2021-08-04 12:39:54 -04:00
commit 9b9b5759b0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
20 changed files with 143 additions and 96 deletions

View file

@ -200,7 +200,7 @@ data AnnexState = AnnexState
, cachedgitenv :: Maybe (AltIndexFile, FilePath, [(String, String)])
, urloptions :: Maybe UrlOptions
, insmudgecleanfilter :: Bool
, getvectorclock :: IO VectorClock
, getvectorclock :: IO CandidateVectorClock
}
newAnnexState :: GitConfig -> Git.Repo -> IO AnnexState

View file

@ -1,9 +1,11 @@
{- git-annex vector clocks
-
- We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp.
- These are basically a timestamp. However, when logging a new
- value, if the old value has a vector clock that is the same or greater
- than the current vector clock, the old vector clock is incremented.
- This way, clock skew does not cause confusion.
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
- Copyright 2017-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -21,10 +23,12 @@ import Utility.TimeStamp
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString.Lazy as A
currentVectorClock :: Annex VectorClock
currentVectorClock :: Annex CandidateVectorClock
currentVectorClock = liftIO =<< Annex.getState Annex.getvectorclock
-- Runs the action and uses the same vector clock throughout.
-- Runs the action and uses the same vector clock throughout,
-- except when it's necessary to use a newer one due to a past value having
-- a newer vector clock.
--
-- When the action modifies several files in the git-annex branch,
-- this can cause less space to be used, since the same vector clock
@ -52,6 +56,19 @@ reuseVectorClockWhile = bracket setup cleanup . const
use vc = Annex.changeState $ \s ->
s { Annex.getvectorclock = vc }
-- Convert a candidate vector clock into the final one to use,
-- advancing it when necessary to get ahead of a previously used
-- vector clock.
--
-- When any previous vector clock is the same as or greater than the
-- candidate, the result is the greatest previous vector clock
-- incremented by 1. Otherwise (including when there are no previous
-- vector clocks, or all of them are Unknown) the candidate is used
-- unchanged.
advanceVectorClock :: CandidateVectorClock -> [VectorClock] -> VectorClock
advanceVectorClock (CandidateVectorClock c) prevs =
    -- Unknown sorts before every VectorClock, so only known clocks that
    -- are not older than the candidate can force an advance.
    case [v | VectorClock v <- prevs, v >= c] of
        [] -> VectorClock c
        -- maximum is safe here; this branch only matches a non-empty list.
        vs -> VectorClock (maximum vs + 1)
formatVectorClock :: VectorClock -> String
formatVectorClock Unknown = "0"
formatVectorClock (VectorClock t) = show t

View file

@ -13,11 +13,11 @@ import Types.VectorClock
import Utility.Env
import Utility.TimeStamp
startVectorClock :: IO (IO VectorClock)
startVectorClock :: IO (IO CandidateVectorClock)
startVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
where
go Nothing = timebased
go (Just s) = case parsePOSIXTime s of
Just t -> return (pure (VectorClock t))
Just t -> return (pure (CandidateVectorClock t))
Nothing -> timebased
timebased = return (VectorClock <$> getPOSIXTime)
timebased = return (CandidateVectorClock <$> getPOSIXTime)

View file

@ -1,5 +1,10 @@
git-annex (8.20210804) UNRELEASED; urgency=medium
* Deal with clock skew, both forwards and backwards, when logging
information to the git-annex branch.
* GIT_ANNEX_VECTOR_CLOCK can now be set to a fixed value (eg 1)
rather than needing to be advanced each time a new change is made.
* Misuse of GIT_ANNEX_VECTOR_CLOCK will no longer confuse git-annex.
* add: When adding a dotfile, avoid treating its name as an extension.
-- Joey Hess <id@joeyh.name> Tue, 03 Aug 2021 12:22:45 -0400

View file

@ -98,12 +98,12 @@ seek o = case batchOption o of
)
_ -> giveup "--batch is currently only supported in --json mode"
start :: VectorClock -> MetaDataOptions -> SeekInput -> RawFilePath -> Key -> CommandStart
start :: CandidateVectorClock -> MetaDataOptions -> SeekInput -> RawFilePath -> Key -> CommandStart
start c o si file k = startKeys c o (si, k, mkActionItem (k, afile))
where
afile = AssociatedFile (Just file)
startKeys :: VectorClock -> MetaDataOptions -> (SeekInput, Key, ActionItem) -> CommandStart
startKeys :: CandidateVectorClock -> MetaDataOptions -> (SeekInput, Key, ActionItem) -> CommandStart
startKeys c o (si, k, ai) = case getSet o of
Get f -> startingCustomOutput k $ do
l <- S.toList . currentMetaDataValues f <$> getCurrentMetaData k
@ -113,7 +113,7 @@ startKeys c o (si, k, ai) = case getSet o of
_ -> starting "metadata" ai si $
perform c o k
perform :: VectorClock -> MetaDataOptions -> Key -> CommandPerform
perform :: CandidateVectorClock -> MetaDataOptions -> Key -> CommandPerform
perform c o k = case getSet o of
Set ms -> do
oldm <- getCurrentMetaData k

View file

@ -97,9 +97,8 @@ recordExportUnderway remoteuuid ec = do
hereuuid <- getUUID
let ep = ExportParticipants { exportFrom = hereuuid, exportTo = remoteuuid }
let exported = mkExported (newTreeish ec) []
Annex.Branch.change
(Annex.Branch.RegardingUUID [remoteuuid, hereuuid])
exportLog $
let ru = Annex.Branch.RegardingUUID [remoteuuid, hereuuid]
Annex.Branch.change ru exportLog $
buildExportLog
. changeMapLog c ep exported
. M.mapWithKey (updateForExportChange remoteuuid ec c hereuuid)

View file

@ -23,6 +23,7 @@ module Logs.Export.Pure (
) where
import Annex.Common
import Annex.VectorClock
import qualified Git
import Logs.MapLog
@ -110,7 +111,9 @@ exportedParser = Exported <$> refparser <*> many refparser
-- This way, when multiple repositories are exporting to
-- the same special remote, there's no conflict as long as they move
-- forward in lock-step.
updateForExportChange :: UUID -> ExportChange -> VectorClock -> UUID -> ExportParticipants -> LogEntry Exported -> LogEntry Exported
updateForExportChange remoteuuid ec c hereuuid ep le@(LogEntry _ exported@(Exported { exportedTreeish = t }))
updateForExportChange :: UUID -> ExportChange -> CandidateVectorClock -> UUID -> ExportParticipants -> LogEntry Exported -> LogEntry Exported
updateForExportChange remoteuuid ec c hereuuid ep le@(LogEntry lc exported@(Exported { exportedTreeish = t }))
| hereuuid == exportFrom ep || remoteuuid /= exportTo ep || t `notElem` oldTreeish ec = le
| otherwise = LogEntry c (exported { exportedTreeish = newTreeish ec })
| otherwise = LogEntry c' (exported { exportedTreeish = newTreeish ec })
where
c' = advanceVectorClock c [lc]

View file

@ -8,7 +8,7 @@
- Repositories record their UUID and the date when they --get or --drop
- a value.
-
- Copyright 2010-2018 Joey Hess <id@joeyh.name>
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -61,14 +61,14 @@ logStatusAfter key a = ifM a
{- Log a change in the presence of a key's value in a repository. -}
logChange :: Key -> UUID -> LogStatus -> Annex ()
logChange = logChange' logNow
logChange' :: (LogStatus -> LogInfo -> Annex LogLine) -> Key -> UUID -> LogStatus -> Annex ()
logChange' mklog key u@(UUID _) s = do
logChange key u@(UUID _) s = do
config <- Annex.getGitConfig
maybeAddLog (Annex.Branch.RegardingUUID [u]) (locationLogFile config key)
=<< mklog s (LogInfo (fromUUID u))
logChange' _ _ NoUUID _ = noop
maybeAddLog
(Annex.Branch.RegardingUUID [u])
(locationLogFile config key)
s
(LogInfo (fromUUID u))
logChange _ NoUUID _ = noop
{- Returns a list of repository UUIDs that, according to the log, have
- the value of a key. -}
@ -107,6 +107,9 @@ checkDead key = do
-
- Changes all logged lines for the key, in any location, that are
- currently InfoMissing, to be InfoDead.
-
- The vector clock in the log is updated minimally, so that any
- other location log changes are guaranteed to overrule this.
-}
setDead :: Key -> Annex ()
setDead key = do
@ -117,18 +120,12 @@ setDead key = do
where
go logfile l =
let u = toUUID (fromLogInfo (info l))
in addLog (Annex.Branch.RegardingUUID [u]) logfile (setDead' l)
{- Note that the timestamp in the log is updated minimally, so that this
- can be overruled by other location log changes. -}
setDead' :: LogLine -> LogLine
setDead' l = l
{ status = InfoDead
, date = case date l of
VectorClock c -> VectorClock $
c + realToFrac (picosecondsToDiffTime 1)
Unknown -> Unknown
}
c = case date l of
VectorClock v -> CandidateVectorClock $
v + realToFrac (picosecondsToDiffTime 1)
Unknown -> CandidateVectorClock 0
in addLog' (Annex.Branch.RegardingUUID [u]) logfile InfoDead
(info l) c
data Unchecked a = Unchecked (Annex (Maybe a))

View file

@ -67,8 +67,12 @@ mapLogParser fieldparser valueparser = M.fromListWith best <$> parseLogLines go
A.endOfInput
return (f, LogEntry c v)
changeMapLog :: Ord f => VectorClock -> f -> v -> MapLog f v -> MapLog f v
changeMapLog c f v = M.insert f $ LogEntry c v
{- Inserts a value for a field, replacing any existing entry for it.
 - The vector clock stored is derived from the candidate and the clock
 - of the entry being replaced, when there is one. -}
changeMapLog :: Ord f => CandidateVectorClock -> f -> v -> MapLog f v -> MapLog f v
changeMapLog c f v m = M.insert f newentry m
  where
    newentry = LogEntry (advanceVectorClock c prevclocks) v
    -- At most one previous clock: that of the existing entry for f.
    prevclocks = maybe [] (\old -> [changed old]) (M.lookup f m)
{- Only add a LogEntry if it's newer than (or at least as new as) any
- existing LogEntry for a field. -}

View file

@ -105,22 +105,23 @@ addMetaData' :: Annex.Branch.RegardingUUID -> (GitConfig -> Key -> RawFilePath)
addMetaData' ru getlogfile k metadata =
addMetaDataClocked' ru getlogfile k metadata =<< currentVectorClock
{- Reusing the same VectorClock when making changes to the metadata
{- Reusing the same CandidateVectorClock when making changes to the metadata
- of multiple keys is a nice optimisation. The same metadata lines
- will tend to be generated across the different log files, and so
- git will be able to pack the data more efficiently. -}
addMetaDataClocked :: Key -> MetaData -> VectorClock -> Annex ()
addMetaDataClocked :: Key -> MetaData -> CandidateVectorClock -> Annex ()
addMetaDataClocked = addMetaDataClocked' (Annex.Branch.RegardingUUID []) metaDataLogFile
addMetaDataClocked' :: Annex.Branch.RegardingUUID -> (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> VectorClock -> Annex ()
addMetaDataClocked' :: Annex.Branch.RegardingUUID -> (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> CandidateVectorClock -> Annex ()
addMetaDataClocked' ru getlogfile k d@(MetaData m) c
| d == emptyMetaData = noop
| otherwise = do
config <- Annex.getGitConfig
Annex.Branch.change ru (getlogfile config k) $
buildLog . simplifyLog
. S.insert (LogEntry c metadata)
. parseLog
Annex.Branch.change ru (getlogfile config k) $ \b ->
let s = parseLog b
c' = advanceVectorClock c (map changed (S.toList s))
ent = LogEntry c' metadata
in buildLog $ simplifyLog $ S.insert ent s
where
metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m

View file

@ -6,7 +6,7 @@
- A line of the log will look like: "date N INFO"
- Where N=1 when the INFO is present, 0 otherwise.
-
- Copyright 2010-2014 Joey Hess <id@joeyh.name>
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -14,9 +14,9 @@
module Logs.Presence (
module X,
addLog,
addLog',
maybeAddLog,
readLog,
logNow,
currentLog,
currentLogInfo,
historicalLogInfo,
@ -28,32 +28,43 @@ import Annex.VectorClock
import qualified Annex.Branch
import Git.Types (RefDate)
{- Adds a LogLine to the log, removing any LogLines that are obsoleted by
- adding it. -}
addLog :: Annex.Branch.RegardingUUID -> RawFilePath -> LogLine -> Annex ()
addLog ru file line = Annex.Branch.change ru file $ \b ->
buildLog $ compactLog (line : parseLog b)
{- Adds to the log, using the current vector clock as the candidate,
 - and removing any LogLines that are obsoleted. -}
addLog :: Annex.Branch.RegardingUUID -> RawFilePath -> LogStatus -> LogInfo -> Annex ()
addLog ru file logstatus loginfo =
    currentVectorClock >>= addLog' ru file logstatus loginfo
{- Like addLog, but the candidate vector clock is supplied by the
 - caller. -}
addLog' :: Annex.Branch.RegardingUUID -> RawFilePath -> LogStatus -> LogInfo -> CandidateVectorClock -> Annex ()
addLog' ru file logstatus loginfo c = Annex.Branch.change ru file update
  where
    -- The new line is generated against the existing lines, then
    -- prepended before the log is compacted.
    update b = buildLog (compactLog (newline : existing))
      where
        existing = parseLog b
        newline = genLine logstatus loginfo c existing
{- When a LogLine already exists with the same status and info, but an
- older timestamp, that LogLine is preserved, rather than updating the log
- with a newer timestamp.
-}
maybeAddLog :: Annex.Branch.RegardingUUID -> RawFilePath -> LogLine -> Annex ()
maybeAddLog ru file line = Annex.Branch.maybeChange ru file $ \s -> do
m <- insertNewStatus line $ logMap $ parseLog s
return $ buildLog $ mapLog m
maybeAddLog :: Annex.Branch.RegardingUUID -> RawFilePath -> LogStatus -> LogInfo -> Annex ()
maybeAddLog ru file logstatus loginfo = do
    candidate <- currentVectorClock
    Annex.Branch.maybeChange ru file $ \b -> do
        let existing = parseLog b
        let newline = genLine logstatus loginfo candidate existing
        -- The log is only rebuilt when insertNewStatus yields a map.
        updated <- insertNewStatus newline (logMap existing)
        return (buildLog (mapLog updated))
{- Generates a LogLine for the status and info. Its vector clock is
 - derived from the candidate and the clocks of existing lines that
 - have the same info. -}
genLine :: LogStatus -> LogInfo -> CandidateVectorClock -> [LogLine] -> LogLine
genLine logstatus loginfo c old =
    LogLine (advanceVectorClock c sameinfo) logstatus loginfo
  where
    sameinfo = [date l | l <- old, info l == loginfo]
{- Reads a log file.
- Note that the LogLines returned may be in any order. -}
readLog :: RawFilePath -> Annex [LogLine]
readLog = parseLog <$$> Annex.Branch.get
{- Generates a new LogLine with the current time. -}
logNow :: LogStatus -> LogInfo -> Annex LogLine
logNow s i = do
c <- currentVectorClock
return $ LogLine c s i
{- Reads a log and returns only the info that is still in effect. -}
currentLogInfo :: RawFilePath -> Annex [LogInfo]
currentLogInfo file = map info <$> currentLog file

View file

@ -123,4 +123,5 @@ instance Arbitrary LogLine where
(\b -> C8.notElem '\n' b && C8.notElem '\r' b)
prop_parse_build_presence_log :: [LogLine] -> Bool
prop_parse_build_presence_log l = parseLog (toLazyByteString (buildLog l)) == l
prop_parse_build_presence_log l =
parseLog (toLazyByteString (buildLog l)) == l

View file

@ -1,3 +1,5 @@
{-# LANGUAGE ScopedTypeVariables #-}
{- git-annex single-value log
-
- This is used to store a value in a way that can be union merged.
@ -6,7 +8,7 @@
-
- The line with the newest timestamp wins.
-
- Copyright 2014 Joey Hess <id@joeyh.name>
- Copyright 2014-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -31,8 +33,10 @@ readLog = parseLog <$$> Annex.Branch.get
getLog :: (Ord v, SingleValueSerializable v) => RawFilePath -> Annex (Maybe v)
getLog = newestValue <$$> readLog
setLog :: (SingleValueSerializable v) => Annex.Branch.RegardingUUID -> RawFilePath -> v -> Annex ()
setLog :: (Ord v, SingleValueSerializable v) => Annex.Branch.RegardingUUID -> RawFilePath -> v -> Annex ()
setLog ru f v = do
c <- currentVectorClock
let ent = LogEntry c v
Annex.Branch.change ru f $ \_old -> buildLog (S.singleton ent)
Annex.Branch.change ru f $ \old ->
let oldcs = map changed ((parseLog' old) `asTypeOf` [ent])
ent = LogEntry (advanceVectorClock c oldcs) v
in buildLog (S.singleton ent)

View file

@ -1,6 +1,6 @@
{- git-annex single-value log, pure operations
-
- Copyright 2014-2019 Joey Hess <id@joeyh.name>
- Copyright 2014-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -33,13 +33,18 @@ buildLog :: (SingleValueSerializable v) => Log v -> Builder
buildLog = mconcat . map genline . S.toList
where
genline (LogEntry c v) =
buildVectorClock c <> sp <> byteString (serialize v) <> nl
buildVectorClock c
<> sp
<> byteString (serialize v)
<> nl
sp = charUtf8 ' '
nl = charUtf8 '\n'
parseLog :: (Ord v, SingleValueSerializable v) => L.ByteString -> Log v
parseLog = S.fromList . fromMaybe []
. A.maybeResult . A.parse (logParser <* A.endOfInput)
parseLog = S.fromList . parseLog'
parseLog' :: SingleValueSerializable v => L.ByteString -> [LogEntry v]
parseLog' = fromMaybe [] . A.maybeResult . A.parse (logParser <* A.endOfInput)
logParser :: SingleValueSerializable v => A.Parser [LogEntry v]
logParser = parseLogLines $ LogEntry

View file

@ -51,8 +51,12 @@ describeTransition ForgetDeadRemotes = "forget dead remotes"
noTransitions :: Transitions
noTransitions = S.empty
addTransition :: VectorClock -> Transition -> Transitions -> Transitions
addTransition c t = S.insert $ TransitionLine c (Right t)
-- Adds a transition to the set. Its vector clock is derived from the
-- candidate and the clocks of lines already recorded for the same
-- transition.
addTransition :: CandidateVectorClock -> Transition -> Transitions -> Transitions
addTransition c t s = S.insert added s
  where
    added = TransitionLine (advanceVectorClock c earlier) (Right t)
    earlier = [transitionStarted l | l <- S.toList s, transition l == Right t]
buildTransitions :: Transitions -> Builder
buildTransitions = mconcat . map genline . S.elems
@ -73,9 +77,6 @@ parseTransitionsStrictly source b =
then ts
else giveup $ "unknown transitions listed in " ++ source ++ "; upgrade git-annex!"
showTransitionLine :: TransitionLine -> String
showTransitionLine (TransitionLine c t) = unwords [show t, formatVectorClock c]
transitionLineParser :: A.Parser TransitionLine
transitionLineParser = do
t <- (parsetransition <$> A.takeByteString)

View file

@ -53,8 +53,10 @@ buildLogOld :: (v -> Builder) -> Log v -> Builder
buildLogOld builder = mconcat . map genline . M.toList
where
genline (u, LogEntry c@(VectorClock {}) v) =
buildUUID u <> sp <> builder v <> sp <>
byteString "timestamp=" <> buildVectorClock c <> nl
buildUUID u <> sp <> builder v <> sp
<> byteString "timestamp="
<> buildVectorClock c
<> nl
genline (u, LogEntry Unknown v) =
buildUUID u <> sp <> builder v <> nl
sp = charUtf8 ' '
@ -89,7 +91,7 @@ buildLogNew = buildMapLog buildUUID
parseLogNew :: A.Parser v -> L.ByteString -> Log v
parseLogNew = parseMapLog (toUUID <$> A.takeByteString)
changeLog :: VectorClock -> UUID -> v -> Log v -> Log v
changeLog :: CandidateVectorClock -> UUID -> v -> Log v -> Log v
changeLog = changeMapLog
addLog :: UUID -> LogEntry v -> Log v -> Log v

View file

@ -61,7 +61,7 @@ setUrlPresent key url = do
unless (url `elem` us) $ do
config <- Annex.getGitConfig
addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key)
=<< logNow InfoPresent (LogInfo (encodeBS url))
InfoPresent (LogInfo (encodeBS url))
-- If the url does not have an OtherDownloader, it must be present
-- in the web.
case snd (getDownloader url) of
@ -75,7 +75,7 @@ setUrlMissing key url = do
when (url `elem` us) $ do
config <- Annex.getGitConfig
addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key)
=<< logNow InfoMissing (LogInfo (encodeBS url))
InfoMissing (LogInfo (encodeBS url))
-- If the url was a web url and none of the remaining urls
-- for the key are web urls, the key must not be present
-- in the web.

View file

@ -1,9 +1,6 @@
{- git-annex vector clocks
-
- We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp.
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
- Copyright 2017-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -21,6 +18,11 @@ import Utility.QuickCheck
data VectorClock = Unknown | VectorClock POSIXTime
deriving (Eq, Ord, Show)
-- | A candidate value to use in a VectorClock. It may not be suitable
-- to use as-is when a previously used VectorClock is the same or
-- higher.
data CandidateVectorClock = CandidateVectorClock POSIXTime
-- Unknown is oldest.
prop_VectorClock_sane :: Bool
prop_VectorClock_sane = Unknown < VectorClock 1

View file

@ -103,10 +103,6 @@ Possible reasons to make changes:
performance. The disk usage change of this method has not yet been
quantified.
* Another reason to do it would be improving git-annex to use vector clocks,
instead of its current assumption that client's clocks are close enough to
accurate. This would presumably change the contents of the files.
* While not a sufficient reason on its own, the best practices for file
formats in the git-annex branch have evolved over time, and there are some
files that have unusual formats for historical reasons. Other files have

View file

@ -1908,14 +1908,13 @@ These environment variables are used by git-annex when set:
Normally git-annex timestamps lines in the log files committed to the
git-annex branch. Setting this environment variable to a number
will make git-annex use that rather than the current number of seconds
since the UNIX epoch. Note that decimal seconds are supported.
will make git-annex use that (or a larger number)
rather than the current number of seconds since the UNIX epoch.
Note that decimal seconds are supported.
This is only provided for advanced users who either have a better way to
tell which commit is current than the local clock, or who need to avoid
embedding timestamps for policy reasons. Misuse of this environment
variable can confuse git-annex's book-keeping, sometimes in ways that
`git annex fsck` is unable to repair.
embedding timestamps for policy reasons.
* Some special remotes use additional environment variables
for authentication etc. For example, `AWS_ACCESS_KEY_ID`