generate more compact git-annex branch for imports

Especially from borg, where the content identifier logs
all end up being the same identical file!

But also, for other imports, the location tracking logs can,
in some cases, be identical files.

Bonus optimisation: Avoid looking up (and parsing when set)
GIT_ANNEX_VECTOR_CLOCK env var every time a log is written to.
Although the lookup does happen at startup even when no
log will be written now.
This commit is contained in:
Joey Hess 2020-12-23 15:21:33 -04:00
parent f8aadbfb9b
commit 6280af2901
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
26 changed files with 133 additions and 49 deletions

View file

@ -72,6 +72,8 @@ import Types.IndexFiles
import Types.CatFileHandles import Types.CatFileHandles
import Types.RemoteConfig import Types.RemoteConfig
import Types.TransferrerPool import Types.TransferrerPool
import Types.VectorClock
import Annex.VectorClock.Utility
import qualified Database.Keys.Handle as Keys import qualified Database.Keys.Handle as Keys
import Utility.InodeCache import Utility.InodeCache
import Utility.Url import Utility.Url
@ -121,6 +123,7 @@ data AnnexState = AnnexState
, fast :: Bool , fast :: Bool
, daemon :: Bool , daemon :: Bool
, branchstate :: BranchState , branchstate :: BranchState
, getvectorclock :: IO VectorClock
, repoqueue :: Maybe (Git.Queue.Queue Annex) , repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles , catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle , hashobjecthandle :: Maybe HashObjectHandle
@ -172,6 +175,7 @@ newState c r = do
sc <- newTMVarIO False sc <- newTMVarIO False
kh <- Keys.newDbHandle kh <- Keys.newDbHandle
tp <- newTransferrerPool tp <- newTransferrerPool
vc <- startVectorClock
return $ AnnexState return $ AnnexState
{ repo = r { repo = r
, repoadjustment = return , repoadjustment = return
@ -187,6 +191,7 @@ newState c r = do
, fast = False , fast = False
, daemon = False , daemon = False
, branchstate = startBranchState , branchstate = startBranchState
, getvectorclock = vc
, repoqueue = Nothing , repoqueue = Nothing
, catfilehandles = catFileHandlesNonConcurrent , catfilehandles = catFileHandlesNonConcurrent
, hashobjecthandle = Nothing , hashobjecthandle = Nothing

View file

@ -38,6 +38,7 @@ import Annex.RemoteTrackingBranch
import Annex.HashObject import Annex.HashObject
import Annex.Transfer import Annex.Transfer
import Annex.CheckIgnore import Annex.CheckIgnore
import Annex.VectorClock
import Command import Command
import Backend import Backend
import Types.Key import Types.Key
@ -342,8 +343,17 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
CIDDb.needsUpdateFromLog db CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db) >>= maybe noop (CIDDb.updateFromLog db)
go False cidmap importing importablecontents db (run (go False cidmap importing importablecontents db))
where where
-- When not importing content, reuse the same vector
-- clock for all state that's recorded. This can save
-- a little bit of disk space. Individual file downloads
-- while downloading take too long for this optimisation
-- to be safe to do.
run a
| importcontent = a
| otherwise = reuseVectorClockWhile a
go oldversion cidmap importing (ImportableContents l h) db = do go oldversion cidmap importing (ImportableContents l h) db = do
largematcher <- largeFilesMatcher largematcher <- largeFilesMatcher
jobs <- forM l $ \i -> jobs <- forM l $ \i ->

View file

@ -3,42 +3,54 @@
- We don't have a way yet to keep true distributed vector clocks. - We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp. - The next best thing is a timestamp.
- -
- Copyright 2017-2019 Joey Hess <id@joeyh.name> - Copyright 2017-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
module Annex.VectorClock where module Annex.VectorClock (
module Annex.VectorClock,
module Types.VectorClock,
) where
import Data.Time.Clock.POSIX import Types.VectorClock
import Data.ByteString.Builder import Annex.Common
import Control.Applicative import qualified Annex
import Prelude
import Utility.Env
import Utility.TimeStamp import Utility.TimeStamp
import Utility.QuickCheck
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString.Lazy as A import qualified Data.Attoparsec.ByteString.Lazy as A
-- | Some very old logs did not have any time stamp at all; currentVectorClock :: Annex VectorClock
-- Unknown is used for those. currentVectorClock = liftIO =<< Annex.getState Annex.getvectorclock
data VectorClock = Unknown | VectorClock POSIXTime
deriving (Eq, Ord, Show)
-- Unknown is oldest. -- Runs the action and uses the same vector clock throughout.
prop_VectorClock_sane :: Bool --
prop_VectorClock_sane = Unknown < VectorClock 1 -- When the action modifies several files in the git-annex branch,
-- this can cause less space to be used, since the same vector clock
instance Arbitrary VectorClock where -- value is used, which can compress better.
arbitrary = VectorClock <$> arbitrary --
-- However, this should not be used when running a long-duration action,
currentVectorClock :: IO VectorClock -- because the vector clock is based on the start of the action, and not on
currentVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK" -- the later points where it writes changes. For example, if this were
-- used across downloads of several files, the location log information
-- would have an earlier vector clock than necessary, which might cause it
-- to be disregarded in favor of other information that was collected
-- at an earlier point in time than when the transfers completted and the
-- log was written.
reuseVectorClockWhile :: Annex a -> Annex a
reuseVectorClockWhile = bracket setup cleanup . const
where where
go Nothing = VectorClock <$> getPOSIXTime setup = do
go (Just s) = case parsePOSIXTime s of origget <- Annex.getState Annex.getvectorclock
Just t -> return (VectorClock t) vc <- liftIO origget
Nothing -> VectorClock <$> getPOSIXTime use (pure vc)
return origget
cleanup origget = use origget
use vc = Annex.changeState $ \s ->
s { Annex.getvectorclock = vc }
formatVectorClock :: VectorClock -> String formatVectorClock :: VectorClock -> String
formatVectorClock Unknown = "0" formatVectorClock Unknown = "0"

View file

@ -0,0 +1,23 @@
{- git-annex vector clock utilities
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.VectorClock.Utility where
import Data.Time.Clock.POSIX
import Types.VectorClock
import Utility.Env
import Utility.TimeStamp
startVectorClock :: IO (IO VectorClock)
startVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
where
go Nothing = timebased
go (Just s) = case parsePOSIXTime s of
Just t -> return (pure (VectorClock t))
Nothing -> timebased
timebased = return (VectorClock <$> getPOSIXTime)

View file

@ -35,7 +35,7 @@ seek = commandAction . start
start :: ForgetOptions -> CommandStart start :: ForgetOptions -> CommandStart
start o = starting "forget" ai si $ do start o = starting "forget" ai si $ do
c <- liftIO currentVectorClock c <- currentVectorClock
let basets = addTransition c ForgetGitHistory noTransitions let basets = addTransition c ForgetGitHistory noTransitions
let ts = if dropDead o let ts = if dropDead o
then addTransition c ForgetDeadRemotes basets then addTransition c ForgetDeadRemotes basets

View file

@ -74,7 +74,7 @@ optParser desc = MetaDataOptions
seek :: MetaDataOptions -> CommandSeek seek :: MetaDataOptions -> CommandSeek
seek o = case batchOption o of seek o = case batchOption o of
NoBatch -> do NoBatch -> do
c <- liftIO currentVectorClock c <- currentVectorClock
let ww = WarnUnmatchLsFiles let ww = WarnUnmatchLsFiles
let seeker = AnnexedFileSeeker let seeker = AnnexedFileSeeker
{ startAction = start c o { startAction = start c o
@ -188,7 +188,7 @@ startBatch (si, (i, (MetaData m))) = case i of
, keyOptions = Nothing , keyOptions = Nothing
, batchOption = NoBatch , batchOption = NoBatch
} }
t <- liftIO currentVectorClock t <- currentVectorClock
-- It would be bad if two batch mode changes used exactly -- It would be bad if two batch mode changes used exactly
-- the same timestamp, since the order of adds and removals -- the same timestamp, since the order of adds and removals
-- of the same metadata value would then be indeterminate. -- of the same metadata value would then be indeterminate.

View file

@ -27,7 +27,7 @@ data Activity
recordActivity :: Activity -> UUID -> Annex () recordActivity :: Activity -> UUID -> Annex ()
recordActivity act uuid = do recordActivity act uuid = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change activityLog $ Annex.Branch.change activityLog $
buildLogOld buildActivity buildLogOld buildActivity
. changeLog c uuid (Right act) . changeLog c uuid (Right act)

View file

@ -35,7 +35,7 @@ import qualified Data.Map as M
chunksStored :: UUID -> Key -> ChunkMethod -> ChunkCount -> Annex () chunksStored :: UUID -> Key -> ChunkMethod -> ChunkCount -> Annex ()
chunksStored u k chunkmethod chunkcount = do chunksStored u k chunkmethod chunkcount = do
c <- liftIO currentVectorClock c <- currentVectorClock
config <- Annex.getGitConfig config <- Annex.getGitConfig
Annex.Branch.change (chunkLogFile config k) $ Annex.Branch.change (chunkLogFile config k) $
buildLog . changeMapLog c (u, chunkmethod) chunkcount . parseLog buildLog . changeMapLog c (u, chunkmethod) chunkcount . parseLog

View file

@ -34,7 +34,7 @@ setGlobalConfig name new = do
setGlobalConfig' :: ConfigKey -> ConfigValue -> Annex () setGlobalConfig' :: ConfigKey -> ConfigValue -> Annex ()
setGlobalConfig' name new = do setGlobalConfig' name new = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change configLog $ Annex.Branch.change configLog $
buildGlobalConfig . changeMapLog c name new . parseGlobalConfig buildGlobalConfig . changeMapLog c name new . parseGlobalConfig

View file

@ -30,7 +30,7 @@ import qualified Data.List.NonEmpty as NonEmpty
-- so ones that were recorded before are preserved. -- so ones that were recorded before are preserved.
recordContentIdentifier :: RemoteStateHandle -> ContentIdentifier -> Key -> Annex () recordContentIdentifier :: RemoteStateHandle -> ContentIdentifier -> Key -> Annex ()
recordContentIdentifier (RemoteStateHandle u) cid k = do recordContentIdentifier (RemoteStateHandle u) cid k = do
c <- liftIO currentVectorClock c <- currentVectorClock
config <- Annex.getGitConfig config <- Annex.getGitConfig
Annex.Branch.maybeChange (remoteContentIdentifierLogFile config k) $ Annex.Branch.maybeChange (remoteContentIdentifierLogFile config k) $
addcid c . parseLog addcid c . parseLog

View file

@ -25,7 +25,7 @@ import Logs.Difference.Pure
recordDifferences :: Differences -> UUID -> Annex () recordDifferences :: Differences -> UUID -> Annex ()
recordDifferences ds@(Differences {}) uuid = do recordDifferences ds@(Differences {}) uuid = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change differenceLog $ Annex.Branch.change differenceLog $
buildLogOld byteString buildLogOld byteString
. changeLog c uuid (encodeBS $ showDifferences ds) . changeLog c uuid (encodeBS $ showDifferences ds)

View file

@ -92,7 +92,7 @@ getExport remoteuuid = nub . mapMaybe get . M.toList . simpleMap
-- This is called before any changes are made to the remote. -- This is called before any changes are made to the remote.
recordExportBeginning :: UUID -> Git.Ref -> Annex () recordExportBeginning :: UUID -> Git.Ref -> Annex ()
recordExportBeginning remoteuuid newtree = do recordExportBeginning remoteuuid newtree = do
c <- liftIO currentVectorClock c <- currentVectorClock
u <- getUUID u <- getUUID
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid } let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
old <- fromMaybe (Exported emptyTree []) old <- fromMaybe (Exported emptyTree [])
@ -127,7 +127,7 @@ recordExportTreeish t =
-- forward in lock-step. -- forward in lock-step.
recordExportUnderway :: UUID -> ExportChange -> Annex () recordExportUnderway :: UUID -> ExportChange -> Annex ()
recordExportUnderway remoteuuid ec = do recordExportUnderway remoteuuid ec = do
c <- liftIO currentVectorClock c <- currentVectorClock
u <- getUUID u <- getUUID
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid } let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
let exported = Exported (newTreeish ec) [] let exported = Exported (newTreeish ec) []

View file

@ -38,7 +38,7 @@ lookupGroups u = (fromMaybe S.empty . M.lookup u) . groupsByUUID <$> groupMap
groupChange :: UUID -> (S.Set Group -> S.Set Group) -> Annex () groupChange :: UUID -> (S.Set Group -> S.Set Group) -> Annex ()
groupChange uuid@(UUID _) modifier = do groupChange uuid@(UUID _) modifier = do
curr <- lookupGroups uuid curr <- lookupGroups uuid
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change groupLog $ Annex.Branch.change groupLog $
buildLogOld buildGroup . changeLog c uuid (modifier curr) . parseLogOld parseGroup buildLogOld buildGroup . changeLog c uuid (modifier curr) . parseLogOld parseGroup

View file

@ -103,7 +103,7 @@ addMetaData = addMetaData' metaDataLogFile
addMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> Annex () addMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> Annex ()
addMetaData' getlogfile k metadata = addMetaData' getlogfile k metadata =
addMetaDataClocked' getlogfile k metadata =<< liftIO currentVectorClock addMetaDataClocked' getlogfile k metadata =<< currentVectorClock
{- Reusing the same VectorClock when making changes to the metadata {- Reusing the same VectorClock when making changes to the metadata
- of multiple keys is a nice optimisation. The same metadata lines - of multiple keys is a nice optimisation. The same metadata lines

View file

@ -25,7 +25,7 @@ newtype Fingerprint = Fingerprint String
recordFingerprint :: Fingerprint -> UUID -> Annex () recordFingerprint :: Fingerprint -> UUID -> Annex ()
recordFingerprint fp uuid = do recordFingerprint fp uuid = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change multicastLog $ Annex.Branch.change multicastLog $
buildLogOld buildFindgerPrint buildLogOld buildFindgerPrint
. changeLog c uuid fp . changeLog c uuid fp

View file

@ -30,7 +30,7 @@ requiredContentSet = setLog requiredContentLog
setLog :: RawFilePath -> UUID -> PreferredContentExpression -> Annex () setLog :: RawFilePath -> UUID -> PreferredContentExpression -> Annex ()
setLog logfile uuid@(UUID _) val = do setLog logfile uuid@(UUID _) val = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change logfile $ Annex.Branch.change logfile $
buildLogOld buildPreferredContentExpression buildLogOld buildPreferredContentExpression
. changeLog c uuid val . changeLog c uuid val
@ -44,7 +44,7 @@ setLog _ NoUUID _ = error "unknown UUID; cannot modify"
{- Changes the preferred content configuration of a group. -} {- Changes the preferred content configuration of a group. -}
groupPreferredContentSet :: Group -> PreferredContentExpression -> Annex () groupPreferredContentSet :: Group -> PreferredContentExpression -> Annex ()
groupPreferredContentSet g val = do groupPreferredContentSet g val = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change groupPreferredContentLog $ Annex.Branch.change groupPreferredContentLog $
buildGroupPreferredContent buildGroupPreferredContent
. changeMapLog c g val . changeMapLog c g val

View file

@ -51,7 +51,7 @@ readLog = parseLog <$$> Annex.Branch.get
{- Generates a new LogLine with the current time. -} {- Generates a new LogLine with the current time. -}
logNow :: LogStatus -> LogInfo -> Annex LogLine logNow :: LogStatus -> LogInfo -> Annex LogLine
logNow s i = do logNow s i = do
c <- liftIO currentVectorClock c <- currentVectorClock
return $ LogLine c s i return $ LogLine c s i
{- Reads a log and returns only the info that is still in effect. -} {- Reads a log and returns only the info that is still in effect. -}

View file

@ -32,7 +32,7 @@ import qualified Data.Map as M
{- Adds or updates a remote's config in the log. -} {- Adds or updates a remote's config in the log. -}
configSet :: UUID -> RemoteConfig -> Annex () configSet :: UUID -> RemoteConfig -> Annex ()
configSet u cfg = do configSet u cfg = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change remoteLog $ Annex.Branch.change remoteLog $
buildRemoteConfigLog buildRemoteConfigLog
. changeLog c u (removeSameasInherited cfg) . changeLog c u (removeSameasInherited cfg)

View file

@ -26,7 +26,7 @@ type RemoteState = String
setRemoteState :: RemoteStateHandle -> Key -> RemoteState -> Annex () setRemoteState :: RemoteStateHandle -> Key -> RemoteState -> Annex ()
setRemoteState (RemoteStateHandle u) k s = do setRemoteState (RemoteStateHandle u) k s = do
c <- liftIO currentVectorClock c <- currentVectorClock
config <- Annex.getGitConfig config <- Annex.getGitConfig
Annex.Branch.change (remoteStateLogFile config k) $ Annex.Branch.change (remoteStateLogFile config k) $
buildRemoteState . changeLog c u s . parseRemoteState buildRemoteState . changeLog c u s . parseRemoteState

View file

@ -32,7 +32,7 @@ import Logs.File
scheduleSet :: UUID -> [ScheduledActivity] -> Annex () scheduleSet :: UUID -> [ScheduledActivity] -> Annex ()
scheduleSet uuid@(UUID _) activities = do scheduleSet uuid@(UUID _) activities = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change scheduleLog $ Annex.Branch.change scheduleLog $
buildLogOld byteString buildLogOld byteString
. changeLog c uuid (encodeBS val) . changeLog c uuid (encodeBS val)

View file

@ -33,6 +33,6 @@ getLog = newestValue <$$> readLog
setLog :: (SingleValueSerializable v) => RawFilePath -> v -> Annex () setLog :: (SingleValueSerializable v) => RawFilePath -> v -> Annex ()
setLog f v = do setLog f v = do
c <- liftIO currentVectorClock c <- currentVectorClock
let ent = LogEntry c v let ent = LogEntry c v
Annex.Branch.change f $ \_old -> buildLog (S.singleton ent) Annex.Branch.change f $ \_old -> buildLog (S.singleton ent)

View file

@ -22,7 +22,7 @@ import Logs.Trust.Pure as X
{- Changes the trust level for a uuid in the trustLog. -} {- Changes the trust level for a uuid in the trustLog. -}
trustSet :: UUID -> TrustLevel -> Annex () trustSet :: UUID -> TrustLevel -> Annex ()
trustSet uuid@(UUID _) level = do trustSet uuid@(UUID _) level = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change trustLog $ Annex.Branch.change trustLog $
buildLogOld buildTrustLevel . buildLogOld buildTrustLevel .
changeLog c uuid level . changeLog c uuid level .

View file

@ -31,7 +31,7 @@ import qualified Data.Attoparsec.ByteString.Lazy as A
{- Records a description for a uuid in the log. -} {- Records a description for a uuid in the log. -}
describeUUID :: UUID -> UUIDDesc -> Annex () describeUUID :: UUID -> UUIDDesc -> Annex ()
describeUUID uuid desc = do describeUUID uuid desc = do
c <- liftIO currentVectorClock c <- currentVectorClock
Annex.Branch.change uuidLog $ Annex.Branch.change uuidLog $
buildLogOld buildUUIDDesc . changeLog c uuid desc . parseUUIDLog buildLogOld buildUUIDDesc . changeLog c uuid desc . parseUUIDLog

29
Types/VectorClock.hs Normal file
View file

@ -0,0 +1,29 @@
{- git-annex vector clocks
-
- We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp.
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.VectorClock where
import Data.Time.Clock.POSIX
import Control.Applicative
import Prelude
import Utility.QuickCheck
-- | Some very old logs did not have any time stamp at all;
-- Unknown is used for those.
data VectorClock = Unknown | VectorClock POSIXTime
deriving (Eq, Ord, Show)
-- Unknown is oldest.
prop_VectorClock_sane :: Bool
prop_VectorClock_sane = Unknown < VectorClock 1
instance Arbitrary VectorClock where
arbitrary = VectorClock <$> arbitrary

View file

@ -11,3 +11,6 @@ As a bonus, since borg uses the same content identifiers for all keys
(""), implementing this would make the content of the logs all identical, (""), implementing this would make the content of the logs all identical,
and so avoid any overhead entirely. Well.. perhaps that's my actual and so avoid any overhead entirely. Well.. perhaps that's my actual
motivation. ;-) --[[Joey]] motivation. ;-) --[[Joey]]
> [[done]], though reuseVectorClockWhile could be used in other places
> perhaps. --[[Joey]]

View file

@ -670,6 +670,7 @@ Executable git-annex
Annex.UUID Annex.UUID
Annex.Url Annex.Url
Annex.VectorClock Annex.VectorClock
Annex.VectorClock.Utility
Annex.VariantFile Annex.VariantFile
Annex.Version Annex.Version
Annex.View Annex.View
@ -1035,6 +1036,7 @@ Executable git-annex
Types.TrustLevel Types.TrustLevel
Types.UUID Types.UUID
Types.UrlContents Types.UrlContents
Types.VectorClock
Types.View Types.View
Types.WorkerPool Types.WorkerPool
Upgrade Upgrade