generate more compact git-annex branch for imports

Especially from borg, where the content identifier logs
all end up being the same identical file!

But also, for other imports, the location tracking logs can,
in some cases, be identical files.

Bonus optimisation: Avoid looking up (and parsing when set)
GIT_ANNEX_VECTOR_CLOCK env var every time a log is written to.
Although the lookup does happen at startup even when no
log will be written now.
This commit is contained in:
Joey Hess 2020-12-23 15:21:33 -04:00
parent f8aadbfb9b
commit 6280af2901
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
26 changed files with 133 additions and 49 deletions

View file

@ -72,6 +72,8 @@ import Types.IndexFiles
import Types.CatFileHandles
import Types.RemoteConfig
import Types.TransferrerPool
import Types.VectorClock
import Annex.VectorClock.Utility
import qualified Database.Keys.Handle as Keys
import Utility.InodeCache
import Utility.Url
@ -121,6 +123,7 @@ data AnnexState = AnnexState
, fast :: Bool
, daemon :: Bool
, branchstate :: BranchState
, getvectorclock :: IO VectorClock
, repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle
@ -172,6 +175,7 @@ newState c r = do
sc <- newTMVarIO False
kh <- Keys.newDbHandle
tp <- newTransferrerPool
vc <- startVectorClock
return $ AnnexState
{ repo = r
, repoadjustment = return
@ -187,6 +191,7 @@ newState c r = do
, fast = False
, daemon = False
, branchstate = startBranchState
, getvectorclock = vc
, repoqueue = Nothing
, catfilehandles = catFileHandlesNonConcurrent
, hashobjecthandle = Nothing

View file

@ -38,6 +38,7 @@ import Annex.RemoteTrackingBranch
import Annex.HashObject
import Annex.Transfer
import Annex.CheckIgnore
import Annex.VectorClock
import Command
import Backend
import Types.Key
@ -342,8 +343,17 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db)
go False cidmap importing importablecontents db
(run (go False cidmap importing importablecontents db))
where
-- When not importing content, reuse the same vector
-- clock for all state that's recorded. This can save
-- a little bit of disk space. Individual file downloads
-- while downloading take too long for this optimisation
-- to be safe to do.
run a
| importcontent = a
| otherwise = reuseVectorClockWhile a
go oldversion cidmap importing (ImportableContents l h) db = do
largematcher <- largeFilesMatcher
jobs <- forM l $ \i ->

View file

@ -3,42 +3,54 @@
- We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp.
-
- Copyright 2017-2019 Joey Hess <id@joeyh.name>
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.VectorClock where
module Annex.VectorClock (
module Annex.VectorClock,
module Types.VectorClock,
) where
import Data.Time.Clock.POSIX
import Data.ByteString.Builder
import Control.Applicative
import Prelude
import Utility.Env
import Types.VectorClock
import Annex.Common
import qualified Annex
import Utility.TimeStamp
import Utility.QuickCheck
import Data.ByteString.Builder
import qualified Data.Attoparsec.ByteString.Lazy as A
-- | Some very old logs did not have any time stamp at all;
-- Unknown is used for those.
data VectorClock = Unknown | VectorClock POSIXTime
deriving (Eq, Ord, Show)
currentVectorClock :: Annex VectorClock
currentVectorClock = liftIO =<< Annex.getState Annex.getvectorclock
-- Unknown is oldest.
prop_VectorClock_sane :: Bool
prop_VectorClock_sane = Unknown < VectorClock 1
instance Arbitrary VectorClock where
arbitrary = VectorClock <$> arbitrary
currentVectorClock :: IO VectorClock
currentVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
-- Runs the action and uses the same vector clock throughout.
--
-- When the action modifies several files in the git-annex branch,
-- this can cause less space to be used, since the same vector clock
-- value is used, which can compress better.
--
-- However, this should not be used when running a long-duration action,
-- because the vector clock is based on the start of the action, and not on
-- the later points where it writes changes. For example, if this were
-- used across downloads of several files, the location log information
-- would have an earlier vector clock than necessary, which might cause it
-- to be disregarded in favor of other information that was collected
-- at an earlier point in time than when the transfers completted and the
-- log was written.
reuseVectorClockWhile :: Annex a -> Annex a
reuseVectorClockWhile = bracket setup cleanup . const
where
go Nothing = VectorClock <$> getPOSIXTime
go (Just s) = case parsePOSIXTime s of
Just t -> return (VectorClock t)
Nothing -> VectorClock <$> getPOSIXTime
setup = do
origget <- Annex.getState Annex.getvectorclock
vc <- liftIO origget
use (pure vc)
return origget
cleanup origget = use origget
use vc = Annex.changeState $ \s ->
s { Annex.getvectorclock = vc }
formatVectorClock :: VectorClock -> String
formatVectorClock Unknown = "0"

View file

@ -0,0 +1,23 @@
{- git-annex vector clock utilities
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.VectorClock.Utility where
import Data.Time.Clock.POSIX
import Types.VectorClock
import Utility.Env
import Utility.TimeStamp
startVectorClock :: IO (IO VectorClock)
startVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
where
go Nothing = timebased
go (Just s) = case parsePOSIXTime s of
Just t -> return (pure (VectorClock t))
Nothing -> timebased
timebased = return (VectorClock <$> getPOSIXTime)

View file

@ -35,7 +35,7 @@ seek = commandAction . start
start :: ForgetOptions -> CommandStart
start o = starting "forget" ai si $ do
c <- liftIO currentVectorClock
c <- currentVectorClock
let basets = addTransition c ForgetGitHistory noTransitions
let ts = if dropDead o
then addTransition c ForgetDeadRemotes basets

View file

@ -74,7 +74,7 @@ optParser desc = MetaDataOptions
seek :: MetaDataOptions -> CommandSeek
seek o = case batchOption o of
NoBatch -> do
c <- liftIO currentVectorClock
c <- currentVectorClock
let ww = WarnUnmatchLsFiles
let seeker = AnnexedFileSeeker
{ startAction = start c o
@ -188,7 +188,7 @@ startBatch (si, (i, (MetaData m))) = case i of
, keyOptions = Nothing
, batchOption = NoBatch
}
t <- liftIO currentVectorClock
t <- currentVectorClock
-- It would be bad if two batch mode changes used exactly
-- the same timestamp, since the order of adds and removals
-- of the same metadata value would then be indeterminate.

View file

@ -27,7 +27,7 @@ data Activity
recordActivity :: Activity -> UUID -> Annex ()
recordActivity act uuid = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change activityLog $
buildLogOld buildActivity
. changeLog c uuid (Right act)

View file

@ -35,7 +35,7 @@ import qualified Data.Map as M
chunksStored :: UUID -> Key -> ChunkMethod -> ChunkCount -> Annex ()
chunksStored u k chunkmethod chunkcount = do
c <- liftIO currentVectorClock
c <- currentVectorClock
config <- Annex.getGitConfig
Annex.Branch.change (chunkLogFile config k) $
buildLog . changeMapLog c (u, chunkmethod) chunkcount . parseLog

View file

@ -34,7 +34,7 @@ setGlobalConfig name new = do
setGlobalConfig' :: ConfigKey -> ConfigValue -> Annex ()
setGlobalConfig' name new = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change configLog $
buildGlobalConfig . changeMapLog c name new . parseGlobalConfig

View file

@ -30,7 +30,7 @@ import qualified Data.List.NonEmpty as NonEmpty
-- so ones that were recorded before are preserved.
recordContentIdentifier :: RemoteStateHandle -> ContentIdentifier -> Key -> Annex ()
recordContentIdentifier (RemoteStateHandle u) cid k = do
c <- liftIO currentVectorClock
c <- currentVectorClock
config <- Annex.getGitConfig
Annex.Branch.maybeChange (remoteContentIdentifierLogFile config k) $
addcid c . parseLog

View file

@ -25,7 +25,7 @@ import Logs.Difference.Pure
recordDifferences :: Differences -> UUID -> Annex ()
recordDifferences ds@(Differences {}) uuid = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change differenceLog $
buildLogOld byteString
. changeLog c uuid (encodeBS $ showDifferences ds)

View file

@ -92,7 +92,7 @@ getExport remoteuuid = nub . mapMaybe get . M.toList . simpleMap
-- This is called before any changes are made to the remote.
recordExportBeginning :: UUID -> Git.Ref -> Annex ()
recordExportBeginning remoteuuid newtree = do
c <- liftIO currentVectorClock
c <- currentVectorClock
u <- getUUID
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
old <- fromMaybe (Exported emptyTree [])
@ -127,7 +127,7 @@ recordExportTreeish t =
-- forward in lock-step.
recordExportUnderway :: UUID -> ExportChange -> Annex ()
recordExportUnderway remoteuuid ec = do
c <- liftIO currentVectorClock
c <- currentVectorClock
u <- getUUID
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
let exported = Exported (newTreeish ec) []

View file

@ -38,7 +38,7 @@ lookupGroups u = (fromMaybe S.empty . M.lookup u) . groupsByUUID <$> groupMap
groupChange :: UUID -> (S.Set Group -> S.Set Group) -> Annex ()
groupChange uuid@(UUID _) modifier = do
curr <- lookupGroups uuid
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change groupLog $
buildLogOld buildGroup . changeLog c uuid (modifier curr) . parseLogOld parseGroup

View file

@ -103,7 +103,7 @@ addMetaData = addMetaData' metaDataLogFile
addMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> Annex ()
addMetaData' getlogfile k metadata =
addMetaDataClocked' getlogfile k metadata =<< liftIO currentVectorClock
addMetaDataClocked' getlogfile k metadata =<< currentVectorClock
{- Reusing the same VectorClock when making changes to the metadata
- of multiple keys is a nice optimisation. The same metadata lines

View file

@ -25,7 +25,7 @@ newtype Fingerprint = Fingerprint String
recordFingerprint :: Fingerprint -> UUID -> Annex ()
recordFingerprint fp uuid = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change multicastLog $
buildLogOld buildFindgerPrint
. changeLog c uuid fp

View file

@ -30,7 +30,7 @@ requiredContentSet = setLog requiredContentLog
setLog :: RawFilePath -> UUID -> PreferredContentExpression -> Annex ()
setLog logfile uuid@(UUID _) val = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change logfile $
buildLogOld buildPreferredContentExpression
. changeLog c uuid val
@ -44,7 +44,7 @@ setLog _ NoUUID _ = error "unknown UUID; cannot modify"
{- Changes the preferred content configuration of a group. -}
groupPreferredContentSet :: Group -> PreferredContentExpression -> Annex ()
groupPreferredContentSet g val = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change groupPreferredContentLog $
buildGroupPreferredContent
. changeMapLog c g val

View file

@ -51,7 +51,7 @@ readLog = parseLog <$$> Annex.Branch.get
{- Generates a new LogLine with the current time. -}
logNow :: LogStatus -> LogInfo -> Annex LogLine
logNow s i = do
c <- liftIO currentVectorClock
c <- currentVectorClock
return $ LogLine c s i
{- Reads a log and returns only the info that is still in effect. -}

View file

@ -32,7 +32,7 @@ import qualified Data.Map as M
{- Adds or updates a remote's config in the log. -}
configSet :: UUID -> RemoteConfig -> Annex ()
configSet u cfg = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change remoteLog $
buildRemoteConfigLog
. changeLog c u (removeSameasInherited cfg)

View file

@ -26,7 +26,7 @@ type RemoteState = String
setRemoteState :: RemoteStateHandle -> Key -> RemoteState -> Annex ()
setRemoteState (RemoteStateHandle u) k s = do
c <- liftIO currentVectorClock
c <- currentVectorClock
config <- Annex.getGitConfig
Annex.Branch.change (remoteStateLogFile config k) $
buildRemoteState . changeLog c u s . parseRemoteState

View file

@ -32,7 +32,7 @@ import Logs.File
scheduleSet :: UUID -> [ScheduledActivity] -> Annex ()
scheduleSet uuid@(UUID _) activities = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change scheduleLog $
buildLogOld byteString
. changeLog c uuid (encodeBS val)

View file

@ -33,6 +33,6 @@ getLog = newestValue <$$> readLog
setLog :: (SingleValueSerializable v) => RawFilePath -> v -> Annex ()
setLog f v = do
c <- liftIO currentVectorClock
c <- currentVectorClock
let ent = LogEntry c v
Annex.Branch.change f $ \_old -> buildLog (S.singleton ent)

View file

@ -22,7 +22,7 @@ import Logs.Trust.Pure as X
{- Changes the trust level for a uuid in the trustLog. -}
trustSet :: UUID -> TrustLevel -> Annex ()
trustSet uuid@(UUID _) level = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change trustLog $
buildLogOld buildTrustLevel .
changeLog c uuid level .

View file

@ -31,7 +31,7 @@ import qualified Data.Attoparsec.ByteString.Lazy as A
{- Records a description for a uuid in the log. -}
describeUUID :: UUID -> UUIDDesc -> Annex ()
describeUUID uuid desc = do
c <- liftIO currentVectorClock
c <- currentVectorClock
Annex.Branch.change uuidLog $
buildLogOld buildUUIDDesc . changeLog c uuid desc . parseUUIDLog

29
Types/VectorClock.hs Normal file
View file

@ -0,0 +1,29 @@
{- git-annex vector clocks
-
- We don't have a way yet to keep true distributed vector clocks.
- The next best thing is a timestamp.
-
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.VectorClock where
import Data.Time.Clock.POSIX
import Control.Applicative
import Prelude
import Utility.QuickCheck
-- | Some very old logs did not have any time stamp at all;
-- Unknown is used for those.
data VectorClock = Unknown | VectorClock POSIXTime
deriving (Eq, Ord, Show)
-- Unknown is oldest.
prop_VectorClock_sane :: Bool
prop_VectorClock_sane = Unknown < VectorClock 1
instance Arbitrary VectorClock where
arbitrary = VectorClock <$> arbitrary

View file

@ -11,3 +11,6 @@ As a bonus, since borg uses the same content identifiers for all keys
(""), implementing this would make the content of the logs all identical,
and so avoid any overhead entirely. Well.. perhaps that's my actual
motivation. ;-) --[[Joey]]
> [[done]], though reuseVectorClockWhile could be used in other places
> perhaps. --[[Joey]]

View file

@ -670,6 +670,7 @@ Executable git-annex
Annex.UUID
Annex.Url
Annex.VectorClock
Annex.VectorClock.Utility
Annex.VariantFile
Annex.Version
Annex.View
@ -1035,6 +1036,7 @@ Executable git-annex
Types.TrustLevel
Types.UUID
Types.UrlContents
Types.VectorClock
Types.View
Types.WorkerPool
Upgrade