generate more compact git-annex branch for imports
Especially from borg, where the content identifier logs all end up being the same identical file! But also, for other imports, the location tracking logs can, in some cases, be identical files. Bonus optimisation: Avoid looking up (and parsing when set) GIT_ANNEX_VECTOR_CLOCK env var every time a log is written to. Although the lookup does happen at startup even when no log will be written now.
This commit is contained in:
parent
f8aadbfb9b
commit
6280af2901
26 changed files with 133 additions and 49 deletions
5
Annex.hs
5
Annex.hs
|
@ -72,6 +72,8 @@ import Types.IndexFiles
|
|||
import Types.CatFileHandles
|
||||
import Types.RemoteConfig
|
||||
import Types.TransferrerPool
|
||||
import Types.VectorClock
|
||||
import Annex.VectorClock.Utility
|
||||
import qualified Database.Keys.Handle as Keys
|
||||
import Utility.InodeCache
|
||||
import Utility.Url
|
||||
|
@ -121,6 +123,7 @@ data AnnexState = AnnexState
|
|||
, fast :: Bool
|
||||
, daemon :: Bool
|
||||
, branchstate :: BranchState
|
||||
, getvectorclock :: IO VectorClock
|
||||
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
||||
, catfilehandles :: CatFileHandles
|
||||
, hashobjecthandle :: Maybe HashObjectHandle
|
||||
|
@ -172,6 +175,7 @@ newState c r = do
|
|||
sc <- newTMVarIO False
|
||||
kh <- Keys.newDbHandle
|
||||
tp <- newTransferrerPool
|
||||
vc <- startVectorClock
|
||||
return $ AnnexState
|
||||
{ repo = r
|
||||
, repoadjustment = return
|
||||
|
@ -187,6 +191,7 @@ newState c r = do
|
|||
, fast = False
|
||||
, daemon = False
|
||||
, branchstate = startBranchState
|
||||
, getvectorclock = vc
|
||||
, repoqueue = Nothing
|
||||
, catfilehandles = catFileHandlesNonConcurrent
|
||||
, hashobjecthandle = Nothing
|
||||
|
|
|
@ -38,6 +38,7 @@ import Annex.RemoteTrackingBranch
|
|||
import Annex.HashObject
|
||||
import Annex.Transfer
|
||||
import Annex.CheckIgnore
|
||||
import Annex.VectorClock
|
||||
import Command
|
||||
import Backend
|
||||
import Types.Key
|
||||
|
@ -342,8 +343,17 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
|
|||
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
|
||||
CIDDb.needsUpdateFromLog db
|
||||
>>= maybe noop (CIDDb.updateFromLog db)
|
||||
go False cidmap importing importablecontents db
|
||||
(run (go False cidmap importing importablecontents db))
|
||||
where
|
||||
-- When not importing content, reuse the same vector
|
||||
-- clock for all state that's recorded. This can save
|
||||
-- a little bit of disk space. Individual file downloads
|
||||
-- while downloading take too long for this optimisation
|
||||
-- to be safe to do.
|
||||
run a
|
||||
| importcontent = a
|
||||
| otherwise = reuseVectorClockWhile a
|
||||
|
||||
go oldversion cidmap importing (ImportableContents l h) db = do
|
||||
largematcher <- largeFilesMatcher
|
||||
jobs <- forM l $ \i ->
|
||||
|
|
|
@ -3,42 +3,54 @@
|
|||
- We don't have a way yet to keep true distributed vector clocks.
|
||||
- The next best thing is a timestamp.
|
||||
-
|
||||
- Copyright 2017-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Annex.VectorClock where
|
||||
module Annex.VectorClock (
|
||||
module Annex.VectorClock,
|
||||
module Types.VectorClock,
|
||||
) where
|
||||
|
||||
import Data.Time.Clock.POSIX
|
||||
import Data.ByteString.Builder
|
||||
import Control.Applicative
|
||||
import Prelude
|
||||
|
||||
import Utility.Env
|
||||
import Types.VectorClock
|
||||
import Annex.Common
|
||||
import qualified Annex
|
||||
import Utility.TimeStamp
|
||||
import Utility.QuickCheck
|
||||
|
||||
import Data.ByteString.Builder
|
||||
import qualified Data.Attoparsec.ByteString.Lazy as A
|
||||
|
||||
-- | Some very old logs did not have any time stamp at all;
|
||||
-- Unknown is used for those.
|
||||
data VectorClock = Unknown | VectorClock POSIXTime
|
||||
deriving (Eq, Ord, Show)
|
||||
currentVectorClock :: Annex VectorClock
|
||||
currentVectorClock = liftIO =<< Annex.getState Annex.getvectorclock
|
||||
|
||||
-- Unknown is oldest.
|
||||
prop_VectorClock_sane :: Bool
|
||||
prop_VectorClock_sane = Unknown < VectorClock 1
|
||||
|
||||
instance Arbitrary VectorClock where
|
||||
arbitrary = VectorClock <$> arbitrary
|
||||
|
||||
currentVectorClock :: IO VectorClock
|
||||
currentVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
|
||||
-- Runs the action and uses the same vector clock throughout.
|
||||
--
|
||||
-- When the action modifies several files in the git-annex branch,
|
||||
-- this can cause less space to be used, since the same vector clock
|
||||
-- value is used, which can compress better.
|
||||
--
|
||||
-- However, this should not be used when running a long-duration action,
|
||||
-- because the vector clock is based on the start of the action, and not on
|
||||
-- the later points where it writes changes. For example, if this were
|
||||
-- used across downloads of several files, the location log information
|
||||
-- would have an earlier vector clock than necessary, which might cause it
|
||||
-- to be disregarded in favor of other information that was collected
|
||||
-- at an earlier point in time than when the transfers completted and the
|
||||
-- log was written.
|
||||
reuseVectorClockWhile :: Annex a -> Annex a
|
||||
reuseVectorClockWhile = bracket setup cleanup . const
|
||||
where
|
||||
go Nothing = VectorClock <$> getPOSIXTime
|
||||
go (Just s) = case parsePOSIXTime s of
|
||||
Just t -> return (VectorClock t)
|
||||
Nothing -> VectorClock <$> getPOSIXTime
|
||||
setup = do
|
||||
origget <- Annex.getState Annex.getvectorclock
|
||||
vc <- liftIO origget
|
||||
use (pure vc)
|
||||
return origget
|
||||
|
||||
cleanup origget = use origget
|
||||
|
||||
use vc = Annex.changeState $ \s ->
|
||||
s { Annex.getvectorclock = vc }
|
||||
|
||||
formatVectorClock :: VectorClock -> String
|
||||
formatVectorClock Unknown = "0"
|
||||
|
|
23
Annex/VectorClock/Utility.hs
Normal file
23
Annex/VectorClock/Utility.hs
Normal file
|
@ -0,0 +1,23 @@
|
|||
{- git-annex vector clock utilities
|
||||
-
|
||||
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Annex.VectorClock.Utility where
|
||||
|
||||
import Data.Time.Clock.POSIX
|
||||
|
||||
import Types.VectorClock
|
||||
import Utility.Env
|
||||
import Utility.TimeStamp
|
||||
|
||||
startVectorClock :: IO (IO VectorClock)
|
||||
startVectorClock = go =<< getEnv "GIT_ANNEX_VECTOR_CLOCK"
|
||||
where
|
||||
go Nothing = timebased
|
||||
go (Just s) = case parsePOSIXTime s of
|
||||
Just t -> return (pure (VectorClock t))
|
||||
Nothing -> timebased
|
||||
timebased = return (VectorClock <$> getPOSIXTime)
|
|
@ -35,7 +35,7 @@ seek = commandAction . start
|
|||
|
||||
start :: ForgetOptions -> CommandStart
|
||||
start o = starting "forget" ai si $ do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
let basets = addTransition c ForgetGitHistory noTransitions
|
||||
let ts = if dropDead o
|
||||
then addTransition c ForgetDeadRemotes basets
|
||||
|
|
|
@ -74,7 +74,7 @@ optParser desc = MetaDataOptions
|
|||
seek :: MetaDataOptions -> CommandSeek
|
||||
seek o = case batchOption o of
|
||||
NoBatch -> do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
let ww = WarnUnmatchLsFiles
|
||||
let seeker = AnnexedFileSeeker
|
||||
{ startAction = start c o
|
||||
|
@ -188,7 +188,7 @@ startBatch (si, (i, (MetaData m))) = case i of
|
|||
, keyOptions = Nothing
|
||||
, batchOption = NoBatch
|
||||
}
|
||||
t <- liftIO currentVectorClock
|
||||
t <- currentVectorClock
|
||||
-- It would be bad if two batch mode changes used exactly
|
||||
-- the same timestamp, since the order of adds and removals
|
||||
-- of the same metadata value would then be indeterminate.
|
||||
|
|
|
@ -27,7 +27,7 @@ data Activity
|
|||
|
||||
recordActivity :: Activity -> UUID -> Annex ()
|
||||
recordActivity act uuid = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change activityLog $
|
||||
buildLogOld buildActivity
|
||||
. changeLog c uuid (Right act)
|
||||
|
|
|
@ -35,7 +35,7 @@ import qualified Data.Map as M
|
|||
|
||||
chunksStored :: UUID -> Key -> ChunkMethod -> ChunkCount -> Annex ()
|
||||
chunksStored u k chunkmethod chunkcount = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
config <- Annex.getGitConfig
|
||||
Annex.Branch.change (chunkLogFile config k) $
|
||||
buildLog . changeMapLog c (u, chunkmethod) chunkcount . parseLog
|
||||
|
|
|
@ -34,7 +34,7 @@ setGlobalConfig name new = do
|
|||
|
||||
setGlobalConfig' :: ConfigKey -> ConfigValue -> Annex ()
|
||||
setGlobalConfig' name new = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change configLog $
|
||||
buildGlobalConfig . changeMapLog c name new . parseGlobalConfig
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ import qualified Data.List.NonEmpty as NonEmpty
|
|||
-- so ones that were recorded before are preserved.
|
||||
recordContentIdentifier :: RemoteStateHandle -> ContentIdentifier -> Key -> Annex ()
|
||||
recordContentIdentifier (RemoteStateHandle u) cid k = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
config <- Annex.getGitConfig
|
||||
Annex.Branch.maybeChange (remoteContentIdentifierLogFile config k) $
|
||||
addcid c . parseLog
|
||||
|
|
|
@ -25,7 +25,7 @@ import Logs.Difference.Pure
|
|||
|
||||
recordDifferences :: Differences -> UUID -> Annex ()
|
||||
recordDifferences ds@(Differences {}) uuid = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change differenceLog $
|
||||
buildLogOld byteString
|
||||
. changeLog c uuid (encodeBS $ showDifferences ds)
|
||||
|
|
|
@ -92,7 +92,7 @@ getExport remoteuuid = nub . mapMaybe get . M.toList . simpleMap
|
|||
-- This is called before any changes are made to the remote.
|
||||
recordExportBeginning :: UUID -> Git.Ref -> Annex ()
|
||||
recordExportBeginning remoteuuid newtree = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
u <- getUUID
|
||||
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
|
||||
old <- fromMaybe (Exported emptyTree [])
|
||||
|
@ -127,7 +127,7 @@ recordExportTreeish t =
|
|||
-- forward in lock-step.
|
||||
recordExportUnderway :: UUID -> ExportChange -> Annex ()
|
||||
recordExportUnderway remoteuuid ec = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
u <- getUUID
|
||||
let ep = ExportParticipants { exportFrom = u, exportTo = remoteuuid }
|
||||
let exported = Exported (newTreeish ec) []
|
||||
|
|
|
@ -38,7 +38,7 @@ lookupGroups u = (fromMaybe S.empty . M.lookup u) . groupsByUUID <$> groupMap
|
|||
groupChange :: UUID -> (S.Set Group -> S.Set Group) -> Annex ()
|
||||
groupChange uuid@(UUID _) modifier = do
|
||||
curr <- lookupGroups uuid
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change groupLog $
|
||||
buildLogOld buildGroup . changeLog c uuid (modifier curr) . parseLogOld parseGroup
|
||||
|
||||
|
|
|
@ -103,7 +103,7 @@ addMetaData = addMetaData' metaDataLogFile
|
|||
|
||||
addMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> MetaData -> Annex ()
|
||||
addMetaData' getlogfile k metadata =
|
||||
addMetaDataClocked' getlogfile k metadata =<< liftIO currentVectorClock
|
||||
addMetaDataClocked' getlogfile k metadata =<< currentVectorClock
|
||||
|
||||
{- Reusing the same VectorClock when making changes to the metadata
|
||||
- of multiple keys is a nice optimisation. The same metadata lines
|
||||
|
|
|
@ -25,7 +25,7 @@ newtype Fingerprint = Fingerprint String
|
|||
|
||||
recordFingerprint :: Fingerprint -> UUID -> Annex ()
|
||||
recordFingerprint fp uuid = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change multicastLog $
|
||||
buildLogOld buildFindgerPrint
|
||||
. changeLog c uuid fp
|
||||
|
|
|
@ -30,7 +30,7 @@ requiredContentSet = setLog requiredContentLog
|
|||
|
||||
setLog :: RawFilePath -> UUID -> PreferredContentExpression -> Annex ()
|
||||
setLog logfile uuid@(UUID _) val = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change logfile $
|
||||
buildLogOld buildPreferredContentExpression
|
||||
. changeLog c uuid val
|
||||
|
@ -44,7 +44,7 @@ setLog _ NoUUID _ = error "unknown UUID; cannot modify"
|
|||
{- Changes the preferred content configuration of a group. -}
|
||||
groupPreferredContentSet :: Group -> PreferredContentExpression -> Annex ()
|
||||
groupPreferredContentSet g val = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change groupPreferredContentLog $
|
||||
buildGroupPreferredContent
|
||||
. changeMapLog c g val
|
||||
|
|
|
@ -51,7 +51,7 @@ readLog = parseLog <$$> Annex.Branch.get
|
|||
{- Generates a new LogLine with the current time. -}
|
||||
logNow :: LogStatus -> LogInfo -> Annex LogLine
|
||||
logNow s i = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
return $ LogLine c s i
|
||||
|
||||
{- Reads a log and returns only the info that is still in effect. -}
|
||||
|
|
|
@ -32,7 +32,7 @@ import qualified Data.Map as M
|
|||
{- Adds or updates a remote's config in the log. -}
|
||||
configSet :: UUID -> RemoteConfig -> Annex ()
|
||||
configSet u cfg = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change remoteLog $
|
||||
buildRemoteConfigLog
|
||||
. changeLog c u (removeSameasInherited cfg)
|
||||
|
|
|
@ -26,7 +26,7 @@ type RemoteState = String
|
|||
|
||||
setRemoteState :: RemoteStateHandle -> Key -> RemoteState -> Annex ()
|
||||
setRemoteState (RemoteStateHandle u) k s = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
config <- Annex.getGitConfig
|
||||
Annex.Branch.change (remoteStateLogFile config k) $
|
||||
buildRemoteState . changeLog c u s . parseRemoteState
|
||||
|
|
|
@ -32,7 +32,7 @@ import Logs.File
|
|||
|
||||
scheduleSet :: UUID -> [ScheduledActivity] -> Annex ()
|
||||
scheduleSet uuid@(UUID _) activities = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change scheduleLog $
|
||||
buildLogOld byteString
|
||||
. changeLog c uuid (encodeBS val)
|
||||
|
|
|
@ -33,6 +33,6 @@ getLog = newestValue <$$> readLog
|
|||
|
||||
setLog :: (SingleValueSerializable v) => RawFilePath -> v -> Annex ()
|
||||
setLog f v = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
let ent = LogEntry c v
|
||||
Annex.Branch.change f $ \_old -> buildLog (S.singleton ent)
|
||||
|
|
|
@ -22,7 +22,7 @@ import Logs.Trust.Pure as X
|
|||
{- Changes the trust level for a uuid in the trustLog. -}
|
||||
trustSet :: UUID -> TrustLevel -> Annex ()
|
||||
trustSet uuid@(UUID _) level = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change trustLog $
|
||||
buildLogOld buildTrustLevel .
|
||||
changeLog c uuid level .
|
||||
|
|
|
@ -31,7 +31,7 @@ import qualified Data.Attoparsec.ByteString.Lazy as A
|
|||
{- Records a description for a uuid in the log. -}
|
||||
describeUUID :: UUID -> UUIDDesc -> Annex ()
|
||||
describeUUID uuid desc = do
|
||||
c <- liftIO currentVectorClock
|
||||
c <- currentVectorClock
|
||||
Annex.Branch.change uuidLog $
|
||||
buildLogOld buildUUIDDesc . changeLog c uuid desc . parseUUIDLog
|
||||
|
||||
|
|
29
Types/VectorClock.hs
Normal file
29
Types/VectorClock.hs
Normal file
|
@ -0,0 +1,29 @@
|
|||
{- git-annex vector clocks
|
||||
-
|
||||
- We don't have a way yet to keep true distributed vector clocks.
|
||||
- The next best thing is a timestamp.
|
||||
-
|
||||
- Copyright 2017-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Types.VectorClock where
|
||||
|
||||
import Data.Time.Clock.POSIX
|
||||
import Control.Applicative
|
||||
import Prelude
|
||||
|
||||
import Utility.QuickCheck
|
||||
|
||||
-- | Some very old logs did not have any time stamp at all;
|
||||
-- Unknown is used for those.
|
||||
data VectorClock = Unknown | VectorClock POSIXTime
|
||||
deriving (Eq, Ord, Show)
|
||||
|
||||
-- Unknown is oldest.
|
||||
prop_VectorClock_sane :: Bool
|
||||
prop_VectorClock_sane = Unknown < VectorClock 1
|
||||
|
||||
instance Arbitrary VectorClock where
|
||||
arbitrary = VectorClock <$> arbitrary
|
|
@ -11,3 +11,6 @@ As a bonus, since borg uses the same content identifiers for all keys
|
|||
(""), implementing this would make the content of the logs all identical,
|
||||
and so avoid any overhead entirely. Well.. perhaps that's my actual
|
||||
motivation. ;-) --[[Joey]]
|
||||
|
||||
> [[done]], though reuseVectorClockWhile could be used in other places
|
||||
> perhaps. --[[Joey]]
|
||||
|
|
|
@ -670,6 +670,7 @@ Executable git-annex
|
|||
Annex.UUID
|
||||
Annex.Url
|
||||
Annex.VectorClock
|
||||
Annex.VectorClock.Utility
|
||||
Annex.VariantFile
|
||||
Annex.Version
|
||||
Annex.View
|
||||
|
@ -1035,6 +1036,7 @@ Executable git-annex
|
|||
Types.TrustLevel
|
||||
Types.UUID
|
||||
Types.UrlContents
|
||||
Types.VectorClock
|
||||
Types.View
|
||||
Types.WorkerPool
|
||||
Upgrade
|
||||
|
|
Loading…
Add table
Reference in a new issue