Sped up query commands that read the git-annex branch by around 5%
The only price paid is one additional MVar read per write to the journal. Presumably writing a journal file dominiates over a MVar read time by several orders of magnitude. --batch does not get the speedup because then it needs to notice when another process has made a change. Also made the assistant and other damon modes bypass the optimisation, which would not help them anyway.
This commit is contained in:
parent
aba905152a
commit
aeca7c2207
14 changed files with 101 additions and 25 deletions
|
@ -42,6 +42,7 @@ import Data.ByteString.Builder
|
|||
import Control.Concurrent (threadDelay)
|
||||
|
||||
import Annex.Common
|
||||
import Types.BranchState
|
||||
import Annex.BranchState
|
||||
import Annex.Journal
|
||||
import Annex.GitOverlay
|
||||
|
@ -123,7 +124,7 @@ getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
|
|||
|
||||
{- Ensures that the branch and index are up-to-date; should be
|
||||
- called before data is read from it. Runs only once per git-annex run. -}
|
||||
update :: Annex ()
|
||||
update :: Annex BranchState
|
||||
update = runUpdateOnce $ void $ updateTo =<< siblingBranches
|
||||
|
||||
{- Forces an update even if one has already been run. -}
|
||||
|
@ -221,8 +222,10 @@ updateTo' pairs = do
|
|||
- Returns an empty string if the file doesn't exist yet. -}
|
||||
get :: RawFilePath -> Annex L.ByteString
|
||||
get file = do
|
||||
update
|
||||
getLocal file
|
||||
st <- update
|
||||
if journalIgnorable st
|
||||
then getRef fullname file
|
||||
else getLocal file
|
||||
|
||||
{- Like get, but does not merge the branch, so the info returned may not
|
||||
- reflect changes in remotes.
|
||||
|
@ -274,7 +277,9 @@ maybeChange file f = lockJournal $ \jl -> do
|
|||
|
||||
{- Records new content of a file into the journal -}
|
||||
set :: Journalable content => JournalLocked -> RawFilePath -> content -> Annex ()
|
||||
set = setJournalFile
|
||||
set jl f c = do
|
||||
journalChanged
|
||||
setJournalFile jl f c
|
||||
|
||||
{- Commit message used when making a commit of whatever data has changed
|
||||
- to the git-annex brach. -}
|
||||
|
@ -359,7 +364,7 @@ commitIndex' jl branchref message basemessage retrynum parents = do
|
|||
- that have not been committed yet. There may be duplicates in the list. -}
|
||||
files :: Annex [RawFilePath]
|
||||
files = do
|
||||
update
|
||||
_ <- update
|
||||
-- ++ forces the content of the first list to be buffered in memory,
|
||||
-- so use getJournalledFilesStale which should be much smaller most
|
||||
-- of the time. branchFiles will stream as the list is consumed.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
-
|
||||
- Runtime state about the git-annex branch.
|
||||
-
|
||||
- Copyright 2011-2012 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -29,13 +29,56 @@ checkIndexOnce a = unlessM (indexChecked <$> getState) $ do
|
|||
|
||||
{- Runs an action to update the branch, if it's not been updated before
|
||||
- in this run of git-annex. -}
|
||||
runUpdateOnce :: Annex () -> Annex ()
|
||||
runUpdateOnce a = unlessM (branchUpdated <$> getState) $ do
|
||||
a
|
||||
disableUpdate
|
||||
runUpdateOnce :: Annex () -> Annex BranchState
|
||||
runUpdateOnce a = do
|
||||
st <- getState
|
||||
if branchUpdated st
|
||||
then return st
|
||||
else do
|
||||
a
|
||||
let stf = \st' -> st'
|
||||
{ branchUpdated = True
|
||||
-- The update staged anything that was
|
||||
-- journalled before, so the journal
|
||||
-- does not need to be checked going
|
||||
-- forward, unless new information
|
||||
-- gets written to it, or unless
|
||||
-- this run of git-annex needs to notice
|
||||
-- changes journalled by other processes
|
||||
-- while it's running.
|
||||
, journalIgnorable = not $
|
||||
journalNeverIgnorable st'
|
||||
}
|
||||
changeState stf
|
||||
return (stf st)
|
||||
|
||||
{- Avoids updating the branch. A useful optimisation when the branch
|
||||
- is known to have not changed, or git-annex won't be relying on info
|
||||
- from it. -}
|
||||
disableUpdate :: Annex ()
|
||||
disableUpdate = changeState $ \s -> s { branchUpdated = True }
|
||||
|
||||
{- Called when a change is made to the journal. -}
|
||||
journalChanged :: Annex ()
|
||||
journalChanged = do
|
||||
-- Optimisation: Typically journalIgnorable will already be True
|
||||
-- (when one thing gets journalled, often other things do to),
|
||||
-- so avoid an unnecessary write to the MVar that changeState
|
||||
-- would do.
|
||||
--
|
||||
-- This assumes that another thread is not changing journalIgnorable
|
||||
-- at the same time, but since runUpdateOnce is the only
|
||||
-- thing that changes it, and it only runs once, that
|
||||
-- should not happen.
|
||||
st <- getState
|
||||
when (journalIgnorable st) $
|
||||
changeState $ \st' -> st' { journalIgnorable = False }
|
||||
|
||||
{- When git-annex is somehow interactive, eg in --batch mode,
|
||||
- and needs to always notice changes made to the journal by other
|
||||
- processes, this disables optimisations that avoid normally reading the
|
||||
- journal.
|
||||
-}
|
||||
enableInteractiveJournalAccess :: Annex ()
|
||||
enableInteractiveJournalAccess = changeState $
|
||||
\s -> s { journalNeverIgnorable = True }
|
||||
|
|
|
@ -50,6 +50,7 @@ import Utility.ThreadScheduler
|
|||
import Utility.HumanTime
|
||||
import qualified BuildInfo
|
||||
import Annex.Perms
|
||||
import Annex.BranchState
|
||||
import Utility.LogFile
|
||||
#ifdef mingw32_HOST_OS
|
||||
import Utility.Env
|
||||
|
@ -70,8 +71,8 @@ stopDaemon = liftIO . Utility.Daemon.stopDaemon =<< fromRepo gitAnnexPidFile
|
|||
- stdout and stderr descriptors. -}
|
||||
startDaemon :: Bool -> Bool -> Maybe Duration -> Maybe String -> Maybe HostName -> Maybe (Maybe Handle -> Maybe Handle -> String -> FilePath -> IO ()) -> Annex ()
|
||||
startDaemon assistant foreground startdelay cannotrun listenhost startbrowser = do
|
||||
|
||||
Annex.changeState $ \s -> s { Annex.daemon = True }
|
||||
enableInteractiveJournalAccess
|
||||
pidfile <- fromRepo gitAnnexPidFile
|
||||
logfile <- fromRepo gitAnnexLogFile
|
||||
liftIO $ debugM desc $ "logging to " ++ logfile
|
||||
|
|
|
@ -28,7 +28,7 @@ mkGenerator cmds userinput = do
|
|||
-- so skewing the runtime of the first action that will be
|
||||
-- benchmarked.
|
||||
Annex.Branch.commit "benchmarking"
|
||||
Annex.Branch.update
|
||||
_ <- Annex.Branch.update
|
||||
l <- mapM parsesubcommand $ split [";"] userinput
|
||||
return $ do
|
||||
forM_ l $ \(cmd, seek, st) ->
|
||||
|
|
|
@ -3,6 +3,7 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
|
|||
* Improve git-annex's ability to find the path to its program,
|
||||
especially when it needs to run itself in another repo to upgrade it.
|
||||
* adb: Better messages when the adb command is not installed.
|
||||
* Sped up query commands that read the git-annex branch by around 5%.
|
||||
* Various speed improvements gained by using ByteStrings for git refs and
|
||||
shas.
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ import CmdLine.GitAnnex.Options
|
|||
import Options.Applicative
|
||||
import Limit
|
||||
import Types.FileMatcher
|
||||
import Annex.BranchState
|
||||
|
||||
data BatchMode = Batch BatchFormat | NoBatch
|
||||
|
||||
|
@ -72,7 +73,9 @@ batchInput fmt parser a = go =<< batchLines fmt
|
|||
parseerr s = giveup $ "Batch input parse failure: " ++ s
|
||||
|
||||
batchLines :: BatchFormat -> Annex [String]
|
||||
batchLines fmt = liftIO $ splitter <$> getContents
|
||||
batchLines fmt = do
|
||||
enableInteractiveJournalAccess
|
||||
liftIO $ splitter <$> getContents
|
||||
where
|
||||
splitter = case fmt of
|
||||
BatchLine -> lines
|
||||
|
|
|
@ -45,7 +45,7 @@ perform :: Transitions -> Bool -> CommandPerform
|
|||
perform ts True = do
|
||||
recordTransitions Branch.change ts
|
||||
-- get branch committed before contining with the transition
|
||||
Branch.update
|
||||
_ <- Branch.update
|
||||
void $ Branch.performTransitions ts True []
|
||||
next $ return True
|
||||
perform _ False = do
|
||||
|
|
|
@ -30,7 +30,7 @@ seek bs = do
|
|||
|
||||
mergeAnnexBranch :: CommandStart
|
||||
mergeAnnexBranch = starting "merge" (ActionItemOther (Just "git-annex")) $ do
|
||||
Annex.Branch.update
|
||||
_ <- Annex.Branch.update
|
||||
-- commit explicitly, in case no remote branches were merged
|
||||
Annex.Branch.commit =<< Annex.Branch.commitMessage
|
||||
next $ return True
|
||||
|
|
|
@ -17,6 +17,7 @@ import qualified Remote
|
|||
import Utility.SimpleProtocol (dupIoHandles)
|
||||
import Git.Types (RemoteName)
|
||||
import qualified Database.Keys
|
||||
import Annex.BranchState
|
||||
|
||||
data TransferRequest = TransferRequest Direction Remote Key AssociatedFile
|
||||
|
||||
|
@ -29,6 +30,7 @@ seek = withNothing (commandAction start)
|
|||
|
||||
start :: CommandStart
|
||||
start = do
|
||||
enableInteractiveJournalAccess
|
||||
(readh, writeh) <- liftIO dupIoHandles
|
||||
runRequests readh writeh runner
|
||||
stop
|
||||
|
|
|
@ -90,7 +90,7 @@ knownUrls = do
|
|||
{- Ensure the git-annex branch's index file is up-to-date and
|
||||
- any journaled changes are reflected in it, since we're going
|
||||
- to query its index directly. -}
|
||||
Annex.Branch.update
|
||||
_ <- Annex.Branch.update
|
||||
Annex.Branch.commit =<< Annex.Branch.commitMessage
|
||||
Annex.Branch.withIndex $ do
|
||||
top <- fromRepo Git.repoPath
|
||||
|
|
|
@ -22,6 +22,7 @@ import Utility.SimpleProtocol
|
|||
import Utility.ThreadScheduler
|
||||
import Config
|
||||
import Annex.Ssh
|
||||
import Annex.BranchState
|
||||
import Types.Messages
|
||||
|
||||
import Control.Concurrent
|
||||
|
@ -163,7 +164,9 @@ genTransportHandle = do
|
|||
annexstate <- newMVar =<< Annex.new =<< Git.CurrentRepo.get
|
||||
g <- Annex.repo <$> readMVar annexstate
|
||||
let h = TransportHandle (LocalRepo g) annexstate
|
||||
liftAnnex h $ Annex.setOutput QuietOutput
|
||||
liftAnnex h $ do
|
||||
Annex.setOutput QuietOutput
|
||||
enableInteractiveJournalAccess
|
||||
return h
|
||||
|
||||
updateTransportHandle :: TransportHandle -> IO TransportHandle
|
||||
|
|
|
@ -8,9 +8,17 @@
|
|||
module Types.BranchState where
|
||||
|
||||
data BranchState = BranchState
|
||||
{ branchUpdated :: Bool -- has the branch been updated this run?
|
||||
, indexChecked :: Bool -- has the index file been checked to exist?
|
||||
{ branchUpdated :: Bool
|
||||
-- ^ has the branch been updated this run?
|
||||
, indexChecked :: Bool
|
||||
-- ^ has the index file been checked to exist?
|
||||
, journalIgnorable :: Bool
|
||||
-- ^ can reading the journal be skipped, while still getting
|
||||
-- sufficiently up-to-date information from the branch?
|
||||
, journalNeverIgnorable :: Bool
|
||||
-- ^ should the journal always be read even if it would normally
|
||||
-- be safe to skip it?
|
||||
}
|
||||
|
||||
startBranchState :: BranchState
|
||||
startBranchState = BranchState False False
|
||||
startBranchState = BranchState False False False False
|
||||
|
|
|
@ -102,12 +102,12 @@ push = do
|
|||
-- will immediately work. Not pushed here,
|
||||
-- because it's less obnoxious to let the user
|
||||
-- push.
|
||||
Annex.Branch.update
|
||||
void Annex.Branch.update
|
||||
(True, False) -> do
|
||||
-- push git-annex to origin, so that
|
||||
-- "git push" will from then on
|
||||
-- automatically push it
|
||||
Annex.Branch.update -- just in case
|
||||
void Annex.Branch.update -- just in case
|
||||
showAction "pushing new git-annex branch to origin"
|
||||
showOutput
|
||||
inRepo $ Git.Command.run
|
||||
|
@ -118,7 +118,7 @@ push = do
|
|||
_ -> do
|
||||
-- no origin exists, so just let the user
|
||||
-- know about the new branch
|
||||
Annex.Branch.update
|
||||
void Annex.Branch.update
|
||||
showLongNote $
|
||||
"git-annex branch created\n" ++
|
||||
"Be sure to push this branch when pushing to remotes.\n"
|
||||
|
|
|
@ -11,13 +11,23 @@ What if, once at startup, it checked if the journal was entirely empty.
|
|||
If so, it can remember that, and avoid reading journal files.
|
||||
Perhaps paired with staging the journal if it's not empty.
|
||||
|
||||
When a process writes to the journal, it will need to update its state
|
||||
to remember it's no longer empty.
|
||||
|
||||
This could lead to behavior changes in some cases where one command is
|
||||
writing changes and another command used to read them from the journal and
|
||||
may no longer do so. But any such behavior change is of a behavior that
|
||||
used to involve a race; the reader could just as well be ahead of the
|
||||
writer and it would have already behaved as it would after the change.
|
||||
|
||||
But: When a process writes to the journal, it will need to update its state
|
||||
to remember it's no longer empty. --[[Joey]]
|
||||
> Hmm, not so fast. If the user has two --batch processes, one that makes
|
||||
> changes and the other that queries, they will expect the querying process
|
||||
> to see the changes after they were made. There's no race, the user can
|
||||
> control which process runs by feeding batch inputs to them.
|
||||
>
|
||||
> So, --batch and the assistant, as well as batch-like things that don't
|
||||
> use --batch will need to disable this optimisation it seems. --[[Joey]]
|
||||
|
||||
[[!tag confirmed]]
|
||||
|
||||
>> [[done]] speedup was around 5% --[[Joey]]
|
||||
|
|
Loading…
Reference in a new issue