Sped up query commands that read the git-annex branch by around 5%
The only price paid is one additional MVar read per write to the journal. Presumably writing a journal file dominiates over a MVar read time by several orders of magnitude. --batch does not get the speedup because then it needs to notice when another process has made a change. Also made the assistant and other damon modes bypass the optimisation, which would not help them anyway.
This commit is contained in:
		
					parent
					
						
							
								aba905152a
							
						
					
				
			
			
				commit
				
					
						aeca7c2207
					
				
			
		
					 14 changed files with 101 additions and 25 deletions
				
			
		|  | @ -42,6 +42,7 @@ import Data.ByteString.Builder | |||
| import Control.Concurrent (threadDelay) | ||||
| 
 | ||||
| import Annex.Common | ||||
| import Types.BranchState | ||||
| import Annex.BranchState | ||||
| import Annex.Journal | ||||
| import Annex.GitOverlay | ||||
|  | @ -123,7 +124,7 @@ getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha | |||
| 
 | ||||
| {- Ensures that the branch and index are up-to-date; should be | ||||
|  - called before data is read from it. Runs only once per git-annex run. -} | ||||
| update :: Annex () | ||||
| update :: Annex BranchState | ||||
| update = runUpdateOnce $ void $ updateTo =<< siblingBranches | ||||
| 
 | ||||
| {- Forces an update even if one has already been run. -} | ||||
|  | @ -221,8 +222,10 @@ updateTo' pairs = do | |||
|  - Returns an empty string if the file doesn't exist yet. -} | ||||
| get :: RawFilePath -> Annex L.ByteString | ||||
| get file = do | ||||
| 	update | ||||
| 	getLocal file | ||||
| 	st <- update | ||||
| 	if journalIgnorable st | ||||
| 		then getRef fullname file | ||||
| 		else getLocal file | ||||
| 
 | ||||
| {- Like get, but does not merge the branch, so the info returned may not | ||||
|  - reflect changes in remotes. | ||||
|  | @ -274,7 +277,9 @@ maybeChange file f = lockJournal $ \jl -> do | |||
| 
 | ||||
| {- Records new content of a file into the journal -} | ||||
| set :: Journalable content => JournalLocked -> RawFilePath -> content -> Annex () | ||||
| set = setJournalFile | ||||
| set jl f c = do | ||||
| 	journalChanged | ||||
| 	setJournalFile jl f c | ||||
| 
 | ||||
| {- Commit message used when making a commit of whatever data has changed | ||||
|  - to the git-annex brach. -} | ||||
|  | @ -359,7 +364,7 @@ commitIndex' jl branchref message basemessage retrynum parents = do | |||
|  - that have not been committed yet. There may be duplicates in the list. -} | ||||
| files :: Annex [RawFilePath] | ||||
| files = do | ||||
| 	update | ||||
| 	_  <- update | ||||
| 	-- ++ forces the content of the first list to be buffered in memory, | ||||
| 	-- so use getJournalledFilesStale which should be much smaller most | ||||
| 	-- of the time. branchFiles will stream as the list is consumed. | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ | |||
|  - | ||||
|  - Runtime state about the git-annex branch. | ||||
|  - | ||||
|  - Copyright 2011-2012 Joey Hess <id@joeyh.name> | ||||
|  - Copyright 2011-2020 Joey Hess <id@joeyh.name> | ||||
|  - | ||||
|  - Licensed under the GNU AGPL version 3 or higher. | ||||
|  -} | ||||
|  | @ -29,13 +29,56 @@ checkIndexOnce a = unlessM (indexChecked <$> getState) $ do | |||
| 
 | ||||
| {- Runs an action to update the branch, if it's not been updated before | ||||
|  - in this run of git-annex. -} | ||||
| runUpdateOnce :: Annex () -> Annex () | ||||
| runUpdateOnce a = unlessM (branchUpdated <$> getState) $ do | ||||
| runUpdateOnce :: Annex () -> Annex BranchState | ||||
| runUpdateOnce a = do | ||||
| 	st <- getState | ||||
| 	if branchUpdated st | ||||
| 		then return st | ||||
| 		else do | ||||
| 			a | ||||
| 	disableUpdate | ||||
| 			let stf = \st' -> st' | ||||
| 				{ branchUpdated = True | ||||
| 				-- The update staged anything that was | ||||
| 				-- journalled before, so the journal | ||||
| 				-- does not need to be checked going | ||||
| 				-- forward, unless new information | ||||
| 				-- gets written to it, or unless | ||||
| 				-- this run of git-annex needs to notice | ||||
| 				-- changes journalled by other processes | ||||
| 				-- while it's running. | ||||
| 				, journalIgnorable = not $ | ||||
| 					journalNeverIgnorable st' | ||||
| 				} | ||||
| 			changeState stf | ||||
| 			return (stf st) | ||||
| 
 | ||||
| {- Avoids updating the branch. A useful optimisation when the branch | ||||
|  - is known to have not changed, or git-annex won't be relying on info | ||||
|  - from it. -} | ||||
| disableUpdate :: Annex () | ||||
| disableUpdate = changeState $ \s -> s { branchUpdated = True } | ||||
| 
 | ||||
| {- Called when a change is made to the journal. -} | ||||
| journalChanged :: Annex () | ||||
| journalChanged = do | ||||
| 	-- Optimisation: Typically journalIgnorable will already be True | ||||
| 	-- (when one thing gets journalled, often other things do to), | ||||
| 	-- so avoid an unnecessary write to the MVar that changeState | ||||
| 	-- would do. | ||||
| 	-- | ||||
| 	-- This assumes that another thread is not changing journalIgnorable | ||||
| 	-- at the same time, but since runUpdateOnce is the only | ||||
| 	-- thing that changes it, and it only runs once, that | ||||
| 	-- should not happen. | ||||
| 	st <- getState  | ||||
| 	when (journalIgnorable st) $ | ||||
| 		changeState $ \st' -> st' { journalIgnorable = False } | ||||
| 
 | ||||
| {- When git-annex is somehow interactive, eg in --batch mode, | ||||
|  - and needs to always notice changes made to the journal by other | ||||
|  - processes, this disables optimisations that avoid normally reading the | ||||
|  - journal. | ||||
|  -} | ||||
| enableInteractiveJournalAccess :: Annex () | ||||
| enableInteractiveJournalAccess = changeState $ | ||||
| 	\s -> s { journalNeverIgnorable = True } | ||||
|  |  | |||
|  | @ -50,6 +50,7 @@ import Utility.ThreadScheduler | |||
| import Utility.HumanTime | ||||
| import qualified BuildInfo | ||||
| import Annex.Perms | ||||
| import Annex.BranchState | ||||
| import Utility.LogFile | ||||
| #ifdef mingw32_HOST_OS | ||||
| import Utility.Env | ||||
|  | @ -70,8 +71,8 @@ stopDaemon = liftIO . Utility.Daemon.stopDaemon =<< fromRepo gitAnnexPidFile | |||
|  - stdout and stderr descriptors. -} | ||||
| startDaemon :: Bool -> Bool -> Maybe Duration -> Maybe String -> Maybe HostName ->  Maybe (Maybe Handle -> Maybe Handle -> String -> FilePath -> IO ()) -> Annex () | ||||
| startDaemon assistant foreground startdelay cannotrun listenhost startbrowser = do | ||||
| 	 | ||||
| 	Annex.changeState $ \s -> s { Annex.daemon = True } | ||||
| 	enableInteractiveJournalAccess | ||||
| 	pidfile <- fromRepo gitAnnexPidFile | ||||
| 	logfile <- fromRepo gitAnnexLogFile | ||||
| 	liftIO $ debugM desc $ "logging to " ++ logfile | ||||
|  |  | |||
|  | @ -28,7 +28,7 @@ mkGenerator cmds userinput = do | |||
| 	-- so skewing the runtime of the first action that will be | ||||
| 	-- benchmarked. | ||||
| 	Annex.Branch.commit "benchmarking" | ||||
| 	Annex.Branch.update | ||||
| 	_ <- Annex.Branch.update | ||||
| 	l <- mapM parsesubcommand $ split [";"] userinput | ||||
| 	return $ do | ||||
| 		forM_ l $ \(cmd, seek, st) -> | ||||
|  |  | |||
|  | @ -3,6 +3,7 @@ git-annex (8.20200331) UNRELEASED; urgency=medium | |||
|   * Improve git-annex's ability to find the path to its program, | ||||
|     especially when it needs to run itself in another repo to upgrade it. | ||||
|   * adb: Better messages when the adb command is not installed. | ||||
|   * Sped up query commands that read the git-annex branch by around 5%. | ||||
|   * Various speed improvements gained by using ByteStrings for git refs and | ||||
|     shas. | ||||
| 
 | ||||
|  |  | |||
|  | @ -14,6 +14,7 @@ import CmdLine.GitAnnex.Options | |||
| import Options.Applicative | ||||
| import Limit | ||||
| import Types.FileMatcher | ||||
| import Annex.BranchState | ||||
| 
 | ||||
| data BatchMode = Batch BatchFormat | NoBatch | ||||
| 
 | ||||
|  | @ -72,7 +73,9 @@ batchInput fmt parser a = go =<< batchLines fmt | |||
| 	parseerr s = giveup $ "Batch input parse failure: " ++ s | ||||
| 
 | ||||
| batchLines :: BatchFormat -> Annex [String] | ||||
| batchLines fmt = liftIO $ splitter <$> getContents | ||||
| batchLines fmt = do | ||||
| 	enableInteractiveJournalAccess | ||||
| 	liftIO $ splitter <$> getContents | ||||
|   where | ||||
| 	splitter = case fmt of | ||||
| 		BatchLine -> lines | ||||
|  |  | |||
|  | @ -45,7 +45,7 @@ perform :: Transitions -> Bool -> CommandPerform | |||
| perform ts True = do | ||||
| 	recordTransitions Branch.change ts | ||||
| 	-- get branch committed before contining with the transition | ||||
| 	Branch.update | ||||
| 	_ <- Branch.update | ||||
| 	void $ Branch.performTransitions ts True [] | ||||
| 	next $ return True | ||||
| perform _ False = do | ||||
|  |  | |||
|  | @ -30,7 +30,7 @@ seek bs = do | |||
| 
 | ||||
| mergeAnnexBranch :: CommandStart | ||||
| mergeAnnexBranch = starting "merge" (ActionItemOther (Just "git-annex")) $ do | ||||
| 	Annex.Branch.update | ||||
| 	_ <- Annex.Branch.update | ||||
| 	-- commit explicitly, in case no remote branches were merged | ||||
| 	Annex.Branch.commit =<< Annex.Branch.commitMessage | ||||
| 	next $ return True | ||||
|  |  | |||
|  | @ -17,6 +17,7 @@ import qualified Remote | |||
| import Utility.SimpleProtocol (dupIoHandles) | ||||
| import Git.Types (RemoteName) | ||||
| import qualified Database.Keys | ||||
| import Annex.BranchState | ||||
| 
 | ||||
| data TransferRequest = TransferRequest Direction Remote Key AssociatedFile | ||||
| 
 | ||||
|  | @ -29,6 +30,7 @@ seek = withNothing (commandAction start) | |||
| 
 | ||||
| start :: CommandStart | ||||
| start = do | ||||
| 	enableInteractiveJournalAccess | ||||
| 	(readh, writeh) <- liftIO dupIoHandles | ||||
| 	runRequests readh writeh runner | ||||
| 	stop | ||||
|  |  | |||
|  | @ -90,7 +90,7 @@ knownUrls = do | |||
| 	{- Ensure the git-annex branch's index file is up-to-date and | ||||
| 	 - any journaled changes are reflected in it, since we're going | ||||
| 	 - to query its index directly. -} | ||||
| 	Annex.Branch.update | ||||
| 	_ <- Annex.Branch.update | ||||
| 	Annex.Branch.commit =<< Annex.Branch.commitMessage | ||||
| 	Annex.Branch.withIndex $ do | ||||
| 		top <- fromRepo Git.repoPath | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ import Utility.SimpleProtocol | |||
| import Utility.ThreadScheduler | ||||
| import Config | ||||
| import Annex.Ssh | ||||
| import Annex.BranchState | ||||
| import Types.Messages | ||||
| 
 | ||||
| import Control.Concurrent | ||||
|  | @ -163,7 +164,9 @@ genTransportHandle = do | |||
| 	annexstate <- newMVar =<< Annex.new =<< Git.CurrentRepo.get | ||||
| 	g <- Annex.repo <$> readMVar annexstate | ||||
| 	let h = TransportHandle (LocalRepo g) annexstate | ||||
| 	liftAnnex h $ Annex.setOutput QuietOutput | ||||
| 	liftAnnex h $ do | ||||
| 		Annex.setOutput QuietOutput | ||||
| 		enableInteractiveJournalAccess | ||||
| 	return h | ||||
| 
 | ||||
| updateTransportHandle :: TransportHandle -> IO TransportHandle | ||||
|  |  | |||
|  | @ -8,9 +8,17 @@ | |||
| module Types.BranchState where | ||||
| 
 | ||||
| data BranchState = BranchState | ||||
| 	{ branchUpdated :: Bool -- has the branch been updated this run? | ||||
| 	, indexChecked :: Bool -- has the index file been checked to exist? | ||||
| 	{ branchUpdated :: Bool | ||||
| 	-- ^ has the branch been updated this run? | ||||
| 	, indexChecked :: Bool | ||||
| 	-- ^ has the index file been checked to exist? | ||||
| 	, journalIgnorable :: Bool | ||||
| 	-- ^ can reading the journal be skipped, while still getting | ||||
| 	-- sufficiently up-to-date information from the branch? | ||||
| 	, journalNeverIgnorable :: Bool | ||||
| 	-- ^ should the journal always be read even if it would normally | ||||
| 	-- be safe to skip it? | ||||
| 	} | ||||
| 
 | ||||
| startBranchState :: BranchState | ||||
| startBranchState = BranchState False False | ||||
| startBranchState = BranchState False False False False | ||||
|  |  | |||
|  | @ -102,12 +102,12 @@ push = do | |||
| 			-- will immediately work. Not pushed here, | ||||
| 			-- because it's less obnoxious to let the user | ||||
| 			-- push. | ||||
| 			Annex.Branch.update | ||||
| 			void Annex.Branch.update | ||||
| 		(True, False) -> do | ||||
| 			-- push git-annex to origin, so that | ||||
| 			-- "git push" will from then on | ||||
| 			-- automatically push it | ||||
| 			Annex.Branch.update -- just in case | ||||
| 			void Annex.Branch.update -- just in case | ||||
| 			showAction "pushing new git-annex branch to origin" | ||||
| 			showOutput | ||||
| 			inRepo $ Git.Command.run | ||||
|  | @ -118,7 +118,7 @@ push = do | |||
| 		_ -> do | ||||
| 			-- no origin exists, so just let the user | ||||
| 			-- know about the new branch | ||||
| 			Annex.Branch.update | ||||
| 			void Annex.Branch.update | ||||
| 			showLongNote $ | ||||
| 				"git-annex branch created\n" ++ | ||||
| 				"Be sure to push this branch when pushing to remotes.\n" | ||||
|  |  | |||
|  | @ -11,13 +11,23 @@ What if, once at startup, it checked if the journal was entirely empty. | |||
| If so, it can remember that, and avoid reading journal files. | ||||
| Perhaps paired with staging the journal if it's not empty. | ||||
| 
 | ||||
| When a process writes to the journal, it will need to update its state | ||||
| to remember it's no longer empty. | ||||
| 
 | ||||
| This could lead to behavior changes in some cases where one command is | ||||
| writing changes and another command used to read them from the journal and | ||||
| may no longer do so. But any such behavior change is of a behavior that | ||||
| used to involve a race; the reader could just as well be ahead of the | ||||
| writer and it would have already behaved as it would after the change. | ||||
| 
 | ||||
| But: When a process writes to the journal, it will need to update its state | ||||
| to remember it's no longer empty. --[[Joey]] | ||||
| > Hmm, not so fast. If the user has two --batch processes, one that makes | ||||
| > changes and the other that queries, they will expect the querying process | ||||
| > to see the changes after they were made. There's no race, the user can | ||||
| > control which process runs by feeding batch inputs to them. | ||||
| >  | ||||
| > So, --batch and the assistant, as well as batch-like things that don't | ||||
| > use --batch will need to disable this optimisation it seems. --[[Joey]] | ||||
| 
 | ||||
| [[!tag confirmed]] | ||||
| 
 | ||||
| >> [[done]] speedup was around 5% --[[Joey]] | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Joey Hess
				Joey Hess