use ResourcePool for hash-object handles

Avoid starting an unncessary number of git hash-object processes when
concurrency is enabled.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-07-25 17:32:39 -04:00
parent b1c49c373a
commit d905232842
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 56 additions and 28 deletions

View file

@ -175,7 +175,7 @@ data AnnexState = AnnexState
, branchstate :: BranchState
, repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle
, hashobjecthandle :: Maybe (ResourcePool HashObjectHandle)
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
, checkignorehandle :: Maybe (ResourcePool CheckIgnoreHandle)
, globalnumcopies :: Maybe NumCopies

View file

@ -631,9 +631,9 @@ genIndex g = Git.UpdateIndex.streamUpdateIndex g
mergeIndex :: JournalLocked -> [Git.Ref] -> Annex ()
mergeIndex jl branches = do
prepareModifyIndex jl
hashhandle <- hashObjectHandle
withCatFileHandle $ \ch ->
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
withHashObjectHandle $ \hashhandle ->
withCatFileHandle $ \ch ->
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
{- Removes any stale git lock file, to avoid git falling over when
- updating the index.
@ -709,10 +709,10 @@ stageJournal jl commitindex = withIndex $ withOtherTmp $ \tmpdir -> do
g <- gitRepo
let dir = gitAnnexJournalDir g
(jlogf, jlogh) <- openjlog (fromRawFilePath tmpdir)
h <- hashObjectHandle
withJournalHandle gitAnnexJournalDir $ \jh ->
Git.UpdateIndex.streamUpdateIndex g
[genstream dir h jh jlogh]
withHashObjectHandle $ \h ->
withJournalHandle gitAnnexJournalDir $ \jh ->
Git.UpdateIndex.streamUpdateIndex g
[genstream dir h jh jlogh]
commitindex
liftIO $ cleanup (fromRawFilePath dir) jlogh jlogf
where

View file

@ -1,6 +1,6 @@
{- git-annex concurrent state
-
- Copyright 2015-2021 Joey Hess <id@joeyh.name>
- Copyright 2015-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -18,6 +18,7 @@ import Types.Concurrency
import Types.CatFileHandles
import Annex.CatFile
import Annex.CheckAttr
import Annex.HashObject
import Annex.CheckIgnore
import qualified Data.Map as M
@ -47,14 +48,17 @@ setConcurrency' c f = do
fromnonconcurrent = do
catFileStop
checkAttrStop
hashObjectStop
checkIgnoreStop
cfh <- liftIO catFileHandlesPool
cah <- mkConcurrentCheckAttrHandle c
hoh <- mkConcurrentHashObjectHandle c
cih <- mkConcurrentCheckIgnoreHandle c
Annex.changeState $ \s -> s
{ Annex.concurrency = newc
, Annex.catfilehandles = cfh
, Annex.checkattrhandle = Just cah
, Annex.hashobjecthandle = Just hoh
, Annex.checkignorehandle = Just cih
}

View file

@ -1,6 +1,6 @@
{- git hash-object interface, with handle automatically stored in the Annex monad
{- git hash-object interface
-
- Copyright 2016 Joey Hess <id@joeyh.name>
- Copyright 2016-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -8,40 +8,59 @@
module Annex.HashObject (
hashFile,
hashBlob,
hashObjectHandle,
hashObjectStop,
mkConcurrentHashObjectHandle,
withHashObjectHandle,
) where
import Annex.Common
import qualified Git.HashObject
import qualified Annex
import Git.Types
hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h
import Utility.ResourcePool
import Types.Concurrency
import Annex.Concurrent.Utility
hashObjectStop :: Annex ()
hashObjectStop = maybe noop stop =<< Annex.getState Annex.hashobjecthandle
where
stop h = do
liftIO $ Git.HashObject.hashObjectStop h
stop p = do
liftIO $ freeResourcePool p Git.HashObject.hashObjectStop
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Nothing }
return ()
hashFile :: RawFilePath -> Annex Sha
hashFile f = do
h <- hashObjectHandle
hashFile f = withHashObjectHandle $ \h ->
liftIO $ Git.HashObject.hashFile h f
{- Note that the content will be written to a temp file.
- So it may be faster to use Git.HashObject.hashObject for large
- blob contents. -}
hashBlob :: Git.HashObject.HashableBlob b => b -> Annex Sha
hashBlob content = do
h <- hashObjectHandle
hashBlob content = withHashObjectHandle $ \h ->
liftIO $ Git.HashObject.hashBlob h content
withHashObjectHandle :: (Git.HashObject.HashObjectHandle -> Annex a) -> Annex a
withHashObjectHandle a =
maybe mkpool go =<< Annex.getState Annex.hashobjecthandle
where
go p = withResourcePool p start a
start = inRepo $ Git.HashObject.hashObjectStart True
mkpool = do
-- This only runs in non-concurrent code paths;
-- a concurrent pool is set up earlier when needed.
p <- mkResourcePoolNonConcurrent start
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just p }
go p
mkConcurrentHashObjectHandle :: Concurrency -> Annex (ResourcePool Git.HashObject.HashObjectHandle)
mkConcurrentHashObjectHandle c =
Annex.getState Annex.hashobjecthandle >>= \case
Just p@(ResourcePool {}) -> return p
_ -> mkResourcePool =<< liftIO (maxHashObjects c)
{- git hash-object is typically CPU bound, and is not likely to be the main
- bottleneck for any command. So limit to the number of CPU cores, maximum,
- while respecting the -Jn value.
-}
maxHashObjects :: Concurrency -> IO Int
maxHashObjects = concurrencyUpToCpus

View file

@ -4,6 +4,8 @@ git-annex (10.20220725) UNRELEASED; urgency=medium
automatically upgrade to v10 in a year's time.
To avoid this upgrade, you can set annex.autoupgraderepository to false.
* Use v10 by default for new repositories.
* Avoid starting an unncessary number of git hash-object processes when
concurrency is enabled.
-- Joey Hess <id@joeyh.name> Mon, 25 Jul 2022 15:35:45 -0400

View file

@ -34,7 +34,7 @@ mkResourcePool maxsz = liftIO $
<*> newTVarIO []
{- When there will not be multiple threads that may
- may concurrently try to use it, using this is more
- concurrently try to use it, using this is more
- efficient than mkResourcePool.
-}
mkResourcePoolNonConcurrent :: (MonadMask m, MonadIO m) => m r -> m (ResourcePool r)

View file

@ -43,3 +43,6 @@ which might be ok but still wonder why they are just sleeping there in more than
[[!tag projects/dandi]]
> [[done]]; this is now handled like other git helper processes
> and will be capped to the maximum of the number of jobs or cpu cores,
> and in practice usually fewer than that will be started. --[[Joey]]