use ResourcePool for hash-object handles

Avoid starting an unncessary number of git hash-object processes when
concurrency is enabled.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-07-25 17:32:39 -04:00
parent b1c49c373a
commit d905232842
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 56 additions and 28 deletions

View file

@ -175,7 +175,7 @@ data AnnexState = AnnexState
, branchstate :: BranchState , branchstate :: BranchState
, repoqueue :: Maybe (Git.Queue.Queue Annex) , repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles , catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle , hashobjecthandle :: Maybe (ResourcePool HashObjectHandle)
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle) , checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
, checkignorehandle :: Maybe (ResourcePool CheckIgnoreHandle) , checkignorehandle :: Maybe (ResourcePool CheckIgnoreHandle)
, globalnumcopies :: Maybe NumCopies , globalnumcopies :: Maybe NumCopies

View file

@ -631,7 +631,7 @@ genIndex g = Git.UpdateIndex.streamUpdateIndex g
mergeIndex :: JournalLocked -> [Git.Ref] -> Annex () mergeIndex :: JournalLocked -> [Git.Ref] -> Annex ()
mergeIndex jl branches = do mergeIndex jl branches = do
prepareModifyIndex jl prepareModifyIndex jl
hashhandle <- hashObjectHandle withHashObjectHandle $ \hashhandle ->
withCatFileHandle $ \ch -> withCatFileHandle $ \ch ->
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
@ -709,7 +709,7 @@ stageJournal jl commitindex = withIndex $ withOtherTmp $ \tmpdir -> do
g <- gitRepo g <- gitRepo
let dir = gitAnnexJournalDir g let dir = gitAnnexJournalDir g
(jlogf, jlogh) <- openjlog (fromRawFilePath tmpdir) (jlogf, jlogh) <- openjlog (fromRawFilePath tmpdir)
h <- hashObjectHandle withHashObjectHandle $ \h ->
withJournalHandle gitAnnexJournalDir $ \jh -> withJournalHandle gitAnnexJournalDir $ \jh ->
Git.UpdateIndex.streamUpdateIndex g Git.UpdateIndex.streamUpdateIndex g
[genstream dir h jh jlogh] [genstream dir h jh jlogh]

View file

@ -1,6 +1,6 @@
{- git-annex concurrent state {- git-annex concurrent state
- -
- Copyright 2015-2021 Joey Hess <id@joeyh.name> - Copyright 2015-2022 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -18,6 +18,7 @@ import Types.Concurrency
import Types.CatFileHandles import Types.CatFileHandles
import Annex.CatFile import Annex.CatFile
import Annex.CheckAttr import Annex.CheckAttr
import Annex.HashObject
import Annex.CheckIgnore import Annex.CheckIgnore
import qualified Data.Map as M import qualified Data.Map as M
@ -47,14 +48,17 @@ setConcurrency' c f = do
fromnonconcurrent = do fromnonconcurrent = do
catFileStop catFileStop
checkAttrStop checkAttrStop
hashObjectStop
checkIgnoreStop checkIgnoreStop
cfh <- liftIO catFileHandlesPool cfh <- liftIO catFileHandlesPool
cah <- mkConcurrentCheckAttrHandle c cah <- mkConcurrentCheckAttrHandle c
hoh <- mkConcurrentHashObjectHandle c
cih <- mkConcurrentCheckIgnoreHandle c cih <- mkConcurrentCheckIgnoreHandle c
Annex.changeState $ \s -> s Annex.changeState $ \s -> s
{ Annex.concurrency = newc { Annex.concurrency = newc
, Annex.catfilehandles = cfh , Annex.catfilehandles = cfh
, Annex.checkattrhandle = Just cah , Annex.checkattrhandle = Just cah
, Annex.hashobjecthandle = Just hoh
, Annex.checkignorehandle = Just cih , Annex.checkignorehandle = Just cih
} }

View file

@ -1,6 +1,6 @@
{- git hash-object interface, with handle automatically stored in the Annex monad {- git hash-object interface
- -
- Copyright 2016 Joey Hess <id@joeyh.name> - Copyright 2016-2022 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -8,40 +8,59 @@
module Annex.HashObject ( module Annex.HashObject (
hashFile, hashFile,
hashBlob, hashBlob,
hashObjectHandle,
hashObjectStop, hashObjectStop,
mkConcurrentHashObjectHandle,
withHashObjectHandle,
) where ) where
import Annex.Common import Annex.Common
import qualified Git.HashObject import qualified Git.HashObject
import qualified Annex import qualified Annex
import Git.Types import Git.Types
import Utility.ResourcePool
hashObjectHandle :: Annex Git.HashObject.HashObjectHandle import Types.Concurrency
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle import Annex.Concurrent.Utility
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h
hashObjectStop :: Annex () hashObjectStop :: Annex ()
hashObjectStop = maybe noop stop =<< Annex.getState Annex.hashobjecthandle hashObjectStop = maybe noop stop =<< Annex.getState Annex.hashobjecthandle
where where
stop h = do stop p = do
liftIO $ Git.HashObject.hashObjectStop h liftIO $ freeResourcePool p Git.HashObject.hashObjectStop
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Nothing } Annex.changeState $ \s -> s { Annex.hashobjecthandle = Nothing }
return ()
hashFile :: RawFilePath -> Annex Sha hashFile :: RawFilePath -> Annex Sha
hashFile f = do hashFile f = withHashObjectHandle $ \h ->
h <- hashObjectHandle
liftIO $ Git.HashObject.hashFile h f liftIO $ Git.HashObject.hashFile h f
{- Note that the content will be written to a temp file. {- Note that the content will be written to a temp file.
- So it may be faster to use Git.HashObject.hashObject for large - So it may be faster to use Git.HashObject.hashObject for large
- blob contents. -} - blob contents. -}
hashBlob :: Git.HashObject.HashableBlob b => b -> Annex Sha hashBlob :: Git.HashObject.HashableBlob b => b -> Annex Sha
hashBlob content = do hashBlob content = withHashObjectHandle $ \h ->
h <- hashObjectHandle
liftIO $ Git.HashObject.hashBlob h content liftIO $ Git.HashObject.hashBlob h content
withHashObjectHandle :: (Git.HashObject.HashObjectHandle -> Annex a) -> Annex a
withHashObjectHandle a =
maybe mkpool go =<< Annex.getState Annex.hashobjecthandle
where
go p = withResourcePool p start a
start = inRepo $ Git.HashObject.hashObjectStart True
mkpool = do
-- This only runs in non-concurrent code paths;
-- a concurrent pool is set up earlier when needed.
p <- mkResourcePoolNonConcurrent start
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just p }
go p
mkConcurrentHashObjectHandle :: Concurrency -> Annex (ResourcePool Git.HashObject.HashObjectHandle)
mkConcurrentHashObjectHandle c =
Annex.getState Annex.hashobjecthandle >>= \case
Just p@(ResourcePool {}) -> return p
_ -> mkResourcePool =<< liftIO (maxHashObjects c)
{- git hash-object is typically CPU bound, and is not likely to be the main
- bottleneck for any command. So limit to the number of CPU cores, maximum,
- while respecting the -Jn value.
-}
maxHashObjects :: Concurrency -> IO Int
maxHashObjects = concurrencyUpToCpus

View file

@ -4,6 +4,8 @@ git-annex (10.20220725) UNRELEASED; urgency=medium
automatically upgrade to v10 in a year's time. automatically upgrade to v10 in a year's time.
To avoid this upgrade, you can set annex.autoupgraderepository to false. To avoid this upgrade, you can set annex.autoupgraderepository to false.
* Use v10 by default for new repositories. * Use v10 by default for new repositories.
* Avoid starting an unncessary number of git hash-object processes when
concurrency is enabled.
-- Joey Hess <id@joeyh.name> Mon, 25 Jul 2022 15:35:45 -0400 -- Joey Hess <id@joeyh.name> Mon, 25 Jul 2022 15:35:45 -0400

View file

@ -34,7 +34,7 @@ mkResourcePool maxsz = liftIO $
<*> newTVarIO [] <*> newTVarIO []
{- When there will not be multiple threads that may {- When there will not be multiple threads that may
- may concurrently try to use it, using this is more - concurrently try to use it, using this is more
- efficient than mkResourcePool. - efficient than mkResourcePool.
-} -}
mkResourcePoolNonConcurrent :: (MonadMask m, MonadIO m) => m r -> m (ResourcePool r) mkResourcePoolNonConcurrent :: (MonadMask m, MonadIO m) => m r -> m (ResourcePool r)

View file

@ -43,3 +43,6 @@ which might be ok but still wonder why they are just sleeping there in more than
[[!tag projects/dandi]] [[!tag projects/dandi]]
> [[done]]; this is now handled like other git helper processes
> and will be capped to the maximum of the number of jobs or cpu cores,
> and in practice usually fewer than that will be started. --[[Joey]]