use ResourcePool for hash-object handles

Avoid starting an unncessary number of git hash-object processes when
concurrency is enabled.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-07-25 17:32:39 -04:00
parent b1c49c373a
commit d905232842
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 56 additions and 28 deletions

View file

@ -1,6 +1,6 @@
{- git hash-object interface, with handle automatically stored in the Annex monad
{- git hash-object interface
-
- Copyright 2016 Joey Hess <id@joeyh.name>
- Copyright 2016-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -8,40 +8,59 @@
module Annex.HashObject (
hashFile,
hashBlob,
hashObjectHandle,
hashObjectStop,
mkConcurrentHashObjectHandle,
withHashObjectHandle,
) where
import Annex.Common
import qualified Git.HashObject
import qualified Annex
import Git.Types
hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h
import Utility.ResourcePool
import Types.Concurrency
import Annex.Concurrent.Utility
hashObjectStop :: Annex ()
hashObjectStop = maybe noop stop =<< Annex.getState Annex.hashobjecthandle
where
stop h = do
liftIO $ Git.HashObject.hashObjectStop h
stop p = do
liftIO $ freeResourcePool p Git.HashObject.hashObjectStop
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Nothing }
return ()
hashFile :: RawFilePath -> Annex Sha
hashFile f = do
h <- hashObjectHandle
hashFile f = withHashObjectHandle $ \h ->
liftIO $ Git.HashObject.hashFile h f
{- Note that the content will be written to a temp file.
- So it may be faster to use Git.HashObject.hashObject for large
- blob contents. -}
hashBlob :: Git.HashObject.HashableBlob b => b -> Annex Sha
hashBlob content = do
h <- hashObjectHandle
hashBlob content = withHashObjectHandle $ \h ->
liftIO $ Git.HashObject.hashBlob h content
withHashObjectHandle :: (Git.HashObject.HashObjectHandle -> Annex a) -> Annex a
withHashObjectHandle a =
maybe mkpool go =<< Annex.getState Annex.hashobjecthandle
where
go p = withResourcePool p start a
start = inRepo $ Git.HashObject.hashObjectStart True
mkpool = do
-- This only runs in non-concurrent code paths;
-- a concurrent pool is set up earlier when needed.
p <- mkResourcePoolNonConcurrent start
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just p }
go p
mkConcurrentHashObjectHandle :: Concurrency -> Annex (ResourcePool Git.HashObject.HashObjectHandle)
mkConcurrentHashObjectHandle c =
Annex.getState Annex.hashobjecthandle >>= \case
Just p@(ResourcePool {}) -> return p
_ -> mkResourcePool =<< liftIO (maxHashObjects c)
{- git hash-object is typically CPU bound, and is not likely to be the main
- bottleneck for any command. So limit to the number of CPU cores, maximum,
- while respecting the -Jn value.
-}
maxHashObjects :: Concurrency -> IO Int
maxHashObjects = concurrencyUpToCpus