use ResourcePool for hash-object handles
Avoid starting an unncessary number of git hash-object processes when concurrency is enabled. Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
parent
b1c49c373a
commit
d905232842
7 changed files with 56 additions and 28 deletions
2
Annex.hs
2
Annex.hs
|
@ -175,7 +175,7 @@ data AnnexState = AnnexState
|
|||
, branchstate :: BranchState
|
||||
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
||||
, catfilehandles :: CatFileHandles
|
||||
, hashobjecthandle :: Maybe HashObjectHandle
|
||||
, hashobjecthandle :: Maybe (ResourcePool HashObjectHandle)
|
||||
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
|
||||
, checkignorehandle :: Maybe (ResourcePool CheckIgnoreHandle)
|
||||
, globalnumcopies :: Maybe NumCopies
|
||||
|
|
|
@ -631,9 +631,9 @@ genIndex g = Git.UpdateIndex.streamUpdateIndex g
|
|||
mergeIndex :: JournalLocked -> [Git.Ref] -> Annex ()
|
||||
mergeIndex jl branches = do
|
||||
prepareModifyIndex jl
|
||||
hashhandle <- hashObjectHandle
|
||||
withCatFileHandle $ \ch ->
|
||||
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
|
||||
withHashObjectHandle $ \hashhandle ->
|
||||
withCatFileHandle $ \ch ->
|
||||
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
|
||||
|
||||
{- Removes any stale git lock file, to avoid git falling over when
|
||||
- updating the index.
|
||||
|
@ -709,10 +709,10 @@ stageJournal jl commitindex = withIndex $ withOtherTmp $ \tmpdir -> do
|
|||
g <- gitRepo
|
||||
let dir = gitAnnexJournalDir g
|
||||
(jlogf, jlogh) <- openjlog (fromRawFilePath tmpdir)
|
||||
h <- hashObjectHandle
|
||||
withJournalHandle gitAnnexJournalDir $ \jh ->
|
||||
Git.UpdateIndex.streamUpdateIndex g
|
||||
[genstream dir h jh jlogh]
|
||||
withHashObjectHandle $ \h ->
|
||||
withJournalHandle gitAnnexJournalDir $ \jh ->
|
||||
Git.UpdateIndex.streamUpdateIndex g
|
||||
[genstream dir h jh jlogh]
|
||||
commitindex
|
||||
liftIO $ cleanup (fromRawFilePath dir) jlogh jlogf
|
||||
where
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex concurrent state
|
||||
-
|
||||
- Copyright 2015-2021 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2015-2022 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -18,6 +18,7 @@ import Types.Concurrency
|
|||
import Types.CatFileHandles
|
||||
import Annex.CatFile
|
||||
import Annex.CheckAttr
|
||||
import Annex.HashObject
|
||||
import Annex.CheckIgnore
|
||||
|
||||
import qualified Data.Map as M
|
||||
|
@ -47,14 +48,17 @@ setConcurrency' c f = do
|
|||
fromnonconcurrent = do
|
||||
catFileStop
|
||||
checkAttrStop
|
||||
hashObjectStop
|
||||
checkIgnoreStop
|
||||
cfh <- liftIO catFileHandlesPool
|
||||
cah <- mkConcurrentCheckAttrHandle c
|
||||
hoh <- mkConcurrentHashObjectHandle c
|
||||
cih <- mkConcurrentCheckIgnoreHandle c
|
||||
Annex.changeState $ \s -> s
|
||||
{ Annex.concurrency = newc
|
||||
, Annex.catfilehandles = cfh
|
||||
, Annex.checkattrhandle = Just cah
|
||||
, Annex.hashobjecthandle = Just hoh
|
||||
, Annex.checkignorehandle = Just cih
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git hash-object interface, with handle automatically stored in the Annex monad
|
||||
{- git hash-object interface
|
||||
-
|
||||
- Copyright 2016 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2016-2022 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -8,40 +8,59 @@
|
|||
module Annex.HashObject (
|
||||
hashFile,
|
||||
hashBlob,
|
||||
hashObjectHandle,
|
||||
hashObjectStop,
|
||||
mkConcurrentHashObjectHandle,
|
||||
withHashObjectHandle,
|
||||
) where
|
||||
|
||||
import Annex.Common
|
||||
import qualified Git.HashObject
|
||||
import qualified Annex
|
||||
import Git.Types
|
||||
|
||||
hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
|
||||
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
|
||||
where
|
||||
startup = do
|
||||
h <- inRepo $ Git.HashObject.hashObjectStart True
|
||||
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
|
||||
return h
|
||||
import Utility.ResourcePool
|
||||
import Types.Concurrency
|
||||
import Annex.Concurrent.Utility
|
||||
|
||||
hashObjectStop :: Annex ()
|
||||
hashObjectStop = maybe noop stop =<< Annex.getState Annex.hashobjecthandle
|
||||
where
|
||||
stop h = do
|
||||
liftIO $ Git.HashObject.hashObjectStop h
|
||||
stop p = do
|
||||
liftIO $ freeResourcePool p Git.HashObject.hashObjectStop
|
||||
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Nothing }
|
||||
return ()
|
||||
|
||||
hashFile :: RawFilePath -> Annex Sha
|
||||
hashFile f = do
|
||||
h <- hashObjectHandle
|
||||
hashFile f = withHashObjectHandle $ \h ->
|
||||
liftIO $ Git.HashObject.hashFile h f
|
||||
|
||||
{- Note that the content will be written to a temp file.
|
||||
- So it may be faster to use Git.HashObject.hashObject for large
|
||||
- blob contents. -}
|
||||
hashBlob :: Git.HashObject.HashableBlob b => b -> Annex Sha
|
||||
hashBlob content = do
|
||||
h <- hashObjectHandle
|
||||
hashBlob content = withHashObjectHandle $ \h ->
|
||||
liftIO $ Git.HashObject.hashBlob h content
|
||||
|
||||
withHashObjectHandle :: (Git.HashObject.HashObjectHandle -> Annex a) -> Annex a
|
||||
withHashObjectHandle a =
|
||||
maybe mkpool go =<< Annex.getState Annex.hashobjecthandle
|
||||
where
|
||||
go p = withResourcePool p start a
|
||||
start = inRepo $ Git.HashObject.hashObjectStart True
|
||||
mkpool = do
|
||||
-- This only runs in non-concurrent code paths;
|
||||
-- a concurrent pool is set up earlier when needed.
|
||||
p <- mkResourcePoolNonConcurrent start
|
||||
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just p }
|
||||
go p
|
||||
|
||||
mkConcurrentHashObjectHandle :: Concurrency -> Annex (ResourcePool Git.HashObject.HashObjectHandle)
|
||||
mkConcurrentHashObjectHandle c =
|
||||
Annex.getState Annex.hashobjecthandle >>= \case
|
||||
Just p@(ResourcePool {}) -> return p
|
||||
_ -> mkResourcePool =<< liftIO (maxHashObjects c)
|
||||
|
||||
{- git hash-object is typically CPU bound, and is not likely to be the main
|
||||
- bottleneck for any command. So limit to the number of CPU cores, maximum,
|
||||
- while respecting the -Jn value.
|
||||
-}
|
||||
maxHashObjects :: Concurrency -> IO Int
|
||||
maxHashObjects = concurrencyUpToCpus
|
||||
|
|
|
@ -4,6 +4,8 @@ git-annex (10.20220725) UNRELEASED; urgency=medium
|
|||
automatically upgrade to v10 in a year's time.
|
||||
To avoid this upgrade, you can set annex.autoupgraderepository to false.
|
||||
* Use v10 by default for new repositories.
|
||||
* Avoid starting an unncessary number of git hash-object processes when
|
||||
concurrency is enabled.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 25 Jul 2022 15:35:45 -0400
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ mkResourcePool maxsz = liftIO $
|
|||
<*> newTVarIO []
|
||||
|
||||
{- When there will not be multiple threads that may
|
||||
- may concurrently try to use it, using this is more
|
||||
- concurrently try to use it, using this is more
|
||||
- efficient than mkResourcePool.
|
||||
-}
|
||||
mkResourcePoolNonConcurrent :: (MonadMask m, MonadIO m) => m r -> m (ResourcePool r)
|
||||
|
|
|
@ -43,3 +43,6 @@ which might be ok but still wonder why they are just sleeping there in more than
|
|||
[[!tag projects/dandi]]
|
||||
|
||||
|
||||
> [[done]]; this is now handled like other git helper processes
|
||||
> and will be capped to the maximum of the number of jobs or cpu cores,
|
||||
> and in practice usually fewer than that will be started. --[[Joey]]
|
||||
|
|
Loading…
Reference in a new issue