check-attr resource pool
Limited to min of -JN or number of CPU cores, because it will often be CPU bound, once it's read the gitignore file for a directory. In some situations it's more disk bound, but in any case it's unlikely to be the main bottleneck that -J is used to avoid. E.g., when dropping, this is used for numcopies checks, but the main bottleneck will be accessing the remotes to verify presence. So the user might decide to -J32 that, but having 32 check-attr processes would just waste however many filehandles they open, and probably worsen their performance due to CPU contention. Note that I first tried just letting up to the -JN be started. However, even when it's no bottleneck at all, that still results in all of them being started. Why? Well, all the worker threads start up nearly simultaneously, so there's a thundering herd.
This commit is contained in:
parent
cee6b344b4
commit
45fb7af21c
5 changed files with 47 additions and 16 deletions
3
Annex.hs
3
Annex.hs
|
@ -71,6 +71,7 @@ import Types.CatFileHandles
|
|||
import qualified Database.Keys.Handle as Keys
|
||||
import Utility.InodeCache
|
||||
import Utility.Url
|
||||
import Utility.ResourcePool
|
||||
|
||||
import "mtl" Control.Monad.Reader
|
||||
import Control.Concurrent
|
||||
|
@ -118,7 +119,7 @@ data AnnexState = AnnexState
|
|||
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
||||
, catfilehandles :: CatFileHandles
|
||||
, hashobjecthandle :: Maybe HashObjectHandle
|
||||
, checkattrhandle :: Maybe CheckAttrHandle
|
||||
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
|
||||
, checkignorehandle :: Maybe CheckIgnoreHandle
|
||||
, forcebackend :: Maybe String
|
||||
, globalnumcopies :: Maybe NumCopies
|
||||
|
|
|
@ -1,19 +1,22 @@
|
|||
{- git check-attr interface, with handle automatically stored in the Annex monad
|
||||
-
|
||||
- Copyright 2012 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Annex.CheckAttr (
|
||||
checkAttr,
|
||||
checkAttrHandle,
|
||||
checkAttrStop,
|
||||
mkConcurrentCheckAttrHandle,
|
||||
) where
|
||||
|
||||
import Annex.Common
|
||||
import qualified Git.CheckAttr as Git
|
||||
import qualified Annex
|
||||
import Utility.ResourcePool
|
||||
import Types.Concurrency
|
||||
import GHC.Conc
|
||||
|
||||
{- All gitattributes used by git-annex. -}
|
||||
annexAttrs :: [Git.Attr]
|
||||
|
@ -24,21 +27,44 @@ annexAttrs =
|
|||
]
|
||||
|
||||
checkAttr :: Git.Attr -> FilePath -> Annex String
|
||||
checkAttr attr file = do
|
||||
h <- checkAttrHandle
|
||||
checkAttr attr file = withCheckAttrHandle $ \h ->
|
||||
liftIO $ Git.checkAttr h attr file
|
||||
|
||||
checkAttrHandle :: Annex Git.CheckAttrHandle
|
||||
checkAttrHandle = maybe startup return =<< Annex.getState Annex.checkattrhandle
|
||||
withCheckAttrHandle :: (Git.CheckAttrHandle -> Annex a) -> Annex a
|
||||
withCheckAttrHandle a =
|
||||
maybe mkpool go =<< Annex.getState Annex.checkattrhandle
|
||||
where
|
||||
startup = do
|
||||
h <- inRepo $ Git.checkAttrStart annexAttrs
|
||||
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just h }
|
||||
return h
|
||||
go p = withResourcePool p start a
|
||||
start = inRepo $ Git.checkAttrStart annexAttrs
|
||||
mkpool = do
|
||||
-- This only runs in non-concurrent code paths;
|
||||
-- a concurrent pool is set up earlier when needed.
|
||||
p <- mkResourcePoolNonConcurrent start
|
||||
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just p }
|
||||
go p
|
||||
|
||||
mkConcurrentCheckAttrHandle :: Concurrency -> Annex (ResourcePool Git.CheckAttrHandle)
|
||||
mkConcurrentCheckAttrHandle c =
|
||||
Annex.getState Annex.checkattrhandle >>= \case
|
||||
Just p@(ResourcePool {}) -> return p
|
||||
_ -> mkResourcePool =<< liftIO (maxCheckAttrs c)
|
||||
|
||||
{- git check-attr is typically CPU bound, and is not likely to be the main
|
||||
- bottleneck for any command. So limit to the number of CPU cores, maximum,
|
||||
- while respecting the -Jn value.
|
||||
-}
|
||||
maxCheckAttrs :: Concurrency -> IO Int
|
||||
maxCheckAttrs c = do
|
||||
let cn = case c of
|
||||
Concurrent n -> n
|
||||
NonConcurrent -> 1
|
||||
ConcurrentPerCpu -> 1
|
||||
pn <- liftIO getNumProcessors
|
||||
return (min cn pn)
|
||||
|
||||
checkAttrStop :: Annex ()
|
||||
checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle
|
||||
where
|
||||
stop h = do
|
||||
liftIO $ Git.checkAttrStop h
|
||||
stop p = do
|
||||
liftIO $ freeResourcePool p Git.checkAttrStop
|
||||
Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing }
|
||||
|
|
|
@ -14,6 +14,7 @@ import Annex.Action
|
|||
import Types.Concurrency
|
||||
import Types.WorkerPool
|
||||
import Types.CatFileHandles
|
||||
import Annex.CheckAttr
|
||||
import Remote.List
|
||||
|
||||
import Control.Concurrent
|
||||
|
@ -29,9 +30,11 @@ setConcurrency c = do
|
|||
cfh' <- case cfh of
|
||||
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
|
||||
CatFileHandlesPool _ -> pure cfh
|
||||
cah <- mkConcurrentCheckAttrHandle c
|
||||
Annex.changeState $ \s -> s
|
||||
{ Annex.concurrency = c
|
||||
, Annex.catfilehandles = cfh'
|
||||
, Annex.checkattrhandle = Just cah
|
||||
}
|
||||
|
||||
{- Allows forking off a thread that uses a copy of the current AnnexState
|
||||
|
@ -67,7 +70,7 @@ dupState = do
|
|||
|
||||
st <- Annex.getState id
|
||||
-- Make sure that concurrency is enabled, if it was not already,
|
||||
-- so the resource pools are set up.
|
||||
-- so the concurrency-safe resource pools are set up.
|
||||
st' <- case Annex.concurrency st of
|
||||
NonConcurrent -> do
|
||||
setConcurrency (Concurrent 1)
|
||||
|
@ -77,7 +80,6 @@ dupState = do
|
|||
-- each thread has its own repoqueue
|
||||
{ Annex.repoqueue = Nothing
|
||||
-- avoid sharing open file handles
|
||||
, Annex.checkattrhandle = Nothing
|
||||
, Annex.checkignorehandle = Nothing
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,8 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
|
|||
and -J is used.
|
||||
* Avoid running a large number of git cat-file child processes when run
|
||||
with a large -J value.
|
||||
* Avoid running with more git check-attr processes than there are CPU
|
||||
cores when run with a large -J value.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Utility.ResourcePool (
|
||||
ResourcePool,
|
||||
ResourcePool(..),
|
||||
mkResourcePool,
|
||||
mkResourcePoolNonConcurrent,
|
||||
withResourcePool,
|
||||
|
|
Loading…
Reference in a new issue