check-attr resource pool
Limited to min of -JN or number of CPU cores, because it will often be CPU bound, once it's read the gitignore file for a directory. In some situations it's more disk bound, but in any case it's unlikely to be the main bottleneck that -J is used to avoid. Eg, when dropping, this is used for numcopies checks, but the main bottleneck will be accessing the remotes to verify presence. So the user might decide to -J32 that, but having 32 check-attr processes would just waste however many filehandles they open, and probably worsen their performance due to CPU contention. Note that, I first tried just letting up to the -JN be started. However, even when it's no bottleneck at all, that still results in all of them being started. Why? Well, all the worker threads start up nearly simulantaneously, so there's a thundering herd..
This commit is contained in:
parent
cee6b344b4
commit
45fb7af21c
5 changed files with 47 additions and 16 deletions
3
Annex.hs
3
Annex.hs
|
@ -71,6 +71,7 @@ import Types.CatFileHandles
|
||||||
import qualified Database.Keys.Handle as Keys
|
import qualified Database.Keys.Handle as Keys
|
||||||
import Utility.InodeCache
|
import Utility.InodeCache
|
||||||
import Utility.Url
|
import Utility.Url
|
||||||
|
import Utility.ResourcePool
|
||||||
|
|
||||||
import "mtl" Control.Monad.Reader
|
import "mtl" Control.Monad.Reader
|
||||||
import Control.Concurrent
|
import Control.Concurrent
|
||||||
|
@ -118,7 +119,7 @@ data AnnexState = AnnexState
|
||||||
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
||||||
, catfilehandles :: CatFileHandles
|
, catfilehandles :: CatFileHandles
|
||||||
, hashobjecthandle :: Maybe HashObjectHandle
|
, hashobjecthandle :: Maybe HashObjectHandle
|
||||||
, checkattrhandle :: Maybe CheckAttrHandle
|
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
|
||||||
, checkignorehandle :: Maybe CheckIgnoreHandle
|
, checkignorehandle :: Maybe CheckIgnoreHandle
|
||||||
, forcebackend :: Maybe String
|
, forcebackend :: Maybe String
|
||||||
, globalnumcopies :: Maybe NumCopies
|
, globalnumcopies :: Maybe NumCopies
|
||||||
|
|
|
@ -1,19 +1,22 @@
|
||||||
{- git check-attr interface, with handle automatically stored in the Annex monad
|
{- git check-attr interface, with handle automatically stored in the Annex monad
|
||||||
-
|
-
|
||||||
- Copyright 2012 Joey Hess <id@joeyh.name>
|
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
module Annex.CheckAttr (
|
module Annex.CheckAttr (
|
||||||
checkAttr,
|
checkAttr,
|
||||||
checkAttrHandle,
|
|
||||||
checkAttrStop,
|
checkAttrStop,
|
||||||
|
mkConcurrentCheckAttrHandle,
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
import qualified Git.CheckAttr as Git
|
import qualified Git.CheckAttr as Git
|
||||||
import qualified Annex
|
import qualified Annex
|
||||||
|
import Utility.ResourcePool
|
||||||
|
import Types.Concurrency
|
||||||
|
import GHC.Conc
|
||||||
|
|
||||||
{- All gitattributes used by git-annex. -}
|
{- All gitattributes used by git-annex. -}
|
||||||
annexAttrs :: [Git.Attr]
|
annexAttrs :: [Git.Attr]
|
||||||
|
@ -24,21 +27,44 @@ annexAttrs =
|
||||||
]
|
]
|
||||||
|
|
||||||
checkAttr :: Git.Attr -> FilePath -> Annex String
|
checkAttr :: Git.Attr -> FilePath -> Annex String
|
||||||
checkAttr attr file = do
|
checkAttr attr file = withCheckAttrHandle $ \h ->
|
||||||
h <- checkAttrHandle
|
|
||||||
liftIO $ Git.checkAttr h attr file
|
liftIO $ Git.checkAttr h attr file
|
||||||
|
|
||||||
checkAttrHandle :: Annex Git.CheckAttrHandle
|
withCheckAttrHandle :: (Git.CheckAttrHandle -> Annex a) -> Annex a
|
||||||
checkAttrHandle = maybe startup return =<< Annex.getState Annex.checkattrhandle
|
withCheckAttrHandle a =
|
||||||
|
maybe mkpool go =<< Annex.getState Annex.checkattrhandle
|
||||||
where
|
where
|
||||||
startup = do
|
go p = withResourcePool p start a
|
||||||
h <- inRepo $ Git.checkAttrStart annexAttrs
|
start = inRepo $ Git.checkAttrStart annexAttrs
|
||||||
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just h }
|
mkpool = do
|
||||||
return h
|
-- This only runs in non-concurrent code paths;
|
||||||
|
-- a concurrent pool is set up earlier when needed.
|
||||||
|
p <- mkResourcePoolNonConcurrent start
|
||||||
|
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just p }
|
||||||
|
go p
|
||||||
|
|
||||||
|
mkConcurrentCheckAttrHandle :: Concurrency -> Annex (ResourcePool Git.CheckAttrHandle)
|
||||||
|
mkConcurrentCheckAttrHandle c =
|
||||||
|
Annex.getState Annex.checkattrhandle >>= \case
|
||||||
|
Just p@(ResourcePool {}) -> return p
|
||||||
|
_ -> mkResourcePool =<< liftIO (maxCheckAttrs c)
|
||||||
|
|
||||||
|
{- git check-attr is typically CPU bound, and is not likely to be the main
|
||||||
|
- bottleneck for any command. So limit to the number of CPU cores, maximum,
|
||||||
|
- while respecting the -Jn value.
|
||||||
|
-}
|
||||||
|
maxCheckAttrs :: Concurrency -> IO Int
|
||||||
|
maxCheckAttrs c = do
|
||||||
|
let cn = case c of
|
||||||
|
Concurrent n -> n
|
||||||
|
NonConcurrent -> 1
|
||||||
|
ConcurrentPerCpu -> 1
|
||||||
|
pn <- liftIO getNumProcessors
|
||||||
|
return (min cn pn)
|
||||||
|
|
||||||
checkAttrStop :: Annex ()
|
checkAttrStop :: Annex ()
|
||||||
checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle
|
checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle
|
||||||
where
|
where
|
||||||
stop h = do
|
stop p = do
|
||||||
liftIO $ Git.checkAttrStop h
|
liftIO $ freeResourcePool p Git.checkAttrStop
|
||||||
Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing }
|
Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing }
|
||||||
|
|
|
@ -14,6 +14,7 @@ import Annex.Action
|
||||||
import Types.Concurrency
|
import Types.Concurrency
|
||||||
import Types.WorkerPool
|
import Types.WorkerPool
|
||||||
import Types.CatFileHandles
|
import Types.CatFileHandles
|
||||||
|
import Annex.CheckAttr
|
||||||
import Remote.List
|
import Remote.List
|
||||||
|
|
||||||
import Control.Concurrent
|
import Control.Concurrent
|
||||||
|
@ -29,9 +30,11 @@ setConcurrency c = do
|
||||||
cfh' <- case cfh of
|
cfh' <- case cfh of
|
||||||
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
|
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
|
||||||
CatFileHandlesPool _ -> pure cfh
|
CatFileHandlesPool _ -> pure cfh
|
||||||
|
cah <- mkConcurrentCheckAttrHandle c
|
||||||
Annex.changeState $ \s -> s
|
Annex.changeState $ \s -> s
|
||||||
{ Annex.concurrency = c
|
{ Annex.concurrency = c
|
||||||
, Annex.catfilehandles = cfh'
|
, Annex.catfilehandles = cfh'
|
||||||
|
, Annex.checkattrhandle = Just cah
|
||||||
}
|
}
|
||||||
|
|
||||||
{- Allows forking off a thread that uses a copy of the current AnnexState
|
{- Allows forking off a thread that uses a copy of the current AnnexState
|
||||||
|
@ -67,7 +70,7 @@ dupState = do
|
||||||
|
|
||||||
st <- Annex.getState id
|
st <- Annex.getState id
|
||||||
-- Make sure that concurrency is enabled, if it was not already,
|
-- Make sure that concurrency is enabled, if it was not already,
|
||||||
-- so the resource pools are set up.
|
-- so the concurrency-safe resource pools are set up.
|
||||||
st' <- case Annex.concurrency st of
|
st' <- case Annex.concurrency st of
|
||||||
NonConcurrent -> do
|
NonConcurrent -> do
|
||||||
setConcurrency (Concurrent 1)
|
setConcurrency (Concurrent 1)
|
||||||
|
@ -77,7 +80,6 @@ dupState = do
|
||||||
-- each thread has its own repoqueue
|
-- each thread has its own repoqueue
|
||||||
{ Annex.repoqueue = Nothing
|
{ Annex.repoqueue = Nothing
|
||||||
-- avoid sharing open file handles
|
-- avoid sharing open file handles
|
||||||
, Annex.checkattrhandle = Nothing
|
|
||||||
, Annex.checkignorehandle = Nothing
|
, Annex.checkignorehandle = Nothing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,8 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
|
||||||
and -J is used.
|
and -J is used.
|
||||||
* Avoid running a large number of git cat-file child processes when run
|
* Avoid running a large number of git cat-file child processes when run
|
||||||
with a large -J value.
|
with a large -J value.
|
||||||
|
* Avoid running with more git check-attr processes than there are CPUs
|
||||||
|
cores when run with a large -J value.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400
|
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
{-# LANGUAGE BangPatterns #-}
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
|
||||||
module Utility.ResourcePool (
|
module Utility.ResourcePool (
|
||||||
ResourcePool,
|
ResourcePool(..),
|
||||||
mkResourcePool,
|
mkResourcePool,
|
||||||
mkResourcePoolNonConcurrent,
|
mkResourcePoolNonConcurrent,
|
||||||
withResourcePool,
|
withResourcePool,
|
||||||
|
|
Loading…
Reference in a new issue