check-attr resource pool

Limited to min of -JN or number of CPU cores, because it will often be
CPU bound, once it's read the gitignore file for a directory.

In some situations it's more disk bound, but in any case it's unlikely
to be the main bottleneck that -J is used to avoid. Eg, when dropping,
this is used for numcopies checks, but the main bottleneck will be
accessing the remotes to verify presence. So the user might decide to
-J32 that, but having 32 check-attr processes would just waste however
many filehandles they open, and probably worsen their performance due to
CPU contention.

Note that, I first tried just letting up to the -JN be started. However,
even when it's no bottleneck at all, that still results in all of them
being started. Why? Well, all the worker threads start up nearly
simulantaneously, so there's a thundering herd..
This commit is contained in:
Joey Hess 2020-04-21 10:38:44 -04:00
parent cee6b344b4
commit 45fb7af21c
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 47 additions and 16 deletions

View file

@ -71,6 +71,7 @@ import Types.CatFileHandles
import qualified Database.Keys.Handle as Keys import qualified Database.Keys.Handle as Keys
import Utility.InodeCache import Utility.InodeCache
import Utility.Url import Utility.Url
import Utility.ResourcePool
import "mtl" Control.Monad.Reader import "mtl" Control.Monad.Reader
import Control.Concurrent import Control.Concurrent
@ -118,7 +119,7 @@ data AnnexState = AnnexState
, repoqueue :: Maybe (Git.Queue.Queue Annex) , repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles , catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle , hashobjecthandle :: Maybe HashObjectHandle
, checkattrhandle :: Maybe CheckAttrHandle , checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
, checkignorehandle :: Maybe CheckIgnoreHandle , checkignorehandle :: Maybe CheckIgnoreHandle
, forcebackend :: Maybe String , forcebackend :: Maybe String
, globalnumcopies :: Maybe NumCopies , globalnumcopies :: Maybe NumCopies

View file

@ -1,19 +1,22 @@
{- git check-attr interface, with handle automatically stored in the Annex monad {- git check-attr interface, with handle automatically stored in the Annex monad
- -
- Copyright 2012 Joey Hess <id@joeyh.name> - Copyright 2012-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
module Annex.CheckAttr ( module Annex.CheckAttr (
checkAttr, checkAttr,
checkAttrHandle,
checkAttrStop, checkAttrStop,
mkConcurrentCheckAttrHandle,
) where ) where
import Annex.Common import Annex.Common
import qualified Git.CheckAttr as Git import qualified Git.CheckAttr as Git
import qualified Annex import qualified Annex
import Utility.ResourcePool
import Types.Concurrency
import GHC.Conc
{- All gitattributes used by git-annex. -} {- All gitattributes used by git-annex. -}
annexAttrs :: [Git.Attr] annexAttrs :: [Git.Attr]
@ -24,21 +27,44 @@ annexAttrs =
] ]
checkAttr :: Git.Attr -> FilePath -> Annex String checkAttr :: Git.Attr -> FilePath -> Annex String
checkAttr attr file = do checkAttr attr file = withCheckAttrHandle $ \h ->
h <- checkAttrHandle
liftIO $ Git.checkAttr h attr file liftIO $ Git.checkAttr h attr file
checkAttrHandle :: Annex Git.CheckAttrHandle withCheckAttrHandle :: (Git.CheckAttrHandle -> Annex a) -> Annex a
checkAttrHandle = maybe startup return =<< Annex.getState Annex.checkattrhandle withCheckAttrHandle a =
maybe mkpool go =<< Annex.getState Annex.checkattrhandle
where where
startup = do go p = withResourcePool p start a
h <- inRepo $ Git.checkAttrStart annexAttrs start = inRepo $ Git.checkAttrStart annexAttrs
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just h } mkpool = do
return h -- This only runs in non-concurrent code paths;
-- a concurrent pool is set up earlier when needed.
p <- mkResourcePoolNonConcurrent start
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just p }
go p
mkConcurrentCheckAttrHandle :: Concurrency -> Annex (ResourcePool Git.CheckAttrHandle)
mkConcurrentCheckAttrHandle c =
Annex.getState Annex.checkattrhandle >>= \case
Just p@(ResourcePool {}) -> return p
_ -> mkResourcePool =<< liftIO (maxCheckAttrs c)
{- git check-attr is typically CPU bound, and is not likely to be the main
- bottleneck for any command. So limit to the number of CPU cores, maximum,
- while respecting the -Jn value.
-}
maxCheckAttrs :: Concurrency -> IO Int
maxCheckAttrs c = do
let cn = case c of
Concurrent n -> n
NonConcurrent -> 1
ConcurrentPerCpu -> 1
pn <- liftIO getNumProcessors
return (min cn pn)
checkAttrStop :: Annex () checkAttrStop :: Annex ()
checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle
where where
stop h = do stop p = do
liftIO $ Git.checkAttrStop h liftIO $ freeResourcePool p Git.checkAttrStop
Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing } Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing }

View file

@ -14,6 +14,7 @@ import Annex.Action
import Types.Concurrency import Types.Concurrency
import Types.WorkerPool import Types.WorkerPool
import Types.CatFileHandles import Types.CatFileHandles
import Annex.CheckAttr
import Remote.List import Remote.List
import Control.Concurrent import Control.Concurrent
@ -29,9 +30,11 @@ setConcurrency c = do
cfh' <- case cfh of cfh' <- case cfh of
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
CatFileHandlesPool _ -> pure cfh CatFileHandlesPool _ -> pure cfh
cah <- mkConcurrentCheckAttrHandle c
Annex.changeState $ \s -> s Annex.changeState $ \s -> s
{ Annex.concurrency = c { Annex.concurrency = c
, Annex.catfilehandles = cfh' , Annex.catfilehandles = cfh'
, Annex.checkattrhandle = Just cah
} }
{- Allows forking off a thread that uses a copy of the current AnnexState {- Allows forking off a thread that uses a copy of the current AnnexState
@ -67,7 +70,7 @@ dupState = do
st <- Annex.getState id st <- Annex.getState id
-- Make sure that concurrency is enabled, if it was not already, -- Make sure that concurrency is enabled, if it was not already,
-- so the resource pools are set up. -- so the concurrency-safe resource pools are set up.
st' <- case Annex.concurrency st of st' <- case Annex.concurrency st of
NonConcurrent -> do NonConcurrent -> do
setConcurrency (Concurrent 1) setConcurrency (Concurrent 1)
@ -77,7 +80,6 @@ dupState = do
-- each thread has its own repoqueue -- each thread has its own repoqueue
{ Annex.repoqueue = Nothing { Annex.repoqueue = Nothing
-- avoid sharing open file handles -- avoid sharing open file handles
, Annex.checkattrhandle = Nothing
, Annex.checkignorehandle = Nothing , Annex.checkignorehandle = Nothing
} }

View file

@ -17,6 +17,8 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
and -J is used. and -J is used.
* Avoid running a large number of git cat-file child processes when run * Avoid running a large number of git cat-file child processes when run
with a large -J value. with a large -J value.
* Avoid running with more git check-attr processes than there are CPUs
cores when run with a large -J value.
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400 -- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400

View file

@ -8,7 +8,7 @@
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
module Utility.ResourcePool ( module Utility.ResourcePool (
ResourcePool, ResourcePool(..),
mkResourcePool, mkResourcePool,
mkResourcePoolNonConcurrent, mkResourcePoolNonConcurrent,
withResourcePool, withResourcePool,