check-attr resource pool

Limited to min of -JN or number of CPU cores, because it will often be
CPU bound, once it's read the gitignore file for a directory.

In some situations it's more disk bound, but in any case it's unlikely
to be the main bottleneck that -J is used to avoid. Eg, when dropping,
this is used for numcopies checks, but the main bottleneck will be
accessing the remotes to verify presence. So the user might decide to
-J32 that, but having 32 check-attr processes would just waste however
many filehandles they open, and probably worsen their performance due to
CPU contention.

Note that, I first tried just letting up to the -JN be started. However,
even when it's no bottleneck at all, that still results in all of them
being started. Why? Well, all the worker threads start up nearly
simulantaneously, so there's a thundering herd..
This commit is contained in:
Joey Hess 2020-04-21 10:38:44 -04:00
parent cee6b344b4
commit 45fb7af21c
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 47 additions and 16 deletions

View file

@ -71,6 +71,7 @@ import Types.CatFileHandles
import qualified Database.Keys.Handle as Keys
import Utility.InodeCache
import Utility.Url
import Utility.ResourcePool
import "mtl" Control.Monad.Reader
import Control.Concurrent
@ -118,7 +119,7 @@ data AnnexState = AnnexState
, repoqueue :: Maybe (Git.Queue.Queue Annex)
, catfilehandles :: CatFileHandles
, hashobjecthandle :: Maybe HashObjectHandle
, checkattrhandle :: Maybe CheckAttrHandle
, checkattrhandle :: Maybe (ResourcePool CheckAttrHandle)
, checkignorehandle :: Maybe CheckIgnoreHandle
, forcebackend :: Maybe String
, globalnumcopies :: Maybe NumCopies

View file

@ -1,19 +1,22 @@
{- git check-attr interface, with handle automatically stored in the Annex monad
-
- Copyright 2012 Joey Hess <id@joeyh.name>
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.CheckAttr (
checkAttr,
checkAttrHandle,
checkAttrStop,
mkConcurrentCheckAttrHandle,
) where
import Annex.Common
import qualified Git.CheckAttr as Git
import qualified Annex
import Utility.ResourcePool
import Types.Concurrency
import GHC.Conc
{- All gitattributes used by git-annex. -}
annexAttrs :: [Git.Attr]
@ -24,21 +27,44 @@ annexAttrs =
]
checkAttr :: Git.Attr -> FilePath -> Annex String
checkAttr attr file = do
h <- checkAttrHandle
checkAttr attr file = withCheckAttrHandle $ \h ->
liftIO $ Git.checkAttr h attr file
checkAttrHandle :: Annex Git.CheckAttrHandle
checkAttrHandle = maybe startup return =<< Annex.getState Annex.checkattrhandle
withCheckAttrHandle :: (Git.CheckAttrHandle -> Annex a) -> Annex a
withCheckAttrHandle a =
maybe mkpool go =<< Annex.getState Annex.checkattrhandle
where
startup = do
h <- inRepo $ Git.checkAttrStart annexAttrs
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just h }
return h
go p = withResourcePool p start a
start = inRepo $ Git.checkAttrStart annexAttrs
mkpool = do
-- This only runs in non-concurrent code paths;
-- a concurrent pool is set up earlier when needed.
p <- mkResourcePoolNonConcurrent start
Annex.changeState $ \s -> s { Annex.checkattrhandle = Just p }
go p
mkConcurrentCheckAttrHandle :: Concurrency -> Annex (ResourcePool Git.CheckAttrHandle)
mkConcurrentCheckAttrHandle c =
Annex.getState Annex.checkattrhandle >>= \case
Just p@(ResourcePool {}) -> return p
_ -> mkResourcePool =<< liftIO (maxCheckAttrs c)
{- git check-attr is typically CPU bound, and is not likely to be the main
- bottleneck for any command. So limit to the number of CPU cores, maximum,
- while respecting the -Jn value.
-}
maxCheckAttrs :: Concurrency -> IO Int
maxCheckAttrs c = do
let cn = case c of
Concurrent n -> n
NonConcurrent -> 1
ConcurrentPerCpu -> 1
pn <- liftIO getNumProcessors
return (min cn pn)
checkAttrStop :: Annex ()
checkAttrStop = maybe noop stop =<< Annex.getState Annex.checkattrhandle
where
stop h = do
liftIO $ Git.checkAttrStop h
stop p = do
liftIO $ freeResourcePool p Git.checkAttrStop
Annex.changeState $ \s -> s { Annex.checkattrhandle = Nothing }

View file

@ -14,6 +14,7 @@ import Annex.Action
import Types.Concurrency
import Types.WorkerPool
import Types.CatFileHandles
import Annex.CheckAttr
import Remote.List
import Control.Concurrent
@ -29,9 +30,11 @@ setConcurrency c = do
cfh' <- case cfh of
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
CatFileHandlesPool _ -> pure cfh
cah <- mkConcurrentCheckAttrHandle c
Annex.changeState $ \s -> s
{ Annex.concurrency = c
, Annex.catfilehandles = cfh'
, Annex.checkattrhandle = Just cah
}
{- Allows forking off a thread that uses a copy of the current AnnexState
@ -67,7 +70,7 @@ dupState = do
st <- Annex.getState id
-- Make sure that concurrency is enabled, if it was not already,
-- so the resource pools are set up.
-- so the concurrency-safe resource pools are set up.
st' <- case Annex.concurrency st of
NonConcurrent -> do
setConcurrency (Concurrent 1)
@ -77,7 +80,6 @@ dupState = do
-- each thread has its own repoqueue
{ Annex.repoqueue = Nothing
-- avoid sharing open file handles
, Annex.checkattrhandle = Nothing
, Annex.checkignorehandle = Nothing
}

View file

@ -17,6 +17,8 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
and -J is used.
* Avoid running a large number of git cat-file child processes when run
with a large -J value.
* Avoid running with more git check-attr processes than there are CPUs
cores when run with a large -J value.
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400

View file

@ -8,7 +8,7 @@
{-# LANGUAGE BangPatterns #-}
module Utility.ResourcePool (
ResourcePool,
ResourcePool(..),
mkResourcePool,
mkResourcePoolNonConcurrent,
withResourcePool,