cat-file resource pool
Avoid running a large number of git cat-file child processes when run with a large -J value. This implementation takes care to avoid adding any overhead to git-annex when run without -J. When run with -J, there is a small bit of added overhead, to manipulate the resource pool. That optimisation added a fair bit of complexity.
This commit is contained in:
parent
87b7b0f202
commit
cee6b344b4
14 changed files with 243 additions and 47 deletions
6
Annex.hs
6
Annex.hs
|
@ -43,7 +43,6 @@ import qualified Git
|
|||
import qualified Git.Config
|
||||
import qualified Git.Construct
|
||||
import Annex.Fixup
|
||||
import Git.CatFile
|
||||
import Git.HashObject
|
||||
import Git.CheckAttr
|
||||
import Git.CheckIgnore
|
||||
|
@ -68,6 +67,7 @@ import Types.CleanupActions
|
|||
import Types.AdjustedBranch
|
||||
import Types.WorkerPool
|
||||
import Types.IndexFiles
|
||||
import Types.CatFileHandles
|
||||
import qualified Database.Keys.Handle as Keys
|
||||
import Utility.InodeCache
|
||||
import Utility.Url
|
||||
|
@ -116,7 +116,7 @@ data AnnexState = AnnexState
|
|||
, daemon :: Bool
|
||||
, branchstate :: BranchState
|
||||
, repoqueue :: Maybe (Git.Queue.Queue Annex)
|
||||
, catfilehandles :: M.Map FilePath CatFileHandle
|
||||
, catfilehandles :: CatFileHandles
|
||||
, hashobjecthandle :: Maybe HashObjectHandle
|
||||
, checkattrhandle :: Maybe CheckAttrHandle
|
||||
, checkignorehandle :: Maybe CheckIgnoreHandle
|
||||
|
@ -174,7 +174,7 @@ newState c r = do
|
|||
, daemon = False
|
||||
, branchstate = startBranchState
|
||||
, repoqueue = Nothing
|
||||
, catfilehandles = M.empty
|
||||
, catfilehandles = catFileHandlesNonConcurrent
|
||||
, hashobjecthandle = Nothing
|
||||
, checkattrhandle = Nothing
|
||||
, checkignorehandle = Nothing
|
||||
|
|
|
@ -418,8 +418,8 @@ mergeIndex :: JournalLocked -> [Git.Ref] -> Annex ()
|
|||
mergeIndex jl branches = do
|
||||
prepareModifyIndex jl
|
||||
hashhandle <- hashObjectHandle
|
||||
ch <- catFileHandle
|
||||
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
|
||||
withCatFileHandle $ \ch ->
|
||||
inRepo $ \g -> Git.UnionMerge.mergeIndex hashhandle ch g branches
|
||||
|
||||
{- Removes any stale git lock file, to avoid git falling over when
|
||||
- updating the index.
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
{- git cat-file interface, with handle automatically stored in the Annex monad
|
||||
-
|
||||
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Annex.CatFile (
|
||||
catFile,
|
||||
catFileDetails,
|
||||
|
@ -12,7 +14,7 @@ module Annex.CatFile (
|
|||
catTree,
|
||||
catCommit,
|
||||
catObjectDetails,
|
||||
catFileHandle,
|
||||
withCatFileHandle,
|
||||
catObjectMetaData,
|
||||
catFileStop,
|
||||
catKey,
|
||||
|
@ -27,6 +29,7 @@ module Annex.CatFile (
|
|||
import qualified Data.ByteString.Lazy as L
|
||||
import qualified Data.Map as M
|
||||
import System.PosixCompat.Types
|
||||
import Control.Concurrent.STM
|
||||
|
||||
import Annex.Common
|
||||
import qualified Git
|
||||
|
@ -39,64 +42,94 @@ import qualified Git.Ref
|
|||
import Annex.Link
|
||||
import Annex.CurrentBranch
|
||||
import Types.AdjustedBranch
|
||||
import Types.CatFileHandles
|
||||
import Utility.ResourcePool
|
||||
|
||||
catFile :: Git.Branch -> RawFilePath -> Annex L.ByteString
|
||||
catFile branch file = do
|
||||
h <- catFileHandle
|
||||
catFile branch file = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catFile h branch file
|
||||
|
||||
catFileDetails :: Git.Branch -> RawFilePath -> Annex (Maybe (L.ByteString, Sha, ObjectType))
|
||||
catFileDetails branch file = do
|
||||
h <- catFileHandle
|
||||
catFileDetails branch file = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catFileDetails h branch file
|
||||
|
||||
catObject :: Git.Ref -> Annex L.ByteString
|
||||
catObject ref = do
|
||||
h <- catFileHandle
|
||||
catObject ref = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catObject h ref
|
||||
|
||||
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
|
||||
catObjectMetaData ref = do
|
||||
h <- catFileHandle
|
||||
catObjectMetaData ref = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catObjectMetaData h ref
|
||||
|
||||
catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
|
||||
catTree ref = do
|
||||
h <- catFileHandle
|
||||
catTree ref = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catTree h ref
|
||||
|
||||
catCommit :: Git.Ref -> Annex (Maybe Commit)
|
||||
catCommit ref = do
|
||||
h <- catFileHandle
|
||||
catCommit ref = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catCommit h ref
|
||||
|
||||
catObjectDetails :: Git.Ref -> Annex (Maybe (L.ByteString, Sha, ObjectType))
|
||||
catObjectDetails ref = do
|
||||
h <- catFileHandle
|
||||
catObjectDetails ref = withCatFileHandle $ \h ->
|
||||
liftIO $ Git.CatFile.catObjectDetails h ref
|
||||
|
||||
{- There can be multiple index files, and a different cat-file is needed
|
||||
- for each. This is selected by setting GIT_INDEX_FILE in the gitEnv. -}
|
||||
catFileHandle :: Annex Git.CatFile.CatFileHandle
|
||||
catFileHandle = do
|
||||
m <- Annex.getState Annex.catfilehandles
|
||||
- for each. That is selected by setting GIT_INDEX_FILE in the gitEnv
|
||||
- before running this. -}
|
||||
withCatFileHandle :: (Git.CatFile.CatFileHandle -> Annex a) -> Annex a
|
||||
withCatFileHandle a = do
|
||||
cfh <- Annex.getState Annex.catfilehandles
|
||||
indexfile <- fromMaybe "" . maybe Nothing (lookup indexEnv)
|
||||
<$> fromRepo gitEnv
|
||||
case M.lookup indexfile m of
|
||||
Just h -> return h
|
||||
Nothing -> do
|
||||
h <- inRepo Git.CatFile.catFileStart
|
||||
let m' = M.insert indexfile h m
|
||||
Annex.changeState $ \s -> s { Annex.catfilehandles = m' }
|
||||
return h
|
||||
p <- case cfh of
|
||||
CatFileHandlesNonConcurrent m -> case M.lookup indexfile m of
|
||||
Just p -> return p
|
||||
Nothing -> do
|
||||
p <- mkResourcePoolNonConcurrent startcatfile
|
||||
let !m' = M.insert indexfile p m
|
||||
Annex.changeState $ \s -> s { Annex.catfilehandles = CatFileHandlesNonConcurrent m' }
|
||||
return p
|
||||
CatFileHandlesPool tm -> do
|
||||
m <- liftIO $ atomically $ takeTMVar tm
|
||||
case M.lookup indexfile m of
|
||||
Just p -> do
|
||||
liftIO $ atomically $ putTMVar tm m
|
||||
return p
|
||||
Nothing -> do
|
||||
p <- mkResourcePool maxCatFiles
|
||||
let !m' = M.insert indexfile p m
|
||||
liftIO $ atomically $ putTMVar tm m'
|
||||
return p
|
||||
withResourcePool p startcatfile a
|
||||
where
|
||||
startcatfile = inRepo Git.CatFile.catFileStart
|
||||
|
||||
{- A lot of git cat-file processes are unlikely to improve concurrency,
|
||||
- because a query to them takes only a little bit of CPU, and tends to be
|
||||
- bottlenecked on disk. Also, they each open a number of files, so
|
||||
- using too many might run out of file handles. So, only start a maximum
|
||||
- of 2.
|
||||
-
|
||||
- Note that each different index file gets its own pool of cat-files;
|
||||
- this is the size of each pool. In all, 4 times this many cat-files
|
||||
- may end up running.
|
||||
-}
|
||||
maxCatFiles :: Int
|
||||
maxCatFiles = 2
|
||||
|
||||
{- Stops all running cat-files. Should only be run when it's known that
|
||||
- nothing is using the handles, eg at shutdown. -}
|
||||
catFileStop :: Annex ()
|
||||
catFileStop = do
|
||||
m <- Annex.withState $ pure . \s ->
|
||||
(s { Annex.catfilehandles = M.empty }, Annex.catfilehandles s)
|
||||
liftIO $ mapM_ Git.CatFile.catFileStop (M.elems m)
|
||||
cfh <- Annex.getState Annex.catfilehandles
|
||||
m <- case cfh of
|
||||
CatFileHandlesNonConcurrent m -> do
|
||||
Annex.changeState $ \s -> s { Annex.catfilehandles = CatFileHandlesNonConcurrent M.empty }
|
||||
return m
|
||||
CatFileHandlesPool tm ->
|
||||
liftIO $ atomically $ swapTMVar tm M.empty
|
||||
liftIO $ forM_ (M.elems m) $ \p ->
|
||||
freeResourcePool p Git.CatFile.catFileStop
|
||||
|
||||
{- From ref to a symlink or a pointer file, get the key. -}
|
||||
catKey :: Ref -> Annex (Maybe Key)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex concurrent state
|
||||
-
|
||||
- Copyright 2015-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2015-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -11,13 +11,29 @@ import Annex
|
|||
import Annex.Common
|
||||
import qualified Annex.Queue
|
||||
import Annex.Action
|
||||
import Types.Concurrency
|
||||
import Types.WorkerPool
|
||||
import Types.CatFileHandles
|
||||
import Remote.List
|
||||
|
||||
import Control.Concurrent
|
||||
import Control.Concurrent.STM
|
||||
import qualified Data.Map as M
|
||||
|
||||
setConcurrency :: Concurrency -> Annex ()
|
||||
setConcurrency NonConcurrent = Annex.changeState $ \s -> s
|
||||
{ Annex.concurrency = NonConcurrent
|
||||
}
|
||||
setConcurrency c = do
|
||||
cfh <- Annex.getState Annex.catfilehandles
|
||||
cfh' <- case cfh of
|
||||
CatFileHandlesNonConcurrent _ -> liftIO catFileHandlesPool
|
||||
CatFileHandlesPool _ -> pure cfh
|
||||
Annex.changeState $ \s -> s
|
||||
{ Annex.concurrency = c
|
||||
, Annex.catfilehandles = cfh'
|
||||
}
|
||||
|
||||
{- Allows forking off a thread that uses a copy of the current AnnexState
|
||||
- to run an Annex action.
|
||||
-
|
||||
|
@ -50,11 +66,17 @@ dupState = do
|
|||
_ <- remoteList
|
||||
|
||||
st <- Annex.getState id
|
||||
return $ st
|
||||
-- Make sure that concurrency is enabled, if it was not already,
|
||||
-- so the resource pools are set up.
|
||||
st' <- case Annex.concurrency st of
|
||||
NonConcurrent -> do
|
||||
setConcurrency (Concurrent 1)
|
||||
Annex.getState id
|
||||
_ -> return st
|
||||
return $ st'
|
||||
-- each thread has its own repoqueue
|
||||
{ Annex.repoqueue = Nothing
|
||||
-- avoid sharing eg, open file handles
|
||||
, Annex.catfilehandles = M.empty
|
||||
-- avoid sharing open file handles
|
||||
, Annex.checkattrhandle = Nothing
|
||||
, Annex.checkignorehandle = Nothing
|
||||
}
|
||||
|
|
|
@ -15,6 +15,8 @@ git-annex (8.20200331) UNRELEASED; urgency=medium
|
|||
for unlocked files, which already worked for locked files.
|
||||
* Avoid repeatedly opening keys db when accessing a local git remote
|
||||
and -J is used.
|
||||
* Avoid running a large number of git cat-file child processes when run
|
||||
with a large -J value.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 30 Mar 2020 15:58:34 -0400
|
||||
|
||||
|
|
|
@ -199,8 +199,7 @@ startConcurrency :: UsedStages -> Annex a -> Annex a
|
|||
startConcurrency usedstages a = do
|
||||
fromcmdline <- Annex.getState Annex.concurrency
|
||||
fromgitcfg <- annexJobs <$> Annex.getGitConfig
|
||||
let usegitcfg = Annex.changeState $
|
||||
\c -> c { Annex.concurrency = fromgitcfg }
|
||||
let usegitcfg = setConcurrency fromgitcfg
|
||||
case (fromcmdline, fromgitcfg) of
|
||||
(NonConcurrent, NonConcurrent) -> a
|
||||
(Concurrent n, _) ->
|
||||
|
|
|
@ -36,6 +36,7 @@ import CmdLine.GlobalSetter
|
|||
import qualified Backend
|
||||
import qualified Types.Backend as Backend
|
||||
import Utility.HumanTime
|
||||
import Annex.Concurrent
|
||||
|
||||
-- Global options that are accepted by all git-annex sub-commands,
|
||||
-- although not always used.
|
||||
|
@ -395,7 +396,7 @@ jsonProgressOption =
|
|||
-- action in `allowConcurrentOutput`.
|
||||
jobsOption :: [GlobalOption]
|
||||
jobsOption =
|
||||
[ globalSetter set $
|
||||
[ globalSetter setConcurrency $
|
||||
option (maybeReader parseConcurrency)
|
||||
( long "jobs" <> short 'J'
|
||||
<> metavar (paramNumber `paramOr` "cpus")
|
||||
|
@ -403,8 +404,6 @@ jobsOption =
|
|||
<> hidden
|
||||
)
|
||||
]
|
||||
where
|
||||
set v = Annex.changeState $ \s -> s { Annex.concurrency = v }
|
||||
|
||||
timeLimitOption :: [GlobalOption]
|
||||
timeLimitOption =
|
||||
|
|
30
Types/CatFileHandles.hs
Normal file
30
Types/CatFileHandles.hs
Normal file
|
@ -0,0 +1,30 @@
|
|||
{- git-cat file handles pools
|
||||
-
|
||||
- Copyright 2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Types.CatFileHandles (
|
||||
CatFileHandles(..),
|
||||
catFileHandlesNonConcurrent,
|
||||
catFileHandlesPool,
|
||||
) where
|
||||
|
||||
import Control.Concurrent.STM
|
||||
import qualified Data.Map as M
|
||||
|
||||
import Utility.ResourcePool
|
||||
import Git.CatFile (CatFileHandle)
|
||||
|
||||
data CatFileHandles
|
||||
= CatFileHandlesNonConcurrent CatMap
|
||||
| CatFileHandlesPool (TMVar CatMap)
|
||||
|
||||
type CatMap = M.Map FilePath (ResourcePool CatFileHandle)
|
||||
|
||||
catFileHandlesNonConcurrent :: CatFileHandles
|
||||
catFileHandlesNonConcurrent = CatFileHandlesNonConcurrent M.empty
|
||||
|
||||
catFileHandlesPool :: IO CatFileHandles
|
||||
catFileHandlesPool = CatFileHandlesPool <$> newTMVarIO M.empty
|
|
@ -11,6 +11,7 @@ import Utility.PartialPrelude
|
|||
-- the former specifies 1 job of each particular kind, but there can be
|
||||
-- more than one kind of job running concurrently.
|
||||
data Concurrency = NonConcurrent | Concurrent Int | ConcurrentPerCpu
|
||||
deriving (Eq)
|
||||
|
||||
parseConcurrency :: String -> Maybe Concurrency
|
||||
parseConcurrency "cpus" = Just ConcurrentPerCpu
|
||||
|
|
|
@ -9,7 +9,8 @@
|
|||
{-# LANGUAGE CPP #-}
|
||||
|
||||
module Utility.CoProcess (
|
||||
CoProcessHandle,
|
||||
CoProcessHandle(..),
|
||||
CoProcessState(..),
|
||||
start,
|
||||
stop,
|
||||
query,
|
||||
|
|
94
Utility/ResourcePool.hs
Normal file
94
Utility/ResourcePool.hs
Normal file
|
@ -0,0 +1,94 @@
|
|||
{- Resource pools.
|
||||
-
|
||||
- Copyright 2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Utility.ResourcePool (
|
||||
ResourcePool,
|
||||
mkResourcePool,
|
||||
mkResourcePoolNonConcurrent,
|
||||
withResourcePool,
|
||||
freeResourcePool,
|
||||
) where
|
||||
|
||||
import Common
|
||||
|
||||
import Control.Concurrent.STM
|
||||
import Control.Monad.IO.Class
|
||||
import Data.Either
|
||||
|
||||
data ResourcePool r
|
||||
= ResourcePool Int (TVar Int) (TVar [r])
|
||||
| ResourcePoolNonConcurrent r
|
||||
|
||||
{- Make a new resource pool, that can grow to contain the specified number
|
||||
- of resources. -}
|
||||
mkResourcePool :: MonadIO m => Int -> m (ResourcePool r)
|
||||
mkResourcePool maxsz = liftIO $
|
||||
ResourcePool maxsz
|
||||
<$> newTVarIO 0
|
||||
<*> newTVarIO []
|
||||
|
||||
{- When there will not be multiple threads that may
|
||||
- may concurrently try to use it, using this is more
|
||||
- efficient than mkResourcePool.
|
||||
-}
|
||||
mkResourcePoolNonConcurrent :: (MonadMask m, MonadIO m) => m r -> m (ResourcePool r)
|
||||
mkResourcePoolNonConcurrent allocresource =
|
||||
ResourcePoolNonConcurrent <$> allocresource
|
||||
|
||||
{- Runs an action with a resource.
|
||||
-
|
||||
- If no free resource is available in the pool,
|
||||
- will run the action the allocate a new resource if the pool's size
|
||||
- allows. Or will block a resource becomes available to use.
|
||||
-
|
||||
- The resource is returned to the pool at the end.
|
||||
-}
|
||||
withResourcePool :: (MonadMask m, MonadIO m) => ResourcePool r -> m r -> (r -> m a) -> m a
|
||||
withResourcePool (ResourcePoolNonConcurrent r) _ a = a r
|
||||
withResourcePool (ResourcePool maxsz currsz p) allocresource a =
|
||||
bracket setup cleanup a
|
||||
where
|
||||
setup = do
|
||||
mr <- liftIO $ atomically $ do
|
||||
l <- readTVar p
|
||||
case l of
|
||||
(r:rs) -> do
|
||||
writeTVar p rs
|
||||
return (Just r)
|
||||
[] -> do
|
||||
n <- readTVar currsz
|
||||
if n < maxsz
|
||||
then do
|
||||
let !n' = succ n
|
||||
writeTVar currsz n'
|
||||
return Nothing
|
||||
else retry
|
||||
case mr of
|
||||
Just r -> return r
|
||||
Nothing -> allocresource
|
||||
cleanup r = liftIO $ atomically $ modifyTVar' p (r:)
|
||||
|
||||
{- Frees all resources in use in the pool, running the supplied action on
|
||||
- each. (If any of the actions throw an exception, it will be rethrown
|
||||
- after all the actions have completed.)
|
||||
-
|
||||
- The pool should not have any resources in use when this is called,
|
||||
- and the pool should not be used again after calling this.
|
||||
-}
|
||||
freeResourcePool :: (MonadMask m, MonadIO m) => ResourcePool r -> (r -> m ()) -> m ()
|
||||
freeResourcePool (ResourcePoolNonConcurrent r) freeresource = freeresource r
|
||||
freeResourcePool (ResourcePool _ currsz p) freeresource = do
|
||||
rs <- liftIO $ atomically $ do
|
||||
writeTVar currsz 0
|
||||
swapTVar p []
|
||||
res <- forM rs $ tryNonAsync . freeresource
|
||||
case lefts res of
|
||||
[] -> return ()
|
||||
(e:_) -> throwM e
|
||||
|
|
@ -50,5 +50,3 @@ I will try to get a chance to troubleshoot it more to provide possibly more deta
|
|||
|
||||
[[!meta author=yoh]]
|
||||
[[!tag projects/datalad]]
|
||||
|
||||
[[!tag moreinfo]]
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 10"""
|
||||
date="2020-04-20T17:24:53Z"
|
||||
content="""
|
||||
Implemented the cat-file pool. Capped at 2 cat-files of each distinct type,
|
||||
so it will start a max of 8 no matter the -J level.
|
||||
|
||||
(Although cat-file can also be run in those repositories so there will be
|
||||
more then.)
|
||||
|
||||
While testing, I noticed git-anenx drop -Jn starts n git check-attr
|
||||
processes, so the same thing ought to be done with them. Leaving this bug open
|
||||
for that, but I do think that the problem you reported should be fixed now.
|
||||
"""]]
|
|
@ -975,6 +975,7 @@ Executable git-annex
|
|||
Types.Backend
|
||||
Types.Benchmark
|
||||
Types.BranchState
|
||||
Types.CatFileHandles
|
||||
Types.CleanupActions
|
||||
Types.Command
|
||||
Types.Concurrency
|
||||
|
@ -1089,6 +1090,7 @@ Executable git-annex
|
|||
Utility.Process.Transcript
|
||||
Utility.QuickCheck
|
||||
Utility.RawFilePath
|
||||
Utility.ResourcePool
|
||||
Utility.Rsync
|
||||
Utility.SafeCommand
|
||||
Utility.Scheduled
|
||||
|
|
Loading…
Add table
Reference in a new issue