separate handles for cat-file and cat-file --batch-check

This avoids starting one process when only the other one is needed.
Eg in git-annex smudge --clean, this reduces the total number of
cat-file processes that are started from 4 to 2.

The only performance penalty is that when both are needed, it has to do
twice as much work to maintain the two Maps. But both are very small,
consisting of 1 or 2 items, so that work is negligible.

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-09-24 13:16:13 -04:00
parent e6c0bbd645
commit e47b4badb3
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 82 additions and 37 deletions

View file

@ -1,6 +1,6 @@
{- git cat-file interface, with handle automatically stored in the Annex monad {- git cat-file interface, with handle automatically stored in the Annex monad
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -58,7 +58,7 @@ catObject ref = withCatFileHandle $ \h ->
liftIO $ Git.CatFile.catObject h ref liftIO $ Git.CatFile.catObject h ref
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType)) catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaData ref = withCatFileHandle $ \h -> catObjectMetaData ref = withCatFileMetaDataHandle $ \h ->
liftIO $ Git.CatFile.catObjectMetaData h ref liftIO $ Git.CatFile.catObjectMetaData h ref
catTree :: Git.Ref -> Annex [(FilePath, FileMode)] catTree :: Git.Ref -> Annex [(FilePath, FileMode)]
@ -77,32 +77,50 @@ catObjectDetails ref = withCatFileHandle $ \h ->
- for each. That is selected by setting GIT_INDEX_FILE in the gitEnv - for each. That is selected by setting GIT_INDEX_FILE in the gitEnv
- before running this. -} - before running this. -}
withCatFileHandle :: (Git.CatFile.CatFileHandle -> Annex a) -> Annex a withCatFileHandle :: (Git.CatFile.CatFileHandle -> Annex a) -> Annex a
withCatFileHandle a = do withCatFileHandle = withCatFileHandle'
Git.CatFile.catFileStart
catFileMap
(\v m -> v { catFileMap = m })
withCatFileMetaDataHandle :: (Git.CatFile.CatFileMetaDataHandle -> Annex a) -> Annex a
withCatFileMetaDataHandle = withCatFileHandle'
Git.CatFile.catFileMetaDataStart
catFileMetaDataMap
(\v m -> v { catFileMetaDataMap = m })
withCatFileHandle'
:: (Repo -> IO hdl)
-> (CatMap -> M.Map FilePath (ResourcePool hdl))
-> (CatMap -> M.Map FilePath (ResourcePool hdl) -> CatMap)
-> (hdl -> Annex a)
-> Annex a
withCatFileHandle' startcat get set a = do
cfh <- Annex.getState Annex.catfilehandles cfh <- Annex.getState Annex.catfilehandles
indexfile <- fromMaybe "" . maybe Nothing (lookup indexEnv) indexfile <- fromMaybe "" . maybe Nothing (lookup indexEnv)
<$> fromRepo gitEnv <$> fromRepo gitEnv
p <- case cfh of p <- case cfh of
CatFileHandlesNonConcurrent m -> case M.lookup indexfile m of CatFileHandlesNonConcurrent m -> case M.lookup indexfile (get m) of
Just p -> return p Just p -> return p
Nothing -> do Nothing -> do
p <- mkResourcePoolNonConcurrent startcatfile p <- mkResourcePoolNonConcurrent startcatfile
let !m' = M.insert indexfile p m let !m' = set m (M.insert indexfile p (get m))
Annex.changeState $ \s -> s { Annex.catfilehandles = CatFileHandlesNonConcurrent m' } Annex.changeState $ \s -> s
{ Annex.catfilehandles = CatFileHandlesNonConcurrent m' }
return p return p
CatFileHandlesPool tm -> do CatFileHandlesPool tm -> do
m <- liftIO $ atomically $ takeTMVar tm m <- liftIO $ atomically $ takeTMVar tm
case M.lookup indexfile m of case M.lookup indexfile (get m) of
Just p -> do Just p -> do
liftIO $ atomically $ putTMVar tm m liftIO $ atomically $ putTMVar tm m
return p return p
Nothing -> do Nothing -> do
p <- mkResourcePool maxCatFiles p <- mkResourcePool maxCatFiles
let !m' = M.insert indexfile p m let !m' = set m (M.insert indexfile p (get m))
liftIO $ atomically $ putTMVar tm m' liftIO $ atomically $ putTMVar tm m'
return p return p
withResourcePool p startcatfile a withResourcePool p startcatfile a
where where
startcatfile = inRepo Git.CatFile.catFileStart startcatfile = inRepo startcat
{- A lot of git cat-file processes are unlikely to improve concurrency, {- A lot of git cat-file processes are unlikely to improve concurrency,
- because a query to them takes only a little bit of CPU, and tends to be - because a query to them takes only a little bit of CPU, and tends to be
@ -124,12 +142,14 @@ catFileStop = do
cfh <- Annex.getState Annex.catfilehandles cfh <- Annex.getState Annex.catfilehandles
m <- case cfh of m <- case cfh of
CatFileHandlesNonConcurrent m -> do CatFileHandlesNonConcurrent m -> do
Annex.changeState $ \s -> s { Annex.catfilehandles = CatFileHandlesNonConcurrent M.empty } Annex.changeState $ \s -> s { Annex.catfilehandles = CatFileHandlesNonConcurrent emptyCatMap }
return m return m
CatFileHandlesPool tm -> CatFileHandlesPool tm ->
liftIO $ atomically $ swapTMVar tm M.empty liftIO $ atomically $ swapTMVar tm emptyCatMap
liftIO $ forM_ (M.elems m) $ \p -> liftIO $ forM_ (M.elems (catFileMap m)) $ \p ->
freeResourcePool p Git.CatFile.catFileStop freeResourcePool p Git.CatFile.catFileStop
liftIO $ forM_ (M.elems (catFileMetaDataMap m)) $ \p ->
freeResourcePool p Git.CatFile.catFileMetaDataStop
{- From ref to a symlink or a pointer file, get the key. -} {- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key) catKey :: Ref -> Annex (Maybe Key)

View file

@ -1,6 +1,6 @@
{- git cat-file interface {- git cat-file interface
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -10,9 +10,13 @@
module Git.CatFile ( module Git.CatFile (
CatFileHandle, CatFileHandle,
CatFileMetaDataHandle,
catFileStart, catFileStart,
catFileMetaDataStart,
catFileStart', catFileStart',
catFileMetaDataStart',
catFileStop, catFileStop,
catFileMetaDataStop,
catFile, catFile,
catFileDetails, catFileDetails,
catTree, catTree,
@ -55,8 +59,12 @@ import Utility.Tuple
data CatFileHandle = CatFileHandle data CatFileHandle = CatFileHandle
{ catFileProcess :: CoProcess.CoProcessHandle { catFileProcess :: CoProcess.CoProcessHandle
, checkFileProcess :: CoProcess.CoProcessHandle , catFileGitRepo :: Repo
, gitRepo :: Repo }
data CatFileMetaDataHandle = CatFileMetaDataHandle
{ checkFileProcess :: CoProcess.CoProcessHandle
, checkFileGitRepo :: Repo
} }
catFileStart :: Repo -> IO CatFileHandle catFileStart :: Repo -> IO CatFileHandle
@ -64,22 +72,31 @@ catFileStart = catFileStart' True
catFileStart' :: Bool -> Repo -> IO CatFileHandle catFileStart' :: Bool -> Repo -> IO CatFileHandle
catFileStart' restartable repo = CatFileHandle catFileStart' restartable repo = CatFileHandle
<$> startp "--batch" <$> startcat restartable repo "--batch"
<*> startp ("--batch-check=" ++ batchFormat) <*> pure repo
catFileMetaDataStart :: Repo -> IO CatFileMetaDataHandle
catFileMetaDataStart = catFileMetaDataStart' True
catFileMetaDataStart' :: Bool -> Repo -> IO CatFileMetaDataHandle
catFileMetaDataStart' restartable repo = CatFileMetaDataHandle
<$> startcat restartable repo ("--batch-check=" ++ batchFormat)
<*> pure repo <*> pure repo
where
startp p = gitCoProcessStart restartable
[ Param "cat-file"
, Param p
] repo
batchFormat :: String batchFormat :: String
batchFormat = "%(objectname) %(objecttype) %(objectsize)" batchFormat = "%(objectname) %(objecttype) %(objectsize)"
startcat :: Bool -> Repo -> String -> IO CoProcess.CoProcessHandle
startcat restartable repo p = gitCoProcessStart restartable
[ Param "cat-file"
, Param p
] repo
catFileStop :: CatFileHandle -> IO () catFileStop :: CatFileHandle -> IO ()
catFileStop h = do catFileStop = CoProcess.stop . catFileProcess
CoProcess.stop (catFileProcess h)
CoProcess.stop (checkFileProcess h) catFileMetaDataStop :: CatFileMetaDataHandle -> IO ()
catFileMetaDataStop = CoProcess.stop . checkFileProcess
{- Reads a file from a specified branch. -} {- Reads a file from a specified branch. -}
catFile :: CatFileHandle -> Branch -> RawFilePath -> IO L.ByteString catFile :: CatFileHandle -> Branch -> RawFilePath -> IO L.ByteString
@ -106,16 +123,16 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
Nothing -> error $ "unknown response from git cat-file " ++ show (header, object) Nothing -> error $ "unknown response from git cat-file " ++ show (header, object)
where where
-- Slow fallback path for filenames containing newlines. -- Slow fallback path for filenames containing newlines.
newlinefallback = queryObjectType object (gitRepo h) >>= \case newlinefallback = queryObjectType object (catFileGitRepo h) >>= \case
Nothing -> return Nothing Nothing -> return Nothing
Just objtype -> queryContent object (gitRepo h) >>= \case Just objtype -> queryContent object (catFileGitRepo h) >>= \case
Nothing -> return Nothing Nothing -> return Nothing
Just content -> do Just content -> do
-- only the --batch interface allows getting -- only the --batch interface allows getting
-- the sha, so have to re-hash the object -- the sha, so have to re-hash the object
sha <- hashObject' objtype sha <- hashObject' objtype
(flip L.hPut content) (flip L.hPut content)
(gitRepo h) (catFileGitRepo h)
return (Just (content, sha, objtype)) return (Just (content, sha, objtype))
readObjectContent :: Handle -> ParsedResp -> IO L.ByteString readObjectContent :: Handle -> ParsedResp -> IO L.ByteString
@ -131,7 +148,7 @@ readObjectContent h (ParsedResp _ _ size) = do
readObjectContent _ DNE = error "internal" readObjectContent _ DNE = error "internal"
{- Gets the size and type of an object, without reading its content. -} {- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType)) catObjectMetaData :: CatFileMetaDataHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- S8.hGetLine from resp <- S8.hGetLine from
case parseResp object resp of case parseResp object resp of
@ -142,9 +159,9 @@ catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $
where where
-- Slow fallback path for filenames containing newlines. -- Slow fallback path for filenames containing newlines.
newlinefallback = do newlinefallback = do
sha <- Git.Ref.sha object (gitRepo h) sha <- Git.Ref.sha object (checkFileGitRepo h)
sz <- querySize object (gitRepo h) sz <- querySize object (checkFileGitRepo h)
objtype <- queryObjectType object (gitRepo h) objtype <- queryObjectType object (checkFileGitRepo h)
return $ (,,) <$> sha <*> sz <*> objtype return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha ObjectType FileSize | DNE data ParsedResp = ParsedResp Sha ObjectType FileSize | DNE

View file

@ -1,6 +1,6 @@
{- git-cat file handles pools {- git-cat file handles pools
- -
- Copyright 2020 Joey Hess <id@joeyh.name> - Copyright 2020-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -9,22 +9,30 @@ module Types.CatFileHandles (
CatFileHandles(..), CatFileHandles(..),
catFileHandlesNonConcurrent, catFileHandlesNonConcurrent,
catFileHandlesPool, catFileHandlesPool,
CatMap(..),
emptyCatMap,
) where ) where
import Control.Concurrent.STM import Control.Concurrent.STM
import qualified Data.Map as M import qualified Data.Map as M
import Utility.ResourcePool import Utility.ResourcePool
import Git.CatFile (CatFileHandle) import Git.CatFile (CatFileHandle, CatFileMetaDataHandle)
data CatFileHandles data CatFileHandles
= CatFileHandlesNonConcurrent CatMap = CatFileHandlesNonConcurrent CatMap
| CatFileHandlesPool (TMVar CatMap) | CatFileHandlesPool (TMVar CatMap)
type CatMap = M.Map FilePath (ResourcePool CatFileHandle) data CatMap = CatMap
{ catFileMap :: M.Map FilePath (ResourcePool CatFileHandle)
, catFileMetaDataMap :: M.Map FilePath (ResourcePool CatFileMetaDataHandle)
}
emptyCatMap :: CatMap
emptyCatMap = CatMap M.empty M.empty
catFileHandlesNonConcurrent :: CatFileHandles catFileHandlesNonConcurrent :: CatFileHandles
catFileHandlesNonConcurrent = CatFileHandlesNonConcurrent M.empty catFileHandlesNonConcurrent = CatFileHandlesNonConcurrent emptyCatMap
catFileHandlesPool :: IO CatFileHandles catFileHandlesPool :: IO CatFileHandles
catFileHandlesPool = CatFileHandlesPool <$> newTMVarIO M.empty catFileHandlesPool = CatFileHandlesPool <$> newTMVarIO emptyCatMap