make catObjectStream support newline and carriage return in filenames

Turns out the %(rest) trick was not needed. Instead, just maintain a
list of files we've asked for, and each cat-file response is for the
next file in the list.

This actually benchmarks 25% faster than before! Very surprising, but it
must be due to needing to shove less data through the pipe, and parse
less.
This commit is contained in:
Joey Hess 2020-07-08 12:34:56 -04:00
parent 2cf6717aec
commit de3d7d044d
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 34 additions and 37 deletions

View file

@ -30,9 +30,8 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
* Sped up the --all option by 2x to 16x by using git cat-file --buffer. * Sped up the --all option by 2x to 16x by using git cat-file --buffer.
Thanks to Lukey for finding this optimisation. Thanks to Lukey for finding this optimisation.
* fsck: Detect if WORM keys contain a carriage return, and recommend * fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading to fix the problem, because the --all optimisation above upgrading the key. (git-annex could have maybe created such keys back
skips any such keys that might exist (git-annex could have maybe in 2013).
created such keys back in 2013).
-- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400 -- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400

View file

@ -6,6 +6,7 @@
-} -}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Git.CatFile ( module Git.CatFile (
CatFileHandle, CatFileHandle,
@ -28,13 +29,15 @@ import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Char8 as S8
import qualified Data.Attoparsec.ByteString as A import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8 import qualified Data.Attoparsec.ByteString.Char8 as A8
import qualified Data.Map as M import qualified Data.Map.Strict as M
import Data.String import Data.String
import Data.Char import Data.Char
import Numeric import Numeric
import System.Posix.Types import System.Posix.Types
import Text.Read import Text.Read
import Control.Concurrent.Async import Control.Concurrent.Async
import Control.Concurrent.STM
import Control.Monad.IO.Class (MonadIO)
import Common import Common
import Git import Git
@ -47,7 +50,7 @@ import Git.HashObject
import qualified Git.LsTree as LsTree import qualified Git.LsTree as LsTree
import qualified Utility.CoProcess as CoProcess import qualified Utility.CoProcess as CoProcess
import Utility.Tuple import Utility.Tuple
import Control.Monad.IO.Class (MonadIO) import Utility.TList
data CatFileHandle = CatFileHandle data CatFileHandle = CatFileHandle
{ catFileProcess :: CoProcess.CoProcessHandle { catFileProcess :: CoProcess.CoProcessHandle
@ -284,8 +287,8 @@ parseCommit b = Commit
- as efficiently as possible. This is much faster than querying it - as efficiently as possible. This is much faster than querying it
- repeatedly per file. - repeatedly per file.
- -
- Any files with a newline or carriage return in their name will be - (Note that, while a more polymorphic version of this can be written,
- skipped, because the interface does not support them. - this version is faster, possibly due to being less polymorphic.)
-} -}
catObjectStream catObjectStream
:: (MonadMask m, MonadIO m) :: (MonadMask m, MonadIO m)
@ -294,53 +297,46 @@ catObjectStream
-> Repo -> Repo
-> (IO (Maybe (TopFilePath, L.ByteString)) -> m ()) -> (IO (Maybe (TopFilePath, L.ByteString)) -> m ())
-> m () -> m ()
catObjectStream l want repo a = assertLocal repo $ catObjectStream l want repo a = assertLocal repo $ do
bracketIO start stop $ \(_, _, hout, _) -> a (reader hout) bracketIO start stop $ \(mv, _, _, hout, _) -> a (reader mv hout)
where where
feeder h = do feeder mv h = do
forM_ l $ \ti -> forM_ l $ \ti ->
when (want ti) $ do when (want ti) $ do
let f = getTopFilePath (LsTree.file ti) let f = LsTree.file ti
-- skip files with newlines or carriage returns liftIO $ atomically $ snocTList mv f
unless (any (`S8.elem` f) ['\n', '\r']) $ S8.hPutStrLn h (fromRef' (LsTree.sha ti))
S8.hPutStrLn h $
fromRef' (LsTree.sha ti) <> " " <> f
hClose h hClose h
reader h = ifM (hIsEOF h) reader mv h = ifM (hIsEOF h)
( return Nothing ( return Nothing
, do , do
f <- liftIO $ atomically $ headTList mv
resp <- S8.hGetLine h resp <- S8.hGetLine h
case eitherToMaybe $ A.parseOnly respparser resp of case eitherToMaybe $ A.parseOnly respParser resp of
Nothing -> error $ "unknown response from git cat-file " ++ show resp Just r -> do
Just (r, f) -> do
content <- readObjectContent h r content <- readObjectContent h r
return (Just (asTopFilePath f, content)) return (Just (f, content))
Nothing -> error $ "unknown response from git cat-file " ++ show resp
) )
params = params =
[ Param "cat-file" [ Param "cat-file"
-- %(rest) is used to feed the filename through , Param ("--batch=" ++ batchFormat)
-- cat-file; it will be at the end of the response
, Param ("--batch=" ++ batchFormat ++ " %(rest)")
, Param "--buffer" , Param "--buffer"
] ]
respparser = (,)
<$> respParser
<* A8.char ' '
<*> A.takeByteString
start = do start = do
mv <- liftIO $ atomically newTList
let p = gitCreateProcess params repo let p = gitCreateProcess params repo
(Just hin, Just hout, _, pid) <- createProcess p (Just hin, Just hout, _, pid) <- createProcess p
{ std_in = CreatePipe { std_in = CreatePipe
, std_out = CreatePipe , std_out = CreatePipe
} }
f <- async (feeder hin) f <- async (feeder mv hin)
return (f, hin, hout, pid) return (mv, f, hin, hout, pid)
stop (f, hin, hout, pid) = do stop (_, f, hin, hout, pid) = do
cancel f cancel f
hClose hin hClose hin
hClose hout hClose hout

View file

@ -6,7 +6,9 @@
- Unlike a TQueue, the entire contents of a TList can be efficiently - Unlike a TQueue, the entire contents of a TList can be efficiently
- read without modifying it. - read without modifying it.
- -
- Copyright 2013 Joey Hess <id@joeyh.name> - Copyright 2013-2020 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-} -}
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
@ -21,6 +23,7 @@ module Utility.TList (
consTList, consTList,
snocTList, snocTList,
appendTList, appendTList,
headTList,
) where ) where
import Common import Common
@ -77,3 +80,6 @@ snocTList tlist v = modifyTList tlist $ \dl -> D.snoc dl v
appendTList :: TList a -> [a] -> STM () appendTList :: TList a -> [a] -> STM ()
appendTList tlist l = modifyTList tlist $ \dl -> D.append dl (D.fromList l) appendTList tlist l = modifyTList tlist $ \dl -> D.append dl (D.fromList l)
headTList :: TList a -> STM a
headTList tlist = D.head <$> readTMVar tlist

View file

@ -16,16 +16,12 @@ be as good as --all's was, but it could still be significant.
> to get file blob, through cat-file to get key, through cat-file to > to get file blob, through cat-file to get key, through cat-file to
> precache logs. > precache logs.
catObjectStream not supporting newline or carriage return needs to be dealt
with somehow first, because worktree filenames can contain either.
One odd edge case is, could there be a worktree file that refers to a key One odd edge case is, could there be a worktree file that refers to a key
with no location log? In that case, catObjectStream would skip it. This with no location log? In that case, catObjectStream would skip it. This
doesn't usually happen. One case where it does happen is if the git-annex doesn't usually happen. One case where it does happen is if the git-annex
branch is not pulled, but master is. branch is not pulled, but master is.
Perhaps both the newline and the missing location log could be dealt with Perhaps make catObjectStream not skip them, but return an item
together, by making catObjectStream not skip them, but return an item
with no log file content. It's important things not be reordered when doing with no log file content. It's important things not be reordered when doing
that -- could a dummy item somehow be passed through cat-file to represent that -- could a dummy item somehow be passed through cat-file to represent
these problem cases? these problem cases?