make catObjectStream support newline and carriage return in filenames
Turns out the %(rest) trick was not needed. Instead, just maintain a list of files we've asked for, and each cat-file response is for the next file in the list. This actually benchmarks 25% faster than before! Very surprising, but it must be due to needing to shove less data through the pipe, and parse less.
This commit is contained in:
parent
2cf6717aec
commit
de3d7d044d
4 changed files with 34 additions and 37 deletions
|
@ -30,9 +30,8 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
|
|||
* Sped up the --all option by 2x to 16x by using git cat-file --buffer.
|
||||
Thanks to Lukey for finding this optimisation.
|
||||
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
||||
upgrading to fix the problem, because the --all optimisation above
|
||||
skips any such keys that might exist (git-annex could have maybe
|
||||
created such keys back in 2013).
|
||||
upgrading the key. (git-annex could have maybe created such keys back
|
||||
in 2013).
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
-}
|
||||
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Git.CatFile (
|
||||
CatFileHandle,
|
||||
|
@ -28,13 +29,15 @@ import qualified Data.ByteString.Lazy as L
|
|||
import qualified Data.ByteString.Char8 as S8
|
||||
import qualified Data.Attoparsec.ByteString as A
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as A8
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.Map.Strict as M
|
||||
import Data.String
|
||||
import Data.Char
|
||||
import Numeric
|
||||
import System.Posix.Types
|
||||
import Text.Read
|
||||
import Control.Concurrent.Async
|
||||
import Control.Concurrent.STM
|
||||
import Control.Monad.IO.Class (MonadIO)
|
||||
|
||||
import Common
|
||||
import Git
|
||||
|
@ -47,7 +50,7 @@ import Git.HashObject
|
|||
import qualified Git.LsTree as LsTree
|
||||
import qualified Utility.CoProcess as CoProcess
|
||||
import Utility.Tuple
|
||||
import Control.Monad.IO.Class (MonadIO)
|
||||
import Utility.TList
|
||||
|
||||
data CatFileHandle = CatFileHandle
|
||||
{ catFileProcess :: CoProcess.CoProcessHandle
|
||||
|
@ -284,8 +287,8 @@ parseCommit b = Commit
|
|||
- as efficiently as possible. This is much faster than querying it
|
||||
- repeatedly per file.
|
||||
-
|
||||
- Any files with a newline or carriage return in their name will be
|
||||
- skipped, because the interface does not support them.
|
||||
- (Note that, while a more polymorphic version of this can be written,
|
||||
- this version is faster, possibly due to being less polymorphic.)
|
||||
-}
|
||||
catObjectStream
|
||||
:: (MonadMask m, MonadIO m)
|
||||
|
@ -294,53 +297,46 @@ catObjectStream
|
|||
-> Repo
|
||||
-> (IO (Maybe (TopFilePath, L.ByteString)) -> m ())
|
||||
-> m ()
|
||||
catObjectStream l want repo a = assertLocal repo $
|
||||
bracketIO start stop $ \(_, _, hout, _) -> a (reader hout)
|
||||
catObjectStream l want repo a = assertLocal repo $ do
|
||||
bracketIO start stop $ \(mv, _, _, hout, _) -> a (reader mv hout)
|
||||
where
|
||||
feeder h = do
|
||||
feeder mv h = do
|
||||
forM_ l $ \ti ->
|
||||
when (want ti) $ do
|
||||
let f = getTopFilePath (LsTree.file ti)
|
||||
-- skip files with newlines or carriage returns
|
||||
unless (any (`S8.elem` f) ['\n', '\r']) $
|
||||
S8.hPutStrLn h $
|
||||
fromRef' (LsTree.sha ti) <> " " <> f
|
||||
let f = LsTree.file ti
|
||||
liftIO $ atomically $ snocTList mv f
|
||||
S8.hPutStrLn h (fromRef' (LsTree.sha ti))
|
||||
hClose h
|
||||
|
||||
reader h = ifM (hIsEOF h)
|
||||
reader mv h = ifM (hIsEOF h)
|
||||
( return Nothing
|
||||
, do
|
||||
f <- liftIO $ atomically $ headTList mv
|
||||
resp <- S8.hGetLine h
|
||||
case eitherToMaybe $ A.parseOnly respparser resp of
|
||||
Nothing -> error $ "unknown response from git cat-file " ++ show resp
|
||||
Just (r, f) -> do
|
||||
case eitherToMaybe $ A.parseOnly respParser resp of
|
||||
Just r -> do
|
||||
content <- readObjectContent h r
|
||||
return (Just (asTopFilePath f, content))
|
||||
return (Just (f, content))
|
||||
Nothing -> error $ "unknown response from git cat-file " ++ show resp
|
||||
)
|
||||
|
||||
params =
|
||||
[ Param "cat-file"
|
||||
-- %(rest) is used to feed the filename through
|
||||
-- cat-file; it will be at the end of the response
|
||||
, Param ("--batch=" ++ batchFormat ++ " %(rest)")
|
||||
, Param ("--batch=" ++ batchFormat)
|
||||
, Param "--buffer"
|
||||
]
|
||||
|
||||
respparser = (,)
|
||||
<$> respParser
|
||||
<* A8.char ' '
|
||||
<*> A.takeByteString
|
||||
|
||||
start = do
|
||||
mv <- liftIO $ atomically newTList
|
||||
let p = gitCreateProcess params repo
|
||||
(Just hin, Just hout, _, pid) <- createProcess p
|
||||
{ std_in = CreatePipe
|
||||
, std_out = CreatePipe
|
||||
}
|
||||
f <- async (feeder hin)
|
||||
return (f, hin, hout, pid)
|
||||
f <- async (feeder mv hin)
|
||||
return (mv, f, hin, hout, pid)
|
||||
|
||||
stop (f, hin, hout, pid) = do
|
||||
stop (_, f, hin, hout, pid) = do
|
||||
cancel f
|
||||
hClose hin
|
||||
hClose hout
|
||||
|
|
|
@ -6,7 +6,9 @@
|
|||
- Unlike a TQueue, the entire contents of a TList can be efficiently
|
||||
- read without modifying it.
|
||||
-
|
||||
- Copyright 2013 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2013-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
@ -21,6 +23,7 @@ module Utility.TList (
|
|||
consTList,
|
||||
snocTList,
|
||||
appendTList,
|
||||
headTList,
|
||||
) where
|
||||
|
||||
import Common
|
||||
|
@ -77,3 +80,6 @@ snocTList tlist v = modifyTList tlist $ \dl -> D.snoc dl v
|
|||
|
||||
appendTList :: TList a -> [a] -> STM ()
|
||||
appendTList tlist l = modifyTList tlist $ \dl -> D.append dl (D.fromList l)
|
||||
|
||||
headTList :: TList a -> STM a
|
||||
headTList tlist = D.head <$> readTMVar tlist
|
||||
|
|
|
@ -16,16 +16,12 @@ be as good as --all's was, but it could still be significant.
|
|||
> to get file blob, through cat-file to get key, through cat-file to
|
||||
> precache logs.
|
||||
|
||||
catObjectStream not supporting newline or carriage return needs to be dealt
|
||||
with somehow first, because worktree filenames can contain either.
|
||||
|
||||
One odd edge case is, could there be a worktree file that refers to a key
|
||||
with no location log? In that case, catObjectStream would skip it. This
|
||||
doesn't usually happen. One case where it does happen is if the git-annex
|
||||
branch is not pulled, but master is.
|
||||
|
||||
Perhaps both the newline and the missing location log could be dealt with
|
||||
together, by making catObjectStream not skip them, but return an item
|
||||
Perhaps make catObjectStream not skip them, but return an item
|
||||
with no log file content. It's important things not be reordered when doing
|
||||
that -- could a dummy item somehow be passed through cat-file to represent
|
||||
these problem cases?
|
||||
|
|
Loading…
Reference in a new issue