make catObjectStream support newline and carriage return in filenames
Turns out the %(rest) trick was not needed. Instead, just maintain a list of files we've asked for, and each cat-file response is for the next file in the list. This actually benchmarks 25% faster than before! Very surprising, but it must be due to needing to shove less data through the pipe, and parse less.
This commit is contained in:
parent
2cf6717aec
commit
de3d7d044d
4 changed files with 34 additions and 37 deletions
|
@ -30,9 +30,8 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
|
||||||
* Sped up the --all option by 2x to 16x by using git cat-file --buffer.
|
* Sped up the --all option by 2x to 16x by using git cat-file --buffer.
|
||||||
Thanks to Lukey for finding this optimisation.
|
Thanks to Lukey for finding this optimisation.
|
||||||
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
||||||
upgrading to fix the problem, because the --all optimisation above
|
upgrading the key. (git-annex could have maybe created such keys back
|
||||||
skips any such keys that might exist (git-annex could have maybe
|
in 2013).
|
||||||
created such keys back in 2013).
|
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400
|
-- Joey Hess <id@joeyh.name> Thu, 18 Jun 2020 12:21:14 -0400
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
-}
|
-}
|
||||||
|
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
|
||||||
module Git.CatFile (
|
module Git.CatFile (
|
||||||
CatFileHandle,
|
CatFileHandle,
|
||||||
|
@ -28,13 +29,15 @@ import qualified Data.ByteString.Lazy as L
|
||||||
import qualified Data.ByteString.Char8 as S8
|
import qualified Data.ByteString.Char8 as S8
|
||||||
import qualified Data.Attoparsec.ByteString as A
|
import qualified Data.Attoparsec.ByteString as A
|
||||||
import qualified Data.Attoparsec.ByteString.Char8 as A8
|
import qualified Data.Attoparsec.ByteString.Char8 as A8
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map.Strict as M
|
||||||
import Data.String
|
import Data.String
|
||||||
import Data.Char
|
import Data.Char
|
||||||
import Numeric
|
import Numeric
|
||||||
import System.Posix.Types
|
import System.Posix.Types
|
||||||
import Text.Read
|
import Text.Read
|
||||||
import Control.Concurrent.Async
|
import Control.Concurrent.Async
|
||||||
|
import Control.Concurrent.STM
|
||||||
|
import Control.Monad.IO.Class (MonadIO)
|
||||||
|
|
||||||
import Common
|
import Common
|
||||||
import Git
|
import Git
|
||||||
|
@ -47,7 +50,7 @@ import Git.HashObject
|
||||||
import qualified Git.LsTree as LsTree
|
import qualified Git.LsTree as LsTree
|
||||||
import qualified Utility.CoProcess as CoProcess
|
import qualified Utility.CoProcess as CoProcess
|
||||||
import Utility.Tuple
|
import Utility.Tuple
|
||||||
import Control.Monad.IO.Class (MonadIO)
|
import Utility.TList
|
||||||
|
|
||||||
data CatFileHandle = CatFileHandle
|
data CatFileHandle = CatFileHandle
|
||||||
{ catFileProcess :: CoProcess.CoProcessHandle
|
{ catFileProcess :: CoProcess.CoProcessHandle
|
||||||
|
@ -284,8 +287,8 @@ parseCommit b = Commit
|
||||||
- as efficiently as possible. This is much faster than querying it
|
- as efficiently as possible. This is much faster than querying it
|
||||||
- repeatedly per file.
|
- repeatedly per file.
|
||||||
-
|
-
|
||||||
- Any files with a newline or carriage return in their name will be
|
- (Note that, while a more polymorphic version of this can be written,
|
||||||
- skipped, because the interface does not support them.
|
- this version is faster, possibly due to being less polymorphic.)
|
||||||
-}
|
-}
|
||||||
catObjectStream
|
catObjectStream
|
||||||
:: (MonadMask m, MonadIO m)
|
:: (MonadMask m, MonadIO m)
|
||||||
|
@ -294,53 +297,46 @@ catObjectStream
|
||||||
-> Repo
|
-> Repo
|
||||||
-> (IO (Maybe (TopFilePath, L.ByteString)) -> m ())
|
-> (IO (Maybe (TopFilePath, L.ByteString)) -> m ())
|
||||||
-> m ()
|
-> m ()
|
||||||
catObjectStream l want repo a = assertLocal repo $
|
catObjectStream l want repo a = assertLocal repo $ do
|
||||||
bracketIO start stop $ \(_, _, hout, _) -> a (reader hout)
|
bracketIO start stop $ \(mv, _, _, hout, _) -> a (reader mv hout)
|
||||||
where
|
where
|
||||||
feeder h = do
|
feeder mv h = do
|
||||||
forM_ l $ \ti ->
|
forM_ l $ \ti ->
|
||||||
when (want ti) $ do
|
when (want ti) $ do
|
||||||
let f = getTopFilePath (LsTree.file ti)
|
let f = LsTree.file ti
|
||||||
-- skip files with newlines or carriage returns
|
liftIO $ atomically $ snocTList mv f
|
||||||
unless (any (`S8.elem` f) ['\n', '\r']) $
|
S8.hPutStrLn h (fromRef' (LsTree.sha ti))
|
||||||
S8.hPutStrLn h $
|
|
||||||
fromRef' (LsTree.sha ti) <> " " <> f
|
|
||||||
hClose h
|
hClose h
|
||||||
|
|
||||||
reader h = ifM (hIsEOF h)
|
reader mv h = ifM (hIsEOF h)
|
||||||
( return Nothing
|
( return Nothing
|
||||||
, do
|
, do
|
||||||
|
f <- liftIO $ atomically $ headTList mv
|
||||||
resp <- S8.hGetLine h
|
resp <- S8.hGetLine h
|
||||||
case eitherToMaybe $ A.parseOnly respparser resp of
|
case eitherToMaybe $ A.parseOnly respParser resp of
|
||||||
Nothing -> error $ "unknown response from git cat-file " ++ show resp
|
Just r -> do
|
||||||
Just (r, f) -> do
|
|
||||||
content <- readObjectContent h r
|
content <- readObjectContent h r
|
||||||
return (Just (asTopFilePath f, content))
|
return (Just (f, content))
|
||||||
|
Nothing -> error $ "unknown response from git cat-file " ++ show resp
|
||||||
)
|
)
|
||||||
|
|
||||||
params =
|
params =
|
||||||
[ Param "cat-file"
|
[ Param "cat-file"
|
||||||
-- %(rest) is used to feed the filename through
|
, Param ("--batch=" ++ batchFormat)
|
||||||
-- cat-file; it will be at the end of the response
|
|
||||||
, Param ("--batch=" ++ batchFormat ++ " %(rest)")
|
|
||||||
, Param "--buffer"
|
, Param "--buffer"
|
||||||
]
|
]
|
||||||
|
|
||||||
respparser = (,)
|
|
||||||
<$> respParser
|
|
||||||
<* A8.char ' '
|
|
||||||
<*> A.takeByteString
|
|
||||||
|
|
||||||
start = do
|
start = do
|
||||||
|
mv <- liftIO $ atomically newTList
|
||||||
let p = gitCreateProcess params repo
|
let p = gitCreateProcess params repo
|
||||||
(Just hin, Just hout, _, pid) <- createProcess p
|
(Just hin, Just hout, _, pid) <- createProcess p
|
||||||
{ std_in = CreatePipe
|
{ std_in = CreatePipe
|
||||||
, std_out = CreatePipe
|
, std_out = CreatePipe
|
||||||
}
|
}
|
||||||
f <- async (feeder hin)
|
f <- async (feeder mv hin)
|
||||||
return (f, hin, hout, pid)
|
return (mv, f, hin, hout, pid)
|
||||||
|
|
||||||
stop (f, hin, hout, pid) = do
|
stop (_, f, hin, hout, pid) = do
|
||||||
cancel f
|
cancel f
|
||||||
hClose hin
|
hClose hin
|
||||||
hClose hout
|
hClose hout
|
||||||
|
|
|
@ -6,7 +6,9 @@
|
||||||
- Unlike a TQueue, the entire contents of a TList can be efficiently
|
- Unlike a TQueue, the entire contents of a TList can be efficiently
|
||||||
- read without modifying it.
|
- read without modifying it.
|
||||||
-
|
-
|
||||||
- Copyright 2013 Joey Hess <id@joeyh.name>
|
- Copyright 2013-2020 Joey Hess <id@joeyh.name>
|
||||||
|
-
|
||||||
|
- License: BSD-2-clause
|
||||||
-}
|
-}
|
||||||
|
|
||||||
{-# LANGUAGE BangPatterns #-}
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
@ -21,6 +23,7 @@ module Utility.TList (
|
||||||
consTList,
|
consTList,
|
||||||
snocTList,
|
snocTList,
|
||||||
appendTList,
|
appendTList,
|
||||||
|
headTList,
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Common
|
import Common
|
||||||
|
@ -77,3 +80,6 @@ snocTList tlist v = modifyTList tlist $ \dl -> D.snoc dl v
|
||||||
|
|
||||||
appendTList :: TList a -> [a] -> STM ()
|
appendTList :: TList a -> [a] -> STM ()
|
||||||
appendTList tlist l = modifyTList tlist $ \dl -> D.append dl (D.fromList l)
|
appendTList tlist l = modifyTList tlist $ \dl -> D.append dl (D.fromList l)
|
||||||
|
|
||||||
|
headTList :: TList a -> STM a
|
||||||
|
headTList tlist = D.head <$> readTMVar tlist
|
||||||
|
|
|
@ -16,16 +16,12 @@ be as good as --all's was, but it could still be significant.
|
||||||
> to get file blob, through cat-file to get key, through cat-file to
|
> to get file blob, through cat-file to get key, through cat-file to
|
||||||
> precache logs.
|
> precache logs.
|
||||||
|
|
||||||
catObjectStream not supporting newline or carriage return needs to be dealt
|
|
||||||
with somehow first, because worktree filenames can contain either.
|
|
||||||
|
|
||||||
One odd edge case is, could there be a worktree file that refers to a key
|
One odd edge case is, could there be a worktree file that refers to a key
|
||||||
with no location log? In that case, catObjectStream would skip it. This
|
with no location log? In that case, catObjectStream would skip it. This
|
||||||
doesn't usually happen. One case where it does happen is if the git-annex
|
doesn't usually happen. One case where it does happen is if the git-annex
|
||||||
branch is not pulled, but master is.
|
branch is not pulled, but master is.
|
||||||
|
|
||||||
Perhaps both the newline and the missing location log could be dealt with
|
Perhaps make catObjectStream not skip them, but return an item
|
||||||
together, by making catObjectStream not skip them, but return an item
|
|
||||||
with no log file content. It's important things not be reordered when doing
|
with no log file content. It's important things not be reordered when doing
|
||||||
that -- could a dummy item somehow be passed through cat-file to represent
|
that -- could a dummy item somehow be passed through cat-file to represent
|
||||||
these problem cases?
|
these problem cases?
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue