sped up the --all option by 2x to 16x by using git cat-file --buffer

This assumes that no location log files will have a newline or carriage return in their name. catObjectStream skips any such files due to cat-file not supporting them. Keys have been prevented from containing newlines since 2011, commit 480495beb4. If some old repo had a key with a newline in it, --all will just skip processing that key. Other things, like .git/annex/unused files certianly assume no newlines in keys too, and AFAICR, such keys never actually worked. Carriage return is escaped by preSanitizeKeyName since 2013. WORM keys generated before that point could perhaps contain a CR. (URL probably not, http probably doesn't support an URL with a raw CR in it.) So, added a warning in fsck about such keys. Although, fsck --all will naturally skip them, so won't be able to warn about them. Not entirely satisfactory, but I'll bet there are not really any such keys in existence. Thanks to Lukey for finding this optimisation.
2020-07-07 13:46:45 -04:00 · 2020-07-07 13:46:45 -04:00 · d010ab04be
commit d010ab04be
parent 98e2e3cb9c
5 changed files with 162 additions and 24 deletions
--- a/Backend/WORM.hs
+++ b/Backend/WORM.hs
@ -27,7 +27,7 @@ backend = Backend
 	, getKey = Just keyValue
 	, verifyKeyContent = Nothing
 	, canUpgradeKey = Just needsUpgrade
-	, fastMigrate = Just removeSpaces
+	, fastMigrate = Just removeProblemChars
 	, isStableKey = const True
 	}
@ -48,12 +48,13 @@ keyValue source _ = do
 		, keyMtime = Just $ modificationTime stat
 		}
-{- Old WORM keys could contain spaces, and can be upgraded to remove them. -}
+{- Old WORM keys could contain spaces and carriage returns, 
 - and can be upgraded to remove them. -}
 needsUpgrade :: Key -> Bool
-needsUpgrade key = ' ' `S8.elem` fromKey keyName key
+needsUpgrade key = any (`S8.elem` fromKey keyName key) [' ', '\r']
-removeSpaces :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key)
+removeProblemChars :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key)
-removeSpaces oldkey newbackend _
+removeProblemChars oldkey newbackend _
 	| migratable = return $ Just $ alterKey oldkey $ \d -> d
 		{ keyName = encodeBS $ reSanitizeKeyName $ decodeBS $ keyName d }
 	| otherwise = return Nothing
--- a/6
+++ b/6
@ -27,6 +27,12 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
    git-annex sync --content --all gets 20% faster.
  * Fix a recently introduced bug that could cause a "fork: resource exhausted"
    after getting several thousand files.
  * Sped up the --all option by 2x to 16x by using git cat-file --buffer.
    Thanks to Lukey for finding this optimisation.
  * fsck: Detect if WORM keys contain a carriage return, and recommend
    upgrading to fix the problem, because the --all optimisation above
    skips any such keys that might exist (git-annex could have maybe 
    created such keys back in 2013).
 -- Joey Hess <id@joeyh.name>  Thu, 18 Jun 2020 12:21:14 -0400
--- a/CmdLine/Seek.hs
+++ b/CmdLine/Seek.hs
@ -22,16 +22,19 @@ import qualified Git.LsTree as LsTree
 import Git.FilePath
 import qualified Limit
 import CmdLine.GitAnnex.Options
-import Logs.Location
+import Logs
 import Logs.Unused
 import Types.Transfer
 import Logs.Transfer
 import Remote.List
 import qualified Remote
 import Annex.CatFile
 import Git.CatFile (catObjectStream)
 import Annex.CurrentBranch
 import Annex.Content
 import Annex.InodeSentinal
 import qualified Annex.Branch
 import qualified Annex.BranchState
 import qualified Database.Keys
 import qualified Utility.RawFilePath as R
@ -180,26 +183,55 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 		giveup "Cannot use --auto in a bare repository"
 	case (null params, ko) of
 		(True, Nothing)
-			| bare -> noauto $ runkeyaction finishCheck loggedKeys 
+			| bare -> noauto runallkeys
 			| otherwise -> fallbackaction params
 		(False, Nothing) -> fallbackaction params
-		(True, Just WantAllKeys) -> noauto $ runkeyaction finishCheck loggedKeys
+		(True, Just WantAllKeys) -> noauto runallkeys
-		(True, Just WantUnusedKeys) -> noauto $ runkeyaction (pure . Just) unusedKeys'
+		(True, Just WantUnusedKeys) -> noauto $ runkeyaction unusedKeys'
 		(True, Just WantFailedTransfers) -> noauto runfailedtransfers
-		(True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (pure . Just) (return [k])
+		(True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (return [k])
-		(True, Just WantIncompleteKeys) -> noauto $ runkeyaction (pure . Just) incompletekeys
+		(True, Just WantIncompleteKeys) -> noauto $ runkeyaction incompletekeys
 		(True, Just (WantBranchKeys bs)) -> noauto $ runbranchkeys bs
 		(False, Just _) -> giveup "Can only specify one of file names, --all, --branch, --unused, --failed, --key, or --incomplete"
  where
 	noauto a
 		| auto = giveup "Cannot use --auto with --all or --branch or --unused or --key or --incomplete"
 		| otherwise = a
 	incompletekeys = staleKeysPrune gitAnnexTmpObjectDir True
-	runkeyaction checker getks = do
+
 	-- List all location log files on the git-annex branch,
 	-- and use those to get keys. Pass through cat-file
 	-- to get the contents of the location logs, and pre-cache
 	-- those. This significantly speeds up typical operations
 	-- that need to look at the location log for each key.
 	runallkeys = do
 		keyaction <- mkkeyaction
 		config <- Annex.getGitConfig
 		g <- Annex.gitRepo
 		void Annex.Branch.update
 		(l, cleanup) <- inRepo $ LsTree.lsTree
 			LsTree.LsTreeRecursive
 			Annex.Branch.fullname
 		let getk = locationLogFileKey config . getTopFilePath
 		let go reader = liftIO reader >>= \case
 			Nothing -> return ()
 			Just (f, content) -> do
 				case getk f of
 					Just k -> do
 						Annex.BranchState.setCache (getTopFilePath f) content
 						keyaction (k, mkActionItem k)
 					Nothing -> return ()
 				go reader
 		catObjectStream l (isJust . getk . LsTree.file) g go
 		liftIO $ void cleanup
 	runkeyaction getks = do
 		keyaction <- mkkeyaction
 		ks <- getks
-		forM_ ks $ checker >=> maybe noop 
+		forM_ ks $ \k -> keyaction (k, mkActionItem k)
-			(\k -> keyaction (k, mkActionItem k))
+	
 	runbranchkeys bs = do
 		keyaction <- mkkeyaction
 		forM_ bs $ \b -> do
@ -211,6 +243,7 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 					in keyaction (k, bfp)
 			unlessM (liftIO cleanup) $
 				error ("git ls-tree " ++ Git.fromRef b ++ " failed")
 	runfailedtransfers = do
 		keyaction <- mkkeyaction
 		rs <- remoteList
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@ -19,6 +19,7 @@ module Git.CatFile (
 	catObject,
 	catObjectDetails,
 	catObjectMetaData,
 	catObjectStream,
 ) where
 import System.IO
@ -33,6 +34,7 @@ import Data.Char
 import Numeric
 import System.Posix.Types
 import Text.Read
 import Control.Concurrent.Async
 import Common
 import Git
@ -42,8 +44,10 @@ import Git.Command
 import Git.Types
 import Git.FilePath
 import Git.HashObject
 import qualified Git.LsTree as LsTree
 import qualified Utility.CoProcess as CoProcess
 import Utility.Tuple
 import Control.Monad.IO.Class (MonadIO)
 data CatFileHandle = CatFileHandle 
 	{ catFileProcess :: CoProcess.CoProcessHandle
@ -57,7 +61,7 @@ catFileStart = catFileStart' True
 catFileStart' :: Bool -> Repo -> IO CatFileHandle
 catFileStart' restartable repo = CatFileHandle
 	<$> startp "--batch"
-	<*> startp "--batch-check=%(objectname) %(objecttype) %(objectsize)"
+	<*> startp ("--batch-check=" ++ batchFormat)
 	<*> pure repo
  where
 	startp p = gitCoProcessStart restartable
@ -65,6 +69,9 @@ catFileStart' restartable repo = CatFileHandle
 		, Param p
 		] repo
 batchFormat :: String
 batchFormat = "%(objectname) %(objecttype) %(objectsize)"
 catFileStop :: CatFileHandle -> IO ()
 catFileStop h = do
 	CoProcess.stop (catFileProcess h)
@ -88,18 +95,12 @@ catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, Object
 catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \from -> do
 	header <- S8.hGetLine from
 	case parseResp object header of
-		Just (ParsedResp sha objtype size) -> do
+		Just r@(ParsedResp sha objtype _size) -> do
-			content <- S.hGet from (fromIntegral size)
+			content <- readObjectContent from r
-			eatchar '\n' from
+			return $ Just (content, sha, objtype)
 			return $ Just (L.fromChunks [content], sha, objtype)
 		Just DNE -> return Nothing
 		Nothing -> error $ "unknown response from git cat-file " ++ show (header, object)
  where
 	eatchar expected from = do
 		c <- hGetChar from
 		when (c /= expected) $
 			error $ "missing " ++ (show expected) ++ " from git cat-file"
 	-- Slow fallback path for filenames containing newlines.
 	newlinefallback = queryObjectType object (gitRepo h) >>= \case
 		Nothing -> return Nothing
@ -113,6 +114,18 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
 					(gitRepo h)
 				return (Just (content, sha, objtype))
 readObjectContent :: Handle -> ParsedResp -> IO L.ByteString
 readObjectContent h (ParsedResp _ _ size) = do
 	content <- S.hGet h (fromIntegral size)
 	eatchar '\n'
 	return (L.fromChunks [content])
  where
 	eatchar expected = do
 		c <- hGetChar h
 		when (c /= expected) $
 			error $ "missing " ++ (show expected) ++ " from git cat-file"
 readObjectContent _ DNE = error "internal"
 {- Gets the size and type of an object, without reading its content. -}
 catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
 catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
@ -266,3 +279,69 @@ parseCommit b = Commit
 	sp = fromIntegral (ord ' ')
 	lt = fromIntegral (ord '<')
 	gt = fromIntegral (ord '>')
 {- Uses cat-file to stream the contents of the files listed by lstree
 - as efficiently as possible. This is much faster than querying it
 - repeatedly per file.
 -
 - Any files with a newline or carriage return in their name will be
 - skipped, because the interface does not support them.
 -}
 catObjectStream
 	:: (MonadMask m, MonadIO m)
 	=> [LsTree.TreeItem]
 	-> (LsTree.TreeItem -> Bool)
 	-> Repo
 	-> (IO (Maybe (TopFilePath, L.ByteString)) -> m ())
 	-> m ()
 catObjectStream l want repo a = assertLocal repo $ 
 	bracketIO start stop $ \(_, _, hout, _) -> a (reader hout)
  where
 	feeder h = do
 		forM_ l $ \ti ->
 			when (want ti) $ do
 				let f = getTopFilePath (LsTree.file ti)
 				-- skip files with newlines or carriage returns
 				unless (any (`S8.elem` f) ['\n', '\r']) $
 					S8.hPutStrLn h $
 						fromRef' (LsTree.sha ti) <> " " <> f
 		hClose h
 	reader h = ifM (hIsEOF h)
 		( return Nothing
 		, do
 			resp <- S8.hGetLine h
 			case eitherToMaybe $ A.parseOnly respparser resp of
 				Nothing -> error $ "unknown response from git cat-file " ++ show resp
 				Just (r, f) -> do
 					content <- readObjectContent h r
 					return (Just (asTopFilePath f, content))
 		)
 	params = 
 		[ Param "cat-file"
 		-- %(rest) is used to feed the filename through
 		-- cat-file; it will be at the end of the response
 		, Param ("--batch=" ++ batchFormat ++ " %(rest)")
 		, Param "--buffer"
 		]
 	respparser = (,)
 		<$> respParser
 		<* A8.char ' '
 		<*> A.takeByteString
 	start = do
 		let p = gitCreateProcess params repo
 		(Just hin, Just hout, _, pid) <- createProcess p
 			{ std_in = CreatePipe
 			, std_out = CreatePipe
 			}
 		f <- async (feeder hin)
 		return (f, hin, hout, pid)
 	stop (f, hin, hout, pid) = do
 		cancel f
 		hClose hin
 		hClose hout
 		void $ checkSuccessProcess pid
--- a/doc/todo/speed_up_git_annex_sync_--content_--all/comment_10_d1d462a22ad849c62e8620a12f3ba064._comment
+++ b/doc/todo/speed_up_git_annex_sync_--content_--all/comment_10_d1d462a22ad849c62e8620a12f3ba064._comment
@ -0,0 +1,19 @@
 [[!comment format=mdwn
 username="joey"
 subject="""comment 10"""
 date="2020-07-07T17:13:46Z"
 content="""
 Wow, I implemented your --buffer trick, and `get --all`
 is over 2x faster. `sync --content --all` somewhat less, but still
 another decent improvement there. (cold cache timings)
 And some warm cache times are *much* faster than my cold cache benchmarks.
 `get --all` is 17x faster in a 10k file repo, which makes it only 3x slower
 than `get` without --all.
 I think I will still leave this open because it's still worth considering
 sqlite caching or finding a way to speed up the second sync --all pass...
 but would be interested to know how your use case is improved now.
 Please feel free to find optimisations anytime, I really appreciate it.
 """]]