add git-annex filter-process

filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon
2021-11-04 15:02:36 -04:00 · 2021-11-04 15:02:36 -04:00 · 68257e9076
commit 68257e9076
parent d706b49979
10 changed files with 254 additions and 43 deletions
--- a/4
+++ b/4
@ -4,6 +4,10 @@ git-annex (8.20211029) UNRELEASED; urgency=medium
    instead output a blank line like other batch commands do.
  * metadata --batch --json: Reject input whose "fields" does not consist
    of arrays of strings. Such invalid input used to be silently ignored.
  * filter-process: New command that can make git add/checkout faster when
    there are a lot of unlocked annexed files or non-annexed files, but that
    also makes git add of large annexed files slower. Use it by running:
    git config filter.annex.process 'git-annex filter-process'
 -- Joey Hess <id@joeyh.name>  Mon, 01 Nov 2021 13:19:46 -0400
--- a/CmdLine/GitAnnex.hs
+++ b/CmdLine/GitAnnex.hs
@ -113,6 +113,7 @@ import qualified Command.P2P
 import qualified Command.Proxy
 import qualified Command.DiffDriver
 import qualified Command.Smudge
 import qualified Command.FilterProcess
 import qualified Command.Undo
 import qualified Command.Version
 import qualified Command.RemoteDaemon
@ -226,6 +227,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexGlobalOption
 	, Command.Proxy.cmd
 	, Command.DiffDriver.cmd
 	, Command.Smudge.cmd
 	, Command.FilterProcess.cmd
 	, Command.Undo.cmd
 	, Command.Version.cmd
 	, Command.RemoteDaemon.cmd
--- a/Command/FilterProcess.hs
+++ b/Command/FilterProcess.hs
@ -0,0 +1,89 @@
 {- git-annex command
 -
 - Copyright 2021 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
 module Command.FilterProcess where
 import Command
 import qualified Command.Smudge
 import Git.FilterProcess
 import Git.PktLine
 import Annex.Link
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as L
 cmd :: Command
 cmd = noCommit $ noMessages $
 	command "filter-process" SectionPlumbing 
 		"long running git filter process"
 		paramNothing (withParams seek)
 seek :: CmdParams -> CommandSeek
 seek _ = liftIO longRunningFilterProcessHandshake >>= \case
 	Left err -> giveup err
 	Right () -> go
  where
 	go = liftIO getFilterRequest >>= \case
 		Just (Smudge f) -> do
 			smudge f
 			go
 		Just (Clean f) -> do
 			clean f
 			go
 		Nothing -> return ()
 smudge :: FilePath -> Annex ()
 smudge file = do
 	{- The whole git file content is necessarily buffered in memory,
 	 - because we have to consume everything git is sending before
 	 - we can respond to it. An annexed file will be only a pointer
 	 - though. -}
 	b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt
 	Command.Smudge.smudge' file (L.fromStrict b)
 	{- Git expects us to output the content of unlocked annexed files,
 	 - but if we got a pointer, we output only the pointer.
 	 - See Command.Smudge.smudge for details of how this works. -}
 	liftIO $ respondFilterRequest b
 clean :: FilePath -> Annex ()
 clean file = do
 	{- We have to consume everything git is sending before we can
 	 - respond to it. But it can be an arbitrarily large file,
 	 - which is being added to the annex, and we do not want to buffer
 	 - all that in memory. 
 	 -
 	 - Start by reading enough to determine if the file is an annex
 	 - pointer.
 	 -}
 	let conv b l = (B.concat (map pktLineToByteString l), b)
 	(b, readcomplete) <- 
 		either (conv False) (conv True)
 			<$> liftIO (readUntilFlushPktOrSize unpaddedMaxPointerSz)
 	let passthrough
 		| readcomplete = liftIO $ respondFilterRequest b
 		| otherwise = liftIO $ do
 			-- Have to buffer the file content in memory here,
 			-- but it's not an annexed file, so not typically
 			-- large, and it's all stored in git, which also
 			-- buffers files in memory.
 			b' <- B.concat . (map pktLineToByteString)
 				<$> readUntilFlushPkt
 			respondFilterRequest (b <> b')
 	let discardreststdin
 		| readcomplete = return ()
 		| otherwise = liftIO discardUntilFlushPkt
 	let emitpointer = liftIO . respondFilterRequest . formatPointer
 	-- This does not incrementally hash, so both git and git-annex
 	-- read from the file. It may be less expensive to incrementally
 	-- hash the content provided by git, but Backend does not currently
 	-- have an interface to do so.
 	Command.Smudge.clean' (toRawFilePath file)
 		(parseLinkTargetOrPointer b)
 		passthrough
 		discardreststdin
 		emitpointer
--- a/Command/Smudge.hs
+++ b/Command/Smudge.hs
@ -74,15 +74,19 @@ seek UpdateOption = commandAction update
 smudge :: FilePath -> CommandStart
 smudge file = do
 	b <- liftIO $ L.hGetContents stdin
-	case parseLinkTargetOrPointerLazy b of
+	smudge' file b
 		Nothing -> noop
 		Just k -> do
 			topfile <- inRepo (toTopFilePath (toRawFilePath file))
 			Database.Keys.addAssociatedFile k topfile
 			void $ smudgeLog k topfile
 	liftIO $ L.putStr b
 	stop
 -- Handles everything except the IO of the file content.
 smudge' :: FilePath -> L.ByteString -> Annex ()
 smudge' file b = case parseLinkTargetOrPointerLazy b of
 	Nothing -> noop
 	Just k -> do
 		topfile <- inRepo (toTopFilePath (toRawFilePath file))
 		Database.Keys.addAssociatedFile k topfile
 		void $ smudgeLog k topfile
 -- Clean filter is fed file content on stdin, decides if a file
 -- should be stored in the annex, and outputs a pointer to its
 -- injested content if so. Otherwise, the original content.
@ -90,50 +94,72 @@ clean :: RawFilePath -> CommandStart
 clean file = do
 	Annex.BranchState.disableUpdate -- optimisation
 	b <- liftIO $ L.hGetContents stdin
-	ifM fileoutsiderepo
+	let passthrough = liftIO $ L.hPut stdout b
-		( liftIO $ L.hPut stdout b
+	-- Before git 2.5, failing to consume all stdin here would
-		, do
+	-- cause a SIGPIPE and crash it.
-			-- Avoid a potential deadlock.
+	-- Newer git catches the signal and stops sending, which is
-			Annex.changeState $ \s -> s
+	-- much faster. (Also, git seems to forget to free memory
-				{ Annex.insmudgecleanfilter = True }
+	-- when sending the file, so the less we let it send, the
-			go b
+	-- less memory it will waste.)
-		)
+	let discardreststdin = if Git.BuildVersion.older "2.5"
 		then L.length b `seq` return ()
 		else liftIO $ hClose stdin
 	let emitpointer = liftIO . S.hPut stdout . formatPointer
 	clean' file (parseLinkTargetOrPointerLazy b)
 		passthrough
 		discardreststdin
 		emitpointer
 	stop
  where
-	go b = case parseLinkTargetOrPointerLazy b of
+
 -- Handles everything except the IO of the file content.
 clean'
 	:: RawFilePath
 	-> Maybe Key
 	-- ^ If the content provided by git is an annex pointer,
 	-- this is the key it points to.
 	-> Annex ()
 	-- ^ passthrough: Feed the content provided by git back out to git.
 	-> Annex ()
 	-- ^ discardreststdin: Called when passthrough will not be called,
 	-- this has to take care of reading the content provided by git, or
 	-- otherwise dealing with it.
 	-> (Key -> Annex ())
 	-- ^ emitpointer: Emit a pointer file for the key.
 	-> Annex ()
 clean' file mk passthrough discardreststdin emitpointer =
 	ifM (fileOutsideRepo file)
 		( passthrough
 		, inSmudgeCleanFilter go
 		)
  where
 	go = case mk of
 		Just k -> do
 			addingExistingLink file k $ do
 				getMoveRaceRecovery k file
-				liftIO $ L.hPut stdout b
+				passthrough
 		Nothing -> inRepo (Git.Ref.fileRef file) >>= \case
 			Just fileref -> do
 				indexmeta <- catObjectMetaData fileref
 				oldkey <- case indexmeta of
 					Just (_, sz, _) -> catKey' fileref sz
 					Nothing -> return Nothing
-				go' b indexmeta oldkey
+				go' indexmeta oldkey
-			Nothing -> liftIO $ L.hPut stdout b
+			Nothing -> passthrough
-	go' b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
+	go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
 		( do
-			-- Before git 2.5, failing to consume all stdin here
+			discardreststdin
 			-- would cause a SIGPIPE and crash it.
 			-- Newer git catches the signal and stops sending,
 			-- which is much faster. (Also, git seems to forget
 			-- to free memory when sending the file, so the
 			-- less we let it send, the less memory it will waste.)
 			if Git.BuildVersion.older "2.5"
 				then L.length b `seq` return ()
 				else liftIO $ hClose stdin
 			-- Optimization for the case when the file is already
 			-- annexed and is unmodified.
 			case oldkey of
 				Nothing -> doingest Nothing
 				Just ko -> ifM (isUnmodifiedCheap ko file)
-					( liftIO $ emitPointer ko
+					( emitpointer ko
 					, updateingest ko
 					)
-		, liftIO $ L.hPut stdout b
+		, passthrough
 		)
 	-- Use the same backend that was used before, when possible.
@ -150,7 +176,7 @@ clean file = do
 		-- Can't restage associated files because git add
 		-- runs this and has the index locked.
 		let norestage = Restage False
-		liftIO . emitPointer
+		emitpointer
 			=<< postingest
 			=<< (\ld -> ingest' preferredbackend nullMeterUpdate ld Nothing norestage)
 			=<< lockDown cfg (fromRawFilePath file)
@ -166,12 +192,22 @@ clean file = do
 		, checkWritePerms = True
 		}
-	-- git diff can run the clean filter on files outside the
+-- git diff can run the clean filter on files outside the
-	-- repository; can't annex those
+-- repository; can't annex those
-	fileoutsiderepo = do
+fileOutsideRepo :: RawFilePath -> Annex Bool
-	        repopath <- liftIO . absPath =<< fromRepo Git.repoPath
+fileOutsideRepo file = do
-		filepath <- liftIO $ absPath file
+        repopath <- liftIO . absPath =<< fromRepo Git.repoPath
-		return $ not $ dirContains repopath filepath
+	filepath <- liftIO $ absPath file
 	return $ not $ dirContains repopath filepath
 -- Avoid a potential deadlock.
 inSmudgeCleanFilter :: Annex a -> Annex a
 inSmudgeCleanFilter = bracket setup cleanup . const
  where
 	setup = Annex.changeState $ \s -> s
 		{ Annex.insmudgecleanfilter = True }
 	cleanup () = Annex.changeState $ \s -> s
 		{ Annex.insmudgecleanfilter = False }
 -- If annex.largefiles is configured (and not disabled by annex.gitaddtoannex
 -- being set to false), matching files are added to the annex and the rest to
@ -245,9 +281,6 @@ shouldAnnex file indexmeta moldkey = do
 			_ -> cont
 		_ -> cont
 emitPointer :: Key -> IO ()
 emitPointer = S.putStr . formatPointer
 -- Recover from a previous race between eg git mv and git-annex get.
 -- That could result in the file remaining a pointer file, while
 -- its content is present in the annex. Populate the pointer file.
--- a/Git/FilterProcess.hs
+++ b/Git/FilterProcess.hs
@ -15,17 +15,18 @@ module Git.FilterProcess (
 	Version(..),
 	Capability(..),
 	readUntilFlushPkt,
 	readUntilFlushPktOrSize,
 	discardUntilFlushPkt,
 	longRunningProcessHandshake,
 	longRunningFilterProcessHandshake,
 	FilterRequest(..),
 	getFilterRequest,
 	respondFilterRequest,
 ) where
 import Common
 import Git.PktLine
 import System.IO
 import qualified Data.ByteString as B
 {- This is a message like "git-filter-client" or "git-filter-server" -}
@ -85,6 +86,24 @@ readUntilFlushPkt = go []
 		Just pktline | not (isFlushPkt pktline) -> go (pktline:l)
 		_ -> return (reverse l)
 {- Reads PktLines until at least the specified number of bytes have been
 - read, or until a flushPkt (or EOF). Returns Right if it did read a
 - flushPkt/EOF, and Left if there is still content leftover that needs to
 - be read. -}
 readUntilFlushPktOrSize :: Int -> IO (Either [PktLine] [PktLine])
 readUntilFlushPktOrSize = go []
  where
 	go l n = readPktLine stdin >>= \case
 		Just pktline
 			| isFlushPkt pktline -> return (Right (reverse l))
 			| otherwise -> 
 				let len = B.length (pktLineToByteString pktline)
 				    n' = n - len
 				in if n' <= 0
 					then return (Left (reverse (pktline:l)))
 					else go (pktline:l) n'
 		Nothing -> return (Right (reverse l))
 {- Reads PktLines until a flushPkt (or EOF), and throws them away. -}
 discardUntilFlushPkt :: IO ()
 discardUntilFlushPkt = readPktLine stdin >>= \case
--- a/Git/PktLine.hs
+++ b/Git/PktLine.hs
@ -101,9 +101,14 @@ writePktLine h (PktLine b)
 		hFlush h
 {- Maximum possible length of the string encoded in PktLine;
- - the length header takes up 4 bytes. -}
+ - the length header takes up 4 bytes.
 -
 - While the 4 byte length header can express lengths up to 65535,
 - git actually does not support packets larger than 65520 (including the
 - header). See "LARGE_PACKET_MAX" in the git source code.
 -}
 maxPktLineLength :: Int
-maxPktLineLength = 65535 - 4
+maxPktLineLength = 65520 - 4
 flushPkt :: PktLine
 flushPkt = PktLine mempty
--- a/doc/git-annex-filter-process.mdwn
+++ b/doc/git-annex-filter-process.mdwn
@ -0,0 +1,48 @@
 # NAME
 git-annex filter-process - long running git filter process for git-annex
 # SYNOPSIS
 git annex filter-process
 # DESCRIPTION
 By default, `git-annex smudge` is used as the git filter driver.
 This is an alternative way to accomplish the same thing, using git's
 long-running filter process interface.
 To enable using this, run:
 	git config filter.annex.process 'git-annex filter-process'
 There will be no visible difference in behavior between enabling this and
 not, besides changes in speed and memory use when using git.
 When this is not enabled, each file that git wants to filter involves
 starting up a new `git-annex smudge` process. Starting many such processes
 for many files can be slow, and can make commands like `git checkout` and
 `git add` slow when they are operating on a lot of files. (A lot of locked
 annexed files do not make `git checkout` slow, but unlocked files and
 non-annexed files do slow it down.)
 On the other hand when this is enabled, `git add` of a large file does an
 unncessary extra read of the file, and pipes its contents into git-annex.
 So if you enable this, it will be faster to use `git-annex add` to add
 large files to the annex, rather than `git add`. Other commands that
 add files, like `git commit -a`, are also impacted by this.
 # OPTIONS
 * The [[git-annex-common-options]](1) can be used.
 # SEE ALSO
 [[git-annex]](1)
 [[git-annex-smudge]](1)
 # AUTHOR
 Joey Hess <id@joeyh.name>
 Warning: Automatically converted into a man page by mdwn2man. Edit with care.
--- a/doc/git-annex-smudge.mdwn
+++ b/doc/git-annex-smudge.mdwn
@ -58,6 +58,7 @@ tree.
 # SEE ALSO
 [[git-annex]](1)
 [[git-annex-filter-process]](1)
 # AUTHOR
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@ -713,6 +713,13 @@ content from the key-value store.
  See [[git-annex-smudge]](1) for details.
 * `filter-process`
  An alternative implementation of a git filter driver, that is faster
  in some situations and slower in others than `git-annex smudge`.
  See [[git-annex-filter-process]](1) for details.
 * `findref [ref]`
  Lists files in a git ref. (deprecated)
--- a/git-annex.cabal
+++ b/git-annex.cabal
@ -745,6 +745,7 @@ Executable git-annex
    Command.Expire
    Command.Export
    Command.FilterBranch
    Command.FilterProcess
    Command.Find
    Command.FindRef
    Command.Fix
@ -871,6 +872,7 @@ Executable git-annex
    Git.FileMode
    Git.FilePath
    Git.Filename
    Git.FilterProcess
    Git.Fsck
    Git.GCrypt
    Git.HashObject
@ -882,6 +884,7 @@ Executable git-annex
    Git.LsTree
    Git.Merge
    Git.Objects
    Git.PktLine
    Git.Queue
    Git.Ref
    Git.RefLog