git-annex/Command/FilterProcess.hs

{- git-annex command
 -
 - Copyright 2021 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

module Command.FilterProcess where

import Command
import qualified Command.Smudge
import Git.FilterProcess
import Git.PktLine
import Annex.Link

import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L

cmd :: Command
cmd = noCommit $ noMessages $
	command "filter-process" SectionPlumbing 
		"long running git filter process"
		paramNothing (withParams seek)

seek :: CmdParams -> CommandSeek
seek _ = liftIO longRunningFilterProcessHandshake >>= \case
	Left err -> giveup err
	Right () -> go
  where
	go = liftIO getFilterRequest >>= \case
		Just (Smudge f) -> do
			smudge f
			go
		Just (Clean f) -> do
			clean f
			go
		Nothing -> return ()

smudge :: FilePath -> Annex ()
smudge file = do
	{- The whole git file content is necessarily buffered in memory,
	 - because we have to consume everything git is sending before
	 - we can respond to it. An annexed file will be only a pointer
	 - though. -}
	b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt
	Command.Smudge.smudge' file (L.fromStrict b)
	{- Git expects us to output the content of unlocked annexed files,
	 - but if we got a pointer, we output only the pointer.
	 - See Command.Smudge.smudge for details of how this works. -}
	liftIO $ respondFilterRequest b

clean :: FilePath -> Annex ()
clean file = do
	{- We have to consume everything git is sending before we can
	 - respond to it. But it can be an arbitrarily large file,
	 - which is being added to the annex, and we do not want to buffer
	 - all that in memory. 
	 -
	 - Start by reading enough to determine if the file is an annex
	 - pointer.
	 -}
	let conv b l = (B.concat (map pktLineToByteString l), b)
	(b, readcomplete) <- 
		either (conv False) (conv True)
			<$> liftIO (readUntilFlushPktOrSize maxPointerSz)
	
	let passthrough
		| readcomplete = liftIO $ respondFilterRequest b
		| otherwise = liftIO $ do
			-- Have to buffer the file content in memory here,
			-- but it's not an annexed file, so not typically
			-- large, and it's all stored in git, which also
			-- buffers files in memory.
			b' <- B.concat . (map pktLineToByteString)
				<$> readUntilFlushPkt
			respondFilterRequest (b <> b')
	let discardreststdin
		| readcomplete = return ()
		| otherwise = liftIO discardUntilFlushPkt
	let emitpointer = liftIO . respondFilterRequest . formatPointer
	-- This does not incrementally hash, so both git and git-annex
	-- read from the file. It may be less expensive to incrementally
	-- hash the content provided by git, but Backend does not currently
	-- have an interface to do so.
	Command.Smudge.clean' (toRawFilePath file)
		(parseLinkTargetOrPointer' b)
		passthrough
		discardreststdin
		emitpointer
add git-annex filter-process filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon 2021-11-04 19:02:36 +00:00			`{- git-annex command`
			`-`
			`- Copyright 2021 Joey Hess <id@joeyh.name>`
			`-`
			`- Licensed under the GNU AGPL version 3 or higher.`
			`-}`

			`module Command.FilterProcess where`

			`import Command`
			`import qualified Command.Smudge`
			`import Git.FilterProcess`
			`import Git.PktLine`
			`import Annex.Link`

			`import qualified Data.ByteString as B`
			`import qualified Data.ByteString.Lazy as L`

			`cmd :: Command`
			`cmd = noCommit $ noMessages $`
			`command "filter-process" SectionPlumbing`
			`"long running git filter process"`
			`paramNothing (withParams seek)`

			`seek :: CmdParams -> CommandSeek`
			`seek _ = liftIO longRunningFilterProcessHandshake >>= \case`
			`Left err -> giveup err`
			`Right () -> go`
			`where`
			`go = liftIO getFilterRequest >>= \case`
			`Just (Smudge f) -> do`
			`smudge f`
			`go`
			`Just (Clean f) -> do`
			`clean f`
			`go`
			`Nothing -> return ()`

			`smudge :: FilePath -> Annex ()`
			`smudge file = do`
			`{- The whole git file content is necessarily buffered in memory,`
			`- because we have to consume everything git is sending before`
			`- we can respond to it. An annexed file will be only a pointer`
			`- though. -}`
			`b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt`
			`Command.Smudge.smudge' file (L.fromStrict b)`
			`{- Git expects us to output the content of unlocked annexed files,`
			`- but if we got a pointer, we output only the pointer.`
			`- See Command.Smudge.smudge for details of how this works. -}`
			`liftIO $ respondFilterRequest b`

			`clean :: FilePath -> Annex ()`
			`clean file = do`
			`{- We have to consume everything git is sending before we can`
			`- respond to it. But it can be an arbitrarily large file,`
			`- which is being added to the annex, and we do not want to buffer`
			`- all that in memory.`
			`-`
			`- Start by reading enough to determine if the file is an annex`
			`- pointer.`
			`-}`
			`let conv b l = (B.concat (map pktLineToByteString l), b)`
			`(b, readcomplete) <-`
			`either (conv False) (conv True)`
read a consistent amount from pointer file A few places were reading the max symlink size of a pointer file, then passing tp parseLinkTargetOrPointer. Which is fine currently, but to support pointer files with lines of data after the pointer, enough has to be read that parseLinkTargetOrPointer can be assured of seeing enough of that data to know if it's correctly formatted. Sponsored-by: Dartmouth College's Datalad project 2022-02-23 16:38:35 +00:00			`<$> liftIO (readUntilFlushPktOrSize maxPointerSz)`
add git-annex filter-process filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon 2021-11-04 19:02:36 +00:00
			`let passthrough`
			`\| readcomplete = liftIO $ respondFilterRequest b`
			`\| otherwise = liftIO $ do`
			`-- Have to buffer the file content in memory here,`
			`-- but it's not an annexed file, so not typically`
			`-- large, and it's all stored in git, which also`
			`-- buffers files in memory.`
			`b' <- B.concat . (map pktLineToByteString)`
			`<$> readUntilFlushPkt`
			`respondFilterRequest (b <> b')`
			`let discardreststdin`
			`\| readcomplete = return ()`
			`\| otherwise = liftIO discardUntilFlushPkt`
			`let emitpointer = liftIO . respondFilterRequest . formatPointer`
			`-- This does not incrementally hash, so both git and git-annex`
			`-- read from the file. It may be less expensive to incrementally`
			`-- hash the content provided by git, but Backend does not currently`
			`-- have an interface to do so.`
			`Command.Smudge.clean' (toRawFilePath file)`
smudge: Warn when encountering a pointer file that has other content appended to it It will then proceed to add the file the same as if it were any other file containing possibly annexable content. Usually the file is one that was annexed before, so the new, probably corrupt content will also be added to the annex. If the file was not annexed before, the content will be added to git. It's not possible for the smudge filter to throw an error here, because git then just adds the file to git anyway. Sponsored-by: Dartmouth College's Datalad project 2022-02-23 19:17:08 +00:00			`(parseLinkTargetOrPointer' b)`
add git-annex filter-process filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon 2021-11-04 19:02:36 +00:00			`passthrough`
			`discardreststdin`
			`emitpointer`