add git-annex filter-process

filter-process: New command that can make git add/checkout faster when
there are a lot of unlocked annexed files or non-annexed files, but that
also makes git add of large annexed files slower.

Use it by running: git
config filter.annex.process 'git-annex filter-process'

Fully tested and working, but I have not benchmarked it at all.
And, incremental hashing is not done when git add uses it, so extra work is
done in that case.

Sponsored-by: Mark Reidenbach on Patreon
This commit is contained in:
Joey Hess 2021-11-04 15:02:36 -04:00
parent d706b49979
commit 68257e9076
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
10 changed files with 254 additions and 43 deletions

89
Command/FilterProcess.hs Normal file
View file

@ -0,0 +1,89 @@
{- git-annex command
-
- Copyright 2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Command.FilterProcess where
import Command
import qualified Command.Smudge
import Git.FilterProcess
import Git.PktLine
import Annex.Link
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
cmd :: Command
cmd = noCommit $ noMessages $
command "filter-process" SectionPlumbing
"long running git filter process"
paramNothing (withParams seek)
seek :: CmdParams -> CommandSeek
seek _ = liftIO longRunningFilterProcessHandshake >>= \case
Left err -> giveup err
Right () -> go
where
go = liftIO getFilterRequest >>= \case
Just (Smudge f) -> do
smudge f
go
Just (Clean f) -> do
clean f
go
Nothing -> return ()
smudge :: FilePath -> Annex ()
smudge file = do
{- The whole git file content is necessarily buffered in memory,
- because we have to consume everything git is sending before
- we can respond to it. An annexed file will be only a pointer
- though. -}
b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt
Command.Smudge.smudge' file (L.fromStrict b)
{- Git expects us to output the content of unlocked annexed files,
- but if we got a pointer, we output only the pointer.
- See Command.Smudge.smudge for details of how this works. -}
liftIO $ respondFilterRequest b
clean :: FilePath -> Annex ()
clean file = do
{- We have to consume everything git is sending before we can
- respond to it. But it can be an arbitrarily large file,
- which is being added to the annex, and we do not want to buffer
- all that in memory.
-
- Start by reading enough to determine if the file is an annex
- pointer.
-}
let conv b l = (B.concat (map pktLineToByteString l), b)
(b, readcomplete) <-
either (conv False) (conv True)
<$> liftIO (readUntilFlushPktOrSize unpaddedMaxPointerSz)
let passthrough
| readcomplete = liftIO $ respondFilterRequest b
| otherwise = liftIO $ do
-- Have to buffer the file content in memory here,
-- but it's not an annexed file, so not typically
-- large, and it's all stored in git, which also
-- buffers files in memory.
b' <- B.concat . (map pktLineToByteString)
<$> readUntilFlushPkt
respondFilterRequest (b <> b')
let discardreststdin
| readcomplete = return ()
| otherwise = liftIO discardUntilFlushPkt
let emitpointer = liftIO . respondFilterRequest . formatPointer
-- This does not incrementally hash, so both git and git-annex
-- read from the file. It may be less expensive to incrementally
-- hash the content provided by git, but Backend does not currently
-- have an interface to do so.
Command.Smudge.clean' (toRawFilePath file)
(parseLinkTargetOrPointer b)
passthrough
discardreststdin
emitpointer

View file

@ -74,15 +74,19 @@ seek UpdateOption = commandAction update
smudge :: FilePath -> CommandStart
smudge file = do
b <- liftIO $ L.hGetContents stdin
case parseLinkTargetOrPointerLazy b of
Nothing -> noop
Just k -> do
topfile <- inRepo (toTopFilePath (toRawFilePath file))
Database.Keys.addAssociatedFile k topfile
void $ smudgeLog k topfile
smudge' file b
liftIO $ L.putStr b
stop
-- Handles everything except the IO of the file content.
smudge' :: FilePath -> L.ByteString -> Annex ()
smudge' file b = case parseLinkTargetOrPointerLazy b of
Nothing -> noop
Just k -> do
topfile <- inRepo (toTopFilePath (toRawFilePath file))
Database.Keys.addAssociatedFile k topfile
void $ smudgeLog k topfile
-- Clean filter is fed file content on stdin, decides if a file
-- should be stored in the annex, and outputs a pointer to its
-- injested content if so. Otherwise, the original content.
@ -90,50 +94,72 @@ clean :: RawFilePath -> CommandStart
clean file = do
Annex.BranchState.disableUpdate -- optimisation
b <- liftIO $ L.hGetContents stdin
ifM fileoutsiderepo
( liftIO $ L.hPut stdout b
, do
-- Avoid a potential deadlock.
Annex.changeState $ \s -> s
{ Annex.insmudgecleanfilter = True }
go b
)
let passthrough = liftIO $ L.hPut stdout b
-- Before git 2.5, failing to consume all stdin here would
-- cause a SIGPIPE and crash it.
-- Newer git catches the signal and stops sending, which is
-- much faster. (Also, git seems to forget to free memory
-- when sending the file, so the less we let it send, the
-- less memory it will waste.)
let discardreststdin = if Git.BuildVersion.older "2.5"
then L.length b `seq` return ()
else liftIO $ hClose stdin
let emitpointer = liftIO . S.hPut stdout . formatPointer
clean' file (parseLinkTargetOrPointerLazy b)
passthrough
discardreststdin
emitpointer
stop
where
go b = case parseLinkTargetOrPointerLazy b of
-- Handles everything except the IO of the file content.
clean'
:: RawFilePath
-> Maybe Key
-- ^ If the content provided by git is an annex pointer,
-- this is the key it points to.
-> Annex ()
-- ^ passthrough: Feed the content provided by git back out to git.
-> Annex ()
-- ^ discardreststdin: Called when passthrough will not be called,
-- this has to take care of reading the content provided by git, or
-- otherwise dealing with it.
-> (Key -> Annex ())
-- ^ emitpointer: Emit a pointer file for the key.
-> Annex ()
clean' file mk passthrough discardreststdin emitpointer =
ifM (fileOutsideRepo file)
( passthrough
, inSmudgeCleanFilter go
)
where
go = case mk of
Just k -> do
addingExistingLink file k $ do
getMoveRaceRecovery k file
liftIO $ L.hPut stdout b
passthrough
Nothing -> inRepo (Git.Ref.fileRef file) >>= \case
Just fileref -> do
indexmeta <- catObjectMetaData fileref
oldkey <- case indexmeta of
Just (_, sz, _) -> catKey' fileref sz
Nothing -> return Nothing
go' b indexmeta oldkey
Nothing -> liftIO $ L.hPut stdout b
go' b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
go' indexmeta oldkey
Nothing -> passthrough
go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
( do
-- Before git 2.5, failing to consume all stdin here
-- would cause a SIGPIPE and crash it.
-- Newer git catches the signal and stops sending,
-- which is much faster. (Also, git seems to forget
-- to free memory when sending the file, so the
-- less we let it send, the less memory it will waste.)
if Git.BuildVersion.older "2.5"
then L.length b `seq` return ()
else liftIO $ hClose stdin
discardreststdin
-- Optimization for the case when the file is already
-- annexed and is unmodified.
case oldkey of
Nothing -> doingest Nothing
Just ko -> ifM (isUnmodifiedCheap ko file)
( liftIO $ emitPointer ko
( emitpointer ko
, updateingest ko
)
, liftIO $ L.hPut stdout b
, passthrough
)
-- Use the same backend that was used before, when possible.
@ -150,7 +176,7 @@ clean file = do
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
emitpointer
=<< postingest
=<< (\ld -> ingest' preferredbackend nullMeterUpdate ld Nothing norestage)
=<< lockDown cfg (fromRawFilePath file)
@ -166,12 +192,22 @@ clean file = do
, checkWritePerms = True
}
-- git diff can run the clean filter on files outside the
-- repository; can't annex those
fileoutsiderepo = do
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
filepath <- liftIO $ absPath file
return $ not $ dirContains repopath filepath
-- git diff can run the clean filter on files outside the
-- repository; can't annex those
fileOutsideRepo :: RawFilePath -> Annex Bool
fileOutsideRepo file = do
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
filepath <- liftIO $ absPath file
return $ not $ dirContains repopath filepath
-- Avoid a potential deadlock.
inSmudgeCleanFilter :: Annex a -> Annex a
inSmudgeCleanFilter = bracket setup cleanup . const
where
setup = Annex.changeState $ \s -> s
{ Annex.insmudgecleanfilter = True }
cleanup () = Annex.changeState $ \s -> s
{ Annex.insmudgecleanfilter = False }
-- If annex.largefiles is configured (and not disabled by annex.gitaddtoannex
-- being set to false), matching files are added to the annex and the rest to
@ -245,9 +281,6 @@ shouldAnnex file indexmeta moldkey = do
_ -> cont
_ -> cont
emitPointer :: Key -> IO ()
emitPointer = S.putStr . formatPointer
-- Recover from a previous race between eg git mv and git-annex get.
-- That could result in the file remaining a pointer file, while
-- its content is present in the annex. Populate the pointer file.