add git-annex filter-process
filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon
This commit is contained in:
parent
d706b49979
commit
68257e9076
10 changed files with 254 additions and 43 deletions
|
@ -4,6 +4,10 @@ git-annex (8.20211029) UNRELEASED; urgency=medium
|
|||
instead output a blank line like other batch commands do.
|
||||
* metadata --batch --json: Reject input whose "fields" does not consist
|
||||
of arrays of strings. Such invalid input used to be silently ignored.
|
||||
* filter-process: New command that can make git add/checkout faster when
|
||||
there are a lot of unlocked annexed files or non-annexed files, but that
|
||||
also makes git add of large annexed files slower. Use it by running:
|
||||
git config filter.annex.process 'git-annex filter-process'
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 01 Nov 2021 13:19:46 -0400
|
||||
|
||||
|
|
|
@ -113,6 +113,7 @@ import qualified Command.P2P
|
|||
import qualified Command.Proxy
|
||||
import qualified Command.DiffDriver
|
||||
import qualified Command.Smudge
|
||||
import qualified Command.FilterProcess
|
||||
import qualified Command.Undo
|
||||
import qualified Command.Version
|
||||
import qualified Command.RemoteDaemon
|
||||
|
@ -226,6 +227,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexGlobalOption
|
|||
, Command.Proxy.cmd
|
||||
, Command.DiffDriver.cmd
|
||||
, Command.Smudge.cmd
|
||||
, Command.FilterProcess.cmd
|
||||
, Command.Undo.cmd
|
||||
, Command.Version.cmd
|
||||
, Command.RemoteDaemon.cmd
|
||||
|
|
89
Command/FilterProcess.hs
Normal file
89
Command/FilterProcess.hs
Normal file
|
@ -0,0 +1,89 @@
|
|||
{- git-annex command
|
||||
-
|
||||
- Copyright 2021 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Command.FilterProcess where
|
||||
|
||||
import Command
|
||||
import qualified Command.Smudge
|
||||
import Git.FilterProcess
|
||||
import Git.PktLine
|
||||
import Annex.Link
|
||||
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
||||
cmd :: Command
|
||||
cmd = noCommit $ noMessages $
|
||||
command "filter-process" SectionPlumbing
|
||||
"long running git filter process"
|
||||
paramNothing (withParams seek)
|
||||
|
||||
seek :: CmdParams -> CommandSeek
|
||||
seek _ = liftIO longRunningFilterProcessHandshake >>= \case
|
||||
Left err -> giveup err
|
||||
Right () -> go
|
||||
where
|
||||
go = liftIO getFilterRequest >>= \case
|
||||
Just (Smudge f) -> do
|
||||
smudge f
|
||||
go
|
||||
Just (Clean f) -> do
|
||||
clean f
|
||||
go
|
||||
Nothing -> return ()
|
||||
|
||||
smudge :: FilePath -> Annex ()
|
||||
smudge file = do
|
||||
{- The whole git file content is necessarily buffered in memory,
|
||||
- because we have to consume everything git is sending before
|
||||
- we can respond to it. An annexed file will be only a pointer
|
||||
- though. -}
|
||||
b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt
|
||||
Command.Smudge.smudge' file (L.fromStrict b)
|
||||
{- Git expects us to output the content of unlocked annexed files,
|
||||
- but if we got a pointer, we output only the pointer.
|
||||
- See Command.Smudge.smudge for details of how this works. -}
|
||||
liftIO $ respondFilterRequest b
|
||||
|
||||
clean :: FilePath -> Annex ()
|
||||
clean file = do
|
||||
{- We have to consume everything git is sending before we can
|
||||
- respond to it. But it can be an arbitrarily large file,
|
||||
- which is being added to the annex, and we do not want to buffer
|
||||
- all that in memory.
|
||||
-
|
||||
- Start by reading enough to determine if the file is an annex
|
||||
- pointer.
|
||||
-}
|
||||
let conv b l = (B.concat (map pktLineToByteString l), b)
|
||||
(b, readcomplete) <-
|
||||
either (conv False) (conv True)
|
||||
<$> liftIO (readUntilFlushPktOrSize unpaddedMaxPointerSz)
|
||||
|
||||
let passthrough
|
||||
| readcomplete = liftIO $ respondFilterRequest b
|
||||
| otherwise = liftIO $ do
|
||||
-- Have to buffer the file content in memory here,
|
||||
-- but it's not an annexed file, so not typically
|
||||
-- large, and it's all stored in git, which also
|
||||
-- buffers files in memory.
|
||||
b' <- B.concat . (map pktLineToByteString)
|
||||
<$> readUntilFlushPkt
|
||||
respondFilterRequest (b <> b')
|
||||
let discardreststdin
|
||||
| readcomplete = return ()
|
||||
| otherwise = liftIO discardUntilFlushPkt
|
||||
let emitpointer = liftIO . respondFilterRequest . formatPointer
|
||||
-- This does not incrementally hash, so both git and git-annex
|
||||
-- read from the file. It may be less expensive to incrementally
|
||||
-- hash the content provided by git, but Backend does not currently
|
||||
-- have an interface to do so.
|
||||
Command.Smudge.clean' (toRawFilePath file)
|
||||
(parseLinkTargetOrPointer b)
|
||||
passthrough
|
||||
discardreststdin
|
||||
emitpointer
|
|
@ -74,15 +74,19 @@ seek UpdateOption = commandAction update
|
|||
smudge :: FilePath -> CommandStart
|
||||
smudge file = do
|
||||
b <- liftIO $ L.hGetContents stdin
|
||||
case parseLinkTargetOrPointerLazy b of
|
||||
Nothing -> noop
|
||||
Just k -> do
|
||||
topfile <- inRepo (toTopFilePath (toRawFilePath file))
|
||||
Database.Keys.addAssociatedFile k topfile
|
||||
void $ smudgeLog k topfile
|
||||
smudge' file b
|
||||
liftIO $ L.putStr b
|
||||
stop
|
||||
|
||||
-- Handles everything except the IO of the file content.
|
||||
smudge' :: FilePath -> L.ByteString -> Annex ()
|
||||
smudge' file b = case parseLinkTargetOrPointerLazy b of
|
||||
Nothing -> noop
|
||||
Just k -> do
|
||||
topfile <- inRepo (toTopFilePath (toRawFilePath file))
|
||||
Database.Keys.addAssociatedFile k topfile
|
||||
void $ smudgeLog k topfile
|
||||
|
||||
-- Clean filter is fed file content on stdin, decides if a file
|
||||
-- should be stored in the annex, and outputs a pointer to its
|
||||
-- injested content if so. Otherwise, the original content.
|
||||
|
@ -90,50 +94,72 @@ clean :: RawFilePath -> CommandStart
|
|||
clean file = do
|
||||
Annex.BranchState.disableUpdate -- optimisation
|
||||
b <- liftIO $ L.hGetContents stdin
|
||||
ifM fileoutsiderepo
|
||||
( liftIO $ L.hPut stdout b
|
||||
, do
|
||||
-- Avoid a potential deadlock.
|
||||
Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = True }
|
||||
go b
|
||||
)
|
||||
let passthrough = liftIO $ L.hPut stdout b
|
||||
-- Before git 2.5, failing to consume all stdin here would
|
||||
-- cause a SIGPIPE and crash it.
|
||||
-- Newer git catches the signal and stops sending, which is
|
||||
-- much faster. (Also, git seems to forget to free memory
|
||||
-- when sending the file, so the less we let it send, the
|
||||
-- less memory it will waste.)
|
||||
let discardreststdin = if Git.BuildVersion.older "2.5"
|
||||
then L.length b `seq` return ()
|
||||
else liftIO $ hClose stdin
|
||||
let emitpointer = liftIO . S.hPut stdout . formatPointer
|
||||
clean' file (parseLinkTargetOrPointerLazy b)
|
||||
passthrough
|
||||
discardreststdin
|
||||
emitpointer
|
||||
stop
|
||||
where
|
||||
go b = case parseLinkTargetOrPointerLazy b of
|
||||
|
||||
-- Handles everything except the IO of the file content.
|
||||
clean'
|
||||
:: RawFilePath
|
||||
-> Maybe Key
|
||||
-- ^ If the content provided by git is an annex pointer,
|
||||
-- this is the key it points to.
|
||||
-> Annex ()
|
||||
-- ^ passthrough: Feed the content provided by git back out to git.
|
||||
-> Annex ()
|
||||
-- ^ discardreststdin: Called when passthrough will not be called,
|
||||
-- this has to take care of reading the content provided by git, or
|
||||
-- otherwise dealing with it.
|
||||
-> (Key -> Annex ())
|
||||
-- ^ emitpointer: Emit a pointer file for the key.
|
||||
-> Annex ()
|
||||
clean' file mk passthrough discardreststdin emitpointer =
|
||||
ifM (fileOutsideRepo file)
|
||||
( passthrough
|
||||
, inSmudgeCleanFilter go
|
||||
)
|
||||
where
|
||||
|
||||
go = case mk of
|
||||
Just k -> do
|
||||
addingExistingLink file k $ do
|
||||
getMoveRaceRecovery k file
|
||||
liftIO $ L.hPut stdout b
|
||||
passthrough
|
||||
Nothing -> inRepo (Git.Ref.fileRef file) >>= \case
|
||||
Just fileref -> do
|
||||
indexmeta <- catObjectMetaData fileref
|
||||
oldkey <- case indexmeta of
|
||||
Just (_, sz, _) -> catKey' fileref sz
|
||||
Nothing -> return Nothing
|
||||
go' b indexmeta oldkey
|
||||
Nothing -> liftIO $ L.hPut stdout b
|
||||
go' b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||
go' indexmeta oldkey
|
||||
Nothing -> passthrough
|
||||
go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||
( do
|
||||
-- Before git 2.5, failing to consume all stdin here
|
||||
-- would cause a SIGPIPE and crash it.
|
||||
-- Newer git catches the signal and stops sending,
|
||||
-- which is much faster. (Also, git seems to forget
|
||||
-- to free memory when sending the file, so the
|
||||
-- less we let it send, the less memory it will waste.)
|
||||
if Git.BuildVersion.older "2.5"
|
||||
then L.length b `seq` return ()
|
||||
else liftIO $ hClose stdin
|
||||
discardreststdin
|
||||
|
||||
-- Optimization for the case when the file is already
|
||||
-- annexed and is unmodified.
|
||||
case oldkey of
|
||||
Nothing -> doingest Nothing
|
||||
Just ko -> ifM (isUnmodifiedCheap ko file)
|
||||
( liftIO $ emitPointer ko
|
||||
( emitpointer ko
|
||||
, updateingest ko
|
||||
)
|
||||
, liftIO $ L.hPut stdout b
|
||||
, passthrough
|
||||
)
|
||||
|
||||
-- Use the same backend that was used before, when possible.
|
||||
|
@ -150,7 +176,7 @@ clean file = do
|
|||
-- Can't restage associated files because git add
|
||||
-- runs this and has the index locked.
|
||||
let norestage = Restage False
|
||||
liftIO . emitPointer
|
||||
emitpointer
|
||||
=<< postingest
|
||||
=<< (\ld -> ingest' preferredbackend nullMeterUpdate ld Nothing norestage)
|
||||
=<< lockDown cfg (fromRawFilePath file)
|
||||
|
@ -166,12 +192,22 @@ clean file = do
|
|||
, checkWritePerms = True
|
||||
}
|
||||
|
||||
-- git diff can run the clean filter on files outside the
|
||||
-- repository; can't annex those
|
||||
fileoutsiderepo = do
|
||||
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
|
||||
filepath <- liftIO $ absPath file
|
||||
return $ not $ dirContains repopath filepath
|
||||
-- git diff can run the clean filter on files outside the
|
||||
-- repository; can't annex those
|
||||
fileOutsideRepo :: RawFilePath -> Annex Bool
|
||||
fileOutsideRepo file = do
|
||||
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
|
||||
filepath <- liftIO $ absPath file
|
||||
return $ not $ dirContains repopath filepath
|
||||
|
||||
-- Avoid a potential deadlock.
|
||||
inSmudgeCleanFilter :: Annex a -> Annex a
|
||||
inSmudgeCleanFilter = bracket setup cleanup . const
|
||||
where
|
||||
setup = Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = True }
|
||||
cleanup () = Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = False }
|
||||
|
||||
-- If annex.largefiles is configured (and not disabled by annex.gitaddtoannex
|
||||
-- being set to false), matching files are added to the annex and the rest to
|
||||
|
@ -245,9 +281,6 @@ shouldAnnex file indexmeta moldkey = do
|
|||
_ -> cont
|
||||
_ -> cont
|
||||
|
||||
emitPointer :: Key -> IO ()
|
||||
emitPointer = S.putStr . formatPointer
|
||||
|
||||
-- Recover from a previous race between eg git mv and git-annex get.
|
||||
-- That could result in the file remaining a pointer file, while
|
||||
-- its content is present in the annex. Populate the pointer file.
|
||||
|
|
|
@ -15,17 +15,18 @@ module Git.FilterProcess (
|
|||
Version(..),
|
||||
Capability(..),
|
||||
readUntilFlushPkt,
|
||||
readUntilFlushPktOrSize,
|
||||
discardUntilFlushPkt,
|
||||
longRunningProcessHandshake,
|
||||
longRunningFilterProcessHandshake,
|
||||
FilterRequest(..),
|
||||
getFilterRequest,
|
||||
respondFilterRequest,
|
||||
) where
|
||||
|
||||
import Common
|
||||
import Git.PktLine
|
||||
|
||||
import System.IO
|
||||
import qualified Data.ByteString as B
|
||||
|
||||
{- This is a message like "git-filter-client" or "git-filter-server" -}
|
||||
|
@ -85,6 +86,24 @@ readUntilFlushPkt = go []
|
|||
Just pktline | not (isFlushPkt pktline) -> go (pktline:l)
|
||||
_ -> return (reverse l)
|
||||
|
||||
{- Reads PktLines until at least the specified number of bytes have been
|
||||
- read, or until a flushPkt (or EOF). Returns Right if it did read a
|
||||
- flushPkt/EOF, and Left if there is still content leftover that needs to
|
||||
- be read. -}
|
||||
readUntilFlushPktOrSize :: Int -> IO (Either [PktLine] [PktLine])
|
||||
readUntilFlushPktOrSize = go []
|
||||
where
|
||||
go l n = readPktLine stdin >>= \case
|
||||
Just pktline
|
||||
| isFlushPkt pktline -> return (Right (reverse l))
|
||||
| otherwise ->
|
||||
let len = B.length (pktLineToByteString pktline)
|
||||
n' = n - len
|
||||
in if n' <= 0
|
||||
then return (Left (reverse (pktline:l)))
|
||||
else go (pktline:l) n'
|
||||
Nothing -> return (Right (reverse l))
|
||||
|
||||
{- Reads PktLines until a flushPkt (or EOF), and throws them away. -}
|
||||
discardUntilFlushPkt :: IO ()
|
||||
discardUntilFlushPkt = readPktLine stdin >>= \case
|
||||
|
|
|
@ -101,9 +101,14 @@ writePktLine h (PktLine b)
|
|||
hFlush h
|
||||
|
||||
{- Maximum possible length of the string encoded in PktLine;
|
||||
- the length header takes up 4 bytes. -}
|
||||
- the length header takes up 4 bytes.
|
||||
-
|
||||
- While the 4 byte length header can express lengths up to 65535,
|
||||
- git actually does not support packets larger than 65520 (including the
|
||||
- header). See "LARGE_PACKET_MAX" in the git source code.
|
||||
-}
|
||||
maxPktLineLength :: Int
|
||||
maxPktLineLength = 65535 - 4
|
||||
maxPktLineLength = 65520 - 4
|
||||
|
||||
flushPkt :: PktLine
|
||||
flushPkt = PktLine mempty
|
||||
|
|
48
doc/git-annex-filter-process.mdwn
Normal file
48
doc/git-annex-filter-process.mdwn
Normal file
|
@ -0,0 +1,48 @@
|
|||
# NAME
|
||||
|
||||
git-annex filter-process - long running git filter process for git-annex
|
||||
|
||||
# SYNOPSIS
|
||||
|
||||
git annex filter-process
|
||||
|
||||
# DESCRIPTION
|
||||
|
||||
By default, `git-annex smudge` is used as the git filter driver.
|
||||
This is an alternative way to accomplish the same thing, using git's
|
||||
long-running filter process interface.
|
||||
|
||||
To enable using this, run:
|
||||
|
||||
git config filter.annex.process 'git-annex filter-process'
|
||||
|
||||
There will be no visible difference in behavior between enabling this and
|
||||
not, besides changes in speed and memory use when using git.
|
||||
|
||||
When this is not enabled, each file that git wants to filter involves
|
||||
starting up a new `git-annex smudge` process. Starting many such processes
|
||||
for many files can be slow, and can make commands like `git checkout` and
|
||||
`git add` slow when they are operating on a lot of files. (A lot of locked
|
||||
annexed files do not make `git checkout` slow, but unlocked files and
|
||||
non-annexed files do slow it down.)
|
||||
|
||||
On the other hand when this is enabled, `git add` of a large file does an
|
||||
unncessary extra read of the file, and pipes its contents into git-annex.
|
||||
So if you enable this, it will be faster to use `git-annex add` to add
|
||||
large files to the annex, rather than `git add`. Other commands that
|
||||
add files, like `git commit -a`, are also impacted by this.
|
||||
|
||||
# OPTIONS
|
||||
|
||||
* The [[git-annex-common-options]](1) can be used.
|
||||
|
||||
# SEE ALSO
|
||||
|
||||
[[git-annex]](1)
|
||||
[[git-annex-smudge]](1)
|
||||
|
||||
# AUTHOR
|
||||
|
||||
Joey Hess <id@joeyh.name>
|
||||
|
||||
Warning: Automatically converted into a man page by mdwn2man. Edit with care.
|
|
@ -58,6 +58,7 @@ tree.
|
|||
# SEE ALSO
|
||||
|
||||
[[git-annex]](1)
|
||||
[[git-annex-filter-process]](1)
|
||||
|
||||
# AUTHOR
|
||||
|
||||
|
|
|
@ -713,6 +713,13 @@ content from the key-value store.
|
|||
|
||||
See [[git-annex-smudge]](1) for details.
|
||||
|
||||
* `filter-process`
|
||||
|
||||
An alternative implementation of a git filter driver, that is faster
|
||||
in some situations and slower in others than `git-annex smudge`.
|
||||
|
||||
See [[git-annex-filter-process]](1) for details.
|
||||
|
||||
* `findref [ref]`
|
||||
|
||||
Lists files in a git ref. (deprecated)
|
||||
|
|
|
@ -745,6 +745,7 @@ Executable git-annex
|
|||
Command.Expire
|
||||
Command.Export
|
||||
Command.FilterBranch
|
||||
Command.FilterProcess
|
||||
Command.Find
|
||||
Command.FindRef
|
||||
Command.Fix
|
||||
|
@ -871,6 +872,7 @@ Executable git-annex
|
|||
Git.FileMode
|
||||
Git.FilePath
|
||||
Git.Filename
|
||||
Git.FilterProcess
|
||||
Git.Fsck
|
||||
Git.GCrypt
|
||||
Git.HashObject
|
||||
|
@ -882,6 +884,7 @@ Executable git-annex
|
|||
Git.LsTree
|
||||
Git.Merge
|
||||
Git.Objects
|
||||
Git.PktLine
|
||||
Git.Queue
|
||||
Git.Ref
|
||||
Git.RefLog
|
||||
|
|
Loading…
Add table
Reference in a new issue