add git-annex filter-process
filter-process: New command that can make git add/checkout faster when there are a lot of unlocked annexed files or non-annexed files, but that also makes git add of large annexed files slower. Use it by running: git config filter.annex.process 'git-annex filter-process' Fully tested and working, but I have not benchmarked it at all. And, incremental hashing is not done when git add uses it, so extra work is done in that case. Sponsored-by: Mark Reidenbach on Patreon
This commit is contained in:
parent
d706b49979
commit
68257e9076
10 changed files with 254 additions and 43 deletions
89
Command/FilterProcess.hs
Normal file
89
Command/FilterProcess.hs
Normal file
|
@ -0,0 +1,89 @@
|
|||
{- git-annex command
|
||||
-
|
||||
- Copyright 2021 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Command.FilterProcess where
|
||||
|
||||
import Command
|
||||
import qualified Command.Smudge
|
||||
import Git.FilterProcess
|
||||
import Git.PktLine
|
||||
import Annex.Link
|
||||
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
||||
cmd :: Command
|
||||
cmd = noCommit $ noMessages $
|
||||
command "filter-process" SectionPlumbing
|
||||
"long running git filter process"
|
||||
paramNothing (withParams seek)
|
||||
|
||||
seek :: CmdParams -> CommandSeek
|
||||
seek _ = liftIO longRunningFilterProcessHandshake >>= \case
|
||||
Left err -> giveup err
|
||||
Right () -> go
|
||||
where
|
||||
go = liftIO getFilterRequest >>= \case
|
||||
Just (Smudge f) -> do
|
||||
smudge f
|
||||
go
|
||||
Just (Clean f) -> do
|
||||
clean f
|
||||
go
|
||||
Nothing -> return ()
|
||||
|
||||
smudge :: FilePath -> Annex ()
|
||||
smudge file = do
|
||||
{- The whole git file content is necessarily buffered in memory,
|
||||
- because we have to consume everything git is sending before
|
||||
- we can respond to it. An annexed file will be only a pointer
|
||||
- though. -}
|
||||
b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt
|
||||
Command.Smudge.smudge' file (L.fromStrict b)
|
||||
{- Git expects us to output the content of unlocked annexed files,
|
||||
- but if we got a pointer, we output only the pointer.
|
||||
- See Command.Smudge.smudge for details of how this works. -}
|
||||
liftIO $ respondFilterRequest b
|
||||
|
||||
clean :: FilePath -> Annex ()
|
||||
clean file = do
|
||||
{- We have to consume everything git is sending before we can
|
||||
- respond to it. But it can be an arbitrarily large file,
|
||||
- which is being added to the annex, and we do not want to buffer
|
||||
- all that in memory.
|
||||
-
|
||||
- Start by reading enough to determine if the file is an annex
|
||||
- pointer.
|
||||
-}
|
||||
let conv b l = (B.concat (map pktLineToByteString l), b)
|
||||
(b, readcomplete) <-
|
||||
either (conv False) (conv True)
|
||||
<$> liftIO (readUntilFlushPktOrSize unpaddedMaxPointerSz)
|
||||
|
||||
let passthrough
|
||||
| readcomplete = liftIO $ respondFilterRequest b
|
||||
| otherwise = liftIO $ do
|
||||
-- Have to buffer the file content in memory here,
|
||||
-- but it's not an annexed file, so not typically
|
||||
-- large, and it's all stored in git, which also
|
||||
-- buffers files in memory.
|
||||
b' <- B.concat . (map pktLineToByteString)
|
||||
<$> readUntilFlushPkt
|
||||
respondFilterRequest (b <> b')
|
||||
let discardreststdin
|
||||
| readcomplete = return ()
|
||||
| otherwise = liftIO discardUntilFlushPkt
|
||||
let emitpointer = liftIO . respondFilterRequest . formatPointer
|
||||
-- This does not incrementally hash, so both git and git-annex
|
||||
-- read from the file. It may be less expensive to incrementally
|
||||
-- hash the content provided by git, but Backend does not currently
|
||||
-- have an interface to do so.
|
||||
Command.Smudge.clean' (toRawFilePath file)
|
||||
(parseLinkTargetOrPointer b)
|
||||
passthrough
|
||||
discardreststdin
|
||||
emitpointer
|
|
@ -74,15 +74,19 @@ seek UpdateOption = commandAction update
|
|||
smudge :: FilePath -> CommandStart
|
||||
smudge file = do
|
||||
b <- liftIO $ L.hGetContents stdin
|
||||
case parseLinkTargetOrPointerLazy b of
|
||||
Nothing -> noop
|
||||
Just k -> do
|
||||
topfile <- inRepo (toTopFilePath (toRawFilePath file))
|
||||
Database.Keys.addAssociatedFile k topfile
|
||||
void $ smudgeLog k topfile
|
||||
smudge' file b
|
||||
liftIO $ L.putStr b
|
||||
stop
|
||||
|
||||
-- Handles everything except the IO of the file content.
|
||||
smudge' :: FilePath -> L.ByteString -> Annex ()
|
||||
smudge' file b = case parseLinkTargetOrPointerLazy b of
|
||||
Nothing -> noop
|
||||
Just k -> do
|
||||
topfile <- inRepo (toTopFilePath (toRawFilePath file))
|
||||
Database.Keys.addAssociatedFile k topfile
|
||||
void $ smudgeLog k topfile
|
||||
|
||||
-- Clean filter is fed file content on stdin, decides if a file
|
||||
-- should be stored in the annex, and outputs a pointer to its
|
||||
-- injested content if so. Otherwise, the original content.
|
||||
|
@ -90,50 +94,72 @@ clean :: RawFilePath -> CommandStart
|
|||
clean file = do
|
||||
Annex.BranchState.disableUpdate -- optimisation
|
||||
b <- liftIO $ L.hGetContents stdin
|
||||
ifM fileoutsiderepo
|
||||
( liftIO $ L.hPut stdout b
|
||||
, do
|
||||
-- Avoid a potential deadlock.
|
||||
Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = True }
|
||||
go b
|
||||
)
|
||||
let passthrough = liftIO $ L.hPut stdout b
|
||||
-- Before git 2.5, failing to consume all stdin here would
|
||||
-- cause a SIGPIPE and crash it.
|
||||
-- Newer git catches the signal and stops sending, which is
|
||||
-- much faster. (Also, git seems to forget to free memory
|
||||
-- when sending the file, so the less we let it send, the
|
||||
-- less memory it will waste.)
|
||||
let discardreststdin = if Git.BuildVersion.older "2.5"
|
||||
then L.length b `seq` return ()
|
||||
else liftIO $ hClose stdin
|
||||
let emitpointer = liftIO . S.hPut stdout . formatPointer
|
||||
clean' file (parseLinkTargetOrPointerLazy b)
|
||||
passthrough
|
||||
discardreststdin
|
||||
emitpointer
|
||||
stop
|
||||
where
|
||||
go b = case parseLinkTargetOrPointerLazy b of
|
||||
|
||||
-- Handles everything except the IO of the file content.
|
||||
clean'
|
||||
:: RawFilePath
|
||||
-> Maybe Key
|
||||
-- ^ If the content provided by git is an annex pointer,
|
||||
-- this is the key it points to.
|
||||
-> Annex ()
|
||||
-- ^ passthrough: Feed the content provided by git back out to git.
|
||||
-> Annex ()
|
||||
-- ^ discardreststdin: Called when passthrough will not be called,
|
||||
-- this has to take care of reading the content provided by git, or
|
||||
-- otherwise dealing with it.
|
||||
-> (Key -> Annex ())
|
||||
-- ^ emitpointer: Emit a pointer file for the key.
|
||||
-> Annex ()
|
||||
clean' file mk passthrough discardreststdin emitpointer =
|
||||
ifM (fileOutsideRepo file)
|
||||
( passthrough
|
||||
, inSmudgeCleanFilter go
|
||||
)
|
||||
where
|
||||
|
||||
go = case mk of
|
||||
Just k -> do
|
||||
addingExistingLink file k $ do
|
||||
getMoveRaceRecovery k file
|
||||
liftIO $ L.hPut stdout b
|
||||
passthrough
|
||||
Nothing -> inRepo (Git.Ref.fileRef file) >>= \case
|
||||
Just fileref -> do
|
||||
indexmeta <- catObjectMetaData fileref
|
||||
oldkey <- case indexmeta of
|
||||
Just (_, sz, _) -> catKey' fileref sz
|
||||
Nothing -> return Nothing
|
||||
go' b indexmeta oldkey
|
||||
Nothing -> liftIO $ L.hPut stdout b
|
||||
go' b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||
go' indexmeta oldkey
|
||||
Nothing -> passthrough
|
||||
go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||
( do
|
||||
-- Before git 2.5, failing to consume all stdin here
|
||||
-- would cause a SIGPIPE and crash it.
|
||||
-- Newer git catches the signal and stops sending,
|
||||
-- which is much faster. (Also, git seems to forget
|
||||
-- to free memory when sending the file, so the
|
||||
-- less we let it send, the less memory it will waste.)
|
||||
if Git.BuildVersion.older "2.5"
|
||||
then L.length b `seq` return ()
|
||||
else liftIO $ hClose stdin
|
||||
discardreststdin
|
||||
|
||||
-- Optimization for the case when the file is already
|
||||
-- annexed and is unmodified.
|
||||
case oldkey of
|
||||
Nothing -> doingest Nothing
|
||||
Just ko -> ifM (isUnmodifiedCheap ko file)
|
||||
( liftIO $ emitPointer ko
|
||||
( emitpointer ko
|
||||
, updateingest ko
|
||||
)
|
||||
, liftIO $ L.hPut stdout b
|
||||
, passthrough
|
||||
)
|
||||
|
||||
-- Use the same backend that was used before, when possible.
|
||||
|
@ -150,7 +176,7 @@ clean file = do
|
|||
-- Can't restage associated files because git add
|
||||
-- runs this and has the index locked.
|
||||
let norestage = Restage False
|
||||
liftIO . emitPointer
|
||||
emitpointer
|
||||
=<< postingest
|
||||
=<< (\ld -> ingest' preferredbackend nullMeterUpdate ld Nothing norestage)
|
||||
=<< lockDown cfg (fromRawFilePath file)
|
||||
|
@ -166,12 +192,22 @@ clean file = do
|
|||
, checkWritePerms = True
|
||||
}
|
||||
|
||||
-- git diff can run the clean filter on files outside the
|
||||
-- repository; can't annex those
|
||||
fileoutsiderepo = do
|
||||
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
|
||||
filepath <- liftIO $ absPath file
|
||||
return $ not $ dirContains repopath filepath
|
||||
-- git diff can run the clean filter on files outside the
|
||||
-- repository; can't annex those
|
||||
fileOutsideRepo :: RawFilePath -> Annex Bool
|
||||
fileOutsideRepo file = do
|
||||
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
|
||||
filepath <- liftIO $ absPath file
|
||||
return $ not $ dirContains repopath filepath
|
||||
|
||||
-- Avoid a potential deadlock.
|
||||
inSmudgeCleanFilter :: Annex a -> Annex a
|
||||
inSmudgeCleanFilter = bracket setup cleanup . const
|
||||
where
|
||||
setup = Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = True }
|
||||
cleanup () = Annex.changeState $ \s -> s
|
||||
{ Annex.insmudgecleanfilter = False }
|
||||
|
||||
-- If annex.largefiles is configured (and not disabled by annex.gitaddtoannex
|
||||
-- being set to false), matching files are added to the annex and the rest to
|
||||
|
@ -245,9 +281,6 @@ shouldAnnex file indexmeta moldkey = do
|
|||
_ -> cont
|
||||
_ -> cont
|
||||
|
||||
emitPointer :: Key -> IO ()
|
||||
emitPointer = S.putStr . formatPointer
|
||||
|
||||
-- Recover from a previous race between eg git mv and git-annex get.
|
||||
-- That could result in the file remaining a pointer file, while
|
||||
-- its content is present in the annex. Populate the pointer file.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue