diff --git a/CHANGELOG b/CHANGELOG index af8d201f03..104285efcf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,10 @@ git-annex (8.20211029) UNRELEASED; urgency=medium instead output a blank line like other batch commands do. * metadata --batch --json: Reject input whose "fields" does not consist of arrays of strings. Such invalid input used to be silently ignored. + * filter-process: New command that can make git add/checkout faster when + there are a lot of unlocked annexed files or non-annexed files, but that + also makes git add of large annexed files slower. Use it by running: + git config filter.annex.process 'git-annex filter-process' -- Joey Hess Mon, 01 Nov 2021 13:19:46 -0400 diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index 807aa14083..63889f0b44 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -113,6 +113,7 @@ import qualified Command.P2P import qualified Command.Proxy import qualified Command.DiffDriver import qualified Command.Smudge +import qualified Command.FilterProcess import qualified Command.Undo import qualified Command.Version import qualified Command.RemoteDaemon @@ -226,6 +227,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexGlobalOption , Command.Proxy.cmd , Command.DiffDriver.cmd , Command.Smudge.cmd + , Command.FilterProcess.cmd , Command.Undo.cmd , Command.Version.cmd , Command.RemoteDaemon.cmd diff --git a/Command/FilterProcess.hs b/Command/FilterProcess.hs new file mode 100644 index 0000000000..ba23038213 --- /dev/null +++ b/Command/FilterProcess.hs @@ -0,0 +1,89 @@ +{- git-annex command + - + - Copyright 2021 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Command.FilterProcess where + +import Command +import qualified Command.Smudge +import Git.FilterProcess +import Git.PktLine +import Annex.Link + +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as L + +cmd :: Command +cmd = noCommit $ noMessages $ + command "filter-process" SectionPlumbing + "long running git filter process" + paramNothing (withParams seek) + +seek :: CmdParams -> CommandSeek +seek _ = liftIO longRunningFilterProcessHandshake >>= \case + Left err -> giveup err + Right () -> go + where + go = liftIO getFilterRequest >>= \case + Just (Smudge f) -> do + smudge f + go + Just (Clean f) -> do + clean f + go + Nothing -> return () + +smudge :: FilePath -> Annex () +smudge file = do + {- The whole git file content is necessarily buffered in memory, + - because we have to consume everything git is sending before + - we can respond to it. An annexed file will be only a pointer + - though. -} + b <- B.concat . map pktLineToByteString <$> liftIO readUntilFlushPkt + Command.Smudge.smudge' file (L.fromStrict b) + {- Git expects us to output the content of unlocked annexed files, + - but if we got a pointer, we output only the pointer. + - See Command.Smudge.smudge for details of how this works. -} + liftIO $ respondFilterRequest b + +clean :: FilePath -> Annex () +clean file = do + {- We have to consume everything git is sending before we can + - respond to it. But it can be an arbitrarily large file, + - which is being added to the annex, and we do not want to buffer + - all that in memory. + - + - Start by reading enough to determine if the file is an annex + - pointer. + -} + let conv b l = (B.concat (map pktLineToByteString l), b) + (b, readcomplete) <- + either (conv False) (conv True) + <$> liftIO (readUntilFlushPktOrSize unpaddedMaxPointerSz) + + let passthrough + | readcomplete = liftIO $ respondFilterRequest b + | otherwise = liftIO $ do + -- Have to buffer the file content in memory here, + -- but it's not an annexed file, so not typically + -- large, and it's all stored in git, which also + -- buffers files in memory. + b' <- B.concat . (map pktLineToByteString) + <$> readUntilFlushPkt + respondFilterRequest (b <> b') + let discardreststdin + | readcomplete = return () + | otherwise = liftIO discardUntilFlushPkt + let emitpointer = liftIO . respondFilterRequest . formatPointer + -- This does not incrementally hash, so both git and git-annex + -- read from the file. It may be less expensive to incrementally + -- hash the content provided by git, but Backend does not currently + -- have an interface to do so. + Command.Smudge.clean' (toRawFilePath file) + (parseLinkTargetOrPointer b) + passthrough + discardreststdin + emitpointer diff --git a/Command/Smudge.hs b/Command/Smudge.hs index 64b66670ad..02043341d2 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -74,15 +74,19 @@ seek UpdateOption = commandAction update smudge :: FilePath -> CommandStart smudge file = do b <- liftIO $ L.hGetContents stdin - case parseLinkTargetOrPointerLazy b of - Nothing -> noop - Just k -> do - topfile <- inRepo (toTopFilePath (toRawFilePath file)) - Database.Keys.addAssociatedFile k topfile - void $ smudgeLog k topfile + smudge' file b liftIO $ L.putStr b stop +-- Handles everything except the IO of the file content. +smudge' :: FilePath -> L.ByteString -> Annex () +smudge' file b = case parseLinkTargetOrPointerLazy b of + Nothing -> noop + Just k -> do + topfile <- inRepo (toTopFilePath (toRawFilePath file)) + Database.Keys.addAssociatedFile k topfile + void $ smudgeLog k topfile + -- Clean filter is fed file content on stdin, decides if a file -- should be stored in the annex, and outputs a pointer to its -- injested content if so. Otherwise, the original content. @@ -90,50 +94,72 @@ clean :: RawFilePath -> CommandStart clean file = do Annex.BranchState.disableUpdate -- optimisation b <- liftIO $ L.hGetContents stdin - ifM fileoutsiderepo - ( liftIO $ L.hPut stdout b - , do - -- Avoid a potential deadlock. - Annex.changeState $ \s -> s - { Annex.insmudgecleanfilter = True } - go b - ) + let passthrough = liftIO $ L.hPut stdout b + -- Before git 2.5, failing to consume all stdin here would + -- cause a SIGPIPE and crash it. + -- Newer git catches the signal and stops sending, which is + -- much faster. (Also, git seems to forget to free memory + -- when sending the file, so the less we let it send, the + -- less memory it will waste.) + let discardreststdin = if Git.BuildVersion.older "2.5" + then L.length b `seq` return () + else liftIO $ hClose stdin + let emitpointer = liftIO . S.hPut stdout . formatPointer + clean' file (parseLinkTargetOrPointerLazy b) + passthrough + discardreststdin + emitpointer stop where - go b = case parseLinkTargetOrPointerLazy b of + +-- Handles everything except the IO of the file content. +clean' + :: RawFilePath + -> Maybe Key + -- ^ If the content provided by git is an annex pointer, + -- this is the key it points to. + -> Annex () + -- ^ passthrough: Feed the content provided by git back out to git. + -> Annex () + -- ^ discardreststdin: Called when passthrough will not be called, + -- this has to take care of reading the content provided by git, or + -- otherwise dealing with it. + -> (Key -> Annex ()) + -- ^ emitpointer: Emit a pointer file for the key. + -> Annex () +clean' file mk passthrough discardreststdin emitpointer = + ifM (fileOutsideRepo file) + ( passthrough + , inSmudgeCleanFilter go + ) + where + + go = case mk of Just k -> do addingExistingLink file k $ do getMoveRaceRecovery k file - liftIO $ L.hPut stdout b + passthrough Nothing -> inRepo (Git.Ref.fileRef file) >>= \case Just fileref -> do indexmeta <- catObjectMetaData fileref oldkey <- case indexmeta of Just (_, sz, _) -> catKey' fileref sz Nothing -> return Nothing - go' b indexmeta oldkey - Nothing -> liftIO $ L.hPut stdout b - go' b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey) + go' indexmeta oldkey + Nothing -> passthrough + go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey) ( do - -- Before git 2.5, failing to consume all stdin here - -- would cause a SIGPIPE and crash it. - -- Newer git catches the signal and stops sending, - -- which is much faster. (Also, git seems to forget - -- to free memory when sending the file, so the - -- less we let it send, the less memory it will waste.) - if Git.BuildVersion.older "2.5" - then L.length b `seq` return () - else liftIO $ hClose stdin + discardreststdin -- Optimization for the case when the file is already -- annexed and is unmodified. case oldkey of Nothing -> doingest Nothing Just ko -> ifM (isUnmodifiedCheap ko file) - ( liftIO $ emitPointer ko + ( emitpointer ko , updateingest ko ) - , liftIO $ L.hPut stdout b + , passthrough ) -- Use the same backend that was used before, when possible. @@ -150,7 +176,7 @@ clean file = do -- Can't restage associated files because git add -- runs this and has the index locked. let norestage = Restage False - liftIO . emitPointer + emitpointer =<< postingest =<< (\ld -> ingest' preferredbackend nullMeterUpdate ld Nothing norestage) =<< lockDown cfg (fromRawFilePath file) @@ -166,12 +192,22 @@ clean file = do , checkWritePerms = True } - -- git diff can run the clean filter on files outside the - -- repository; can't annex those - fileoutsiderepo = do - repopath <- liftIO . absPath =<< fromRepo Git.repoPath - filepath <- liftIO $ absPath file - return $ not $ dirContains repopath filepath +-- git diff can run the clean filter on files outside the +-- repository; can't annex those +fileOutsideRepo :: RawFilePath -> Annex Bool +fileOutsideRepo file = do + repopath <- liftIO . absPath =<< fromRepo Git.repoPath + filepath <- liftIO $ absPath file + return $ not $ dirContains repopath filepath + +-- Avoid a potential deadlock. +inSmudgeCleanFilter :: Annex a -> Annex a +inSmudgeCleanFilter = bracket setup cleanup . const + where + setup = Annex.changeState $ \s -> s + { Annex.insmudgecleanfilter = True } + cleanup () = Annex.changeState $ \s -> s + { Annex.insmudgecleanfilter = False } -- If annex.largefiles is configured (and not disabled by annex.gitaddtoannex -- being set to false), matching files are added to the annex and the rest to @@ -245,9 +281,6 @@ shouldAnnex file indexmeta moldkey = do _ -> cont _ -> cont -emitPointer :: Key -> IO () -emitPointer = S.putStr . formatPointer - -- Recover from a previous race between eg git mv and git-annex get. -- That could result in the file remaining a pointer file, while -- its content is present in the annex. Populate the pointer file. diff --git a/Git/FilterProcess.hs b/Git/FilterProcess.hs index 3dcafb3479..b3bcfe12f3 100644 --- a/Git/FilterProcess.hs +++ b/Git/FilterProcess.hs @@ -15,17 +15,18 @@ module Git.FilterProcess ( Version(..), Capability(..), readUntilFlushPkt, + readUntilFlushPktOrSize, discardUntilFlushPkt, longRunningProcessHandshake, longRunningFilterProcessHandshake, FilterRequest(..), getFilterRequest, + respondFilterRequest, ) where import Common import Git.PktLine -import System.IO import qualified Data.ByteString as B {- This is a message like "git-filter-client" or "git-filter-server" -} @@ -85,6 +86,24 @@ readUntilFlushPkt = go [] Just pktline | not (isFlushPkt pktline) -> go (pktline:l) _ -> return (reverse l) +{- Reads PktLines until at least the specified number of bytes have been + - read, or until a flushPkt (or EOF). Returns Right if it did read a + - flushPkt/EOF, and Left if there is still content leftover that needs to + - be read. -} +readUntilFlushPktOrSize :: Int -> IO (Either [PktLine] [PktLine]) +readUntilFlushPktOrSize = go [] + where + go l n = readPktLine stdin >>= \case + Just pktline + | isFlushPkt pktline -> return (Right (reverse l)) + | otherwise -> + let len = B.length (pktLineToByteString pktline) + n' = n - len + in if n' <= 0 + then return (Left (reverse (pktline:l))) + else go (pktline:l) n' + Nothing -> return (Right (reverse l)) + {- Reads PktLines until a flushPkt (or EOF), and throws them away. -} discardUntilFlushPkt :: IO () discardUntilFlushPkt = readPktLine stdin >>= \case diff --git a/Git/PktLine.hs b/Git/PktLine.hs index c554ce7026..29af14c752 100644 --- a/Git/PktLine.hs +++ b/Git/PktLine.hs @@ -101,9 +101,14 @@ writePktLine h (PktLine b) hFlush h {- Maximum possible length of the string encoded in PktLine; - - the length header takes up 4 bytes. -} + - the length header takes up 4 bytes. + - + - While the 4 byte length header can express lengths up to 65535, + - git actually does not support packets larger than 65520 (including the + - header). See "LARGE_PACKET_MAX" in the git source code. + -} maxPktLineLength :: Int -maxPktLineLength = 65535 - 4 +maxPktLineLength = 65520 - 4 flushPkt :: PktLine flushPkt = PktLine mempty diff --git a/doc/git-annex-filter-process.mdwn b/doc/git-annex-filter-process.mdwn new file mode 100644 index 0000000000..424afc954c --- /dev/null +++ b/doc/git-annex-filter-process.mdwn @@ -0,0 +1,48 @@ +# NAME + +git-annex filter-process - long running git filter process for git-annex + +# SYNOPSIS + +git annex filter-process + +# DESCRIPTION + +By default, `git-annex smudge` is used as the git filter driver. +This is an alternative way to accomplish the same thing, using git's +long-running filter process interface. + +To enable using this, run: + + git config filter.annex.process 'git-annex filter-process' + +There will be no visible difference in behavior between enabling this and +not, besides changes in speed and memory use when using git. + +When this is not enabled, each file that git wants to filter involves +starting up a new `git-annex smudge` process. Starting many such processes +for many files can be slow, and can make commands like `git checkout` and +`git add` slow when they are operating on a lot of files. (A lot of locked +annexed files do not make `git checkout` slow, but unlocked files and +non-annexed files do slow it down.) + +On the other hand when this is enabled, `git add` of a large file does an +unncessary extra read of the file, and pipes its contents into git-annex. +So if you enable this, it will be faster to use `git-annex add` to add +large files to the annex, rather than `git add`. Other commands that +add files, like `git commit -a`, are also impacted by this. + +# OPTIONS + +* The [[git-annex-common-options]](1) can be used. + +# SEE ALSO + +[[git-annex]](1) +[[git-annex-smudge]](1) + +# AUTHOR + +Joey Hess + +Warning: Automatically converted into a man page by mdwn2man. Edit with care. diff --git a/doc/git-annex-smudge.mdwn b/doc/git-annex-smudge.mdwn index 0e15f5564a..f3790e3c71 100644 --- a/doc/git-annex-smudge.mdwn +++ b/doc/git-annex-smudge.mdwn @@ -58,6 +58,7 @@ tree. # SEE ALSO [[git-annex]](1) +[[git-annex-filter-process]](1) # AUTHOR diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index def6ec1dc4..3951b6b8da 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -713,6 +713,13 @@ content from the key-value store. See [[git-annex-smudge]](1) for details. +* `filter-process` + + An alternative implementation of a git filter driver, that is faster + in some situations and slower in others than `git-annex smudge`. + + See [[git-annex-filter-process]](1) for details. + * `findref [ref]` Lists files in a git ref. (deprecated) diff --git a/git-annex.cabal b/git-annex.cabal index 83a314d385..b67638743a 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -745,6 +745,7 @@ Executable git-annex Command.Expire Command.Export Command.FilterBranch + Command.FilterProcess Command.Find Command.FindRef Command.Fix @@ -871,6 +872,7 @@ Executable git-annex Git.FileMode Git.FilePath Git.Filename + Git.FilterProcess Git.Fsck Git.GCrypt Git.HashObject @@ -882,6 +884,7 @@ Executable git-annex Git.LsTree Git.Merge Git.Objects + Git.PktLine Git.Queue Git.Ref Git.RefLog