From 5ce61c6b2aca0e89f9aaa416435c10febec5107d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 4 Jan 2021 13:12:28 -0400 Subject: [PATCH] add: Significantly speed up adding lots of non-large files to git * add: Significantly speed up adding lots of non-large files to git, by disabling the annex smudge filter when running git add. * add --force-small: Run git add rather than updating the index itself, so any other smudge filters than the annex one that may be enabled will be used. --- Annex/GitOverlay.hs | 7 +-- CHANGELOG | 10 ++++ Command/Add.hs | 46 +++++++------------ Config/Smudge.hs | 8 ++++ Database/Keys.hs | 10 ++-- ...speed_up_git_annex_add_of_small_files.mdwn | 2 + 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/Annex/GitOverlay.hs b/Annex/GitOverlay.hs index 2e441a28b2..568fd28814 100644 --- a/Annex/GitOverlay.hs +++ b/Annex/GitOverlay.hs @@ -20,6 +20,7 @@ import Git.Index import Git.Env import qualified Annex import qualified Annex.Queue +import Config.Smudge {- Runs an action using a different git index file. -} withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a @@ -67,16 +68,12 @@ withIndexFile i = withAltRepo usecachedgitenv restoregitenv - Smudge and clean filters are disabled in this work tree. -} withWorkTree :: FilePath -> Annex a -> Annex a withWorkTree d a = withAltRepo - (\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ disableSmudgeConfig }, ())) + (\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ bypassSmudgeConfig }, ())) (\g g' -> g' { location = location g, gitGlobalOpts = gitGlobalOpts g }) (const a) where modlocation l@(Local {}) = l { worktree = Just (toRawFilePath d) } modlocation _ = error "withWorkTree of non-local git repo" - disableSmudgeConfig = map Param - [ "-c", "filter.annex.smudge=" - , "-c", "filter.annex.clean=" - ] {- Runs an action with the git index file and HEAD, and a few other - files that are related to the work tree coming from an overlay diff --git a/CHANGELOG b/CHANGELOG index b9f4bc51c8..efc62e305c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,13 @@ +git-annex (8.20201130) UNRELEASED; urgency=medium + + * add: Significantly speed up adding lots of non-large files to git, + by disabling the annex smudge filter when running git add. + * add --force-small: Run git add rather than updating the index itself, + so any other smudge filters than the annex one that may be enabled will + be used. + + -- Joey Hess Mon, 04 Jan 2021 12:52:41 -0400 + git-annex (8.20201129) upstream; urgency=medium * New borg special remote. This is a new kind of remote, that examines diff --git a/Command/Add.hs b/Command/Add.hs index 4f848d25ba..0835f0946e 100644 --- a/Command/Add.hs +++ b/Command/Add.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2010-2020 Joey Hess + - Copyright 2010-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -17,13 +17,10 @@ import qualified Database.Keys import Annex.FileMatcher import Annex.Link import Annex.Tmp -import Annex.HashObject import Messages.Progress -import Git.Types import Git.FilePath import Config.GitConfig -import qualified Git.UpdateIndex -import Utility.FileMode +import Config.Smudge import Utility.OptParse import qualified Utility.RawFilePath as R @@ -119,37 +116,26 @@ startSmall o si file = addSmall :: CheckGitIgnore -> RawFilePath -> Annex Bool addSmall ci file = do showNote "non-large file; adding content to git repository" - addFile ci file + addFile Small ci file startSmallOverridden :: AddOptions -> SeekInput -> RawFilePath -> CommandStart startSmallOverridden o si file = - starting "add" (ActionItemWorkTreeFile file) si $ - next $ addSmallOverridden o file + starting "add" (ActionItemWorkTreeFile file) si $ next $ do + showNote "adding content to git repository" + addFile Small (checkGitIgnoreOption o) file -addSmallOverridden :: AddOptions -> RawFilePath -> Annex Bool -addSmallOverridden o file = do - showNote "adding content to git repository" - s <- liftIO $ R.getSymbolicLinkStatus file - if not (isRegularFile s) - then addFile (checkGitIgnoreOption o) file - else do - -- Can't use addFile because the clean filter will - -- honor annex.largefiles and it has been overridden. - -- Instead, hash the file and add to the index. - sha <- hashFile file - let ty = if isExecutable (fileMode s) - then TreeExecutable - else TreeFile - Annex.Queue.addUpdateIndex =<< - inRepo (Git.UpdateIndex.stageFile sha ty (fromRawFilePath file)) - return True +data SmallOrLarge = Small | Large -addFile :: CheckGitIgnore -> RawFilePath -> Annex Bool -addFile ci file = do +addFile :: SmallOrLarge -> CheckGitIgnore -> RawFilePath -> Annex Bool +addFile smallorlarge ci file = do ps <- gitAddParams ci - Annex.Queue.addCommand [] "add" (ps++[Param "--"]) + Annex.Queue.addCommand cps "add" (ps++[Param "--"]) [fromRawFilePath file] return True + where + cps = case smallorlarge of + Large -> [] + Small -> bypassSmudgeConfig start :: AddOptions -> SeekInput -> RawFilePath -> AddUnlockedMatcher -> CommandStart start o si file addunlockedmatcher = do @@ -164,7 +150,7 @@ start o si file addunlockedmatcher = do | otherwise -> starting "add" (ActionItemWorkTreeFile file) si $ if isSymbolicLink s - then next $ addFile (checkGitIgnoreOption o) file + then next $ addFile Small (checkGitIgnoreOption o) file else perform o file addunlockedmatcher addpresent key = liftIO (catchMaybeIO $ R.getSymbolicLinkStatus file) >>= \case @@ -180,7 +166,7 @@ start o si file addunlockedmatcher = do starting "add" (ActionItemWorkTreeFile file) si $ addingExistingLink file key $ do Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file) - next $ addFile (checkGitIgnoreOption o) file + next $ addFile Large (checkGitIgnoreOption o) file perform :: AddOptions -> RawFilePath -> AddUnlockedMatcher -> CommandPerform perform o file addunlockedmatcher = withOtherTmp $ \tmpdir -> do diff --git a/Config/Smudge.hs b/Config/Smudge.hs index d97001885f..487f380d5d 100644 --- a/Config/Smudge.hs +++ b/Config/Smudge.hs @@ -60,3 +60,11 @@ deconfigureSmudgeFilter = do filter (\l -> l `notElem` stdattr && not (null l)) ls unsetConfig (ConfigKey "filter.annex.smudge") unsetConfig (ConfigKey "filter.annex.clean") + +-- Params to pass to git to temporarily avoid using the smudge/clean +-- filters. +bypassSmudgeConfig :: [CommandParam] +bypassSmudgeConfig = map Param + [ "-c", "filter.annex.smudge=" + , "-c", "filter.annex.clean=" + ] diff --git a/Database/Keys.hs b/Database/Keys.hs index 6b305f060c..337f232a36 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -43,6 +43,7 @@ import Git.FilePath import Git.Command import Git.Types import Git.Index +import Config.Smudge import qualified Utility.RawFilePath as R import qualified Data.ByteString as S @@ -237,15 +238,14 @@ reconcileStaged qh = do liftIO $ writeFile indexcache $ showInodeCache cur diff = - -- Avoid using external diff command, which would be slow. - -- (The -G option may make it be used otherwise.) - [ Param "-c", Param "diff.external=" -- Avoid running smudge or clean filters, since we want the -- raw output, and they would block trying to access the -- locked database. The --raw normally avoids git diff -- running them, but older versions of git need this. - , Param "-c", Param "filter.annex.smudge=" - , Param "-c", Param "filter.annex.clean=" + bypassSmudgeConfig ++ + -- Avoid using external diff command, which would be slow. + -- (The -G option may make it be used otherwise.) + [ Param "-c", Param "diff.external=" , Param "diff" , Param "--cached" , Param "--raw" diff --git a/doc/todo/speed_up_git_annex_add_of_small_files.mdwn b/doc/todo/speed_up_git_annex_add_of_small_files.mdwn index 1996e40cc6..90567cc091 100644 --- a/doc/todo/speed_up_git_annex_add_of_small_files.mdwn +++ b/doc/todo/speed_up_git_annex_add_of_small_files.mdwn @@ -14,3 +14,5 @@ with the existing `--force-small` too, but at least that's not the default. Possible alternate approach: Unsetting filter.annex.smudge and filter.annex.clean when running `git add`? + +> This approach is a winner! [[done]] --[[Joey]]