add: Significantly speed up adding lots of non-large files to git

* add: Significantly speed up adding lots of non-large files to git,
  by disabling the annex smudge filter when running git add.
* add --force-small: Run git add rather than updating the index directly,
  so that any enabled smudge filters other than the annex one will
  be used.
This commit is contained in:
Joey Hess 2021-01-04 13:12:28 -04:00
parent 0b2b666a38
commit 5ce61c6b2a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 43 additions and 40 deletions

View file

@ -20,6 +20,7 @@ import Git.Index
import Git.Env import Git.Env
import qualified Annex import qualified Annex
import qualified Annex.Queue import qualified Annex.Queue
import Config.Smudge
{- Runs an action using a different git index file. -} {- Runs an action using a different git index file. -}
withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a
@ -67,16 +68,12 @@ withIndexFile i = withAltRepo usecachedgitenv restoregitenv
- Smudge and clean filters are disabled in this work tree. -} - Smudge and clean filters are disabled in this work tree. -}
withWorkTree :: FilePath -> Annex a -> Annex a withWorkTree :: FilePath -> Annex a -> Annex a
withWorkTree d a = withAltRepo withWorkTree d a = withAltRepo
(\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ disableSmudgeConfig }, ())) (\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ bypassSmudgeConfig }, ()))
(\g g' -> g' { location = location g, gitGlobalOpts = gitGlobalOpts g }) (\g g' -> g' { location = location g, gitGlobalOpts = gitGlobalOpts g })
(const a) (const a)
where where
modlocation l@(Local {}) = l { worktree = Just (toRawFilePath d) } modlocation l@(Local {}) = l { worktree = Just (toRawFilePath d) }
modlocation _ = error "withWorkTree of non-local git repo" modlocation _ = error "withWorkTree of non-local git repo"
disableSmudgeConfig = map Param
[ "-c", "filter.annex.smudge="
, "-c", "filter.annex.clean="
]
{- Runs an action with the git index file and HEAD, and a few other {- Runs an action with the git index file and HEAD, and a few other
- files that are related to the work tree coming from an overlay - files that are related to the work tree coming from an overlay

View file

@ -1,3 +1,13 @@
git-annex (8.20201130) UNRELEASED; urgency=medium
* add: Significantly speed up adding lots of non-large files to git,
by disabling the annex smudge filter when running git add.
* add --force-small: Run git add rather than updating the index itself,
so any other smudge filters than the annex one that may be enabled will
be used.
-- Joey Hess <id@joeyh.name> Mon, 04 Jan 2021 12:52:41 -0400
git-annex (8.20201129) upstream; urgency=medium git-annex (8.20201129) upstream; urgency=medium
* New borg special remote. This is a new kind of remote, that examines * New borg special remote. This is a new kind of remote, that examines

View file

@ -1,6 +1,6 @@
{- git-annex command {- git-annex command
- -
- Copyright 2010-2020 Joey Hess <id@joeyh.name> - Copyright 2010-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -17,13 +17,10 @@ import qualified Database.Keys
import Annex.FileMatcher import Annex.FileMatcher
import Annex.Link import Annex.Link
import Annex.Tmp import Annex.Tmp
import Annex.HashObject
import Messages.Progress import Messages.Progress
import Git.Types
import Git.FilePath import Git.FilePath
import Config.GitConfig import Config.GitConfig
import qualified Git.UpdateIndex import Config.Smudge
import Utility.FileMode
import Utility.OptParse import Utility.OptParse
import qualified Utility.RawFilePath as R import qualified Utility.RawFilePath as R
@ -119,37 +116,26 @@ startSmall o si file =
addSmall :: CheckGitIgnore -> RawFilePath -> Annex Bool addSmall :: CheckGitIgnore -> RawFilePath -> Annex Bool
addSmall ci file = do addSmall ci file = do
showNote "non-large file; adding content to git repository" showNote "non-large file; adding content to git repository"
addFile ci file addFile Small ci file
startSmallOverridden :: AddOptions -> SeekInput -> RawFilePath -> CommandStart startSmallOverridden :: AddOptions -> SeekInput -> RawFilePath -> CommandStart
startSmallOverridden o si file = startSmallOverridden o si file =
starting "add" (ActionItemWorkTreeFile file) si $ starting "add" (ActionItemWorkTreeFile file) si $ next $ do
next $ addSmallOverridden o file
addSmallOverridden :: AddOptions -> RawFilePath -> Annex Bool
addSmallOverridden o file = do
showNote "adding content to git repository" showNote "adding content to git repository"
s <- liftIO $ R.getSymbolicLinkStatus file addFile Small (checkGitIgnoreOption o) file
if not (isRegularFile s)
then addFile (checkGitIgnoreOption o) file
else do
-- Can't use addFile because the clean filter will
-- honor annex.largefiles and it has been overridden.
-- Instead, hash the file and add to the index.
sha <- hashFile file
let ty = if isExecutable (fileMode s)
then TreeExecutable
else TreeFile
Annex.Queue.addUpdateIndex =<<
inRepo (Git.UpdateIndex.stageFile sha ty (fromRawFilePath file))
return True
addFile :: CheckGitIgnore -> RawFilePath -> Annex Bool data SmallOrLarge = Small | Large
addFile ci file = do
addFile :: SmallOrLarge -> CheckGitIgnore -> RawFilePath -> Annex Bool
addFile smallorlarge ci file = do
ps <- gitAddParams ci ps <- gitAddParams ci
Annex.Queue.addCommand [] "add" (ps++[Param "--"]) Annex.Queue.addCommand cps "add" (ps++[Param "--"])
[fromRawFilePath file] [fromRawFilePath file]
return True return True
where
cps = case smallorlarge of
Large -> []
Small -> bypassSmudgeConfig
start :: AddOptions -> SeekInput -> RawFilePath -> AddUnlockedMatcher -> CommandStart start :: AddOptions -> SeekInput -> RawFilePath -> AddUnlockedMatcher -> CommandStart
start o si file addunlockedmatcher = do start o si file addunlockedmatcher = do
@ -164,7 +150,7 @@ start o si file addunlockedmatcher = do
| otherwise -> | otherwise ->
starting "add" (ActionItemWorkTreeFile file) si $ starting "add" (ActionItemWorkTreeFile file) si $
if isSymbolicLink s if isSymbolicLink s
then next $ addFile (checkGitIgnoreOption o) file then next $ addFile Small (checkGitIgnoreOption o) file
else perform o file addunlockedmatcher else perform o file addunlockedmatcher
addpresent key = addpresent key =
liftIO (catchMaybeIO $ R.getSymbolicLinkStatus file) >>= \case liftIO (catchMaybeIO $ R.getSymbolicLinkStatus file) >>= \case
@ -180,7 +166,7 @@ start o si file addunlockedmatcher = do
starting "add" (ActionItemWorkTreeFile file) si $ starting "add" (ActionItemWorkTreeFile file) si $
addingExistingLink file key $ do addingExistingLink file key $ do
Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file) Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file)
next $ addFile (checkGitIgnoreOption o) file next $ addFile Large (checkGitIgnoreOption o) file
perform :: AddOptions -> RawFilePath -> AddUnlockedMatcher -> CommandPerform perform :: AddOptions -> RawFilePath -> AddUnlockedMatcher -> CommandPerform
perform o file addunlockedmatcher = withOtherTmp $ \tmpdir -> do perform o file addunlockedmatcher = withOtherTmp $ \tmpdir -> do

View file

@ -60,3 +60,11 @@ deconfigureSmudgeFilter = do
filter (\l -> l `notElem` stdattr && not (null l)) ls filter (\l -> l `notElem` stdattr && not (null l)) ls
unsetConfig (ConfigKey "filter.annex.smudge") unsetConfig (ConfigKey "filter.annex.smudge")
unsetConfig (ConfigKey "filter.annex.clean") unsetConfig (ConfigKey "filter.annex.clean")
-- Params to pass to git to temporarily avoid using the smudge/clean
-- filters.
bypassSmudgeConfig :: [CommandParam]
bypassSmudgeConfig = map Param
[ "-c", "filter.annex.smudge="
, "-c", "filter.annex.clean="
]

View file

@ -43,6 +43,7 @@ import Git.FilePath
import Git.Command import Git.Command
import Git.Types import Git.Types
import Git.Index import Git.Index
import Config.Smudge
import qualified Utility.RawFilePath as R import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S import qualified Data.ByteString as S
@ -237,15 +238,14 @@ reconcileStaged qh = do
liftIO $ writeFile indexcache $ showInodeCache cur liftIO $ writeFile indexcache $ showInodeCache cur
diff = diff =
-- Avoid using external diff command, which would be slow.
-- (The -G option may make it be used otherwise.)
[ Param "-c", Param "diff.external="
-- Avoid running smudge or clean filters, since we want the -- Avoid running smudge or clean filters, since we want the
-- raw output, and they would block trying to access the -- raw output, and they would block trying to access the
-- locked database. The --raw normally avoids git diff -- locked database. The --raw normally avoids git diff
-- running them, but older versions of git need this. -- running them, but older versions of git need this.
, Param "-c", Param "filter.annex.smudge=" bypassSmudgeConfig ++
, Param "-c", Param "filter.annex.clean=" -- Avoid using external diff command, which would be slow.
-- (The -G option may make it be used otherwise.)
[ Param "-c", Param "diff.external="
, Param "diff" , Param "diff"
, Param "--cached" , Param "--cached"
, Param "--raw" , Param "--raw"

View file

@ -14,3 +14,5 @@ with the existing `--force-small` too, but at least that's not the default.
Possible alternate approach: Unsetting filter.annex.smudge and Possible alternate approach: Unsetting filter.annex.smudge and
filter.annex.clean when running `git add`? filter.annex.clean when running `git add`?
> This approach is a winner! [[done]] --[[Joey]]