add: Significantly speed up adding lots of non-large files to git

* add: Significantly speed up adding lots of non-large files to git,
  by disabling the annex smudge filter when running git add.
* add --force-small: Run git add rather than updating the index itself,
  so that any enabled smudge filters other than the annex one will be
  used.
This commit is contained in:
Joey Hess 2021-01-04 13:12:28 -04:00
parent 0b2b666a38
commit 5ce61c6b2a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 43 additions and 40 deletions

View file

@ -20,6 +20,7 @@ import Git.Index
import Git.Env
import qualified Annex
import qualified Annex.Queue
import Config.Smudge
{- Runs an action using a different git index file. -}
withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a
@ -67,16 +68,12 @@ withIndexFile i = withAltRepo usecachedgitenv restoregitenv
- Smudge and clean filters are disabled in this work tree. -}
withWorkTree :: FilePath -> Annex a -> Annex a
withWorkTree d a = withAltRepo
(\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ disableSmudgeConfig }, ()))
(\g -> return $ (g { location = modlocation (location g), gitGlobalOpts = gitGlobalOpts g ++ bypassSmudgeConfig }, ()))
(\g g' -> g' { location = location g, gitGlobalOpts = gitGlobalOpts g })
(const a)
where
modlocation l@(Local {}) = l { worktree = Just (toRawFilePath d) }
modlocation _ = error "withWorkTree of non-local git repo"
disableSmudgeConfig = map Param
[ "-c", "filter.annex.smudge="
, "-c", "filter.annex.clean="
]
{- Runs an action with the git index file and HEAD, and a few other
- files that are related to the work tree coming from an overlay

View file

@ -1,3 +1,13 @@
git-annex (8.20201130) UNRELEASED; urgency=medium
* add: Significantly speed up adding lots of non-large files to git,
by disabling the annex smudge filter when running git add.
* add --force-small: Run git add rather than updating the index itself,
so that any enabled smudge filters other than the annex one will be
used.
-- Joey Hess <id@joeyh.name> Mon, 04 Jan 2021 12:52:41 -0400
git-annex (8.20201129) upstream; urgency=medium
* New borg special remote. This is a new kind of remote, that examines

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -17,13 +17,10 @@ import qualified Database.Keys
import Annex.FileMatcher
import Annex.Link
import Annex.Tmp
import Annex.HashObject
import Messages.Progress
import Git.Types
import Git.FilePath
import Config.GitConfig
import qualified Git.UpdateIndex
import Utility.FileMode
import Config.Smudge
import Utility.OptParse
import qualified Utility.RawFilePath as R
@ -119,37 +116,26 @@ startSmall o si file =
addSmall :: CheckGitIgnore -> RawFilePath -> Annex Bool
addSmall ci file = do
showNote "non-large file; adding content to git repository"
addFile ci file
addFile Small ci file
startSmallOverridden :: AddOptions -> SeekInput -> RawFilePath -> CommandStart
startSmallOverridden o si file =
starting "add" (ActionItemWorkTreeFile file) si $
next $ addSmallOverridden o file
addSmallOverridden :: AddOptions -> RawFilePath -> Annex Bool
addSmallOverridden o file = do
starting "add" (ActionItemWorkTreeFile file) si $ next $ do
showNote "adding content to git repository"
s <- liftIO $ R.getSymbolicLinkStatus file
if not (isRegularFile s)
then addFile (checkGitIgnoreOption o) file
else do
-- Can't use addFile because the clean filter will
-- honor annex.largefiles and it has been overridden.
-- Instead, hash the file and add to the index.
sha <- hashFile file
let ty = if isExecutable (fileMode s)
then TreeExecutable
else TreeFile
Annex.Queue.addUpdateIndex =<<
inRepo (Git.UpdateIndex.stageFile sha ty (fromRawFilePath file))
return True
addFile Small (checkGitIgnoreOption o) file
addFile :: CheckGitIgnore -> RawFilePath -> Annex Bool
addFile ci file = do
data SmallOrLarge = Small | Large
addFile :: SmallOrLarge -> CheckGitIgnore -> RawFilePath -> Annex Bool
addFile smallorlarge ci file = do
ps <- gitAddParams ci
Annex.Queue.addCommand [] "add" (ps++[Param "--"])
Annex.Queue.addCommand cps "add" (ps++[Param "--"])
[fromRawFilePath file]
return True
where
cps = case smallorlarge of
Large -> []
Small -> bypassSmudgeConfig
start :: AddOptions -> SeekInput -> RawFilePath -> AddUnlockedMatcher -> CommandStart
start o si file addunlockedmatcher = do
@ -164,7 +150,7 @@ start o si file addunlockedmatcher = do
| otherwise ->
starting "add" (ActionItemWorkTreeFile file) si $
if isSymbolicLink s
then next $ addFile (checkGitIgnoreOption o) file
then next $ addFile Small (checkGitIgnoreOption o) file
else perform o file addunlockedmatcher
addpresent key =
liftIO (catchMaybeIO $ R.getSymbolicLinkStatus file) >>= \case
@ -180,7 +166,7 @@ start o si file addunlockedmatcher = do
starting "add" (ActionItemWorkTreeFile file) si $
addingExistingLink file key $ do
Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file)
next $ addFile (checkGitIgnoreOption o) file
next $ addFile Large (checkGitIgnoreOption o) file
perform :: AddOptions -> RawFilePath -> AddUnlockedMatcher -> CommandPerform
perform o file addunlockedmatcher = withOtherTmp $ \tmpdir -> do

View file

@ -60,3 +60,11 @@ deconfigureSmudgeFilter = do
filter (\l -> l `notElem` stdattr && not (null l)) ls
unsetConfig (ConfigKey "filter.annex.smudge")
unsetConfig (ConfigKey "filter.annex.clean")
-- Command-line options for git that override filter.annex.smudge and
-- filter.annex.clean with empty values via -c, so the command they are
-- passed to does not use the annex smudge/clean filters. Nothing is
-- written to the git config; the override is only these parameters.
bypassSmudgeConfig :: [CommandParam]
bypassSmudgeConfig =
  [ Param "-c", Param "filter.annex.smudge="
  , Param "-c", Param "filter.annex.clean="
  ]

View file

@ -43,6 +43,7 @@ import Git.FilePath
import Git.Command
import Git.Types
import Git.Index
import Config.Smudge
import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
@ -237,15 +238,14 @@ reconcileStaged qh = do
liftIO $ writeFile indexcache $ showInodeCache cur
diff =
-- Avoid using external diff command, which would be slow.
-- (The -G option may make it be used otherwise.)
[ Param "-c", Param "diff.external="
-- Avoid running smudge or clean filters, since we want the
-- raw output, and they would block trying to access the
-- locked database. The --raw normally avoids git diff
-- running them, but older versions of git need this.
, Param "-c", Param "filter.annex.smudge="
, Param "-c", Param "filter.annex.clean="
bypassSmudgeConfig ++
-- Avoid using external diff command, which would be slow.
-- (The -G option may make it be used otherwise.)
[ Param "-c", Param "diff.external="
, Param "diff"
, Param "--cached"
, Param "--raw"

View file

@ -14,3 +14,5 @@ with the existing `--force-small` too, but at least that's not the default.
Possible alternate approach: Unsetting filter.annex.smudge and
filter.annex.clean when running `git add`?
> This approach is a winner! [[done]] --[[Joey]]