Avoid git status taking a long time after git-annex unlock of many files.

Implemented by making Git.Queue have a FlushAction, which can accumulate
along with another action on files, and runs only once the other action has
run.

This lets git-annex unlock queue up git update-index actions, without
conflicting with the restagePointerFiles FlushActions.

In a repository with filter-process enabled, git-annex unlock will
often not take any more time than before, though it may when the files are
large. Either way, it should always slow down less than git-annex status
speeds up.

When filter-process is not enabled, git-annex unlock will slow down as much
as git status speeds up.

Sponsored-by: Jochen Bartl on Patreon
This commit is contained in:
Joey Hess 2022-02-18 15:06:40 -04:00
parent c68f52c6a2
commit faf84aa5c2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 76 additions and 63 deletions

View file

@ -190,7 +190,7 @@ restagePointerFile (Restage True) f orig = withTSDelta $ \tsd ->
-- fails on "../../repo/path/file" when cwd is not in the repo
-- being acted on. Avoid these problems with an absolute path.
absf <- liftIO $ absPath f
Annex.Queue.addInternalAction runner [(absf, isunmodified tsd, inodeCacheFileSize orig)]
Annex.Queue.addFlushAction runner [(absf, isunmodified tsd, inodeCacheFileSize orig)]
where
isunmodified tsd = genInodeCache f tsd >>= return . \case
Nothing -> False
@ -202,8 +202,8 @@ restagePointerFile (Restage True) f orig = withTSDelta $ \tsd ->
-- on all still-unmodified files, using a copy of the index file,
-- to bypass the lock. Then replace the old index file with the new
-- updated index file.
runner :: Git.Queue.InternalActionRunner Annex
runner = Git.Queue.InternalActionRunner "restagePointerFile" $ \r l -> do
runner :: Git.Queue.FlushActionRunner Annex
runner = Git.Queue.FlushActionRunner "restagePointerFile" $ \r l -> do
-- Flush any queued changes to the keys database, so they
-- are visible to child processes.
-- The database is closed because that may improve behavior

View file

@ -9,7 +9,7 @@
module Annex.Queue (
addCommand,
addInternalAction,
addFlushAction,
addUpdateIndex,
flush,
flushWhenFull,
@ -31,11 +31,11 @@ addCommand commonparams command params files = do
store =<< flushWhenFull =<<
(Git.Queue.addCommand commonparams command params files q =<< gitRepo)
addInternalAction :: Git.Queue.InternalActionRunner Annex -> [(RawFilePath, IO Bool, FileSize)] -> Annex ()
addInternalAction runner files = do
addFlushAction :: Git.Queue.FlushActionRunner Annex -> [(RawFilePath, IO Bool, FileSize)] -> Annex ()
addFlushAction runner files = do
q <- get
store =<< flushWhenFull =<<
(Git.Queue.addInternalAction runner files q =<< gitRepo)
(Git.Queue.addFlushAction runner files q =<< gitRepo)
{- Adds an update-index stream to the queue. -}
addUpdateIndex :: Git.UpdateIndex.Streamer -> Annex ()

View file

@ -10,6 +10,7 @@ git-annex (10.20220128) UNRELEASED; urgency=medium
* registerurl, unregisterurl: Improved output when reading from stdin
to be more like other batch commands.
* registerurl, unregisterurl: Added --json and --json-error-messages options.
* Avoid git status taking a long time after git-annex unlock of many files.
-- Joey Hess <id@joeyh.name> Mon, 31 Jan 2022 13:14:42 -0400

View file

@ -1,6 +1,6 @@
{- git repository command queue
-
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
- Copyright 2010-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -13,8 +13,8 @@ module Git.Queue (
defaultTimelimit,
addCommand,
addUpdateIndex,
addInternalAction,
InternalActionRunner(..),
addFlushAction,
FlushActionRunner(..),
size,
full,
flush,
@ -48,30 +48,34 @@ data Action m
-- ^ parameters that come after the git subcommand
, getFiles :: [CommandParam]
}
{- An internal action to run, on a list of files that can be added
- to as the queue grows. -}
| InternalAction
{ getRunner :: InternalActionRunner m
, getInternalFiles :: [(RawFilePath, IO Bool, FileSize)]
{- A FlushAction can be added along with CommandActions or
- UpdateIndexActions, and when the queue later gets flushed,
- those will be run before the FlushAction is. -}
| FlushAction
{ getFlushActionRunner :: FlushActionRunner m
, getFlushActionFiles :: [(RawFilePath, IO Bool, FileSize)]
}
{- The String must be unique for each internal action. -}
data InternalActionRunner m = InternalActionRunner String (Repo -> [(RawFilePath, IO Bool, FileSize)] -> m ())
{- The String must be unique for each flush action. -}
data FlushActionRunner m = FlushActionRunner String (Repo -> [(RawFilePath, IO Bool, FileSize)] -> m ())
instance Eq (InternalActionRunner m) where
InternalActionRunner s1 _ == InternalActionRunner s2 _ = s1 == s2
instance Eq (FlushActionRunner m) where
FlushActionRunner s1 _ == FlushActionRunner s2 _ = s1 == s2
{- A key that can uniquely represent an action in a Map. -}
{- A key that can uniquely represent an action in a Map.
-
- The ordering controls what order the actions are run in when flushing
- the queue. -}
data ActionKey
= UpdateIndexActionKey
| CommandActionKey [CommandParam] String [CommandParam]
| InternalActionKey String
| FlushActionKey String
deriving (Eq, Ord)
actionKey :: Action m -> ActionKey
actionKey (UpdateIndexAction _) = UpdateIndexActionKey
actionKey CommandAction { getCommonParams = c, getSubcommand = s, getParams = p } = CommandActionKey c s p
actionKey InternalAction { getRunner = InternalActionRunner s _ } = InternalActionKey s
actionKey FlushAction { getFlushActionRunner = FlushActionRunner s _ } = FlushActionKey s
{- A queue of actions to perform (in any order) on a git repository,
- with lists of files to perform them on. This allows coalescing
@ -120,7 +124,7 @@ new lim tlim = do
-}
addCommand :: MonadIO m => [CommandParam] -> String -> [CommandParam] -> [FilePath] -> Queue m -> Repo -> m (Queue m)
addCommand commonparams subcommand params files q repo =
updateQueue action different (length files) q repo
updateQueue action conflicting (length files) q repo
where
action = CommandAction
{ getCommonParams = commonparams
@ -129,36 +133,37 @@ addCommand commonparams subcommand params files q repo =
, getFiles = map File files
}
different (CommandAction { getSubcommand = s }) = s /= subcommand
different _ = True
conflicting (CommandAction { getSubcommand = s }) = s /= subcommand
conflicting (FlushAction {}) = False
conflicting _ = True
{- Adds an internal action to the queue. -}
addInternalAction :: MonadIO m => InternalActionRunner m -> [(RawFilePath, IO Bool, FileSize)] -> Queue m -> Repo -> m (Queue m)
addInternalAction runner files q repo =
updateQueue action different (length files) q repo
{- Adds an flush action to the queue. This can co-exist with anything else
- that gets added to the queue, and when the queue is eventually flushed,
- it will be run after the other things in the queue. -}
addFlushAction :: MonadIO m => FlushActionRunner m -> [(RawFilePath, IO Bool, FileSize)] -> Queue m -> Repo -> m (Queue m)
addFlushAction runner files q repo =
updateQueue action (const False) (length files) q repo
where
action = InternalAction
{ getRunner = runner
, getInternalFiles = files
action = FlushAction
{ getFlushActionRunner = runner
, getFlushActionFiles = files
}
different (InternalAction { getRunner = r }) = r /= runner
different _ = True
{- Adds an update-index streamer to the queue. -}
addUpdateIndex :: MonadIO m => Git.UpdateIndex.Streamer -> Queue m -> Repo -> m (Queue m)
addUpdateIndex streamer q repo =
updateQueue action different 1 q repo
updateQueue action conflicting 1 q repo
where
-- the list is built in reverse order
action = UpdateIndexAction [streamer]
different (UpdateIndexAction _) = False
different _ = True
conflicting (UpdateIndexAction _) = False
conflicting (FlushAction {}) = False
conflicting _ = True
{- Updates or adds an action in the queue.
-
- If the queue already contains a different action, it will be flushed
- If the queue already contains a conflicting action, it will be flushed
- before adding the action; this is to ensure that conflicting actions,
- like add and rm, are run in the right order.
-
@ -166,19 +171,19 @@ addUpdateIndex streamer q repo =
- and the action will be run right away.
-}
updateQueue :: MonadIO m => Action m -> (Action m -> Bool) -> Int -> Queue m -> Repo -> m (Queue m)
updateQueue !action different sizeincrease q repo = do
updateQueue !action conflicting sizeincrease q repo = do
now <- liftIO getPOSIXTime
if now - (_lastchanged q) > _timelimit q
then if isdifferent
then if isconflicting
then do
q' <- flush q repo
flush (mk q') repo
else flush (mk q) repo
else if isdifferent
else if isconflicting
then mk <$> flush q repo
else return $ mk (q { _lastchanged = now })
where
isdifferent = not (null (filter different (M.elems (items q))))
isconflicting = not (null (filter conflicting (M.elems (items q))))
mk q' = newq
where
!newq = q'
@ -196,8 +201,8 @@ combineNewOld (CommandAction _cps1 _sc1 _ps1 fs1) (CommandAction cps2 sc2 ps2 fs
CommandAction cps2 sc2 ps2 (fs1++fs2)
combineNewOld (UpdateIndexAction s1) (UpdateIndexAction s2) =
UpdateIndexAction (s1++s2)
combineNewOld (InternalAction _r1 fs1) (InternalAction r2 fs2) =
InternalAction r2 (fs1++fs2)
combineNewOld (FlushAction _r1 fs1) (FlushAction r2 fs2) =
FlushAction r2 (fs1++fs2)
combineNewOld anew _aold = anew
{- Merges the contents of the second queue into the first.
@ -257,6 +262,6 @@ runAction repo action@(CommandAction {}) = liftIO $ do
forceSuccessProcess p pid
go _ _ _ _ _ = error "internal"
#endif
runAction repo action@(InternalAction {}) =
let InternalActionRunner _ runner = getRunner action
in runner repo (getInternalFiles action)
runAction repo action@(FlushAction {}) =
let FlushActionRunner _ runner = getFlushActionRunner action
in runner repo (getFlushActionFiles action)

View file

@ -8,17 +8,6 @@ git annex filter-process
# DESCRIPTION
By default, `git-annex smudge` is used as the git filter driver.
This is an alternative way to accomplish the same thing, using git's
long-running filter process interface.
To enable using this, run:
git config filter.annex.process 'git-annex filter-process'
There will be no visible difference in behavior between enabling this and
not, besides changes in speed and memory use when using git.
When this is not enabled, each file that git wants to filter involves
starting up a new `git-annex smudge` process. Starting many such processes
for many files can be slow, and can make commands like `git checkout` and
@ -28,10 +17,25 @@ non-annexed files do slow it down.)
On the other hand when this is enabled, `git add` of a large file does an
unncessary extra read of the file, and pipes its contents into git-annex.
So if you enable this, it will be faster to use `git-annex add` to add
So when this is enabled, it will be faster to use `git-annex add` to add
large files to the annex, rather than `git add`. Other commands that
add files, like `git commit -a`, are also impacted by this.
This is used by default in git-annex repositories v9 and above, while
v8 repositories use `git-annex smudge` for backwards compatability with
older versions of git-annex.
To enable this in a v8 repository, run:
git config filter.annex.process 'git-annex filter-process'
To disable it, you can just unset the config:
git config --unset filter.annex.process
There will be no visible difference in behavior between enabling this and
not, besides changes in speed and memory use when using git.
# OPTIONS
* The [[git-annex-common-options]](1) can be used.

View file

@ -27,10 +27,13 @@ commit -a`). Afterwards, `git status` then smudged it again, unsure why!
> slowed git-annex unlock. But git status did then avoid doing any more
> smudgeing. It seems that each call to restagePointerFile is running
> git update-index, so still one git-annex smudge per file, rather
> than combining several together.
> than combining several together. In a v8 repo, the same amount of work
> is done either way. In v9+, this is actually slower than before when the
> files are small.
>
> That's because restagePointerFile uses the git queue, and unlock
> restagePointerFile was running git update-index once per file
> because restagePointerFile uses the git queue, and unlock
> also queues a git add or something, so the queue isn't able to built
> up because two dissimilar things are being queued. This seems an
> unncessary behavior; it could queue up all the git adds and then
> run restagePointerFile after them all.
> run restagePointerFile after them all. Implemented that, and [[done]]! --[[Joey]]