Added an annex.queuesize setting

useful when adding hundreds of thousands of files on a system with plenty
of memory.

git add gets quite slow in such a large repository, so if the system has
more than the ~32 MB of memory the queue can use by default, it's a useful
optimisation to increase the queue size, in order to decrease the number
of times git add is run.
This commit is contained in:
Joey Hess 2012-02-15 11:13:13 -04:00
parent c26db26259
commit 52c5b164d8
5 changed files with 46 additions and 22 deletions

View file

@ -76,12 +76,12 @@ data AnnexState = AnnexState
{ repo :: Git.Repo { repo :: Git.Repo
, backends :: [BackendA Annex] , backends :: [BackendA Annex]
, remotes :: [Types.Remote.RemoteA Annex] , remotes :: [Types.Remote.RemoteA Annex]
, repoqueue :: Git.Queue.Queue
, output :: OutputType , output :: OutputType
, force :: Bool , force :: Bool
, fast :: Bool , fast :: Bool
, auto :: Bool , auto :: Bool
, branchstate :: BranchState , branchstate :: BranchState
, repoqueue :: Maybe Git.Queue.Queue
, catfilehandle :: Maybe CatFileHandle , catfilehandle :: Maybe CatFileHandle
, checkattrhandle :: Maybe CheckAttrHandle , checkattrhandle :: Maybe CheckAttrHandle
, forcebackend :: Maybe String , forcebackend :: Maybe String
@ -100,12 +100,12 @@ newState gitrepo = AnnexState
{ repo = gitrepo { repo = gitrepo
, backends = [] , backends = []
, remotes = [] , remotes = []
, repoqueue = Git.Queue.new
, output = NormalOutput , output = NormalOutput
, force = False , force = False
, fast = False , fast = False
, auto = False , auto = False
, branchstate = startBranchState , branchstate = startBranchState
, repoqueue = Nothing
, catfilehandle = Nothing , catfilehandle = Nothing
, checkattrhandle = Nothing , checkattrhandle = Nothing
, forcebackend = Nothing , forcebackend = Nothing

View file

@ -12,30 +12,42 @@ module Annex.Queue (
) where ) where
import Common.Annex import Common.Annex
import Annex import Annex hiding (new)
import qualified Git.Queue import qualified Git.Queue
import qualified Git.Config
{- Adds a git command to the queue. -} {- Adds a git command to the queue. -}
add :: String -> [CommandParam] -> [FilePath] -> Annex () add :: String -> [CommandParam] -> [FilePath] -> Annex ()
add command params files = do add command params files = do
q <- getState repoqueue q <- get
store $ Git.Queue.add q command params files store $ Git.Queue.add q command params files
{- Runs the queue if it is full. Should be called periodically. -} {- Runs the queue if it is full. Should be called periodically. -}
flushWhenFull :: Annex () flushWhenFull :: Annex ()
flushWhenFull = do flushWhenFull = do
q <- getState repoqueue q <- get
when (Git.Queue.full q) $ flush False when (Git.Queue.full q) $ flush False
{- Runs (and empties) the queue. -} {- Runs (and empties) the queue. -}
flush :: Bool -> Annex () flush :: Bool -> Annex ()
flush silent = do flush silent = do
q <- getState repoqueue q <- get
unless (0 == Git.Queue.size q) $ do unless (0 == Git.Queue.size q) $ do
unless silent $ unless silent $
showSideAction "Recording state in git" showSideAction "Recording state in git"
q' <- inRepo $ Git.Queue.flush q q' <- inRepo $ Git.Queue.flush q
store q' store q'
get :: Annex Git.Queue.Queue
get = maybe new return =<< getState repoqueue
new :: Annex Git.Queue.Queue
new = do
q <- Git.Queue.new <$> fromRepo queuesize
store q
return q
where
queuesize r = readish =<< Git.Config.getMaybe "annex.queuesize" r
store :: Git.Queue.Queue -> Annex () store :: Git.Queue.Queue -> Annex ()
store q = changeState $ \s -> s { repoqueue = q } store q = changeState $ \s -> s { repoqueue = Just q }

View file

@ -5,13 +5,15 @@
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
{-# LANGUAGE BangPatterns #-}
module Git.Queue ( module Git.Queue (
Queue, Queue,
new, new,
add, add,
size, size,
full, full,
flush flush,
) where ) where
import qualified Data.Map as M import qualified Data.Map as M
@ -34,7 +36,11 @@ data Action = Action
{- A queue of actions to perform (in any order) on a git repository, {- A queue of actions to perform (in any order) on a git repository,
- with lists of files to perform them on. This allows coalescing - with lists of files to perform them on. This allows coalescing
- similar git commands. -} - similar git commands. -}
data Queue = Queue Int (M.Map Action [FilePath]) data Queue = Queue
{ size :: Int
, _limit :: Int
, _items :: M.Map Action [FilePath]
}
deriving (Show, Eq) deriving (Show, Eq)
{- A recommended maximum size for the queue, after which it should be {- A recommended maximum size for the queue, after which it should be
@ -46,37 +52,33 @@ data Queue = Queue Int (M.Map Action [FilePath])
- above 20k, so this is a fairly good balance -- the queue will buffer - above 20k, so this is a fairly good balance -- the queue will buffer
- only a few megabytes of stuff and a minimal number of commands will be - only a few megabytes of stuff and a minimal number of commands will be
- run by xargs. -} - run by xargs. -}
maxSize :: Int defaultLimit :: Int
maxSize = 10240 defaultLimit = 10240
{- Constructor for empty queue. -} {- Constructor for empty queue. -}
new :: Queue new :: Maybe Int -> Queue
new = Queue 0 M.empty new lim = Queue 0 (fromMaybe defaultLimit lim) M.empty
{- Adds an action to a queue. -} {- Adds an action to a queue. -}
add :: Queue -> String -> [CommandParam] -> [FilePath] -> Queue add :: Queue -> String -> [CommandParam] -> [FilePath] -> Queue
add (Queue n m) subcommand params files = Queue (n + 1) m' add (Queue cur lim m) subcommand params files = Queue (cur + 1) lim m'
where where
action = Action subcommand params action = Action subcommand params
-- There are probably few items in the map, but there -- There are probably few items in the map, but there
-- can be a lot of files per item. So, optimise adding -- can be a lot of files per item. So, optimise adding
-- files. -- files.
m' = M.insertWith' const action fs m m' = M.insertWith' const action fs m
fs = files ++ M.findWithDefault [] action m !fs = files ++ M.findWithDefault [] action m
{- Number of items in a queue. -}
size :: Queue -> Int
size (Queue n _) = n
{- Is a queue large enough that it should be flushed? -} {- Is a queue large enough that it should be flushed? -}
full :: Queue -> Bool full :: Queue -> Bool
full (Queue n _) = n > maxSize full (Queue cur lim _) = cur > lim
{- Runs a queue on a git repository. -} {- Runs a queue on a git repository. -}
flush :: Queue -> Repo -> IO Queue flush :: Queue -> Repo -> IO Queue
flush (Queue _ m) repo = do flush (Queue _ lim m) repo = do
forM_ (M.toList m) $ uncurry $ runAction repo forM_ (M.toList m) $ uncurry $ runAction repo
return new return $ Queue 0 lim M.empty
{- Runs an Action on a list of files in a git repository. {- Runs an Action on a list of files in a git repository.
- -

2
debian/changelog vendored
View file

@ -26,6 +26,8 @@ git-annex (3.20120124) UNRELEASED; urgency=low
due to lazy state update thunks when adding/fixing many files. due to lazy state update thunks when adding/fixing many files.
* Fixed some memory leaks that occurred when committing journal files. * Fixed some memory leaks that occurred when committing journal files.
* whereis: Prints the urls of files that the web special remote knows about. * whereis: Prints the urls of files that the web special remote knows about.
* Added an annex.queuesize setting, useful when adding hundreds of thousands
of files on a system with plenty of memory.
-- Joey Hess <joeyh@debian.org> Tue, 24 Jan 2012 16:21:55 -0400 -- Joey Hess <joeyh@debian.org> Tue, 24 Jan 2012 16:21:55 -0400

View file

@ -576,6 +576,14 @@ Here are all the supported configuration settings.
The default reserve is 1 megabyte. The default reserve is 1 megabyte.
* `annex.queuesize`
git-annex builds a queue of git commands, in order to combine similar
commands for speed. By default the size of the queue is limited to
10240 commands; this can be used to change the size. If you have plenty
of memory and are working with very large numbers of files, increasing
the queue size can speed git-annex up.
* `annex.version` * `annex.version`
Automatically maintained, and used to automate upgrades between versions. Automatically maintained, and used to automate upgrades between versions.