Added a annex.queuesize setting

useful when adding hundreds of thousands of files on a system with plenty
of memory.

git add gets quite slow in such a large repository, so if the system has
more than the ~32 mb of memory the queue can use by default, it's a useful
optimisation to increase the queue size, in order to decrease the number
of times git add is run.
This commit is contained in:
Joey Hess 2012-02-15 11:13:13 -04:00
parent c26db26259
commit 52c5b164d8
5 changed files with 46 additions and 22 deletions

View file

@ -76,12 +76,12 @@ data AnnexState = AnnexState
{ repo :: Git.Repo
, backends :: [BackendA Annex]
, remotes :: [Types.Remote.RemoteA Annex]
, repoqueue :: Git.Queue.Queue
, output :: OutputType
, force :: Bool
, fast :: Bool
, auto :: Bool
, branchstate :: BranchState
, repoqueue :: Maybe Git.Queue.Queue
, catfilehandle :: Maybe CatFileHandle
, checkattrhandle :: Maybe CheckAttrHandle
, forcebackend :: Maybe String
@ -100,12 +100,12 @@ newState gitrepo = AnnexState
{ repo = gitrepo
, backends = []
, remotes = []
, repoqueue = Git.Queue.new
, output = NormalOutput
, force = False
, fast = False
, auto = False
, branchstate = startBranchState
, repoqueue = Nothing
, catfilehandle = Nothing
, checkattrhandle = Nothing
, forcebackend = Nothing

View file

@ -12,30 +12,42 @@ module Annex.Queue (
) where
import Common.Annex
import Annex
import Annex hiding (new)
import qualified Git.Queue
import qualified Git.Config
{- Adds a git command to the queue. -}
add :: String -> [CommandParam] -> [FilePath] -> Annex ()
add command params files = do
q <- getState repoqueue
q <- get
store $ Git.Queue.add q command params files
{- Runs the queue if it is full. Should be called periodically. -}
flushWhenFull :: Annex ()
flushWhenFull = do
q <- getState repoqueue
q <- get
when (Git.Queue.full q) $ flush False
{- Runs (and empties) the queue. -}
flush :: Bool -> Annex ()
flush silent = do
q <- getState repoqueue
q <- get
unless (0 == Git.Queue.size q) $ do
unless silent $
showSideAction "Recording state in git"
q' <- inRepo $ Git.Queue.flush q
store q'
get :: Annex Git.Queue.Queue
get = maybe new return =<< getState repoqueue
new :: Annex Git.Queue.Queue
new = do
q <- Git.Queue.new <$> fromRepo queuesize
store q
return q
where
queuesize r = readish =<< Git.Config.getMaybe "annex.queuesize" r
store :: Git.Queue.Queue -> Annex ()
store q = changeState $ \s -> s { repoqueue = q }
store q = changeState $ \s -> s { repoqueue = Just q }

View file

@ -5,13 +5,15 @@
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE BangPatterns #-}
module Git.Queue (
Queue,
new,
add,
size,
full,
flush
flush,
) where
import qualified Data.Map as M
@ -34,7 +36,11 @@ data Action = Action
{- A queue of actions to perform (in any order) on a git repository,
- with lists of files to perform them on. This allows coalescing
- similar git commands. -}
data Queue = Queue Int (M.Map Action [FilePath])
data Queue = Queue
{ size :: Int
, _limit :: Int
, _items :: M.Map Action [FilePath]
}
deriving (Show, Eq)
{- A recommended maximum size for the queue, after which it should be
@ -46,37 +52,33 @@ data Queue = Queue Int (M.Map Action [FilePath])
- above 20k, so this is a fairly good balance -- the queue will buffer
- only a few megabytes of stuff and a minimal number of commands will be
- run by xargs. -}
maxSize :: Int
maxSize = 10240
defaultLimit :: Int
defaultLimit = 10240
{- Constructor for empty queue. -}
new :: Queue
new = Queue 0 M.empty
new :: Maybe Int -> Queue
new lim = Queue 0 (fromMaybe defaultLimit lim) M.empty
{- Adds an action to a queue. -}
add :: Queue -> String -> [CommandParam] -> [FilePath] -> Queue
add (Queue n m) subcommand params files = Queue (n + 1) m'
add (Queue cur lim m) subcommand params files = Queue (cur + 1) lim m'
where
action = Action subcommand params
-- There are probably few items in the map, but there
-- can be a lot of files per item. So, optimise adding
-- files.
m' = M.insertWith' const action fs m
fs = files ++ M.findWithDefault [] action m
{- Number of items in a queue. -}
size :: Queue -> Int
size (Queue n _) = n
!fs = files ++ M.findWithDefault [] action m
{- Is a queue large enough that it should be flushed? -}
full :: Queue -> Bool
full (Queue n _) = n > maxSize
full (Queue cur lim _) = cur > lim
{- Runs a queue on a git repository. -}
flush :: Queue -> Repo -> IO Queue
flush (Queue _ m) repo = do
flush (Queue _ lim m) repo = do
forM_ (M.toList m) $ uncurry $ runAction repo
return new
return $ Queue 0 lim M.empty
{- Runs an Action on a list of files in a git repository.
-

2
debian/changelog vendored
View file

@ -26,6 +26,8 @@ git-annex (3.20120124) UNRELEASED; urgency=low
due to lazy state update thunks when adding/fixing many files.
* Fixed some memory leaks that occurred when committing journal files.
* whereis: Prints the urls of files that the web special remote knows about.
* Added a annex.queuesize setting, useful when adding hundreds of thousands
of files on a system with plenty of memory.
-- Joey Hess <joeyh@debian.org> Tue, 24 Jan 2012 16:21:55 -0400

View file

@ -576,6 +576,14 @@ Here are all the supported configuration settings.
The default reserve is 1 megabyte.
* `annex.queuesize`
git-annex builds a queue of git commands, in order to combine similar
commands for speed. By default the size of the queue is limited to
10240 commands; this can be used to change the size. If you have plenty
of memory and are working with very large numbers of files, increasing
the queue size can speed it up.
* `annex.version`
Automatically maintained, and used to automate upgrades between versions.