From 82186ca58f302aeb6a83b5234c93f8d33bb857e5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 May 2019 13:24:31 -0400 Subject: [PATCH] annex.jobs=cpus etc Added the ability to run one job per CPU (core), by setting annex.jobs=cpus, or using option --jobs=cpus or -Jcpus. Built with future expansion in mind, including not defaulting matching on Concurrency so more constructors can later be added, and using "cpu" instead of "0". --- Annex/Ssh.hs | 3 +- Annex/Transfer.hs | 29 +++++++----- CHANGELOG | 2 + CmdLine/Action.hs | 45 +++++++++++++------ CmdLine/GitAnnex/Options.hs | 7 +-- Messages.hs | 8 ++-- Types/Concurrency.hs | 9 +++- Types/GitConfig.hs | 3 +- doc/git-annex-add.mdwn | 2 + doc/git-annex-addurl.mdwn | 2 + doc/git-annex-copy.mdwn | 2 + doc/git-annex-drop.mdwn | 2 + doc/git-annex-fsck.mdwn | 2 + doc/git-annex-get.mdwn | 2 + doc/git-annex-import.mdwn | 2 + doc/git-annex-mirror.mdwn | 2 + doc/git-annex-move.mdwn | 2 + doc/git-annex-sync.mdwn | 2 + doc/git-annex.mdwn | 2 + ...tion_to_use_all_processors_by_default.mdwn | 9 ++++ 20 files changed, 105 insertions(+), 32 deletions(-) diff --git a/Annex/Ssh.hs b/Annex/Ssh.hs index ab757fc048..a4cb5013eb 100644 --- a/Annex/Ssh.hs +++ b/Annex/Ssh.hs @@ -189,8 +189,9 @@ prepSocket socketfile sshhost sshparams = do let socketlock = socket2lock socketfile Annex.getState Annex.concurrency >>= \case + NonConcurrent -> return () Concurrent {} -> makeconnection socketlock - _ -> return () + ConcurrentPerCpu -> makeconnection socketlock lockFileCached socketlock where diff --git a/Annex/Transfer.hs b/Annex/Transfer.hs index 419c6a3f0f..b9b5354574 100644 --- a/Annex/Transfer.hs +++ b/Annex/Transfer.hs @@ -247,20 +247,29 @@ pickRemote l a = debugLocks $ go l =<< Annex.getState Annex.concurrency where go [] _ = return observeFailure go (r:[]) _ = a r - go rs (Concurrent n) | n > 1 = do - mv <- Annex.getState Annex.activeremotes - active <- liftIO $ takeMVar mv - let rs' = sortBy (lessActiveFirst active) rs - goconcurrent mv active rs' - go (r:rs) _ = do + go rs NonConcurrent = gononconcurrent rs + go rs (Concurrent n) + | n <= 1 = gononconcurrent rs + | otherwise = goconcurrent rs + go rs ConcurrentPerCpu = goconcurrent rs + + gononconcurrent [] = return observeFailure + gononconcurrent (r:rs) = do ok <- a r if observeBool ok then return ok - else go rs NonConcurrent - goconcurrent mv active [] = do + else gononconcurrent rs + + goconcurrent rs = do + mv <- Annex.getState Annex.activeremotes + active <- liftIO $ takeMVar mv + let rs' = sortBy (lessActiveFirst active) rs + goconcurrent' mv active rs' + + goconcurrent' mv active [] = do liftIO $ putMVar mv active return observeFailure - goconcurrent mv active (r:rs) = do + goconcurrent' mv active (r:rs) = do let !active' = M.insertWith (+) r 1 active liftIO $ putMVar mv active' let getnewactive = do @@ -279,7 +288,7 @@ pickRemote l a = debugLocks $ go l =<< Annex.getState Annex.concurrency -- because other threads could have -- been assigned them in the meantime. let rs' = sortBy (lessActiveFirst active'') rs - goconcurrent mv active'' rs' + goconcurrent' mv active'' rs' lessActiveFirst :: M.Map Remote Integer -> Remote -> Remote -> Ordering lessActiveFirst active a b diff --git a/CHANGELOG b/CHANGELOG index b4723ce055..dbcab637ee 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ git-annex (7.20190508) UNRELEASED; urgency=medium * Fixed bug that caused git-annex to fail to add a file when another git-annex process cleaned up the temp directory it was using. * Makefile: Added install-completions to install target. + * Added the ability to run one job per CPU (core), by setting + annex.jobs=cpus, or using option --jobs=cpus or -Jcpus. -- Joey Hess Mon, 06 May 2019 13:52:02 -0400 diff --git a/CmdLine/Action.hs b/CmdLine/Action.hs index 0a65673827..9713b1295d 100644 --- a/CmdLine/Action.hs +++ b/CmdLine/Action.hs @@ -22,6 +22,7 @@ import Control.Concurrent import Control.Concurrent.Async import Control.Concurrent.STM import Control.Exception (throwIO) +import GHC.Conc import Data.Either import qualified Data.Map.Strict as M import qualified System.Console.Regions as Regions @@ -51,9 +52,14 @@ performCommandAction Command { cmdcheck = c, cmdname = name } seek cont = do - This should only be run in the seek stage. -} commandAction :: CommandStart -> Annex () -commandAction a = go =<< Annex.getState Annex.concurrency +commandAction a = Annex.getState Annex.concurrency >>= \case + NonConcurrent -> run + Concurrent n -> runconcurrent n + ConcurrentPerCpu -> runconcurrent =<< liftIO getNumProcessors where - go (Concurrent n) = do + run = void $ includeCommandAction a + + runconcurrent n = do ws <- Annex.getState Annex.workers (st, ws') <- if null ws then do @@ -71,8 +77,6 @@ commandAction a = go =<< Annex.getState Annex.concurrency w <- liftIO $ async $ snd <$> Annex.run st (inOwnConsoleRegion (Annex.output st) run) Annex.changeState $ \s -> s { Annex.workers = Right w:ws' } - go NonConcurrent = run - run = void $ includeCommandAction a commandActions :: [CommandStart] -> Annex () commandActions = mapM_ commandAction @@ -170,18 +174,23 @@ allowConcurrentOutput :: Annex a -> Annex a allowConcurrentOutput a = do fromcmdline <- Annex.getState Annex.concurrency fromgitcfg <- annexJobs <$> Annex.getGitConfig + let usegitcfg = Annex.changeState $ + \c -> c { Annex.concurrency = fromgitcfg } case (fromcmdline, fromgitcfg) of (NonConcurrent, NonConcurrent) -> a - (Concurrent n, _) -> goconcurrent n + (Concurrent n, _) -> do + raisecapabilitiesto n + goconcurrent + (ConcurrentPerCpu, _) -> goconcurrent (NonConcurrent, Concurrent n) -> do - Annex.changeState $ - \c -> c { Annex.concurrency = fromgitcfg } - goconcurrent n + usegitcfg + raisecapabilitiesto n + goconcurrent + (NonConcurrent, ConcurrentPerCpu) -> do + usegitcfg + goconcurrent where - goconcurrent n = do - c <- liftIO getNumCapabilities - when (n > c) $ - liftIO $ setNumCapabilities n + goconcurrent = do withMessageState $ \s -> case outputType s of NormalOutput -> ifM (liftIO concurrentOutputSupported) ( Regions.displayConsoleRegions $ @@ -190,13 +199,21 @@ allowConcurrentOutput a = do ) _ -> goconcurrent' False goconcurrent' b = bracket_ (setup b) cleanup a + setup = setconcurrentoutputenabled + cleanup = do finishCommandActions setconcurrentoutputenabled False + setconcurrentoutputenabled b = Annex.changeState $ \s -> s { Annex.output = (Annex.output s) { concurrentOutputEnabled = b } } + raisecapabilitiesto n = do + c <- liftIO getNumCapabilities + when (n > c) $ + liftIO $ setNumCapabilities n + {- Ensures that only one thread processes a key at a time. - Other threads will block until it's done. -} onlyActionOn :: Key -> CommandStart -> CommandStart @@ -212,7 +229,9 @@ onlyActionOn' :: Key -> Annex a -> Annex a onlyActionOn' k a = go =<< Annex.getState Annex.concurrency where go NonConcurrent = a - go (Concurrent _) = do + go (Concurrent _) = goconcurrent + go ConcurrentPerCpu = goconcurrent + goconcurrent = do tv <- Annex.getState Annex.activekeys bracket (setup tv) id (const a) setup tv = liftIO $ do diff --git a/CmdLine/GitAnnex/Options.hs b/CmdLine/GitAnnex/Options.hs index 55f2612eb1..2a57d59037 100644 --- a/CmdLine/GitAnnex/Options.hs +++ b/CmdLine/GitAnnex/Options.hs @@ -366,14 +366,15 @@ jsonProgressOption = jobsOption :: [GlobalOption] jobsOption = [ globalSetter set $ - option auto - ( long "jobs" <> short 'J' <> metavar paramNumber + option (maybeReader parseConcurrency) + ( long "jobs" <> short 'J' + <> metavar (paramNumber `paramOr` "cpu") <> help "enable concurrent jobs" <> hidden ) ] where - set n = Annex.changeState $ \s -> s { Annex.concurrency = Concurrent n } + set v = Annex.changeState $ \s -> s { Annex.concurrency = v } timeLimitOption :: [GlobalOption] timeLimitOption = diff --git a/Messages.hs b/Messages.hs index 4d353f9ed5..729b891c9e 100644 --- a/Messages.hs +++ b/Messages.hs @@ -262,10 +262,12 @@ implicitMessage = whenM (implicitMessages <$> Annex.getState Annex.output) - the user. -} prompt :: Annex a -> Annex a -prompt a = debugLocks $ go =<< Annex.getState Annex.concurrency +prompt a = debugLocks $ Annex.getState Annex.concurrency >>= \case + NonConcurrent -> a + (Concurrent _) -> goconcurrent + ConcurrentPerCpu -> goconcurrent where - go NonConcurrent = a - go (Concurrent {}) = withMessageState $ \s -> do + goconcurrent = withMessageState $ \s -> do let l = promptLock s bracketIO (takeMVar l) diff --git a/Types/Concurrency.hs b/Types/Concurrency.hs index 7e8778d252..fa58798640 100644 --- a/Types/Concurrency.hs +++ b/Types/Concurrency.hs @@ -5,4 +5,11 @@ module Types.Concurrency where -data Concurrency = NonConcurrent | Concurrent Int +import Utility.PartialPrelude + +data Concurrency = NonConcurrent | Concurrent Int | ConcurrentPerCpu + +parseConcurrency :: String -> Maybe Concurrency +parseConcurrency "cpus" = Just ConcurrentPerCpu +parseConcurrency "cpu" = Just ConcurrentPerCpu +parseConcurrency s = Concurrent <$> readish s diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index a5035fa487..dfcd010cb7 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -177,7 +177,8 @@ extractGitConfig r = GitConfig , annexAllowUnverifiedDownloads = (== Just "ACKTHPPT") $ getmaybe (annex "security.allow-unverified-downloads") , annexMaxExtensionLength = getmayberead (annex "maxextensionlength") - , annexJobs = maybe NonConcurrent Concurrent $ getmayberead (annex "jobs") + , annexJobs = fromMaybe NonConcurrent $ + parseConcurrency =<< getmaybe (annex "jobs") , annexCacheCreds = getbool (annex "cachecreds") True , coreSymlinks = getbool "core.symlinks" True , coreSharedRepository = getSharedRepository r diff --git a/doc/git-annex-add.mdwn b/doc/git-annex-add.mdwn index aab54d090d..9d80e55560 100644 --- a/doc/git-annex-add.mdwn +++ b/doc/git-annex-add.mdwn @@ -56,6 +56,8 @@ annexed content, and other symlinks. Adds multiple files in parallel. This may be faster. For example: `-J4` + Setting this to "cpus" will run one job per CPU core. + * `--update` `-u` Like `git add --update`, this does not add new files, but any updates diff --git a/doc/git-annex-addurl.mdwn b/doc/git-annex-addurl.mdwn index 62e05d468b..9854f1789a 100644 --- a/doc/git-annex-addurl.mdwn +++ b/doc/git-annex-addurl.mdwn @@ -78,6 +78,8 @@ be used to get better filenames. Enables parallel downloads when multiple urls are being added. For example: `-J4` + Setting this to "cpus" will run one job per CPU core. + * `--batch` Enables batch mode, in which lines containing urls to add are read from diff --git a/doc/git-annex-copy.mdwn b/doc/git-annex-copy.mdwn index 92fe6bee9e..ab60c43d39 100644 --- a/doc/git-annex-copy.mdwn +++ b/doc/git-annex-copy.mdwn @@ -34,6 +34,8 @@ Copies the content of files from or to another remote. Enables parallel transfers with up to the specified number of jobs running at once. For example: `-J10` + Setting this to "cpus" will run one job per CPU core. + * `--auto` Rather than copying all files, only copy files that don't yet have diff --git a/doc/git-annex-drop.mdwn b/doc/git-annex-drop.mdwn index 35e890f75c..3a3a4649cb 100644 --- a/doc/git-annex-drop.mdwn +++ b/doc/git-annex-drop.mdwn @@ -77,6 +77,8 @@ safe to do so. when git-annex has to contact remotes to check if it can drop files. For example: `-J4` + Setting this to "cpus" will run one job per CPU core. + * `--batch` Enables batch mode, in which lines containing names of files to drop diff --git a/doc/git-annex-fsck.mdwn b/doc/git-annex-fsck.mdwn index 83b86676b2..369a583ca0 100644 --- a/doc/git-annex-fsck.mdwn +++ b/doc/git-annex-fsck.mdwn @@ -93,6 +93,8 @@ With parameters, only the specified files are checked. Runs multiple fsck jobs in parallel. For example: `-J4` + Setting this to "cpus" will run one job per CPU core. + * `--json` Enable JSON output. This is intended to be parsed by programs that use diff --git a/doc/git-annex-get.mdwn b/doc/git-annex-get.mdwn index 72980bcd53..230aa6422a 100644 --- a/doc/git-annex-get.mdwn +++ b/doc/git-annex-get.mdwn @@ -33,6 +33,8 @@ or transferring them from some kind of key-value store. Enables parallel download with up to the specified number of jobs running at once. For example: `-J10` + Setting this to "cpus" will run one job per CPU core. + When files can be downloaded from multiple remotes, enabling parallel downloads will split the load between the remotes. For example, if the files are available on remotes A and B, then one file will be diff --git a/doc/git-annex-import.mdwn b/doc/git-annex-import.mdwn index 1a8ed74fbb..b34cd78223 100644 --- a/doc/git-annex-import.mdwn +++ b/doc/git-annex-import.mdwn @@ -141,6 +141,8 @@ and `--reinject-duplicates` documentation below. Imports multiple files in parallel. This may be faster. For example: `-J4` + Setting this to "cpus" will run one job per CPU core. + * `--json` Enable JSON output. This is intended to be parsed by programs that use diff --git a/doc/git-annex-mirror.mdwn b/doc/git-annex-mirror.mdwn index c66a0dd9f4..2c6ef31b8d 100644 --- a/doc/git-annex-mirror.mdwn +++ b/doc/git-annex-mirror.mdwn @@ -36,6 +36,8 @@ contents. Use [[git-annex-sync]](1) for that. Enables parallel transfers with up to the specified number of jobs running at once. For example: `-J10` + Setting this to "cpus" will run one job per CPU core. + * `--all` `-A` Mirror all objects stored in the git annex, not only objects used by diff --git a/doc/git-annex-move.mdwn b/doc/git-annex-move.mdwn index d2160b6f61..8023816aff 100644 --- a/doc/git-annex-move.mdwn +++ b/doc/git-annex-move.mdwn @@ -40,6 +40,8 @@ Moves the content of files from or to another remote. Enables parallel transfers with up to the specified number of jobs running at once. For example: `-J10` + Setting this to "cpus" will run one job per CPU core. + * `--all` `-A` Rather than specifying a filename or path to move, this option can be diff --git a/doc/git-annex-sync.mdwn b/doc/git-annex-sync.mdwn index 83969408c6..440aea59fd 100644 --- a/doc/git-annex-sync.mdwn +++ b/doc/git-annex-sync.mdwn @@ -112,6 +112,8 @@ by running "git annex sync" on the remote. Enables parallel syncing with up to the specified number of jobs running at once. For example: `-J10` + Setting this to "cpus" will run one job per CPU core. + When there are multiple git remotes, pushes will be made to them in parallel. Pulls are not done in parallel because that tends to be less efficient. When --content is synced, the files are processed diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 5ac7581ee9..250a55a528 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -952,6 +952,8 @@ Here are all the supported configuration settings. Only git-annex commands that support the --jobs option will use this. + Setting this to "cpus" will run one job per CPU core. + * `annex.queuesize` git-annex builds a queue of git commands, in order to combine similar diff --git a/doc/todo/config_option_to_use_all_processors_by_default.mdwn b/doc/todo/config_option_to_use_all_processors_by_default.mdwn index 379148c3ca..63b9bed3e4 100644 --- a/doc/todo/config_option_to_use_all_processors_by_default.mdwn +++ b/doc/todo/config_option_to_use_all_processors_by_default.mdwn @@ -1 +1,10 @@ Can you add a global config flag to tell parallelizable commands to use all available cores? Often I forget to add -JN when it would have sped things up. + +> Added as --jobs=cpus / annex.jobs=cpus. This will allow +> later expansion, perhaps `--jobs=cpus-1` to leave a spare core +> or `--jobs=remotes*2` to run two jobs per remote, or things like that. +> +> It's a bit more typing than -J0, but since it can be configured once in +> annex.jobs, that seemed an acceptable tradeoff to future proof it. +> +> [[done]] --[[Joey]]