From 771a122c9e601cbdaeba692d36ba89de1d56c77e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 4 Jun 2021 16:08:42 -0400 Subject: [PATCH] add --size-limit option When this option is not used, there should be effectively no added overhead, thanks to the optimisation in b3cd0cc6ba4e5b9e2ae0abd9c8b2ec32475e09d2. When an action fails on a file, the size of the file still counts toward the size limit. This was necessary to support concurrency, but also generally seems like the right choice. Most commands that operate on annexed files support the option. export and import do not, and I don't know if it would make sense for export to.. Why would you want an incomplete export? sync doesn't, and while it would be easy to make it support it for transferring files, it's not clear if dropping files should also take the size limit into account. Commands like add that don't operate on annexed files don't support the option either. Exiting 101 not yet implemented. Sponsored-by: Denis Dzyubenko on Patreon --- Annex.hs | 2 + CHANGELOG | 1 + CmdLine/Action.hs | 64 ++++++++++++++----- CmdLine/GitAnnex/Options.hs | 18 +++++- doc/git-annex-common-options.mdwn | 18 +++++- ..._195e395aab22f31da1dab0ba95f88ef4._comment | 34 ++++++++++ 6 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 doc/todo/size_limits_for_drop__47__move__47__copy__47__get/comment_1_195e395aab22f31da1dab0ba95f88ef4._comment diff --git a/Annex.hs b/Annex.hs index 5168b64113..c9feb7540e 100644 --- a/Annex.hs +++ b/Annex.hs @@ -174,6 +174,7 @@ data AnnexState = AnnexState , forcemincopies :: Maybe MinCopies , limit :: ExpandableMatcher Annex , timelimit :: Maybe (Duration, POSIXTime) + , sizelimit :: Maybe (TVar Integer) , uuiddescmap :: Maybe UUIDDescMap , preferredcontentmap :: Maybe (FileMatcherMap Annex) , requiredcontentmap :: Maybe (FileMatcherMap Annex) @@ -232,6 +233,7 @@ newAnnexState c r = do , forcemincopies = Nothing , limit = BuildingMatcher [] , timelimit = Nothing + , sizelimit = 
Nothing , uuiddescmap = Nothing , preferredcontentmap = Nothing , requiredcontentmap = Nothing diff --git a/CHANGELOG b/CHANGELOG index 05b190b134..47d1185ef1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -28,6 +28,7 @@ git-annex (8.20210429) UNRELEASED; urgency=medium * init: When annex.commitmessage is set, use that message for the commit that creates the git-annex branch. * Added annex.adviceNoSshCaching config. + * Added --size-limit option. -- Joey Hess Mon, 03 May 2021 10:33:10 -0400 diff --git a/CmdLine/Action.hs b/CmdLine/Action.hs index 008e8fc993..29baea29ec 100644 --- a/CmdLine/Action.hs +++ b/CmdLine/Action.hs @@ -1,6 +1,6 @@ {- git-annex command-line actions and concurrency - - - Copyright 2010-2020 Joey Hess + - Copyright 2010-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -15,9 +15,11 @@ import Annex.Concurrent import Annex.WorkerPool import Types.Command import Types.Concurrency +import Annex.Content import Messages.Concurrent import Types.Messages import Types.WorkerPool +import Types.ActionItem import Remote.List import Control.Concurrent @@ -58,21 +60,29 @@ commandAction :: CommandStart -> Annex () commandAction start = do st <- Annex.getState id case getConcurrency' (Annex.concurrency st) of - NonConcurrent -> runnonconcurrent + NonConcurrent -> runnonconcurrent (Annex.sizelimit st) Concurrent n - | n > 1 -> runconcurrent (Annex.workers st) - | otherwise -> runnonconcurrent - ConcurrentPerCpu -> runconcurrent (Annex.workers st) + | n > 1 -> runconcurrent (Annex.sizelimit st) (Annex.workers st) + | otherwise -> runnonconcurrent (Annex.sizelimit st) + ConcurrentPerCpu -> runconcurrent (Annex.sizelimit st) (Annex.workers st) where - runnonconcurrent = void $ includeCommandAction start - runconcurrent Nothing = runnonconcurrent - runconcurrent (Just tv) = - liftIO (atomically (waitStartWorkerSlot tv)) >>= - maybe runnonconcurrent (runconcurrent' tv) - runconcurrent' tv (workerstrd, workerstage) = do + runnonconcurrent 
sizelimit = start >>= \case + Nothing -> noop + Just (startmsg, perform) -> + checkSizeLimit sizelimit startmsg $ do + showStartMessage startmsg + void $ accountCommandAction startmsg $ + performCommandAction' startmsg perform + + runconcurrent sizelimit Nothing = runnonconcurrent sizelimit + runconcurrent sizelimit (Just tv) = + liftIO (atomically (waitStartWorkerSlot tv)) >>= maybe + (runnonconcurrent sizelimit) + (runconcurrent' sizelimit tv) + runconcurrent' sizelimit tv (workerstrd, workerstage) = do aid <- liftIO $ async $ snd <$> Annex.run workerstrd - (concurrentjob (fst workerstrd)) + (concurrentjob sizelimit (fst workerstrd)) liftIO $ atomically $ do pool <- takeTMVar tv let !pool' = addWorkerPool (ActiveWorker aid workerstage) pool @@ -88,10 +98,11 @@ commandAction start = do let !pool' = deactivateWorker pool aid workerstrd' putTMVar tv pool' - concurrentjob workerst = start >>= \case + concurrentjob sizelimit workerst = start >>= \case Nothing -> noop Just (startmsg, perform) -> - concurrentjob' workerst startmsg perform + checkSizeLimit sizelimit startmsg $ + concurrentjob' workerst startmsg perform concurrentjob' workerst startmsg perform = case mkActionItem startmsg of OnlyActionOn k _ -> ensureOnlyActionOn k $ @@ -126,7 +137,7 @@ commandAction start = do Nothing -> do showEndMessage startmsg False return False - + {- Waits for all worker threads to finish and merges their AnnexStates - back into the current Annex's state. -} @@ -294,3 +305,26 @@ ensureOnlyActionOn k a = debugLocks $ writeTVar tv $! M.insert k mytid m return $ liftIO $ atomically $ modifyTVar tv $ M.delete k + +checkSizeLimit :: Maybe (TVar Integer) -> StartMessage -> Annex () -> Annex () +checkSizeLimit Nothing _ a = a +checkSizeLimit (Just sizelimitvar) startmsg a = + case actionItemKey (mkActionItem startmsg) of + Just k -> case fromKey keySize k of + Just sz -> go sz + Nothing -> do + fsz <- catchMaybeIO $ withObjectLoc k $ + liftIO . 
getFileSize + maybe noop go fsz + Nothing -> a + where + go sz = do + fits <- liftIO $ atomically $ do + n <- readTVar sizelimitvar + let !n' = n - sz + if n' >= 0 + then do + writeTVar sizelimitvar n' + return True + else return False + when fits a diff --git a/CmdLine/GitAnnex/Options.hs b/CmdLine/GitAnnex/Options.hs index 1e2f15c32d..ab7331b957 100644 --- a/CmdLine/GitAnnex/Options.hs +++ b/CmdLine/GitAnnex/Options.hs @@ -12,6 +12,7 @@ module CmdLine.GitAnnex.Options where import Control.Monad.Fail as Fail (MonadFail(..)) import Options.Applicative import Data.Time.Clock.POSIX +import Control.Concurrent.STM import qualified Data.Map as M import Annex.Common @@ -37,6 +38,7 @@ import CmdLine.GlobalSetter import qualified Backend import qualified Types.Backend as Backend import Utility.HumanTime +import Utility.DataUnits import Annex.Concurrent -- Global options that are accepted by all git-annex sub-commands, @@ -233,11 +235,12 @@ annexedMatchingOptions = concat , fileMatchingOptions' Limit.LimitAnnexFiles , combiningOptions , timeLimitOption + , sizeLimitOption ] -- Matching options that can operate on keys as well as files. 
keyMatchingOptions :: [GlobalOption] -keyMatchingOptions = keyMatchingOptions' ++ combiningOptions ++ timeLimitOption +keyMatchingOptions = keyMatchingOptions' ++ combiningOptions ++ timeLimitOption ++ sizeLimitOption keyMatchingOptions' :: [GlobalOption] keyMatchingOptions' = @@ -435,6 +438,19 @@ timeLimitOption = let cutoff = start + durationToPOSIXTime duration Annex.changeState $ \s -> s { Annex.timelimit = Just (duration, cutoff) } +sizeLimitOption :: [GlobalOption] +sizeLimitOption = + [ globalOption setsizelimit $ option (maybeReader (readSize dataUnits)) + ( long "size-limit" <> metavar paramSize + <> help "total size of annexed files to process" + <> hidden + ) + ] + where + setsizelimit n = setAnnexState $ do + v <- liftIO $ newTVarIO n + Annex.changeState $ \s -> s { Annex.sizelimit = Just v } + data DaemonOptions = DaemonOptions { foregroundDaemonOption :: Bool , stopDaemonOption :: Bool diff --git a/doc/git-annex-common-options.mdwn b/doc/git-annex-common-options.mdwn index ec9b56b851..cf44cf40e5 100644 --- a/doc/git-annex-common-options.mdwn +++ b/doc/git-annex-common-options.mdwn @@ -1,11 +1,12 @@ # NAME -git-annex-common-options - options supported by all git-annex commands +git-annex-common-options - options supported by many git-annex commands # DESCRIPTION -These common options are accepted by all git-annex commands, and +These common options are accepted by many git-annex commands, and may not be explicitly listed on their individual man pages. +Most of these options are accepted by all git-annex commands. (Many commands also accept the [[git-annex-matching-options]](1).) # OPTIONS @@ -74,9 +75,20 @@ may not be explicitly listed on their individual man pages. Note that git-annex may continue running a little past the specified time limit, in order to finish processing a file. 
- Also, note that if the time limit prevents git-annex from doing all it + When the time limit prevents git-annex from doing all it was asked to, it will exit with a special code, 101. +* `--size-limit=size` + + Limits the total size of annexed files that a git-annex command + can process. A file that does not fit within the remaining limit is skipped, but smaller files after it may still be processed. The size of a file counts toward the limit even when the action on it fails. + + The size can be specified with any commonly used units, for example, + "50gb". + + In some cases, an annexed file's size is not known. This option will + prevent git-annex from processing such files. + * `--semitrust=repository` * `--untrust=repository` diff --git a/doc/todo/size_limits_for_drop__47__move__47__copy__47__get/comment_1_195e395aab22f31da1dab0ba95f88ef4._comment b/doc/todo/size_limits_for_drop__47__move__47__copy__47__get/comment_1_195e395aab22f31da1dab0ba95f88ef4._comment new file mode 100644 index 0000000000..817b6bf9f6 --- /dev/null +++ b/doc/todo/size_limits_for_drop__47__move__47__copy__47__get/comment_1_195e395aab22f31da1dab0ba95f88ef4._comment @@ -0,0 +1,34 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2021-06-04T18:07:44Z" + content=""" +I agree this could be useful. + +Implementation is complicated by it needing to only count the size when a +file is acted on. E.g., `git annex get` shouldn't stop when it's seen enough +files that already have content present. + +So it seems it would need to be implemented next to where showStartMessage +is used in commandAction, looking at the size of the key in the +StartMessage (or possibly the file when there's no key?), and when it would go +over the limit, rather than proceeding to perform the action, it could skip +doing anything and go on to the next file. + +I don't think there is a good way to make it immediately exit +when it reaches the limit, so if there were subsequent smaller files +after a skipped file that could still be processed, they would be.
+ +It would probably also make sense to make it later exit with 101 like +--time-limit does, or another special exit code, to indicate it didn't +process everything. + +Hmm, if an action fails, should the size of the file be counted or not? +If failures are not counted, incomplete transfers could result in a +lot more work/disk space than desired. But if failures are counted, then +after failing to drop a bunch of files, or failing early on to get a bunch +of files, it could stop seemingly prematurely. Also, there's a problem with +concurrency if it needs to know the result of running jobs before deciding +whether to start a new job. There seems to be no entirely good answer here, but the +concurrency problem seems solvable only by updating the count at start time. +"""]]