diff --git a/Annex/StallDetection.hs b/Annex/StallDetection.hs
index 9c095b4c86..1fa440cc22 100644
--- a/Annex/StallDetection.hs
+++ b/Annex/StallDetection.hs
@@ -1,6 +1,6 @@
 {- Stall detection for transfers.
  -
- - Copyright 2020-2021 Joey Hess
+ - Copyright 2020-2024 Joey Hess
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -16,41 +16,74 @@ import Utility.ThreadScheduler

 import Control.Concurrent.STM
 import Control.Monad.IO.Class (MonadIO)
+import Data.Time.Clock

 {- This may be safely canceled (with eg uninterruptibleCancel),
  - as long as the passed action can be safely canceled. -}
 detectStalls :: (Monad m, MonadIO m) => Maybe StallDetection -> TVar (Maybe BytesProcessed) -> m () -> m ()
 detectStalls Nothing _ _ = noop
 detectStalls (Just StallDetectionDisabled) _ _ = noop
-detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall =
-	detectStalls' minsz duration metervar onstall Nothing
+detectStalls (Just (StallDetection (BwRate minsz duration))) metervar onstall = do
+	-- If the progress is being updated, but less frequently than
+	-- the specified duration, a stall would be incorrectly detected.
+	--
+	-- For example, consider the case of a remote that does
+	-- not support progress updates, but is chunked with a large chunk
+	-- size. In that case, progress is only updated after each chunk.
+	--
+	-- So, wait for the first update, and see how long it takes.
+	-- If it's longer than the duration, upscale the duration and minsz
+	-- accordingly.
+	starttime <- liftIO getCurrentTime
+	v <- waitforfirstupdate =<< readMeterVar metervar
+	endtime <- liftIO getCurrentTime
+	let timepassed = floor (endtime `diffUTCTime` starttime)
+	let (scaledminsz, scaledduration) = upscale timepassed
+	detectStalls' scaledminsz scaledduration metervar onstall v
+  where
+	upscale timepassed
+		| timepassed > dsecs =
+			let scale = scaleamount timepassed
+			in (minsz * scale, Duration (dsecs * scale))
+		| otherwise = (minsz, duration)
+	scaleamount timepassed = max 1 $ floor $
+		(fromIntegral timepassed / fromIntegral (max dsecs 1))
+			* scalefudgefactor
+	scalefudgefactor = 1.5 :: Double
+	dsecs = durationSeconds duration
+	minwaitsecs = Seconds $
+		min 60 (fromIntegral (durationSeconds duration))
+	waitforfirstupdate startval = do
+		liftIO $ threadDelaySeconds minwaitsecs
+		v <- readMeterVar metervar
+		if v > startval
+			then return v
+			else waitforfirstupdate startval
+
 detectStalls (Just ProbeStallDetection) metervar onstall = do
 	-- Only do stall detection once the progress is confirmed to be
 	-- consistently updating. After the first update, it needs to
 	-- advance twice within 30 seconds. With that established,
 	-- if no data at all is sent for a 60 second period, it's
 	-- assumed to be a stall.
-	v <- getval >>= waitforfirstupdate
+	v <- readMeterVar metervar >>= waitforfirstupdate
 	ontimelyadvance v $ \v' -> ontimelyadvance v' $
 		detectStalls' 1 duration metervar onstall
   where
-	getval = liftIO $ atomically $ fmap fromBytesProcessed
-		<$> readTVar metervar
-
 	duration = Duration 60
 	delay = Seconds (fromIntegral (durationSeconds duration) `div` 2)
 	waitforfirstupdate startval = do
 		liftIO $ threadDelaySeconds delay
-		v <- getval
+		v <- readMeterVar metervar
 		if v > startval
 			then return v
 			else waitforfirstupdate startval
 	ontimelyadvance v cont = do
 		liftIO $ threadDelaySeconds delay
-		v' <- getval
+		v' <- readMeterVar metervar
 		when (v' > v) $
 			cont v'
@@ -65,8 +98,7 @@ detectStalls'
 detectStalls' minsz duration metervar onstall st = do
 	liftIO $ threadDelaySeconds delay
 	-- Get whatever progress value was reported most recently, if any.
-	v <- liftIO $ atomically $ fmap fromBytesProcessed
-		<$> readTVar metervar
+	v <- readMeterVar metervar
 	let cont = detectStalls' minsz duration metervar onstall v
 	case (st, v) of
 		(Nothing, _) -> cont
@@ -81,3 +113,10 @@ detectStalls' minsz duration metervar onstall st = do
 		| otherwise -> cont
   where
 	delay = Seconds (fromIntegral (durationSeconds duration))
+
+readMeterVar
+	:: MonadIO m
+	=> TVar (Maybe BytesProcessed)
+	-> m (Maybe ByteSize)
+readMeterVar metervar = liftIO $ atomically $
+	fmap fromBytesProcessed <$> readTVar metervar
diff --git a/CHANGELOG b/CHANGELOG
index 20278c8ba3..8ab8386fb7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -13,6 +13,8 @@ git-annex (10.20231228) UNRELEASED; urgency=medium
   * Improve disk free space checking when transferring unsized keys
     to local git remotes.
   * import: --message/-m option.
+  * Improve annex.stalldetection to handle remotes that update progress
+    less frequently than the configured time period.

 -- Joey Hess  Fri, 29 Dec 2023 11:52:06 -0400
diff --git a/Remote/Helper/Chunked.hs b/Remote/Helper/Chunked.hs
index 29b97b761a..bb903b7266 100644
--- a/Remote/Helper/Chunked.hs
+++ b/Remote/Helper/Chunked.hs
@@ -159,9 +159,9 @@ storeChunks u chunkconfig encryptor k f p enc encc storer checker =
 			-- stored, update the chunk log.
 			chunksStored u k (FixedSizeChunks chunksize) numchunks
 		| otherwise = do
-			liftIO $ meterupdate' zeroBytesProcessed
 			let (chunkkey, chunkkeys') = nextChunkKeyStream chunkkeys
 			storechunk chunkkey (ByteContent chunk) meterupdate'
+			liftIO $ meterupdate' zeroBytesProcessed
 			let bytesprocessed' = addBytesProcessed bytesprocessed (L.length chunk)
 			loop bytesprocessed' (splitchunk bs) chunkkeys'
 	where
diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment
new file mode 100644
index 0000000000..4c2ccc7292
--- /dev/null
+++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_1_db83f6a38cae36da89f0ab4ef83021d8._comment
@@ -0,0 +1,40 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 1"""
+ date="2024-01-18T17:18:07Z"
+ content="""
+It looks like you must have annex.stalldetection (or the per-remote config)
+set. git-annex does not behave this way without that configuration.
+What is it set to?
+
+You are probably right in that it involves the rclone special remote not
+reporting transfer progress back to git-annex.
+
+Normally, when a special remote does not do progress reporting,
+git-annex does not do any stall detection, because there must have been
+at least some previous progress update in order for it to detect a stall.
+
+But when chunking is enabled (as it was in your case with 1 gb chunks),
+git-annex itself updates the progress after each chunk. When the special
+remote does not do any progress reporting, and chunk size is large, that
+means that the progress will be updated very infrequently.
+
+So for example, if it takes 2 minutes to upload a chunk, and you had
+annex.stalldetection set to eg "10mb/1m", then in a chunk after the 1st one,
+git-annex would wake up after 1 minute, see that no data seemed to have
+been sent, and conclude there was a stall. You would need to change the
+time period in the config to something less granular eg "100mb/10m"
+to avoid that.
+
+This might be a documentation problem; it may not be clear to the user
+that "10mb/1m" is any different than "100mb/10m". And finding a value that
+works depends on knowing details of how often the progress gets updated
+for a given remote.
+
+But your transcript shows that the stall was detected on chunk 296
+(git-annex's chunk; rclone is doing its own chunking to dropbox).
+So the annex.stalldetection configuration must have been one that
+worked most of the time, for it to transfer the previous 295 chunks
+without a stall having been detected. Unless this was a resume after
+previous command(s) had uploaded those other chunks.
+"""]]
diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment
new file mode 100644
index 0000000000..5d8e3863e2
--- /dev/null
+++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_2_2aeb065a257729e852055533aff04650._comment
@@ -0,0 +1,22 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 2"""
+ date="2024-01-18T17:50:21Z"
+ content="""
+I think that what git-annex could do is detect when progress updates are
+happening with too low a granularity for the annex.stalldetection
+configuration.
+
+When waiting for the first progress update, it can keep track of how much time
+has elapsed. If annex.stalldetection is "10mb/2m" and it took 20 minutes to
+get the first progress update, the granularity is clearly too low.
+
+And then it could disregard the configuration, or suggest a better
+configuration value, or adjust what it's expecting to match the
+observed granularity.
+
+(The stall detection auto-prober uses a similar heuristic to that already.
+It observes the granularity and only if it's sufficiently low (an update
+every 30 seconds or less) does it assume that 60 seconds without an update
+may be a stall.)
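+
+A rough sketch of what that adjustment could look like, borrowing the
+scale calculation (including the 1.5 fudge factor and the rounding) from
+the new code in Annex/StallDetection.hs; the standalone upscale function
+and its Integer types here are just for illustration:
+
+    upscale :: Integer -> Integer -> Integer -> (Integer, Integer)
+    upscale minsz dsecs timepassed
+      | timepassed > dsecs = (minsz * scale, dsecs * scale)
+      | otherwise = (minsz, dsecs)
+      where
+        -- ratio of the observed first-update time to the configured
+        -- period, padded with a 1.5 fudge factor so the adjusted
+        -- period is not too tight
+        scale = max 1 $ floor $
+          (fromIntegral timepassed / fromIntegral (max dsecs 1))
+            * (1.5 :: Double)
+
+So with "10mb/2m" (dsecs = 120) and a first progress update arriving after
+20 minutes (timepassed = 1200), scale = floor ((1200 / 120) * 1.5) = 15,
+and the detection would behave as if it had been configured with
+"150mb/30m".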
+"""]] diff --git a/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment new file mode 100644 index 0000000000..dd93371133 --- /dev/null +++ b/doc/bugs/too_aggressive_in_claiming___34__Transfer_stalled__34____63__/comment_3_f8bd6a233d835bdc413bbf0127608431._comment @@ -0,0 +1,20 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-01-18T19:04:26Z" + content=""" +To reproduce this (more or less), I modified the example.sh external +special remote to sleep for 10 seconds before each key store. +Set up a remote with chunk=1mb, and annex.stalldetection = "0.001mb/1s". + +Uploading a 100 mb file, a stall is detected after the first chunk is +uploaded. As expected, since 1 second passed with no update. + +When I resume the upload, the second chunk is uploaded and then a stall is +detected on the third. And so on. + +I've implemented dynamic granularity scaling now, and in this test case, it notices +it took 11 seconds for the first chunk, and behaves as if it were +configured with annex.stalldetection of "0.022mb/22s". Which keeps it from +detecting a stall. +"""]] diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index b127c317e5..6b0f328710 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1538,8 +1538,8 @@ Remotes are configured using these settings in `.git/config`. Set to "false" to avoid all attempts at stall detection. - For more fine control over what constitutes a stall, set to a value in - the form "$amount/$timeperiod" to specify how much data git-annex should + To detect and cancel stalled transfers, set this to a value in the form + "$amount/$timeperiod" which specifies how much data git-annex should expect to see flowing, minimum, over a given period of time. For example, to detect outright stalls where no data has been transferred @@ -1550,11 +1550,22 @@ Remotes are configured using these settings in `.git/config`. stuck for a long time, you could use: `git config remote.usbdrive.annex-stalldetection "1MB/1m"` - This is not enabled by default, because it can make git-annex use - more resources. To be able to cancel stalls, git-annex has to run - transfers in separate processes (one per concurrent job). So it - may need to open more connections to a remote than usual, or - the communication with those processes may make it a bit slower. + Some remotes don't report transfer progress, and stalls cannot be + detected when using those. + + Some remotes only report transfer progress occasionally, eg + after each chunk. To avoid false timeouts in such a situation, if the + first progress update takes longer to arrive than the configured time + period, the stall detection will be automically adjusted to use a longer + time period. For example, if the first progress update comes after 10 + minutes, but annex.stalldetection is "1MB/1m", it will be treated as eg + "15MB/15m". + + Configuring stall detection can make git-annex use more resources. To be + able to cancel stalls, git-annex has to run transfers in separate + processes (one per concurrent job). So it may need to open more + connections to a remote than usual, or the communication with those + processes may make it a bit slower. * `remote..annex-checkuuid`