diff --git a/Annex/Transfer.hs b/Annex/Transfer.hs index 1bf50f0228..90337c586e 100644 --- a/Annex/Transfer.hs +++ b/Annex/Transfer.hs @@ -1,6 +1,6 @@ {- git-annex transfers - - - Copyright 2012-2019 Joey Hess + - Copyright 2012-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -81,7 +81,6 @@ alwaysRunTransfer = runTransfer' True runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do - shouldretry <- retrydecider info <- liftIO $ startTransferInfo afile (meter, tfile, createtfile, metervar) <- mkProgressUpdater t info mode <- annexFileMode @@ -91,7 +90,7 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran showNote "transfer already in progress, or unable to take transfer lock" return observeFailure else do - v <- retry shouldretry info metervar $ transferaction meter + v <- retry 0 info metervar (transferaction meter) liftIO $ cleanup tfile lck if observeBool v then removeFailedTransfer t @@ -142,21 +141,25 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran dropLock lockhandle void $ tryIO $ removeFile lck #endif - retry shouldretry oldinfo metervar run = tryNonAsync run >>= \case - Right v - | observeBool v -> return v - | otherwise -> checkretry - Left e -> do - warning (show e) - checkretry + + retry numretries oldinfo metervar run = + tryNonAsync run >>= \case + Right v + | observeBool v -> return v + | otherwise -> checkretry + Left e -> do + warning (show e) + checkretry where checkretry = do b <- getbytescomplete metervar let newinfo = oldinfo { bytesComplete = Just b } - ifM (shouldretry oldinfo newinfo) - ( retry shouldretry newinfo metervar run + let !numretries' = succ numretries + ifM (retrydecider numretries' oldinfo newinfo) + ( retry numretries' newinfo metervar run , return observeFailure ) + getbytescomplete metervar | transferDirection t == Upload = liftIO $ readMVar metervar @@ -189,45 +192,49 @@ checkSecureHashes t a = ifM (isCryptographicallySecure (transferKey t)) where variety = fromKey keyVariety (transferKey t) -type RetryDecider = Annex (TransferInfo -> TransferInfo -> Annex Bool) +type NumRetries = Integer + +type RetryDecider = NumRetries -> TransferInfo -> TransferInfo -> Annex Bool {- The first RetryDecider will be checked first; only if it says not to - retry will the second one be checked. -} combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider -combineRetryDeciders a b = do - ar <- a - br <- b - return $ \old new -> ar old new <||> br old new +combineRetryDeciders a b = \n old new -> a n old new <||> b n old new noRetry :: RetryDecider -noRetry = pure $ \_ _ -> pure False +noRetry _ _ _ = pure False stdRetry :: RetryDecider stdRetry = combineRetryDeciders forwardRetry configuredRetry -{- Retries a transfer when it fails, as long as the failed transfer managed - - to send some data. -} +{- Keep retrying failed transfers, as long as forward progress is being + - made. + - + - Up to a point -- while some remotes can resume where the previous + - transfer left off, and so it would make sense to keep retrying forever, + - other remotes restart each transfer from the beginning, and so even if + - forward progress is being made, it's not real progress. So, retry a + - maximum of 5 times + -} forwardRetry :: RetryDecider -forwardRetry = pure $ \old new -> pure $ - fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new) +forwardRetry = \numretries old new -> pure $ and + [ fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new) + , numretries <= 5 + ] {- Retries a number of times with growing delays in between when enabled - by git configuration. -} configuredRetry :: RetryDecider -configuredRetry = debugLocks $ do - retrycounter <- liftIO $ newMVar 0 - return $ \_old new -> do - (maxretries, Seconds initretrydelay) <- getcfg $ - Remote.gitconfig <$> transferRemote new - retries <- liftIO $ modifyMVar retrycounter $ - \n -> return (n + 1, n + 1) - if retries < maxretries - then do - let retrydelay = Seconds (initretrydelay * 2^(retries-1)) - showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying." - liftIO $ threadDelaySeconds retrydelay - return True - else return False +configuredRetry numretries old new = do + (maxretries, Seconds initretrydelay) <- getcfg $ + Remote.gitconfig <$> transferRemote new + if numretries < maxretries + then do + let retrydelay = Seconds (initretrydelay * 2^(numretries-1)) + showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying." + liftIO $ threadDelaySeconds retrydelay + return True + else return False where globalretrycfg = fromMaybe 0 . annexRetry <$> Annex.getGitConfig diff --git a/CHANGELOG b/CHANGELOG index f11f4e0ce4..6c8819b56b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -24,6 +24,9 @@ git-annex (8.20200815) UNRELEASED; urgency=medium gpg key. * Support git remotes where .git is a file, not a directory, eg when --separate-git-dir was used. + * Limit retrying of failed transfers when forward progress is being made + to 5, to avoid some unusual edge cases where too much retrying could + result in far more data transfer than makes sense. -- Joey Hess Fri, 14 Aug 2020 14:57:45 -0400 diff --git a/doc/todo/limit_forwardRetry.mdwn b/doc/todo/limit_forwardRetry.mdwn index 55525a5d15..9162b08fd4 100644 --- a/doc/todo/limit_forwardRetry.mdwn +++ b/doc/todo/limit_forwardRetry.mdwn @@ -4,7 +4,8 @@ one more byte got transferred than in the previous, failed try. Suppose that a transfer was restarting from the beginning each time, and it just so happened that each try got a tiny little bit further before failing. Then transferring an `N` byte object could result in `sum [1..N]` -bytes being sent. Worst case. +bytes being sent. Worst case. (Real world it involves the size of chunks +sent in a failing operation, so probably `sum [1..N/1024]` or so.) So I think forwardRetry should cap after some amount of automatic retrying. Ie, it could give up after 5 retries. --[[Joey]] @@ -15,3 +16,14 @@ if a remote is doing that (unless some timing heuristics were used). Around 5 retries seems fairly reasonable for that case too, it would be unlikely for a rsync transfer to keep failing so many times while still making forward progess. --[[Joey]] + +> Or could add data to remotes about this, but it would need to be added +> for external special remotes too, and this does not really seem worth the +> complication. +> +> I think, even if a remote does not support resuming like +> rsync, it makes sense to retry a few failed transfers if it's getting +> closer to success each time, because forward progress suggests whatever +> made it fail is becoming less of a problem. + +[[done]] --[[Joey]]