Limit retrying of failed transfers when forward progress is being made to 5

To avoid some unusual edge cases where too much retrying could result in
far more data transfer than makes sense.
This commit is contained in:
Joey Hess 2020-09-04 12:46:37 -04:00
parent 820d4368b3
commit 1d244bafbd
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 59 additions and 37 deletions

View file

@@ -1,6 +1,6 @@
{- git-annex transfers {- git-annex transfers
- -
- Copyright 2012-2019 Joey Hess <id@joeyh.name> - Copyright 2012-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@@ -81,7 +81,6 @@ alwaysRunTransfer = runTransfer' True
runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do
shouldretry <- retrydecider
info <- liftIO $ startTransferInfo afile info <- liftIO $ startTransferInfo afile
(meter, tfile, createtfile, metervar) <- mkProgressUpdater t info (meter, tfile, createtfile, metervar) <- mkProgressUpdater t info
mode <- annexFileMode mode <- annexFileMode
@@ -91,7 +90,7 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
showNote "transfer already in progress, or unable to take transfer lock" showNote "transfer already in progress, or unable to take transfer lock"
return observeFailure return observeFailure
else do else do
v <- retry shouldretry info metervar $ transferaction meter v <- retry 0 info metervar (transferaction meter)
liftIO $ cleanup tfile lck liftIO $ cleanup tfile lck
if observeBool v if observeBool v
then removeFailedTransfer t then removeFailedTransfer t
@@ -142,7 +141,9 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
dropLock lockhandle dropLock lockhandle
void $ tryIO $ removeFile lck void $ tryIO $ removeFile lck
#endif #endif
retry shouldretry oldinfo metervar run = tryNonAsync run >>= \case
retry numretries oldinfo metervar run =
tryNonAsync run >>= \case
Right v Right v
| observeBool v -> return v | observeBool v -> return v
| otherwise -> checkretry | otherwise -> checkretry
@@ -153,10 +154,12 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
checkretry = do checkretry = do
b <- getbytescomplete metervar b <- getbytescomplete metervar
let newinfo = oldinfo { bytesComplete = Just b } let newinfo = oldinfo { bytesComplete = Just b }
ifM (shouldretry oldinfo newinfo) let !numretries' = succ numretries
( retry shouldretry newinfo metervar run ifM (retrydecider numretries' oldinfo newinfo)
( retry numretries' newinfo metervar run
, return observeFailure , return observeFailure
) )
getbytescomplete metervar getbytescomplete metervar
| transferDirection t == Upload = | transferDirection t == Upload =
liftIO $ readMVar metervar liftIO $ readMVar metervar
@@ -189,41 +192,45 @@ checkSecureHashes t a = ifM (isCryptographicallySecure (transferKey t))
where where
variety = fromKey keyVariety (transferKey t) variety = fromKey keyVariety (transferKey t)
type RetryDecider = Annex (TransferInfo -> TransferInfo -> Annex Bool) type NumRetries = Integer
type RetryDecider = NumRetries -> TransferInfo -> TransferInfo -> Annex Bool
{- The first RetryDecider will be checked first; only if it says not to {- The first RetryDecider will be checked first; only if it says not to
- retry will the second one be checked. -} - retry will the second one be checked. -}
combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider
combineRetryDeciders a b = do combineRetryDeciders a b = \n old new -> a n old new <||> b n old new
ar <- a
br <- b
return $ \old new -> ar old new <||> br old new
noRetry :: RetryDecider noRetry :: RetryDecider
noRetry = pure $ \_ _ -> pure False noRetry _ _ _ = pure False
stdRetry :: RetryDecider stdRetry :: RetryDecider
stdRetry = combineRetryDeciders forwardRetry configuredRetry stdRetry = combineRetryDeciders forwardRetry configuredRetry
{- Retries a transfer when it fails, as long as the failed transfer managed {- Keep retrying failed transfers, as long as forward progress is being
- to send some data. -} - made.
-
- Up to a point -- while some remotes can resume where the previous
- transfer left off, and so it would make sense to keep retrying forever,
- other remotes restart each transfer from the beginning, and so even if
- forward progress is being made, it's not real progress. So, retry a
- maximum of 5 times
-}
forwardRetry :: RetryDecider forwardRetry :: RetryDecider
forwardRetry = pure $ \old new -> pure $ forwardRetry = \numretries old new -> pure $ and
fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new) [ fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new)
, numretries <= 5
]
{- Retries a number of times with growing delays in between when enabled {- Retries a number of times with growing delays in between when enabled
- by git configuration. -} - by git configuration. -}
configuredRetry :: RetryDecider configuredRetry :: RetryDecider
configuredRetry = debugLocks $ do configuredRetry numretries old new = do
retrycounter <- liftIO $ newMVar 0
return $ \_old new -> do
(maxretries, Seconds initretrydelay) <- getcfg $ (maxretries, Seconds initretrydelay) <- getcfg $
Remote.gitconfig <$> transferRemote new Remote.gitconfig <$> transferRemote new
retries <- liftIO $ modifyMVar retrycounter $ if numretries < maxretries
\n -> return (n + 1, n + 1)
if retries < maxretries
then do then do
let retrydelay = Seconds (initretrydelay * 2^(retries-1)) let retrydelay = Seconds (initretrydelay * 2^(numretries-1))
showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying." showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying."
liftIO $ threadDelaySeconds retrydelay liftIO $ threadDelaySeconds retrydelay
return True return True

View file

@@ -24,6 +24,9 @@ git-annex (8.20200815) UNRELEASED; urgency=medium
gpg key. gpg key.
* Support git remotes where .git is a file, not a directory, * Support git remotes where .git is a file, not a directory,
eg when --separate-git-dir was used. eg when --separate-git-dir was used.
* Limit retrying of failed transfers when forward progress is being made
to 5, to avoid some unusual edge cases where too much retrying could
result in far more data transfer than makes sense.
-- Joey Hess <id@joeyh.name> Fri, 14 Aug 2020 14:57:45 -0400 -- Joey Hess <id@joeyh.name> Fri, 14 Aug 2020 14:57:45 -0400

View file

@@ -4,7 +4,8 @@ one more byte got transferred than in the previous, failed try.
Suppose that a transfer was restarting from the beginning each time, and it Suppose that a transfer was restarting from the beginning each time, and it
just so happened that each try got a tiny little bit further before just so happened that each try got a tiny little bit further before
failing. Then transferring an `N` byte object could result in `sum [1..N]` failing. Then transferring an `N` byte object could result in `sum [1..N]`
bytes being sent. Worst case. bytes being sent. Worst case. (Real world it involves the size of chunks
sent in a failing operation, so probably `sum [1..N/1024]` or so.)
So I think forwardRetry should cap after some amount of automatic retrying. So I think forwardRetry should cap after some amount of automatic retrying.
Ie, it could give up after 5 retries. --[[Joey]] Ie, it could give up after 5 retries. --[[Joey]]
@@ -15,3 +16,14 @@ if a remote is doing that (unless some timing heuristics were used). Around
5 retries seems fairly reasonable for that case too, it would be unlikely 5 retries seems fairly reasonable for that case too, it would be unlikely
for a rsync transfer to keep failing so many times while still making for a rsync transfer to keep failing so many times while still making
forward progress. --[[Joey]] forward progress. --[[Joey]]
> Or could add data to remotes about this, but it would need to be added
> for external special remotes too, and this does not really seem worth the
> complication.
>
> I think, even if a remote does not support resuming like
> rsync, it makes sense to retry a few failed transfers if it's getting
> closer to success each time, because forward progress suggests whatever
> made it fail is becoming less of a problem.
[[done]] --[[Joey]]