Limit retrying of failed transfers to 5 when forward progress is being made

This avoids some unusual edge cases where too much retrying could result in
far more data transfer than makes sense.
Joey Hess 2020-09-04 12:46:37 -04:00
parent 820d4368b3
commit 1d244bafbd
3 changed files with 59 additions and 37 deletions
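
The rule the commit message describes boils down to a small pure predicate over the retry count and the byte counters of the previous and current attempts. A minimal sketch of that rule, using plain Integers in place of the byte counts the real code reads out of TransferInfo (shouldRetryForward is a hypothetical name, not part of git-annex):

```haskell
-- Illustrative only: retry a failed transfer while it got further than
-- the previous attempt did, but no more than 5 times in total.
shouldRetryForward :: Integer -> Integer -> Integer -> Bool
shouldRetryForward numretries oldbytes newbytes =
    oldbytes < newbytes && numretries <= 5
```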

@@ -1,6 +1,6 @@
{- git-annex transfers
-
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@@ -81,7 +81,6 @@ alwaysRunTransfer = runTransfer' True
runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do
shouldretry <- retrydecider
info <- liftIO $ startTransferInfo afile
(meter, tfile, createtfile, metervar) <- mkProgressUpdater t info
mode <- annexFileMode
@@ -91,7 +90,7 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
showNote "transfer already in progress, or unable to take transfer lock"
return observeFailure
else do
v <- retry shouldretry info metervar $ transferaction meter
v <- retry 0 info metervar (transferaction meter)
liftIO $ cleanup tfile lck
if observeBool v
then removeFailedTransfer t
@@ -142,21 +141,25 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
dropLock lockhandle
void $ tryIO $ removeFile lck
#endif
retry shouldretry oldinfo metervar run = tryNonAsync run >>= \case
Right v
| observeBool v -> return v
| otherwise -> checkretry
Left e -> do
warning (show e)
checkretry
retry numretries oldinfo metervar run =
tryNonAsync run >>= \case
Right v
| observeBool v -> return v
| otherwise -> checkretry
Left e -> do
warning (show e)
checkretry
where
checkretry = do
b <- getbytescomplete metervar
let newinfo = oldinfo { bytesComplete = Just b }
ifM (shouldretry oldinfo newinfo)
( retry shouldretry newinfo metervar run
let !numretries' = succ numretries
ifM (retrydecider numretries' oldinfo newinfo)
( retry numretries' newinfo metervar run
, return observeFailure
)
getbytescomplete metervar
| transferDirection t == Upload =
liftIO $ readMVar metervar
@@ -189,45 +192,49 @@ checkSecureHashes t a = ifM (isCryptographicallySecure (transferKey t))
where
variety = fromKey keyVariety (transferKey t)
type RetryDecider = Annex (TransferInfo -> TransferInfo -> Annex Bool)
type NumRetries = Integer
type RetryDecider = NumRetries -> TransferInfo -> TransferInfo -> Annex Bool
{- The first RetryDecider will be checked first; only if it says not to
- retry will the second one be checked. -}
combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider
combineRetryDeciders a b = do
ar <- a
br <- b
return $ \old new -> ar old new <||> br old new
combineRetryDeciders a b = \n old new -> a n old new <||> b n old new
noRetry :: RetryDecider
noRetry = pure $ \_ _ -> pure False
noRetry _ _ _ = pure False
stdRetry :: RetryDecider
stdRetry = combineRetryDeciders forwardRetry configuredRetry
{- Retries a transfer when it fails, as long as the failed transfer managed
- to send some data. -}
{- Keep retrying failed transfers, as long as forward progress is being
- made.
-
- Up to a point -- while some remotes can resume where the previous
- transfer left off, and so it would make sense to keep retrying forever,
- other remotes restart each transfer from the beginning, and so even if
- forward progress is being made, it's not real progress. So, retry a
- maximum of 5 times.
-}
forwardRetry :: RetryDecider
forwardRetry = pure $ \old new -> pure $
fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new)
forwardRetry = \numretries old new -> pure $ and
[ fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new)
, numretries <= 5
]
{- Retries a number of times with growing delays in between when enabled
- by git configuration. -}
configuredRetry :: RetryDecider
configuredRetry = debugLocks $ do
retrycounter <- liftIO $ newMVar 0
return $ \_old new -> do
(maxretries, Seconds initretrydelay) <- getcfg $
Remote.gitconfig <$> transferRemote new
retries <- liftIO $ modifyMVar retrycounter $
\n -> return (n + 1, n + 1)
if retries < maxretries
then do
let retrydelay = Seconds (initretrydelay * 2^(retries-1))
showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying."
liftIO $ threadDelaySeconds retrydelay
return True
else return False
configuredRetry numretries old new = do
(maxretries, Seconds initretrydelay) <- getcfg $
Remote.gitconfig <$> transferRemote new
if numretries < maxretries
then do
let retrydelay = Seconds (initretrydelay * 2^(numretries-1))
showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying."
liftIO $ threadDelaySeconds retrydelay
return True
else return False
where
globalretrycfg = fromMaybe 0 . annexRetry
<$> Annex.getGitConfig
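
For readers who want the reworked pieces outside the diff context, here is a self-contained sketch of how the new RetryDecider shape composes and how the caller threads the retry counter. It substitutes IO for the Annex monad and a stripped-down record for TransferInfo, so the names and types below are illustrative rather than the actual git-annex API:

```haskell
{-# LANGUAGE BangPatterns #-}

import Control.Monad (liftM2)

-- Stand-ins for the real types; the actual code uses the Annex monad
-- and TransferInfo.
data Info = Info { bytesDone :: Integer }

type NumRetries = Integer
type Decider = NumRetries -> Info -> Info -> IO Bool

-- Same shape as combineRetryDeciders: retry if either decider says to.
combine :: Decider -> Decider -> Decider
combine a b n old new = liftM2 (||) (a n old new) (b n old new)

-- The forward-progress rule with the new cap of 5 retries.
forward :: Decider
forward n old new = pure (bytesDone old < bytesDone new && n <= 5)

-- The caller threads the counter, starting at 0 and bumping it strictly
-- after each failed attempt, mirroring the reworked retry loop in
-- runTransfer'.
retryLoop :: Decider -> Info -> IO (Bool, Info) -> IO Bool
retryLoop decider = go 0
  where
    go !n oldinfo attempt = do
        (ok, newinfo) <- attempt
        if ok
            then pure True
            else do
                let !n' = succ n
                again <- decider n' oldinfo newinfo
                if again
                    then go n' newinfo attempt
                    else pure False
```

The point of the signature change shows up here: rather than configuredRetry keeping its own MVar counter, the attempt number is passed in explicitly, so deciders stay stateless and compose with a plain (||).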

@@ -24,6 +24,9 @@ git-annex (8.20200815) UNRELEASED; urgency=medium
gpg key.
* Support git remotes where .git is a file, not a directory,
eg when --separate-git-dir was used.
* Limit retrying of failed transfers to 5 when forward progress is being
made, to avoid some unusual edge cases where too much retrying could
result in far more data transfer than makes sense.
-- Joey Hess <id@joeyh.name> Fri, 14 Aug 2020 14:57:45 -0400

@@ -4,7 +4,8 @@ one more byte got transferred than in the previous, failed try.
Suppose that a transfer was restarting from the beginning each time, and it
just so happened that each try got a tiny little bit further before
failing. Then transferring an `N` byte object could result in `sum [1..N]`
bytes being sent. Worst case.
bytes being sent. Worst case. (In the real world it involves the size of the
chunks sent in a failing operation, so probably `sum [1..N/1024]` or so.)
So I think forwardRetry should cap after some amount of automatic retrying.
Ie, it could give up after 5 retries. --[[Joey]]
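
To put a rough number on that worst case, consider a purely hypothetical 100 MB object sent in 1 MB chunks, where every failed attempt restarts from zero but gets exactly one more chunk through than the previous attempt did:

```haskell
-- Hypothetical sizes, just to illustrate the arithmetic above.
chunks, chunkMB :: Integer
chunks = 100   -- a 100 MB object split into 1 MB chunks
chunkMB = 1

-- Uncapped forward-progress retrying: attempt k resends k chunks, so the
-- total is sum [1..100] = 5050 MB to move a 100 MB object.
uncappedMB :: Integer
uncappedMB = chunkMB * sum [1 .. chunks]

-- With the cap of 5 retries there are at most 6 attempts: sum [1..6] = 21 MB
-- in this scenario, and never more than 6 times the object size in general.
cappedMB :: Integer
cappedMB = chunkMB * sum [1 .. 6]
```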
@@ -15,3 +16,14 @@ if a remote is doing that (unless some timing heuristics were used). Around
5 retries seems fairly reasonable for that case too, it would be unlikely
for a rsync transfer to keep failing so many times while still making
forward progress. --[[Joey]]
> Or could add data to remotes about this, but it would need to be added
> for external special remotes too, and this does not really seem worth the
> complication.
>
> I think, even if a remote does not support resuming like
> rsync, it makes sense to retry a few failed transfers if it's getting
> closer to success each time, because forward progress suggests whatever
> made it fail is becoming less of a problem.
[[done]] --[[Joey]]