Limit retrying of failed transfers when forward progress is being made to 5
To avoid some unusual edge cases where too much retrying could result in far more data transfer than makes sense.
This commit is contained in:
parent
820d4368b3
commit
1d244bafbd
3 changed files with 59 additions and 37 deletions
|
@ -1,6 +1,6 @@
|
||||||
{- git-annex transfers
|
{- git-annex transfers
|
||||||
-
|
-
|
||||||
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
|
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -81,7 +81,6 @@ alwaysRunTransfer = runTransfer' True
|
||||||
|
|
||||||
runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v
|
runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v
|
||||||
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do
|
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ checkSecureHashes t $ do
|
||||||
shouldretry <- retrydecider
|
|
||||||
info <- liftIO $ startTransferInfo afile
|
info <- liftIO $ startTransferInfo afile
|
||||||
(meter, tfile, createtfile, metervar) <- mkProgressUpdater t info
|
(meter, tfile, createtfile, metervar) <- mkProgressUpdater t info
|
||||||
mode <- annexFileMode
|
mode <- annexFileMode
|
||||||
|
@ -91,7 +90,7 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
|
||||||
showNote "transfer already in progress, or unable to take transfer lock"
|
showNote "transfer already in progress, or unable to take transfer lock"
|
||||||
return observeFailure
|
return observeFailure
|
||||||
else do
|
else do
|
||||||
v <- retry shouldretry info metervar $ transferaction meter
|
v <- retry 0 info metervar (transferaction meter)
|
||||||
liftIO $ cleanup tfile lck
|
liftIO $ cleanup tfile lck
|
||||||
if observeBool v
|
if observeBool v
|
||||||
then removeFailedTransfer t
|
then removeFailedTransfer t
|
||||||
|
@ -142,7 +141,9 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
|
||||||
dropLock lockhandle
|
dropLock lockhandle
|
||||||
void $ tryIO $ removeFile lck
|
void $ tryIO $ removeFile lck
|
||||||
#endif
|
#endif
|
||||||
retry shouldretry oldinfo metervar run = tryNonAsync run >>= \case
|
|
||||||
|
retry numretries oldinfo metervar run =
|
||||||
|
tryNonAsync run >>= \case
|
||||||
Right v
|
Right v
|
||||||
| observeBool v -> return v
|
| observeBool v -> return v
|
||||||
| otherwise -> checkretry
|
| otherwise -> checkretry
|
||||||
|
@ -153,10 +154,12 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
|
||||||
checkretry = do
|
checkretry = do
|
||||||
b <- getbytescomplete metervar
|
b <- getbytescomplete metervar
|
||||||
let newinfo = oldinfo { bytesComplete = Just b }
|
let newinfo = oldinfo { bytesComplete = Just b }
|
||||||
ifM (shouldretry oldinfo newinfo)
|
let !numretries' = succ numretries
|
||||||
( retry shouldretry newinfo metervar run
|
ifM (retrydecider numretries' oldinfo newinfo)
|
||||||
|
( retry numretries' newinfo metervar run
|
||||||
, return observeFailure
|
, return observeFailure
|
||||||
)
|
)
|
||||||
|
|
||||||
getbytescomplete metervar
|
getbytescomplete metervar
|
||||||
| transferDirection t == Upload =
|
| transferDirection t == Upload =
|
||||||
liftIO $ readMVar metervar
|
liftIO $ readMVar metervar
|
||||||
|
@ -189,41 +192,45 @@ checkSecureHashes t a = ifM (isCryptographicallySecure (transferKey t))
|
||||||
where
|
where
|
||||||
variety = fromKey keyVariety (transferKey t)
|
variety = fromKey keyVariety (transferKey t)
|
||||||
|
|
||||||
type RetryDecider = Annex (TransferInfo -> TransferInfo -> Annex Bool)
|
type NumRetries = Integer
|
||||||
|
|
||||||
|
type RetryDecider = NumRetries -> TransferInfo -> TransferInfo -> Annex Bool
|
||||||
|
|
||||||
{- The first RetryDecider will be checked first; only if it says not to
|
{- The first RetryDecider will be checked first; only if it says not to
|
||||||
- retry will the second one be checked. -}
|
- retry will the second one be checked. -}
|
||||||
combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider
|
combineRetryDeciders :: RetryDecider -> RetryDecider -> RetryDecider
|
||||||
combineRetryDeciders a b = do
|
combineRetryDeciders a b = \n old new -> a n old new <||> b n old new
|
||||||
ar <- a
|
|
||||||
br <- b
|
|
||||||
return $ \old new -> ar old new <||> br old new
|
|
||||||
|
|
||||||
noRetry :: RetryDecider
|
noRetry :: RetryDecider
|
||||||
noRetry = pure $ \_ _ -> pure False
|
noRetry _ _ _ = pure False
|
||||||
|
|
||||||
stdRetry :: RetryDecider
|
stdRetry :: RetryDecider
|
||||||
stdRetry = combineRetryDeciders forwardRetry configuredRetry
|
stdRetry = combineRetryDeciders forwardRetry configuredRetry
|
||||||
|
|
||||||
{- Retries a transfer when it fails, as long as the failed transfer managed
|
{- Keep retrying failed transfers, as long as forward progress is being
|
||||||
- to send some data. -}
|
- made.
|
||||||
|
-
|
||||||
|
- Up to a point -- while some remotes can resume where the previous
|
||||||
|
- transfer left off, and so it would make sense to keep retrying forever,
|
||||||
|
- other remotes restart each transfer from the beginning, and so even if
|
||||||
|
- forward progress is being made, it's not real progress. So, retry a
|
||||||
|
- maximum of 5 times
|
||||||
|
-}
|
||||||
forwardRetry :: RetryDecider
|
forwardRetry :: RetryDecider
|
||||||
forwardRetry = pure $ \old new -> pure $
|
forwardRetry = \numretries old new -> pure $ and
|
||||||
fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new)
|
[ fromMaybe 0 (bytesComplete old) < fromMaybe 0 (bytesComplete new)
|
||||||
|
, numretries <= 5
|
||||||
|
]
|
||||||
|
|
||||||
{- Retries a number of times with growing delays in between when enabled
|
{- Retries a number of times with growing delays in between when enabled
|
||||||
- by git configuration. -}
|
- by git configuration. -}
|
||||||
configuredRetry :: RetryDecider
|
configuredRetry :: RetryDecider
|
||||||
configuredRetry = debugLocks $ do
|
configuredRetry numretries old new = do
|
||||||
retrycounter <- liftIO $ newMVar 0
|
|
||||||
return $ \_old new -> do
|
|
||||||
(maxretries, Seconds initretrydelay) <- getcfg $
|
(maxretries, Seconds initretrydelay) <- getcfg $
|
||||||
Remote.gitconfig <$> transferRemote new
|
Remote.gitconfig <$> transferRemote new
|
||||||
retries <- liftIO $ modifyMVar retrycounter $
|
if numretries < maxretries
|
||||||
\n -> return (n + 1, n + 1)
|
|
||||||
if retries < maxretries
|
|
||||||
then do
|
then do
|
||||||
let retrydelay = Seconds (initretrydelay * 2^(retries-1))
|
let retrydelay = Seconds (initretrydelay * 2^(numretries-1))
|
||||||
showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying."
|
showSideAction $ "Delaying " ++ show (fromSeconds retrydelay) ++ "s before retrying."
|
||||||
liftIO $ threadDelaySeconds retrydelay
|
liftIO $ threadDelaySeconds retrydelay
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -24,6 +24,9 @@ git-annex (8.20200815) UNRELEASED; urgency=medium
|
||||||
gpg key.
|
gpg key.
|
||||||
* Support git remotes where .git is a file, not a directory,
|
* Support git remotes where .git is a file, not a directory,
|
||||||
eg when --separate-git-dir was used.
|
eg when --separate-git-dir was used.
|
||||||
|
* Limit retrying of failed transfers when forward progress is being made
|
||||||
|
to 5, to avoid some unusual edge cases where too much retrying could
|
||||||
|
result in far more data transfer than makes sense.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Fri, 14 Aug 2020 14:57:45 -0400
|
-- Joey Hess <id@joeyh.name> Fri, 14 Aug 2020 14:57:45 -0400
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,8 @@ one more byte got transferred than in the previous, failed try.
|
||||||
Suppose that a transfer was restarting from the beginning each time, and it
|
Suppose that a transfer was restarting from the beginning each time, and it
|
||||||
just so happened that each try got a tiny little bit further before
|
just so happened that each try got a tiny little bit further before
|
||||||
failing. Then transferring an `N` byte object could result in `sum [1..N]`
|
failing. Then transferring an `N` byte object could result in `sum [1..N]`
|
||||||
bytes being sent. Worst case.
|
bytes being sent. Worst case. (Real world it involves the size of chunks
|
||||||
|
sent in a failing operation, so probably `sum [1..N/1024]` or so.)
|
||||||
|
|
||||||
So I think forwardRetry should cap after some amount of automatic retrying.
|
So I think forwardRetry should cap after some amount of automatic retrying.
|
||||||
Ie, it could give up after 5 retries. --[[Joey]]
|
Ie, it could give up after 5 retries. --[[Joey]]
|
||||||
|
@ -15,3 +16,14 @@ if a remote is doing that (unless some timing heuristics were used). Around
|
||||||
5 retries seems fairly reasonable for that case too, it would be unlikely
|
5 retries seems fairly reasonable for that case too, it would be unlikely
|
||||||
for a rsync transfer to keep failing so many times while still making
|
for a rsync transfer to keep failing so many times while still making
|
||||||
forward progess. --[[Joey]]
|
forward progess. --[[Joey]]
|
||||||
|
|
||||||
|
> Or could add data to remotes about this, but it would need to be added
|
||||||
|
> for external special remotes too, and this does not really seem worth the
|
||||||
|
> complication.
|
||||||
|
>
|
||||||
|
> I think, even if a remote does not support resuming like
|
||||||
|
> rsync, it makes sense to retry a few failed transfers if it's getting
|
||||||
|
> closer to success each time, because forward progress suggests whatever
|
||||||
|
> made it fail is becoming less of a problem.
|
||||||
|
|
||||||
|
[[done]] --[[Joey]]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue