stall detection is working

New config annex.stalldetection, remote.name.annex-stalldetection, which
can be used to deal with remotes that stall during transfers, or are
sometimes too slow to want to use.

This commit was sponsored by Luke Shumaker on Patreon.
This commit is contained in:
Joey Hess 2020-12-08 15:22:18 -04:00
parent 09ed9f7d1f
commit 41f2c308ff
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
11 changed files with 213 additions and 46 deletions

View file

@ -19,6 +19,7 @@ module Annex.Transfer (
noRetry,
stdRetry,
pickRemote,
stallDetection,
) where
import Annex.Common
@ -40,6 +41,7 @@ import Types.WorkerPool
import Annex.WorkerPool
import Annex.TransferrerPool
import Backend (isCryptographicallySecure)
import Types.StallDetection
import qualified Utility.RawFilePath as R
import Control.Concurrent
@ -47,19 +49,15 @@ import qualified Data.Map.Strict as M
import qualified System.FilePath.ByteString as P
import Data.Ord
-- Upload, supporting stall detection.
upload :: Remote -> Key -> AssociatedFile -> RetryDecider -> NotifyWitness -> Annex Bool
upload r key f d _witness =
-- TODO: use this when not handling timeouts
--upload' (Remote.uuid r) key f d $
-- action . Remote.storeKey r key f
-- TODO: RetryDecider
-- TODO: Handle timeouts
withTransferrer $ \transferrer ->
performTransfer transferrer AnnexLevel
(Transfer Upload (Remote.uuid r) (fromKey id key))
(Just r) f id
upload r key f d witness = stallDetection r >>= \case
Nothing -> upload' (Remote.uuid r) key f d go witness
Just sd -> runTransferrer sd r key f d Upload witness
where
go = action . Remote.storeKey r key f
-- Upload, not supporting stall detection.
upload' :: Observable v => UUID -> Key -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> NotifyWitness -> Annex v
upload' u key f d a _witness = guardHaveUUID u $
runTransfer (Transfer Upload u (fromKey id key)) f d a
@ -68,22 +66,17 @@ alwaysUpload :: Observable v => UUID -> Key -> AssociatedFile -> RetryDecider ->
alwaysUpload u key f d a _witness = guardHaveUUID u $
alwaysRunTransfer (Transfer Upload u (fromKey id key)) f d a
-- Download, supporting stall detection.
download :: Remote -> Key -> AssociatedFile -> RetryDecider -> NotifyWitness -> Annex Bool
download r key f d witness =
-- TODO: use this when not handling timeouts
--getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) key f $ \dest ->
-- download' (Remote.uuid r) key f d (go dest) witness
-- TODO: RetryDecider
-- TODO: Handle timeouts
withTransferrer $ \transferrer ->
performTransfer transferrer AnnexLevel
(Transfer Download (Remote.uuid r) (fromKey id key))
(Just r) f id
download r key f d witness = stallDetection r >>= \case
Nothing -> getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) key f $ \dest ->
download' (Remote.uuid r) key f d (go dest) witness
Just sd -> runTransferrer sd r key f d Download witness
where
go dest p = verifiedAction $
Remote.retrieveKeyFile r key f (fromRawFilePath dest) p
-- Download, not supporting stall detection.
download' :: Observable v => UUID -> Key -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> NotifyWitness -> Annex v
download' u key f d a _witness = guardHaveUUID u $
runTransfer (Transfer Download u (fromKey id key)) f d a
@ -115,7 +108,7 @@ alwaysRunTransfer :: Observable v => Transfer -> AssociatedFile -> RetryDecider
alwaysRunTransfer = runTransfer' True
runTransfer' :: Observable v => Bool -> Transfer -> AssociatedFile -> RetryDecider -> (MeterUpdate -> Annex v) -> Annex v
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ preCheckSecureHashes t $ do
runTransfer' ignorelock t afile retrydecider transferaction = enteringStage TransferStage $ debugLocks $ preCheckSecureHashes (transferKey t) $ do
info <- liftIO $ startTransferInfo afile
(meter, tfile, createtfile, metervar) <- mkProgressUpdater t info
mode <- annexFileMode
@ -202,6 +195,31 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
f <- fromRepo $ gitAnnexTmpObjectLocation (transferKey t)
liftIO $ catchDefaultIO 0 $ getFileSize f
runTransferrer
:: StallDetection
-> Remote
-> Key
-> AssociatedFile
-> RetryDecider
-> Direction
-> NotifyWitness
-> Annex Bool
runTransferrer sd r k afile retrydecider direction _witness =
enteringStage TransferStage $ preCheckSecureHashes k $ do
info <- liftIO $ startTransferInfo afile
go 0 info
where
go numretries info =
withTransferrer (performTransfer (Just sd) AnnexLevel id (Just r) t info) >>= \case
Right () -> return True
Left newinfo -> do
let !numretries' = succ numretries
ifM (retrydecider numretries' info newinfo)
( go numretries' newinfo
, return False
)
t = Transfer direction (Remote.uuid r) (fromKey id k)
{- Avoid download and upload of keys with insecure content when
- annex.securehashesonly is configured.
-
@ -214,8 +232,8 @@ runTransfer' ignorelock t afile retrydecider transferaction = enteringStage Tran
- still contains content using an insecure hash, remotes will likewise
- tend to be configured to reject it, so Upload is also prevented.
-}
preCheckSecureHashes :: Observable v => Transfer -> Annex v -> Annex v
preCheckSecureHashes t a = ifM (isCryptographicallySecure (transferKey t))
preCheckSecureHashes :: Observable v => Key -> Annex v -> Annex v
preCheckSecureHashes k a = ifM (isCryptographicallySecure k)
( a
, ifM (annexSecureHashesOnly <$> Annex.getGitConfig)
( do
@ -225,7 +243,7 @@ preCheckSecureHashes t a = ifM (isCryptographicallySecure (transferKey t))
)
)
where
variety = fromKey keyVariety (transferKey t)
variety = fromKey keyVariety k
type NumRetries = Integer
@ -348,3 +366,9 @@ lessActiveFirst :: M.Map Remote Integer -> Remote -> Remote -> Ordering
lessActiveFirst active a b
| Remote.cost a == Remote.cost b = comparing (`M.lookup` active) a b
| otherwise = comparing Remote.cost a b
stallDetection :: Remote -> Annex (Maybe StallDetection)
stallDetection r = maybe globalcfg (pure . Just) remotecfg
where
globalcfg = annexStallDetection <$> Annex.getGitConfig
remotecfg = remoteAnnexStallDetection $ Remote.gitconfig r

View file

@ -16,15 +16,21 @@ import Types.Transfer
import Types.Key
import qualified Types.Remote as Remote
import Git.Types (RemoteName)
import Types.StallDetection
import Types.Messages
import Messages.Serialized
import Annex.Path
import Utility.Batch
import Utility.Metered
import Utility.HumanTime
import Utility.ThreadScheduler
import Control.Concurrent.STM hiding (check)
import Control.Concurrent
import Control.Concurrent.Async
import Control.Concurrent.STM hiding (check)
import Control.Monad.IO.Class (MonadIO)
import Text.Read (readMaybe)
import Data.Time.Clock.POSIX
import System.Log.Logger (debugM)
data TransferRequest = TransferRequest TransferRequestLevel Direction (Either UUID RemoteName) KeyData AssociatedFile
@ -68,7 +74,11 @@ withTransferrer' minimizeprocesses mkcheck program batchmaker pool a = do
where
returntopool leftinpool check t i
| not minimizeprocesses || leftinpool == 0 =
liftIO $ atomically $ pushTransferrerPool pool i
-- If the transferrer got killed, the handles will
-- be closed, so it should not be returned to the
-- pool.
liftIO $ whenM (hIsOpen (transferrerWrite t)) $
liftIO $ atomically $ pushTransferrerPool pool i
| otherwise = liftIO $ do
void $ forkIO $ shutdownTransferrer t
atomically $ pushTransferrerPool pool $ TransferrerPoolItem Nothing check
@ -90,24 +100,102 @@ checkTransferrerPoolItem program batchmaker i = case i of
return $ TransferrerPoolItem (Just t) check
{- Requests that a Transferrer perform a Transfer, and waits for it to
- finish. -}
- finish.
-
- When a stall is detected, kills the Transferrer.
-
- If the transfer failed or stalled, returns TransferInfo with an
- updated bytesComplete reflecting how much data has been transferred.
-}
performTransfer
:: (Monad m, MonadIO m, MonadMask m)
=> Transferrer
=> Maybe StallDetection
-> TransferRequestLevel
-> Transfer
-> Maybe Remote
-> AssociatedFile
-> (forall a. Annex a -> m a)
-- ^ Run an annex action in the monad. Will not be used with
-- actions that block for a long time.
-> m Bool
performTransfer transferrer level t mremote afile runannex = catchBoolIO $ do
(liftIO $ sendRequest level t mremote afile (transferrerWrite transferrer))
relaySerializedOutput
-> Maybe Remote
-> Transfer
-> TransferInfo
-> Transferrer
-> m (Either TransferInfo ())
performTransfer stalldetection level runannex r t info transferrer = do
bpv <- liftIO $ newTVarIO zeroBytesProcessed
ifM (catchBoolIO $ bracket setup cleanup (go bpv))
( return (Right ())
, do
n <- case transferDirection t of
Upload -> liftIO $ atomically $
fromBytesProcessed <$> readTVar bpv
Download -> do
f <- runannex $ fromRepo $ gitAnnexTmpObjectLocation (transferKey t)
liftIO $ catchDefaultIO 0 $ getFileSize f
return $ Left $ info { bytesComplete = Just n }
)
where
setup = do
liftIO $ sendRequest level t r
(associatedFile info)
(transferrerWrite transferrer)
metervar <- liftIO $ newEmptyTMVarIO
stalledvar <- liftIO $ newTVarIO False
tid <- liftIO $ async $
detectStalls stalldetection metervar $ do
atomically $ writeTVar stalledvar True
killTransferrer transferrer
return (metervar, tid, stalledvar)
cleanup (_, tid, stalledvar) = do
liftIO $ uninterruptibleCancel tid
whenM (liftIO $ atomically $ readTVar stalledvar) $ do
runannex $ showLongNote "Transfer stalled"
-- Close handles, to prevent the transferrer being
-- reused since the process was killed.
liftIO $ hClose $ transferrerRead transferrer
liftIO $ hClose $ transferrerWrite transferrer
go bpv (metervar, _, _) = relaySerializedOutput
(liftIO $ readResponse (transferrerRead transferrer))
(liftIO . sendSerializedOutputResponse (transferrerWrite transferrer))
(updatemeter bpv metervar)
runannex
updatemeter bpv metervar (Just n) = liftIO $ do
atomically $ do
void $ tryTakeTMVar metervar
putTMVar metervar n
atomically $ writeTVar bpv n
updatemeter _bpv metervar Nothing = liftIO $
atomically $ void $ tryTakeTMVar metervar
detectStalls :: Maybe StallDetection -> TMVar BytesProcessed -> IO () -> IO ()
detectStalls Nothing _ _ = noop
detectStalls (Just (StallDetection minsz duration)) metervar onstall = go Nothing
where
go st = do
starttm <- getPOSIXTime
threadDelaySeconds (Seconds (fromIntegral (durationSeconds duration)))
-- Get whatever progress value was reported most recently, or
-- if none were reported since last time, wait until one is
-- reported.
sofar <- atomically $ fromBytesProcessed <$> takeTMVar metervar
case st of
Nothing -> go (Just sofar)
Just prev
-- Just in case a progress meter somehow runs
-- backwards, or a second progress meter was
-- started and is at a smaller value than
-- the previous one.
| prev > sofar -> go (Just sofar)
| otherwise -> do
endtm <- getPOSIXTime
let actualduration = endtm - starttm
let sz = sofar - prev
let expectedsz = (minsz * durationSeconds duration)
`div` max 1 (ceiling actualduration)
if sz < expectedsz
then onstall
else go (Just sofar)
{- Starts a new git-annex transferkeys process, setting up handles
- that will be used to communicate with it. -}
@ -171,8 +259,6 @@ shutdownTransferrer t = do
{- Kill the transferrer, and all its child processes. -}
killTransferrer :: Transferrer -> IO ()
killTransferrer t = do
hClose $ transferrerRead t
hClose $ transferrerWrite t
interruptProcessGroupOf $ transferrerHandle t
threadDelay 50000 -- 0.05 second grace period
terminateProcess $ transferrerHandle t

View file

@ -24,6 +24,7 @@ import Assistant.Alert
import Assistant.Alert.Utility
import Assistant.Commits
import Assistant.Drop
import Annex.Transfer (stallDetection)
import Types.Transfer
import Logs.Transfer
import Logs.Location
@ -37,6 +38,7 @@ import Annex.Path
import Utility.Batch
import Types.NumCopies
import Data.Either
import qualified Data.Map as M
import qualified Control.Exception as E
import Control.Concurrent
@ -123,7 +125,8 @@ genTransfer t info = case transferRemote info of
( do
debug [ "Transferring:" , describeTransfer t info ]
notifyTransfer
return $ Just (t, info, go remote)
sd <- liftAnnex $ stallDetection remote
return $ Just (t, info, go remote sd)
, do
debug [ "Skipping unnecessary transfer:",
describeTransfer t info ]
@ -162,7 +165,7 @@ genTransfer t info = case transferRemote info of
- usual cleanup. However, first check if something else is
- running the transfer, to avoid removing active transfers.
-}
go remote transferrer = ifM (performTransfer transferrer AssistantLevel t (transferRemote info) (associatedFile info) liftAnnex)
go remote sd transferrer = ifM (isRight <$> performTransfer sd AssistantLevel liftAnnex (transferRemote info) t info transferrer)
( do
case associatedFile info of
AssociatedFile Nothing -> noop

View file

@ -1,5 +1,8 @@
git-annex (8.20201128) UNRELEASED; urgency=medium
* New config annex.stalldetection, remote.name.annex-stalldetection,
which can be used to deal with remotes that stall during transfers,
or are sometimes too slow to want to use.
* Fix hang on shutdown of external special remote using ASYNC protocol
extension. (Reversion introduced in version 8.20201007.)
* Fix bug that made the next download after an empty file from a ssh

View file

@ -20,6 +20,7 @@ import Messages
import Messages.Internal
import Messages.Progress
import qualified Messages.JSON as JSON
import Utility.Metered (BytesProcessed)
import Control.Monad.IO.Class (MonadIO)
@ -29,11 +30,16 @@ relaySerializedOutput
=> m (Either SerializedOutput r)
-- ^ Get next serialized output, or final value to return.
-> (SerializedOutputResponse -> m ())
-- ^ Send response to child process.
-> (Maybe BytesProcessed -> m ())
-- ^ When a progress meter is running, is updated with
-- progress meter values sent by the process.
-- When a progress meter is stopped, Nothing is sent.
-> (forall a. Annex a -> m a)
-- ^ Run an annex action in the monad. Will not be used with
-- actions that block for a long time.
-> m r
relaySerializedOutput getso sendsor runannex = go Nothing
relaySerializedOutput getso sendsor meterreport runannex = go Nothing
where
go st = loop st >>= \case
Right r -> return r
@ -69,10 +75,14 @@ relaySerializedOutput getso sendsor runannex = go Nothing
-- output after the progress meter
-- is done.
Left _st' -> loop Nothing
Left EndProgressMeter -> return (Left st)
Left EndProgressMeter -> do
meterreport Nothing
return (Left st)
Left (UpdateProgressMeter n) -> do
case st of
Just meterupdate -> liftIO $ meterupdate n
Just meterupdate -> do
meterreport (Just n)
liftIO $ meterupdate n
Nothing -> noop
loop st
Left StartPrompt -> do

View file

@ -41,6 +41,7 @@ import Types.NumCopies
import Types.Difference
import Types.RefSpec
import Types.RepoVersion
import Types.StallDetection
import Config.DynamicConfig
import Utility.HumanTime
import Utility.Gpg (GpgCmd, mkGpgCmd)
@ -116,6 +117,7 @@ data GitConfig = GitConfig
, annexRetry :: Maybe Integer
, annexForwardRetry :: Maybe Integer
, annexRetryDelay :: Maybe Seconds
, annexStallDetection :: Maybe StallDetection
, annexAllowedUrlSchemes :: S.Set Scheme
, annexAllowedIPAddresses :: String
, annexAllowUnverifiedDownloads :: Bool
@ -202,6 +204,9 @@ extractGitConfig configsource r = GitConfig
, annexForwardRetry = getmayberead (annexConfig "forward-retry")
, annexRetryDelay = Seconds
<$> getmayberead (annexConfig "retrydelay")
, annexStallDetection =
either (const Nothing) Just . parseStallDetection
=<< getmaybe (annexConfig "stalldetection")
, annexAllowedUrlSchemes = S.fromList $ map mkScheme $
maybe ["http", "https", "ftp"] words $
getmaybe (annexConfig "security.allowed-url-schemes")
@ -306,6 +311,7 @@ data RemoteGitConfig = RemoteGitConfig
, remoteAnnexRetry :: Maybe Integer
, remoteAnnexForwardRetry :: Maybe Integer
, remoteAnnexRetryDelay :: Maybe Seconds
, remoteAnnexStallDetection :: Maybe StallDetection
, remoteAnnexAllowUnverifiedDownloads :: Bool
, remoteAnnexConfigUUID :: Maybe UUID
@ -369,6 +375,9 @@ extractRemoteGitConfig r remotename = do
, remoteAnnexForwardRetry = getmayberead "forward-retry"
, remoteAnnexRetryDelay = Seconds
<$> getmayberead "retrydelay"
, remoteAnnexStallDetection =
either (const Nothing) Just . parseStallDetection
=<< getmaybe "stalldetection"
, remoteAnnexAllowUnverifiedDownloads = (== Just "ACKTHPPT") $
getmaybe ("security-allow-unverified-downloads")
, remoteAnnexConfigUUID = toUUID <$> getmaybe "config-uuid"

29
Types/StallDetection.hs Normal file
View file

@ -0,0 +1,29 @@
{- types for stall detection
-
- Copyright 2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.StallDetection where
import Utility.DataUnits
import Utility.HumanTime
import Utility.Misc
-- Unless the given number of bytes have been sent over the given
-- amount of time, there's a stall.
data StallDetection = StallDetection ByteSize Duration
deriving (Show)
-- Parse eg, "0KiB/60s"
parseStallDetection :: String -> Either String StallDetection
parseStallDetection s =
let (bs, ds) = separate (== '/') s
in do
b <- maybe
(Left $ "Unable to parse stall detection amount " ++ bs)
Right
(readSize dataUnits bs)
d <- parseDuration ds
return (StallDetection b d)

View file

@ -34,7 +34,7 @@ module Utility.Process (
) where
import qualified Utility.Process.Shim
import Utility.Process.Shim as X (CreateProcess(..), ProcessHandle, StdStream(..), CmdSpec(..), proc, getPid, getProcessExitCode, shell, terminateProcess)
import Utility.Process.Shim as X (CreateProcess(..), ProcessHandle, StdStream(..), CmdSpec(..), proc, getPid, getProcessExitCode, shell, terminateProcess, interruptProcessGroupOf)
import Utility.Misc
import Utility.Exception
import Utility.Monad

View file

@ -1410,7 +1410,7 @@ Remotes are configured using these settings in `.git/config`.
The format is "$amount/$timeperiod".
For example, to detect outright stalls where no data has been transferred
after 30 seconds: `git config annex.stalldetection "0KiB/60s"`
after 30 seconds: `git config annex.stalldetection "0/30s"`
Or, if you have a remote on a USB drive that is normally capable of
several megabytes per second, but has bad sectors where it gets

View file

@ -1,3 +1,5 @@
In a number of scenarios (e.g. [[bugs/still_seeing_errors_with_parallel_git-annex-add]], [[bugs/parallel_copy_fails]], [[git-annex-sync/#comment-aceb18109c0a536e04bcdd3aa04bda29]]), `git-annex` operations may fail or hang due to transient conditions. It would help a lot if `git-annex` could be configured to fail timed-out operations, and to retry failed operations after a delay. This would especially help when using `git-annex` in a script or a higher-level tool. I've tried wrapping some retry logic around `git-annex` calls, but it seems `git-annex` itself is in the best position to do that sensibly (e.g. only retrying idempotent operations, or capping retries per remote). This would be a catch-all fix for unusual conditions that are hard to test for.
`git-annex` already has config options `annex.retry` and `annex.retry-delay`, but it seems that they don't cover all failure types.
> Added annex.stalldetection, [[done]] --[[Joey]]

View file

@ -1023,6 +1023,7 @@ Executable git-annex
Types.RepoVersion
Types.ScheduledActivity
Types.StandardGroups
Types.StallDetection
Types.StoreRetrieve
Types.Test
Types.Transfer