From 8dcf79694d3b023bdd562ca7c821b3e4830d7bbe Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 26 Oct 2016 15:38:22 -0400 Subject: [PATCH] enable forwardRetry for command-line transfers If a transfer fails for some reason, but some data managed to be sent, the transfer will be retried. (The assistant already did this.) Possible impacts: * More ssh prompts if ssh needs to prompt for a password to connect to a host, or is prompting about some other problem like a ssh key mismatch. * More data transfer due to retrying, epecially when a remote does not support resuming a transfer. In the worst case, a lot of data will be transferred but it fails before the end, and then all that data gets transferred again plus one byte more; repeat until it manages to get the whole file. --- CHANGELOG | 2 + Command/Get.hs | 2 +- Command/Move.hs | 4 +- Command/SendKey.hs | 1 + Remote/Git.hs | 4 +- .../Allow_automatic_retry_git_annex_get.mdwn | 1 + ..._6d05cd09e1f00fb5ace2b9ae3bffdedb._comment | 66 +++++++++++++++++++ 7 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 doc/bugs/Allow_automatic_retry_git_annex_get/comment_2_6d05cd09e1f00fb5ace2b9ae3bffdedb._comment diff --git a/CHANGELOG b/CHANGELOG index d91b79c630..c4016ef358 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,8 @@ git-annex (6.20161013) UNRELEASED; urgency=medium * test: Deal with gpg-agent behavior change that broke the test suite. * Improve ssh socket cleanup code to skip over the cruft that NFS sometimes puts in a directory when a file is being deleted. + * If a transfer fails for some reason, but some data managed to be sent, + the transfer will be retried. (The assistant already did this.) -- Joey Hess Mon, 17 Oct 2016 12:46:54 -0400 diff --git a/Command/Get.hs b/Command/Get.hs index a0c7aff47d..abf95e48a6 100644 --- a/Command/Get.hs +++ b/Command/Get.hs @@ -109,7 +109,7 @@ getKey' key afile = dispatch either (const False) id <$> Remote.hasKey r key | otherwise = return True docopy r witness = getViaTmp (RemoteVerify r) key $ \dest -> - download (Remote.uuid r) key afile noRetry + download (Remote.uuid r) key afile forwardRetry (\p -> do showAction $ "from " ++ Remote.name r Remote.retrieveKeyFile r key afile dest p diff --git a/Command/Move.hs b/Command/Move.hs index b44041f6c0..9c43c6f1db 100644 --- a/Command/Move.hs +++ b/Command/Move.hs @@ -114,7 +114,7 @@ toPerform dest move key afile fastcheck isthere = Right False -> do showAction $ "to " ++ Remote.name dest ok <- notifyTransfer Upload afile $ - upload (Remote.uuid dest) key afile noRetry $ + upload (Remote.uuid dest) key afile forwardRetry $ Remote.storeKey dest key afile if ok then finish $ @@ -177,7 +177,7 @@ fromPerform src move key afile = ifM (inAnnex key) ) where go = notifyTransfer Download afile $ - download (Remote.uuid src) key afile noRetry $ \p -> do + download (Remote.uuid src) key afile forwardRetry $ \p -> do showAction $ "from " ++ Remote.name src getViaTmp (RemoteVerify src) key $ \t -> Remote.retrieveKeyFile src key afile t p diff --git a/Command/SendKey.hs b/Command/SendKey.hs index 68da10316b..302810374a 100644 --- a/Command/SendKey.hs +++ b/Command/SendKey.hs @@ -48,6 +48,7 @@ fieldTransfer direction key a = do liftIO $ debugM "fieldTransfer" "transfer start" afile <- Fields.getField Fields.associatedFile ok <- maybe (a $ const noop) + -- Using noRetry here because we're the sender. (\u -> runner (Transfer direction (toUUID u) key) afile noRetry a) =<< Fields.getField Fields.remoteUUID liftIO $ debugM "fieldTransfer" "transfer done" diff --git a/Remote/Git.hs b/Remote/Git.hs index ff5e733ce5..34bdd83a16 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -439,7 +439,7 @@ copyFromRemote' r key file dest meterupdate Just (object, checksuccess) -> do copier <- mkCopier hardlink params runTransfer (Transfer Download u key) - file noRetry + file forwardRetry (\p -> copier object dest (combineMeterUpdate p meterupdate) checksuccess) | Git.repoIsSsh (repo r) = unVerified $ feedprogressback $ \p -> do Ssh.rsyncHelper (Just (combineMeterUpdate meterupdate p)) @@ -565,7 +565,7 @@ copyToRemote' r key file meterupdate ensureInitialized copier <- mkCopier hardlink params let verify = Annex.Content.RemoteVerify r - runTransfer (Transfer Download u key) file noRetry $ \p -> + runTransfer (Transfer Download u key) file forwardRetry $ \p -> let p' = combineMeterUpdate meterupdate p in Annex.Content.saveState True `after` Annex.Content.getViaTmp verify key diff --git a/doc/bugs/Allow_automatic_retry_git_annex_get.mdwn b/doc/bugs/Allow_automatic_retry_git_annex_get.mdwn index 757ef7ca1d..c6e406b490 100644 --- a/doc/bugs/Allow_automatic_retry_git_annex_get.mdwn +++ b/doc/bugs/Allow_automatic_retry_git_annex_get.mdwn @@ -58,3 +58,4 @@ SHA256E-s41311329--69c3b054a3fe9676133605730d85b7fcef8696f6782d402a524e41b836253 +[[!meta title="Detect stalled transfer and retry or abort it"]] diff --git a/doc/bugs/Allow_automatic_retry_git_annex_get/comment_2_6d05cd09e1f00fb5ace2b9ae3bffdedb._comment b/doc/bugs/Allow_automatic_retry_git_annex_get/comment_2_6d05cd09e1f00fb5ace2b9ae3bffdedb._comment new file mode 100644 index 0000000000..6d2563e46d --- /dev/null +++ b/doc/bugs/Allow_automatic_retry_git_annex_get/comment_2_6d05cd09e1f00fb5ace2b9ae3bffdedb._comment @@ -0,0 +1,66 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2016-10-26T18:26:35Z" + content=""" +The most common way a network connection can stall like this is when +moving to a different wifi network: the connection is open but +no more data will be received. I suppose other kinds of network +glitches could also lead to this kind of situation. + +ssh has some things, like ServerAliveInterval and TCPKeepAlive, +that it can use to detect such problems. You may find them useful. + +---- + +As for the retrying once a stall is detected, some transfers use +`forwardRetry` which will automatically retry as long as the failed try +managed to send some data. But the get/move/copy commands currently use +`noRetry`. I can't find any justification for not always using +`forwardRetry`; I think that it was added for the assistant originally and +the other stuff just never switched over. + +Only problem I can think of is, if there actually is a ssh password +prompt, it would prompt again on retry. But most people using git-annex +with ssh have something in place to make ssh not prompt repeatedly for +passwords. + +So, I've gone ahead and enabled `forwardRetry` for everything. + +---- + +Occurs to me that git-annex could try to notice when a transfer is not +progressing, by reusing the existing progress metering code. + +Since some remotes don't update the progress meter, this could +only be used to detect stalls after the progress meter has been updated +at least once. If the stall occurs earlier than that, it would not be able +to be detected. + +It seems quite hard to come up with a good timeout value to detect a +stalled connection. Often progress meters are updated after every small +(eg 32kb) chunk transferred. But others might poll periodically, or might +use a larger chunk size. It's even possible that some special remotes +are looking at a percent output by some program, and only update the meter +when the percent transferred changes -- in which case it could be many +minutes in between each meter update when a large file is being +transferred. + +If the timeout is too short, git-annex will stall in a new way, by +constantly killing "stalled" connections before they can send enough data. + +---- + +So it really seems better to fix the ssh connection to not stall, since +that is not so heuristic a fix. Seems like git-annex could force +ServerAliveInterval to be set, and perhaps lower ServerAliveCountMax from 3 +to 1. The ssh BatchMode setting sets the former to 300, so a stalled +connection will time out after 15 minutes. But BatchMode also disables +prompting, and git-annex should not disable that. + +Catch is, what if the user has configured ssh with some +other ServerAliveInterval value? We don't want git-annex to override that. + +(git-annex does have a rudimentary .ssh/config parser, but it's not +good enough to handle eg, "Host *.example.com ") +"""]]