diff --git a/Annex/Ssh.hs b/Annex/Ssh.hs index b913c154c1..7bb9ad0f2e 100644 --- a/Annex/Ssh.hs +++ b/Annex/Ssh.hs @@ -34,7 +34,6 @@ import Annex.Path import Utility.Env import Utility.FileSystemEncoding import Utility.Hash -import Utility.Process.Transcript import Types.CleanupActions import Types.Concurrency import Git.Env @@ -219,13 +218,17 @@ prepSocket socketfile gc sshhost sshparams = do -- return True. -- (Except there's an unlikely false positive where a forced -- ssh command exits 255.) - tryssh extraps = liftIO $ do + tryssh extraps = liftIO $ withNullHandle $ \nullh -> do let p = proc "ssh" $ concat [ extraps , toCommand sshparams , [fromSshHost sshhost, "true"] ] - (_, exitcode) <- processTranscript'' p Nothing + (Nothing, Nothing, Nothing, pid) <- createProcess $ p + { std_out = UseHandle nullh + , std_err = UseHandle nullh + } + exitcode <- waitForProcess pid return $ case exitcode of ExitFailure 255 -> False _ -> True diff --git a/CHANGELOG b/CHANGELOG index 17db60cc42..d6f710a411 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -33,6 +33,9 @@ git-annex (6.20180228) UNRELEASED; urgency=medium the only copy of a file when it thought the tor remote had a copy. * Better ssh connection warmup when using -J for concurrency. Avoids ugly messages when forced ssh command is not git-annex-shell. + * Fix race condition in ssh warmup that caused git-annex to get + stuck and never process some files when run with high levels of + concurrency. * Note that Remote/Git.hs now contains AGPL licensed code, thus the license of git-annex as a whole is AGPL. This was already the case when git-annex was built with the webapp enabled. 
diff --git a/doc/bugs/occasional_hang_with_p2pstdio.mdwn b/doc/bugs/occasional_hang_with_p2pstdio.mdwn index bdfce1fe4c..a46e59c35d 100644 --- a/doc/bugs/occasional_hang_with_p2pstdio.mdwn +++ b/doc/bugs/occasional_hang_with_p2pstdio.mdwn @@ -29,3 +29,26 @@ Interestingly, the debug log shows it only ran git-annex-shell p2pstdio 6 times, despite the concurrency of 20. So, the other 14 must have stalled setting up the connection. Suggests the bug is in the connection pool code. + +> The hang does not involve the connection pool code itself; a call to +> Annex.Ssh.sshCommand is hanging. So, this likely affected git-annex +> before p2pstdio, although its timings of calls to sshCommand may be +> exposing the problem. +> +> The hang is in prepSocket; all the threads enter makeconnection near the +> same time, and so all of them try to warm up the ssh connection at the +> same time. And somehow many of those executions of ssh hang. +> (Arguably there should be locking to prevent multiple threads doing +> this, but the actual overhead of multiple threads doing it may be +> smaller than the overhead of such added locking.) +> +> Why is makeconnection's use of processTranscript hanging? +> processTranscript tries to read the process's output (ssh has none), +> and waiting for the output to get read is for some reason hanging +> forever, despite the ssh process becoming a zombie. +> Converted makeconnection to not use processTranscript, +> and that does seem to avoid the hang. +> +> So, this bug is left open only because processTranscript hangs in situations +> like this. No other uses of it involve concurrency, but we still need to +> get to the bottom of its hang.. --[[Joey]]