remotedaemon: When network connection is lost, close all cached ssh connections.

This commit was sponsored by Cedric Staub.
This commit is contained in:
Joey Hess 2014-04-12 16:32:59 -04:00
parent 15917ec1a8
commit a33b30d0c4
7 changed files with 76 additions and 36 deletions

View file

@ -11,6 +11,7 @@ module Annex.Ssh (
sshCachingOptions, sshCachingOptions,
sshCacheDir, sshCacheDir,
sshReadPort, sshReadPort,
forceSshCleanup,
sshCachingEnv, sshCachingEnv,
sshCachingTo, sshCachingTo,
inRepoWithSshCachingTo, inRepoWithSshCachingTo,
@ -124,21 +125,27 @@ prepSocket socketfile = do
liftIO $ createDirectoryIfMissing True $ parentDir socketfile liftIO $ createDirectoryIfMissing True $ parentDir socketfile
lockFile $ socket2lock socketfile lockFile $ socket2lock socketfile
{- Stop any unused ssh processes. -} enumSocketFiles :: Annex [FilePath]
sshCleanup :: Annex () enumSocketFiles = go =<< sshCacheDir
sshCleanup = go =<< sshCacheDir where
go Nothing = return []
go (Just dir) = liftIO $ filter (not . isLock)
<$> catchDefaultIO [] (dirContents dir)
{- Stop any unused ssh connection caching processes. -}
sshCleanup :: Annex ()
sshCleanup = mapM_ cleanup =<< enumSocketFiles
where where
go Nothing = noop
go (Just dir) = do
sockets <- liftIO $ filter (not . isLock)
<$> catchDefaultIO [] (dirContents dir)
forM_ sockets cleanup
cleanup socketfile = do cleanup socketfile = do
#ifndef mingw32_HOST_OS #ifndef mingw32_HOST_OS
-- Drop any shared lock we have, and take an -- Drop any shared lock we have, and take an
-- exclusive lock, without blocking. If the lock -- exclusive lock, without blocking. If the lock
-- succeeds, nothing is using this ssh, and it can -- succeeds, nothing is using this ssh, and it can
-- be stopped. -- be stopped.
--
-- After ssh is stopped, the lock file cannot be removed;
-- other processes may be waiting on our exclusive
-- lock to use it.
let lockfile = socket2lock socketfile let lockfile = socket2lock socketfile
unlockFile lockfile unlockFile lockfile
mode <- annexFileMode mode <- annexFileMode
@ -148,24 +155,28 @@ sshCleanup = go =<< sshCacheDir
setLock fd (WriteLock, AbsoluteSeek, 0, 0) setLock fd (WriteLock, AbsoluteSeek, 0, 0)
case v of case v of
Left _ -> noop Left _ -> noop
Right _ -> stopssh socketfile Right _ -> forceStopSsh socketfile
liftIO $ closeFd fd liftIO $ closeFd fd
#else #else
stopssh socketfile forceStopSsh socketfile
#endif #endif
stopssh socketfile = do
let (dir, base) = splitFileName socketfile {- Stop all ssh connection caching processes, even when they're in use. -}
let params = sshConnectionCachingParams base forceSshCleanup :: Annex ()
-- "ssh -O stop" is noisy on stderr even with -q forceSshCleanup = mapM_ forceStopSsh =<< enumSocketFiles
void $ liftIO $ catchMaybeIO $
withQuietOutput createProcessSuccess $ forceStopSsh :: FilePath -> Annex ()
(proc "ssh" $ toCommand $ forceStopSsh socketfile = do
[ Params "-O stop" let (dir, base) = splitFileName socketfile
] ++ params ++ [Param "localhost"]) let params = sshConnectionCachingParams base
{ cwd = Just dir } -- "ssh -O stop" is noisy on stderr even with -q
liftIO $ nukeFile socketfile void $ liftIO $ catchMaybeIO $
-- Cannot remove the lock file; other processes may withQuietOutput createProcessSuccess $
-- be waiting on our exclusive lock to use it. (proc "ssh" $ toCommand $
[ Params "-O stop"
] ++ params ++ [Param "localhost"])
{ cwd = Just dir }
liftIO $ nukeFile socketfile
{- This needs to be as short as possible, due to limitations on the length {- This needs to be as short as possible, due to limitations on the length
- of the path to a socket file. At the same time, it needs to be unique - of the path to a socket file. At the same time, it needs to be unique

View file

@ -71,7 +71,7 @@ dbusThread = do
) )
handleconn = do handleconn = do
debug ["detected network connection"] debug ["detected network connection"]
sendRemoteControl PAUSE sendRemoteControl LOSTNET
notifyNetMessagerRestart notifyNetMessagerRestart
handleConnection handleConnection
sendRemoteControl RESUME sendRemoteControl RESUME

View file

@ -18,6 +18,7 @@ import qualified Git.Types as Git
import qualified Git.CurrentRepo import qualified Git.CurrentRepo
import Utility.SimpleProtocol import Utility.SimpleProtocol
import Config import Config
import Annex.Ssh
import Control.Concurrent.Async import Control.Concurrent.Async
import Control.Concurrent import Control.Concurrent
@ -65,12 +66,19 @@ runController ichan ochan = do
let common = M.intersection m m' let common = M.intersection m m'
let new = M.difference m' m let new = M.difference m' m
let old = M.difference m m' let old = M.difference m m'
stoprunning old broadcast STOP old
unless paused $ unless paused $
startrunning new startrunning new
go h paused (M.union common new) go h paused (M.union common new)
LOSTNET -> do
-- force close all cached ssh connections
-- (done here so that if there are multiple
-- ssh remotes, it's only done once)
liftAnnex h forceSshCleanup
broadcast LOSTNET m
go h True M.empty
PAUSE -> do PAUSE -> do
stoprunning m broadcast STOP m
go h True M.empty go h True M.empty
RESUME -> do RESUME -> do
when paused $ when paused $
@ -89,9 +97,9 @@ runController ichan ochan = do
startrunning m = forM_ (M.elems m) startrunning' startrunning m = forM_ (M.elems m) startrunning'
startrunning' (transport, _) = void $ async transport startrunning' (transport, _) = void $ async transport
-- Ask the transport nicely to stop. broadcast msg m = forM_ (M.elems m) send
stoprunning m = forM_ (M.elems m) stoprunning' where
stoprunning' (_, c) = writeChan c STOP send (_, c) = writeChan c msg
-- Generates a map with a transport for each supported remote in the git repo, -- Generates a map with a transport for each supported remote in the git repo,
-- except those that have annex.sync = false -- except those that have annex.sync = false

View file

@ -84,6 +84,7 @@ transport' r url transporthandle ichan ochan = do
msg <- readChan ichan msg <- readChan ichan
case msg of case msg of
STOP -> return Stopping STOP -> return Stopping
LOSTNET -> return Stopping
_ -> handlecontrol _ -> handlecontrol
-- Old versions of git-annex-shell that do not support -- Old versions of git-annex-shell that do not support

View file

@ -42,6 +42,7 @@ data Emitted
-- Messages that the daemon consumes. -- Messages that the daemon consumes.
data Consumed data Consumed
= PAUSE = PAUSE
| LOSTNET
| RESUME | RESUME
| CHANGED RefList | CHANGED RefList
| RELOAD | RELOAD
@ -63,6 +64,7 @@ instance Proto.Sendable Emitted where
instance Proto.Sendable Consumed where instance Proto.Sendable Consumed where
formatMessage PAUSE = ["PAUSE"] formatMessage PAUSE = ["PAUSE"]
formatMessage LOSTNET = ["LOSTNET"]
formatMessage RESUME = ["RESUME"] formatMessage RESUME = ["RESUME"]
formatMessage (CHANGED refs) =["CHANGED", Proto.serialize refs] formatMessage (CHANGED refs) =["CHANGED", Proto.serialize refs]
formatMessage RELOAD = ["RELOAD"] formatMessage RELOAD = ["RELOAD"]
@ -78,6 +80,7 @@ instance Proto.Receivable Emitted where
instance Proto.Receivable Consumed where instance Proto.Receivable Consumed where
parseCommand "PAUSE" = Proto.parse0 PAUSE parseCommand "PAUSE" = Proto.parse0 PAUSE
parseCommand "LOSTNET" = Proto.parse0 LOSTNET
parseCommand "RESUME" = Proto.parse0 RESUME parseCommand "RESUME" = Proto.parse0 RESUME
parseCommand "CHANGED" = Proto.parse1 CHANGED parseCommand "CHANGED" = Proto.parse1 CHANGED
parseCommand "RELOAD" = Proto.parse0 RELOAD parseCommand "RELOAD" = Proto.parse0 RELOAD

2
debian/changelog vendored
View file

@ -10,6 +10,8 @@ git-annex (5.20140413) UNRELEASED; urgency=medium
set up. set up.
* sync, assistant, remotedaemon: Use ssh connection caching for git pushes * sync, assistant, remotedaemon: Use ssh connection caching for git pushes
and pulls. and pulls.
* remotedaemon: When network connection is lost, close all cached ssh
connections.
* Improve handling on monthly/yearly scheduling. * Improve handling on monthly/yearly scheduling.
-- Joey Hess <joeyh@debian.org> Fri, 11 Apr 2014 21:33:35 -0400 -- Joey Hess <joeyh@debian.org> Fri, 11 Apr 2014 21:33:35 -0400

View file

@ -95,18 +95,18 @@ the webapp.
* `PAUSE` * `PAUSE`
This indicates that the network connection has gone down, The user has requested a pause.
or the user has requested a pause.
git-remote-daemon should close connections and idle. git-remote-daemon should close connections and idle.
Affects all remotes. * `LOSTNET`
The network connection has been lost.
git-remote-daemon should close connections and idle.
* `RESUME` * `RESUME`
This indicates that the network connection has come back up, or the user Undoes PAUSE or LOSTNET.
has asked it to run again. Start back up network connections. Start back up network connections.
Affects all remotes.
* `CHANGED ref ...` * `CHANGED ref ...`
@ -170,6 +170,21 @@ TODO:
* Remote system might not be available. Find a smart way to detect it, * Remote system might not be available. Find a smart way to detect it,
ideally w/o generating network traffic. One way might be to check ideally w/o generating network traffic. One way might be to check
if the ssh connection caching control socket exists, for example. if the ssh connection caching control socket exists, for example.
* Now that ssh connection caching is enabled for git push/pull in sync,
there's the possibility that a stale ssh connection may linger when
changing network connections, and so attempts to use it will stall.
(This was already a potential issue with transfers, which already
used the caching.)
One option is ssh's ServerAliveCountMax, which will make a dead
ssh connection disconnect after approx 45 seconds, per ssh manual.
It would need to be enabled by setting ServerAliveInterval=15.
And this would add network traffic.
Another option is to disable all cached connections when the network
connection changes. This would handle *most* cases. The case
not handled is eg, my dialup ppp box getting a new public IP address,
which my laptop won't notice. **done**
## telehash ## telehash