remotedaemon: When network connection is lost, close all cached ssh connections.

This commit was sponsored by Cedric Staub.
This commit is contained in:
Joey Hess 2014-04-12 16:32:59 -04:00
parent 15917ec1a8
commit a33b30d0c4
7 changed files with 76 additions and 36 deletions

View file

@ -11,6 +11,7 @@ module Annex.Ssh (
sshCachingOptions,
sshCacheDir,
sshReadPort,
forceSshCleanup,
sshCachingEnv,
sshCachingTo,
inRepoWithSshCachingTo,
@ -124,21 +125,27 @@ prepSocket socketfile = do
liftIO $ createDirectoryIfMissing True $ parentDir socketfile
lockFile $ socket2lock socketfile
{- Stop any unused ssh processes. -}
sshCleanup :: Annex ()
sshCleanup = go =<< sshCacheDir
enumSocketFiles :: Annex [FilePath]
enumSocketFiles = go =<< sshCacheDir
where
go Nothing = return []
go (Just dir) = liftIO $ filter (not . isLock)
<$> catchDefaultIO [] (dirContents dir)
{- Stop any unused ssh connection caching processes. -}
sshCleanup :: Annex ()
sshCleanup = mapM_ cleanup =<< enumSocketFiles
where
go Nothing = noop
go (Just dir) = do
sockets <- liftIO $ filter (not . isLock)
<$> catchDefaultIO [] (dirContents dir)
forM_ sockets cleanup
cleanup socketfile = do
#ifndef mingw32_HOST_OS
-- Drop any shared lock we have, and take an
-- exclusive lock, without blocking. If the lock
-- succeeds, nothing is using this ssh, and it can
-- be stopped.
--
-- After ssh is stopped cannot remove the lock file;
-- other processes may be waiting on our exclusive
-- lock to use it.
let lockfile = socket2lock socketfile
unlockFile lockfile
mode <- annexFileMode
@ -148,24 +155,28 @@ sshCleanup = go =<< sshCacheDir
setLock fd (WriteLock, AbsoluteSeek, 0, 0)
case v of
Left _ -> noop
Right _ -> stopssh socketfile
Right _ -> forceStopSsh socketfile
liftIO $ closeFd fd
#else
stopssh socketfile
forceStopSsh socketfile
#endif
stopssh socketfile = do
let (dir, base) = splitFileName socketfile
let params = sshConnectionCachingParams base
-- "ssh -O stop" is noisy on stderr even with -q
void $ liftIO $ catchMaybeIO $
withQuietOutput createProcessSuccess $
(proc "ssh" $ toCommand $
[ Params "-O stop"
] ++ params ++ [Param "localhost"])
{ cwd = Just dir }
liftIO $ nukeFile socketfile
-- Cannot remove the lock file; other processes may
-- be waiting on our exclusive lock to use it.
{- Stop all ssh connection caching processes, even when they're in use. -}
forceSshCleanup :: Annex ()
forceSshCleanup = mapM_ forceStopSsh =<< enumSocketFiles
forceStopSsh :: FilePath -> Annex ()
forceStopSsh socketfile = do
let (dir, base) = splitFileName socketfile
let params = sshConnectionCachingParams base
-- "ssh -O stop" is noisy on stderr even with -q
void $ liftIO $ catchMaybeIO $
withQuietOutput createProcessSuccess $
(proc "ssh" $ toCommand $
[ Params "-O stop"
] ++ params ++ [Param "localhost"])
{ cwd = Just dir }
liftIO $ nukeFile socketfile
{- This needs to be as short as possible, due to limitations on the length
- of the path to a socket file. At the same time, it needs to be unique

View file

@ -71,7 +71,7 @@ dbusThread = do
)
handleconn = do
debug ["detected network connection"]
sendRemoteControl PAUSE
sendRemoteControl LOSTNET
notifyNetMessagerRestart
handleConnection
sendRemoteControl RESUME

View file

@ -18,6 +18,7 @@ import qualified Git.Types as Git
import qualified Git.CurrentRepo
import Utility.SimpleProtocol
import Config
import Annex.Ssh
import Control.Concurrent.Async
import Control.Concurrent
@ -65,12 +66,19 @@ runController ichan ochan = do
let common = M.intersection m m'
let new = M.difference m' m
let old = M.difference m m'
stoprunning old
broadcast STOP old
unless paused $
startrunning new
go h paused (M.union common new)
LOSTNET -> do
-- force close all cached ssh connections
-- (done here so that if there are multiple
-- ssh remotes, it's only done once)
liftAnnex h forceSshCleanup
broadcast LOSTNET m
go h True M.empty
PAUSE -> do
stoprunning m
broadcast STOP m
go h True M.empty
RESUME -> do
when paused $
@ -89,9 +97,9 @@ runController ichan ochan = do
startrunning m = forM_ (M.elems m) startrunning'
startrunning' (transport, _) = void $ async transport
-- Ask the transport nicely to stop.
stoprunning m = forM_ (M.elems m) stoprunning'
stoprunning' (_, c) = writeChan c STOP
broadcast msg m = forM_ (M.elems m) send
where
send (_, c) = writeChan c msg
-- Generates a map with a transport for each supported remote in the git repo,
-- except those that have annex.sync = false

View file

@ -84,6 +84,7 @@ transport' r url transporthandle ichan ochan = do
msg <- readChan ichan
case msg of
STOP -> return Stopping
LOSTNET -> return Stopping
_ -> handlecontrol
-- Old versions of git-annex-shell that do not support

View file

@ -42,6 +42,7 @@ data Emitted
-- Messages that the deamon consumes.
data Consumed
= PAUSE
| LOSTNET
| RESUME
| CHANGED RefList
| RELOAD
@ -63,6 +64,7 @@ instance Proto.Sendable Emitted where
instance Proto.Sendable Consumed where
formatMessage PAUSE = ["PAUSE"]
formatMessage LOSTNET = ["LOSTNET"]
formatMessage RESUME = ["RESUME"]
formatMessage (CHANGED refs) =["CHANGED", Proto.serialize refs]
formatMessage RELOAD = ["RELOAD"]
@ -78,6 +80,7 @@ instance Proto.Receivable Emitted where
instance Proto.Receivable Consumed where
parseCommand "PAUSE" = Proto.parse0 PAUSE
parseCommand "LOSTNET" = Proto.parse0 LOSTNET
parseCommand "RESUME" = Proto.parse0 RESUME
parseCommand "CHANGED" = Proto.parse1 CHANGED
parseCommand "RELOAD" = Proto.parse0 RELOAD

2
debian/changelog vendored
View file

@ -10,6 +10,8 @@ git-annex (5.20140413) UNRELEASED; urgency=medium
set up.
* sync, assistant, remotedaemon: Use ssh connection caching for git pushes
and pulls.
* remotedaemon: When network connection is lost, close all cached ssh
connections.
* Improve handling on monthly/yearly scheduling.
-- Joey Hess <joeyh@debian.org> Fri, 11 Apr 2014 21:33:35 -0400

View file

@ -95,18 +95,18 @@ the webapp.
* `PAUSE`
This indicates that the network connection has gone down,
or the user has requested a pause.
The user has requested a pause.
git-remote-daemon should close connections and idle.
Affects all remotes.
* `LOSTNET`
The network connection has been lost.
git-remote-daemon should close connections and idle.
* `RESUME`
This indicates that the network connection has come back up, or the user
has asked it to run again. Start back up network connections.
Affects all remotes.
Undoes PAUSE or DISCONNECTED.
Start back up network connections.
* `CHANGED ref ...`
@ -170,6 +170,21 @@ TODO:
* Remote system might not be available. Find a smart way to detect it,
ideally w/o generating network traffic. One way might be to check
if the ssh connection caching control socket exists, for example.
* Now that ssh connection caching is enabled for git push/pull in sync,
there's the possibility that a stale ssh connection may linger when
changing network connections, and so attempts to use it will stall.
(This was already a potential issue with transfers, which already
used the caching.)
One option is ssh's ServerAliveCountMax, which will make a dead
ssh connection disconnect after approx 45 seconds, per ssh manual.
It would need to be enabled by setting ServerAliveInterval=15.
And this would add network traffic..
Another option is to disable all cached connections when the network
connection changes. This would handle *most* cases. The case
not handled is eg, my dialup ppp box getting a new public IP address,
which my laptop won't notice. **done**
## telehash