ssh connection caching

Ssh connection caching is now enabled automatically by git-annex. Only one
ssh connection is made to each host per git-annex run, which can speed some
things up a lot, as well as avoiding repeated password prompts. Concurrent
git-annex processes also share ssh connections. Cached ssh connections are
shut down when git-annex exits.

Note: The rsync special remote does not yet participate in the ssh
connection caching.
This commit is contained in:
Joey Hess 2012-01-20 15:34:52 -04:00
parent 25f998679c
commit 47250a153a
9 changed files with 173 additions and 23 deletions

View file

@ -29,6 +29,7 @@ module Annex (
import Control.Monad.State
import Control.Monad.Trans.Control (StM, MonadBaseControl, liftBaseWith, restoreM)
import Control.Monad.Base (liftBase, MonadBase)
import System.Posix.Types (Fd)
import Common
import qualified Git
@ -86,6 +87,7 @@ data AnnexState = AnnexState
, forcetrust :: TrustMap
, trustmap :: Maybe TrustMap
, ciphers :: M.Map EncryptedCipher Cipher
, lockpool :: M.Map FilePath Fd
, flags :: M.Map String Bool
, fields :: M.Map String String
}
@ -108,6 +110,7 @@ newState gitrepo = AnnexState
, forcetrust = M.empty
, trustmap = Nothing
, ciphers = M.empty
, lockpool = M.empty
, flags = M.empty
, fields = M.empty
}

43
Annex/LockPool.hs Normal file
View file

@ -0,0 +1,43 @@
{- git-annex lock pool
-
- Copyright 2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.LockPool where
import qualified Data.Map as M
import System.Posix.Types (Fd)
import Common.Annex
import Annex
{- Create a specified lock file, and takes a shared lock. -}
lockFile :: FilePath -> Annex ()
lockFile file = go =<< fromPool file
where
go (Just _) = return () -- already locked
go Nothing = do
fd <- liftIO $ openFd file ReadOnly (Just stdFileMode) defaultFileFlags
liftIO $ waitToSetLock fd (ReadLock, AbsoluteSeek, 0, 0)
changePool $ M.insert file fd
unlockFile :: FilePath -> Annex ()
unlockFile file = go =<< fromPool file
where
go Nothing = return ()
go (Just fd) = do
liftIO $ closeFd fd
changePool $ M.delete file
getPool :: Annex (M.Map FilePath Fd)
getPool = getState lockpool
fromPool :: FilePath -> Annex (Maybe Fd)
fromPool file = M.lookup file <$> getPool
changePool :: (M.Map FilePath Fd -> M.Map FilePath Fd) -> Annex ()
changePool a = do
m <- getPool
changeState $ \s -> s { lockpool = a m }

107
Annex/Ssh.hs Normal file
View file

@ -0,0 +1,107 @@
{- git-annex ssh interface, with connection caching
-
- Copyright 2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.Ssh (
sshParams,
sshCleanup,
) where
import qualified Data.Map as M
import System.IO.Error (try)
import Common.Annex
import Annex.LockPool
{- Generates parameters to ssh to a given host (or user@host) on a given
- port, with connection caching. -}
sshParams :: (String, Maybe Integer) -> Annex [CommandParam]
sshParams (host, port) = do
cleanstale
(socketfile, params) <- sshInfo (host, port)
liftIO $ createDirectoryIfMissing True $ parentDir socketfile
lockFile $ socket2lock socketfile
return params
where
-- If the lock pool is empty, this is the first ssh of this
-- run. There could be stale ssh connections hanging around
-- from a previous git-annex run that was interrupted.
cleanstale = whenM (null . filter isLock . M.keys <$> getPool) $
sshCleanup
sshInfo :: (String, Maybe Integer) -> Annex (FilePath, [CommandParam])
sshInfo (host, port) = do
dir <- fromRepo $ gitAnnexSshDir
let socketfile = dir </> hostport2socket host port
return $ (socketfile, cacheParams socketfile ++ portParams port ++ [Param host])
cacheParams :: FilePath -> [CommandParam]
cacheParams socketfile =
[ Param "-S", Param socketfile
, Params "-o ControlMaster=auto -o ControlPersist=yes"
]
portParams :: Maybe Integer -> [CommandParam]
portParams Nothing = []
portParams (Just port) = [Param "-p", Param $ show port]
{- Stop any unused ssh processes. -}
sshCleanup :: Annex ()
sshCleanup = do
dir <- fromRepo $ gitAnnexSshDir
liftIO $ createDirectoryIfMissing True dir
sockets <- filter (not . isLock) <$> liftIO (dirContents dir)
forM_ sockets cleanup
where
cleanup socketfile = do
-- Drop any shared lock we have, and take an
-- exclusive lock, without blocking. If the lock
-- succeeds, nothing is using this ssh, and it can
-- be stopped.
let lockfile = socket2lock socketfile
unlockFile lockfile
fd <- liftIO $ openFd lockfile ReadWrite (Just stdFileMode) defaultFileFlags
v <- liftIO $ try $ setLock fd (WriteLock, AbsoluteSeek, 0, 0)
case v of
Left _ -> return ()
Right _ -> stopssh socketfile
liftIO $ closeFd fd
stopssh socketfile = do
(_, params) <- sshInfo $ socket2hostport socketfile
_ <- liftIO $ do
-- "ssh -O stop" is noisy on stderr even with -q
let cmd = unwords $ toCommand $
[ Params "-O stop"
] ++ params
_ <- boolSystem "sh"
[ Param "-c"
, Param $ "ssh " ++ cmd ++ " >/dev/null 2>/dev/null"
]
--try $ removeFile socketfile
return ()
-- Cannot remove the lock file; other processes may
-- be waiting on our exclusive lock to use it.
return ()
hostport2socket :: String -> Maybe Integer -> FilePath
hostport2socket host Nothing = host
hostport2socket host (Just port) = host ++ "!" ++ show port
socket2hostport :: FilePath -> (String, Maybe Integer)
socket2hostport socket
| null p = (h, Nothing)
| otherwise = (h, readMaybe p)
where
(h, p) = separate (== '!') $ takeFileName socket
socket2lock :: FilePath -> FilePath
socket2lock socket = socket ++ lockExt
isLock :: FilePath -> Bool
isLock f = lockExt `isSuffixOf` f
lockExt :: String
lockExt = ".lock"

View file

@ -22,6 +22,7 @@ import qualified Annex.Queue
import qualified Git
import qualified Git.Command
import Annex.Content
import Annex.Ssh
import Command
type Params = [String]
@ -92,4 +93,5 @@ shutdown :: Annex Bool
shutdown = do
saveState
liftIO Git.Command.reap -- zombies from long-running git processes
sshCleanup -- ssh connection caching
return True

View file

@ -22,6 +22,7 @@ module Locations (
gitAnnexJournalLock,
gitAnnexIndex,
gitAnnexIndexLock,
gitAnnexSshDir,
isLinkToAnnex,
annexHashes,
hashDirMixed,
@ -142,6 +143,10 @@ gitAnnexIndex r = gitAnnexDir r </> "index"
gitAnnexIndexLock :: Git.Repo -> FilePath
gitAnnexIndexLock r = gitAnnexDir r </> "index.lck"
{- .git/annex/ssh/ is used for ssh connection caching -}
gitAnnexSshDir :: Git.Repo -> FilePath
gitAnnexSshDir r = addTrailingPathSeparator $ gitAnnexDir r </> "ssh"
{- Checks a symlink target to see if it appears to point to annexed content. -}
isLinkToAnnex :: FilePath -> Bool
isLinkToAnnex s = ("/.git/" ++ objectDir) `isInfixOf` s

View file

@ -7,25 +7,21 @@
module Remote.Helper.Ssh where
import Common
import Common.Annex
import qualified Git
import qualified Git.Url
import Types
import Config
import Annex.UUID
import Annex.Ssh
{- Generates parameters to ssh to a repository's host and run a command.
- Caller is responsible for doing any neccessary shellEscaping of the
- passed command. -}
sshToRepo :: Git.Repo -> [CommandParam] -> Annex [CommandParam]
sshToRepo repo sshcmd = do
s <- getConfig repo "ssh-options" ""
let sshoptions = map Param (words s)
let sshport = case Git.Url.port repo of
Nothing -> []
Just p -> [Param "-p", Param (show p)]
let sshhost = Param $ Git.Url.hostuser repo
return $ sshoptions ++ sshport ++ [sshhost] ++ sshcmd
opts <- map Param . words <$> getConfig repo "ssh-options" ""
params <- sshParams (Git.Url.hostuser repo, Git.Url.port repo)
return $ opts ++ params ++ sshcmd
{- Generates parameters to run a git-annex-shell command on a remote
- repository. -}

5
debian/changelog vendored
View file

@ -10,6 +10,11 @@ git-annex (3.20120117) UNRELEASED; urgency=low
previous release!
* fsck --from remote --fast: Avoids expensive file transfers, at the
expense of checking file size and/or contents.
* Ssh connection caching is now enabled automatically by git-annex.
Only one ssh connection is made to each host per git-annex run, which
can speed some things up a lot, as well as avoiding repeated password
prompts. Concurrent git-annex processes also share ssh connections.
Cached ssh connections are shut down when git-annex exits.
-- Joey Hess <joeyh@debian.org> Thu, 19 Jan 2012 15:12:03 -0400

View file

@ -57,16 +57,3 @@ b) From the desktop add the remote
So now you can work on the train, pop on the wifi at work upon arrival, and sync up with a `git pull && git annex get`.
An alternative solution may be to use direct tunnels over Openvpn.
## Optimising SSH
Running a `git annex get .`, at least in the version I have, creates a new SSH connection for every file transfer (maybe this should be a feature request?)
Lot's of new small files in an _annex_ cause lot's of connections to be made quickly: this is an relatively expensive overhead and is enough for connection limiting to start in my case. The process can be made much faster by using SSH's connection sharing capabilities. An SSH config like this should do it:
# Global Settings
ControlMaster auto
ControlPersist 30
ControlPath ~/.ssh/master-%r@%h:%p
This will create a master connection for sharing if one isn't present, maintain it for 30 seconds after closing down the connection (just-in-cases') and automatically use the master connection for subsequent connections. Wins all round!

View file

@ -15,7 +15,9 @@ Simple, when performing various git annex command over ssh, in particular a mult
>> pid. Then at shutdown, run `ssh -O exit` on each such socket.
>>
>> Complicated slightly by not doing this if the user has already set up
>> more broad ssh connection caching. --[[Joey]]
>> more broad ssh connection caching.
>>
>> [[done]]! --[[Joey]]
---