better HTTP connection reuse

Enable HTTP connection reuse across multiple files, when git-annex
uses http-conduit. Before, a new Manager was created each time
Utility.Url used one. Now a single Manager is created the first time
it's needed and cached, so connections are reused.
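
In miniature, the caching pattern is the following (a standalone sketch, not
the git-annex code itself; the real version lives in Annex.Url below and
stores the Manager inside UrlOptions in Annex state):

{-# LANGUAGE LambdaCase #-}
import Control.Concurrent.MVar
import Network.HTTP.Client (Manager, newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)

-- Holds the Manager once it has been created.
newtype ManagerCache = ManagerCache (MVar (Maybe Manager))

newManagerCache :: IO ManagerCache
newManagerCache = ManagerCache <$> newMVar Nothing

-- Create the Manager on first use only; later calls return the same
-- one, so all requests share its connection pool.
getManager :: ManagerCache -> IO Manager
getManager (ManagerCache v) = modifyMVar v $ \case
    Just mgr -> return (Just mgr, mgr)
    Nothing -> do
        mgr <- newManager tlsManagerSettings
        return (Just mgr, mgr)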

Doesn't help when external programs are used for url downloads,
but does speed up addurl --fast, fsck --from web, etc.

Testing fsck --fast --from web with 3 files, over high-latency
satellite internet, it sped up from 19.37s to 14.96s.

This commit was supported by the NSF-funded DataLad project.
Joey Hess, 2018-04-04 15:15:12 -04:00
commit 9b98d3f630 (parent 2ec07bc29f)
GPG key ID: DB12DB0FF05F8F38
12 changed files with 61 additions and 53 deletions


@@ -947,8 +947,8 @@ downloadUrl k p urls file = meteredFile file (Just p) k $
             ( return Url.downloadQuiet
             , return Url.download
             )
         Url.withUrlOptions $ \uo ->
-            anyM (\u -> a u file uo) urls
+            liftIO $ anyM (\u -> a u file uo) urls
     go (Just basecmd) = anyM (downloadcmd basecmd) urls
     downloadcmd basecmd url =
         progressCommand "sh" [Param "-c", Param $ gencmd url basecmd]


@@ -9,6 +9,7 @@
 module Annex.Url (
     module U,
     withUrlOptions,
+    getUrlOptions,
     getUserAgent,
 ) where
@@ -24,20 +25,24 @@ getUserAgent :: Annex (Maybe U.UserAgent)
 getUserAgent = Annex.getState $
     Just . fromMaybe defaultUserAgent . Annex.useragent

-withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
-withUrlOptions a = Annex.getState Annex.urloptions >>= \case
-    Just uo -> a uo
+getUrlOptions :: Annex U.UrlOptions
+getUrlOptions = Annex.getState Annex.urloptions >>= \case
+    Just uo -> return uo
     Nothing -> do
         uo <- mk
         Annex.changeState $ \s -> s
             { Annex.urloptions = Just uo }
-        a uo
+        return uo
   where
     mk = mkUrlOptions
         <$> getUserAgent
         <*> headers
         <*> options
+        <*> liftIO (U.newManager U.managerSettings)
     headers = annexHttpHeadersCommand <$> Annex.getGitConfig >>= \case
         Just cmd -> lines <$> liftIO (readProcess "sh" ["-c", cmd])
         Nothing -> annexHttpHeaders <$> Annex.getGitConfig
     options = map Param . annexWebOptions <$> Annex.getGitConfig
+
+withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
+withUrlOptions a = a =<< getUrlOptions
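
The net effect on callers: getUrlOptions hands back options that carry the
shared Manager, and withUrlOptions becomes a one-liner over it. Here is a
self-contained analogue of that shape, using StateT over IO as a stand-in
for the Annex monad (all names invented for illustration; run with
evalStateT demo Nothing):

{-# LANGUAGE LambdaCase #-}
import Control.Monad.IO.Class (liftIO)
import Control.Monad.Trans.State

-- Stand-in for U.UrlOptions; the real record now carries the Manager.
newtype Options = Options { optLabel :: String }

type AnnexLike = StateT (Maybe Options) IO

-- Mirrors getUrlOptions: build once, cache in state, reuse thereafter.
getOptions :: AnnexLike Options
getOptions = get >>= \case
    Just o -> return o
    Nothing -> do
        let o = Options "built once"  -- expensive setup happens only here
        put (Just o)
        return o

-- Mirrors the new withUrlOptions.
withOptions :: (Options -> AnnexLike a) -> AnnexLike a
withOptions a = a =<< getOptions

-- IO-level work inside the callback needs an explicit liftIO, as the
-- call sites in the hunks below show.
demo :: AnnexLike ()
demo = withOptions $ \o -> liftIO $ putStrLn (optLabel o)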


@@ -95,7 +95,7 @@ newAssistantUrl repo = do
 - warp-tls listens to http, in order to show an error page, so this works.
 -}
 assistantListening :: URLString -> IO Bool
-assistantListening url = catchBoolIO $ exists url' def
+assistantListening url = catchBoolIO $ exists url' =<< defUrlOptions
   where
     url' = case parseURI url of
         Nothing -> url


@@ -316,7 +316,8 @@ usingDistribution :: IO Bool
 usingDistribution = isJust <$> getEnv "GIT_ANNEX_STANDLONE_ENV"

 downloadDistributionInfo :: Assistant (Maybe GitAnnexDistribution)
-downloadDistributionInfo = Url.withUrlOptions $ \uo -> do
+downloadDistributionInfo = do
+    uo <- liftAnnex Url.getUrlOptions
     gpgcmd <- liftAnnex $ gpgCmd <$> Annex.getGitConfig
     liftIO $ withTmpDir "git-annex.tmp" $ \tmpdir -> do
         let infof = tmpdir </> "info"


@@ -189,7 +189,8 @@ escapeHeader :: String -> String
 escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')

 getRepoInfo :: RemoteConfig -> Widget
-getRepoInfo c = Url.withUrlOptions $ \uo -> do
+getRepoInfo c = do
+    uo <- liftAnnex Url.getUrlOptions
     exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
     [whamlet|
 <a href="#{url}">


@@ -12,6 +12,8 @@ git-annex (6.20180317) UNRELEASED; urgency=medium
     don't copy the data metadata from the old version of the file,
     instead use the mtime of the file.
   * Avoid running annex.http-headers-command more than once.
+  * Enable HTTP connection reuse across multiple files, when git-annex
+    uses http-conduit.

 -- Joey Hess <id@joeyh.name>  Mon, 19 Mar 2018 23:13:59 -0400


@@ -196,7 +196,8 @@ startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring
         pathmax <- liftIO $ fileNameLengthLimit "."
         urlinfo <- if relaxedOption (downloadOptions o)
             then pure Url.assumeUrlExists
-            else Url.withUrlOptions (Url.getUrlInfo urlstring)
+            else Url.withUrlOptions $
+                liftIO . Url.getUrlInfo urlstring
         file <- adjustFile o <$> case fileOption (downloadOptions o) of
             Just f -> pure f
             Nothing -> case Url.urlSuggestedFile urlinfo of


@@ -150,7 +150,7 @@ downloadFeed url
     | Url.parseURIRelaxed url == Nothing = giveup "invalid feed url"
     | otherwise = do
         showOutput
-        Url.withUrlOptions $ \ou ->
+        Url.withUrlOptions $ \uo ->
             liftIO $ withTmpFile "feed" $ \f h -> do
                 hClose h
                 ifM (Url.download url f uo)
@@ -167,7 +167,8 @@ performDownload opts cache todownload = case location todownload of
             then do
                 urlinfo <- if relaxedOption (downloadOptions opts)
                     then pure Url.assumeUrlExists
-                    else Url.withUrlOptions (Url.getUrlInfo url)
+                    else Url.withUrlOptions $
+                        liftIO . Url.getUrlInfo url
                 let dlopts = (downloadOptions opts)
                     -- force using the filename
                     -- chosen here


@@ -336,10 +336,11 @@ inAnnex rmt (State connpool duc) key
     r = repo rmt
     checkhttp = do
         showChecking r
-        ifM (Url.withUrlOptions $ \uo -> anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
+        ifM (Url.withUrlOptions $ \uo -> liftIO $
+            anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
             ( return True
             , giveup "not found"
             )
     checkremote =
         let fallback = Ssh.inAnnex r key
         in P2PHelper.checkpresent (Ssh.runProto rmt connpool (cantCheck rmt) fallback) key


@@ -1,6 +1,6 @@
 {- S3 remotes
 -
- - Copyright 2011-2017 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2018 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}
@@ -22,7 +22,7 @@ import qualified Data.ByteString as S
 import qualified Data.Map as M
 import Data.Char
 import Network.Socket (HostName)
-import Network.HTTP.Conduit (Manager, newManager)
+import Network.HTTP.Conduit (Manager)
 import Network.HTTP.Client (responseStatus, responseBody, RequestBody(..))
 import Network.HTTP.Types
 import Control.Monad.Trans.Resource
@@ -51,7 +51,7 @@ import Utility.DataUnits
 import Utility.FileSystemEncoding
 import Annex.Content
 import Annex.Url (withUrlOptions)
-import Utility.Url (checkBoth, managerSettings, closeManager)
+import Utility.Url (checkBoth, UrlOptions(..))

 type BucketName = String

@@ -295,7 +295,7 @@ checkKey r info Nothing k = case getpublicurl info of
         giveup "No S3 credentials configured"
     Just geturl -> do
         showChecking r
-        withUrlOptions $ checkBoth (geturl k) (keySize k)
+        withUrlOptions $ liftIO . checkBoth (geturl k) (keySize k)
 checkKey r info (Just h) k = do
     showChecking r
     checkKeyHelper info h (T.pack $ bucketObject info k)
@@ -500,8 +500,8 @@ withS3HandleMaybe c gc u a = do
 #if MIN_VERSION_aws(0,17,0)
                 Nothing
 #endif
-            bracketIO (newManager managerSettings) closeManager $ \mgr ->
-                a $ Just $ S3Handle mgr awscfg s3cfg
+            withUrlOptions $ \ou ->
+                a $ Just $ S3Handle (httpManager ou) awscfg s3cfg
         Nothing -> a Nothing
   where
     s3cfg = s3Configuration c
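
The S3 change swaps a bracketed create/close of a private Manager for
borrowing the shared one out of UrlOptions. A Manager needs no explicit
shutdown (idle connections are reaped automatically; closeManager is a
deprecated no-op in newer http-client), so a handle can simply hold a
reference. A minimal sketch of that borrow-don't-own shape, with invented
stand-ins for the aws package's types:

import Network.HTTP.Client (Manager)

-- Invented stand-ins for the aws package's configuration types.
data AwsConfig = AwsConfig
data S3Config = S3Config

-- The handle borrows the shared Manager rather than owning one, so
-- there is no per-handle bracket/closeManager lifecycle left to manage.
data Handle = Handle
    { hManager :: Manager  -- shared; never closed here
    , hAwsConfig :: AwsConfig
    , hS3Config :: S3Config
    }

mkHandle :: Manager -> AwsConfig -> S3Config -> Handle
mkHandle = Handle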


@@ -1,6 +1,6 @@
 {- Url downloading.
 -
- - Copyright 2011-2017 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2018 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@@ -11,11 +11,12 @@
 {-# LANGUAGE FlexibleContexts #-}

 module Utility.Url (
-    closeManager,
+    newManager,
     managerSettings,
     URLString,
     UserAgent,
-    UrlOptions,
+    UrlOptions(..),
+    defUrlOptions,
     mkUrlOptions,
     check,
     checkBoth,
@@ -42,18 +43,9 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
 import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
-import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Conduit
 import Network.HTTP.Client (brRead, withResponse)

--- closeManager is needed with older versions of http-client,
--- but not new versions, which warn about using it. Urgh.
-#if ! MIN_VERSION_http_client(0,4,18)
-import Network.HTTP.Client (closeManager)
-#else
-closeManager :: Manager -> IO ()
-closeManager _ = return ()
-#endif
-
 #if ! MIN_VERSION_http_client(0,5,0)
 responseTimeoutNone :: Maybe Int
 responseTimeoutNone = Nothing
@@ -78,15 +70,20 @@ data UrlOptions = UrlOptions
     , reqHeaders :: Headers
     , reqParams :: [CommandParam]
     , applyRequest :: Request -> Request
+    , httpManager :: Manager
     }

-instance Default UrlOptions
-  where
-    def = UrlOptions Nothing [] [] id
+defUrlOptions :: IO UrlOptions
+defUrlOptions = UrlOptions
+    <$> pure Nothing
+    <*> pure []
+    <*> pure []
+    <*> pure id
+    <*> newManager managerSettings

-mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> UrlOptions
-mkUrlOptions defuseragent reqheaders reqparams =
-    UrlOptions useragent reqheaders reqparams applyrequest
+mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> Manager -> UrlOptions
+mkUrlOptions defuseragent reqheaders reqparams manager =
+    UrlOptions useragent reqheaders reqparams applyrequest manager
   where
     applyrequest = \r -> r { requestHeaders = requestHeaders r ++ addedheaders }
     addedheaders = uaheader ++ otherheaders
@@ -118,7 +115,7 @@ checkBoth url expected_size uo = do
     return (fst v && snd v)

 check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> getUrlInfo url
+check url expected_size uo = go <$> getUrlInfo url uo
   where
     go (UrlInfo False _ _) = (False, False)
     go (UrlInfo True Nothing _) = (True, True)
@@ -192,19 +189,16 @@ getUrlInfo url uo = case parseURIRelaxed url of
         filter (\p -> fst p == h) . responseHeaders

     existsconduit req = do
-        mgr <- newManager managerSettings
         let req' = headRequest (applyRequest uo req)
-        ret <- runResourceT $ do
-            resp <- http req' mgr
-            -- forces processing the response before the
-            -- manager is closed
+        runResourceT $ do
+            resp <- http req' (httpManager uo)
+            -- forces processing the response while
+            -- within the runResourceT
             liftIO $ if responseStatus resp == ok200
                 then found
                     (extractlen resp)
                     (extractfilename resp)
                 else dne
-        liftIO $ closeManager mgr
-        return ret

     existscurl u = do
         output <- catchDefaultIO "" $
@@ -327,14 +321,11 @@ downloadPartial url uo n = case parseURIRelaxed url of
     go u = case parseUrlConduit (show u) of
         Nothing -> return Nothing
         Just req -> do
-            mgr <- newManager managerSettings
             let req' = applyRequest uo req
-            ret <- withResponse req' mgr $ \resp ->
+            withResponse req' (httpManager uo) $ \resp ->
                 if responseStatus resp == ok200
                     then Just <$> brread n [] (responseBody resp)
                     else return Nothing
-            liftIO $ closeManager mgr
-            return ret

     -- could use brReadSome here, needs newer http-client dependency
     brread n' l rb
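
For code inside the git-annex tree, the reworked module is used by building
UrlOptions once and handing them to every request, so all requests draw on
the one Manager. A sketch (assuming Utility.Url's exists :: URLString ->
UrlOptions -> IO Bool export, used the same way in the assistantListening
hunk above; the urls are illustrative):

import Utility.Url

main :: IO ()
main = do
    -- defUrlOptions creates the one Manager this program will use.
    uo <- defUrlOptions
    -- Both requests go through httpManager uo, so the second can
    -- reuse the TCP connection the first one opened.
    e1 <- exists "http://example.com/a" uo
    e2 <- exists "http://example.com/b" uo
    print (e1, e2)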


@@ -12,6 +12,11 @@ and pipelining requests, so it makes mass url downloads a lot slower than
 if git-annex used http-conduit to do url downloads itself. [[users/yoh]]
 has requested http pipelining.

+(git-annex was creating a new http manager each time it hit an url,
+except for in the S3 remote which reused a single manager. That's now been
+improved, so all http-conduit use in git-annex reuses a http manager, and
+so will do http pipelining.)
+
 For file: ftp: and more unusual urls, http-conduit can't support them.
 git-annex does support those urls, and people rely on that, so it would
 still need to use wget or curl for those.
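
What the added parenthetical describes is ordinary http-client connection
keep-alive: requests issued through one Manager share its connection pool
(strictly speaking reuse, not pipelining). A standalone demonstration using
http-conduit's underlying API; any two urls on the same host will do:

import Network.HTTP.Client
import Network.HTTP.Client.TLS (tlsManagerSettings)
import Network.HTTP.Types.Status (statusCode)

main :: IO ()
main = do
    -- One Manager for the whole program; its pool is what lets the
    -- second request skip TCP and TLS setup on the same host.
    mgr <- newManager tlsManagerSettings
    req1 <- parseRequest "https://example.com/"
    req2 <- parseRequest "https://example.com/other"
    r1 <- httpLbs req1 mgr
    r2 <- httpLbs req2 mgr
    print (statusCode (responseStatus r1), statusCode (responseStatus r2))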