better HTTP connection reuse

Enable HTTP connection reuse across multiple files, when git-annex
uses http-conduit. Before, a new Manager was created each time
Utility.Url used http-conduit. Now, a single Manager gets created the first time,
so connections are reused.
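
To illustrate the pattern with a minimal standalone sketch (not
git-annex code; it uses http-client-tls's stock tlsManagerSettings
where git-annex uses its own managerSettings):

    import Network.HTTP.Client (newManager, parseRequest, httpLbs, responseStatus)
    import Network.HTTP.Client.TLS (tlsManagerSettings)

    main :: IO ()
    main = do
        -- One Manager, created once. It pools connections, so requests
        -- to the same host reuse an open TCP connection.
        mgr <- newManager tlsManagerSettings
        mapM_ (fetch mgr) ["http://example.com/a", "http://example.com/b"]
      where
        fetch mgr url = do
            req <- parseRequest url
            resp <- httpLbs req mgr
            print (responseStatus resp)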

Doesn't help when external programs are used for url download,
but does speed up addurl --fast, fsck --from web, etc.

Testing fsck --fast --from web with 3 files, over high-latency
satellite internet, it sped up from 19.37s to 14.96s.

This commit was supported by the NSF-funded DataLad project.
Joey Hess 2018-04-04 15:15:12 -04:00
parent 2ec07bc29f
commit 9b98d3f630
12 changed files with 61 additions and 53 deletions

@@ -947,8 +947,8 @@ downloadUrl k p urls file = meteredFile file (Just p) k $
 			( return Url.downloadQuiet
 			, return Url.download
 			)
-		Url.withUrlOptions $ \uo ->
-			anyM (\u -> a u file uo) urls
+		Url.withUrlOptions $ \uo ->
+			liftIO $ anyM (\u -> a u file uo) urls
 	go (Just basecmd) = anyM (downloadcmd basecmd) urls
 	downloadcmd basecmd url =
 		progressCommand "sh" [Param "-c", Param $ gencmd url basecmd]

@@ -9,6 +9,7 @@
 module Annex.Url (
 	module U,
 	withUrlOptions,
+	getUrlOptions,
 	getUserAgent,
 ) where
@@ -24,20 +25,24 @@ getUserAgent :: Annex (Maybe U.UserAgent)
 getUserAgent = Annex.getState $
 	Just . fromMaybe defaultUserAgent . Annex.useragent
 
-withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
-withUrlOptions a = Annex.getState Annex.urloptions >>= \case
-	Just uo -> a uo
+getUrlOptions :: Annex U.UrlOptions
+getUrlOptions = Annex.getState Annex.urloptions >>= \case
+	Just uo -> return uo
 	Nothing -> do
 		uo <- mk
 		Annex.changeState $ \s -> s
 			{ Annex.urloptions = Just uo }
-		a uo
+		return uo
   where
 	mk = mkUrlOptions
 		<$> getUserAgent
 		<*> headers
 		<*> options
+		<*> liftIO (U.newManager U.managerSettings)
 	headers = annexHttpHeadersCommand <$> Annex.getGitConfig >>= \case
 		Just cmd -> lines <$> liftIO (readProcess "sh" ["-c", cmd])
 		Nothing -> annexHttpHeaders <$> Annex.getGitConfig
 	options = map Param . annexWebOptions <$> Annex.getGitConfig
+
+withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
+withUrlOptions a = a =<< getUrlOptions
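
The change above is the core of the commit: getUrlOptions now builds
the UrlOptions (including its Manager) on first use and caches it in
Annex state, and withUrlOptions just reuses the cached value. Outside a
state monad the same create-once idiom can be sketched with an IORef
(hypothetical names, not git-annex code):

    import Data.IORef
    import Network.HTTP.Client (Manager, newManager)
    import Network.HTTP.Client.TLS (tlsManagerSettings)

    -- Build the Manager on first use; hand back the cached one after.
    -- (Not thread-safe as written; a concurrent program would want an
    -- MVar or atomicModifyIORef instead.)
    getManager :: IORef (Maybe Manager) -> IO Manager
    getManager ref = readIORef ref >>= \cached -> case cached of
        Just mgr -> return mgr
        Nothing -> do
            mgr <- newManager tlsManagerSettings
            writeIORef ref (Just mgr)
            return mgr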

@@ -95,7 +95,7 @@ newAssistantUrl repo = do
 - warp-tls listens to http, in order to show an error page, so this works.
 -}
 assistantListening :: URLString -> IO Bool
-assistantListening url = catchBoolIO $ exists url' def
+assistantListening url = catchBoolIO $ exists url' =<< defUrlOptions
   where
 	url' = case parseURI url of
 		Nothing -> url

@@ -316,7 +316,8 @@ usingDistribution :: IO Bool
 usingDistribution = isJust <$> getEnv "GIT_ANNEX_STANDLONE_ENV"
 
 downloadDistributionInfo :: Assistant (Maybe GitAnnexDistribution)
-downloadDistributionInfo = Url.withUrlOptions $ \uo -> do
+downloadDistributionInfo = do
+	uo <- liftAnnex Url.getUrlOptions
 	gpgcmd <- liftAnnex $ gpgCmd <$> Annex.getGitConfig
 	liftIO $ withTmpDir "git-annex.tmp" $ \tmpdir -> do
 		let infof = tmpdir </> "info"

@@ -189,7 +189,8 @@ escapeHeader :: String -> String
 escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
 
 getRepoInfo :: RemoteConfig -> Widget
-getRepoInfo c = Url.withUrlOptions $ \uo ->
+getRepoInfo c = do
+	uo <- liftAnnex Url.getUrlOptions
 	exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
 	[whamlet|
 <a href="#{url}">

@@ -12,6 +12,8 @@ git-annex (6.20180317) UNRELEASED; urgency=medium
     don't copy the data metadata from the old version of the file,
     instead use the mtime of the file.
   * Avoid running annex.http-headers-command more than once.
+  * Enable HTTP connection reuse across multiple files, when git-annex
+    uses http-conduit.
 
  -- Joey Hess <id@joeyh.name>  Mon, 19 Mar 2018 23:13:59 -0400

@@ -196,7 +196,8 @@ startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring
 	pathmax <- liftIO $ fileNameLengthLimit "."
 	urlinfo <- if relaxedOption (downloadOptions o)
 		then pure Url.assumeUrlExists
-		else Url.withUrlOptions (Url.getUrlInfo urlstring)
+		else Url.withUrlOptions $
+			liftIO . Url.getUrlInfo urlstring
 	file <- adjustFile o <$> case fileOption (downloadOptions o) of
 		Just f -> pure f
 		Nothing -> case Url.urlSuggestedFile urlinfo of

@@ -150,7 +150,7 @@ downloadFeed url
 	| Url.parseURIRelaxed url == Nothing = giveup "invalid feed url"
 	| otherwise = do
 		showOutput
-		Url.withUrlOptions $ \ou ->
+		Url.withUrlOptions $ \uo ->
 			liftIO $ withTmpFile "feed" $ \f h -> do
 				hClose h
 				ifM (Url.download url f uo)
@@ -167,7 +167,8 @@ performDownload opts cache todownload = case location todownload of
 	then do
 		urlinfo <- if relaxedOption (downloadOptions opts)
 			then pure Url.assumeUrlExists
-			else Url.withUrlOptions (Url.getUrlInfo url)
+			else Url.withUrlOptions $
+				liftIO . Url.getUrlInfo url
 		let dlopts = (downloadOptions opts)
 			-- force using the filename
 			-- chosen here

@@ -336,10 +336,11 @@ inAnnex rmt (State connpool duc) key
 	r = repo rmt
 	checkhttp = do
 		showChecking r
-		ifM (Url.withUrlOptions $ \uo -> anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
-			( return True
-			, giveup "not found"
-			)
+		ifM (Url.withUrlOptions $ \uo -> liftIO $
+			anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
+			( return True
+			, giveup "not found"
+			)
 	checkremote =
 		let fallback = Ssh.inAnnex r key
 		in P2PHelper.checkpresent (Ssh.runProto rmt connpool (cantCheck rmt) fallback) key

@@ -1,6 +1,6 @@
 {- S3 remotes
 -
-- Copyright 2011-2017 Joey Hess <id@joeyh.name>
+- Copyright 2011-2018 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}
@@ -22,7 +22,7 @@ import qualified Data.ByteString as S
 import qualified Data.Map as M
 import Data.Char
 import Network.Socket (HostName)
-import Network.HTTP.Conduit (Manager, newManager)
+import Network.HTTP.Conduit (Manager)
 import Network.HTTP.Client (responseStatus, responseBody, RequestBody(..))
 import Network.HTTP.Types
 import Control.Monad.Trans.Resource
@@ -51,7 +51,7 @@ import Utility.DataUnits
 import Utility.FileSystemEncoding
 import Annex.Content
 import Annex.Url (withUrlOptions)
-import Utility.Url (checkBoth, managerSettings, closeManager)
+import Utility.Url (checkBoth, UrlOptions(..))
 
 type BucketName = String
@@ -295,7 +295,7 @@ checkKey r info Nothing k = case getpublicurl info of
 		giveup "No S3 credentials configured"
 	Just geturl -> do
 		showChecking r
-		withUrlOptions $ checkBoth (geturl k) (keySize k)
+		withUrlOptions $ liftIO . checkBoth (geturl k) (keySize k)
 checkKey r info (Just h) k = do
 	showChecking r
 	checkKeyHelper info h (T.pack $ bucketObject info k)
@@ -500,8 +500,8 @@ withS3HandleMaybe c gc u a = do
 #if MIN_VERSION_aws(0,17,0)
 			Nothing
 #endif
-		bracketIO (newManager managerSettings) closeManager $ \mgr ->
-			a $ Just $ S3Handle mgr awscfg s3cfg
+		withUrlOptions $ \ou ->
+			a $ Just $ S3Handle (httpManager ou) awscfg s3cfg
 	Nothing -> a Nothing
   where
 	s3cfg = s3Configuration c

@@ -1,6 +1,6 @@
 {- Url downloading.
 -
-- Copyright 2011-2017 Joey Hess <id@joeyh.name>
+- Copyright 2011-2018 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@@ -11,11 +11,12 @@
 {-# LANGUAGE FlexibleContexts #-}
 
 module Utility.Url (
-	closeManager,
+	newManager,
 	managerSettings,
 	URLString,
 	UserAgent,
-	UrlOptions,
+	UrlOptions(..),
+	defUrlOptions,
 	mkUrlOptions,
 	check,
 	checkBoth,
@@ -42,18 +43,9 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
 import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
-import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Conduit
 import Network.HTTP.Client (brRead, withResponse)
 
--- closeManager is needed with older versions of http-client,
--- but not new versions, which warn about using it. Urgh.
-#if ! MIN_VERSION_http_client(0,4,18)
-import Network.HTTP.Client (closeManager)
-#else
-closeManager :: Manager -> IO ()
-closeManager _ = return ()
-#endif
-
 #if ! MIN_VERSION_http_client(0,5,0)
 responseTimeoutNone :: Maybe Int
 responseTimeoutNone = Nothing
@@ -78,15 +70,20 @@ data UrlOptions = UrlOptions
 	, reqHeaders :: Headers
 	, reqParams :: [CommandParam]
 	, applyRequest :: Request -> Request
+	, httpManager :: Manager
 	}
 
-instance Default UrlOptions
-  where
-	def = UrlOptions Nothing [] [] id
+defUrlOptions :: IO UrlOptions
+defUrlOptions = UrlOptions
+	<$> pure Nothing
+	<*> pure []
+	<*> pure []
+	<*> pure id
+	<*> newManager managerSettings
 
-mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> UrlOptions
-mkUrlOptions defuseragent reqheaders reqparams =
-	UrlOptions useragent reqheaders reqparams applyrequest
+mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> Manager -> UrlOptions
+mkUrlOptions defuseragent reqheaders reqparams manager =
+	UrlOptions useragent reqheaders reqparams applyrequest manager
   where
 	applyrequest = \r -> r { requestHeaders = requestHeaders r ++ addedheaders }
 	addedheaders = uaheader ++ otherheaders
@@ -118,7 +115,7 @@ checkBoth url expected_size uo = do
 	return (fst v && snd v)
 
 check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> getUrlInfo url
+check url expected_size uo = go <$> getUrlInfo url uo
   where
 	go (UrlInfo False _ _) = (False, False)
 	go (UrlInfo True Nothing _) = (True, True)
@@ -192,19 +189,16 @@ getUrlInfo url uo = case parseURIRelaxed url of
 		filter (\p -> fst p == h) . responseHeaders
 
 	existsconduit req = do
-		mgr <- newManager managerSettings
 		let req' = headRequest (applyRequest uo req)
-		ret <- runResourceT $ do
-			resp <- http req' mgr
-			-- forces processing the response before the
-			-- manager is closed
+		runResourceT $ do
+			resp <- http req' (httpManager uo)
+			-- forces processing the response while
+			-- within the runResourceT
 			liftIO $ if responseStatus resp == ok200
 				then found
 					(extractlen resp)
 					(extractfilename resp)
 				else dne
-		liftIO $ closeManager mgr
-		return ret
 
 	existscurl u = do
 		output <- catchDefaultIO "" $
@@ -327,14 +321,11 @@ downloadPartial url uo n = case parseURIRelaxed url of
 	go u = case parseUrlConduit (show u) of
 		Nothing -> return Nothing
 		Just req -> do
-			mgr <- newManager managerSettings
 			let req' = applyRequest uo req
-			ret <- withResponse req' mgr $ \resp ->
+			withResponse req' (httpManager uo) $ \resp ->
 				if responseStatus resp == ok200
 					then Just <$> brread n [] (responseBody resp)
 					else return Nothing
-			liftIO $ closeManager mgr
-			return ret
 
 	-- could use brReadSome here, needs newer http-client dependency
 	brread n' l rb
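
Callers that are not in the Annex monad can instead use defUrlOptions,
as assistantListening does above; it builds a fresh Manager, so the
returned UrlOptions should itself be reused across requests. A small
sketch of standalone use, assuming exists keeps the signature implied
by this diff (URLString -> UrlOptions -> IO Bool):

    import Utility.Url (defUrlOptions, exists)

    checkReachable :: [String] -> IO [Bool]
    checkReachable urls = do
        uo <- defUrlOptions  -- one Manager shared by all the checks below
        mapM (\u -> exists u uo) urls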

@@ -12,6 +12,11 @@ and pipelining requests, so it makes mass url downloads a lot slower than
 if git-annex used http-conduit to do url downloads itself. [[users/yoh]]
 has requested http pipelining.
 
+(git-annex was creating a new http manager each time it hit an url,
+except for in the S3 remote which reused a single manager. That's now been
+improved, so all http-conduit use in git-annex reuses a http manager, and
+so will do http pipelining.)
+
 For file: ftp: and more unusual urls, http-conduit can't support them.
 git-annex does support those urls, and people rely on that, so it would
 still need to use wget or curl for those.
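
The fallback described here comes down to dispatching on the url
scheme; schematically (a hypothetical helper, not git-annex's actual
logic):

    import Data.List (isPrefixOf)

    -- http and https can go through the shared http-conduit Manager;
    -- anything else (file:, ftp:, ...) falls back to wget or curl.
    useConduit :: String -> Bool
    useConduit url = any (`isPrefixOf` url) ["http://", "https://"]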