better HTTP connection reuse
Enable HTTP connection reuse across multiple files, when git-annex uses
http-conduit. Before, a new Manager was created each time Utility.Url
used it. Now, a single Manager gets created the first time, so
connections are reused.

Doesn't help when external programs are used for url download, but does
speed up addurl --fast, fsck --from web, etc. Testing fsck --fast --from
web with 3 files, over high-latency satellite internet, it sped up from
19.37s to 14.96s.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
parent 2ec07bc29f
commit 9b98d3f630

12 changed files with 61 additions and 53 deletions
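The core of the change is a create-once, reuse-everywhere Manager. A minimal standalone sketch of that pattern (illustrative only — it uses an MVar where git-annex actually stores the Manager inside UrlOptions in Annex state, and tlsManagerSettings rather than git-annex's own managerSettings; getCachedManager is a hypothetical name):

{-# LANGUAGE LambdaCase #-}
module Main where

import Control.Concurrent.MVar
import Network.HTTP.Client (Manager, newManager)
import Network.HTTP.Client.TLS (tlsManagerSettings)

-- Build the Manager the first time it is asked for, cache it, and
-- return the cached one on every later call, so http-conduit can keep
-- connections alive between requests that share the Manager.
getCachedManager :: MVar (Maybe Manager) -> IO Manager
getCachedManager var = modifyMVar var $ \case
    Just mgr -> return (Just mgr, mgr)
    Nothing -> do
        mgr <- newManager tlsManagerSettings
        return (Just mgr, mgr)

main :: IO ()
main = do
    var <- newMVar Nothing
    _ <- getCachedManager var -- creates the Manager
    _ <- getCachedManager var -- reuses the cached Manager
    return ()

Every caller that shares the MVar gets the same Manager, which is what the getUrlOptions change below does for all http-conduit use in git-annex.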
@@ -947,8 +947,8 @@ downloadUrl k p urls file = meteredFile file (Just p) k $
 			( return Url.downloadQuiet
 			, return Url.download
 			)
-		Url.withUrlOptions $ \uo ->
-			anyM (\u -> a u file uo) urls
+		Url.withUrlOptions $ \uo ->
+			liftIO $ anyM (\u -> a u file uo) urls
 	go (Just basecmd) = anyM (downloadcmd basecmd) urls
 	downloadcmd basecmd url =
 		progressCommand "sh" [Param "-c", Param $ gencmd url basecmd]
Annex/Url.hs (13 changed lines)
@@ -9,6 +9,7 @@
 module Annex.Url (
 	module U,
 	withUrlOptions,
+	getUrlOptions,
 	getUserAgent,
 ) where
@@ -24,20 +25,24 @@ getUserAgent :: Annex (Maybe U.UserAgent)
 getUserAgent = Annex.getState $
 	Just . fromMaybe defaultUserAgent . Annex.useragent
 
-withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
-withUrlOptions a = Annex.getState Annex.urloptions >>= \case
-	Just uo -> a uo
+getUrlOptions :: Annex U.UrlOptions
+getUrlOptions = Annex.getState Annex.urloptions >>= \case
+	Just uo -> return uo
 	Nothing -> do
 		uo <- mk
 		Annex.changeState $ \s -> s
 			{ Annex.urloptions = Just uo }
-		a uo
+		return uo
   where
 	mk = mkUrlOptions
 		<$> getUserAgent
 		<*> headers
 		<*> options
+		<*> liftIO (U.newManager U.managerSettings)
 	headers = annexHttpHeadersCommand <$> Annex.getGitConfig >>= \case
 		Just cmd -> lines <$> liftIO (readProcess "sh" ["-c", cmd])
 		Nothing -> annexHttpHeaders <$> Annex.getGitConfig
 	options = map Param . annexWebOptions <$> Annex.getGitConfig
+
+withUrlOptions :: (U.UrlOptions -> Annex a) -> Annex a
+withUrlOptions a = a =<< getUrlOptions
@@ -95,7 +95,7 @@ newAssistantUrl repo = do
 - warp-tls listens to http, in order to show an error page, so this works.
 -}
 assistantListening :: URLString -> IO Bool
-assistantListening url = catchBoolIO $ exists url' def
+assistantListening url = catchBoolIO $ exists url' =<< defUrlOptions
   where
 	url' = case parseURI url of
 		Nothing -> url
@@ -316,7 +316,8 @@ usingDistribution :: IO Bool
 usingDistribution = isJust <$> getEnv "GIT_ANNEX_STANDLONE_ENV"
 
 downloadDistributionInfo :: Assistant (Maybe GitAnnexDistribution)
-downloadDistributionInfo = Url.withUrlOptions $ \uo -> do
+downloadDistributionInfo = do
+	uo <- liftAnnex Url.getUrlOptions
 	gpgcmd <- liftAnnex $ gpgCmd <$> Annex.getGitConfig
 	liftIO $ withTmpDir "git-annex.tmp" $ \tmpdir -> do
 		let infof = tmpdir </> "info"
@@ -189,7 +189,8 @@ escapeHeader :: String -> String
 escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
 
 getRepoInfo :: RemoteConfig -> Widget
-getRepoInfo c = Url.withUrlOptions $ \uo ->
+getRepoInfo c = do
+	uo <- liftAnnex Url.getUrlOptions
 	exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
 	[whamlet|
 <a href="#{url}">
@@ -12,6 +12,8 @@ git-annex (6.20180317) UNRELEASED; urgency=medium
     don't copy the data metadata from the old version of the file,
     instead use the mtime of the file.
   * Avoid running annex.http-headers-command more than once.
+  * Enable HTTP connection reuse across multiple files, when git-annex
+    uses http-conduit.
 
  -- Joey Hess <id@joeyh.name>  Mon, 19 Mar 2018 23:13:59 -0400
 
@@ -196,7 +196,8 @@ startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring
 	pathmax <- liftIO $ fileNameLengthLimit "."
 	urlinfo <- if relaxedOption (downloadOptions o)
 		then pure Url.assumeUrlExists
-		else Url.withUrlOptions (Url.getUrlInfo urlstring)
+		else Url.withUrlOptions $
+			liftIO . Url.getUrlInfo urlstring
 	file <- adjustFile o <$> case fileOption (downloadOptions o) of
 		Just f -> pure f
 		Nothing -> case Url.urlSuggestedFile urlinfo of
@@ -150,7 +150,7 @@ downloadFeed url
 	| Url.parseURIRelaxed url == Nothing = giveup "invalid feed url"
 	| otherwise = do
 		showOutput
-		Url.withUrlOptions $ \ou ->
+		Url.withUrlOptions $ \uo ->
 			liftIO $ withTmpFile "feed" $ \f h -> do
 				hClose h
 				ifM (Url.download url f uo)
@@ -167,7 +167,8 @@ performDownload opts cache todownload = case location todownload of
 	then do
 		urlinfo <- if relaxedOption (downloadOptions opts)
 			then pure Url.assumeUrlExists
-			else Url.withUrlOptions (Url.getUrlInfo url)
+			else Url.withUrlOptions $
+				liftIO . Url.getUrlInfo url
 		let dlopts = (downloadOptions opts)
 			-- force using the filename
 			-- chosen here
@@ -336,10 +336,11 @@ inAnnex rmt (State connpool duc) key
 	r = repo rmt
 	checkhttp = do
 		showChecking r
-		ifM (Url.withUrlOptions $ \uo -> anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
-			( return True
-			, giveup "not found"
-			)
+		ifM (Url.withUrlOptions $ \uo -> liftIO $
+			anyM (\u -> Url.checkBoth u (keySize key) uo) (keyUrls rmt key))
+			( return True
+			, giveup "not found"
+			)
 	checkremote =
 		let fallback = Ssh.inAnnex r key
 		in P2PHelper.checkpresent (Ssh.runProto rmt connpool (cantCheck rmt) fallback) key
Remote/S3.hs (12 changed lines)
@@ -1,6 +1,6 @@
 {- S3 remotes
  -
- - Copyright 2011-2017 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2018 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU GPL version 3 or higher.
  -}
@@ -22,7 +22,7 @@ import qualified Data.ByteString as S
 import qualified Data.Map as M
 import Data.Char
 import Network.Socket (HostName)
-import Network.HTTP.Conduit (Manager, newManager)
+import Network.HTTP.Conduit (Manager)
 import Network.HTTP.Client (responseStatus, responseBody, RequestBody(..))
 import Network.HTTP.Types
 import Control.Monad.Trans.Resource
@@ -51,7 +51,7 @@ import Utility.DataUnits
 import Utility.FileSystemEncoding
 import Annex.Content
 import Annex.Url (withUrlOptions)
-import Utility.Url (checkBoth, managerSettings, closeManager)
+import Utility.Url (checkBoth, UrlOptions(..))
 
 type BucketName = String
 
@@ -295,7 +295,7 @@ checkKey r info Nothing k = case getpublicurl info of
 		giveup "No S3 credentials configured"
 	Just geturl -> do
 		showChecking r
-		withUrlOptions $ checkBoth (geturl k) (keySize k)
+		withUrlOptions $ liftIO . checkBoth (geturl k) (keySize k)
 checkKey r info (Just h) k = do
 	showChecking r
 	checkKeyHelper info h (T.pack $ bucketObject info k)
@@ -500,8 +500,8 @@ withS3HandleMaybe c gc u a = do
 #if MIN_VERSION_aws(0,17,0)
 			Nothing
 #endif
-		bracketIO (newManager managerSettings) closeManager $ \mgr ->
-			a $ Just $ S3Handle mgr awscfg s3cfg
+		withUrlOptions $ \ou ->
+			a $ Just $ S3Handle (httpManager ou) awscfg s3cfg
 	Nothing -> a Nothing
   where
 	s3cfg = s3Configuration c
Utility/Url.hs

@@ -1,6 +1,6 @@
 {- Url downloading.
  -
- - Copyright 2011-2017 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2018 Joey Hess <id@joeyh.name>
  -
  - License: BSD-2-clause
  -}
@@ -11,11 +11,12 @@
 {-# LANGUAGE FlexibleContexts #-}
 
 module Utility.Url (
-	closeManager,
+	newManager,
 	managerSettings,
 	URLString,
 	UserAgent,
-	UrlOptions,
+	UrlOptions(..),
+	defUrlOptions,
 	mkUrlOptions,
 	check,
 	checkBoth,
@@ -42,18 +43,9 @@ import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
 import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
-import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Conduit
 import Network.HTTP.Client (brRead, withResponse)
 
--- closeManager is needed with older versions of http-client,
--- but not new versions, which warn about using it. Urgh.
-#if ! MIN_VERSION_http_client(0,4,18)
-import Network.HTTP.Client (closeManager)
-#else
-closeManager :: Manager -> IO ()
-closeManager _ = return ()
-#endif
-
 #if ! MIN_VERSION_http_client(0,5,0)
 responseTimeoutNone :: Maybe Int
 responseTimeoutNone = Nothing
@@ -78,15 +70,20 @@ data UrlOptions = UrlOptions
 	, reqHeaders :: Headers
 	, reqParams :: [CommandParam]
 	, applyRequest :: Request -> Request
+	, httpManager :: Manager
 	}
 
-instance Default UrlOptions
-  where
-	def = UrlOptions Nothing [] [] id
+defUrlOptions :: IO UrlOptions
+defUrlOptions = UrlOptions
+	<$> pure Nothing
+	<*> pure []
+	<*> pure []
+	<*> pure id
+	<*> newManager managerSettings
 
-mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> UrlOptions
-mkUrlOptions defuseragent reqheaders reqparams =
-	UrlOptions useragent reqheaders reqparams applyrequest
+mkUrlOptions :: Maybe UserAgent -> Headers -> [CommandParam] -> Manager -> UrlOptions
+mkUrlOptions defuseragent reqheaders reqparams manager =
+	UrlOptions useragent reqheaders reqparams applyrequest manager
   where
 	applyrequest = \r -> r { requestHeaders = requestHeaders r ++ addedheaders }
 	addedheaders = uaheader ++ otherheaders
@@ -118,7 +115,7 @@ checkBoth url expected_size uo = do
 	return (fst v && snd v)
 
 check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> getUrlInfo url
+check url expected_size uo = go <$> getUrlInfo url uo
   where
 	go (UrlInfo False _ _) = (False, False)
 	go (UrlInfo True Nothing _) = (True, True)
@@ -192,19 +189,16 @@ getUrlInfo url uo = case parseURIRelaxed url of
 		filter (\p -> fst p == h) . responseHeaders
 
 	existsconduit req = do
-		mgr <- newManager managerSettings
 		let req' = headRequest (applyRequest uo req)
-		ret <- runResourceT $ do
-			resp <- http req' mgr
-			-- forces processing the response before the
-			-- manager is closed
+		runResourceT $ do
+			resp <- http req' (httpManager uo)
+			-- forces processing the response while
+			-- within the runResourceT
 			liftIO $ if responseStatus resp == ok200
 				then found
 					(extractlen resp)
 					(extractfilename resp)
 				else dne
-		liftIO $ closeManager mgr
-		return ret
 
 	existscurl u = do
 		output <- catchDefaultIO "" $
@@ -327,14 +321,11 @@ downloadPartial url uo n = case parseURIRelaxed url of
 	go u = case parseUrlConduit (show u) of
 		Nothing -> return Nothing
 		Just req -> do
-			mgr <- newManager managerSettings
 			let req' = applyRequest uo req
-			ret <- withResponse req' mgr $ \resp ->
+			withResponse req' (httpManager uo) $ \resp ->
 				if responseStatus resp == ok200
 					then Just <$> brread n [] (responseBody resp)
 					else return Nothing
-			liftIO $ closeManager mgr
-			return ret
 
 	-- could use brReadSome here, needs newer http-client dependency
 	brread n' l rb
|
@ -12,6 +12,11 @@ and pipelining requests, so it makes mass url downloads a lot slower than
|
|||
if git-annex used http-conduit to do url downloads itself. [[users/yoh]]
|
||||
has requested http pipelining.
|
||||
|
||||
(git-annex was creating a new http manager each time it hit an url,
|
||||
except for in the S3 remote which reused a single manager. That's now been
|
||||
improved, so all http-conduit use in git-annex reuses a http manager, and
|
||||
so will do http pipelining.)
|
||||
|
||||
For file: ftp: and more unusual urls, http-conduit can't support them.
|
||||
git-annex does support those urls, and people rely on that, so it would
|
||||
still need to use wget or curl for those.
|
||||
|
|
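A minimal http-client sketch of the reuse the parenthetical above describes (the example.com URLs are placeholders, and git-annex routes this through UrlOptions rather than passing a Manager around directly):

import Network.HTTP.Client (httpLbs, newManager, parseRequest, responseStatus)
import Network.HTTP.Client.TLS (tlsManagerSettings)

main :: IO ()
main = do
    -- One Manager for both requests: the second hit on the same host
    -- can reuse the connection the first one opened.
    mgr <- newManager tlsManagerSettings
    req1 <- parseRequest "https://example.com/file1"
    req2 <- parseRequest "https://example.com/file2"
    r1 <- httpLbs req1 mgr
    r2 <- httpLbs req2 mgr
    print (responseStatus r1, responseStatus r2)

With a single shared Manager, the second request to the same host can pick up the already-open connection instead of paying TCP/TLS setup again, which is where the fsck --from web speedup comes from.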