fix regression in addurl --file caused by youtube-dl support
Now youtubeDlCheck downloads the beginning of the url's content and checks whether it is html; only when it is does it pass the url on to youtube-dl to check whether youtube-dl supports it. This means more work is done for urls that youtube-dl does support, but it is probably more efficient for other urls, since only the first chunk of content is downloaded, while youtube-dl probably downloads more.

As well as the reported bug, this also fixes behavior when an url was added with youtube-dl, but the url's content has since changed from an html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl reported that it could download that something else.

This commit was supported by the NSF-funded DataLad project.
parent 6b5e55a154
commit c6e4bc0a22
4 changed files with 71 additions and 18 deletions
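The check described in the commit message can be sketched as follows. This is a minimal illustration, not the actual git-annex code: youtubeDlCheckSketch, looksLikeHtml, supportedByYoutubeDl, and the 8192-byte limit are assumed names and values; only downloadPartial (added by this commit) and the Utility.Url types are taken from the diff below.

import Data.Char (toLower)
import Data.List (isInfixOf)
import qualified Data.ByteString.Lazy.Char8 as L8

import Utility.Url (URLString, UrlOptions, downloadPartial)

-- Hypothetical stand-in for the existing youtube-dl support probe.
supportedByYoutubeDl :: URLString -> IO Bool
supportedByYoutubeDl _ = return False

-- Download only the first chunk of the url's content; only if it looks
-- like an html page is the url handed to youtube-dl at all.
youtubeDlCheckSketch :: URLString -> UrlOptions -> IO Bool
youtubeDlCheckSketch url uo = do
    mchunk <- downloadPartial url uo 8192
    case mchunk of
        Nothing -> return False
        Just chunk
            | looksLikeHtml chunk -> supportedByYoutubeDl url
            | otherwise -> return False
  where
    -- Crude html sniff on the downloaded prefix.
    looksLikeHtml chunk =
        let s = map toLower (L8.unpack chunk)
        in "<html" `isInfixOf` s || "<!doctype html" `isInfixOf` s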
@@ -1,6 +1,6 @@
 {- Url downloading.
  -
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2017 Joey Hess <id@joeyh.name>
  -
  - License: BSD-2-clause
  -}
@@ -25,6 +25,7 @@ module Utility.Url (
 	assumeUrlExists,
 	download,
 	downloadQuiet,
+	downloadPartial,
 	parseURIRelaxed,
 	matchStatusCodeException,
 	matchHttpExceptionContent,
@@ -39,8 +40,10 @@ import Network.HTTP.Types
 import qualified Data.CaseInsensitive as CI
 import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
+import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
 import Network.HTTP.Conduit hiding (closeManager)
+import Network.HTTP.Client (brRead, withResponse)
 
 -- closeManager is needed with older versions of http-client,
 -- but not new versions, which warn about using it. Urgh.
@@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
  - also returning its size and suggested filename if available. -}
 getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
 getUrlInfo url uo = case parseURIRelaxed url of
-	Just u -> case parseurlconduit (show u) of
+	Just u -> case parseUrlConduit (show u) of
 		Just req -> catchJust
 			-- When http redirects to a protocol which
 			-- conduit does not support, it will throw
@@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
 		_ | isftp && isJust len -> good
 		_ -> dne
 
-#if MIN_VERSION_http_client(0,4,30)
-	parseurlconduit = parseUrlThrow
-#else
-	parseurlconduit = parseUrl
-#endif
-
 -- Parse eg: attachment; filename="fname.ext"
 -- per RFC 2616
 contentDispositionFilename :: String -> Maybe FilePath
@@ -321,11 +318,45 @@ download' quiet url file uo = do
 		| quiet = [Param s]
 		| otherwise = []
 
+{- Downloads at least the specified number of bytes from an url. -}
+downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
+downloadPartial url uo n = case parseURIRelaxed url of
+	Nothing -> return Nothing
+	Just u -> go u `catchNonAsync` const (return Nothing)
+  where
+	go u = case parseUrlConduit (show u) of
+		Nothing -> return Nothing
+		Just req -> do
+			mgr <- newManager managerSettings
+			let req' = applyRequest uo req
+			ret <- withResponse req' mgr $ \resp ->
+				if responseStatus resp == ok200
+					then Just <$> brread n [] (responseBody resp)
+					else return Nothing
+			liftIO $ closeManager mgr
+			return ret
+
+	-- could use brReadSome here, needs newer http-client dependency
+	brread n' l rb
+		| n' <= 0 = return (L.fromChunks (reverse l))
+		| otherwise = do
+			bs <- brRead rb
+			if B.null bs
+				then return (L.fromChunks (reverse l))
+				else brread (n' - B.length bs) (bs:l) rb
+
 {- Allows for spaces and other stuff in urls, properly escaping them. -}
 parseURIRelaxed :: URLString -> Maybe URI
 parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
 	parseURI $ escapeURIString isAllowedInURI s
 
+#if MIN_VERSION_http_client(0,4,30)
+parseUrlConduit :: URLString -> Maybe Request
+parseUrlConduit = parseUrlThrow
+#else
+parseUrlConduit = parseUrl
+#endif
+
 {- Some characters like '[' are allowed in eg, the address of
  - an uri, but cannot appear unescaped further along in the uri.
  - This handles that, expensively, by successively escaping each character
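A brief usage note on the new downloadPartial: because brread always keeps whole chunks, the result can overshoot the requested size (and will be shorter if the content itself ends sooner), so callers wanting an exact prefix should truncate. A hedged sketch, where urlPrefix is an illustrative name and not part of this commit:

import qualified Data.ByteString.Lazy as L
import Utility.Url (URLString, UrlOptions, downloadPartial)

-- Fetch an exact-length prefix of an url's content, or Nothing on failure.
-- downloadPartial only guarantees "at least n bytes", so truncate the result.
urlPrefix :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
urlPrefix url uo n =
    fmap (L.take (fromIntegral n)) <$> downloadPartial url uo n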