fix regression in addurl --file caused by youtube-dl support

Now youtubeDlCheck downloads the beginning of the url's content and
checks if it's html, only when it is does it pass it off the youtube-dl
to check if it supports it.

This means more work is done for urls that youtube-dl does support,
but is probably more efficient for other urls, since it only downloads
the first chunk of content, while youtube-dl probably downloads more.

As well as the reported bug, this also fixes behavior when an url
was added with youtube-dl, but the url content has now changed from
a html page to something else. Remote.Web.checkKey used to wrongly
succeed in that situation, since youtube-dl said sure it can download
that something else.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2017-12-06 13:16:06 -04:00
parent 6b5e55a154
commit c6e4bc0a22
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 71 additions and 18 deletions

View file

@ -1,6 +1,6 @@
{- Url downloading.
-
- Copyright 2011-2014 Joey Hess <id@joeyh.name>
- Copyright 2011-2017 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
@ -25,6 +25,7 @@ module Utility.Url (
assumeUrlExists,
download,
downloadQuiet,
downloadPartial,
parseURIRelaxed,
matchStatusCodeException,
matchHttpExceptionContent,
@ -39,8 +40,10 @@ import Network.HTTP.Types
import qualified Data.CaseInsensitive as CI
import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as B8
import qualified Data.ByteString.Lazy as L
import Control.Monad.Trans.Resource
import Network.HTTP.Conduit hiding (closeManager)
import Network.HTTP.Client (brRead, withResponse)
-- closeManager is needed with older versions of http-client,
-- but not new versions, which warn about using it. Urgh.
@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
- also returning its size and suggested filename if available. -}
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
getUrlInfo url uo = case parseURIRelaxed url of
Just u -> case parseurlconduit (show u) of
Just u -> case parseUrlConduit (show u) of
Just req -> catchJust
-- When http redirects to a protocol which
-- conduit does not support, it will throw
@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
_ | isftp && isJust len -> good
_ -> dne
#if MIN_VERSION_http_client(0,4,30)
parseurlconduit = parseUrlThrow
#else
parseurlconduit = parseUrl
#endif
-- Parse eg: attachment; filename="fname.ext"
-- per RFC 2616
contentDispositionFilename :: String -> Maybe FilePath
@ -321,11 +318,45 @@ download' quiet url file uo = do
| quiet = [Param s]
| otherwise = []
{- Downloads at least the specified number of bytes from an url. -}
downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
downloadPartial url uo n = case parseURIRelaxed url of
Nothing -> return Nothing
Just u -> go u `catchNonAsync` const (return Nothing)
where
go u = case parseUrlConduit (show u) of
Nothing -> return Nothing
Just req -> do
mgr <- newManager managerSettings
let req' = applyRequest uo req
ret <- withResponse req' mgr $ \resp ->
if responseStatus resp == ok200
then Just <$> brread n [] (responseBody resp)
else return Nothing
liftIO $ closeManager mgr
return ret
-- could use brReadSome here, needs newer http-client dependency
brread n' l rb
| n' <= 0 = return (L.fromChunks (reverse l))
| otherwise = do
bs <- brRead rb
if B.null bs
then return (L.fromChunks (reverse l))
else brread (n' - B.length bs) (bs:l) rb
{- Allows for spaces and other stuff in urls, properly escaping them. -}
parseURIRelaxed :: URLString -> Maybe URI
parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
parseURI $ escapeURIString isAllowedInURI s
#if MIN_VERSION_http_client(0,4,30)
parseUrlConduit :: URLString -> Maybe Request
parseUrlConduit = parseUrlThrow
#else
parseUrlConduit = parseUrl
#endif
{- Some characters like '[' are allowed in eg, the address of
- an uri, but cannot appear unescaped further along in the uri.
- This handles that, expensively, by successively escaping each character