fix regression in addurl --file caused by youtube-dl support

Now youtubeDlCheck downloads the beginning of the url's content and
checks if it's html, only when it is does it pass it off the youtube-dl
to check if it supports it.

This means more work is done for urls that youtube-dl does support,
but is probably more efficient for other urls, since it only downloads
the first chunk of content, while youtube-dl probably downloads more.

As well as the reported bug, this also fixes behavior when an url
was added with youtube-dl, but the url content has now changed from
a html page to something else. Remote.Web.checkKey used to wrongly
succeed in that situation, since youtube-dl said sure it can download
that something else.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2017-12-06 13:16:06 -04:00
parent 6b5e55a154
commit c6e4bc0a22
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 71 additions and 18 deletions

View file

@ -10,8 +10,10 @@ module Annex.YoutubeDl where
import Annex.Common import Annex.Common
import qualified Annex import qualified Annex
import Annex.Content import Annex.Content
import Annex.Url
import Utility.Url (URLString) import Utility.Url (URLString)
import Utility.DiskFree import Utility.DiskFree
import Utility.HtmlDetect
import Logs.Transfer import Logs.Transfer
-- Runs youtube-dl in a work directory, to download a single media file -- Runs youtube-dl in a work directory, to download a single media file
@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
-- Check if youtube-dl can find media in an url. -- Check if youtube-dl can find media in an url.
--
-- youtube-dl supports downloading urls that are not html pages,
-- but we don't want to use it for such urls, since they can be downloaded
-- without it. So, this first downloads part of the content and checks
-- if it's a html page; only then is youtube-dl used.
youtubeDlCheck :: URLString -> Annex (Either String Bool) youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = catchMsgIO $ do youtubeDlCheck url = catchMsgIO $ do
uo <- getUrlOptions
liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
Just bs | isHtmlBs bs -> do
opts <- youtubeDlOpts [ Param url, Param "--simulate" ] opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
_ -> return False
-- Ask youtube-dl for the filename of media in an url. -- Ask youtube-dl for the filename of media in an url.
-- --

View file

@ -9,20 +9,20 @@ module Utility.HtmlDetect where
import Text.HTML.TagSoup import Text.HTML.TagSoup
import Data.Char import Data.Char
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as B8
-- | Detect if a string is a html document. -- | Detect if a String is a html document.
-- --
-- The document many not be valid, and will still be detected as html, -- The document many not be valid, or may be truncated, and will
-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag. -- still be detected as html, as long as it starts with a
-- "<html>" or "<!DOCTYPE html>" tag.
-- --
-- Html fragments like "<p>this</p>" are not detected as being html, -- Html fragments like "<p>this</p>" are not detected as being html,
-- although some browsers may chose to render them as html. -- although some browsers may chose to render them as html.
isHtml :: String -> Bool isHtml :: String -> Bool
isHtml = evaluate . canonicalizeTags . parseTags . shorten isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
where where
-- We only care about the beginning of the file,
-- so although tagsoup parses lazily anyway, truncate it.
shorten = take 16384
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
evaluate (TagOpen "html" _:_) = True evaluate (TagOpen "html" _:_) = True
-- Allow some leading whitespace before the tag. -- Allow some leading whitespace before the tag.
@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
-- tag, but easy to allow for. -- tag, but easy to allow for.
evaluate (TagComment _:rest) = evaluate rest evaluate (TagComment _:rest) = evaluate rest
evaluate _ = False evaluate _ = False
-- | Detect if a ByteString is a html document.
isHtmlBs :: B.ByteString -> Bool
-- The encoding of the ByteString is not known, but isHtml only
-- looks for ascii strings.
isHtmlBs = isHtml . B8.unpack
-- | How much of the beginning of a html document is needed to detect it.
-- (conservatively)
htmlPrefixLength :: Int
htmlPrefixLength = 8192

View file

@ -1,6 +1,6 @@
{- Url downloading. {- Url downloading.
- -
- Copyright 2011-2014 Joey Hess <id@joeyh.name> - Copyright 2011-2017 Joey Hess <id@joeyh.name>
- -
- License: BSD-2-clause - License: BSD-2-clause
-} -}
@ -25,6 +25,7 @@ module Utility.Url (
assumeUrlExists, assumeUrlExists,
download, download,
downloadQuiet, downloadQuiet,
downloadPartial,
parseURIRelaxed, parseURIRelaxed,
matchStatusCodeException, matchStatusCodeException,
matchHttpExceptionContent, matchHttpExceptionContent,
@ -39,8 +40,10 @@ import Network.HTTP.Types
import qualified Data.CaseInsensitive as CI import qualified Data.CaseInsensitive as CI
import qualified Data.ByteString as B import qualified Data.ByteString as B
import qualified Data.ByteString.UTF8 as B8 import qualified Data.ByteString.UTF8 as B8
import qualified Data.ByteString.Lazy as L
import Control.Monad.Trans.Resource import Control.Monad.Trans.Resource
import Network.HTTP.Conduit hiding (closeManager) import Network.HTTP.Conduit hiding (closeManager)
import Network.HTTP.Client (brRead, withResponse)
-- closeManager is needed with older versions of http-client, -- closeManager is needed with older versions of http-client,
-- but not new versions, which warn about using it. Urgh. -- but not new versions, which warn about using it. Urgh.
@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
- also returning its size and suggested filename if available. -} - also returning its size and suggested filename if available. -}
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
getUrlInfo url uo = case parseURIRelaxed url of getUrlInfo url uo = case parseURIRelaxed url of
Just u -> case parseurlconduit (show u) of Just u -> case parseUrlConduit (show u) of
Just req -> catchJust Just req -> catchJust
-- When http redirects to a protocol which -- When http redirects to a protocol which
-- conduit does not support, it will throw -- conduit does not support, it will throw
@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
_ | isftp && isJust len -> good _ | isftp && isJust len -> good
_ -> dne _ -> dne
#if MIN_VERSION_http_client(0,4,30)
parseurlconduit = parseUrlThrow
#else
parseurlconduit = parseUrl
#endif
-- Parse eg: attachment; filename="fname.ext" -- Parse eg: attachment; filename="fname.ext"
-- per RFC 2616 -- per RFC 2616
contentDispositionFilename :: String -> Maybe FilePath contentDispositionFilename :: String -> Maybe FilePath
@ -321,11 +318,45 @@ download' quiet url file uo = do
| quiet = [Param s] | quiet = [Param s]
| otherwise = [] | otherwise = []
{- Downloads at least the specified number of bytes from an url. -}
downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
downloadPartial url uo n = case parseURIRelaxed url of
Nothing -> return Nothing
Just u -> go u `catchNonAsync` const (return Nothing)
where
go u = case parseUrlConduit (show u) of
Nothing -> return Nothing
Just req -> do
mgr <- newManager managerSettings
let req' = applyRequest uo req
ret <- withResponse req' mgr $ \resp ->
if responseStatus resp == ok200
then Just <$> brread n [] (responseBody resp)
else return Nothing
liftIO $ closeManager mgr
return ret
-- could use brReadSome here, needs newer http-client dependency
brread n' l rb
| n' <= 0 = return (L.fromChunks (reverse l))
| otherwise = do
bs <- brRead rb
if B.null bs
then return (L.fromChunks (reverse l))
else brread (n' - B.length bs) (bs:l) rb
{- Allows for spaces and other stuff in urls, properly escaping them. -} {- Allows for spaces and other stuff in urls, properly escaping them. -}
parseURIRelaxed :: URLString -> Maybe URI parseURIRelaxed :: URLString -> Maybe URI
parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $ parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
parseURI $ escapeURIString isAllowedInURI s parseURI $ escapeURIString isAllowedInURI s
#if MIN_VERSION_http_client(0,4,30)
parseUrlConduit :: URLString -> Maybe Request
parseUrlConduit = parseUrlThrow
#else
parseUrlConduit = parseUrl
#endif
{- Some characters like '[' are allowed in eg, the address of {- Some characters like '[' are allowed in eg, the address of
- an uri, but cannot appear unescaped further along in the uri. - an uri, but cannot appear unescaped further along in the uri.
- This handles that, expensively, by successively escaping each character - This handles that, expensively, by successively escaping each character

View file

@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote
[[!meta author=yoh]] [[!meta author=yoh]]
> [[fixed|done]] --[[Joey]]