fix regression in addurl --file caused by youtube-dl support
Now youtubeDlCheck downloads the beginning of the url's content and checks if it's html, only when it is does it pass it off the youtube-dl to check if it supports it. This means more work is done for urls that youtube-dl does support, but is probably more efficient for other urls, since it only downloads the first chunk of content, while youtube-dl probably downloads more. As well as the reported bug, this also fixes behavior when an url was added with youtube-dl, but the url content has now changed from a html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl said sure it can download that something else. This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
parent
6b5e55a154
commit
c6e4bc0a22
4 changed files with 71 additions and 18 deletions
|
@ -10,8 +10,10 @@ module Annex.YoutubeDl where
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
import qualified Annex
|
import qualified Annex
|
||||||
import Annex.Content
|
import Annex.Content
|
||||||
|
import Annex.Url
|
||||||
import Utility.Url (URLString)
|
import Utility.Url (URLString)
|
||||||
import Utility.DiskFree
|
import Utility.DiskFree
|
||||||
|
import Utility.HtmlDetect
|
||||||
import Logs.Transfer
|
import Logs.Transfer
|
||||||
|
|
||||||
-- Runs youtube-dl in a work directory, to download a single media file
|
-- Runs youtube-dl in a work directory, to download a single media file
|
||||||
|
@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
|
||||||
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
|
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
|
||||||
|
|
||||||
-- Check if youtube-dl can find media in an url.
|
-- Check if youtube-dl can find media in an url.
|
||||||
|
--
|
||||||
|
-- youtube-dl supports downloading urls that are not html pages,
|
||||||
|
-- but we don't want to use it for such urls, since they can be downloaded
|
||||||
|
-- without it. So, this first downloads part of the content and checks
|
||||||
|
-- if it's a html page; only then is youtube-dl used.
|
||||||
youtubeDlCheck :: URLString -> Annex (Either String Bool)
|
youtubeDlCheck :: URLString -> Annex (Either String Bool)
|
||||||
youtubeDlCheck url = catchMsgIO $ do
|
youtubeDlCheck url = catchMsgIO $ do
|
||||||
|
uo <- getUrlOptions
|
||||||
|
liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
|
||||||
|
Just bs | isHtmlBs bs -> do
|
||||||
opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
|
opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
|
||||||
liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
|
liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
|
||||||
|
_ -> return False
|
||||||
|
|
||||||
-- Ask youtube-dl for the filename of media in an url.
|
-- Ask youtube-dl for the filename of media in an url.
|
||||||
--
|
--
|
||||||
|
|
|
@ -9,20 +9,20 @@ module Utility.HtmlDetect where
|
||||||
|
|
||||||
import Text.HTML.TagSoup
|
import Text.HTML.TagSoup
|
||||||
import Data.Char
|
import Data.Char
|
||||||
|
import qualified Data.ByteString.Lazy as B
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as B8
|
||||||
|
|
||||||
-- | Detect if a string is a html document.
|
-- | Detect if a String is a html document.
|
||||||
--
|
--
|
||||||
-- The document many not be valid, and will still be detected as html,
|
-- The document many not be valid, or may be truncated, and will
|
||||||
-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
|
-- still be detected as html, as long as it starts with a
|
||||||
|
-- "<html>" or "<!DOCTYPE html>" tag.
|
||||||
--
|
--
|
||||||
-- Html fragments like "<p>this</p>" are not detected as being html,
|
-- Html fragments like "<p>this</p>" are not detected as being html,
|
||||||
-- although some browsers may chose to render them as html.
|
-- although some browsers may chose to render them as html.
|
||||||
isHtml :: String -> Bool
|
isHtml :: String -> Bool
|
||||||
isHtml = evaluate . canonicalizeTags . parseTags . shorten
|
isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
|
||||||
where
|
where
|
||||||
-- We only care about the beginning of the file,
|
|
||||||
-- so although tagsoup parses lazily anyway, truncate it.
|
|
||||||
shorten = take 16384
|
|
||||||
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
|
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
|
||||||
evaluate (TagOpen "html" _:_) = True
|
evaluate (TagOpen "html" _:_) = True
|
||||||
-- Allow some leading whitespace before the tag.
|
-- Allow some leading whitespace before the tag.
|
||||||
|
@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
|
||||||
-- tag, but easy to allow for.
|
-- tag, but easy to allow for.
|
||||||
evaluate (TagComment _:rest) = evaluate rest
|
evaluate (TagComment _:rest) = evaluate rest
|
||||||
evaluate _ = False
|
evaluate _ = False
|
||||||
|
|
||||||
|
-- | Detect if a ByteString is a html document.
|
||||||
|
isHtmlBs :: B.ByteString -> Bool
|
||||||
|
-- The encoding of the ByteString is not known, but isHtml only
|
||||||
|
-- looks for ascii strings.
|
||||||
|
isHtmlBs = isHtml . B8.unpack
|
||||||
|
|
||||||
|
-- | How much of the beginning of a html document is needed to detect it.
|
||||||
|
-- (conservatively)
|
||||||
|
htmlPrefixLength :: Int
|
||||||
|
htmlPrefixLength = 8192
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{- Url downloading.
|
{- Url downloading.
|
||||||
-
|
-
|
||||||
- Copyright 2011-2014 Joey Hess <id@joeyh.name>
|
- Copyright 2011-2017 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- License: BSD-2-clause
|
- License: BSD-2-clause
|
||||||
-}
|
-}
|
||||||
|
@ -25,6 +25,7 @@ module Utility.Url (
|
||||||
assumeUrlExists,
|
assumeUrlExists,
|
||||||
download,
|
download,
|
||||||
downloadQuiet,
|
downloadQuiet,
|
||||||
|
downloadPartial,
|
||||||
parseURIRelaxed,
|
parseURIRelaxed,
|
||||||
matchStatusCodeException,
|
matchStatusCodeException,
|
||||||
matchHttpExceptionContent,
|
matchHttpExceptionContent,
|
||||||
|
@ -39,8 +40,10 @@ import Network.HTTP.Types
|
||||||
import qualified Data.CaseInsensitive as CI
|
import qualified Data.CaseInsensitive as CI
|
||||||
import qualified Data.ByteString as B
|
import qualified Data.ByteString as B
|
||||||
import qualified Data.ByteString.UTF8 as B8
|
import qualified Data.ByteString.UTF8 as B8
|
||||||
|
import qualified Data.ByteString.Lazy as L
|
||||||
import Control.Monad.Trans.Resource
|
import Control.Monad.Trans.Resource
|
||||||
import Network.HTTP.Conduit hiding (closeManager)
|
import Network.HTTP.Conduit hiding (closeManager)
|
||||||
|
import Network.HTTP.Client (brRead, withResponse)
|
||||||
|
|
||||||
-- closeManager is needed with older versions of http-client,
|
-- closeManager is needed with older versions of http-client,
|
||||||
-- but not new versions, which warn about using it. Urgh.
|
-- but not new versions, which warn about using it. Urgh.
|
||||||
|
@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
|
||||||
- also returning its size and suggested filename if available. -}
|
- also returning its size and suggested filename if available. -}
|
||||||
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
|
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
|
||||||
getUrlInfo url uo = case parseURIRelaxed url of
|
getUrlInfo url uo = case parseURIRelaxed url of
|
||||||
Just u -> case parseurlconduit (show u) of
|
Just u -> case parseUrlConduit (show u) of
|
||||||
Just req -> catchJust
|
Just req -> catchJust
|
||||||
-- When http redirects to a protocol which
|
-- When http redirects to a protocol which
|
||||||
-- conduit does not support, it will throw
|
-- conduit does not support, it will throw
|
||||||
|
@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
|
||||||
_ | isftp && isJust len -> good
|
_ | isftp && isJust len -> good
|
||||||
_ -> dne
|
_ -> dne
|
||||||
|
|
||||||
#if MIN_VERSION_http_client(0,4,30)
|
|
||||||
parseurlconduit = parseUrlThrow
|
|
||||||
#else
|
|
||||||
parseurlconduit = parseUrl
|
|
||||||
#endif
|
|
||||||
|
|
||||||
-- Parse eg: attachment; filename="fname.ext"
|
-- Parse eg: attachment; filename="fname.ext"
|
||||||
-- per RFC 2616
|
-- per RFC 2616
|
||||||
contentDispositionFilename :: String -> Maybe FilePath
|
contentDispositionFilename :: String -> Maybe FilePath
|
||||||
|
@ -321,11 +318,45 @@ download' quiet url file uo = do
|
||||||
| quiet = [Param s]
|
| quiet = [Param s]
|
||||||
| otherwise = []
|
| otherwise = []
|
||||||
|
|
||||||
|
{- Downloads at least the specified number of bytes from an url. -}
|
||||||
|
downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
|
||||||
|
downloadPartial url uo n = case parseURIRelaxed url of
|
||||||
|
Nothing -> return Nothing
|
||||||
|
Just u -> go u `catchNonAsync` const (return Nothing)
|
||||||
|
where
|
||||||
|
go u = case parseUrlConduit (show u) of
|
||||||
|
Nothing -> return Nothing
|
||||||
|
Just req -> do
|
||||||
|
mgr <- newManager managerSettings
|
||||||
|
let req' = applyRequest uo req
|
||||||
|
ret <- withResponse req' mgr $ \resp ->
|
||||||
|
if responseStatus resp == ok200
|
||||||
|
then Just <$> brread n [] (responseBody resp)
|
||||||
|
else return Nothing
|
||||||
|
liftIO $ closeManager mgr
|
||||||
|
return ret
|
||||||
|
|
||||||
|
-- could use brReadSome here, needs newer http-client dependency
|
||||||
|
brread n' l rb
|
||||||
|
| n' <= 0 = return (L.fromChunks (reverse l))
|
||||||
|
| otherwise = do
|
||||||
|
bs <- brRead rb
|
||||||
|
if B.null bs
|
||||||
|
then return (L.fromChunks (reverse l))
|
||||||
|
else brread (n' - B.length bs) (bs:l) rb
|
||||||
|
|
||||||
{- Allows for spaces and other stuff in urls, properly escaping them. -}
|
{- Allows for spaces and other stuff in urls, properly escaping them. -}
|
||||||
parseURIRelaxed :: URLString -> Maybe URI
|
parseURIRelaxed :: URLString -> Maybe URI
|
||||||
parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
|
parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
|
||||||
parseURI $ escapeURIString isAllowedInURI s
|
parseURI $ escapeURIString isAllowedInURI s
|
||||||
|
|
||||||
|
#if MIN_VERSION_http_client(0,4,30)
|
||||||
|
parseUrlConduit :: URLString -> Maybe Request
|
||||||
|
parseUrlConduit = parseUrlThrow
|
||||||
|
#else
|
||||||
|
parseUrlConduit = parseUrl
|
||||||
|
#endif
|
||||||
|
|
||||||
{- Some characters like '[' are allowed in eg, the address of
|
{- Some characters like '[' are allowed in eg, the address of
|
||||||
- an uri, but cannot appear unescaped further along in the uri.
|
- an uri, but cannot appear unescaped further along in the uri.
|
||||||
- This handles that, expensively, by successively escaping each character
|
- This handles that, expensively, by successively escaping each character
|
||||||
|
|
|
@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote
|
||||||
|
|
||||||
[[!meta author=yoh]]
|
[[!meta author=yoh]]
|
||||||
|
|
||||||
|
> [[fixed|done]] --[[Joey]]
|
||||||
|
|
Loading…
Reference in a new issue