fix regression in addurl --file caused by youtube-dl support

Now youtubeDlCheck downloads the beginning of the url's content and checks if it's html, only when it is does it pass it off the youtube-dl to check if it supports it. This means more work is done for urls that youtube-dl does support, but is probably more efficient for other urls, since it only downloads the first chunk of content, while youtube-dl probably downloads more. As well as the reported bug, this also fixes behavior when an url was added with youtube-dl, but the url content has now changed from a html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl said sure it can download that something else. This commit was supported by the NSF-funded DataLad project.
2017-12-06 13:16:06 -04:00 · 2017-12-06 13:16:06 -04:00 · c6e4bc0a22
commit c6e4bc0a22
parent 6b5e55a154
4 changed files with 71 additions and 18 deletions
--- a/Annex/YoutubeDl.hs
+++ b/Annex/YoutubeDl.hs
@ -10,8 +10,10 @@ module Annex.YoutubeDl where
 import Annex.Common
 import qualified Annex
 import Annex.Content
 import Annex.Url
 import Utility.Url (URLString)
 import Utility.DiskFree
 import Utility.HtmlDetect
 import Logs.Transfer
 -- Runs youtube-dl in a work directory, to download a single media file
@ -108,10 +110,19 @@ youtubeDlSupported :: URLString -> Annex Bool
 youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
 -- Check if youtube-dl can find media in an url.
 --
 -- youtube-dl supports downloading urls that are not html pages,
 -- but we don't want to use it for such urls, since they can be downloaded
 -- without it. So, this first downloads part of the content and checks 
 -- if it's a html page; only then is youtube-dl used.
 youtubeDlCheck :: URLString -> Annex (Either String Bool)
 youtubeDlCheck url = catchMsgIO $ do
 	uo <- getUrlOptions
 	liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
 		Just bs | isHtmlBs bs -> do
 			opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
 			liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
 		_ -> return False
 -- Ask youtube-dl for the filename of media in an url.
 --
--- a/Utility/HtmlDetect.hs
+++ b/Utility/HtmlDetect.hs
@ -9,20 +9,20 @@ module Utility.HtmlDetect where
 import Text.HTML.TagSoup
 import Data.Char
 import qualified Data.ByteString.Lazy as B
 import qualified Data.ByteString.Lazy.Char8 as B8
-- | Detect if a string is a html document.
+-- | Detect if a String is a html document.
 --
-- The document many not be valid, and will still be detected as html,
+-- The document many not be valid, or may be truncated, and will
-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
+-- still be detected as html, as long as it starts with a
 -- "<html>" or "<!DOCTYPE html>" tag.
 --
 -- Html fragments like "<p>this</p>" are not detected as being html,
 -- although some browsers may chose to render them as html.
 isHtml :: String -> Bool
-isHtml = evaluate . canonicalizeTags . parseTags . shorten
+isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
  where
 	-- We only care about the beginning of the file,
 	-- so although tagsoup parses lazily anyway, truncate it.
 	shorten = take 16384
 	evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
 	evaluate (TagOpen "html" _:_) = True
 	-- Allow some leading whitespace before the tag.
@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
 	-- tag, but easy to allow for.
 	evaluate (TagComment _:rest) = evaluate rest
 	evaluate _ = False
 -- | Detect if a ByteString is a html document.
 isHtmlBs :: B.ByteString -> Bool
 -- The encoding of the ByteString is not known, but isHtml only
 -- looks for ascii strings.
 isHtmlBs = isHtml . B8.unpack
 -- | How much of the beginning of a html document is needed to detect it.
 -- (conservatively)
 htmlPrefixLength :: Int
 htmlPrefixLength = 8192
--- a/Utility/Url.hs
+++ b/Utility/Url.hs
@ -1,6 +1,6 @@
 {- Url downloading.
 -
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2017 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@ -25,6 +25,7 @@ module Utility.Url (
 	assumeUrlExists,
 	download,
 	downloadQuiet,
 	downloadPartial,
 	parseURIRelaxed,
 	matchStatusCodeException,
 	matchHttpExceptionContent,
@ -39,8 +40,10 @@ import Network.HTTP.Types
 import qualified Data.CaseInsensitive as CI
 import qualified Data.ByteString as B
 import qualified Data.ByteString.UTF8 as B8
 import qualified Data.ByteString.Lazy as L
 import Control.Monad.Trans.Resource
 import Network.HTTP.Conduit hiding (closeManager)
 import Network.HTTP.Client (brRead, withResponse)
 -- closeManager is needed with older versions of http-client,
 -- but not new versions, which warn about using it. Urgh.
@ -140,7 +143,7 @@ assumeUrlExists = UrlInfo True Nothing Nothing
 - also returning its size and suggested filename if available. -}
 getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
 getUrlInfo url uo = case parseURIRelaxed url of
-	Just u -> case parseurlconduit (show u) of
+	Just u -> case parseUrlConduit (show u) of
 		Just req -> catchJust
 			-- When http redirects to a protocol which 
 			-- conduit does not support, it will throw
@ -220,12 +223,6 @@ getUrlInfo url uo = case parseURIRelaxed url of
 			_ | isftp && isJust len -> good
 			_ -> dne
 #if MIN_VERSION_http_client(0,4,30)
 	parseurlconduit = parseUrlThrow
 #else
 	parseurlconduit = parseUrl
 #endif
 -- Parse eg: attachment; filename="fname.ext"
 -- per RFC 2616
 contentDispositionFilename :: String -> Maybe FilePath
@ -321,11 +318,45 @@ download' quiet url file uo = do
 		| quiet = [Param s]
 		| otherwise = []
 {- Downloads at least the specified number of bytes from an url. -}
 downloadPartial :: URLString -> UrlOptions -> Int -> IO (Maybe L.ByteString)
 downloadPartial url uo n = case parseURIRelaxed url of
 	Nothing -> return Nothing
 	Just u -> go u `catchNonAsync` const (return Nothing)
  where
 	go u = case parseUrlConduit (show u) of
 		Nothing -> return Nothing
 		Just req -> do
 			mgr <- newManager managerSettings
 			let req' = applyRequest uo req
 			ret <- withResponse req' mgr $ \resp ->
 				if responseStatus resp == ok200
 					then Just <$> brread n [] (responseBody resp)
 					else return Nothing
 			liftIO $ closeManager mgr
 			return ret
 	-- could use brReadSome here, needs newer http-client dependency
 	brread n' l rb
 		| n' <= 0 = return (L.fromChunks (reverse l))
 		| otherwise = do
 			bs <- brRead rb
 			if B.null bs
 				then return (L.fromChunks (reverse l))
 				else brread (n' - B.length bs) (bs:l) rb
 {- Allows for spaces and other stuff in urls, properly escaping them. -}
 parseURIRelaxed :: URLString -> Maybe URI
 parseURIRelaxed s = maybe (parseURIRelaxed' s) Just $
 	parseURI $ escapeURIString isAllowedInURI s
 #if MIN_VERSION_http_client(0,4,30)
 parseUrlConduit :: URLString -> Maybe Request
 parseUrlConduit = parseUrlThrow
 #else
 parseUrlConduit = parseUrl
 #endif
 {- Some characters like '[' are allowed in eg, the address of
 - an uri, but cannot appear unescaped further along in the uri.
 - This handles that, expensively, by successively escaping each character
--- a/doc/bugs/regression_-_yt58_prefix_for_34regular34_urls.mdwn
+++ b/doc/bugs/regression_-_yt58_prefix_for_34regular34_urls.mdwn
@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote
 [[!meta author=yoh]]
-
+> [[fixed|done]] --[[Joey]]
`@ -28,4 +28,4 @@ Some tests also failed related to our datalad archives git annex special remote`

	`[[!meta author=yoh]]`	`[[!meta author=yoh]]`

		`> [[fixed\|done]] --[[Joey]]`