fix regression in addurl --file caused by youtube-dl support
Now youtubeDlCheck downloads the beginning of the url's content and checks if it's html; only when it is does it pass the url off to youtube-dl to check if it supports it. This means more work is done for urls that youtube-dl does support, but is probably more efficient for other urls, since it only downloads the first chunk of content, while youtube-dl probably downloads more. As well as the reported bug, this also fixes behavior when a url was added with youtube-dl, but the url's content has since changed from a html page to something else. Remote.Web.checkKey used to wrongly succeed in that situation, since youtube-dl reported that it could download that something else. This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
parent
6b5e55a154
commit
c6e4bc0a22
4 changed files with 71 additions and 18 deletions
|
@ -9,20 +9,20 @@ module Utility.HtmlDetect where
|
|||
|
||||
import Text.HTML.TagSoup
|
||||
import Data.Char
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import qualified Data.ByteString.Lazy.Char8 as B8
|
||||
|
||||
-- | Detect if a string is a html document.
|
||||
-- | Detect if a String is a html document.
|
||||
--
|
||||
-- The document may not be valid, and will still be detected as html,
|
||||
-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag.
|
||||
-- The document may not be valid, or may be truncated, and will
|
||||
-- still be detected as html, as long as it starts with a
|
||||
-- "<html>" or "<!DOCTYPE html>" tag.
|
||||
--
|
||||
-- Html fragments like "<p>this</p>" are not detected as being html,
|
||||
-- although some browsers may choose to render them as html.
|
||||
isHtml :: String -> Bool
|
||||
isHtml = evaluate . canonicalizeTags . parseTags . shorten
|
||||
isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
|
||||
where
|
||||
-- We only care about the beginning of the file,
|
||||
-- so although tagsoup parses lazily anyway, truncate it.
|
||||
shorten = take 16384
|
||||
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
|
||||
evaluate (TagOpen "html" _:_) = True
|
||||
-- Allow some leading whitespace before the tag.
|
||||
|
@ -33,3 +33,14 @@ isHtml = evaluate . canonicalizeTags . parseTags . shorten
|
|||
-- tag, but easy to allow for.
|
||||
evaluate (TagComment _:rest) = evaluate rest
|
||||
evaluate _ = False
|
||||
|
||||
-- | Detect if a lazy ByteString is a html document, by unpacking it to a String and applying isHtml.
|
||||
isHtmlBs :: B.ByteString -> Bool
|
||||
-- The encoding of the ByteString is not known, but that does not matter
|
||||
-- here, because isHtml only looks for ascii strings.
|
||||
isHtmlBs = isHtml . B8.unpack
|
||||
|
||||
-- | How many leading characters of a document isHtml examines to detect html.
|
||||
-- (This is a conservative amount.)
|
||||
htmlPrefixLength :: Int
|
||||
htmlPrefixLength = 8192
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue