From 57b4c5bdff0a2f160514a02b792c7c570651568f Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 28 Nov 2017 12:50:30 -0400 Subject: [PATCH] add Utility.HtmlDetect This will be used in youtube-dl integration, to tell when a html page has been downloaded by addurl, in which case it is worth running youtube-dl to see if it can extract media from it. tagsoup is an almost free dependency, because yesod depends on it. So, this only really adds a dep when git-annex is built without the webapp. I'd like this to as closely as possible match how browsers decide if a page is html or not. Unfortunately, that is fairly heuristic, in order to support malformed html. And, we don't want to falsely detect something as html just because it has something that looks like a html tag embedded somewhere in it. Probably any major video hosting site is going to be serving html documents that at least start with a <html> tag, so requiring that or a DOCTYPE should be good enough. This commit was sponsored by Jeff Goeke-Smith on Patreon. --- Utility/HtmlDetect.hs | 35 ++++++++++++++++++++ debian/control | 1 + doc/todo/switch_from_quvi_to_youtube-dl.mdwn | 8 +++++ git-annex.cabal | 2 ++ 4 files changed, 46 insertions(+) create mode 100644 Utility/HtmlDetect.hs diff --git a/Utility/HtmlDetect.hs b/Utility/HtmlDetect.hs new file mode 100644 index 0000000000..ca516e9607 --- /dev/null +++ b/Utility/HtmlDetect.hs @@ -0,0 +1,35 @@ +{- html detection + - + - Copyright 2017 Joey Hess + - + - License: BSD-2-clause + -} + +module Utility.HtmlDetect where + +import Text.HTML.TagSoup +import Data.Char + +-- | Detect if a string is a html document. +-- +-- The document may not be valid, and will still be detected as html, +-- as long as it starts with a "<html>" or "<!DOCTYPE html>" tag. +-- +-- Html fragments like "<p>this</p>" are not detected as being html, +-- although some browsers may choose to render them as html. +isHtml :: String -> Bool +isHtml = evaluate . canonicalizeTags . parseTags . truncate + where + -- We only care about the beginning of the file, + -- so although tagsoup parses lazily anyway, truncate it. + truncate = take 16384 + evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" + evaluate (TagOpen "html" _:_) = True + -- Allow some leading whitespace before the tag. + evaluate (TagText t:rest) + | all isSpace t = evaluate rest + | otherwise = False + -- It would be pretty weird to have a html comment before the html + -- tag, but easy to allow for. + evaluate (TagComment _:rest) = evaluate rest + evaluate _ = False diff --git a/debian/control b/debian/control index b34a790020..9bbe4bade0 100644 --- a/debian/control +++ b/debian/control @@ -25,6 +25,7 @@ Build-Depends: libghc-dlist-dev, libghc-uuid-dev, libghc-aeson-dev, + libghc-tagsoup-dev, libghc-unordered-containers-dev, libghc-ifelse-dev, libghc-bloomfilter-dev, diff --git a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn index cfdd8a8a62..82d61804ac 100644 --- a/doc/todo/switch_from_quvi_to_youtube-dl.mdwn +++ b/doc/todo/switch_from_quvi_to_youtube-dl.mdwn @@ -23,6 +23,14 @@ Both of those changes would need changes to user's workflows and cron jobs. git-annex could keep supporting quvi for some time, and warn when it uses quvi, to help with the transition. +> Alternatively, git-annex addurl could download the url first, and then +> check the file to see if it looks like html. If so, run youtube-dl (which +> unfortunately has to download it again) and see if it manages to rip +> media from it. This way, addurl of non-html files does not have extra +> overhead, and the redundant download is fairly small compared to ripping +> the media. Only the unusual case where addurl is being used on html that +> does not contain media becomes more expensive.
+ Another gotcha is playlists. youtube-dl downloads playlists automatically. But, git-annex needs to record an url that downloads a single file so that `git annex get` works right. So, playlists will need to be disabled when diff --git a/git-annex.cabal b/git-annex.cabal index 5d46caed32..780961d887 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -347,6 +347,7 @@ Executable git-annex persistent, persistent-template, aeson, + tagsoup, unordered-containers, feed (>= 0.3.9), regex-tdfa, @@ -1001,6 +1002,7 @@ Executable git-annex Utility.Glob Utility.Gpg Utility.Hash + Utility.HtmlDetect Utility.HumanNumber Utility.HumanTime Utility.InodeCache