2017-11-28 16:50:30 +00:00
|
|
|
{- html detection
|
|
|
|
-
|
2021-07-05 17:02:40 +00:00
|
|
|
- Copyright 2017-2021 Joey Hess <id@joeyh.name>
|
2017-11-28 16:50:30 +00:00
|
|
|
-
|
|
|
|
- License: BSD-2-clause
|
|
|
|
-}
|
|
|
|
|
2019-11-21 19:38:06 +00:00
|
|
|
module Utility.HtmlDetect (
|
|
|
|
isHtml,
|
|
|
|
isHtmlBs,
|
2021-07-05 17:02:40 +00:00
|
|
|
isHtmlFile,
|
2019-11-21 19:38:06 +00:00
|
|
|
htmlPrefixLength,
|
|
|
|
) where
|
2017-11-28 16:50:30 +00:00
|
|
|
|
|
|
|
import Text.HTML.TagSoup
|
2021-07-05 17:02:40 +00:00
|
|
|
import System.IO
|
2017-11-28 16:50:30 +00:00
|
|
|
import Data.Char
|
2017-12-06 17:16:06 +00:00
|
|
|
import qualified Data.ByteString.Lazy as B
|
|
|
|
import qualified Data.ByteString.Lazy.Char8 as B8
|
2017-11-28 16:50:30 +00:00
|
|
|
|
2017-12-06 17:16:06 +00:00
|
|
|
-- | Detect if a String is a html document.
|
2017-11-28 16:50:30 +00:00
|
|
|
--
|
2017-12-06 17:16:06 +00:00
|
|
|
-- The document many not be valid, or may be truncated, and will
|
|
|
|
-- still be detected as html, as long as it starts with a
|
|
|
|
-- "<html>" or "<!DOCTYPE html>" tag.
|
2017-11-28 16:50:30 +00:00
|
|
|
--
|
|
|
|
-- Html fragments like "<p>this</p>" are not detected as being html,
|
|
|
|
-- although some browsers may chose to render them as html.
|
|
|
|
isHtml :: String -> Bool
|
2017-12-06 17:16:06 +00:00
|
|
|
isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength
|
2017-11-28 16:50:30 +00:00
|
|
|
where
|
|
|
|
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
|
|
|
|
evaluate (TagOpen "html" _:_) = True
|
|
|
|
-- Allow some leading whitespace before the tag.
|
|
|
|
evaluate (TagText t:rest)
|
|
|
|
| all isSpace t = evaluate rest
|
|
|
|
| otherwise = False
|
|
|
|
-- It would be pretty weird to have a html comment before the html
|
|
|
|
-- tag, but easy to allow for.
|
|
|
|
evaluate (TagComment _:rest) = evaluate rest
|
|
|
|
evaluate _ = False
|
2017-12-06 17:16:06 +00:00
|
|
|
|
|
|
|
-- | Detect if a ByteString is a html document.
|
|
|
|
isHtmlBs :: B.ByteString -> Bool
|
|
|
|
-- The encoding of the ByteString is not known, but isHtml only
|
|
|
|
-- looks for ascii strings.
|
|
|
|
isHtmlBs = isHtml . B8.unpack
|
|
|
|
|
2021-07-05 17:02:40 +00:00
|
|
|
-- | Check if the file is html.
|
|
|
|
--
|
2023-03-14 02:39:16 +00:00
|
|
|
-- It would be equivalent to use isHtml <$> readFile file,
|
2021-07-05 17:02:40 +00:00
|
|
|
-- but since that would not read all of the file, the handle
|
|
|
|
-- would remain open until it got garbage collected sometime later.
|
|
|
|
isHtmlFile :: FilePath -> IO Bool
|
|
|
|
isHtmlFile file = withFile file ReadMode $ \h ->
|
|
|
|
isHtmlBs <$> B.hGet h htmlPrefixLength
|
|
|
|
|
2017-12-06 17:16:06 +00:00
|
|
|
-- | How much of the beginning of a html document is needed to detect it.
|
|
|
|
-- (conservatively)
|
|
|
|
htmlPrefixLength :: Int
|
|
|
|
htmlPrefixLength = 8192
|