this
" are not detected as being html, -- although some browsers may chose to render them as html. isHtml :: String -> Bool isHtml = evaluate . canonicalizeTags . parseTags . take htmlPrefixLength where evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html" evaluate (TagOpen "html" _:_) = True -- Allow some leading whitespace before the tag. evaluate (TagText t:rest) | all isSpace t = evaluate rest | otherwise = False -- It would be pretty weird to have a html comment before the html -- tag, but easy to allow for. evaluate (TagComment _:rest) = evaluate rest evaluate _ = False -- | Detect if a ByteString is a html document. isHtmlBs :: B.ByteString -> Bool -- The encoding of the ByteString is not known, but isHtml only -- looks for ascii strings. isHtmlBs = isHtml . B8.unpack -- | Check if the file is html. -- -- It would be equivilant to use isHtml <$> readFile file, -- but since that would not read all of the file, the handle -- would remain open until it got garbage collected sometime later. isHtmlFile :: FilePath -> IO Bool isHtmlFile file = withFile file ReadMode $ \h -> isHtmlBs <$> B.hGet h htmlPrefixLength -- | How much of the beginning of a html document is needed to detect it. -- (conservatively) htmlPrefixLength :: Int htmlPrefixLength = 8192