// Copyright 2020-2021 Signal Messenger, LLC // SPDX-License-Identifier: AGPL-3.0-only import { RequestInit, Response } from 'node-fetch'; import type { AbortSignal as AbortSignalForNodeFetch } from 'abort-controller'; import { IMAGE_GIF, IMAGE_ICO, IMAGE_JPEG, IMAGE_PNG, IMAGE_WEBP, MIMEType, stringToMIMEType, } from '../types/MIME'; import * as log from '../logging/log'; const USER_AGENT = 'WhatsApp/2'; const MAX_REQUEST_COUNT_WITH_REDIRECTS = 20; // Lifted from the `fetch` spec [here][0]. // [0]: https://fetch.spec.whatwg.org/#redirect-status const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]); const MAX_CONTENT_TYPE_LENGTH_TO_PARSE = 100; // Though we'll accept HTML of any Content-Length (including no specified length), we // will only load some of the HTML. So we might start loading a 99 gigabyte HTML page // but only parse the first 1000 kilobytes. However, if the Content-Length is less than // this, we won't waste space. const MAX_HTML_BYTES_TO_LOAD = 1000 * 1024; // `x` is 8 bytes. Nothing else (meta tags, etc) will even fit, so we can ignore // it. This is mostly to protect us against empty response bodies. const MIN_HTML_CONTENT_LENGTH = 8; // Similar to the above. We don't want to show tiny images (even though the more likely // case is that the Content-Length is 0). const MIN_IMAGE_CONTENT_LENGTH = 8; const MAX_IMAGE_CONTENT_LENGTH = 1024 * 1024; const VALID_IMAGE_MIME_TYPES: Set<MIMEType> = new Set([ IMAGE_GIF, IMAGE_ICO, IMAGE_JPEG, IMAGE_PNG, IMAGE_WEBP, ]); // We want to discard unreasonable dates. Update this in ~950 years. (This may discard // some reasonable dates, which is okay because it is only for link previews.) const MIN_DATE = 0; const MAX_DATE = new Date(3000, 0, 1).valueOf(); const emptyContentType = { type: null, charset: null }; export type FetchFn = (href: string, init: RequestInit) => Promise<Response>; export type LinkPreviewMetadata = { title: string; description: null | string; date: null | number; imageHref: null | string; }; export type LinkPreviewImage = { data: Uint8Array; contentType: MIMEType; }; type ParsedContentType = | { type: null; charset: null } | { type: MIMEType; charset: null | string }; // This throws non-helpful errors because (1) it logs (2) it will be immediately caught. async function fetchWithRedirects( fetchFn: FetchFn, href: string, options: RequestInit ): Promise<Response> { const urlsSeen = new Set<string>(); let nextHrefToLoad = href; for (let i = 0; i < MAX_REQUEST_COUNT_WITH_REDIRECTS; i += 1) { if (urlsSeen.has(nextHrefToLoad)) { log.warn('fetchWithRedirects: found a redirect loop'); throw new Error('redirect loop'); } urlsSeen.add(nextHrefToLoad); // This `await` is deliberately inside of a loop. // eslint-disable-next-line no-await-in-loop const response = await fetchFn(nextHrefToLoad, { ...options, redirect: 'manual', }); if (!REDIRECT_STATUSES.has(response.status)) { return response; } const location = response.headers.get('location'); if (!location) { log.warn( 'fetchWithRedirects: got a redirect status code but no Location header; bailing' ); throw new Error('no location with redirect'); } const newUrl = maybeParseUrl(location, nextHrefToLoad); if (newUrl?.protocol !== 'https:') { log.warn( 'fetchWithRedirects: got a redirect status code and an invalid Location header' ); throw new Error('invalid location'); } nextHrefToLoad = newUrl.href; } log.warn('fetchWithRedirects: too many redirects'); throw new Error('too many redirects'); } function maybeParseUrl(href: string, base: string): null | URL { let result: URL; try { result = new URL(href, base); } catch (err) { return null; } // We never need the hash result.hash = ''; return result; } /** * Parses a Content-Type header value. Refer to [RFC 2045][0] for details (though this is * a simplified version for link previews. * [0]: https://tools.ietf.org/html/rfc2045 */ const parseContentType = (headerValue: string | null): ParsedContentType => { if (!headerValue || headerValue.length > MAX_CONTENT_TYPE_LENGTH_TO_PARSE) { return emptyContentType; } const [rawType, ...rawParameters] = headerValue .toLowerCase() .split(/;/g) .map(part => part.trim()) .filter(Boolean); if (!rawType) { return emptyContentType; } let charset: null | string = null; for (let i = 0; i < rawParameters.length; i += 1) { const rawParameter = rawParameters[i]; const parsed = new URLSearchParams(rawParameter); const parsedCharset = parsed.get('charset')?.trim(); if (parsedCharset) { charset = parsedCharset; break; } } return { type: stringToMIMEType(rawType), charset, }; }; const isInlineContentDisposition = (headerValue: string | null): boolean => !headerValue || headerValue.split(';', 1)[0] === 'inline'; const parseContentLength = (headerValue: string | null): number => { // No need to parse gigantic Content-Lengths; only parse the first 10 digits. if (typeof headerValue !== 'string' || !/^\d{1,10}$/g.test(headerValue)) { return Infinity; } const result = parseInt(headerValue, 10); return Number.isNaN(result) ? Infinity : result; }; type ValidCharset = | 'ascii' | 'utf8' | 'utf-8' | 'utf16le' | 'ucs2' | 'ucs-2' | 'base64' | 'latin1' | 'binary' | 'hex'; const VALID_CHARSETS = new Set([ 'ascii', 'utf8', 'utf-8', 'utf16le', 'ucs2', 'ucs-2', 'base64', 'latin1', 'binary', 'hex', ]); const checkCharset = (charSet: string | null): charSet is ValidCharset => { if (!charSet) { return false; } return VALID_CHARSETS.has(charSet); }; const emptyHtmlDocument = (): HTMLDocument => new DOMParser().parseFromString('', 'text/html'); // The charset behavior here follows the [W3 guidelines][0]. The priority is BOM, HTTP // header, `http-equiv` meta tag, `charset` meta tag, and finally a UTF-8 fallback. // (This fallback could, perhaps, be smarter based on user locale.) // [0]: https://www.w3.org/International/questions/qa-html-encoding-declarations.en const parseHtmlBytes = ( bytes: Readonly<Uint8Array>, httpCharset: string | null ): HTMLDocument => { const hasBom = bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf; let isSureOfCharset: boolean; let decoder: TextDecoder; if (hasBom) { decoder = new TextDecoder(); isSureOfCharset = true; } else if (httpCharset) { try { decoder = new TextDecoder(httpCharset); isSureOfCharset = true; } catch (err) { decoder = new TextDecoder(); isSureOfCharset = false; } } else { decoder = new TextDecoder(); isSureOfCharset = false; } let decoded: string; try { decoded = decoder.decode(bytes); } catch (err) { decoded = ''; } let document: HTMLDocument; try { document = new DOMParser().parseFromString(decoded, 'text/html'); } catch (err) { document = emptyHtmlDocument(); } if (!isSureOfCharset) { const httpEquiv = document .querySelector('meta[http-equiv="content-type"]') ?.getAttribute('content'); if (httpEquiv) { const httpEquivCharset = parseContentType(httpEquiv).charset; if (httpEquivCharset) { return parseHtmlBytes(bytes, httpEquivCharset); } } const metaCharset = document .querySelector('meta[charset]') ?.getAttribute('charset'); if (metaCharset) { return parseHtmlBytes(bytes, metaCharset); } } return document; }; const getHtmlDocument = async ( body: AsyncIterable<string | Uint8Array>, contentLength: number, httpCharset: string | null, abortSignal: AbortSignal ): Promise<HTMLDocument> => { let result: HTMLDocument = emptyHtmlDocument(); const maxHtmlBytesToLoad = Math.min(contentLength, MAX_HTML_BYTES_TO_LOAD); const buffer = new Uint8Array(maxHtmlBytesToLoad); let bytesLoadedSoFar = 0; try { for await (let chunk of body) { if (abortSignal.aborted) { break; } // This check exists to satisfy TypeScript; chunk should always be a Buffer. if (typeof chunk === 'string') { if (!checkCharset(httpCharset)) { throw new Error(`Invalid charset: ${httpCharset}`); } chunk = Buffer.from(chunk, httpCharset || 'utf8'); } const truncatedChunk = chunk.slice( 0, maxHtmlBytesToLoad - bytesLoadedSoFar ); buffer.set(truncatedChunk, bytesLoadedSoFar); bytesLoadedSoFar += truncatedChunk.byteLength; result = parseHtmlBytes(buffer.slice(0, bytesLoadedSoFar), httpCharset); const hasLoadedMaxBytes = bytesLoadedSoFar >= maxHtmlBytesToLoad; if (hasLoadedMaxBytes) { break; } } } catch (err) { log.warn( 'getHtmlDocument: error when reading body; continuing with what we got' ); } return result; }; const getOpenGraphContent = ( document: HTMLDocument, properties: ReadonlyArray<string> ): string | null => { for (let i = 0; i < properties.length; i += 1) { const property = properties[i]; const content = document .querySelector(`meta[property="${property}"]`) ?.getAttribute('content') ?.trim(); if (content) { return content; } } return null; }; const getLinkHrefAttribute = ( document: HTMLDocument, rels: ReadonlyArray<string> ): string | null => { for (let i = 0; i < rels.length; i += 1) { const rel = rels[i]; const href = document .querySelector(`link[rel="${rel}"]`) ?.getAttribute('href') ?.trim(); if (href) { return href; } } return null; }; const parseMetadata = ( document: HTMLDocument, href: string ): LinkPreviewMetadata | null => { const title = getOpenGraphContent(document, ['og:title']) || document.title.trim(); if (!title) { log.warn("parseMetadata: HTML document doesn't have a title; bailing"); return null; } const description = getOpenGraphContent(document, ['og:description']) || document .querySelector('meta[name="description"]') ?.getAttribute('content') ?.trim() || null; const rawImageHref = getOpenGraphContent(document, ['og:image', 'og:image:url']) || getLinkHrefAttribute(document, [ 'apple-touch-icon', 'apple-touch-icon-precomposed', 'shortcut icon', 'icon', ]); const imageUrl = rawImageHref ? maybeParseUrl(rawImageHref, href) : null; const imageHref = imageUrl ? imageUrl.href : null; let date: number | null = null; const rawDate = getOpenGraphContent(document, [ 'og:published_time', 'article:published_time', 'og:modified_time', 'article:modified_time', ]); if (rawDate) { const parsed = Date.parse(rawDate); if (parsed > MIN_DATE && parsed < MAX_DATE) { date = parsed; } } return { title, description, imageHref, date, }; }; /** * This attempts to fetch link preview metadata, returning `null` if it cannot be found * for any reason. * * NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an * insecure HTTP href. It also does not offer a timeout; that is up to the caller. * * At a high level, it: * * 1. Makes a GET request, following up to 20 redirects (`fetch`'s default). * 2. Checks the response status code and headers to make sure it's a normal HTML * response. * 3. Streams up to `MAX_HTML_BYTES_TO_LOAD`, stopping when (1) it has loaded all of the * HTML (2) loaded the maximum number of bytes (3) finished loading the `<head>`. * 4. Parses the resulting HTML with `DOMParser`. * 5. Grabs the title, description, image URL, and date. */ export async function fetchLinkPreviewMetadata( fetchFn: FetchFn, href: string, abortSignal: AbortSignal ): Promise<null | LinkPreviewMetadata> { let response: Response; try { response = await fetchWithRedirects(fetchFn, href, { headers: { Accept: 'text/html,application/xhtml+xml', 'User-Agent': USER_AGENT, }, signal: abortSignal as AbortSignalForNodeFetch, }); } catch (err) { log.warn( 'fetchLinkPreviewMetadata: failed to fetch link preview HTML; bailing' ); return null; } if (!response.ok) { log.warn( `fetchLinkPreviewMetadata: got a ${response.status} status code; bailing` ); return null; } if (!response.body) { log.warn('fetchLinkPreviewMetadata: no response body; bailing'); return null; } if ( !isInlineContentDisposition(response.headers.get('Content-Disposition')) ) { log.warn( 'fetchLinkPreviewMetadata: Content-Disposition header is not inline; bailing' ); return null; } if (abortSignal.aborted) { return null; } const contentLength = parseContentLength( response.headers.get('Content-Length') ); if (contentLength < MIN_HTML_CONTENT_LENGTH) { log.warn('fetchLinkPreviewMetadata: Content-Length is too short; bailing'); return null; } const contentType = parseContentType(response.headers.get('Content-Type')); if (contentType.type !== 'text/html') { log.warn('fetchLinkPreviewMetadata: Content-Type is not HTML; bailing'); return null; } const document = await getHtmlDocument( response.body, contentLength, contentType.charset, abortSignal ); // [The Node docs about `ReadableStream.prototype[Symbol.asyncIterator]`][0] say that // the stream will be destroyed if you `break` out of the loop, but I could not // reproduce this. Also [`destroy` is a documented method][1] but it is not in the // Node types, which is why we do this cast to `any`. // [0]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_symbol_asynciterator // [1]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_destroy_error try { // eslint-disable-next-line @typescript-eslint/no-explicit-any (response.body as any).destroy(); } catch (err) { // Ignored. } if (abortSignal.aborted) { return null; } return parseMetadata(document, response.url); } /** * This attempts to fetch an image, returning `null` if it fails for any reason. * * NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an * insecure HTTP href. It also does not offer a timeout; that is up to the caller. */ export async function fetchLinkPreviewImage( fetchFn: FetchFn, href: string, abortSignal: AbortSignal ): Promise<null | LinkPreviewImage> { let response: Response; try { response = await fetchWithRedirects(fetchFn, href, { headers: { 'User-Agent': USER_AGENT, }, size: MAX_IMAGE_CONTENT_LENGTH, signal: abortSignal as AbortSignalForNodeFetch, }); } catch (err) { log.warn('fetchLinkPreviewImage: failed to fetch image; bailing'); return null; } if (abortSignal.aborted) { return null; } if (!response.ok) { log.warn( `fetchLinkPreviewImage: got a ${response.status} status code; bailing` ); return null; } const contentLength = parseContentLength( response.headers.get('Content-Length') ); if (contentLength < MIN_IMAGE_CONTENT_LENGTH) { log.warn('fetchLinkPreviewImage: Content-Length is too short; bailing'); return null; } if (contentLength > MAX_IMAGE_CONTENT_LENGTH) { log.warn( 'fetchLinkPreviewImage: Content-Length is too large or is unset; bailing' ); return null; } const { type: contentType } = parseContentType( response.headers.get('Content-Type') ); if (!contentType || !VALID_IMAGE_MIME_TYPES.has(contentType)) { log.warn('fetchLinkPreviewImage: Content-Type is not an image; bailing'); return null; } let data: Uint8Array; try { data = await response.buffer(); } catch (err) { log.warn('fetchLinkPreviewImage: failed to read body; bailing'); return null; } if (abortSignal.aborted) { return null; } return { data, contentType }; }