// Copyright 2020-2021 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
import { RequestInit, Response } from 'node-fetch';
import type { AbortSignal as AbortSignalForNodeFetch } from 'abort-controller';
import {
IMAGE_GIF,
IMAGE_ICO,
IMAGE_JPEG,
IMAGE_PNG,
IMAGE_WEBP,
MIMEType,
stringToMIMEType,
} from '../types/MIME';
import * as log from '../logging/log';
const USER_AGENT = 'WhatsApp/2';
const MAX_REQUEST_COUNT_WITH_REDIRECTS = 20;
// Lifted from the `fetch` spec [here][0].
// [0]: https://fetch.spec.whatwg.org/#redirect-status
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
const MAX_CONTENT_TYPE_LENGTH_TO_PARSE = 100;
// Though we'll accept HTML of any Content-Length (including no specified length), we
// will only load some of the HTML. So we might start loading a 99 gigabyte HTML page
// but only parse the first 1000 kilobytes. However, if the Content-Length is less than
// this, we won't waste space.
const MAX_HTML_BYTES_TO_LOAD = 1000 * 1024;
// `
x` is 8 bytes. Nothing else (meta tags, etc) will even fit, so we can ignore
// it. This is mostly to protect us against empty response bodies.
const MIN_HTML_CONTENT_LENGTH = 8;
// Similar to the above. We don't want to show tiny images (even though the more likely
// case is that the Content-Length is 0).
const MIN_IMAGE_CONTENT_LENGTH = 8;
const MAX_IMAGE_CONTENT_LENGTH = 1024 * 1024;
const VALID_IMAGE_MIME_TYPES: Set = new Set([
IMAGE_GIF,
IMAGE_ICO,
IMAGE_JPEG,
IMAGE_PNG,
IMAGE_WEBP,
]);
// We want to discard unreasonable dates. Update this in ~950 years. (This may discard
// some reasonable dates, which is okay because it is only for link previews.)
const MIN_DATE = 0;
const MAX_DATE = new Date(3000, 0, 1).valueOf();
const emptyContentType = { type: null, charset: null };
export type FetchFn = (href: string, init: RequestInit) => Promise;
export type LinkPreviewMetadata = {
title: string;
description: null | string;
date: null | number;
imageHref: null | string;
};
export type LinkPreviewImage = {
data: Uint8Array;
contentType: MIMEType;
};
type ParsedContentType =
| { type: null; charset: null }
| { type: MIMEType; charset: null | string };
// This throws non-helpful errors because (1) it logs (2) it will be immediately caught.
async function fetchWithRedirects(
fetchFn: FetchFn,
href: string,
options: RequestInit
): Promise {
const urlsSeen = new Set();
let nextHrefToLoad = href;
for (let i = 0; i < MAX_REQUEST_COUNT_WITH_REDIRECTS; i += 1) {
if (urlsSeen.has(nextHrefToLoad)) {
log.warn('fetchWithRedirects: found a redirect loop');
throw new Error('redirect loop');
}
urlsSeen.add(nextHrefToLoad);
// This `await` is deliberately inside of a loop.
// eslint-disable-next-line no-await-in-loop
const response = await fetchFn(nextHrefToLoad, {
...options,
redirect: 'manual',
});
if (!REDIRECT_STATUSES.has(response.status)) {
return response;
}
const location = response.headers.get('location');
if (!location) {
log.warn(
'fetchWithRedirects: got a redirect status code but no Location header; bailing'
);
throw new Error('no location with redirect');
}
const newUrl = maybeParseUrl(location, nextHrefToLoad);
if (newUrl?.protocol !== 'https:') {
log.warn(
'fetchWithRedirects: got a redirect status code and an invalid Location header'
);
throw new Error('invalid location');
}
nextHrefToLoad = newUrl.href;
}
log.warn('fetchWithRedirects: too many redirects');
throw new Error('too many redirects');
}
function maybeParseUrl(href: string, base: string): null | URL {
let result: URL;
try {
result = new URL(href, base);
} catch (err) {
return null;
}
// We never need the hash
result.hash = '';
return result;
}
/**
* Parses a Content-Type header value. Refer to [RFC 2045][0] for details (though this is
* a simplified version for link previews.
* [0]: https://tools.ietf.org/html/rfc2045
*/
const parseContentType = (headerValue: string | null): ParsedContentType => {
if (!headerValue || headerValue.length > MAX_CONTENT_TYPE_LENGTH_TO_PARSE) {
return emptyContentType;
}
const [rawType, ...rawParameters] = headerValue
.toLowerCase()
.split(/;/g)
.map(part => part.trim())
.filter(Boolean);
if (!rawType) {
return emptyContentType;
}
let charset: null | string = null;
for (let i = 0; i < rawParameters.length; i += 1) {
const rawParameter = rawParameters[i];
const parsed = new URLSearchParams(rawParameter);
const parsedCharset = parsed.get('charset')?.trim();
if (parsedCharset) {
charset = parsedCharset;
break;
}
}
return {
type: stringToMIMEType(rawType),
charset,
};
};
const isInlineContentDisposition = (headerValue: string | null): boolean =>
!headerValue || headerValue.split(';', 1)[0] === 'inline';
const parseContentLength = (headerValue: string | null): number => {
// No need to parse gigantic Content-Lengths; only parse the first 10 digits.
if (typeof headerValue !== 'string' || !/^\d{1,10}$/g.test(headerValue)) {
return Infinity;
}
const result = parseInt(headerValue, 10);
return Number.isNaN(result) ? Infinity : result;
};
type ValidCharset =
| 'ascii'
| 'utf8'
| 'utf-8'
| 'utf16le'
| 'ucs2'
| 'ucs-2'
| 'base64'
| 'latin1'
| 'binary'
| 'hex';
const VALID_CHARSETS = new Set([
'ascii',
'utf8',
'utf-8',
'utf16le',
'ucs2',
'ucs-2',
'base64',
'latin1',
'binary',
'hex',
]);
const checkCharset = (charSet: string | null): charSet is ValidCharset => {
if (!charSet) {
return false;
}
return VALID_CHARSETS.has(charSet);
};
const emptyHtmlDocument = (): HTMLDocument =>
new DOMParser().parseFromString('', 'text/html');
// The charset behavior here follows the [W3 guidelines][0]. The priority is BOM, HTTP
// header, `http-equiv` meta tag, `charset` meta tag, and finally a UTF-8 fallback.
// (This fallback could, perhaps, be smarter based on user locale.)
// [0]: https://www.w3.org/International/questions/qa-html-encoding-declarations.en
const parseHtmlBytes = (
bytes: Readonly,
httpCharset: string | null
): HTMLDocument => {
const hasBom = bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
let isSureOfCharset: boolean;
let decoder: TextDecoder;
if (hasBom) {
decoder = new TextDecoder();
isSureOfCharset = true;
} else if (httpCharset) {
try {
decoder = new TextDecoder(httpCharset);
isSureOfCharset = true;
} catch (err) {
decoder = new TextDecoder();
isSureOfCharset = false;
}
} else {
decoder = new TextDecoder();
isSureOfCharset = false;
}
let decoded: string;
try {
decoded = decoder.decode(bytes);
} catch (err) {
decoded = '';
}
let document: HTMLDocument;
try {
document = new DOMParser().parseFromString(decoded, 'text/html');
} catch (err) {
document = emptyHtmlDocument();
}
if (!isSureOfCharset) {
const httpEquiv = document
.querySelector('meta[http-equiv="content-type"]')
?.getAttribute('content');
if (httpEquiv) {
const httpEquivCharset = parseContentType(httpEquiv).charset;
if (httpEquivCharset) {
return parseHtmlBytes(bytes, httpEquivCharset);
}
}
const metaCharset = document
.querySelector('meta[charset]')
?.getAttribute('charset');
if (metaCharset) {
return parseHtmlBytes(bytes, metaCharset);
}
}
return document;
};
const getHtmlDocument = async (
body: AsyncIterable,
contentLength: number,
httpCharset: string | null,
abortSignal: AbortSignal
): Promise => {
let result: HTMLDocument = emptyHtmlDocument();
const maxHtmlBytesToLoad = Math.min(contentLength, MAX_HTML_BYTES_TO_LOAD);
const buffer = new Uint8Array(maxHtmlBytesToLoad);
let bytesLoadedSoFar = 0;
try {
for await (let chunk of body) {
if (abortSignal.aborted) {
break;
}
// This check exists to satisfy TypeScript; chunk should always be a Buffer.
if (typeof chunk === 'string') {
if (!checkCharset(httpCharset)) {
throw new Error(`Invalid charset: ${httpCharset}`);
}
chunk = Buffer.from(chunk, httpCharset || 'utf8');
}
const truncatedChunk = chunk.slice(
0,
maxHtmlBytesToLoad - bytesLoadedSoFar
);
buffer.set(truncatedChunk, bytesLoadedSoFar);
bytesLoadedSoFar += truncatedChunk.byteLength;
result = parseHtmlBytes(buffer.slice(0, bytesLoadedSoFar), httpCharset);
const hasLoadedMaxBytes = bytesLoadedSoFar >= maxHtmlBytesToLoad;
if (hasLoadedMaxBytes) {
break;
}
}
} catch (err) {
log.warn(
'getHtmlDocument: error when reading body; continuing with what we got'
);
}
return result;
};
const getOpenGraphContent = (
document: HTMLDocument,
properties: ReadonlyArray
): string | null => {
for (let i = 0; i < properties.length; i += 1) {
const property = properties[i];
const content = document
.querySelector(`meta[property="${property}"]`)
?.getAttribute('content')
?.trim();
if (content) {
return content;
}
}
return null;
};
const getLinkHrefAttribute = (
document: HTMLDocument,
rels: ReadonlyArray
): string | null => {
for (let i = 0; i < rels.length; i += 1) {
const rel = rels[i];
const href = document
.querySelector(`link[rel="${rel}"]`)
?.getAttribute('href')
?.trim();
if (href) {
return href;
}
}
return null;
};
const parseMetadata = (
document: HTMLDocument,
href: string
): LinkPreviewMetadata | null => {
const title =
getOpenGraphContent(document, ['og:title']) || document.title.trim();
if (!title) {
log.warn("parseMetadata: HTML document doesn't have a title; bailing");
return null;
}
const description =
getOpenGraphContent(document, ['og:description']) ||
document
.querySelector('meta[name="description"]')
?.getAttribute('content')
?.trim() ||
null;
const rawImageHref =
getOpenGraphContent(document, ['og:image', 'og:image:url']) ||
getLinkHrefAttribute(document, [
'apple-touch-icon',
'apple-touch-icon-precomposed',
'shortcut icon',
'icon',
]);
const imageUrl = rawImageHref ? maybeParseUrl(rawImageHref, href) : null;
const imageHref = imageUrl ? imageUrl.href : null;
let date: number | null = null;
const rawDate = getOpenGraphContent(document, [
'og:published_time',
'article:published_time',
'og:modified_time',
'article:modified_time',
]);
if (rawDate) {
const parsed = Date.parse(rawDate);
if (parsed > MIN_DATE && parsed < MAX_DATE) {
date = parsed;
}
}
return {
title,
description,
imageHref,
date,
};
};
/**
* This attempts to fetch link preview metadata, returning `null` if it cannot be found
* for any reason.
*
* NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an
* insecure HTTP href. It also does not offer a timeout; that is up to the caller.
*
* At a high level, it:
*
* 1. Makes a GET request, following up to 20 redirects (`fetch`'s default).
* 2. Checks the response status code and headers to make sure it's a normal HTML
* response.
* 3. Streams up to `MAX_HTML_BYTES_TO_LOAD`, stopping when (1) it has loaded all of the
* HTML (2) loaded the maximum number of bytes (3) finished loading the ``.
* 4. Parses the resulting HTML with `DOMParser`.
* 5. Grabs the title, description, image URL, and date.
*/
export async function fetchLinkPreviewMetadata(
fetchFn: FetchFn,
href: string,
abortSignal: AbortSignal
): Promise {
let response: Response;
try {
response = await fetchWithRedirects(fetchFn, href, {
headers: {
Accept: 'text/html,application/xhtml+xml',
'User-Agent': USER_AGENT,
},
signal: abortSignal as AbortSignalForNodeFetch,
});
} catch (err) {
log.warn(
'fetchLinkPreviewMetadata: failed to fetch link preview HTML; bailing'
);
return null;
}
if (!response.ok) {
log.warn(
`fetchLinkPreviewMetadata: got a ${response.status} status code; bailing`
);
return null;
}
if (!response.body) {
log.warn('fetchLinkPreviewMetadata: no response body; bailing');
return null;
}
if (
!isInlineContentDisposition(response.headers.get('Content-Disposition'))
) {
log.warn(
'fetchLinkPreviewMetadata: Content-Disposition header is not inline; bailing'
);
return null;
}
if (abortSignal.aborted) {
return null;
}
const contentLength = parseContentLength(
response.headers.get('Content-Length')
);
if (contentLength < MIN_HTML_CONTENT_LENGTH) {
log.warn('fetchLinkPreviewMetadata: Content-Length is too short; bailing');
return null;
}
const contentType = parseContentType(response.headers.get('Content-Type'));
if (contentType.type !== 'text/html') {
log.warn('fetchLinkPreviewMetadata: Content-Type is not HTML; bailing');
return null;
}
const document = await getHtmlDocument(
response.body,
contentLength,
contentType.charset,
abortSignal
);
// [The Node docs about `ReadableStream.prototype[Symbol.asyncIterator]`][0] say that
// the stream will be destroyed if you `break` out of the loop, but I could not
// reproduce this. Also [`destroy` is a documented method][1] but it is not in the
// Node types, which is why we do this cast to `any`.
// [0]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_symbol_asynciterator
// [1]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_destroy_error
try {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(response.body as any).destroy();
} catch (err) {
// Ignored.
}
if (abortSignal.aborted) {
return null;
}
return parseMetadata(document, response.url);
}
/**
* This attempts to fetch an image, returning `null` if it fails for any reason.
*
* NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an
* insecure HTTP href. It also does not offer a timeout; that is up to the caller.
*/
export async function fetchLinkPreviewImage(
fetchFn: FetchFn,
href: string,
abortSignal: AbortSignal
): Promise {
let response: Response;
try {
response = await fetchWithRedirects(fetchFn, href, {
headers: {
'User-Agent': USER_AGENT,
},
size: MAX_IMAGE_CONTENT_LENGTH,
signal: abortSignal as AbortSignalForNodeFetch,
});
} catch (err) {
log.warn('fetchLinkPreviewImage: failed to fetch image; bailing');
return null;
}
if (abortSignal.aborted) {
return null;
}
if (!response.ok) {
log.warn(
`fetchLinkPreviewImage: got a ${response.status} status code; bailing`
);
return null;
}
const contentLength = parseContentLength(
response.headers.get('Content-Length')
);
if (contentLength < MIN_IMAGE_CONTENT_LENGTH) {
log.warn('fetchLinkPreviewImage: Content-Length is too short; bailing');
return null;
}
if (contentLength > MAX_IMAGE_CONTENT_LENGTH) {
log.warn(
'fetchLinkPreviewImage: Content-Length is too large or is unset; bailing'
);
return null;
}
const { type: contentType } = parseContentType(
response.headers.get('Content-Type')
);
if (!contentType || !VALID_IMAGE_MIME_TYPES.has(contentType)) {
log.warn('fetchLinkPreviewImage: Content-Type is not an image; bailing');
return null;
}
let data: Uint8Array;
try {
data = await response.buffer();
} catch (err) {
log.warn('fetchLinkPreviewImage: failed to read body; bailing');
return null;
}
if (abortSignal.aborted) {
return null;
}
return { data, contentType };
}