594 lines
16 KiB
TypeScript
594 lines
16 KiB
TypeScript
// Copyright 2020-2021 Signal Messenger, LLC
|
|
// SPDX-License-Identifier: AGPL-3.0-only
|
|
|
|
import { RequestInit, Response } from 'node-fetch';
|
|
import type { AbortSignal as AbortSignalForNodeFetch } from 'abort-controller';
|
|
|
|
import {
|
|
IMAGE_GIF,
|
|
IMAGE_ICO,
|
|
IMAGE_JPEG,
|
|
IMAGE_PNG,
|
|
IMAGE_WEBP,
|
|
MIMEType,
|
|
stringToMIMEType,
|
|
} from '../types/MIME';
|
|
import * as log from '../logging/log';
|
|
|
|
const USER_AGENT = 'WhatsApp/2';
|
|
|
|
const MAX_REQUEST_COUNT_WITH_REDIRECTS = 20;
|
|
|
|
// Lifted from the `fetch` spec [here][0].
|
|
// [0]: https://fetch.spec.whatwg.org/#redirect-status
|
|
const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);
|
|
|
|
const MAX_CONTENT_TYPE_LENGTH_TO_PARSE = 100;
|
|
|
|
// Though we'll accept HTML of any Content-Length (including no specified length), we
|
|
// will only load some of the HTML. So we might start loading a 99 gigabyte HTML page
|
|
// but only parse the first 500 kilobytes. However, if the Content-Length is less than
|
|
// this, we won't waste space.
|
|
const MAX_HTML_BYTES_TO_LOAD = 500 * 1024;
|
|
|
|
// `<title>x` is 8 bytes. Nothing else (meta tags, etc) will even fit, so we can ignore
|
|
// it. This is mostly to protect us against empty response bodies.
|
|
const MIN_HTML_CONTENT_LENGTH = 8;
|
|
|
|
// Similar to the above. We don't want to show tiny images (even though the more likely
|
|
// case is that the Content-Length is 0).
|
|
const MIN_IMAGE_CONTENT_LENGTH = 8;
|
|
const MAX_IMAGE_CONTENT_LENGTH = 1024 * 1024;
|
|
const VALID_IMAGE_MIME_TYPES: Set<MIMEType> = new Set([
|
|
IMAGE_GIF,
|
|
IMAGE_ICO,
|
|
IMAGE_JPEG,
|
|
IMAGE_PNG,
|
|
IMAGE_WEBP,
|
|
]);
|
|
|
|
// We want to discard unreasonable dates. Update this in ~950 years. (This may discard
|
|
// some reasonable dates, which is okay because it is only for link previews.)
|
|
const MIN_DATE = 0;
|
|
const MAX_DATE = new Date(3000, 0, 1).valueOf();
|
|
|
|
const emptyContentType = { type: null, charset: null };
|
|
|
|
type FetchFn = (href: string, init: RequestInit) => Promise<Response>;
|
|
|
|
export type LinkPreviewMetadata = {
|
|
title: string;
|
|
description: null | string;
|
|
date: null | number;
|
|
imageHref: null | string;
|
|
};
|
|
|
|
export type LinkPreviewImage = {
|
|
data: Uint8Array;
|
|
contentType: MIMEType;
|
|
};
|
|
|
|
type ParsedContentType =
|
|
| { type: null; charset: null }
|
|
| { type: MIMEType; charset: null | string };
|
|
|
|
// This throws non-helpful errors because (1) it logs (2) it will be immediately caught.
|
|
async function fetchWithRedirects(
|
|
fetchFn: FetchFn,
|
|
href: string,
|
|
options: RequestInit
|
|
): Promise<Response> {
|
|
const urlsSeen = new Set<string>();
|
|
|
|
let nextHrefToLoad = href;
|
|
for (let i = 0; i < MAX_REQUEST_COUNT_WITH_REDIRECTS; i += 1) {
|
|
if (urlsSeen.has(nextHrefToLoad)) {
|
|
log.warn('fetchWithRedirects: found a redirect loop');
|
|
throw new Error('redirect loop');
|
|
}
|
|
urlsSeen.add(nextHrefToLoad);
|
|
|
|
// This `await` is deliberately inside of a loop.
|
|
// eslint-disable-next-line no-await-in-loop
|
|
const response = await fetchFn(nextHrefToLoad, {
|
|
...options,
|
|
redirect: 'manual',
|
|
});
|
|
|
|
if (!REDIRECT_STATUSES.has(response.status)) {
|
|
return response;
|
|
}
|
|
|
|
const location = response.headers.get('location');
|
|
if (!location) {
|
|
log.warn(
|
|
'fetchWithRedirects: got a redirect status code but no Location header; bailing'
|
|
);
|
|
throw new Error('no location with redirect');
|
|
}
|
|
|
|
const newUrl = maybeParseUrl(location, nextHrefToLoad);
|
|
if (newUrl?.protocol !== 'https:') {
|
|
log.warn(
|
|
'fetchWithRedirects: got a redirect status code and an invalid Location header'
|
|
);
|
|
throw new Error('invalid location');
|
|
}
|
|
|
|
nextHrefToLoad = newUrl.href;
|
|
}
|
|
|
|
log.warn('fetchWithRedirects: too many redirects');
|
|
throw new Error('too many redirects');
|
|
}
|
|
|
|
function maybeParseUrl(href: string, base: string): null | URL {
|
|
let result: URL;
|
|
try {
|
|
result = new URL(href, base);
|
|
} catch (err) {
|
|
return null;
|
|
}
|
|
// We never need the hash
|
|
result.hash = '';
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Parses a Content-Type header value. Refer to [RFC 2045][0] for details (though this is
|
|
* a simplified version for link previews.
|
|
* [0]: https://tools.ietf.org/html/rfc2045
|
|
*/
|
|
const parseContentType = (headerValue: string | null): ParsedContentType => {
|
|
if (!headerValue || headerValue.length > MAX_CONTENT_TYPE_LENGTH_TO_PARSE) {
|
|
return emptyContentType;
|
|
}
|
|
|
|
const [rawType, ...rawParameters] = headerValue
|
|
.toLowerCase()
|
|
.split(/;/g)
|
|
.map(part => part.trim())
|
|
.filter(Boolean);
|
|
if (!rawType) {
|
|
return emptyContentType;
|
|
}
|
|
|
|
let charset: null | string = null;
|
|
for (let i = 0; i < rawParameters.length; i += 1) {
|
|
const rawParameter = rawParameters[i];
|
|
const parsed = new URLSearchParams(rawParameter);
|
|
const parsedCharset = parsed.get('charset')?.trim();
|
|
if (parsedCharset) {
|
|
charset = parsedCharset;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return {
|
|
type: stringToMIMEType(rawType),
|
|
charset,
|
|
};
|
|
};
|
|
|
|
const isInlineContentDisposition = (headerValue: string | null): boolean =>
|
|
!headerValue || headerValue.split(';', 1)[0] === 'inline';
|
|
|
|
const parseContentLength = (headerValue: string | null): number => {
|
|
// No need to parse gigantic Content-Lengths; only parse the first 10 digits.
|
|
if (typeof headerValue !== 'string' || !/^\d{1,10}$/g.test(headerValue)) {
|
|
return Infinity;
|
|
}
|
|
const result = parseInt(headerValue, 10);
|
|
return Number.isNaN(result) ? Infinity : result;
|
|
};
|
|
|
|
type ValidCharset =
|
|
| 'ascii'
|
|
| 'utf8'
|
|
| 'utf-8'
|
|
| 'utf16le'
|
|
| 'ucs2'
|
|
| 'ucs-2'
|
|
| 'base64'
|
|
| 'latin1'
|
|
| 'binary'
|
|
| 'hex';
|
|
|
|
const VALID_CHARSETS = new Set([
|
|
'ascii',
|
|
'utf8',
|
|
'utf-8',
|
|
'utf16le',
|
|
'ucs2',
|
|
'ucs-2',
|
|
'base64',
|
|
'latin1',
|
|
'binary',
|
|
'hex',
|
|
]);
|
|
|
|
const checkCharset = (charSet: string | null): charSet is ValidCharset => {
|
|
if (!charSet) {
|
|
return false;
|
|
}
|
|
return VALID_CHARSETS.has(charSet);
|
|
};
|
|
|
|
const emptyHtmlDocument = (): HTMLDocument =>
|
|
new DOMParser().parseFromString('', 'text/html');
|
|
|
|
// The charset behavior here follows the [W3 guidelines][0]. The priority is BOM, HTTP
|
|
// header, `http-equiv` meta tag, `charset` meta tag, and finally a UTF-8 fallback.
|
|
// (This fallback could, perhaps, be smarter based on user locale.)
|
|
// [0]: https://www.w3.org/International/questions/qa-html-encoding-declarations.en
|
|
const parseHtmlBytes = (
|
|
bytes: Readonly<Uint8Array>,
|
|
httpCharset: string | null
|
|
): HTMLDocument => {
|
|
const hasBom = bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
|
|
|
|
let isSureOfCharset: boolean;
|
|
let decoder: TextDecoder;
|
|
if (hasBom) {
|
|
decoder = new TextDecoder();
|
|
isSureOfCharset = true;
|
|
} else if (httpCharset) {
|
|
try {
|
|
decoder = new TextDecoder(httpCharset);
|
|
isSureOfCharset = true;
|
|
} catch (err) {
|
|
decoder = new TextDecoder();
|
|
isSureOfCharset = false;
|
|
}
|
|
} else {
|
|
decoder = new TextDecoder();
|
|
isSureOfCharset = false;
|
|
}
|
|
|
|
let decoded: string;
|
|
try {
|
|
decoded = decoder.decode(bytes);
|
|
} catch (err) {
|
|
decoded = '';
|
|
}
|
|
|
|
let document: HTMLDocument;
|
|
try {
|
|
document = new DOMParser().parseFromString(decoded, 'text/html');
|
|
} catch (err) {
|
|
document = emptyHtmlDocument();
|
|
}
|
|
|
|
if (!isSureOfCharset) {
|
|
const httpEquiv = document
|
|
.querySelector('meta[http-equiv="content-type"]')
|
|
?.getAttribute('content');
|
|
if (httpEquiv) {
|
|
const httpEquivCharset = parseContentType(httpEquiv).charset;
|
|
if (httpEquivCharset) {
|
|
return parseHtmlBytes(bytes, httpEquivCharset);
|
|
}
|
|
}
|
|
|
|
const metaCharset = document
|
|
.querySelector('meta[charset]')
|
|
?.getAttribute('charset');
|
|
if (metaCharset) {
|
|
return parseHtmlBytes(bytes, metaCharset);
|
|
}
|
|
}
|
|
|
|
return document;
|
|
};
|
|
|
|
const getHtmlDocument = async (
|
|
body: AsyncIterable<string | Uint8Array>,
|
|
contentLength: number,
|
|
httpCharset: string | null,
|
|
abortSignal: AbortSignal
|
|
): Promise<HTMLDocument> => {
|
|
let result: HTMLDocument = emptyHtmlDocument();
|
|
|
|
const maxHtmlBytesToLoad = Math.min(contentLength, MAX_HTML_BYTES_TO_LOAD);
|
|
const buffer = new Uint8Array(maxHtmlBytesToLoad);
|
|
let bytesLoadedSoFar = 0;
|
|
|
|
try {
|
|
for await (let chunk of body) {
|
|
if (abortSignal.aborted) {
|
|
break;
|
|
}
|
|
|
|
// This check exists to satisfy TypeScript; chunk should always be a Buffer.
|
|
if (typeof chunk === 'string') {
|
|
if (!checkCharset(httpCharset)) {
|
|
throw new Error(`Invalid charset: ${httpCharset}`);
|
|
}
|
|
chunk = Buffer.from(chunk, httpCharset || 'utf8');
|
|
}
|
|
|
|
const truncatedChunk = chunk.slice(
|
|
0,
|
|
maxHtmlBytesToLoad - bytesLoadedSoFar
|
|
);
|
|
buffer.set(truncatedChunk, bytesLoadedSoFar);
|
|
bytesLoadedSoFar += truncatedChunk.byteLength;
|
|
|
|
result = parseHtmlBytes(buffer.slice(0, bytesLoadedSoFar), httpCharset);
|
|
|
|
const hasLoadedMaxBytes = bytesLoadedSoFar >= maxHtmlBytesToLoad;
|
|
if (hasLoadedMaxBytes) {
|
|
break;
|
|
}
|
|
}
|
|
} catch (err) {
|
|
log.warn(
|
|
'getHtmlDocument: error when reading body; continuing with what we got'
|
|
);
|
|
}
|
|
|
|
return result;
|
|
};
|
|
|
|
const getOpenGraphContent = (
|
|
document: HTMLDocument,
|
|
properties: ReadonlyArray<string>
|
|
): string | null => {
|
|
for (let i = 0; i < properties.length; i += 1) {
|
|
const property = properties[i];
|
|
const content = document
|
|
.querySelector(`meta[property="${property}"]`)
|
|
?.getAttribute('content')
|
|
?.trim();
|
|
if (content) {
|
|
return content;
|
|
}
|
|
}
|
|
return null;
|
|
};
|
|
|
|
const getLinkHrefAttribute = (
|
|
document: HTMLDocument,
|
|
rels: ReadonlyArray<string>
|
|
): string | null => {
|
|
for (let i = 0; i < rels.length; i += 1) {
|
|
const rel = rels[i];
|
|
const href = document
|
|
.querySelector(`link[rel="${rel}"]`)
|
|
?.getAttribute('href')
|
|
?.trim();
|
|
if (href) {
|
|
return href;
|
|
}
|
|
}
|
|
return null;
|
|
};
|
|
|
|
const parseMetadata = (
|
|
document: HTMLDocument,
|
|
href: string
|
|
): LinkPreviewMetadata | null => {
|
|
const title =
|
|
getOpenGraphContent(document, ['og:title']) || document.title.trim();
|
|
if (!title) {
|
|
log.warn("parseMetadata: HTML document doesn't have a title; bailing");
|
|
return null;
|
|
}
|
|
|
|
const description =
|
|
getOpenGraphContent(document, ['og:description']) ||
|
|
document
|
|
.querySelector('meta[name="description"]')
|
|
?.getAttribute('content')
|
|
?.trim() ||
|
|
null;
|
|
|
|
const rawImageHref =
|
|
getOpenGraphContent(document, ['og:image', 'og:image:url']) ||
|
|
getLinkHrefAttribute(document, [
|
|
'apple-touch-icon',
|
|
'apple-touch-icon-precomposed',
|
|
'shortcut icon',
|
|
'icon',
|
|
]);
|
|
const imageUrl = rawImageHref ? maybeParseUrl(rawImageHref, href) : null;
|
|
const imageHref = imageUrl ? imageUrl.href : null;
|
|
|
|
let date: number | null = null;
|
|
const rawDate = getOpenGraphContent(document, [
|
|
'og:published_time',
|
|
'article:published_time',
|
|
'og:modified_time',
|
|
'article:modified_time',
|
|
]);
|
|
if (rawDate) {
|
|
const parsed = Date.parse(rawDate);
|
|
if (parsed > MIN_DATE && parsed < MAX_DATE) {
|
|
date = parsed;
|
|
}
|
|
}
|
|
|
|
return {
|
|
title,
|
|
description,
|
|
imageHref,
|
|
date,
|
|
};
|
|
};
|
|
|
|
/**
|
|
* This attempts to fetch link preview metadata, returning `null` if it cannot be found
|
|
* for any reason.
|
|
*
|
|
* NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an
|
|
* insecure HTTP href. It also does not offer a timeout; that is up to the caller.
|
|
*
|
|
* At a high level, it:
|
|
*
|
|
* 1. Makes a GET request, following up to 20 redirects (`fetch`'s default).
|
|
* 2. Checks the response status code and headers to make sure it's a normal HTML
|
|
* response.
|
|
* 3. Streams up to `MAX_HTML_BYTES_TO_LOAD`, stopping when (1) it has loaded all of the
|
|
* HTML (2) loaded the maximum number of bytes (3) finished loading the `<head>`.
|
|
* 4. Parses the resulting HTML with `DOMParser`.
|
|
* 5. Grabs the title, description, image URL, and date.
|
|
*/
|
|
export async function fetchLinkPreviewMetadata(
|
|
fetchFn: FetchFn,
|
|
href: string,
|
|
abortSignal: AbortSignal
|
|
): Promise<null | LinkPreviewMetadata> {
|
|
let response: Response;
|
|
try {
|
|
response = await fetchWithRedirects(fetchFn, href, {
|
|
headers: {
|
|
Accept: 'text/html,application/xhtml+xml',
|
|
'User-Agent': USER_AGENT,
|
|
},
|
|
signal: abortSignal as AbortSignalForNodeFetch,
|
|
});
|
|
} catch (err) {
|
|
log.warn(
|
|
'fetchLinkPreviewMetadata: failed to fetch link preview HTML; bailing'
|
|
);
|
|
return null;
|
|
}
|
|
|
|
if (!response.ok) {
|
|
log.warn(
|
|
`fetchLinkPreviewMetadata: got a ${response.status} status code; bailing`
|
|
);
|
|
return null;
|
|
}
|
|
|
|
if (!response.body) {
|
|
log.warn('fetchLinkPreviewMetadata: no response body; bailing');
|
|
return null;
|
|
}
|
|
|
|
if (
|
|
!isInlineContentDisposition(response.headers.get('Content-Disposition'))
|
|
) {
|
|
log.warn(
|
|
'fetchLinkPreviewMetadata: Content-Disposition header is not inline; bailing'
|
|
);
|
|
return null;
|
|
}
|
|
|
|
if (abortSignal.aborted) {
|
|
return null;
|
|
}
|
|
|
|
const contentLength = parseContentLength(
|
|
response.headers.get('Content-Length')
|
|
);
|
|
if (contentLength < MIN_HTML_CONTENT_LENGTH) {
|
|
log.warn('fetchLinkPreviewMetadata: Content-Length is too short; bailing');
|
|
return null;
|
|
}
|
|
|
|
const contentType = parseContentType(response.headers.get('Content-Type'));
|
|
if (contentType.type !== 'text/html') {
|
|
log.warn('fetchLinkPreviewMetadata: Content-Type is not HTML; bailing');
|
|
return null;
|
|
}
|
|
|
|
const document = await getHtmlDocument(
|
|
response.body,
|
|
contentLength,
|
|
contentType.charset,
|
|
abortSignal
|
|
);
|
|
|
|
// [The Node docs about `ReadableStream.prototype[Symbol.asyncIterator]`][0] say that
|
|
// the stream will be destroyed if you `break` out of the loop, but I could not
|
|
// reproduce this. Also [`destroy` is a documented method][1] but it is not in the
|
|
// Node types, which is why we do this cast to `any`.
|
|
// [0]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_symbol_asynciterator
|
|
// [1]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_destroy_error
|
|
try {
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
(response.body as any).destroy();
|
|
} catch (err) {
|
|
// Ignored.
|
|
}
|
|
|
|
if (abortSignal.aborted) {
|
|
return null;
|
|
}
|
|
|
|
return parseMetadata(document, response.url);
|
|
}
|
|
|
|
/**
|
|
* This attempts to fetch an image, returning `null` if it fails for any reason.
|
|
*
|
|
* NOTE: This does NOT validate the incoming URL for safety. For example, it may fetch an
|
|
* insecure HTTP href. It also does not offer a timeout; that is up to the caller.
|
|
*/
|
|
export async function fetchLinkPreviewImage(
|
|
fetchFn: FetchFn,
|
|
href: string,
|
|
abortSignal: AbortSignal
|
|
): Promise<null | LinkPreviewImage> {
|
|
let response: Response;
|
|
try {
|
|
response = await fetchWithRedirects(fetchFn, href, {
|
|
headers: {
|
|
'User-Agent': USER_AGENT,
|
|
},
|
|
size: MAX_IMAGE_CONTENT_LENGTH,
|
|
signal: abortSignal as AbortSignalForNodeFetch,
|
|
});
|
|
} catch (err) {
|
|
log.warn('fetchLinkPreviewImage: failed to fetch image; bailing');
|
|
return null;
|
|
}
|
|
|
|
if (abortSignal.aborted) {
|
|
return null;
|
|
}
|
|
|
|
if (!response.ok) {
|
|
log.warn(
|
|
`fetchLinkPreviewImage: got a ${response.status} status code; bailing`
|
|
);
|
|
return null;
|
|
}
|
|
|
|
const contentLength = parseContentLength(
|
|
response.headers.get('Content-Length')
|
|
);
|
|
if (contentLength < MIN_IMAGE_CONTENT_LENGTH) {
|
|
log.warn('fetchLinkPreviewImage: Content-Length is too short; bailing');
|
|
return null;
|
|
}
|
|
if (contentLength > MAX_IMAGE_CONTENT_LENGTH) {
|
|
log.warn(
|
|
'fetchLinkPreviewImage: Content-Length is too large or is unset; bailing'
|
|
);
|
|
return null;
|
|
}
|
|
|
|
const { type: contentType } = parseContentType(
|
|
response.headers.get('Content-Type')
|
|
);
|
|
if (!contentType || !VALID_IMAGE_MIME_TYPES.has(contentType)) {
|
|
log.warn('fetchLinkPreviewImage: Content-Type is not an image; bailing');
|
|
return null;
|
|
}
|
|
|
|
let data: Uint8Array;
|
|
try {
|
|
data = await response.buffer();
|
|
} catch (err) {
|
|
log.warn('fetchLinkPreviewImage: failed to read body; bailing');
|
|
return null;
|
|
}
|
|
|
|
if (abortSignal.aborted) {
|
|
return null;
|
|
}
|
|
|
|
return { data, contentType };
|
|
}
|