Outbound link previews

2020-09-28 18:46:31 -05:00 · 2020-09-28 18:46:31 -05:00 · 313faab774
commit 313faab774
parent bb3ab816dd
25 changed files with 2136 additions and 641 deletions
--- a/ts/linkPreviews/isLinkPreviewDateValid.ts
+++ b/ts/linkPreviews/isLinkPreviewDateValid.ts
@ -0,0 +1,11 @@
+// Number of milliseconds in one day.
+const ONE_DAY = 24 * 60 * 60 * 1000;
+
+/**
+ * Type guard for link preview dates. A value is accepted when it is a finite, non-zero
+ * number that lies less than one day past the current time.
+ */
+export function isLinkPreviewDateValid(value: unknown): value is number {
+  if (typeof value !== 'number' || value === 0 || !Number.isFinite(value)) {
+    return false;
+  }
+  const latestAcceptedDate = Date.now() + ONE_DAY;
+  return value < latestAcceptedDate;
+}
--- a/ts/linkPreviews/linkPreviewFetch.ts
+++ b/ts/linkPreviews/linkPreviewFetch.ts
@ -0,0 +1,506 @@
+import { RequestInit, Response } from 'node-fetch';
+import { AbortSignal } from 'abort-controller';
+
+import {
+  IMAGE_GIF,
+  IMAGE_ICO,
+  IMAGE_JPEG,
+  IMAGE_PNG,
+  IMAGE_WEBP,
+  MIMEType,
+} from '../types/MIME';
+
+// Content-Type header values longer than this are not parsed at all.
+const MAX_CONTENT_TYPE_LENGTH_TO_PARSE = 100;
+
+// HTML of any Content-Length (or with none specified) is accepted, but only part of it
+//   is ever loaded. A 99 gigabyte HTML page may start loading, yet only its first 100
+//   kilobytes get parsed. When the Content-Length is smaller than this, no extra space
+//   is wasted.
+const MAX_HTML_BYTES_TO_LOAD = 1024 * 100;
+
+// `<title>x` takes 8 bytes, and nothing else (meta tags, etc) would fit in less, so
+//   anything shorter can be ignored. Mostly a guard against empty response bodies.
+const MIN_HTML_CONTENT_LENGTH = 8;
+
+// Same idea as above: tiny images are not worth showing (though a Content-Length of 0
+//   is the more likely case).
+const MIN_IMAGE_CONTENT_LENGTH = 8;
+const MAX_IMAGE_CONTENT_LENGTH = 1024 ** 2;
+const VALID_IMAGE_MIME_TYPES = new Set<MIMEType>([
+  IMAGE_GIF,
+  IMAGE_ICO,
+  IMAGE_JPEG,
+  IMAGE_PNG,
+  IMAGE_WEBP,
+]);
+
+// Unreasonable dates get discarded. Update this in ~950 years. (Some reasonable dates
+//   may be discarded too, which is acceptable because this is only for link previews.)
+const MIN_DATE = 0;
+const MAX_DATE = new Date(3000, 0, 1).getTime();
+
+// Shared result for a Content-Type header that is missing or cannot be parsed.
+const emptyContentType = { type: null, charset: null };
+
+// Signature of the fetch implementation injected into the exported fetchers: it takes
+//   an href plus request options and resolves to a response.
+type FetchFn = (href: string, init: RequestInit) => Promise<Response>;
+
+/**
+ * Metadata extracted from a fetched HTML page (produced by `parseMetadata`).
+ */
+export interface LinkPreviewMetadata {
+  // `og:title`, falling back to the document's title; `parseMetadata` bails when empty.
+  title: string;
+  // `og:description`, falling back to the `description` meta tag; `null` if neither.
+  description: null | string;
+  // Result of `Date.parse` on a published/modified time tag, or `null` when absent or
+  //   outside the `MIN_DATE`..`MAX_DATE` range.
+  date: null | number;
+  // Image (or icon) URL resolved against the page URL; `null` when absent or invalid.
+  imageHref: null | string;
+}
+
+/**
+ * A fetched preview image: the raw response body and its parsed Content-Type.
+ */
+export interface LinkPreviewImage {
+  data: ArrayBuffer;
+  contentType: MIMEType;
+}
+
+// Result of `parseContentType`: either nothing usable was found (both fields `null`),
+//   or a type was found, optionally accompanied by a charset.
+type ParsedContentType =
+  | { type: null; charset: null }
+  | { type: MIMEType; charset: null | string };
+
+/**
+ * Parses a Content-Type header value. Refer to [RFC 2045][0] for details (though this is
+ * a simplified version for link previews).
+ *
+ * The whole header is lowercased before parsing, so both the returned type and charset
+ * are lowercase. Missing headers, empty headers, and headers longer than
+ * `MAX_CONTENT_TYPE_LENGTH_TO_PARSE` all yield the empty result. The first parameter
+ * carrying a non-empty `charset` wins; a value wrapped in double quotes (which RFC 2045
+ * permits) is returned without the quotes.
+ *
+ * [0]: https://tools.ietf.org/html/rfc2045
+ */
+const parseContentType = (headerValue: string | null): ParsedContentType => {
+  if (!headerValue || headerValue.length > MAX_CONTENT_TYPE_LENGTH_TO_PARSE) {
+    return emptyContentType;
+  }
+
+  const [rawType, ...rawParameters] = headerValue
+    .toLowerCase()
+    .split(/;/g)
+    .map(part => part.trim())
+    .filter(Boolean);
+  if (!rawType) {
+    return emptyContentType;
+  }
+
+  let charset: null | string = null;
+  for (let i = 0; i < rawParameters.length; i += 1) {
+    // Each parameter looks like `key=value`, which `URLSearchParams` can split for us.
+    const parsed = new URLSearchParams(rawParameters[i]);
+    // Strip one pair of surrounding double quotes: `charset="utf-8"` is valid, but the
+    //   quotes are not part of the label and `TextDecoder` would reject them.
+    const parsedCharset = parsed
+      .get('charset')
+      ?.trim()
+      .replace(/^"(.*)"$/, '$1')
+      .trim();
+    if (parsedCharset) {
+      charset = parsedCharset;
+      break;
+    }
+  }
+
+  return {
+    type: rawType as MIMEType,
+    charset,
+  };
+};
+
+// Returns whether a Content-Disposition header value allows the response to be treated
+//   as inline content. A missing or empty header counts as inline. Otherwise only the
+//   disposition type (the part before the first `;`) is inspected; it is compared
+//   case-insensitively and ignoring surrounding whitespace, so `Inline` and
+//   `inline ; filename=x` are both accepted.
+const isInlineContentDisposition = (headerValue: string | null): boolean => {
+  if (!headerValue) {
+    return true;
+  }
+  const dispositionType = headerValue.split(';', 1)[0];
+  return dispositionType.trim().toLowerCase() === 'inline';
+};
+
+// Converts a Content-Length header value into a number. Anything other than a string of
+//   one to ten decimal digits (including a missing header) yields `Infinity`; gigantic
+//   Content-Lengths are deliberately not parsed.
+const parseContentLength = (headerValue: string | null): number => {
+  if (typeof headerValue === 'string' && /^\d{1,10}$/.test(headerValue)) {
+    const parsedLength = parseInt(headerValue, 10);
+    if (!Number.isNaN(parsedLength)) {
+      return parsedLength;
+    }
+  }
+  return Infinity;
+};
+
+// Builds a fresh HTML document by parsing an empty string.
+const emptyHtmlDocument = (): HTMLDocument => {
+  const parser = new DOMParser();
+  return parser.parseFromString('', 'text/html');
+};
+
+// Decodes `bytes` and parses the result as HTML.
+//
+// The charset behavior here follows the [W3 guidelines][0]. The priority is BOM, HTTP
+//   header, `http-equiv` meta tag, `charset` meta tag, and finally a UTF-8 fallback.
+//   (This fallback could, perhaps, be smarter based on user locale.)
+//
+// A charset label that `TextDecoder` does not support is skipped in favor of the next
+//   candidate. Candidates are tried one after another instead of re-entering this
+//   function, so an unsupported label declared inside the markup itself (for example
+//   `<meta charset="bogus">`) cannot cause unbounded recursion.
+//
+// Decoding or parsing failures never throw: they degrade to an empty string or an empty
+//   document respectively.
+// [0]: https://www.w3.org/International/questions/qa-html-encoding-declarations.en
+const parseHtmlBytes = (
+  bytes: Readonly<Uint8Array>,
+  httpCharset: string | null
+): HTMLDocument => {
+  // Decodes the bytes with the given decoder and parses the text as HTML.
+  const decodeAndParse = (decoder: TextDecoder): HTMLDocument => {
+    let decoded: string;
+    try {
+      decoded = decoder.decode(bytes);
+    } catch (err) {
+      decoded = '';
+    }
+
+    try {
+      return new DOMParser().parseFromString(decoded, 'text/html');
+    } catch (err) {
+      return emptyHtmlDocument();
+    }
+  };
+
+  // Parses with the given charset label, or returns `null` when the label is missing or
+  //   not supported by `TextDecoder`.
+  const parseWithCharset = (
+    charset: string | null | undefined
+  ): HTMLDocument | null => {
+    if (!charset) {
+      return null;
+    }
+    let decoder: TextDecoder;
+    try {
+      decoder = new TextDecoder(charset);
+    } catch (err) {
+      return null;
+    }
+    return decodeAndParse(decoder);
+  };
+
+  // A UTF-8 byte order mark settles the question immediately.
+  const hasBom = bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf;
+  if (hasBom) {
+    return decodeAndParse(new TextDecoder());
+  }
+
+  const fromHttpCharset = parseWithCharset(httpCharset);
+  if (fromHttpCharset) {
+    return fromHttpCharset;
+  }
+
+  // Not sure of the charset yet: parse as UTF-8, then look for a declaration in the
+  //   markup and, if one is usable, decode again with it.
+  const fallbackDocument = decodeAndParse(new TextDecoder());
+
+  const httpEquiv = fallbackDocument
+    .querySelector('meta[http-equiv="content-type"]')
+    ?.getAttribute('content');
+  const httpEquivCharset = httpEquiv
+    ? parseContentType(httpEquiv).charset
+    : null;
+
+  const metaCharset = fallbackDocument
+    .querySelector('meta[charset]')
+    ?.getAttribute('charset');
+
+  return (
+    parseWithCharset(httpEquivCharset) ||
+    parseWithCharset(metaCharset) ||
+    fallbackDocument
+  );
+};
+
+// Streams the response body and parses it into an HTML document.
+//
+// At most `min(contentLength, MAX_HTML_BYTES_TO_LOAD)` bytes are kept. After every chunk
+//   the bytes accumulated so far are re-parsed with `parseHtmlBytes`, and reading stops
+//   as soon as (1) the abort signal has fired, (2) the byte limit is reached, or (3) the
+//   parsed document's `<body>` has any content, which is taken to mean the `<head>` has
+//   finished loading. If reading the body throws, a warning is logged and whatever was
+//   parsed so far is returned (an empty document if nothing was).
+const getHtmlDocument = async (
+  body: AsyncIterable<string | Uint8Array>,
+  contentLength: number,
+  httpCharset: string | null,
+  abortSignal: AbortSignal
+): Promise<HTMLDocument> => {
+  let result: HTMLDocument = emptyHtmlDocument();
+
+  // Fixed-size buffer that accumulates the (possibly truncated) chunks.
+  const maxHtmlBytesToLoad = Math.min(contentLength, MAX_HTML_BYTES_TO_LOAD);
+  const buffer = new Uint8Array(new ArrayBuffer(maxHtmlBytesToLoad));
+  let bytesLoadedSoFar = 0;
+
+  try {
+    // `for ... of` is much cleaner here, so we allow it.
+    /* eslint-disable no-restricted-syntax */
+    for await (let chunk of body) {
+      if (abortSignal.aborted) {
+        break;
+      }
+
+      // This check exists to satisfy TypeScript; chunk should always be a Buffer.
+      if (typeof chunk === 'string') {
+        chunk = Buffer.from(chunk, httpCharset || 'utf8');
+      }
+
+      // Never copy more than the space remaining in the buffer.
+      const truncatedChunk = chunk.slice(
+        0,
+        maxHtmlBytesToLoad - bytesLoadedSoFar
+      );
+      buffer.set(truncatedChunk, bytesLoadedSoFar);
+      bytesLoadedSoFar += truncatedChunk.byteLength;
+
+      // Re-parse everything loaded so far (not just the new chunk).
+      result = parseHtmlBytes(buffer.slice(0, bytesLoadedSoFar), httpCharset);
+
+      const hasLoadedMaxBytes = bytesLoadedSoFar >= maxHtmlBytesToLoad;
+      if (hasLoadedMaxBytes) {
+        break;
+      }
+      // Any content inside `<body>` is treated as the signal that `<head>` is complete.
+      const hasFinishedLoadingHead = result.body.innerHTML.length > 0;
+      if (hasFinishedLoadingHead) {
+        break;
+      }
+    }
+    /* eslint-enable no-restricted-syntax */
+  } catch (err) {
+    // Best effort: keep whatever was parsed before the failure.
+    window.log.warn(
+      'getHtmlDocument: error when reading body; continuing with what we got'
+    );
+  }
+
+  return result;
+};
+
+// Looks up `<meta property="...">` tags for each of the given properties, in order, and
+//   returns the trimmed `content` of the first one that has a non-empty value. Returns
+//   `null` when none of them do.
+const getOpenGraphContent = (
+  document: HTMLDocument,
+  properties: ReadonlyArray<string>
+): string | null => {
+  let firstMatch: string | null = null;
+  // `some` stops at the first property that produces a value.
+  properties.some(property => {
+    const metaTag = document.querySelector(`meta[property="${property}"]`);
+    const value = metaTag?.getAttribute('content')?.trim();
+    firstMatch = value || null;
+    return Boolean(value);
+  });
+  return firstMatch;
+};
+
+// Looks up `<link rel="...">` tags for each of the given `rel` values, in order, and
+//   returns the trimmed `href` of the first one that has a non-empty value. Returns
+//   `null` when none of them do.
+const getLinkHrefAttribute = (
+  document: HTMLDocument,
+  rels: ReadonlyArray<string>
+): string | null => {
+  const readHref = (rel: string): string | undefined =>
+    document
+      .querySelector(`link[rel="${rel}"]`)
+      ?.getAttribute('href')
+      ?.trim();
+
+  let index = 0;
+  while (index < rels.length) {
+    const candidate = readHref(rels[index]);
+    if (candidate) {
+      return candidate;
+    }
+    index += 1;
+  }
+  return null;
+};
+
+// Extracts link preview metadata from a parsed HTML document.
+//
+// `href` is the URL the document was loaded from; a relative image URL is resolved
+//   against it. Returns `null` (after logging a warning) when the document has neither
+//   an `og:title` nor a non-empty `<title>`.
+const parseMetadata = (
+  document: HTMLDocument,
+  href: string
+): LinkPreviewMetadata | null => {
+  // Title: Open Graph first, then the document's own title.
+  const openGraphTitle = getOpenGraphContent(document, ['og:title']);
+  const title = openGraphTitle || document.title.trim();
+  if (!title) {
+    window.log.warn(
+      "parseMetadata: HTML document doesn't have a title; bailing"
+    );
+    return null;
+  }
+
+  // Description: Open Graph first, then the plain `description` meta tag.
+  const openGraphDescription = getOpenGraphContent(document, [
+    'og:description',
+  ]);
+  const description =
+    openGraphDescription ||
+    document
+      .querySelector('meta[name="description"]')
+      ?.getAttribute('content')
+      ?.trim() ||
+    null;
+
+  // Image: Open Graph image first, then one of the icon links. The URL is made absolute
+  //   and dropped entirely if it cannot be parsed.
+  const rawImageHref =
+    getOpenGraphContent(document, ['og:image', 'og:image:url']) ||
+    getLinkHrefAttribute(document, [
+      'shortcut icon',
+      'icon',
+      'apple-touch-icon',
+    ]);
+  let imageHref: null | string = null;
+  if (rawImageHref) {
+    try {
+      imageHref = new URL(rawImageHref, href).href;
+    } catch (err) {
+      imageHref = null;
+    }
+  }
+
+  // Date: the first published/modified time that is present. An unparseable value
+  //   becomes `NaN`, which fails both comparisons below and is therefore discarded.
+  const rawDate = getOpenGraphContent(document, [
+    'og:published_time',
+    'article:published_time',
+    'og:modified_time',
+    'article:modified_time',
+  ]);
+  const parsedDate = rawDate ? Date.parse(rawDate) : NaN;
+  const date: number | null =
+    parsedDate > MIN_DATE && parsedDate < MAX_DATE ? parsedDate : null;
+
+  return {
+    title,
+    description,
+    imageHref,
+    date,
+  };
+};
+
+/**
+ * Tries to retrieve link preview metadata for `href`. Resolves to `null` whenever the
+ * metadata cannot be obtained, whatever the cause.
+ *
+ * NOTE: The incoming URL is NOT checked for safety here (an insecure HTTP href will be
+ * fetched, for example), and no timeout is applied. Both are the caller's job.
+ *
+ * Overview:
+ *
+ * 1. Issue a GET request; redirects are followed (up to 20, which is `fetch`'s default).
+ * 2. Inspect the status code and headers to confirm a normal HTML response.
+ * 3. Stream at most `MAX_HTML_BYTES_TO_LOAD` bytes, ending early once the whole HTML has
+ *    arrived, the byte limit is hit, or the `<head>` has finished loading.
+ * 4. Parse what was loaded with `DOMParser`.
+ * 5. Extract the title, description, image URL, and date.
+ */
+export async function fetchLinkPreviewMetadata(
+  fetchFn: FetchFn,
+  href: string,
+  abortSignal: AbortSignal
+): Promise<null | LinkPreviewMetadata> {
+  // Logs the given warning and produces the `null` result in one step.
+  const warnAndBail = (message: string): null => {
+    window.log.warn(message);
+    return null;
+  };
+
+  const requestOptions: RequestInit = {
+    headers: {
+      Accept: 'text/html,application/xhtml+xml',
+      'User-Agent': 'WhatsApp',
+    },
+    redirect: 'follow',
+    signal: abortSignal,
+  };
+
+  let htmlResponse: Response;
+  try {
+    htmlResponse = await fetchFn(href, requestOptions);
+  } catch (err) {
+    return warnAndBail(
+      'fetchLinkPreviewMetadata: failed to fetch link preview HTML; bailing'
+    );
+  }
+
+  if (!htmlResponse.ok) {
+    return warnAndBail(
+      `fetchLinkPreviewMetadata: got a ${htmlResponse.status} status code; bailing`
+    );
+  }
+
+  const responseBody = htmlResponse.body;
+  if (!responseBody) {
+    return warnAndBail('fetchLinkPreviewMetadata: no response body; bailing');
+  }
+
+  const contentDisposition = htmlResponse.headers.get('Content-Disposition');
+  if (!isInlineContentDisposition(contentDisposition)) {
+    return warnAndBail(
+      'fetchLinkPreviewMetadata: Content-Disposition header is not inline; bailing'
+    );
+  }
+
+  // An abort is not an error worth logging; just stop.
+  if (abortSignal.aborted) {
+    return null;
+  }
+
+  const contentLength = parseContentLength(
+    htmlResponse.headers.get('Content-Length')
+  );
+  if (contentLength < MIN_HTML_CONTENT_LENGTH) {
+    return warnAndBail(
+      'fetchLinkPreviewMetadata: Content-Length is too short; bailing'
+    );
+  }
+
+  const { type: mimeType, charset } = parseContentType(
+    htmlResponse.headers.get('Content-Type')
+  );
+  if (mimeType !== 'text/html') {
+    return warnAndBail(
+      'fetchLinkPreviewMetadata: Content-Type is not HTML; bailing'
+    );
+  }
+
+  const htmlDocument = await getHtmlDocument(
+    responseBody,
+    contentLength,
+    charset,
+    abortSignal
+  );
+
+  // According to [the Node docs for `ReadableStream.prototype[Symbol.asyncIterator]`][0]
+  //   the stream gets destroyed when the loop is exited with `break`, but that could not
+  //   be reproduced, so the stream is destroyed explicitly. [`destroy` is a documented
+  //   method][1] yet it is missing from the Node types, hence the cast to `any`.
+  // [0]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_symbol_asynciterator
+  // [1]: https://nodejs.org/docs/latest-v12.x/api/stream.html#stream_readable_destroy_error
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    (responseBody as any).destroy();
+  } catch (err) {
+    // Ignored.
+  }
+
+  if (abortSignal.aborted) {
+    return null;
+  }
+
+  // Use the final URL (after redirects) as the base for relative image URLs.
+  return parseMetadata(htmlDocument, htmlResponse.url);
+}
+
+/**
+ * Tries to download the image at `href`. Resolves to `null` on any kind of failure:
+ * network error, abort, non-OK status, a Content-Length that is too small, too large or
+ * unset, a Content-Type outside `VALID_IMAGE_MIME_TYPES`, or an unreadable body.
+ *
+ * NOTE: The incoming URL is NOT checked for safety here (an insecure HTTP href will be
+ * fetched, for example), and no timeout is applied. Both are the caller's job.
+ */
+export async function fetchLinkPreviewImage(
+  fetchFn: FetchFn,
+  href: string,
+  abortSignal: AbortSignal
+): Promise<null | LinkPreviewImage> {
+  // Logs the given warning and produces the `null` result in one step.
+  const warnAndBail = (message: string): null => {
+    window.log.warn(message);
+    return null;
+  };
+
+  const requestOptions: RequestInit = {
+    headers: {
+      'User-Agent': 'WhatsApp',
+    },
+    size: MAX_IMAGE_CONTENT_LENGTH,
+    redirect: 'follow',
+    signal: abortSignal,
+  };
+
+  let imageResponse: Response;
+  try {
+    imageResponse = await fetchFn(href, requestOptions);
+  } catch (err) {
+    return warnAndBail('fetchLinkPreviewImage: failed to fetch image; bailing');
+  }
+
+  // An abort is not an error worth logging; just stop.
+  if (abortSignal.aborted) {
+    return null;
+  }
+
+  if (!imageResponse.ok) {
+    return warnAndBail(
+      `fetchLinkPreviewImage: got a ${imageResponse.status} status code; bailing`
+    );
+  }
+
+  // A missing Content-Length parses to `Infinity`, so it lands in the "too large" case.
+  const contentLength = parseContentLength(
+    imageResponse.headers.get('Content-Length')
+  );
+  if (contentLength < MIN_IMAGE_CONTENT_LENGTH) {
+    return warnAndBail(
+      'fetchLinkPreviewImage: Content-Length is too short; bailing'
+    );
+  }
+  if (contentLength > MAX_IMAGE_CONTENT_LENGTH) {
+    return warnAndBail(
+      'fetchLinkPreviewImage: Content-Length is too large or is unset; bailing'
+    );
+  }
+
+  const contentType = parseContentType(
+    imageResponse.headers.get('Content-Type')
+  ).type;
+  if (!contentType || !VALID_IMAGE_MIME_TYPES.has(contentType)) {
+    return warnAndBail(
+      'fetchLinkPreviewImage: Content-Type is not an image; bailing'
+    );
+  }
+
+  let data: ArrayBuffer;
+  try {
+    data = await imageResponse.arrayBuffer();
+  } catch (err) {
+    return warnAndBail('fetchLinkPreviewImage: failed to read body; bailing');
+  }
+
+  return { data, contentType };
+}