Mark long hrefs or those with invalid characters as sneaky

2020-10-08 11:50:55 -05:00 · 2020-10-08 11:50:55 -05:00 · f21dad1519
commit f21dad1519
parent 0d83076799
2 changed files with 232 additions and 97 deletions
--- a/js/modules/link_previews.js
+++ b/js/modules/link_previews.js
@ -1,6 +1,6 @@
 /* global URL */

-const { isNumber, compact, isEmpty } = require('lodash');
+const { isNumber, compact, isEmpty, range } = require('lodash');
 const nodeUrl = require('url');
 const LinkifyIt = require('linkify-it');

@ -14,14 +14,17 @@ module.exports = {
  isStickerPack,
 };

-function isLinkSafeToPreview(link) {
-  let url;
+function maybeParseHref(href) {
  try {
-    url = new URL(link);
+    return new URL(href);
  } catch (err) {
-    return false;
+    return null;
  }
-  return url.protocol === 'https:' && !isLinkSneaky(link);
+}
+
+function isLinkSafeToPreview(href) {
+  const url = maybeParseHref(href);
+  return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href));
 }

 function isStickerPack(link) {
@ -52,35 +55,66 @@ function findLinks(text, caretLocation) {
  );
 }

-function hasAuth(url) {
-  try {
-    const urlObject = new URL(url);
-    return Boolean(urlObject.username);
-  } catch (e) {
-    return null;
-  }
-}
-
-function getDomain(url) {
-  try {
-    const urlObject = new URL(url);
-    return urlObject.hostname;
-  } catch (error) {
-    return null;
-  }
+function getDomain(href) {
+  const url = maybeParseHref(href);
+  return url ? url.hostname : null;
 }

+// See <https://tools.ietf.org/html/rfc3986>.
+const VALID_URI_CHARACTERS = new Set([
+  '%',
+  // "gen-delims"
+  ':',
+  '/',
+  '?',
+  '#',
+  '[',
+  ']',
+  '@',
+  // "sub-delims"
+  '!',
+  '$',
+  '&',
+  "'",
+  '(',
+  ')',
+  '*',
+  '+',
+  ',',
+  ';',
+  '=',
+  // unreserved
+  ...String.fromCharCode(...range(65, 91), ...range(97, 123)),
+  ...range(10).map(String),
+  '-',
+  '.',
+  '_',
+  '~',
+]);
 const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g');
+const MAX_HREF_LENGTH = 2 ** 12;

-function isLinkSneaky(link) {
-  // Any links which contain auth are considered sneaky
-  if (hasAuth(link)) {
+function isLinkSneaky(href) {
+  // This helps users avoid extremely long links (which could be hiding something
+  //   sketchy) and also sidesteps the performance implications of extremely long hrefs.
+  if (href.length > MAX_HREF_LENGTH) {
+    return true;
+  }
+
+  const url = maybeParseHref(href);
+
+  // If we can't parse it, it's sneaky.
+  if (!url) {
+    return true;
+  }
+
+  // Any links which contain auth are considered sneaky
+  if (url.username) {
    return true;
  }

-  const domain = getDomain(link);
  // If the domain is falsy, something fishy is going on
-  if (!domain) {
+  if (!url.hostname) {
    return true;
  }

@ -89,25 +123,25 @@ function isLinkSneaky(link) {
  //   maximum of 2048. (This also uses the string's `.length` property,
  //   which isn't exactly the same thing as the number of octets.)
  // [0]: https://tools.ietf.org/html/rfc1034
-  if (domain.length > 2048) {
+  if (url.hostname.length > 2048) {
    return true;
  }

  // Domains cannot contain encoded characters
-  if (domain.includes('%')) {
+  if (url.hostname.includes('%')) {
    return true;
  }

  // There must be at least 2 domain labels, and none of them can be empty.
-  const labels = domain.split('.');
+  const labels = url.hostname.split('.');
  if (labels.length < 2 || labels.some(isEmpty)) {
    return true;
  }

  // This is necessary because getDomain returns domains in punycode form.
  const unicodeDomain = nodeUrl.domainToUnicode
-    ? nodeUrl.domainToUnicode(domain)
-    : domain;
+    ? nodeUrl.domainToUnicode(url.hostname)
+    : url.hostname;

  const withoutPeriods = unicodeDomain.replace(/\./g, '');

@ -119,5 +153,12 @@ function isLinkSneaky(link) {
    return true;
  }

-  return false;
+  // We can't use `url.pathname` (and so on) because it automatically encodes strings.
+  //   For example, it turns `/aquí` into `/aqu%C3%AD`.
+  const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4);
+  const pathAndHash =
+    startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash);
+  return [...pathAndHash].some(
+    character => !VALID_URI_CHARACTERS.has(character)
+  );
 }