signal-desktop/js/modules/link_previews.js

/* global URL */

const { isNumber, compact } = require('lodash');
const he = require('he');
const nodeUrl = require('url');
const LinkifyIt = require('linkify-it');

const linkify = LinkifyIt();
const { concatenateBytes, getViewOfArrayBuffer } = require('./crypto');

module.exports = {
  assembleChunks,
  findLinks,
  getChunkPattern,
  getDomain,
  getTitleMetaTag,
  getImageMetaTag,
  isLinkInWhitelist,
  isMediaLinkInWhitelist,
  isLinkSneaky,
  isStickerPack,
};

const SUPPORTED_DOMAINS = [
  'youtube.com',
  'www.youtube.com',
  'm.youtube.com',
  'youtu.be',
  'reddit.com',
  'www.reddit.com',
  'm.reddit.com',
  'imgur.com',
  'www.imgur.com',
  'm.imgur.com',
  'instagram.com',
  'www.instagram.com',
  'm.instagram.com',
  'pinterest.com',
  'www.pinterest.com',
  'pin.it',
  'signal.org',
];

function isLinkInWhitelist(link) {
  try {
    const url = new URL(link);

    if (url.protocol !== 'https:') {
      return false;
    }

    if (!url.pathname || url.pathname.length < 2) {
      return false;
    }

    const lowercase = url.host.toLowerCase();
    if (!SUPPORTED_DOMAINS.includes(lowercase)) {
      return false;
    }

    return true;
  } catch (error) {
    return false;
  }
}

function isStickerPack(link) {
  return (link || '').startsWith('https://signal.org/addstickers/');
}

const SUPPORTED_MEDIA_DOMAINS = /^([^.]+\.)*(ytimg\.com|cdninstagram\.com|redd\.it|imgur\.com|fbcdn\.net|pinimg\.com)$/i;
function isMediaLinkInWhitelist(link) {
  try {
    const url = new URL(link);

    if (url.protocol !== 'https:') {
      return false;
    }

    if (!url.pathname || url.pathname.length < 2) {
      return false;
    }

    if (!SUPPORTED_MEDIA_DOMAINS.test(url.host)) {
      return false;
    }

    return true;
  } catch (error) {
    return false;
  }
}

const META_TITLE = /<meta\s+property="og:title"[^>]+?content="([\s\S]+?)"[^>]*>/im;
const META_IMAGE = /<meta\s+property="og:image"[^>]+?content="([\s\S]+?)"[^>]*>/im;
function _getMetaTag(html, regularExpression) {
  const match = regularExpression.exec(html);
  if (match && match[1]) {
    return he.decode(match[1]).trim();
  }

  return null;
}

function getTitleMetaTag(html) {
  return _getMetaTag(html, META_TITLE);
}
function getImageMetaTag(html) {
  return _getMetaTag(html, META_IMAGE);
}

function findLinks(text, caretLocation) {
  const haveCaretLocation = isNumber(caretLocation);
  const textLength = text ? text.length : 0;

  const matches = linkify.match(text || '') || [];
  return compact(
    matches.map(match => {
      if (!haveCaretLocation) {
        return match.text;
      }

      if (match.lastIndex === textLength && caretLocation === textLength) {
        return match.text;
      }

      if (match.index > caretLocation || match.lastIndex < caretLocation) {
        return match.text;
      }

      return null;
    })
  );
}

function getDomain(url) {
  try {
    const urlObject = new URL(url);
    return urlObject.hostname;
  } catch (error) {
    return null;
  }
}

const MB = 1024 * 1024;
const KB = 1024;

function getChunkPattern(size, initialOffset) {
  if (size > MB) {
    return _getRequestPattern(size, MB, initialOffset);
  } else if (size > 500 * KB) {
    return _getRequestPattern(size, 500 * KB, initialOffset);
  } else if (size > 100 * KB) {
    return _getRequestPattern(size, 100 * KB, initialOffset);
  } else if (size > 50 * KB) {
    return _getRequestPattern(size, 50 * KB, initialOffset);
  } else if (size > 10 * KB) {
    return _getRequestPattern(size, 10 * KB, initialOffset);
  } else if (size > KB) {
    return _getRequestPattern(size, KB, initialOffset);
  }

  return {
    start: {
      start: initialOffset,
      end: size - 1,
    },
  };
}

function _getRequestPattern(size, increment, initialOffset) {
  const results = [];

  let offset = initialOffset || 0;
  while (size - offset > increment) {
    results.push({
      start: offset,
      end: offset + increment - 1,
      overlap: 0,
    });
    offset += increment;
  }

  if (size - offset > 0) {
    results.push({
      start: size - increment,
      end: size - 1,
      overlap: increment - (size - offset),
    });
  }

  return results;
}

function assembleChunks(chunkDescriptors) {
  const chunks = chunkDescriptors.map((chunk, index) => {
    if (index !== chunkDescriptors.length - 1) {
      return chunk.data;
    }

    if (!chunk.overlap) {
      return chunk.data;
    }

    return getViewOfArrayBuffer(
      chunk.data,
      chunk.overlap,
      chunk.data.byteLength
    );
  });

  return concatenateBytes(...chunks);
}

const LATIN_PATTERN = new RegExp(
  '[' +
    '\\u0041-\\u005A' +
    '\\u0061-\\u007A' +
    '\\u00AA' +
    '\\u00BA' +
    '\\u00C0-\\u00DC' +
    '\\u00D8-\\u00F6' +
    '\\u00F8-\\u01BA' +
    ']'
);

const CYRILLIC_PATTERN = new RegExp(
  '[' +
    '\\u0400-\\u0481' +
    '\\u0482' +
    '\\u0483-\\u0484' +
    '\\u0487' +
    '\\u0488-\\u0489' +
    '\\u048A-\\u052F' +
    '\\u1C80-\\u1C88' +
    '\\u1D2B' +
    '\\u1D78' +
    '\\u2DE0-\\u2DFF' +
    '\\uA640-\\uA66D' +
    '\\uA66E' +
    '\\uA66F' +
    '\\uA670-\\uA672' +
    '\\uA673' +
    '\\uA674-\\uA67D' +
    '\\uA67E' +
    '\\uA67F' +
    '\\uA680-\\uA69B' +
    '\\uA69C-\\uA69D' +
    '\\uA69E-\\uA69F' +
    '\\uFE2E-\\uFE2F' +
    ']'
);

const GREEK_PATTERN = new RegExp(
  '[' +
    '\\u0370-\\u0373' +
    '\\u0375' +
    '\\u0376-\\u0377' +
    '\\u037A' +
    '\\u037B-\\u037D' +
    '\\u037F' +
    '\\u0384' +
    '\\u0386' +
    '\\u0388-\\u038A' +
    '\\u038C' +
    '\\u038E-\\u03A1' +
    '\\u03A3-\\u03E1' +
    '\\u03F0-\\u03F5' +
    '\\u03F6' +
    '\\u03F7-\\u03FF' +
    '\\u1D26-\\u1D2A' +
    '\\u1D5D-\\u1D61' +
    '\\u1D66-\\u1D6A' +
    '\\u1DBF' +
    '\\u1F00-\\u1F15' +
    '\\u1F18-\\u1F1D' +
    '\\u1F20-\\u1F45' +
    '\\u1F48-\\u1F4D' +
    '\\u1F50-\\u1F57' +
    '\\u1F59' +
    '\\u1F5B' +
    '\\u1F5D' +
    '\\u1F5F-\\u1F7D' +
    '\\u1F80-\\u1FB4' +
    '\\u1FB6-\\u1FBC' +
    '\\u1FBD' +
    '\\u1FBE' +
    '\\u1FBF-\\u1FC1' +
    '\\u1FC2-\\u1FC4' +
    '\\u1FC6-\\u1FCC' +
    '\\u1FCD-\\u1FCF' +
    '\\u1FD0-\\u1FD3' +
    '\\u1FD6-\\u1FDB' +
    '\\u1FDD-\\u1FDF' +
    '\\u1FE0-\\u1FEC' +
    '\\u1FED-\\u1FEF' +
    '\\u1FF2-\\u1FF4' +
    '\\u1FF6-\\u1FFC' +
    '\\u1FFD-\\u1FFE' +
    '\\u2126' +
    '\\uAB65' +
    ']'
);

const HIGH_GREEK_PATTERN = new RegExp(
  '[' +
    `${String.fromCodePoint(0x10140)}-${String.fromCodePoint(0x10174)}` +
    `${String.fromCodePoint(0x10175)}-${String.fromCodePoint(0x10178)}` +
    `${String.fromCodePoint(0x10179)}-${String.fromCodePoint(0x10189)}` +
    `${String.fromCodePoint(0x1018a)}-${String.fromCodePoint(0x1018b)}` +
    `${String.fromCodePoint(0x1018c)}-${String.fromCodePoint(0x1018e)}` +
    `${String.fromCodePoint(0x101a0)}` +
    `${String.fromCodePoint(0x1d200)}-${String.fromCodePoint(0x1d241)}` +
    `${String.fromCodePoint(0x1d242)}-${String.fromCodePoint(0x1d244)}` +
    `${String.fromCodePoint(0x1d245)}` +
    ']',
  'u'
);

function isChunkSneaky(chunk) {
  const hasLatin = LATIN_PATTERN.test(chunk);
  if (!hasLatin) {
    return false;
  }

  const hasCyrillic = CYRILLIC_PATTERN.test(chunk);
  if (hasCyrillic) {
    return true;
  }

  const hasGreek = GREEK_PATTERN.test(chunk);
  if (hasGreek) {
    return true;
  }

  const hasHighGreek = HIGH_GREEK_PATTERN.test(chunk);
  if (hasHighGreek) {
    return true;
  }

  return false;
}

function isLinkSneaky(link) {
  const domain = getDomain(link);

  // This is necesary because getDomain returns domains in punycode form. We check whether
  //   it's available for the StyleGuide.
  const unicodeDomain = nodeUrl.domainToUnicode
    ? nodeUrl.domainToUnicode(domain)
    : domain;

  const chunks = unicodeDomain.split('.');
  for (let i = 0, max = chunks.length; i < max; i += 1) {
    const chunk = chunks[i];
    if (isChunkSneaky(chunk)) {
      return true;
    }
  }

  return false;
}