signal-desktop/js/modules/link_previews.js

293 lines
6.7 KiB
JavaScript
Raw Normal View History

2019-01-16 03:03:56 +00:00
/* global URL */
2020-08-26 19:47:50 +00:00
const { isNumber, compact, isEmpty } = require('lodash');
2019-01-16 03:03:56 +00:00
const he = require('he');
2019-02-21 22:41:17 +00:00
const nodeUrl = require('url');
2019-01-16 03:03:56 +00:00
const LinkifyIt = require('linkify-it');
const linkify = LinkifyIt();
const { concatenateBytes, getViewOfArrayBuffer } = require('../../ts/Crypto');
2019-01-16 03:03:56 +00:00
module.exports = {
assembleChunks,
findLinks,
getChunkPattern,
getDomain,
getTitleMetaTag,
getImageMetaTag,
isLinkSafeToPreview,
2019-01-16 03:03:56 +00:00
isLinkInWhitelist,
isMediaLinkInWhitelist,
isLinkSneaky,
isStickerPack,
2019-01-16 03:03:56 +00:00
};
/**
 * Decides whether a link is acceptable for generating a preview.
 *
 * A link qualifies only when it parses as a URL, uses the `https:` protocol,
 * and is not flagged by `isLinkSneaky`.
 *
 * @param {string} link - The candidate link text.
 * @returns {boolean} `true` when the link may be previewed, otherwise `false`.
 */
function isLinkSafeToPreview(link) {
  let parsed;
  try {
    parsed = new URL(link);
  } catch (err) {
    // Anything the URL parser rejects is never previewed.
    return false;
  }

  if (parsed.protocol !== 'https:') {
    return false;
  }

  return !isLinkSneaky(link);
}
2019-01-16 03:03:56 +00:00
// Hosts for which outbound link previews are generated.
const SUPPORTED_DOMAINS = [
  'youtube.com',
  'www.youtube.com',
  'm.youtube.com',
  'youtu.be',
  'reddit.com',
  'www.reddit.com',
  'm.reddit.com',
  'imgur.com',
  'www.imgur.com',
  'm.imgur.com',
  'instagram.com',
  'www.instagram.com',
  'm.instagram.com',
  'pinterest.com',
  'www.pinterest.com',
  'pin.it',
  'signal.art',
];

// This function will soon be removed in favor of `isLinkSafeToPreview`. It is
// currently used because outbound-from-Desktop link previews only support a
// few domains (see the list above). We will soon remove this restriction to
// allow link previews from all domains, making this function obsolete.
/**
 * Checks whether a link points at one of the supported preview domains.
 *
 * The link must parse as a URL, use `https:`, have a path longer than a bare
 * `/`, and have a host (as reported by `URL#host`, which includes any
 * non-default port) that appears in `SUPPORTED_DOMAINS`.
 *
 * @param {string} link - The candidate link text.
 * @returns {boolean} `true` when every condition holds, otherwise `false`.
 */
function isLinkInWhitelist(link) {
  let parsed;
  try {
    parsed = new URL(link);
  } catch (error) {
    return false;
  }

  const { protocol, pathname, host } = parsed;
  const isHttps = protocol === 'https:';
  const hasRealPath = Boolean(pathname) && pathname.length >= 2;

  return (
    isHttps && hasRealPath && SUPPORTED_DOMAINS.includes(host.toLowerCase())
  );
}
function isStickerPack(link) {
return (link || '').startsWith('https://signal.art/addstickers/');
}
// Matches a supported media host, optionally preceded by any number of
// subdomain labels.
const SUPPORTED_MEDIA_DOMAINS = /^([^.]+\.)*(ytimg\.com|cdninstagram\.com|redd\.it|imgur\.com|fbcdn\.net|pinimg\.com)$/i;

// This function will soon be removed. See the comment in `isLinkInWhitelist`
// for more info.
/**
 * Checks whether a media link is served from one of the supported media
 * domains.
 *
 * The link must parse as a URL, use `https:`, have a path longer than a bare
 * `/`, and have a host (per `URL#host`) matching `SUPPORTED_MEDIA_DOMAINS`.
 *
 * @param {string} link - The candidate media link.
 * @returns {boolean} `true` when every condition holds, otherwise `false`.
 */
function isMediaLinkInWhitelist(link) {
  let parsed;
  try {
    parsed = new URL(link);
  } catch (error) {
    return false;
  }

  const { protocol, pathname, host } = parsed;
  const isHttps = protocol === 'https:';
  const hasRealPath = Boolean(pathname) && pathname.length >= 2;

  return isHttps && hasRealPath && SUPPORTED_MEDIA_DOMAINS.test(host);
}
// Patterns capturing the `content` attribute of the Open Graph title and
// image `<meta>` tags.
const META_TITLE = /<meta\s+property="og:title"[^>]+?content="([\s\S]+?)"[^>]*>/im;
const META_IMAGE = /<meta\s+property="og:image"[^>]+?content="([\s\S]+?)"[^>]*>/im;

/**
 * Runs a pattern against HTML and returns its first capture group, decoded
 * with `he.decode` and trimmed.
 *
 * @param {string} html - The markup to search.
 * @param {RegExp} regularExpression - Pattern whose first group is the value.
 * @returns {string|null} The decoded, trimmed capture, or `null` when there
 *   is no match or the capture is empty.
 */
function _getMetaTag(html, regularExpression) {
  const [, content] = regularExpression.exec(html) || [];
  return content ? he.decode(content).trim() : null;
}

/**
 * Extracts the `og:title` meta tag content from HTML.
 *
 * @param {string} html - The markup to search.
 * @returns {string|null} The title, or `null` when it is not found.
 */
function getTitleMetaTag(html) {
  const title = _getMetaTag(html, META_TITLE);
  return title;
}

/**
 * Extracts the `og:image` meta tag content from HTML.
 *
 * @param {string} html - The markup to search.
 * @returns {string|null} The image URL text, or `null` when it is not found.
 */
function getImageMetaTag(html) {
  const image = _getMetaTag(html, META_IMAGE);
  return image;
}
/**
 * Finds the links in a piece of text, optionally skipping the link the caret
 * is currently touching.
 *
 * When `caretLocation` is not a number, every detected link is returned.
 * Otherwise a link is kept when the caret lies strictly outside it, or when
 * both the link and the caret sit at the very end of the text.
 *
 * @param {string} text - The text to scan; falsy values are treated as empty.
 * @param {number} [caretLocation] - Caret offset within `text`.
 * @returns {string[]} The text of each retained link, in match order.
 */
function findLinks(text, caretLocation) {
  const caretIsKnown = isNumber(caretLocation);
  const endOfText = text ? text.length : 0;
  const candidates = linkify.match(text || '') || [];

  const shouldKeep = candidate => {
    if (!caretIsKnown) {
      return true;
    }

    const caretAtEnd = caretLocation === endOfText;
    if (caretAtEnd && candidate.lastIndex === endOfText) {
      return true;
    }

    const caretBefore = candidate.index > caretLocation;
    const caretAfter = candidate.lastIndex < caretLocation;
    return caretBefore || caretAfter;
  };

  return compact(
    candidates.map(candidate => (shouldKeep(candidate) ? candidate.text : null))
  );
}
2020-04-24 16:57:04 +00:00
/**
 * Reports whether a URL carries embedded credentials (userinfo).
 *
 * Both the username and the password components are inspected: a URL such as
 * `https://:secret@example.com` has an empty username but still embeds
 * credentials, so checking the username alone would miss it.
 *
 * @param {string} url - The URL text to inspect.
 * @returns {boolean|null} `true` when a username or password is present,
 *   `false` when neither is, and `null` when `url` cannot be parsed.
 */
function hasAuth(url) {
  try {
    const { username, password } = new URL(url);
    return Boolean(username || password);
  } catch (e) {
    return null;
  }
}
2019-01-16 03:03:56 +00:00
/**
 * Extracts the hostname from a URL.
 *
 * @param {string} url - The URL text to parse.
 * @returns {string|null} The `hostname` reported by the URL parser, or `null`
 *   when `url` cannot be parsed.
 */
function getDomain(url) {
  let hostname = null;
  try {
    ({ hostname } = new URL(url));
  } catch (error) {
    hostname = null;
  }
  return hostname;
}
const MB = 1024 * 1024;
const KB = 1024;

/**
 * Chooses a chunk size for a download of `size` bytes and returns the
 * resulting request pattern.
 *
 * The first increment (1 MB, 500 KB, 100 KB, 50 KB, 10 KB, 1 KB) that `size`
 * strictly exceeds is handed to `_getRequestPattern`. When `size` exceeds
 * none of them, a single-range object is returned instead; note that this
 * fallback is an object keyed by `start`, not an array.
 *
 * @param {number} size - Total size in bytes.
 * @param {number} [initialOffset] - Offset at which to begin.
 * @returns {Array<Object>|Object} The result of `_getRequestPattern`, or
 *   `{ start: { start: initialOffset, end: size - 1 } }` for small sizes.
 */
function getChunkPattern(size, initialOffset) {
  // Ordered largest-first so the biggest applicable increment wins.
  const increments = [MB, 500 * KB, 100 * KB, 50 * KB, 10 * KB, KB];
  const increment = increments.find(candidate => size > candidate);

  if (increment !== undefined) {
    return _getRequestPattern(size, increment, initialOffset);
  }

  return {
    start: {
      start: initialOffset,
      end: size - 1,
    },
  };
}
/**
 * Splits the byte range from `initialOffset` up to `size` into request ranges
 * of `increment` bytes each.
 *
 * Full-sized ranges are emitted while more than `increment` bytes remain. Any
 * remainder is covered by one last range that is also `increment` bytes long
 * and ends at `size - 1`; its `overlap` is the number of leading bytes that
 * repeat data already covered by earlier ranges.
 *
 * @param {number} size - Total size in bytes.
 * @param {number} increment - Length of each range in bytes.
 * @param {number} [initialOffset] - Starting offset; falsy values mean 0.
 * @returns {Array<{start: number, end: number, overlap: number}>} The ranges,
 *   with inclusive `start`/`end` offsets.
 */
function _getRequestPattern(size, increment, initialOffset) {
  const ranges = [];
  let cursor = initialOffset || 0;

  for (; size - cursor > increment; cursor += increment) {
    ranges.push({
      start: cursor,
      end: cursor + increment - 1,
      overlap: 0,
    });
  }

  const remaining = size - cursor;
  if (remaining > 0) {
    // The tail range is anchored to the end so it is a full increment long.
    ranges.push({
      start: size - increment,
      end: size - 1,
      overlap: increment - remaining,
    });
  }

  return ranges;
}
/**
 * Joins downloaded chunks into a single buffer.
 *
 * Every chunk contributes its `data` unchanged, except the final chunk when
 * it has a truthy `overlap`: in that case only the bytes from `overlap` up to
 * `data.byteLength` are used, via `getViewOfArrayBuffer`.
 *
 * @param {Array<{data: ArrayBuffer, overlap: number}>} chunkDescriptors -
 *   Chunks in order.
 * @returns {*} Whatever `concatenateBytes` returns for the collected pieces.
 */
function assembleChunks(chunkDescriptors) {
  const finalPosition = chunkDescriptors.length - 1;

  const pieces = chunkDescriptors.map((descriptor, position) => {
    const isFinal = position === finalPosition;
    if (isFinal && descriptor.overlap) {
      const { data, overlap } = descriptor;
      return getViewOfArrayBuffer(data, overlap, data.byteLength);
    }
    return descriptor.data;
  });

  return concatenateBytes(...pieces);
}
// Matches every ASCII character (code points U+0000 through U+007F).
const ASCII_PATTERN = new RegExp('[\\u0000-\\u007F]', 'g');

/**
 * Flags links whose domain looks deceptive.
 *
 * A link is sneaky when it embeds auth, when its domain is missing, overlong,
 * percent-encoded, or malformed, or when its Unicode form mixes ASCII and
 * non-ASCII characters.
 *
 * @param {string} link - The link text to inspect.
 * @returns {boolean} `true` when the link is considered sneaky.
 */
function isLinkSneaky(link) {
  // Any links which contain auth are considered sneaky
  if (hasAuth(link)) {
    return true;
  }

  const domain = getDomain(link);

  // If the domain is falsy, something fishy is going on
  if (!domain) {
    return true;
  }

  // To quote [RFC 1034][0]: "the total number of octets that represent a
  // domain name [...] is limited to 255." To be extra careful, we set a
  // maximum of 2048. (This also uses the string's `.length` property,
  // which isn't exactly the same thing as the number of octets.)
  // [0]: https://tools.ietf.org/html/rfc1034
  const isTooLong = domain.length > 2048;

  // Domains cannot contain encoded characters
  const hasEncodedCharacters = domain.includes('%');

  if (isTooLong || hasEncodedCharacters) {
    return true;
  }

  // There must be at least 2 domain labels, and none of them can be empty.
  const labels = domain.split('.');
  if (labels.length < 2 || labels.some(isEmpty)) {
    return true;
  }

  // This is necessary because getDomain returns domains in punycode form.
  const unicodeDomain = nodeUrl.domainToUnicode
    ? nodeUrl.domainToUnicode(domain)
    : domain;

  const withoutPeriods = unicodeDomain.replace(/\./g, '');
  const withoutASCII = withoutPeriods.replace(ASCII_PATTERN, '');

  // Stripping ASCII shortens the string exactly when some ASCII was present.
  const hasASCII = withoutASCII.length < withoutPeriods.length;
  const hasNonASCII = withoutASCII.length > 0;

  return hasASCII && hasNonASCII;
}