Mark long hrefs or those with invalid characters as sneaky
This commit is contained in:
parent
0d83076799
commit
f21dad1519
2 changed files with 232 additions and 97 deletions
|
@ -1,6 +1,6 @@
|
|||
/* global URL */
|
||||
|
||||
const { isNumber, compact, isEmpty } = require('lodash');
|
||||
const { isNumber, compact, isEmpty, range } = require('lodash');
|
||||
const nodeUrl = require('url');
|
||||
const LinkifyIt = require('linkify-it');
|
||||
|
||||
|
@ -14,14 +14,17 @@ module.exports = {
|
|||
isStickerPack,
|
||||
};
|
||||
|
||||
function isLinkSafeToPreview(link) {
|
||||
let url;
|
||||
function maybeParseHref(href) {
|
||||
try {
|
||||
url = new URL(link);
|
||||
return new URL(href);
|
||||
} catch (err) {
|
||||
return false;
|
||||
return null;
|
||||
}
|
||||
return url.protocol === 'https:' && !isLinkSneaky(link);
|
||||
}
|
||||
|
||||
function isLinkSafeToPreview(href) {
|
||||
const url = maybeParseHref(href);
|
||||
return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href));
|
||||
}
|
||||
|
||||
function isStickerPack(link) {
|
||||
|
@ -52,35 +55,66 @@ function findLinks(text, caretLocation) {
|
|||
);
|
||||
}
|
||||
|
||||
function hasAuth(url) {
|
||||
try {
|
||||
const urlObject = new URL(url);
|
||||
return Boolean(urlObject.username);
|
||||
} catch (e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function getDomain(url) {
|
||||
try {
|
||||
const urlObject = new URL(url);
|
||||
return urlObject.hostname;
|
||||
} catch (error) {
|
||||
return null;
|
||||
}
|
||||
function getDomain(href) {
|
||||
const url = maybeParseHref(href);
|
||||
return url ? url.hostname : null;
|
||||
}
|
||||
|
||||
// See <https://tools.ietf.org/html/rfc3986>.
|
||||
const VALID_URI_CHARACTERS = new Set([
|
||||
'%',
|
||||
// "gen-delims"
|
||||
':',
|
||||
'/',
|
||||
'?',
|
||||
'#',
|
||||
'[',
|
||||
']',
|
||||
'@',
|
||||
// "sub-delims"
|
||||
'!',
|
||||
'$',
|
||||
'&',
|
||||
"'",
|
||||
'(',
|
||||
')',
|
||||
'*',
|
||||
'+',
|
||||
',',
|
||||
';',
|
||||
'=',
|
||||
// unreserved
|
||||
...String.fromCharCode(...range(65, 91), ...range(97, 123)),
|
||||
...range(10).map(String),
|
||||
'-',
|
||||
'.',
|
||||
'_',
|
||||
'~',
|
||||
]);
|
||||
const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g');
|
||||
const MAX_HREF_LENGTH = 2 ** 12;
|
||||
|
||||
function isLinkSneaky(link) {
|
||||
// Any links which contain auth are considered sneaky
|
||||
if (hasAuth(link)) {
|
||||
function isLinkSneaky(href) {
|
||||
// This helps users avoid extremely long links (which could be hiding something
|
||||
// sketchy) and also sidesteps the performance implications of extremely long hrefs.
|
||||
if (href.length > MAX_HREF_LENGTH) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const url = maybeParseHref(href);
|
||||
|
||||
// If we can't parse it, it's sneaky.
|
||||
if (!url) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Any links which contain auth are considered sneaky
|
||||
if (url.username) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const domain = getDomain(link);
|
||||
// If the domain is falsy, something fishy is going on
|
||||
if (!domain) {
|
||||
if (!url.hostname) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -89,25 +123,25 @@ function isLinkSneaky(link) {
|
|||
// maximum of 2048. (This also uses the string's `.length` property,
|
||||
// which isn't exactly the same thing as the number of octets.)
|
||||
// [0]: https://tools.ietf.org/html/rfc1034
|
||||
if (domain.length > 2048) {
|
||||
if (url.hostname.length > 2048) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Domains cannot contain encoded characters
|
||||
if (domain.includes('%')) {
|
||||
if (url.hostname.includes('%')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// There must be at least 2 domain labels, and none of them can be empty.
|
||||
const labels = domain.split('.');
|
||||
const labels = url.hostname.split('.');
|
||||
if (labels.length < 2 || labels.some(isEmpty)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// This is necesary because getDomain returns domains in punycode form.
|
||||
const unicodeDomain = nodeUrl.domainToUnicode
|
||||
? nodeUrl.domainToUnicode(domain)
|
||||
: domain;
|
||||
? nodeUrl.domainToUnicode(url.hostname)
|
||||
: url.hostname;
|
||||
|
||||
const withoutPeriods = unicodeDomain.replace(/\./g, '');
|
||||
|
||||
|
@ -119,5 +153,12 @@ function isLinkSneaky(link) {
|
|||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
// We can't use `url.pathname` (and so on) because it automatically encodes strings.
|
||||
// For example, it turns `/aquí` into `/aqu%C3%AD`.
|
||||
const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4);
|
||||
const pathAndHash =
|
||||
startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash);
|
||||
return [...pathAndHash].some(
|
||||
character => !VALID_URI_CHARACTERS.has(character)
|
||||
);
|
||||
}
|
||||
|
|
|
@ -114,49 +114,6 @@ describe('Link previews', () => {
|
|||
});
|
||||
|
||||
describe('#isLinkSneaky', () => {
|
||||
it('returns false for all-latin domain', () => {
|
||||
const link = 'https://www.amazon.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, false);
|
||||
});
|
||||
|
||||
it('returns false for IPv4 addresses', () => {
|
||||
assert.isFalse(isLinkSneaky('https://127.0.0.1/path'));
|
||||
});
|
||||
|
||||
// It's possible that this should return `false` but we'd need to add special logic
|
||||
// for it.
|
||||
it('returns true for IPv6 addresses', () => {
|
||||
assert.isTrue(
|
||||
isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path')
|
||||
);
|
||||
assert.isTrue(isLinkSneaky('https://[::]/path'));
|
||||
});
|
||||
|
||||
it('returns true for Latin + Cyrillic domain', () => {
|
||||
const link = 'https://www.aмazon.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for Latin + Greek domain', () => {
|
||||
const link = 'https://www.αpple.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for ASCII and non-ASCII mix', () => {
|
||||
const link = 'https://www.аррӏе.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for Latin + High Greek domain', () => {
|
||||
const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`;
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for =', () => {
|
||||
const link = 'r.id=s.id';
|
||||
assert.strictEqual(isLinkSneaky(link), true);
|
||||
|
@ -177,35 +134,172 @@ describe('Link previews', () => {
|
|||
assert.strictEqual(isLinkSneaky(link), true);
|
||||
});
|
||||
|
||||
it('returns true for auth (or pretend auth)', () => {
|
||||
const link = 'http://whatever.com&login=someuser@77777777';
|
||||
assert.strictEqual(isLinkSneaky(link), true);
|
||||
it('returns true for URLs with a length of 4097 or higher', () => {
|
||||
const href = `https://example.com/${'a'.repeat(4077)}`;
|
||||
assert.lengthOf(href, 4097, 'Test href is not the proper length');
|
||||
|
||||
assert.isTrue(isLinkSneaky(href));
|
||||
assert.isTrue(isLinkSneaky(`${href}?foo=bar`));
|
||||
});
|
||||
|
||||
it("returns true if the domain doesn't contain a .", () => {
|
||||
assert.isTrue(isLinkSneaky('https://example'));
|
||||
assert.isTrue(isLinkSneaky('https://localhost'));
|
||||
assert.isTrue(isLinkSneaky('https://localhost:3000'));
|
||||
describe('auth', () => {
|
||||
it('returns true for hrefs with auth (or pretend auth)', () => {
|
||||
assert.isTrue(isLinkSneaky('https://user:pass@example.com'));
|
||||
assert.isTrue(
|
||||
isLinkSneaky('http://whatever.com&login=someuser@77777777')
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
it('returns true if the domain has any empty labels', () => {
|
||||
assert.isTrue(isLinkSneaky('https://example.'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com.'));
|
||||
assert.isTrue(isLinkSneaky('https://.example.com'));
|
||||
assert.isTrue(isLinkSneaky('https://..example.com'));
|
||||
describe('domain', () => {
|
||||
it('returns false for all-latin domain', () => {
|
||||
const link = 'https://www.amazon.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, false);
|
||||
});
|
||||
|
||||
it('returns false for IPv4 addresses', () => {
|
||||
assert.isFalse(isLinkSneaky('https://127.0.0.1/path'));
|
||||
});
|
||||
|
||||
// It's possible that this should return `false` but we'd need to add special logic
|
||||
// for it.
|
||||
it('returns true for IPv6 addresses', () => {
|
||||
assert.isTrue(
|
||||
isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path')
|
||||
);
|
||||
assert.isTrue(isLinkSneaky('https://[::]/path'));
|
||||
});
|
||||
|
||||
it('returns true for Latin + Cyrillic domain', () => {
|
||||
const link = 'https://www.aмazon.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for Latin + Greek domain', () => {
|
||||
const link = 'https://www.αpple.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for ASCII and non-ASCII mix', () => {
|
||||
const link = 'https://www.аррӏе.com';
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it('returns true for Latin + High Greek domain', () => {
|
||||
const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`;
|
||||
const actual = isLinkSneaky(link);
|
||||
assert.strictEqual(actual, true);
|
||||
});
|
||||
|
||||
it("returns true if the domain doesn't contain a .", () => {
|
||||
assert.isTrue(isLinkSneaky('https://example'));
|
||||
assert.isTrue(isLinkSneaky('https://localhost'));
|
||||
assert.isTrue(isLinkSneaky('https://localhost:3000'));
|
||||
});
|
||||
|
||||
it('returns true if the domain has any empty labels', () => {
|
||||
assert.isTrue(isLinkSneaky('https://example.'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com.'));
|
||||
assert.isTrue(isLinkSneaky('https://.example.com'));
|
||||
assert.isTrue(isLinkSneaky('https://..example.com'));
|
||||
});
|
||||
|
||||
it('returns true if the domain is longer than 2048 UTF-16 code points', () => {
|
||||
const domain = `${'a'.repeat(2041)}.example`;
|
||||
assert.lengthOf(domain, 2049, 'Test domain is the incorrect length');
|
||||
const link = `https://${domain}/foo/bar`;
|
||||
assert.isTrue(isLinkSneaky(link));
|
||||
});
|
||||
});
|
||||
|
||||
it('returns true if the domain is longer than 2048 UTF-16 code points', () => {
|
||||
const domain = `${'a'.repeat(2041)}.example`;
|
||||
assert.lengthOf(domain, 2049, 'Test domain is the incorrect length');
|
||||
const link = `https://${domain}/foo/bar`;
|
||||
assert.isTrue(isLinkSneaky(link));
|
||||
describe('pathname', () => {
|
||||
it('returns false for no pathname', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com'));
|
||||
assert.isFalse(isLinkSneaky('https://example.com/'));
|
||||
});
|
||||
|
||||
it('returns false if the pathname contains valid characters', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo'));
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo/bar'));
|
||||
assert.isFalse(
|
||||
isLinkSneaky("https://example.com/:/[]@!$&'()*+,;=abc123-._~%")
|
||||
);
|
||||
assert.isFalse(
|
||||
isLinkSneaky(
|
||||
'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1'
|
||||
)
|
||||
);
|
||||
});
|
||||
|
||||
it('returns true if the pathname contains invalid characters', () => {
|
||||
assert.isTrue(isLinkSneaky('https://example.com/hello world'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/aquí-está'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/hello\x00world'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/hello\nworld'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/hello😈world'));
|
||||
});
|
||||
});
|
||||
|
||||
it('returns false for regular @ in url', () => {
|
||||
const link =
|
||||
'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1';
|
||||
assert.strictEqual(isLinkSneaky(link), false);
|
||||
describe('query string', () => {
|
||||
it('returns false for no query', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo'));
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo?'));
|
||||
});
|
||||
|
||||
it('returns false if the query string contains valid characters', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo?bar'));
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo?bar=baz'));
|
||||
assert.isFalse(
|
||||
isLinkSneaky(
|
||||
"https://example.com/foo?bar=:/[]@!$&'()*+,;=abc123-._~%"
|
||||
)
|
||||
);
|
||||
assert.isFalse(
|
||||
isLinkSneaky(
|
||||
"https://example.com/foo?:/[]@!$&'()*+,;=abc123-._~%=baz"
|
||||
)
|
||||
);
|
||||
});
|
||||
|
||||
it('returns true if the query string contains invalid characters', () => {
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz=qux'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?bar=baz qux'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?aquí=está'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?hello=\x00world'));
|
||||
assert.isTrue(
|
||||
isLinkSneaky('https://example.com/foo?hello=hello\nworld')
|
||||
);
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo?hello=😈world'));
|
||||
});
|
||||
});
|
||||
|
||||
describe('hash', () => {
|
||||
it('returns false for no hash', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo'));
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo#'));
|
||||
});
|
||||
|
||||
it('returns false if the hash contains valid characters', () => {
|
||||
assert.isFalse(isLinkSneaky('https://example.com/foo#bar'));
|
||||
assert.isFalse(
|
||||
isLinkSneaky("https://example.com/foo#:/[]@!$&'()*+,;=abc123-._~%")
|
||||
);
|
||||
});
|
||||
|
||||
it('returns true if the hash contains invalid characters', () => {
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz=qux'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#bar=baz qux'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#aquí_está'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#hello\x00world'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#hello\nworld'));
|
||||
assert.isTrue(isLinkSneaky('https://example.com/foo#hello😈world'));
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
Loading…
Add table
Reference in a new issue