Use signal_tokenizer for search query

This commit is contained in:
Fedor Indutny 2023-12-04 18:38:40 +01:00 committed by GitHub
parent cc15d630a7
commit a81833d3ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 15 additions and 52 deletions

View file

@ -95,7 +95,7 @@
"@popperjs/core": "2.11.6",
"@react-aria/utils": "3.16.0",
"@react-spring/web": "9.5.5",
"@signalapp/better-sqlite3": "8.5.2",
"@signalapp/better-sqlite3": "8.6.0",
"@signalapp/libsignal-client": "0.32.1",
"@signalapp/ringrtc": "2.34.5",
"@signalapp/windows-dummy-keystroke": "1.0.0",

View file

@ -1749,6 +1749,11 @@ async function searchMessages({
const db = getUnsafeWritableInstance('only temp table use');
const normalizedQuery = db
.signalTokenize(query)
.map(token => `"${token.replace(/"/g, '""')}"*`)
.join(' ');
// sqlite queries with a join on a virtual table (like FTS5) are de-optimized
// and can't use indices for ordering results. Instead an in-memory index of
// the join rows is sorted on the fly, and this becomes substantially
@ -1778,7 +1783,7 @@ async function searchMessages({
WHERE
messages_fts.body MATCH $query;
`
).run({ query });
).run({ query: normalizedQuery });
if (conversationId === undefined) {
db.prepare<Query>(
@ -1829,7 +1834,7 @@ async function searchMessages({
INNER JOIN messages
ON messages.rowid = tmp_filtered_results.rowid
WHERE
messages_fts.body MATCH ${query}
messages_fts.body MATCH ${normalizedQuery}
ORDER BY messages.received_at DESC, messages.sent_at DESC
LIMIT ${limit}
`;

View file

@ -6,7 +6,6 @@ import { debounce, omit, reject } from 'lodash';
import type { ReadonlyDeep } from 'type-fest';
import type { StateType as RootStateType } from '../reducer';
import { cleanSearchTerm } from '../../util/cleanSearchTerm';
import { filterAndSortConversationsByRecent } from '../../util/filterAndSortConversations';
import type {
ClientSearchResultMessageType,
@ -294,21 +293,20 @@ async function queryMessages({
contactServiceIdsMatchingQuery?: Array<ServiceIdString>;
}): Promise<Array<ClientSearchResultMessageType>> {
try {
const normalized = cleanSearchTerm(query);
if (normalized.length === 0) {
if (query.length === 0) {
return [];
}
if (searchConversationId) {
return dataSearchMessages({
query: normalized,
query,
conversationId: searchConversationId,
contactServiceIdsMatchingQuery,
});
}
return dataSearchMessages({
query: normalized,
query,
contactServiceIdsMatchingQuery,
});
} catch (e) {

View file

@ -1,13 +0,0 @@
// Copyright 2021 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
import { assert } from 'chai';
import { cleanSearchTerm } from '../../util/cleanSearchTerm';
// Spec for cleanSearchTerm: checks that backslashes are treated as separators
// and that each remaining word gets a trailing `*`.
describe('cleanSearchTerm', () => {
  it('should remove \\ from a search term', () => {
    // Two backslash-prefixed words; each backslash should become a token break.
    const rawInput = '\\search\\term';

    assert.strictEqual(cleanSearchTerm(rawInput), 'search* term*');
  });
});

View file

@ -1,27 +0,0 @@
// Copyright 2019 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
/**
 * Normalizes a raw search string into a space-separated list of prefix terms.
 *
 * Steps, in order:
 *  1. Lowercase the input.
 *  2. Replace every ASCII punctuation character with a space.
 *  3. Collapse whitespace runs and split on single spaces.
 *  4. Drop empty pieces and the words `and`, `or`, `not`, `near`.
 *  5. Append `*` to each surviving piece and join with single spaces.
 *
 * @param searchTerm - Text to normalize.
 * @returns The normalized terms, or an empty string when nothing survives.
 */
export function cleanSearchTerm(searchTerm: string): string {
  const punctuation = /([-!"#$%&'()*+,./\\:;<=>?@[\]^_`{|}~])/g;

  // The punctuation entries below cannot actually occur once `punctuation`
  // has been replaced by spaces; they are kept so the set of rejected tokens
  // stays exactly what it has always been.
  const rejected = new Set(['and', 'or', 'not', ')', '(', '+', ',', 'near']);

  const pieces = searchTerm
    .toLowerCase()
    .replace(punctuation, ' ')
    .replace(/\s+/g, ' ')
    .split(' ');

  const terms: Array<string> = [];
  for (const piece of pieces) {
    // Leading/trailing spaces produce empty pieces after the split.
    if (piece.length === 0 || rejected.has(piece)) {
      continue;
    }
    terms.push(`${piece}*`);
  }

  return terms.join(' ').trim();
}

View file

@ -3902,10 +3902,10 @@
resolved "https://registry.yarnpkg.com/@sideway/pinpoint/-/pinpoint-2.0.0.tgz#cff8ffadc372ad29fd3f78277aeb29e632cc70df"
integrity sha512-RNiOoTPkptFtSVzQevY/yWtZwf/RxyVnPy/OcA9HBM3MlGDnBEYL5B41H0MTn0Uec8Hi+2qUtTfG2WWZBmMejQ==
"@signalapp/better-sqlite3@8.5.2":
version "8.5.2"
resolved "https://registry.yarnpkg.com/@signalapp/better-sqlite3/-/better-sqlite3-8.5.2.tgz#910669f44e76a46d06df45fabefcd3ac2e7c4cce"
integrity sha512-t7XalDxuRP115EratM6i1kbvIXJvzETcl8wqnt3NlWZdzil7kelS/RYz+PE1G+z8ZwtFyn/ViAFMt76AsArifw==
"@signalapp/better-sqlite3@8.6.0":
version "8.6.0"
resolved "https://registry.yarnpkg.com/@signalapp/better-sqlite3/-/better-sqlite3-8.6.0.tgz#0413f4d0626b99838cd64ad09c88720aa2bec6ed"
integrity sha512-dSLWG4m6XtPq/jbUjckLaiR/nFFkY95pWZI8VSm0dEVJC8S2YTXHm6VZ7vZiErt4h6EjBaa827WyK1oheElE2A==
dependencies:
bindings "^1.5.0"
tar "^6.1.0"