Use signal_tokenizer for search query

This commit is contained in:
Fedor Indutny 2023-12-04 18:38:40 +01:00 committed by GitHub
parent cc15d630a7
commit a81833d3ed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 15 additions and 52 deletions

View file

@ -1749,6 +1749,11 @@ async function searchMessages({
const db = getUnsafeWritableInstance('only temp table use');
const normalizedQuery = db
.signalTokenize(query)
.map(token => `"${token.replace(/"/g, '""')}"*`)
.join(' ');
// sqlite queries with a join on a virtual table (like FTS5) are de-optimized
// and can't use indices for ordering results. Instead an in-memory index of
// the join rows is sorted on the fly, and this becomes substantially
@ -1778,7 +1783,7 @@ async function searchMessages({
WHERE
messages_fts.body MATCH $query;
`
).run({ query });
).run({ query: normalizedQuery });
if (conversationId === undefined) {
db.prepare<Query>(
@ -1829,7 +1834,7 @@ async function searchMessages({
INNER JOIN messages
ON messages.rowid = tmp_filtered_results.rowid
WHERE
messages_fts.body MATCH ${query}
messages_fts.body MATCH ${normalizedQuery}
ORDER BY messages.received_at DESC, messages.sent_at DESC
LIMIT ${limit}
`;

View file

@ -6,7 +6,6 @@ import { debounce, omit, reject } from 'lodash';
import type { ReadonlyDeep } from 'type-fest';
import type { StateType as RootStateType } from '../reducer';
import { cleanSearchTerm } from '../../util/cleanSearchTerm';
import { filterAndSortConversationsByRecent } from '../../util/filterAndSortConversations';
import type {
ClientSearchResultMessageType,
@ -294,21 +293,20 @@ async function queryMessages({
contactServiceIdsMatchingQuery?: Array<ServiceIdString>;
}): Promise<Array<ClientSearchResultMessageType>> {
try {
const normalized = cleanSearchTerm(query);
if (normalized.length === 0) {
if (query.length === 0) {
return [];
}
if (searchConversationId) {
return dataSearchMessages({
query: normalized,
query,
conversationId: searchConversationId,
contactServiceIdsMatchingQuery,
});
}
return dataSearchMessages({
query: normalized,
query,
contactServiceIdsMatchingQuery,
});
} catch (e) {

View file

@ -1,13 +0,0 @@
// Copyright 2021 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
import { assert } from 'chai';
import { cleanSearchTerm } from '../../util/cleanSearchTerm';
// Unit test for cleanSearchTerm's handling of backslashes in the input.
describe('cleanSearchTerm', () => {
  it('should remove \\ from a search term', () => {
    // The expected output shows backslashes acting as token separators and
    // every resulting token carrying a trailing `*`.
    const expected = 'search* term*';
    assert.strictEqual(cleanSearchTerm('\\search\\term'), expected);
  });
});

View file

@ -1,27 +0,0 @@
// Copyright 2019 Signal Messenger, LLC
// SPDX-License-Identifier: AGPL-3.0-only
/**
 * Turns raw user input into a space-separated list of prefix-match tokens.
 *
 * The input is lowercased, every punctuation character listed in the pattern
 * below is treated as a separator, the words `and`, `or`, `not` and `near`
 * are dropped, and each remaining token gets a trailing `*`.
 *
 * @param searchTerm - The text to clean.
 * @returns The cleaned tokens joined by single spaces, or an empty string
 * when no token is left.
 */
export function cleanSearchTerm(searchTerm: string): string {
  const droppedWords = new Set(['and', 'or', 'not', 'near']);

  const separated = searchTerm
    .toLowerCase()
    .replace(/([-!"#$%&'()*+,./\\:;<=>?@[\]^_`{|}~])/g, ' ');

  const prefixTokens: Array<string> = [];
  for (const word of separated.split(/\s+/)) {
    // Punctuation has already been replaced by spaces, so only empty strings
    // (from leading/trailing whitespace) and the dropped words are skipped.
    if (word === '' || droppedWords.has(word)) {
      continue;
    }
    prefixTokens.push(`${word}*`);
  }

  return prefixTokens.join(' ');
}