"Attachment Content" search improvements
- Fix incorrect results for ANY search with multiple "Attachment Content" conditions and no other conditions - Dramatically speed up single-word searches by avoiding unnecessary text scans (which probably addresses #1595) - Clean up code
This commit is contained in:
parent
977eb8d965
commit
1061893998
3 changed files with 176 additions and 144 deletions
|
@ -617,148 +617,130 @@ Zotero.Search.prototype.search = Zotero.Promise.coroutine(function* (asTempTable
|
|||
|
||||
//Zotero.debug('IDs from main search or subsearch: ');
|
||||
//Zotero.debug(ids);
|
||||
|
||||
//Zotero.debug('Join mode: ' + joinMode);
|
||||
|
||||
// Filter results with fulltext search
|
||||
// Filter results with full-text search
|
||||
//
|
||||
// If join mode ALL, return the (intersection of main and fulltext word search)
|
||||
// filtered by fulltext content
|
||||
// If join mode ALL, return the (intersection of main and full-text word search)
|
||||
// filtered by full-text content.
|
||||
//
|
||||
// If join mode ANY or there's a quicksearch (which we assume
|
||||
// fulltextContent is part of), return the union of the main search and
|
||||
// (a separate fulltext word search filtered by fulltext content)
|
||||
for (let condition of Object.values(this._conditions)){
|
||||
if (condition['condition']=='fulltextContent'){
|
||||
var fulltextWordIntersectionFilter = (val, index, array) => !!hash[val];
|
||||
var fulltextWordIntersectionConditionFilter = function(val, index, array) {
|
||||
return hash[val] ?
|
||||
(condition.operator == 'contains') :
|
||||
(condition.operator == 'doesNotContain');
|
||||
};
|
||||
// If join mode ANY or there's a quicksearch (which we assume fulltextContent is part of)
|
||||
// and the main search is filtered by other conditions, return the union of the main search
|
||||
// and (separate full-text word searches filtered by fulltext content).
|
||||
//
|
||||
// If join mode ANY or there's a quicksearch and the main search isn't filtered, return just
|
||||
// the union of (separate full-text word searches filtered by full-text content).
|
||||
var fullTextResults;
|
||||
var joinModeAny = joinMode == 'any' || hasQuicksearch;
|
||||
for (let condition of Object.values(this._conditions)) {
|
||||
if (condition.condition != 'fulltextContent') continue;
|
||||
|
||||
// Regexp mode -- don't use fulltext word index
|
||||
if (condition.mode && condition.mode.startsWith('regexp')) {
|
||||
// In an ANY search with other conditions that alter the results, only bother
|
||||
// scanning items that haven't already been found by the main search, as long as
|
||||
// they're in the right library
|
||||
if (joinMode == 'any' && this._hasPrimaryConditions) {
|
||||
if (!tmpTable) {
|
||||
tmpTable = yield Zotero.Search.idsToTempTable(ids);
|
||||
if (!fullTextResults) {
|
||||
// For join mode ANY, if we already filtered the main set, add those as results.
|
||||
// Otherwise, start with an empty set.
|
||||
fullTextResults = joinModeAny && this._hasPrimaryConditions
|
||||
? ids
|
||||
: [];
|
||||
}
|
||||
|
||||
var sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
|
||||
let scopeIDs;
|
||||
// Regexp mode -- don't use full-text word index
|
||||
let numSplits;
|
||||
if (condition.mode && condition.mode.startsWith('regexp')) {
|
||||
// In ANY mode, include items that haven't already been found, as long as they're in
|
||||
// the right library
|
||||
if (joinModeAny) {
|
||||
let tmpTable = yield Zotero.Search.idsToTempTable(fullTextResults);
|
||||
let sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
|
||||
+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
|
||||
if (this.libraryID) {
|
||||
sql += " AND libraryID=" + parseInt(this.libraryID);
|
||||
sql += " AND libraryID=?";
|
||||
}
|
||||
|
||||
var res = yield Zotero.DB.valueQueryAsync(sql);
|
||||
var scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
|
||||
let res = yield Zotero.DB.valueQueryAsync(sql, this.libraryID);
|
||||
scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
|
||||
yield Zotero.DB.queryAsync("DROP TABLE " + tmpTable);
|
||||
}
|
||||
// If an ALL search, scan only items from the main search
|
||||
// In ALL mode, include remaining items from the main search
|
||||
else {
|
||||
var scopeIDs = ids;
|
||||
scopeIDs = ids;
|
||||
}
|
||||
}
|
||||
// If not regexp mode, run a new search against the fulltext word
|
||||
// index for words in this phrase
|
||||
// If not regexp mode, run a new search against the full-text word index for words in
|
||||
// this phrase
|
||||
else {
|
||||
Zotero.debug('Running subsearch against fulltext word index');
|
||||
var s = new Zotero.Search();
|
||||
//Zotero.debug('Running subsearch against full-text word index');
|
||||
let s = new Zotero.Search();
|
||||
if (this.libraryID) {
|
||||
s.libraryID = this.libraryID;
|
||||
}
|
||||
|
||||
// Add any necessary conditions to the fulltext word search --
|
||||
// those that are required in an ANY search and any outside the
|
||||
// quicksearch in an ALL search
|
||||
for (let c of Object.values(this._conditions)) {
|
||||
if (c.condition == 'blockStart') {
|
||||
var inQS = true;
|
||||
continue;
|
||||
}
|
||||
else if (c.condition == 'blockEnd') {
|
||||
inQS = false;
|
||||
continue;
|
||||
}
|
||||
else if (c.condition == 'fulltextContent' || inQS) {
|
||||
continue;
|
||||
}
|
||||
else if (joinMode == 'any' && !c.required) {
|
||||
continue;
|
||||
}
|
||||
s.addCondition(c.condition, c.operator, c.value);
|
||||
}
|
||||
|
||||
var splits = Zotero.Fulltext.semanticSplitter(condition.value);
|
||||
let splits = Zotero.Fulltext.semanticSplitter(condition.value);
|
||||
for (let split of splits){
|
||||
s.addCondition('fulltextWord', condition.operator, split);
|
||||
}
|
||||
var fulltextWordIDs = yield s.search();
|
||||
numSplits = splits.length;
|
||||
let wordMatches = yield s.search();
|
||||
|
||||
//Zotero.debug("Fulltext word IDs");
|
||||
//Zotero.debug(fulltextWordIDs);
|
||||
//Zotero.debug("Word index matches");
|
||||
//Zotero.debug(wordMatches);
|
||||
|
||||
// If ALL mode, set intersection of main search and fulltext word index
|
||||
// as the scope for the fulltext content search
|
||||
if (joinMode == 'all' && !hasQuicksearch) {
|
||||
var hash = {};
|
||||
for (let i=0; i<fulltextWordIDs.length; i++) {
|
||||
hash[fulltextWordIDs[i]] = true;
|
||||
}
|
||||
|
||||
if (ids) {
|
||||
var scopeIDs = ids.filter(fulltextWordIntersectionFilter);
|
||||
// In ANY mode, include hits from word index that aren't already in the results
|
||||
if (joinModeAny) {
|
||||
let resultsSet = new Set(fullTextResults);
|
||||
scopeIDs = wordMatches.filter(id => !resultsSet.has(id));
|
||||
}
|
||||
// In ALL mode, include the intersection of hits from word index and remaining
|
||||
// main search matches
|
||||
else {
|
||||
var scopeIDs = [];
|
||||
}
|
||||
}
|
||||
// If ANY mode, just use fulltext word index hits for content search,
|
||||
// since the main results will be added in below
|
||||
else {
|
||||
var scopeIDs = fulltextWordIDs;
|
||||
let wordIDs = new Set(wordMatches);
|
||||
scopeIDs = ids.filter(id => wordIDs.has(id));
|
||||
}
|
||||
}
|
||||
|
||||
if (scopeIDs && scopeIDs.length) {
|
||||
var fulltextIDs = yield Zotero.Fulltext.findTextInItems(scopeIDs,
|
||||
condition['value'], condition['mode']);
|
||||
|
||||
var hash = {};
|
||||
for (let i=0; i<fulltextIDs.length; i++) {
|
||||
hash[fulltextIDs[i].id] = true;
|
||||
// If only one word, just use the results from the word index
|
||||
let filteredIDs = [];
|
||||
if (numSplits === 1) {
|
||||
filteredIDs = scopeIDs;
|
||||
}
|
||||
|
||||
filteredIDs = scopeIDs.filter(fulltextWordIntersectionConditionFilter);
|
||||
}
|
||||
else {
|
||||
var filteredIDs = [];
|
||||
// Search the full-text content
|
||||
else if (scopeIDs.length) {
|
||||
let found = new Set(
|
||||
yield Zotero.Fulltext.findTextInItems(
|
||||
scopeIDs,
|
||||
condition.value,
|
||||
condition.mode
|
||||
).map(x => x.id)
|
||||
);
|
||||
// Either include or exclude the results, depending on the operator
|
||||
filteredIDs = scopeIDs.filter((id) => {
|
||||
return found.has(id)
|
||||
? condition.operator == 'contains'
|
||||
: condition.operator == 'doesNotContain';
|
||||
});
|
||||
}
|
||||
|
||||
//Zotero.debug("Filtered IDs:")
|
||||
//Zotero.debug(filteredIDs);
|
||||
|
||||
// If join mode ANY, add any new items from the fulltext content
|
||||
// search to the main search results
|
||||
//
|
||||
// We only do this if there are primary conditions that alter the
|
||||
// main search, since otherwise all items will match
|
||||
if (this._hasPrimaryConditions && (joinMode == 'any' || hasQuicksearch)) {
|
||||
//Zotero.debug("Adding filtered IDs to main set");
|
||||
for (let i=0; i<filteredIDs.length; i++) {
|
||||
let id = filteredIDs[i];
|
||||
if (ids.indexOf(id) == -1) {
|
||||
ids.push(id);
|
||||
}
|
||||
// If join mode ANY, add any new items from the full-text content search to the results,
|
||||
// and remove from the scope so that we don't search through items we already matched
|
||||
if (joinModeAny) {
|
||||
//Zotero.debug("Adding filtered IDs to results and removing from scope");
|
||||
fullTextResults = fullTextResults.concat(filteredIDs);
|
||||
|
||||
let idSet = new Set(ids);
|
||||
for (let id of filteredIDs) {
|
||||
idSet.delete(id);
|
||||
}
|
||||
ids = Array.from(idSet);
|
||||
}
|
||||
else {
|
||||
//Zotero.debug("Replacing main set with filtered IDs");
|
||||
//Zotero.debug("Replacing results with filtered IDs");
|
||||
ids = filteredIDs;
|
||||
fullTextResults = filteredIDs;
|
||||
}
|
||||
}
|
||||
if (fullTextResults) {
|
||||
ids = Array.from(new Set(fullTextResults));
|
||||
}
|
||||
|
||||
if (this.hasPostSearchFilter() &&
|
||||
|
|
BIN
test/tests/data/search/baz.pdf
Normal file
BIN
test/tests/data/search/baz.pdf
Normal file
Binary file not shown.
|
@ -120,20 +120,29 @@ describe("Zotero.Search", function() {
|
|||
var userLibraryID;
|
||||
var fooItem;
|
||||
var foobarItem;
|
||||
var bazItem;
|
||||
var fooItemGroup;
|
||||
var foobarItemGroup;
|
||||
var bazItemGroup;
|
||||
|
||||
before(async function () {
|
||||
await resetDB({
|
||||
thisArg: this,
|
||||
skipBundledFiles: true
|
||||
});
|
||||
|
||||
before(function* () {
|
||||
// Hidden browser, which requires a browser window, needed for charset detection
|
||||
// (until we figure out a better way)
|
||||
win = yield loadBrowserWindow();
|
||||
fooItem = yield importFileAttachment("search/foo.html");
|
||||
foobarItem = yield importFileAttachment("search/foobar.html");
|
||||
win = await loadBrowserWindow();
|
||||
fooItem = await importFileAttachment("search/foo.html");
|
||||
foobarItem = await importFileAttachment("search/foobar.html");
|
||||
bazItem = await importFileAttachment("search/baz.pdf");
|
||||
userLibraryID = fooItem.libraryID;
|
||||
|
||||
let group = yield getGroup();
|
||||
fooItemGroup = yield importFileAttachment("search/foo.html", { libraryID: group.libraryID });
|
||||
foobarItemGroup = yield importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
|
||||
let group = await getGroup();
|
||||
fooItemGroup = await importFileAttachment("search/foo.html", { libraryID: group.libraryID });
|
||||
foobarItemGroup = await importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
|
||||
bazItemGroup = await importFileAttachment("search/baz.pdf", { libraryID: group.libraryID });
|
||||
});
|
||||
|
||||
after(function* () {
|
||||
|
@ -142,8 +151,10 @@ describe("Zotero.Search", function() {
|
|||
}
|
||||
yield fooItem.eraseTx();
|
||||
yield foobarItem.eraseTx();
|
||||
yield bazItem.eraseTx();
|
||||
yield fooItemGroup.eraseTx();
|
||||
yield foobarItemGroup.eraseTx();
|
||||
yield bazItemGroup.eraseTx();
|
||||
});
|
||||
|
||||
describe("Conditions", function () {
|
||||
|
@ -238,14 +249,25 @@ describe("Zotero.Search", function() {
|
|||
assert.sameMembers(matches, [foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find matching item with joinMode=ANY and non-matching other condition", function* () {
|
||||
it("should find matching items with joinMode=ANY with no other conditions", async function () {
|
||||
var s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextContent', 'contains', 'foo bar');
|
||||
s.addCondition('fulltextContent', 'contains', 'foo');
|
||||
s.addCondition('fulltextContent', 'contains', 'bar');
|
||||
var matches = await s.search();
|
||||
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find matching items with joinMode=ANY and non-matching other condition", function* () {
|
||||
var s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextContent', 'contains', 'foo');
|
||||
s.addCondition('fulltextContent', 'contains', 'bar');
|
||||
s.addCondition('title', 'contains', 'nomatch');
|
||||
var matches = yield s.search();
|
||||
assert.sameMembers(matches, [foobarItem.id]);
|
||||
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
|
||||
|
@ -287,6 +309,34 @@ describe("Zotero.Search", function() {
|
|||
var matches = yield s.search();
|
||||
assert.sameMembers(matches, [foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find items that don't contain a single word with joinMode=ANY", async function () {
|
||||
var s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextContent', 'doesNotContain', 'foo');
|
||||
var matches = await s.search();
|
||||
assert.notIncludeMembers(matches, [fooItem.id, foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find items that don't contain a phrase with joinMode=ANY", async function () {
|
||||
var s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextContent', 'doesNotContain', 'foo bar');
|
||||
var matches = await s.search();
|
||||
assert.notIncludeMembers(matches, [foobarItem.id]);
|
||||
});
|
||||
|
||||
it("should find items that don't contain a regexp pattern with joinMode=ANY", async function () {
|
||||
var s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextContent/regexp', 'doesNotContain', 'foo.+bar');
|
||||
var matches = await s.search();
|
||||
assert.notIncludeMembers(matches, [foobarItem.id]);
|
||||
assert.includeMembers(matches, [fooItem.id, bazItem.id]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("fulltextWord", function () {
|
||||
|
@ -302,7 +352,7 @@ describe("Zotero.Search", function() {
|
|||
it("should not return non-matches with full-text conditions", function* () {
|
||||
let s = new Zotero.Search();
|
||||
s.libraryID = userLibraryID;
|
||||
s.addCondition('fulltextWord', 'contains', 'baz');
|
||||
s.addCondition('fulltextWord', 'contains', 'nomatch');
|
||||
let matches = yield s.search();
|
||||
assert.lengthOf(matches, 0);
|
||||
});
|
||||
|
@ -332,7 +382,7 @@ describe("Zotero.Search", function() {
|
|||
s.libraryID = userLibraryID;
|
||||
s.addCondition('joinMode', 'any');
|
||||
s.addCondition('fulltextWord', 'contains', 'bar');
|
||||
s.addCondition('fulltextWord', 'contains', 'baz');
|
||||
s.addCondition('fulltextWord', 'contains', 'nomatch');
|
||||
let matches = yield s.search();
|
||||
assert.deepEqual(matches, [foobarItem.id]);
|
||||
});
|
||||
|
|
Loading…
Reference in a new issue