"Attachment Content" search improvements
- Fix incorrect results for ANY search with multiple "Attachment Content" conditions and no other conditions
- Dramatically speed up single-word searches by avoiding unnecessary text scans (which probably addresses #1595)
- Clean up code
This commit is contained in:
parent
977eb8d965
commit
1061893998
3 changed files with 176 additions and 144 deletions
|
@ -617,148 +617,130 @@ Zotero.Search.prototype.search = Zotero.Promise.coroutine(function* (asTempTable
|
||||||
|
|
||||||
//Zotero.debug('IDs from main search or subsearch: ');
|
//Zotero.debug('IDs from main search or subsearch: ');
|
||||||
//Zotero.debug(ids);
|
//Zotero.debug(ids);
|
||||||
|
|
||||||
//Zotero.debug('Join mode: ' + joinMode);
|
//Zotero.debug('Join mode: ' + joinMode);
|
||||||
|
|
||||||
// Filter results with fulltext search
|
// Filter results with full-text search
|
||||||
//
|
//
|
||||||
// If join mode ALL, return the (intersection of main and fulltext word search)
|
// If join mode ALL, return the (intersection of main and full-text word search)
|
||||||
// filtered by fulltext content
|
// filtered by full-text content.
|
||||||
//
|
//
|
||||||
// If join mode ANY or there's a quicksearch (which we assume
|
// If join mode ANY or there's a quicksearch (which we assume fulltextContent is part of)
|
||||||
// fulltextContent is part of), return the union of the main search and
|
// and the main search is filtered by other conditions, return the union of the main search
|
||||||
// (a separate fulltext word search filtered by fulltext content)
|
// and (separate full-text word searches filtered by full-text content).
|
||||||
for (let condition of Object.values(this._conditions)){
|
//
|
||||||
if (condition['condition']=='fulltextContent'){
|
// If join mode ANY or there's a quicksearch and the main search isn't filtered, return just
|
||||||
var fulltextWordIntersectionFilter = (val, index, array) => !!hash[val];
|
// the union of (separate full-text word searches filtered by full-text content).
|
||||||
var fulltextWordIntersectionConditionFilter = function(val, index, array) {
|
var fullTextResults;
|
||||||
return hash[val] ?
|
var joinModeAny = joinMode == 'any' || hasQuicksearch;
|
||||||
(condition.operator == 'contains') :
|
for (let condition of Object.values(this._conditions)) {
|
||||||
(condition.operator == 'doesNotContain');
|
if (condition.condition != 'fulltextContent') continue;
|
||||||
};
|
|
||||||
|
if (!fullTextResults) {
|
||||||
// Regexp mode -- don't use fulltext word index
|
// For join mode ANY, if we already filtered the main set, add those as results.
|
||||||
if (condition.mode && condition.mode.startsWith('regexp')) {
|
// Otherwise, start with an empty set.
|
||||||
// In an ANY search with other conditions that alter the results, only bother
|
fullTextResults = joinModeAny && this._hasPrimaryConditions
|
||||||
// scanning items that haven't already been found by the main search, as long as
|
? ids
|
||||||
// they're in the right library
|
: [];
|
||||||
if (joinMode == 'any' && this._hasPrimaryConditions) {
|
}
|
||||||
if (!tmpTable) {
|
|
||||||
tmpTable = yield Zotero.Search.idsToTempTable(ids);
|
let scopeIDs;
|
||||||
}
|
// Regexp mode -- don't use full-text word index
|
||||||
|
let numSplits;
|
||||||
var sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
|
if (condition.mode && condition.mode.startsWith('regexp')) {
|
||||||
+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
|
// In ANY mode, include items that haven't already been found, as long as they're in
|
||||||
if (this.libraryID) {
|
// the right library
|
||||||
sql += " AND libraryID=" + parseInt(this.libraryID);
|
if (joinModeAny) {
|
||||||
}
|
let tmpTable = yield Zotero.Search.idsToTempTable(fullTextResults);
|
||||||
|
let sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
|
||||||
var res = yield Zotero.DB.valueQueryAsync(sql);
|
+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
|
||||||
var scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
|
|
||||||
}
|
|
||||||
// If an ALL search, scan only items from the main search
|
|
||||||
else {
|
|
||||||
var scopeIDs = ids;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// If not regexp mode, run a new search against the fulltext word
|
|
||||||
// index for words in this phrase
|
|
||||||
else {
|
|
||||||
Zotero.debug('Running subsearch against fulltext word index');
|
|
||||||
var s = new Zotero.Search();
|
|
||||||
if (this.libraryID) {
|
if (this.libraryID) {
|
||||||
s.libraryID = this.libraryID;
|
sql += " AND libraryID=?";
|
||||||
}
|
|
||||||
|
|
||||||
// Add any necessary conditions to the fulltext word search --
|
|
||||||
// those that are required in an ANY search and any outside the
|
|
||||||
// quicksearch in an ALL search
|
|
||||||
for (let c of Object.values(this._conditions)) {
|
|
||||||
if (c.condition == 'blockStart') {
|
|
||||||
var inQS = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (c.condition == 'blockEnd') {
|
|
||||||
inQS = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (c.condition == 'fulltextContent' || inQS) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (joinMode == 'any' && !c.required) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
s.addCondition(c.condition, c.operator, c.value);
|
|
||||||
}
|
|
||||||
|
|
||||||
var splits = Zotero.Fulltext.semanticSplitter(condition.value);
|
|
||||||
for (let split of splits){
|
|
||||||
s.addCondition('fulltextWord', condition.operator, split);
|
|
||||||
}
|
|
||||||
var fulltextWordIDs = yield s.search();
|
|
||||||
|
|
||||||
//Zotero.debug("Fulltext word IDs");
|
|
||||||
//Zotero.debug(fulltextWordIDs);
|
|
||||||
|
|
||||||
// If ALL mode, set intersection of main search and fulltext word index
|
|
||||||
// as the scope for the fulltext content search
|
|
||||||
if (joinMode == 'all' && !hasQuicksearch) {
|
|
||||||
var hash = {};
|
|
||||||
for (let i=0; i<fulltextWordIDs.length; i++) {
|
|
||||||
hash[fulltextWordIDs[i]] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ids) {
|
|
||||||
var scopeIDs = ids.filter(fulltextWordIntersectionFilter);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
var scopeIDs = [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// If ANY mode, just use fulltext word index hits for content search,
|
|
||||||
// since the main results will be added in below
|
|
||||||
else {
|
|
||||||
var scopeIDs = fulltextWordIDs;
|
|
||||||
}
|
}
|
||||||
|
let res = yield Zotero.DB.valueQueryAsync(sql, this.libraryID);
|
||||||
|
scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
|
||||||
|
yield Zotero.DB.queryAsync("DROP TABLE " + tmpTable);
|
||||||
}
|
}
|
||||||
|
// In ALL mode, include remaining items from the main search
|
||||||
if (scopeIDs && scopeIDs.length) {
|
|
||||||
var fulltextIDs = yield Zotero.Fulltext.findTextInItems(scopeIDs,
|
|
||||||
condition['value'], condition['mode']);
|
|
||||||
|
|
||||||
var hash = {};
|
|
||||||
for (let i=0; i<fulltextIDs.length; i++) {
|
|
||||||
hash[fulltextIDs[i].id] = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
filteredIDs = scopeIDs.filter(fulltextWordIntersectionConditionFilter);
|
|
||||||
}
|
|
||||||
else {
|
else {
|
||||||
var filteredIDs = [];
|
scopeIDs = ids;
|
||||||
}
|
|
||||||
|
|
||||||
//Zotero.debug("Filtered IDs:")
|
|
||||||
//Zotero.debug(filteredIDs);
|
|
||||||
|
|
||||||
// If join mode ANY, add any new items from the fulltext content
|
|
||||||
// search to the main search results
|
|
||||||
//
|
|
||||||
// We only do this if there are primary conditions that alter the
|
|
||||||
// main search, since otherwise all items will match
|
|
||||||
if (this._hasPrimaryConditions && (joinMode == 'any' || hasQuicksearch)) {
|
|
||||||
//Zotero.debug("Adding filtered IDs to main set");
|
|
||||||
for (let i=0; i<filteredIDs.length; i++) {
|
|
||||||
let id = filteredIDs[i];
|
|
||||||
if (ids.indexOf(id) == -1) {
|
|
||||||
ids.push(id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
//Zotero.debug("Replacing main set with filtered IDs");
|
|
||||||
ids = filteredIDs;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// If not regexp mode, run a new search against the full-text word index for words in
|
||||||
|
// this phrase
|
||||||
|
else {
|
||||||
|
//Zotero.debug('Running subsearch against full-text word index');
|
||||||
|
let s = new Zotero.Search();
|
||||||
|
if (this.libraryID) {
|
||||||
|
s.libraryID = this.libraryID;
|
||||||
|
}
|
||||||
|
let splits = Zotero.Fulltext.semanticSplitter(condition.value);
|
||||||
|
for (let split of splits){
|
||||||
|
s.addCondition('fulltextWord', condition.operator, split);
|
||||||
|
}
|
||||||
|
numSplits = splits.length;
|
||||||
|
let wordMatches = yield s.search();
|
||||||
|
|
||||||
|
//Zotero.debug("Word index matches");
|
||||||
|
//Zotero.debug(wordMatches);
|
||||||
|
|
||||||
|
// In ANY mode, include hits from word index that aren't already in the results
|
||||||
|
if (joinModeAny) {
|
||||||
|
let resultsSet = new Set(fullTextResults);
|
||||||
|
scopeIDs = wordMatches.filter(id => !resultsSet.has(id));
|
||||||
|
}
|
||||||
|
// In ALL mode, include the intersection of hits from word index and remaining
|
||||||
|
// main search matches
|
||||||
|
else {
|
||||||
|
let wordIDs = new Set(wordMatches);
|
||||||
|
scopeIDs = ids.filter(id => wordIDs.has(id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If only one word, just use the results from the word index
|
||||||
|
let filteredIDs = [];
|
||||||
|
if (numSplits === 1) {
|
||||||
|
filteredIDs = scopeIDs;
|
||||||
|
}
|
||||||
|
// Search the full-text content
|
||||||
|
else if (scopeIDs.length) {
|
||||||
|
let found = new Set(
|
||||||
|
yield Zotero.Fulltext.findTextInItems(
|
||||||
|
scopeIDs,
|
||||||
|
condition.value,
|
||||||
|
condition.mode
|
||||||
|
).map(x => x.id)
|
||||||
|
);
|
||||||
|
// Either include or exclude the results, depending on the operator
|
||||||
|
filteredIDs = scopeIDs.filter((id) => {
|
||||||
|
return found.has(id)
|
||||||
|
? condition.operator == 'contains'
|
||||||
|
: condition.operator == 'doesNotContain';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
//Zotero.debug("Filtered IDs:")
|
||||||
|
//Zotero.debug(filteredIDs);
|
||||||
|
|
||||||
|
// If join mode ANY, add any new items from the full-text content search to the results,
|
||||||
|
// and remove from the scope so that we don't search through items we already matched
|
||||||
|
if (joinModeAny) {
|
||||||
|
//Zotero.debug("Adding filtered IDs to results and removing from scope");
|
||||||
|
fullTextResults = fullTextResults.concat(filteredIDs);
|
||||||
|
|
||||||
|
let idSet = new Set(ids);
|
||||||
|
for (let id of filteredIDs) {
|
||||||
|
idSet.delete(id);
|
||||||
|
}
|
||||||
|
ids = Array.from(idSet);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
//Zotero.debug("Replacing results with filtered IDs");
|
||||||
|
ids = filteredIDs;
|
||||||
|
fullTextResults = filteredIDs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (fullTextResults) {
|
||||||
|
ids = Array.from(new Set(fullTextResults));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.hasPostSearchFilter() &&
|
if (this.hasPostSearchFilter() &&
|
||||||
|
|
BIN
test/tests/data/search/baz.pdf
Normal file
BIN
test/tests/data/search/baz.pdf
Normal file
Binary file not shown.
|
@ -120,20 +120,29 @@ describe("Zotero.Search", function() {
|
||||||
var userLibraryID;
|
var userLibraryID;
|
||||||
var fooItem;
|
var fooItem;
|
||||||
var foobarItem;
|
var foobarItem;
|
||||||
|
var bazItem;
|
||||||
var fooItemGroup;
|
var fooItemGroup;
|
||||||
var foobarItemGroup;
|
var foobarItemGroup;
|
||||||
|
var bazItemGroup;
|
||||||
|
|
||||||
before(function* () {
|
before(async function () {
|
||||||
|
await resetDB({
|
||||||
|
thisArg: this,
|
||||||
|
skipBundledFiles: true
|
||||||
|
});
|
||||||
|
|
||||||
// Hidden browser, which requires a browser window, needed for charset detection
|
// Hidden browser, which requires a browser window, needed for charset detection
|
||||||
// (until we figure out a better way)
|
// (until we figure out a better way)
|
||||||
win = yield loadBrowserWindow();
|
win = await loadBrowserWindow();
|
||||||
fooItem = yield importFileAttachment("search/foo.html");
|
fooItem = await importFileAttachment("search/foo.html");
|
||||||
foobarItem = yield importFileAttachment("search/foobar.html");
|
foobarItem = await importFileAttachment("search/foobar.html");
|
||||||
|
bazItem = await importFileAttachment("search/baz.pdf");
|
||||||
userLibraryID = fooItem.libraryID;
|
userLibraryID = fooItem.libraryID;
|
||||||
|
|
||||||
let group = yield getGroup();
|
let group = await getGroup();
|
||||||
fooItemGroup = yield importFileAttachment("search/foo.html", { libraryID: group.libraryID });
|
fooItemGroup = await importFileAttachment("search/foo.html", { libraryID: group.libraryID });
|
||||||
foobarItemGroup = yield importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
|
foobarItemGroup = await importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
|
||||||
|
bazItemGroup = await importFileAttachment("search/baz.pdf", { libraryID: group.libraryID });
|
||||||
});
|
});
|
||||||
|
|
||||||
after(function* () {
|
after(function* () {
|
||||||
|
@ -142,8 +151,10 @@ describe("Zotero.Search", function() {
|
||||||
}
|
}
|
||||||
yield fooItem.eraseTx();
|
yield fooItem.eraseTx();
|
||||||
yield foobarItem.eraseTx();
|
yield foobarItem.eraseTx();
|
||||||
|
yield bazItem.eraseTx();
|
||||||
yield fooItemGroup.eraseTx();
|
yield fooItemGroup.eraseTx();
|
||||||
yield foobarItemGroup.eraseTx();
|
yield foobarItemGroup.eraseTx();
|
||||||
|
yield bazItemGroup.eraseTx();
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("Conditions", function () {
|
describe("Conditions", function () {
|
||||||
|
@ -238,14 +249,25 @@ describe("Zotero.Search", function() {
|
||||||
assert.sameMembers(matches, [foobarItem.id]);
|
assert.sameMembers(matches, [foobarItem.id]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should find matching item with joinMode=ANY and non-matching other condition", function* () {
|
it("should find matching items with joinMode=ANY with no other conditions", async function () {
|
||||||
var s = new Zotero.Search();
|
var s = new Zotero.Search();
|
||||||
s.libraryID = userLibraryID;
|
s.libraryID = userLibraryID;
|
||||||
s.addCondition('joinMode', 'any');
|
s.addCondition('joinMode', 'any');
|
||||||
s.addCondition('fulltextContent', 'contains', 'foo bar');
|
s.addCondition('fulltextContent', 'contains', 'foo');
|
||||||
|
s.addCondition('fulltextContent', 'contains', 'bar');
|
||||||
|
var matches = await s.search();
|
||||||
|
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should find matching items with joinMode=ANY and non-matching other condition", function* () {
|
||||||
|
var s = new Zotero.Search();
|
||||||
|
s.libraryID = userLibraryID;
|
||||||
|
s.addCondition('joinMode', 'any');
|
||||||
|
s.addCondition('fulltextContent', 'contains', 'foo');
|
||||||
|
s.addCondition('fulltextContent', 'contains', 'bar');
|
||||||
s.addCondition('title', 'contains', 'nomatch');
|
s.addCondition('title', 'contains', 'nomatch');
|
||||||
var matches = yield s.search();
|
var matches = yield s.search();
|
||||||
assert.sameMembers(matches, [foobarItem.id]);
|
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
|
it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
|
||||||
|
@ -287,6 +309,34 @@ describe("Zotero.Search", function() {
|
||||||
var matches = yield s.search();
|
var matches = yield s.search();
|
||||||
assert.sameMembers(matches, [foobarItem.id]);
|
assert.sameMembers(matches, [foobarItem.id]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should find items that don't contain a single word with joinMode=ANY", async function () {
|
||||||
|
var s = new Zotero.Search();
|
||||||
|
s.libraryID = userLibraryID;
|
||||||
|
s.addCondition('joinMode', 'any');
|
||||||
|
s.addCondition('fulltextContent', 'doesNotContain', 'foo');
|
||||||
|
var matches = await s.search();
|
||||||
|
assert.notIncludeMembers(matches, [fooItem.id, foobarItem.id]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should find items that don't contain a phrase with joinMode=ANY", async function () {
|
||||||
|
var s = new Zotero.Search();
|
||||||
|
s.libraryID = userLibraryID;
|
||||||
|
s.addCondition('joinMode', 'any');
|
||||||
|
s.addCondition('fulltextContent', 'doesNotContain', 'foo bar');
|
||||||
|
var matches = await s.search();
|
||||||
|
assert.notIncludeMembers(matches, [foobarItem.id]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("should find items that don't contain a regexp pattern with joinMode=ANY", async function () {
|
||||||
|
var s = new Zotero.Search();
|
||||||
|
s.libraryID = userLibraryID;
|
||||||
|
s.addCondition('joinMode', 'any');
|
||||||
|
s.addCondition('fulltextContent/regexp', 'doesNotContain', 'foo.+bar');
|
||||||
|
var matches = await s.search();
|
||||||
|
assert.notIncludeMembers(matches, [foobarItem.id]);
|
||||||
|
assert.includeMembers(matches, [fooItem.id, bazItem.id]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("fulltextWord", function () {
|
describe("fulltextWord", function () {
|
||||||
|
@ -302,7 +352,7 @@ describe("Zotero.Search", function() {
|
||||||
it("should not return non-matches with full-text conditions", function* () {
|
it("should not return non-matches with full-text conditions", function* () {
|
||||||
let s = new Zotero.Search();
|
let s = new Zotero.Search();
|
||||||
s.libraryID = userLibraryID;
|
s.libraryID = userLibraryID;
|
||||||
s.addCondition('fulltextWord', 'contains', 'baz');
|
s.addCondition('fulltextWord', 'contains', 'nomatch');
|
||||||
let matches = yield s.search();
|
let matches = yield s.search();
|
||||||
assert.lengthOf(matches, 0);
|
assert.lengthOf(matches, 0);
|
||||||
});
|
});
|
||||||
|
@ -332,7 +382,7 @@ describe("Zotero.Search", function() {
|
||||||
s.libraryID = userLibraryID;
|
s.libraryID = userLibraryID;
|
||||||
s.addCondition('joinMode', 'any');
|
s.addCondition('joinMode', 'any');
|
||||||
s.addCondition('fulltextWord', 'contains', 'bar');
|
s.addCondition('fulltextWord', 'contains', 'bar');
|
||||||
s.addCondition('fulltextWord', 'contains', 'baz');
|
s.addCondition('fulltextWord', 'contains', 'nomatch');
|
||||||
let matches = yield s.search();
|
let matches = yield s.search();
|
||||||
assert.deepEqual(matches, [foobarItem.id]);
|
assert.deepEqual(matches, [foobarItem.id]);
|
||||||
});
|
});
|
||||||
|
|
Loading…
Reference in a new issue