"Attachment Content" search improvements

- Fix incorrect results for ANY search with multiple "Attachment
  Content" conditions and no other conditions
- Dramatically speed up single-word searches by avoiding unnecessary
  text scans (which probably addresses #1595)
- Clean up code
This commit is contained in:
Dan Stillman 2019-02-19 01:58:22 -05:00
parent 977eb8d965
commit 1061893998
3 changed files with 176 additions and 144 deletions

View file

@ -617,148 +617,130 @@ Zotero.Search.prototype.search = Zotero.Promise.coroutine(function* (asTempTable
//Zotero.debug('IDs from main search or subsearch: ');
//Zotero.debug(ids);
//Zotero.debug('Join mode: ' + joinMode);
// Filter results with fulltext search
// Filter results with full-text search
//
// If join mode ALL, return the (intersection of main and fulltext word search)
// filtered by fulltext content
// If join mode ALL, return the (intersection of main and full-text word search)
// filtered by full-text content.
//
// If join mode ANY or there's a quicksearch (which we assume
// fulltextContent is part of), return the union of the main search and
// (a separate fulltext word search filtered by fulltext content)
// If join mode ANY or there's a quicksearch (which we assume fulltextContent is part of)
// and the main search is filtered by other conditions, return the union of the main search
// and (separate full-text word searches filtered by fulltext content).
//
// If join mode ANY or there's a quicksearch and the main search isn't filtered, return just
// the union of (separate full-text word searches filtered by full-text content).
var fullTextResults;
var joinModeAny = joinMode == 'any' || hasQuicksearch;
for (let condition of Object.values(this._conditions)) {
if (condition['condition']=='fulltextContent'){
var fulltextWordIntersectionFilter = (val, index, array) => !!hash[val];
var fulltextWordIntersectionConditionFilter = function(val, index, array) {
return hash[val] ?
(condition.operator == 'contains') :
(condition.operator == 'doesNotContain');
};
if (condition.condition != 'fulltextContent') continue;
// Regexp mode -- don't use fulltext word index
if (condition.mode && condition.mode.startsWith('regexp')) {
// In an ANY search with other conditions that alter the results, only bother
// scanning items that haven't already been found by the main search, as long as
// they're in the right library
if (joinMode == 'any' && this._hasPrimaryConditions) {
if (!tmpTable) {
tmpTable = yield Zotero.Search.idsToTempTable(ids);
if (!fullTextResults) {
// For join mode ANY, if we already filtered the main set, add those as results.
// Otherwise, start with an empty set.
fullTextResults = joinModeAny && this._hasPrimaryConditions
? ids
: [];
}
var sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
let scopeIDs;
// Regexp mode -- don't use full-text word index
let numSplits;
if (condition.mode && condition.mode.startsWith('regexp')) {
// In ANY mode, include items that haven't already been found, as long as they're in
// the right library
if (joinModeAny) {
let tmpTable = yield Zotero.Search.idsToTempTable(fullTextResults);
let sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
if (this.libraryID) {
sql += " AND libraryID=" + parseInt(this.libraryID);
sql += " AND libraryID=?";
}
var res = yield Zotero.DB.valueQueryAsync(sql);
var scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
let res = yield Zotero.DB.valueQueryAsync(sql, this.libraryID);
scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
yield Zotero.DB.queryAsync("DROP TABLE " + tmpTable);
}
// If an ALL search, scan only items from the main search
// In ALL mode, include remaining items from the main search
else {
var scopeIDs = ids;
scopeIDs = ids;
}
}
// If not regexp mode, run a new search against the fulltext word
// index for words in this phrase
// If not regexp mode, run a new search against the full-text word index for words in
// this phrase
else {
Zotero.debug('Running subsearch against fulltext word index');
var s = new Zotero.Search();
//Zotero.debug('Running subsearch against full-text word index');
let s = new Zotero.Search();
if (this.libraryID) {
s.libraryID = this.libraryID;
}
// Add any necessary conditions to the fulltext word search --
// those that are required in an ANY search and any outside the
// quicksearch in an ALL search
for (let c of Object.values(this._conditions)) {
if (c.condition == 'blockStart') {
var inQS = true;
continue;
}
else if (c.condition == 'blockEnd') {
inQS = false;
continue;
}
else if (c.condition == 'fulltextContent' || inQS) {
continue;
}
else if (joinMode == 'any' && !c.required) {
continue;
}
s.addCondition(c.condition, c.operator, c.value);
}
var splits = Zotero.Fulltext.semanticSplitter(condition.value);
let splits = Zotero.Fulltext.semanticSplitter(condition.value);
for (let split of splits){
s.addCondition('fulltextWord', condition.operator, split);
}
var fulltextWordIDs = yield s.search();
numSplits = splits.length;
let wordMatches = yield s.search();
//Zotero.debug("Fulltext word IDs");
//Zotero.debug(fulltextWordIDs);
//Zotero.debug("Word index matches");
//Zotero.debug(wordMatches);
// If ALL mode, set intersection of main search and fulltext word index
// as the scope for the fulltext content search
if (joinMode == 'all' && !hasQuicksearch) {
var hash = {};
for (let i=0; i<fulltextWordIDs.length; i++) {
hash[fulltextWordIDs[i]] = true;
}
if (ids) {
var scopeIDs = ids.filter(fulltextWordIntersectionFilter);
// In ANY mode, include hits from word index that aren't already in the results
if (joinModeAny) {
let resultsSet = new Set(fullTextResults);
scopeIDs = wordMatches.filter(id => !resultsSet.has(id));
}
// In ALL mode, include the intersection of hits from word index and remaining
// main search matches
else {
var scopeIDs = [];
}
}
// If ANY mode, just use fulltext word index hits for content search,
// since the main results will be added in below
else {
var scopeIDs = fulltextWordIDs;
let wordIDs = new Set(wordMatches);
scopeIDs = ids.filter(id => wordIDs.has(id));
}
}
if (scopeIDs && scopeIDs.length) {
var fulltextIDs = yield Zotero.Fulltext.findTextInItems(scopeIDs,
condition['value'], condition['mode']);
var hash = {};
for (let i=0; i<fulltextIDs.length; i++) {
hash[fulltextIDs[i].id] = true;
// If only one word, just use the results from the word index
let filteredIDs = [];
if (numSplits === 1) {
filteredIDs = scopeIDs;
}
filteredIDs = scopeIDs.filter(fulltextWordIntersectionConditionFilter);
}
else {
var filteredIDs = [];
// Search the full-text content
else if (scopeIDs.length) {
let found = new Set(
yield Zotero.Fulltext.findTextInItems(
scopeIDs,
condition.value,
condition.mode
).map(x => x.id)
);
// Either include or exclude the results, depending on the operator
filteredIDs = scopeIDs.filter((id) => {
return found.has(id)
? condition.operator == 'contains'
: condition.operator == 'doesNotContain';
});
}
//Zotero.debug("Filtered IDs:")
//Zotero.debug(filteredIDs);
// If join mode ANY, add any new items from the fulltext content
// search to the main search results
//
// We only do this if there are primary conditions that alter the
// main search, since otherwise all items will match
if (this._hasPrimaryConditions && (joinMode == 'any' || hasQuicksearch)) {
//Zotero.debug("Adding filtered IDs to main set");
for (let i=0; i<filteredIDs.length; i++) {
let id = filteredIDs[i];
if (ids.indexOf(id) == -1) {
ids.push(id);
}
// If join mode ANY, add any new items from the full-text content search to the results,
// and remove from the scope so that we don't search through items we already matched
if (joinModeAny) {
//Zotero.debug("Adding filtered IDs to results and removing from scope");
fullTextResults = fullTextResults.concat(filteredIDs);
let idSet = new Set(ids);
for (let id of filteredIDs) {
idSet.delete(id);
}
ids = Array.from(idSet);
}
else {
//Zotero.debug("Replacing main set with filtered IDs");
//Zotero.debug("Replacing results with filtered IDs");
ids = filteredIDs;
fullTextResults = filteredIDs;
}
}
if (fullTextResults) {
ids = Array.from(new Set(fullTextResults));
}
if (this.hasPostSearchFilter() &&

Binary file not shown.

View file

@ -120,20 +120,29 @@ describe("Zotero.Search", function() {
var userLibraryID;
var fooItem;
var foobarItem;
var bazItem;
var fooItemGroup;
var foobarItemGroup;
var bazItemGroup;
before(async function () {
await resetDB({
thisArg: this,
skipBundledFiles: true
});
before(function* () {
// Hidden browser, which requires a browser window, needed for charset detection
// (until we figure out a better way)
win = yield loadBrowserWindow();
fooItem = yield importFileAttachment("search/foo.html");
foobarItem = yield importFileAttachment("search/foobar.html");
win = await loadBrowserWindow();
fooItem = await importFileAttachment("search/foo.html");
foobarItem = await importFileAttachment("search/foobar.html");
bazItem = await importFileAttachment("search/baz.pdf");
userLibraryID = fooItem.libraryID;
let group = yield getGroup();
fooItemGroup = yield importFileAttachment("search/foo.html", { libraryID: group.libraryID });
foobarItemGroup = yield importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
let group = await getGroup();
fooItemGroup = await importFileAttachment("search/foo.html", { libraryID: group.libraryID });
foobarItemGroup = await importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
bazItemGroup = await importFileAttachment("search/baz.pdf", { libraryID: group.libraryID });
});
after(function* () {
@ -142,8 +151,10 @@ describe("Zotero.Search", function() {
}
yield fooItem.eraseTx();
yield foobarItem.eraseTx();
yield bazItem.eraseTx();
yield fooItemGroup.eraseTx();
yield foobarItemGroup.eraseTx();
yield bazItemGroup.eraseTx();
});
describe("Conditions", function () {
@ -238,14 +249,25 @@ describe("Zotero.Search", function() {
assert.sameMembers(matches, [foobarItem.id]);
});
it("should find matching item with joinMode=ANY and non-matching other condition", function* () {
it("should find matching items with joinMode=ANY with no other conditions", async function () {
var s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextContent', 'contains', 'foo bar');
s.addCondition('fulltextContent', 'contains', 'foo');
s.addCondition('fulltextContent', 'contains', 'bar');
var matches = await s.search();
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
});
it("should find matching items with joinMode=ANY and non-matching other condition", function* () {
var s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextContent', 'contains', 'foo');
s.addCondition('fulltextContent', 'contains', 'bar');
s.addCondition('title', 'contains', 'nomatch');
var matches = yield s.search();
assert.sameMembers(matches, [foobarItem.id]);
assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
});
it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
@ -287,6 +309,34 @@ describe("Zotero.Search", function() {
var matches = yield s.search();
assert.sameMembers(matches, [foobarItem.id]);
});
it("should find items that don't contain a single word with joinMode=ANY", async function () {
var s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextContent', 'doesNotContain', 'foo');
var matches = await s.search();
assert.notIncludeMembers(matches, [fooItem.id, foobarItem.id]);
});
it("should find items that don't contain a phrase with joinMode=ANY", async function () {
var s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextContent', 'doesNotContain', 'foo bar');
var matches = await s.search();
assert.notIncludeMembers(matches, [foobarItem.id]);
});
it("should find items that don't contain a regexp pattern with joinMode=ANY", async function () {
var s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextContent/regexp', 'doesNotContain', 'foo.+bar');
var matches = await s.search();
assert.notIncludeMembers(matches, [foobarItem.id]);
assert.includeMembers(matches, [fooItem.id, bazItem.id]);
});
});
describe("fulltextWord", function () {
@ -302,7 +352,7 @@ describe("Zotero.Search", function() {
it("should not return non-matches with full-text conditions", function* () {
let s = new Zotero.Search();
s.libraryID = userLibraryID;
s.addCondition('fulltextWord', 'contains', 'baz');
s.addCondition('fulltextWord', 'contains', 'nomatch');
let matches = yield s.search();
assert.lengthOf(matches, 0);
});
@ -332,7 +382,7 @@ describe("Zotero.Search", function() {
s.libraryID = userLibraryID;
s.addCondition('joinMode', 'any');
s.addCondition('fulltextWord', 'contains', 'bar');
s.addCondition('fulltextWord', 'contains', 'baz');
s.addCondition('fulltextWord', 'contains', 'nomatch');
let matches = yield s.search();
assert.deepEqual(matches, [foobarItem.id]);
});