"Attachment Content" search improvements

- Fix incorrect results for ANY search with multiple "Attachment Content" conditions and no other conditions
- Dramatically speed up single-word searches by avoiding unnecessary text scans (which probably addresses #1595)
- Clean up code
2019-02-19 01:58:22 -05:00 · 2019-02-19 01:58:22 -05:00 · 1061893998
commit 1061893998
parent 977eb8d965
3 changed files with 176 additions and 144 deletions
--- a/chrome/content/zotero/xpcom/data/search.js
+++ b/chrome/content/zotero/xpcom/data/search.js
@ -617,148 +617,130 @@ Zotero.Search.prototype.search = Zotero.Promise.coroutine(function* (asTempTable
 		//Zotero.debug('IDs from main search or subsearch: ');
 		//Zotero.debug(ids);
 		//Zotero.debug('Join mode: ' + joinMode);
-		// Filter results with fulltext search
+		// Filter results with full-text search
 		//
-		// If join mode ALL, return the (intersection of main and fulltext word search)
+		// If join mode ALL, return the (intersection of main and full-text word search)
-		// filtered by fulltext content
+		// filtered by full-text content.
 		//
-		// If join mode ANY or there's a quicksearch (which we assume
+		// If join mode ANY or there's a quicksearch (which we assume fulltextContent is part of)
-		// fulltextContent is part of), return the union of the main search and
+		// and the main search is filtered by other conditions, return the union of the main search
-		// (a separate fulltext word search filtered by fulltext content)
+		// and (separate full-text word searches filtered by full-text content).
-		for (let condition of Object.values(this._conditions)){
+		//
-			if (condition['condition']=='fulltextContent'){
+		// If join mode ANY or there's a quicksearch and the main search isn't filtered, return just
-				var fulltextWordIntersectionFilter = (val, index, array) => !!hash[val];
+		// the union of (separate full-text word searches filtered by full-text content).
-				var fulltextWordIntersectionConditionFilter = function(val, index, array) {
+		var fullTextResults;
-					return hash[val] ?
+		var joinModeAny = joinMode == 'any' || hasQuicksearch;
-						(condition.operator == 'contains') :
+		for (let condition of Object.values(this._conditions)) {
-						(condition.operator == 'doesNotContain');
+			if (condition.condition != 'fulltextContent') continue;
-				};
+			
-				
+			if (!fullTextResults) {
-				// Regexp mode -- don't use fulltext word index
+				// For join mode ANY, if we already filtered the main set, add those as results.
-				if (condition.mode && condition.mode.startsWith('regexp')) {
+				// Otherwise, start with an empty set.
-					// In an ANY search with other conditions that alter the results, only bother
+				fullTextResults = joinModeAny && this._hasPrimaryConditions
-					// scanning items that haven't already been found by the main search, as long as
+					? ids
-					// they're in the right library
+					: [];
-					if (joinMode == 'any' && this._hasPrimaryConditions) {
+			}
-						if (!tmpTable) {
+			
-							tmpTable = yield Zotero.Search.idsToTempTable(ids);
+			let scopeIDs;
-						}
+			// Regexp mode -- don't use full-text word index
-						
+			let numSplits;
-						var sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
+			if (condition.mode && condition.mode.startsWith('regexp')) {
-							+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
+				// In ANY mode, include items that haven't already been found, as long as they're in
-						if (this.libraryID) {
+				// the right library
-							sql += " AND libraryID=" + parseInt(this.libraryID);
+				if (joinModeAny) {
-						}
+					let tmpTable = yield Zotero.Search.idsToTempTable(fullTextResults);
-						
+					let sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
-						var res = yield Zotero.DB.valueQueryAsync(sql);
+						+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
 						var scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
 					}
 					// If an ALL search, scan only items from the main search
 					else {
 						var scopeIDs = ids;
 					}
 				}
 				// If not regexp mode, run a new search against the fulltext word
 				// index for words in this phrase
 				else {
 					Zotero.debug('Running subsearch against fulltext word index');
 					var s = new Zotero.Search();
 					if (this.libraryID) {
-						s.libraryID = this.libraryID;
+						sql += " AND libraryID=?";
 					}
 					// Add any necessary conditions to the fulltext word search --
 					// those that are required in an ANY search and any outside the
 					// quicksearch in an ALL search
 					for (let c of Object.values(this._conditions)) {
 						if (c.condition == 'blockStart') {
 							var inQS = true;
 							continue;
 						}
 						else if (c.condition == 'blockEnd') {
 							inQS = false;
 							continue;
 						}
 						else if (c.condition == 'fulltextContent' || inQS) {
 							continue;
 						}
 						else if (joinMode == 'any' && !c.required) {
 							continue;
 						}
 						s.addCondition(c.condition, c.operator, c.value);
 					}
 					var splits = Zotero.Fulltext.semanticSplitter(condition.value);
 					for (let split of splits){
 						s.addCondition('fulltextWord', condition.operator, split);
 					}
 					var fulltextWordIDs = yield s.search();
 					//Zotero.debug("Fulltext word IDs");
 					//Zotero.debug(fulltextWordIDs);
 					// If ALL mode, set intersection of main search and fulltext word index
 					// as the scope for the fulltext content search
 					if (joinMode == 'all' && !hasQuicksearch) {
 						var hash = {};
 						for (let i=0; i<fulltextWordIDs.length; i++) {
 							hash[fulltextWordIDs[i]] = true;
 						}
 						if (ids) {
 							var scopeIDs = ids.filter(fulltextWordIntersectionFilter);
 						}
 						else {
 							var scopeIDs = [];
 						}
 					}
 					// If ANY mode, just use fulltext word index hits for content search,
 					// since the main results will be added in below
 					else {
 						var scopeIDs = fulltextWordIDs;
 					}
 					let res = yield Zotero.DB.valueQueryAsync(sql, this.libraryID);
 					scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
 					yield Zotero.DB.queryAsync("DROP TABLE " + tmpTable);
 				}
-				
+				// In ALL mode, include remaining items from the main search
 				if (scopeIDs && scopeIDs.length) {
 					var fulltextIDs = yield Zotero.Fulltext.findTextInItems(scopeIDs,
 						condition['value'], condition['mode']);
 					var hash = {};
 					for (let i=0; i<fulltextIDs.length; i++) {
 						hash[fulltextIDs[i].id] = true;
 					}
 					filteredIDs = scopeIDs.filter(fulltextWordIntersectionConditionFilter);
 				}
 				else {
-					var filteredIDs = [];
+					scopeIDs = ids;
 				}
 				//Zotero.debug("Filtered IDs:")
 				//Zotero.debug(filteredIDs);
 				// If join mode ANY, add any new items from the fulltext content
 				// search to the main search results
 				//
 				// We only do this if there are primary conditions that alter the
 				// main search, since otherwise all items will match
 				if (this._hasPrimaryConditions && (joinMode == 'any' || hasQuicksearch)) {
 					//Zotero.debug("Adding filtered IDs to main set");
 					for (let i=0; i<filteredIDs.length; i++) {
 						let id = filteredIDs[i];
 						if (ids.indexOf(id) == -1) {
 							ids.push(id);
 						}
 					}
 				}
 				else {
 					//Zotero.debug("Replacing main set with filtered IDs");
 					ids = filteredIDs;
 				}
 			}
 			// If not regexp mode, run a new search against the full-text word index for words in
 			// this phrase
 			else {
 				//Zotero.debug('Running subsearch against full-text word index');
 				let s = new Zotero.Search();
 				if (this.libraryID) {
 					s.libraryID = this.libraryID;
 				}
 				let splits = Zotero.Fulltext.semanticSplitter(condition.value);
 				for (let split of splits){
 					s.addCondition('fulltextWord', condition.operator, split);
 				}
 				numSplits = splits.length;
 				let wordMatches = yield s.search();
 				//Zotero.debug("Word index matches");
 				//Zotero.debug(wordMatches);
 				// In ANY mode, include hits from word index that aren't already in the results
 				if (joinModeAny) {
 					let resultsSet = new Set(fullTextResults);
 					scopeIDs = wordMatches.filter(id => !resultsSet.has(id));
 				}
 				// In ALL mode, include the intersection of hits from word index and remaining
 				// main search matches
 				else {
 					let wordIDs = new Set(wordMatches);
 					scopeIDs = ids.filter(id => wordIDs.has(id));
 				}
 			}
 			// If only one word, just use the results from the word index
 			let filteredIDs = [];
 			if (numSplits === 1) {
 				filteredIDs = scopeIDs;
 			}
 			// Search the full-text content
 			else if (scopeIDs.length) {
 				let found = new Set(
 					yield Zotero.Fulltext.findTextInItems(
 						scopeIDs,
 						condition.value,
 						condition.mode
 					).map(x => x.id)
 				);
 				// Either include or exclude the results, depending on the operator
 				filteredIDs = scopeIDs.filter((id) => {
 					return found.has(id)
 						? condition.operator == 'contains'
 						: condition.operator == 'doesNotContain';
 				});
 			}
 			//Zotero.debug("Filtered IDs:")
 			//Zotero.debug(filteredIDs);
 			// If join mode ANY, add any new items from the full-text content search to the results,
 			// and remove from the scope so that we don't search through items we already matched
 			if (joinModeAny) {
 				//Zotero.debug("Adding filtered IDs to results and removing from scope");
 				fullTextResults = fullTextResults.concat(filteredIDs);
 				let idSet = new Set(ids);
 				for (let id of filteredIDs) {
 					idSet.delete(id);
 				}
 				ids = Array.from(idSet);
 			}
 			else {
 				//Zotero.debug("Replacing results with filtered IDs");
 				ids = filteredIDs;
 				fullTextResults = filteredIDs;
 			}
 		}
 		if (fullTextResults) {
 			ids = Array.from(new Set(fullTextResults));
 		}
 		if (this.hasPostSearchFilter() &&
--- a/test/tests/data/search/baz.pdf
+++ b/test/tests/data/search/baz.pdf
--- a/test/tests/searchTest.js
+++ b/test/tests/searchTest.js
@ -120,20 +120,29 @@ describe("Zotero.Search", function() {
 		var userLibraryID;
 		var fooItem;
 		var foobarItem;
 		var bazItem;
 		var fooItemGroup;
 		var foobarItemGroup;
 		var bazItemGroup;
-		before(function* () {
+		before(async function () {
 			// Suite setup: reset the database (without bundled files), open a browser window,
 			// and import the attachment fixtures that the search tests below refer to
 			await resetDB({
 				thisArg: this,
 				skipBundledFiles: true
 			});
 			// Hidden browser, which requires a browser window, needed for charset detection
 			// (until we figure out a better way)
-			win = yield loadBrowserWindow();
+			win = await loadBrowserWindow();
 			// Import the three fixtures without an explicit libraryID; the library they end up
 			// in is recorded as userLibraryID below
-			fooItem = yield importFileAttachment("search/foo.html");
+			fooItem = await importFileAttachment("search/foo.html");
-			foobarItem = yield importFileAttachment("search/foobar.html");
+			foobarItem = await importFileAttachment("search/foobar.html");
 			bazItem = await importFileAttachment("search/baz.pdf");
 			userLibraryID = fooItem.libraryID;
 			// Import the same three fixtures into a group library
 			// NOTE(review): presumably so tests can check that results stay scoped to
 			// s.libraryID -- confirm against the tests that use these items
-			let group = yield getGroup();
+			let group = await getGroup();
-			fooItemGroup = yield importFileAttachment("search/foo.html", { libraryID: group.libraryID });
+			fooItemGroup = await importFileAttachment("search/foo.html", { libraryID: group.libraryID });
-			foobarItemGroup = yield importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
+			foobarItemGroup = await importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
 			bazItemGroup = await importFileAttachment("search/baz.pdf", { libraryID: group.libraryID });
 		});
 		after(function* () {
@ -142,8 +151,10 @@ describe("Zotero.Search", function() {
 			}
 			yield fooItem.eraseTx();
 			yield foobarItem.eraseTx();
 			yield bazItem.eraseTx();
 			yield fooItemGroup.eraseTx();
 			yield foobarItemGroup.eraseTx();
 			yield bazItemGroup.eraseTx();
 		});
 		describe("Conditions", function () {
@ -238,14 +249,25 @@ describe("Zotero.Search", function() {
 					assert.sameMembers(matches, [foobarItem.id]);
 				});
-				it("should find matching item with joinMode=ANY and non-matching other condition", function* () {
+				it("should find matching items with joinMode=ANY with no other conditions", async function () {
 					// Only full-text conditions are added to this ANY search, so the expected
 					// result is exactly the items matched by either word
 					var s = new Zotero.Search();
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
-					s.addCondition('fulltextContent', 'contains', 'foo bar');
+					s.addCondition('fulltextContent', 'contains', 'foo');
 					s.addCondition('fulltextContent', 'contains', 'bar');
 					var matches = await s.search();
 					assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
 				});
 				it("should find matching items with joinMode=ANY and non-matching other condition", function* () {
 					// Two full-text conditions plus a title condition that, per the test name, is
 					// expected to match nothing; in ANY mode the full-text matches should still
 					// be returned
 					var s = new Zotero.Search();
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
 					s.addCondition('fulltextContent', 'contains', 'foo');
 					s.addCondition('fulltextContent', 'contains', 'bar');
 					s.addCondition('title', 'contains', 'nomatch');
 					var matches = yield s.search();
-					assert.sameMembers(matches, [foobarItem.id]);
+					assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
 				});
 				it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
@ -287,6 +309,34 @@ describe("Zotero.Search", function() {
 					var matches = yield s.search();
 					assert.sameMembers(matches, [foobarItem.id]);
 				});
 				it("should find items that don't contain a single word with joinMode=ANY", async function () {
 					// ANY-mode search in the user library whose only condition excludes
 					// attachments containing the word
 					var s = new Zotero.Search();
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
 					s.addCondition('fulltextContent', 'doesNotContain', 'foo');
 					var matches = await s.search();
 					// Check each item separately: notIncludeMembers() with a multi-element list
 					// only fails when *all* listed IDs are present, so it would not catch a
 					// result set that wrongly contains just one of the two items
 					assert.notInclude(matches, fooItem.id);
 					assert.notInclude(matches, foobarItem.id);
 				});
 				it("should find items that don't contain a phrase with joinMode=ANY", async function () {
 					// Build an ANY-mode search in the user library whose only condition is a
 					// "does not contain" full-text phrase
 					const search = new Zotero.Search();
 					search.libraryID = userLibraryID;
 					const conditions = [
 						['joinMode', 'any'],
 						['fulltextContent', 'doesNotContain', 'foo bar']
 					];
 					for (const args of conditions) {
 						search.addCondition(...args);
 					}
 					const results = await search.search();
 					// foobarItem must not be among the results
 					const excludedIDs = [foobarItem.id];
 					assert.notIncludeMembers(results, excludedIDs);
 				});
 				it("should find items that don't contain a regexp pattern with joinMode=ANY", async function () {
 					// Build an ANY-mode search in the user library whose only condition is a
 					// "does not contain" full-text regexp
 					const search = new Zotero.Search();
 					search.libraryID = userLibraryID;
 					const conditions = [
 						['joinMode', 'any'],
 						['fulltextContent/regexp', 'doesNotContain', 'foo.+bar']
 					];
 					for (const args of conditions) {
 						search.addCondition(...args);
 					}
 					const results = await search.search();
 					// foobarItem must be absent, while fooItem and bazItem must both be present
 					const excludedIDs = [foobarItem.id];
 					assert.notIncludeMembers(results, excludedIDs);
 					const requiredIDs = [fooItem.id, bazItem.id];
 					assert.includeMembers(results, requiredIDs);
 				});
 			});
 			describe("fulltextWord", function () {
@ -302,7 +352,7 @@ describe("Zotero.Search", function() {
 				it("should not return non-matches with full-text conditions", function* () {
 					// Search the user library for a single word and expect no results.
 					// NOTE(review): the word was presumably switched from 'baz' to 'nomatch'
 					// because the baz.pdf fixture is now imported in the suite setup -- confirm
 					let s = new Zotero.Search();
 					s.libraryID = userLibraryID;
-					s.addCondition('fulltextWord', 'contains', 'baz');
+					s.addCondition('fulltextWord', 'contains', 'nomatch');
 					let matches = yield s.search();
 					assert.lengthOf(matches, 0);
 				});
@ -332,7 +382,7 @@ describe("Zotero.Search", function() {
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
 					s.addCondition('fulltextWord', 'contains', 'bar');
-					s.addCondition('fulltextWord', 'contains', 'baz');
+					s.addCondition('fulltextWord', 'contains', 'nomatch');
 					let matches = yield s.search();
 					assert.deepEqual(matches, [foobarItem.id]);
 				});