"Attachment Content" search improvements

- Fix incorrect results for ANY search with multiple "Attachment Content" conditions and no other conditions - Dramatically speed up single-word searches by avoiding unnecessary text scans (which probably addresses #1595) - Clean up code
2019-02-19 01:58:22 -05:00 · 2019-02-19 01:58:22 -05:00 · 1061893998
commit 1061893998
parent 977eb8d965
3 changed files with 176 additions and 144 deletions
--- a/chrome/content/zotero/xpcom/data/search.js
+++ b/chrome/content/zotero/xpcom/data/search.js
@ -617,148 +617,130 @@ Zotero.Search.prototype.search = Zotero.Promise.coroutine(function* (asTempTable
 		
 		//Zotero.debug('IDs from main search or subsearch: ');
 		//Zotero.debug(ids);
-		
 		//Zotero.debug('Join mode: ' + joinMode);
 		
-		// Filter results with fulltext search
+		// Filter results with full-text search
 		//
-		// If join mode ALL, return the (intersection of main and fulltext word search)
-		// filtered by fulltext content
+		// If join mode ALL, return the (intersection of main and full-text word search)
+		// filtered by full-text content.
 		//
-		// If join mode ANY or there's a quicksearch (which we assume
-		// fulltextContent is part of), return the union of the main search and
-		// (a separate fulltext word search filtered by fulltext content)
-		for (let condition of Object.values(this._conditions)){
-			if (condition['condition']=='fulltextContent'){
-				var fulltextWordIntersectionFilter = (val, index, array) => !!hash[val];
-				var fulltextWordIntersectionConditionFilter = function(val, index, array) {
-					return hash[val] ?
-						(condition.operator == 'contains') :
-						(condition.operator == 'doesNotContain');
-				};
+		// If join mode ANY or there's a quicksearch (which we assume fulltextContent is part of)
+		// and the main search is filtered by other conditions, return the union of the main search
+		// and (separate full-text word searches filtered by fulltext content).
+		//
+		// If join mode ANY or there's a quicksearch and the main search isn't filtered, return just
+		// the union of (separate full-text word searches filtered by full-text content).
+		var fullTextResults;
+		var joinModeAny = joinMode == 'any' || hasQuicksearch;
+		for (let condition of Object.values(this._conditions)) {
+			if (condition.condition != 'fulltextContent') continue;
 			
-				// Regexp mode -- don't use fulltext word index
-				if (condition.mode && condition.mode.startsWith('regexp')) {
-					// In an ANY search with other conditions that alter the results, only bother
-					// scanning items that haven't already been found by the main search, as long as
-					// they're in the right library
-					if (joinMode == 'any' && this._hasPrimaryConditions) {
-						if (!tmpTable) {
-							tmpTable = yield Zotero.Search.idsToTempTable(ids);
+			if (!fullTextResults) {
+				// For join mode ANY, if we already filtered the main set, add those as results.
+				// Otherwise, start with an empty set.
+				fullTextResults = joinModeAny && this._hasPrimaryConditions
+					? ids
+					: [];
 			}
 			
-						var sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
+			let scopeIDs;
+			// Regexp mode -- don't use full-text word index
+			let numSplits;
+			if (condition.mode && condition.mode.startsWith('regexp')) {
+				// In ANY mode, include items that haven't already been found, as long as they're in
+				// the right library
+				if (joinModeAny) {
+					let tmpTable = yield Zotero.Search.idsToTempTable(fullTextResults);
+					let sql = "SELECT GROUP_CONCAT(itemID) FROM items WHERE "
 						+ "itemID NOT IN (SELECT itemID FROM " + tmpTable + ")";
 					if (this.libraryID) {
-							sql += " AND libraryID=" + parseInt(this.libraryID);
+						sql += " AND libraryID=?";
 					}
-						
-						var res = yield Zotero.DB.valueQueryAsync(sql);
-						var scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
+					let res = yield Zotero.DB.valueQueryAsync(sql, this.libraryID);
+					scopeIDs = res ? res.split(",").map(id => parseInt(id)) : [];
+					yield Zotero.DB.queryAsync("DROP TABLE " + tmpTable);
 				}
-					// If an ALL search, scan only items from the main search
+				// In ALL mode, include remaining items from the main search
 				else {
-						var scopeIDs = ids;
+					scopeIDs = ids;
 				}
 			}
-				// If not regexp mode, run a new search against the fulltext word
-				// index for words in this phrase
+			// If not regexp mode, run a new search against the full-text word index for words in
+			// this phrase
 			else {
-					Zotero.debug('Running subsearch against fulltext word index');
-					var s = new Zotero.Search();
+				//Zotero.debug('Running subsearch against full-text word index');
+				let s = new Zotero.Search();
 				if (this.libraryID) {
 					s.libraryID = this.libraryID;
 				}
-					
-					// Add any necessary conditions to the fulltext word search --
-					// those that are required in an ANY search and any outside the
-					// quicksearch in an ALL search
-					for (let c of Object.values(this._conditions)) {
-						if (c.condition == 'blockStart') {
-							var inQS = true;
-							continue;
-						}
-						else if (c.condition == 'blockEnd') {
-							inQS = false;
-							continue;
-						}
-						else if (c.condition == 'fulltextContent' || inQS) {
-							continue;
-						}
-						else if (joinMode == 'any' && !c.required) {
-							continue;
-						}
-						s.addCondition(c.condition, c.operator, c.value);
-					}
-					
-					var splits = Zotero.Fulltext.semanticSplitter(condition.value);
+				let splits = Zotero.Fulltext.semanticSplitter(condition.value);
 				for (let split of splits){
 					s.addCondition('fulltextWord', condition.operator, split);
 				}
-					var fulltextWordIDs = yield s.search();
+				numSplits = splits.length;
+				let wordMatches = yield s.search();
 				
-					//Zotero.debug("Fulltext word IDs");
-					//Zotero.debug(fulltextWordIDs);
+				//Zotero.debug("Word index matches");
+				//Zotero.debug(wordMatches);
 				
-					// If ALL mode, set intersection of main search and fulltext word index
-					// as the scope for the fulltext content search
-					if (joinMode == 'all' && !hasQuicksearch) {
-						var hash = {};
-						for (let i=0; i<fulltextWordIDs.length; i++) {
-							hash[fulltextWordIDs[i]] = true;
-						}
-						
-						if (ids) {
-							var scopeIDs = ids.filter(fulltextWordIntersectionFilter);
+				// In ANY mode, include hits from word index that aren't already in the results
+				if (joinModeAny) {
+					let resultsSet = new Set(fullTextResults);
+					scopeIDs = wordMatches.filter(id => !resultsSet.has(id));
 				}
+				// In ALL mode, include the intersection of hits from word index and remaining
+				// main search matches
 				else {
-							var scopeIDs = [];
-						}
-					}
-					// If ANY mode, just use fulltext word index hits for content search,
-					// since the main results will be added in below
-					else {
-						var scopeIDs = fulltextWordIDs;
+					let wordIDs = new Set(wordMatches);
+					scopeIDs = ids.filter(id => wordIDs.has(id));
 				}
 			}
 			
-				if (scopeIDs && scopeIDs.length) {
-					var fulltextIDs = yield Zotero.Fulltext.findTextInItems(scopeIDs,
-						condition['value'], condition['mode']);
-					
-					var hash = {};
-					for (let i=0; i<fulltextIDs.length; i++) {
-						hash[fulltextIDs[i].id] = true;
+			// If only one word, just use the results from the word index
+			let filteredIDs = [];
+			if (numSplits === 1) {
+				filteredIDs = scopeIDs;
 			}
-					
-					filteredIDs = scopeIDs.filter(fulltextWordIntersectionConditionFilter);
-				}
-				else {
-					var filteredIDs = [];
+			// Search the full-text content
+			else if (scopeIDs.length) {
+				let found = new Set(
+					yield Zotero.Fulltext.findTextInItems(
+						scopeIDs,
+						condition.value,
+						condition.mode
+					).map(x => x.id)
+				);
+				// Either include or exclude the results, depending on the operator
+				filteredIDs = scopeIDs.filter((id) => {
+					return found.has(id)
+						? condition.operator == 'contains'
+						: condition.operator == 'doesNotContain';
+				});
 			}
 			
 			//Zotero.debug("Filtered IDs:")
 			//Zotero.debug(filteredIDs);
 			
-				// If join mode ANY, add any new items from the fulltext content
-				// search to the main search results
-				//
-				// We only do this if there are primary conditions that alter the
-				// main search, since otherwise all items will match
-				if (this._hasPrimaryConditions && (joinMode == 'any' || hasQuicksearch)) {
-					//Zotero.debug("Adding filtered IDs to main set");
-					for (let i=0; i<filteredIDs.length; i++) {
-						let id = filteredIDs[i];
-						if (ids.indexOf(id) == -1) {
-							ids.push(id);
-						}
+			// If join mode ANY, add any new items from the full-text content search to the results,
+			// and remove from the scope so that we don't search through items we already matched
+			if (joinModeAny) {
+				//Zotero.debug("Adding filtered IDs to results and removing from scope");
+				fullTextResults = fullTextResults.concat(filteredIDs);
+				
+				let idSet = new Set(ids);
+				for (let id of filteredIDs) {
+					idSet.delete(id);
 				}
+				ids = Array.from(idSet);
 			}
 			else {
-					//Zotero.debug("Replacing main set with filtered IDs");
+				//Zotero.debug("Replacing results with filtered IDs");
 				ids = filteredIDs;
+				fullTextResults = filteredIDs;
 			}
 		}
+		if (fullTextResults) {
+			ids = Array.from(new Set(fullTextResults));
 		}
 		
 		if (this.hasPostSearchFilter() &&
--- a/test/tests/data/search/baz.pdf
+++ b/test/tests/data/search/baz.pdf
--- a/test/tests/searchTest.js
+++ b/test/tests/searchTest.js
@ -120,20 +120,29 @@ describe("Zotero.Search", function() {
 		var userLibraryID;
 		var fooItem;
 		var foobarItem;
+		var bazItem;
 		var fooItemGroup;
 		var foobarItemGroup;
+		var bazItemGroup;
+
+		before(async function () {
+			await resetDB({
+				thisArg: this,
+				skipBundledFiles: true
+			});
 			
-		before(function* () {
 			// Hidden browser, which requires a browser window, needed for charset detection
 			// (until we figure out a better way)
-			win = yield loadBrowserWindow();
-			fooItem = yield importFileAttachment("search/foo.html");
-			foobarItem = yield importFileAttachment("search/foobar.html");
+			win = await loadBrowserWindow();
+			fooItem = await importFileAttachment("search/foo.html");
+			foobarItem = await importFileAttachment("search/foobar.html");
+			bazItem = await importFileAttachment("search/baz.pdf");
 			userLibraryID = fooItem.libraryID;
 			
-			let group = yield getGroup();
-			fooItemGroup = yield importFileAttachment("search/foo.html", { libraryID: group.libraryID });
-			foobarItemGroup = yield importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
+			let group = await getGroup();
+			fooItemGroup = await importFileAttachment("search/foo.html", { libraryID: group.libraryID });
+			foobarItemGroup = await importFileAttachment("search/foobar.html", { libraryID: group.libraryID });
+			bazItemGroup = await importFileAttachment("search/baz.pdf", { libraryID: group.libraryID });
 		});

 		after(function* () {
@ -142,8 +151,10 @@ describe("Zotero.Search", function() {
 			}
 			yield fooItem.eraseTx();
 			yield foobarItem.eraseTx();
+			yield bazItem.eraseTx();
 			yield fooItemGroup.eraseTx();
 			yield foobarItemGroup.eraseTx();
+			yield bazItemGroup.eraseTx();
 		});
 		
 		describe("Conditions", function () {
@ -238,14 +249,25 @@ describe("Zotero.Search", function() {
 					assert.sameMembers(matches, [foobarItem.id]);
 				});
 				
-				it("should find matching item with joinMode=ANY and non-matching other condition", function* () {
+				it("should find matching items with joinMode=ANY with no other conditions", async function () {
 					var s = new Zotero.Search();
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
-					s.addCondition('fulltextContent', 'contains', 'foo bar');
+					s.addCondition('fulltextContent', 'contains', 'foo');
+					s.addCondition('fulltextContent', 'contains', 'bar');
+					var matches = await s.search();
+					assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
+				});
+				
+				it("should find matching items with joinMode=ANY and non-matching other condition", function* () {
+					var s = new Zotero.Search();
+					s.libraryID = userLibraryID;
+					s.addCondition('joinMode', 'any');
+					s.addCondition('fulltextContent', 'contains', 'foo');
+					s.addCondition('fulltextContent', 'contains', 'bar');
 					s.addCondition('title', 'contains', 'nomatch');
 					var matches = yield s.search();
-					assert.sameMembers(matches, [foobarItem.id]);
+					assert.sameMembers(matches, [fooItem.id, foobarItem.id]);
 				});
 				
 				it("should find matching items in regexp mode with joinMode=ANY with matching other condition", function* () {
@ -287,6 +309,34 @@ describe("Zotero.Search", function() {
 					var matches = yield s.search();
 					assert.sameMembers(matches, [foobarItem.id]);
 				});
+				
+				it("should find items that don't contain a single word with joinMode=ANY", async function () {
+					var s = new Zotero.Search();
+					s.libraryID = userLibraryID;
+					s.addCondition('joinMode', 'any');
+					s.addCondition('fulltextContent', 'doesNotContain', 'foo');
+					var matches = await s.search();
+					assert.notIncludeMembers(matches, [fooItem.id, foobarItem.id]);
+				});
+				
+				it("should find items that don't contain a phrase with joinMode=ANY", async function () {
+					var s = new Zotero.Search();
+					s.libraryID = userLibraryID;
+					s.addCondition('joinMode', 'any');
+					s.addCondition('fulltextContent', 'doesNotContain', 'foo bar');
+					var matches = await s.search();
+					assert.notIncludeMembers(matches, [foobarItem.id]);
+				});
+				
+				it("should find items that don't contain a regexp pattern with joinMode=ANY", async function () {
+					var s = new Zotero.Search();
+					s.libraryID = userLibraryID;
+					s.addCondition('joinMode', 'any');
+					s.addCondition('fulltextContent/regexp', 'doesNotContain', 'foo.+bar');
+					var matches = await s.search();
+					assert.notIncludeMembers(matches, [foobarItem.id]);
+					assert.includeMembers(matches, [fooItem.id, bazItem.id]);
+				});
 			});
 			
 			describe("fulltextWord", function () {
@ -302,7 +352,7 @@ describe("Zotero.Search", function() {
 				it("should not return non-matches with full-text conditions", function* () {
 					let s = new Zotero.Search();
 					s.libraryID = userLibraryID;
-					s.addCondition('fulltextWord', 'contains', 'baz');
+					s.addCondition('fulltextWord', 'contains', 'nomatch');
 					let matches = yield s.search();
 					assert.lengthOf(matches, 0);
 				});
@ -332,7 +382,7 @@ describe("Zotero.Search", function() {
 					s.libraryID = userLibraryID;
 					s.addCondition('joinMode', 'any');
 					s.addCondition('fulltextWord', 'contains', 'bar');
-					s.addCondition('fulltextWord', 'contains', 'baz');
+					s.addCondition('fulltextWord', 'contains', 'nomatch');
 					let matches = yield s.search();
 					assert.deepEqual(matches, [foobarItem.id]);
 				});