fx-compat: Update full-text indexing

Use the new PageData mechanism for character set detection, don't try to index HTML files directly without properly detecting the charset, and generally simplify the indexing code. HTML files are now considered cached files that require indexing and won't be indexed automatically in Zotero.FullText.findTextInItems(), which breaks certain expectations, including in some tests. This will need to be addressed.
2022-06-17 04:57:34 -04:00 · 2022-06-17 04:57:34 -04:00 · 13adfd131c
commit 13adfd131c
parent 1dd24f7082
3 changed files with 163 additions and 308 deletions
--- a/chrome/content/zotero/xpcom/attachments.js
+++ b/chrome/content/zotero/xpcom/attachments.js
@ -131,7 +131,7 @@ Zotero.Attachments = new function(){
 				await attachmentItem.save(saveOptions);
 			}.bind(this));
 			try {
-				yield _postProcessFile(attachmentItem, newFile, contentType);
+				yield _postProcessFile(attachmentItem);
 			}
 			catch (e) {
 				Zotero.logError(e);
@ -194,7 +194,7 @@ Zotero.Attachments = new function(){
 			saveOptions
 		});
 		try {
-			yield _postProcessFile(item, file, contentType);
+			yield _postProcessFile(item);
 		}
 		catch (e) {
 			Zotero.logError(e);
@ -258,7 +258,7 @@ Zotero.Attachments = new function(){
 		var file = this.resolveRelativePath(path);
 		if (file && await OS.File.exists(file)) {
 			try {
-				await _postProcessFile(item, file, contentType);
+				await _postProcessFile(item);
 			}
 			catch (e) {
 				Zotero.logError(e);
@ -334,12 +334,7 @@ Zotero.Attachments = new function(){
 				}
 			}.bind(this));
 			try {
-				yield _postProcessFile(
-					attachmentItem,
-					Zotero.File.pathToFile(newPath),
-					contentType,
-					charset
-				);
+				yield _postProcessFile(attachmentItem);
 			}
 			catch (e) {
 				Zotero.logError(e);
@ -2912,115 +2907,15 @@ Zotero.Attachments = new function(){
 	/**
 	 * If necessary/possible, detect the file charset and index the file
 	 *
-	 * Since we have to load the content into the browser to get the
-	 * character set (at least until we figure out a better way to get
-	 * at the native detectors), we create the item above and update
-	 * asynchronously after the fact
+	 * Since we have to load the content into the browser to get the character set, we create the
+	 * item above and update asynchronously after the fact
 	 *
 	 * @return {Promise}
 	 */
-	var _postProcessFile = Zotero.Promise.coroutine(function* (item, file, contentType) {
-		// Don't try to process if MIME type is unknown
-		if (!contentType) {
-			return;
-		}
-		
-		// Items with content types that get cached by the fulltext indexer can just be indexed,
-		// since a charset isn't necessary
-		if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
-			return Zotero.Fulltext.indexItems([item.id]);
-		}
-		
-		// Ignore non-text types
-		var ext = Zotero.File.getExtension(file);
-		if (!Zotero.MIME.hasInternalHandler(contentType, ext) || !Zotero.MIME.isTextType(contentType)) {
-			return;
-		}
-		
-		// If the charset is already set, index item directly
-		if (item.attachmentCharset) {
-			return Zotero.Fulltext.indexItems([item.id]);
-		}
-		
-		// Otherwise, load in a hidden browser to get the charset, and then index the document
-		return new Zotero.Promise(function (resolve, reject) {
-			var browser = Zotero.Browser.createHiddenBrowser(
-				null,
-				// Disable JavaScript, since it can cause imports that include HTML files to hang
-				// (from network requests that fail?)
-				{ allowJavaScript: false }
-			);
-			
-			var pageshown = false;
-			
-			if (item.attachmentCharset) {
-				var onpageshow = async function () {
-					// ignore spurious about:blank loads
-					if(browser.contentDocument.location.href == "about:blank") return;
-					
-					pageshown = true;
-					
-					browser.removeEventListener("pageshow", onpageshow, false);
-					
-					try {
-						await Zotero.Fulltext.indexDocument(browser.contentDocument, itemID);
-						resolve();
-					}
-					catch (e) {
-						reject(e);
-					}
-					finally {
-						Zotero.Browser.deleteHiddenBrowser(browser);
-					}
-				};
-				browser.addEventListener("pageshow", onpageshow, false);
-			}
-			else {
-				let callback = async function (charset, args) {
-					// ignore spurious about:blank loads
-					if(browser.contentDocument.location.href == "about:blank") return;
-					
-					pageshown = true;
-					
-					try {
-						if (charset) {
-							charset = Zotero.CharacterSets.toCanonical(charset);
-							if (charset) {
-								item.attachmentCharset = charset;
-								await item.saveTx({
-									skipNotifier: true
-								});
-							}
-						}
-						
-						await Zotero.Fulltext.indexDocument(browser.contentDocument, item.id);
-						resolve();
-					}
-					catch (e) {
-						reject(e);
-					}
-					finally {
-						Zotero.Browser.deleteHiddenBrowser(browser);
-					}
-				};
-				Zotero.File.addCharsetListener(browser, callback, item.id);
-			}
-			
-			var url = Components.classes["@mozilla.org/network/protocol;1?name=file"]
-						.getService(Components.interfaces.nsIFileProtocolHandler)
-						.getURLSpecFromFile(file);
-			browser.loadURI(url);
-			
-			// Avoid a hang if a pageshow is never called on the hidden browser (which can happen
-			// if a .pdf file is really HTML, which can also result in the file being launched,
-			// which we should try to fix)
-			setTimeout(function () {
-				if (!pageshown) {
-					reject(new Error("pageshow not called in hidden browser"));
-				}
-			}, 5000);
-		});
-	});
+	// Charset detection happens as part of full-text indexing, so post-processing only needs
+	// to pass the new attachment to the indexer
+	var _postProcessFile = async ({ id }) => Zotero.Fulltext.indexItems([id]);
+	
 	
 	/**
 	 * Determines if a given document is an instance of PDFJS
--- a/chrome/content/zotero/xpcom/fulltext.js
+++ b/chrome/content/zotero/xpcom/fulltext.js
@ -24,8 +24,6 @@
 */

 Zotero.Fulltext = Zotero.FullText = new function(){
-	this.isCachedMIMEType = isCachedMIMEType;
-	
 	this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
 	this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
 	
@ -67,9 +65,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 		yield Zotero.DB.queryAsync("ATTACH ':memory:' AS 'indexing'");
 		yield Zotero.DB.queryAsync('CREATE TABLE indexing.fulltextWords (word NOT NULL)');
 		
-		this.unicodeConverter = Cc["@mozilla.org/intl/scriptableunicodeconverter"]
-			.createInstance(Ci.nsIScriptableUnicodeConverter);
-		
 		let pdfConverterFileName = "pdftotext";
 		let pdfInfoFileName = "pdfinfo";
 		
@ -78,7 +73,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			pdfInfoFileName += '.exe';
 		}
 		
-		let dir = FileUtils.getDir('AChrom', []).parent;
+		// 'AChrom' is app/chrome, so go up two levels
+		let dir = FileUtils.getDir('AChrom', []).parent.parent;
 		
 		_pdfData = dir.clone();
 		_pdfData.append('poppler-data');
@ -222,13 +218,14 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	 * Returns true if MIME type is converted to text and cached before indexing
 	 *   (e.g. application/pdf is run through pdftotext)
 	 */
-	function isCachedMIMEType(mimeType) {
+	this.isCachedMIMEType = function (mimeType) {
 		switch (mimeType) {
 			case 'application/pdf':
+			case 'text/html':
 				return true;
 		}
 		return false;
-	}
+	};
 	
 	
 	/**
@ -274,8 +271,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	/**
 	 * @return {Promise}
 	 */
-	var indexString = Zotero.Promise.coroutine(function* (text, charset, itemID, stats, version, synced) {
-		var words = this.semanticSplitter(text, charset);
+	var indexString = Zotero.Promise.coroutine(function* (text, itemID, stats, version, synced) {
+		if (itemID != parseInt(itemID)) {
+			throw new Error("itemID not provided");
+		}
+		
+		var words = this.semanticSplitter(text);
 		
 		while (Zotero.DB.inTransaction()) {
 			yield Zotero.DB.waitForTransaction('indexString()');
@ -334,9 +335,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 		if (!maxLength) {
 			return false;
 		}
-		var obj = yield convertItemHTMLToText(itemID, document.body.innerHTML, maxLength);
-		var text = obj.text;
-		var totalChars = obj.totalChars;
+		var text = document.documentElement.innerText;
+		var totalChars = text.length;
+		var item = Zotero.Items.get(itemID);
+		if (document.contentType == 'text/html') {
+			yield writeCacheFile(item, text, maxLength);
+		}
 		
 		if (totalChars > maxLength) {
 			Zotero.debug('Only indexing first ' + maxLength + ' characters of item '
@ -345,82 +349,15 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 		
 		yield indexString(
 			text,
-			document.characterSet,
 			itemID,
 			{ indexedChars: text.length, totalChars }
 		);
 	});
 	
 	
-	/**
-	 * @param {String} path
-	 * @param {Boolean} [complete=FALSE]  Index the file in its entirety, ignoring maxLength
-	 */
-	var indexFile = Zotero.Promise.coroutine(function* (path, contentType, charset, itemID, complete, stats) {
-		if (!(yield OS.File.exists(path))) {
-			Zotero.debug('File not found in indexFile()', 2);
-			return false;
-		}
-		
-		if (!contentType) {
-			Zotero.debug("Content type not provided in indexFile()", 1);
-			return false;
-		}
-		
-		if (!itemID) {
-			throw new Error('Item ID not provided');
-		}
-		
-		if (contentType == 'application/pdf') {
-			return this.indexPDF(path, itemID, complete);
-		}
-		
-		if (!Zotero.MIME.isTextType(contentType)) {
-			Zotero.debug('File is not text in indexFile()', 2);
-			return false;
-		}
-		
-		if (!charset) {
-			Zotero.logError(`Item ${itemID} didn't have a charset`);
-			return false;
-		}
-		
-		var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
-		if (!maxLength) {
-			return false;
-		}
-		if (complete) {
-			maxLength = null;
-		}
-		
-		Zotero.debug('Indexing file ' + path);
-		var text = yield Zotero.File.getContentsAsync(path, charset);
-		var totalChars = text.length;
-		if (contentType == 'text/html') {
-			let obj = yield convertItemHTMLToText(itemID, text, maxLength);
-			text = obj.text;
-			totalChars = obj.totalChars;
-		}
-		else {
-			if (maxLength && text.length > maxLength) {
-				text = text.substr(0, maxLength);
-			}
-		}
-		
-		// Record the number of characters indexed (unless we're indexing a (PDF) cache file,
-		// in which case the stats are coming from elsewhere)
-		if (!stats) {
-			stats = { indexedChars: text.length, totalChars: totalChars };
-		}
-		yield indexString(text, charset, itemID, stats);
-		
-		return true;
-	}.bind(this));
-	
-	
 	/**
 	 * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
-	 * and .zotero-ft-cache, and pass the text file back to indexFile()
+	 * and .zotero-ft-cache, and pass the text file to indexString()
 	 *
 	 * @param {nsIFile} file
 	 * @param {Number} itemID
@ -494,14 +431,9 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			return false;
 		}
 		
-		yield indexFile(
-			cacheFilePath,
-			'text/plain',
-			'utf-8',
-			itemID,
-			true,
-			{ indexedPages, totalPages }
-		);
+		// The cache file is read asynchronously, so wait for its contents before indexing them
+		var text = yield Zotero.File.getContentsAsync(cacheFilePath);
+		yield indexString(text, itemID, { indexedPages, totalPages });
 		
 		return true;
 	});
@ -554,7 +486,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			}
 			
 			try {
-				await indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete);
+				await indexItem(item, path, complete);
 			}
 			catch (e) {
 				if (ignoreErrors) {
@ -568,6 +500,87 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	};
 	
 	
+	/**
+	 * Index an attachment's file, saving a detected charset to the item if it doesn't have one
+	 * @param {Zotero.Item} item - Attachment item
+	 * @param {String} path - Path of the attachment file
+	 * @param {Boolean} [complete] - Index all of the text rather than the first textMaxLength chars
+	 * @return {Promise<Boolean|undefined>} - false if skipped; for PDFs, the result of indexPDF()
+	 */
+	var indexItem = async function (item, path, complete) {
+		if (!await OS.File.exists(path)) {
+			Zotero.debug(`${path} does not exist in indexItem()`, 2);
+			return false;
+		}
+		
+		var contentType = item.attachmentContentType;
+		var charset = item.attachmentCharset;
+		if (!contentType) {
+			Zotero.debug("No content type in indexItem()", 2);
+			return false;
+		}
+		
+		if (contentType == 'application/pdf') {
+			return this.indexPDF(path, item.id, complete);
+		}
+		
+		if (!Zotero.MIME.isTextType(contentType)) {
+			Zotero.debug('File is not text in indexItem()', 2);
+			return false;
+		}
+		
+		var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
+		if (!maxLength) {
+			Zotero.debug('fulltext.textMaxLength is 0 -- skipping indexing');
+			return false;
+		}
+		
+		Zotero.debug('Indexing file ' + path);
+		var text;
+		// If it's a plain-text file and we know the charset, just get the contents
+		if (contentType == 'text/plain' && charset) {
+			text = await Zotero.File.getContentsAsync(path, charset);
+		}
+		// Otherwise load it in a hidden browser
+		else {
+			let pageData = await getPageData(path);
+			text = pageData.bodyText;
+			if (!charset) {
+				charset = pageData.characterSet;
+			}
+			if (contentType == 'text/html') {
+				await writeCacheFile(item, text, maxLength, complete);
+			}
+			
+			// If the item didn't have a charset assigned and the library is editable, update it now
+			if (charset && !item.attachmentCharset && item.library.editable) {
+				let canonical = Zotero.CharacterSets.toCanonical(charset);
+				let msg = `Character set is ${canonical}`;
+				if (charset != canonical) {
+					msg += ` (detected: ${charset})`;
+					charset = canonical;
+				}
+				Zotero.debug(msg);
+				if (charset) {
+					item.attachmentCharset = charset;
+					await item.saveTx({ skipNotifier: true });
+				}
+			}
+			
+			if (!charset) {
+				Zotero.logError(`Could not detect character set for ${item.libraryKey} -- skipping indexing`);
+				return false;
+			}
+		}
+		
+		var totalChars = text.length;
+		if (!complete) {
+			text = text.substr(0, maxLength);
+		}
+		await indexString(text, item.id, { indexedChars: text.length, totalChars });
+	}.bind(this);
+	
+	
 	// TEMP: Temporary mechanism to serialize indexing of new attachments
 	//
 	// This should instead save the itemID to a table that's read by the content processor
@ -640,7 +653,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			let item = yield Zotero.Items.getAsync(itemID);
 			let libraryKey = item.libraryKey;
 			let contentType = item.attachmentContentType;
-			if (contentType && (isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) {
+			if (contentType && (this.isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) {
 				try {
 					let cacheFile = this.getItemCacheFile(item).path;
 					if (yield OS.File.exists(cacheFile)) {
@ -649,8 +662,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 						content = yield Zotero.File.getContentsAsync(cacheFile);
 					}
 					else {
-						// If there should be a cache file and isn't, mark the full text as missing
-						if (!Zotero.MIME.isTextType(contentType)) {
+						// If a cache file is required, mark the full text as missing
+						if (this.isCachedMIMEType(contentType)) {
 							Zotero.debug("Full-text content cache file doesn't exist for item "
 								+ libraryKey, 2);
 							let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?";
@ -671,21 +684,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 						Zotero.debug("Getting full-text content from file for item " + libraryKey);
 						content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
 						
-						// If HTML, convert to plain text first, and cache the result
-						if (item.attachmentContentType == 'text/html') {
-							let obj = yield convertItemHTMLToText(
-								itemID,
-								content,
-								// Include in the cache file only as many characters as we
-								// indexed previously
-								row.indexedChars
-							);
-							content = obj.text;
-						}
-						else {
-							// Include only as many characters as we've indexed
-							content = content.substr(0, row.indexedChars);
-						}
+						// Include only as many characters as we've indexed
+						content = content.substr(0, row.indexedChars);
 					}
 				}
 				catch (e) {
@ -982,7 +982,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			
 			yield indexString(
 				data.text,
-				"UTF-8",
 				itemID,
 				{
 					indexedChars: data.indexedChars,
@ -1104,9 +1103,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
 			let binaryMode = mode && mode.indexOf('Binary') != -1;
 			
-			if (isCachedMIMEType(mimeType)) {
+			if (this.isCachedMIMEType(mimeType)) {
 				let file = this.getItemCacheFile(item).path;
 				if (!(yield OS.File.exists(file))) {
+					Zotero.debug("No cache file at " + file, 2);
+					// TODO: Index on-demand?
+					// What about a cleared full-text index?
 					continue;
 				}
 				
@ -1122,33 +1124,13 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 					}
 				}
 				
-				// Check for a cache file
-				let cacheFile = this.getItemCacheFile(item).path;
-				if (yield OS.File.exists(cacheFile)) {
-					Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile);
-					content = yield Zotero.File.getContentsAsync(cacheFile, 'utf-8', maxLength);
-				}
-				else {
-					// If that doesn't exist, check for the actual file
-					let path = yield item.getFilePathAsync();
-					if (!path) {
-						continue;
-					}
-					
-					Zotero.debug("Searching for text '" + searchText + "' in " + path);
-					content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
-					
-					// If HTML and not binary mode, convert to text
-					if (mimeType == 'text/html' && !binaryMode) {
-						// Include in the cache file only as many characters as we've indexed
-						let chars = yield getChars(itemID);
-						
-						let obj = yield convertItemHTMLToText(
-							itemID, content, chars ? chars.indexedChars : null
-						);
-						content = obj.text;
-					}
+				let path = yield item.getFilePathAsync();
+				if (!path) {
+					continue;
 				}
+				
+				Zotero.debug("Searching for text '" + searchText + "' in " + path);
+				content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset, maxLength);
 			}
 			
 			let match = findTextInString(content, searchText, mode);
@ -1608,58 +1590,46 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	});
 	
 	
+	/**
+	 * Load a file in a hidden browser and return its character set and body text
+	 * @param {String} path - Path of a local file
+	 * @return {Promise<Object>} - Object with 'characterSet' and 'bodyText' properties
+	 */
+	async function getPageData(path) {
+		const { HiddenBrowser } = ChromeUtils.import("chrome://zotero/content/HiddenBrowser.jsm");
+		let hidden;
+		try {
+			hidden = await HiddenBrowser.create(Zotero.File.pathToFileURI(path));
+			let { characterSet, bodyText } = await HiddenBrowser.getPageData(hidden, ['characterSet', 'bodyText']);
+			return { characterSet, bodyText };
+		}
+		finally {
+			if (hidden) {
+				HiddenBrowser.destroy(hidden);
+			}
+		}
+	}
+	
+	
 	/**
-	 * Convert HTML to text for an item and cache the result
-	 *
-	 * @return {Promise}
+	 * Write the converted text to a cache file
 	 */
-	var convertItemHTMLToText = Zotero.Promise.coroutine(function* (itemID, html, maxLength) {
-		// Split elements to avoid word concatenation
-		html = html.replace(/>/g, '> ');
-		
-		var text = HTMLToText(html);
-		var totalChars = text.length;
-		
-		if (maxLength) {
+	var writeCacheFile = async function (item, text, maxLength, complete) {
+		if (!complete) {
 			text = text.substr(0, maxLength);
 		}
-		
-		// Write the converted text to a cache file
-		var item = yield Zotero.Items.getAsync(itemID);
-		var cacheFile = Zotero.Fulltext.getItemCacheFile(item).path;
-		Zotero.debug("Writing converted full-text HTML content to " + cacheFile);
-		if (!(yield OS.File.exists(OS.Path.dirname(cacheFile)))) {
-			yield Zotero.Attachments.createDirectoryForItem(item);
+		var cacheFile = this.getItemCacheFile(item).path;
+		Zotero.debug("Writing converted full-text content to " + cacheFile);
+		if (!await OS.File.exists(OS.Path.dirname(cacheFile))) {
+			await Zotero.Attachments.createDirectoryForItem(item);
 		}
-		yield Zotero.File.putContentsAsync(cacheFile, text)
-		.catch(function (e) {
-			Zotero.debug(e, 1);
-			Components.utils.reportError(e);
-		});
-		
-		return {
-			text: text,
-			totalChars: totalChars
-		};
-	});
-	
-	function HTMLToText(html) {
-		var	nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1']
-			.createInstance(Components.interfaces.nsIFormatConverter);
-		var from = Components.classes['@mozilla.org/supports-string;1']
-			.createInstance(Components.interfaces.nsISupportsString);
-		from.data = html;
-		var to = { value: null };
 		try {
-			nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {});
-			to = to.value.QueryInterface(Components.interfaces.nsISupportsString);
-			return to.toString();
+			await Zotero.File.putContentsAsync(cacheFile, text);
 		}
-		catch(e) {
-			Zotero.debug(e, 1);
-			return html;
+		catch (e) {
+			Zotero.logError(e);
 		}
-	}
+	}.bind(this);
 	
 	
 	/**
@ -1673,16 +1643,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			return [];
 		}
 		
-		try {
-			if (charset && charset != 'utf-8') {
-				this.converter.charset = charset;
-				text = this.converter.ConvertToUnicode(text);
-			}
-		} catch (err) {
-			Zotero.debug("Error converting from charset " + charset, 1);
-			Zotero.debug(err, 1);
-		}
-		
 		var words = {};
 		var word = '';
 		var cclass = null;
--- a/test/tests/attachmentsTest.js
+++ b/test/tests/attachmentsTest.js
@ -259,8 +259,7 @@ describe("Zotero.Attachments", function() {
 			assert.propertyVal(matches[0], 'id', attachment.id);
 		});
 		
-		// This isn't particularly the behavior we want, but it documents the expected behavior
-		it("shouldn't index JavaScript-created text in an HTML file when the charset isn't known in advance", async function () {
+		it("should index JavaScript-created text in an HTML file", async function () {
 			var item = await createDataObject('item');
 			var file = getTestDataDirectory();
 			file.append('test-js.html');
@ -275,7 +274,8 @@ describe("Zotero.Attachments", function() {
 			assert.equal(attachment.attachmentCharset, 'utf-8');
 			
 			var matches = await Zotero.Fulltext.findTextInItems([attachment.id], 'test');
-			assert.lengthOf(matches, 0);
+			assert.lengthOf(matches, 1);
+			assert.propertyVal(matches[0], 'id', attachment.id);
 		});
 	});