Replace pdftotext and pdfinfo with pdf-worker

2023-03-31 12:48:05 +01:00 · 2023-03-31 12:48:05 +01:00 · bd9a40562f
commit bd9a40562f
parent 3a0731a024
6 changed files with 142 additions and 125 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -36,7 +36,7 @@
 [submodule "pdf-worker"]
 	path = pdf-worker
 	url = https://github.com/zotero/pdf-worker.git
-	branch = master
+	branch = worker2
 [submodule "note-editor"]
 	path = note-editor
 	url = https://github.com/zotero/note-editor.git
--- a/chrome/content/zotero/xpcom/fulltext.js
+++ b/chrome/content/zotero/xpcom/fulltext.js
@ -24,8 +24,7 @@
 */

 Zotero.Fulltext = Zotero.FullText = new function(){
-	this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
-	this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
+	this.__defineGetter__("fulltextCacheFile", function () { return '.zotero-ft-cache'; });

 	this.INDEX_STATE_UNAVAILABLE = 0;
 	this.INDEX_STATE_UNINDEXED = 1;
@ -356,87 +355,48 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	

 	/**
-	 * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
-	 * and .zotero-ft-cache, and pass the text file to indexString()
+	 * Extract a PDF's text via Zotero.PDFWorker, write it to the full-text cache file, and index it
 	 *
-	 * @param {nsIFile} file
+	 * @param {String} filePath
 	 * @param {Number} itemID
 	 * @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
 	 * @return {Promise}
 	 */
-	this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) {
+	this.indexPDF = async function (filePath, itemID, allPages) {
 		var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
 		if (maxPages == 0) {
 			return false;
 		}
-		
-		var item = yield Zotero.Items.getAsync(itemID);
+		var item = await Zotero.Items.getAsync(itemID);
 		var linkMode = item.attachmentLinkMode;
 		// If file is stored outside of Zotero, create a directory for the item
 		// in the storage directory and save the cache file there
 		if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
-			var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item);
+			var parentDirPath = await Zotero.Attachments.createDirectoryForItem(item);
 		}
 		else {
 			var parentDirPath = OS.Path.dirname(filePath);
 		}
-		var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile);
-		var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile);
-		
-
-		var args = [filePath, infoFilePath];
-
+		var cacheFilePath = OS.Path.join(parentDirPath, this.fulltextCacheFile);
 		try {
-			yield Zotero.Utilities.Internal.exec(_pdfInfo, args);
-			var totalPages = yield getTotalPagesFromFile(itemID);
+			var {
+				text,
+				extractedPages,
+				totalPages
+			} = await Zotero.PDFWorker.getFullText(itemID, allPages ? null : maxPages);
 		}
 		catch (e) {
-			Zotero.debug("Error running " + _pdfInfo.path, 1);
-			Zotero.logError(e);
-		}
-
-		
-		var {exec, args} = this.getPDFConverterExecAndArgs();
-		// Keep in sync with Item::attachmentText
-		args.push('-nopgbrk');
-		
-		if (allPages) {
-			if (totalPages) {
-				var indexedPages = totalPages;
-			}
-		}
-		else {
-			args.push('-l', maxPages);
-			var indexedPages = Math.min(maxPages, totalPages);
-		}
-		args.push(filePath, cacheFilePath);
-		
-		try {
-			yield Zotero.Utilities.Internal.exec(exec, args);
-		}
-		catch (e) {
-			Zotero.debug("Error running " + exec.path, 1);
 			Zotero.logError(e);
 			return false;
 		}
-		
-		if (!(yield OS.File.exists(cacheFilePath))) {
-			let fileName = OS.Path.basename(filePath);
-			let msg = fileName + " was not indexed";
-			if (!fileName.match(/^[\u0000-\u007F]+$/)) {
-				msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation";
-			}
-			Zotero.debug(msg, 2);
-			Components.utils.reportError(msg);
+		if (!text || !extractedPages) {
 			return false;
 		}
-		
-		var text = Zotero.File.getContentsAsync(cacheFilePath);
-		var stats = { indexedPages, totalPages };
-		yield indexString(text, itemID, stats);
-		
+		await Zotero.File.putContentsAsync(cacheFilePath, text);
+		var stats = { indexedPages: extractedPages, totalPages };
+		await indexString(text, itemID, stats);
 		return true;
-	});
+	};
 	
 	
 	/**
@ -1213,33 +1173,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	}


-	/**
-	 * Gets the number of pages from the PDF info cache file
-	 *
-	 * @private
-	 * @return {Promise}
-	 */
-	var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) {
-		var file = OS.Path.join(
-			Zotero.Attachments.getStorageDirectoryByID(itemID).path,
-			Zotero.Fulltext.pdfInfoCacheFile
-		);
-		if (!(yield OS.File.exists(file))) {
-			return false;
-		}
-		var contents = yield Zotero.File.getContentsAsync(file);
-		try {
-			// Parse pdfinfo output
-			var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1];
-		}
-		catch (e) {
-			Zotero.debug(e);
-			return false;
-		}
-		return pages;
-	});
-	
-	
 	/**
 	 * @return {Promise}
 	 */
@ -1261,7 +1194,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			case 'application/pdf':
 				var file = OS.Path.join(
 					Zotero.Attachments.getStorageDirectory(item).path,
-					this.pdfConverterCacheFile
+					this.fulltextCacheFile
 				);
 				if (!(yield OS.File.exists(file))) {
 					return false;
@ -1412,7 +1345,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	
 	this.getItemCacheFile = function (item) {
 		var cacheFile = Zotero.Attachments.getStorageDirectory(item);
-		cacheFile.append(this.pdfConverterCacheFile);
+		cacheFile.append(this.fulltextCacheFile);
 		return cacheFile;
 	}
 	
--- a/chrome/content/zotero/xpcom/pdfWorker/manager.js
+++ b/chrome/content/zotero/xpcom/pdfWorker/manager.js
@ -24,7 +24,8 @@
 */

 const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js';
-const CMAPS_URL = 'resource://zotero/pdf-reader/cmaps/';
+const CMAPS_URL = 'chrome://zotero/content/xpcom/pdfWorker/cmaps/';
+const STANDARD_FONTS_URL = 'chrome://zotero/content/xpcom/pdfWorker/standard_fonts/';
 const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html';

 class PDFWorker {
@ -55,8 +56,8 @@ class PDFWorker {
 			}
 		}
 		this._processingQueue = false;
-		this._worker.terminate();
-		this._worker = null;
+		// this._worker.terminate();
+		// this._worker = null;
 	}

 	async _enqueue(fn, isPriority) {
@ -114,6 +115,20 @@ class PDFWorker {
 					Zotero.debug('Failed to fetch CMap data:');
 					Zotero.debug(e);
 				}
+				try {
+					if (message.action === 'FetchStandardFontData') {
+						let response = await Zotero.HTTP.request(
+							'GET',
+							STANDARD_FONTS_URL + message.data,
+							{ responseType: 'arraybuffer' }
+						);
+						respData = new Uint8Array(response.response);
+					}
+				}
+				catch (e) {
+					Zotero.debug('Failed to fetch standard font data:');
+					Zotero.debug(e);
+				}
 				this._worker.postMessage({ responseID: event.data.id, data: respData });
 			}
 		});
@ -578,6 +593,97 @@ class PDFWorker {
 			Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`);
 		}, isPriority);
 	}
+
+	/**
+	 * Get fulltext
+	 *
+	 * @param {Integer} itemID Attachment item id
+	 * @param {Integer|null} maxPages Number of pages to extract, or null to extract all pages
+	 * @param {Boolean} [isPriority]
+	 * @param {String} [password]
+	 * @returns {Promise}
+	 */
+	async getFullText(itemID, maxPages, isPriority, password) {
+		return this._enqueue(async () => {
+			let attachment = await Zotero.Items.getAsync(itemID);
+
+			Zotero.debug(`Getting fulltext content from item ${attachment.libraryKey}`);
+			let t = new Date();
+
+			if (!attachment.isPDFAttachment()) {
+				throw new Error('Item must be a PDF attachment');
+			}
+
+			let path = await attachment.getFilePathAsync();
+			let buf = await OS.File.read(path, {});
+			buf = new Uint8Array(buf).buffer;
+
+			try {
+				var result = await this._query('getFulltext', {
+					buf, maxPages, password
+				}, [buf]);
+			}
+			catch (e) {
+				let error = new Error(`Worker 'getFullText' failed: ${JSON.stringify({ error: e.message })}`);
+				try {
+					error.name = JSON.parse(e.message).name;
+				}
+				catch (e) {
+					Zotero.logError(e);
+				}
+				Zotero.logError(error);
+				throw error;
+			}
+
+			Zotero.debug(`Extracted full text for item ${attachment.libraryKey} in ${new Date() - t} ms`);
+
+			return result;
+		}, isPriority);
+	}
+
+	/**
+	 * Get data for recognizer-server
+	 *
+	 * @param {Integer} itemID Attachment item id
+	 * @param {Boolean} [isPriority]
+	 * @param {String} [password]
+	 * @returns {Promise}
+	 */
+	async getRecognizerData(itemID, isPriority, password) {
+		return this._enqueue(async () => {
+			let attachment = await Zotero.Items.getAsync(itemID);
+
+			Zotero.debug(`Getting PDF recognizer data from item ${attachment.libraryKey}`);
+			let t = new Date();
+
+			if (!attachment.isPDFAttachment()) {
+				throw new Error('Item must be a PDF attachment');
+			}
+
+			let path = await attachment.getFilePathAsync();
+			let buf = await OS.File.read(path, {});
+			buf = new Uint8Array(buf).buffer;
+
+			try {
+				var result = await this._query('getRecognizerData', { buf, password }, [buf]);
+			}
+			catch (e) {
+				let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`);
+				try {
+					error.name = JSON.parse(e.message).name;
+				}
+				catch (e) {
+					Zotero.logError(e);
+				}
+				Zotero.logError(error);
+				throw error;
+			}
+
+			Zotero.debug(`Extracted PDF recognizer data for item ${attachment.libraryKey} in ${new Date() - t} ms`);
+
+			return result;
+		}, isPriority);
+	}
 }

 Zotero.PDFWorker = new PDFWorker();
--- a/chrome/content/zotero/xpcom/recognizePDF.js
+++ b/chrome/content/zotero/xpcom/recognizePDF.js
@ -223,7 +223,7 @@ Zotero.RecognizePDF = new function () {
 		}
 		
 		var version = Zotero.version;
-		var json = await extractJSON(filePath, MAX_PAGES);
+		var json = await extractJSON(attachment.id);
 		var metadata = item.toJSON();
 		
 		var data = { description, version, json, metadata };
@ -323,39 +323,16 @@ Zotero.RecognizePDF = new function () {
 	}
 	
 	/**
-	 * Get json from a PDF
-	 * @param {String} filePath PDF file path
-	 * @param {Number} pages Number of pages to extract
+	 * Get recognizer data from PDF file
+	 * @param {Number} itemID Attachment item id
 	 * @return {Promise}
 	 */
-	async function extractJSON(filePath, pages) {
-		let cacheFile = Zotero.getTempDirectory();
-		cacheFile.append("recognizePDFcache.txt");
-		if (cacheFile.exists()) {
-			cacheFile.remove(false);
-		}
-		
-		let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
-		args.push('-json', '-l', pages, filePath, cacheFile.path);
-		
-		Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
-		
+	async function extractJSON(itemID) {
 		try {
-			await Zotero.Utilities.Internal.exec(exec, args);
-			let content = await Zotero.File.getContentsAsync(cacheFile.path);
-			Zotero.debug("RecognizePDF: Extracted JSON:");
-			Zotero.debug(content);
-			cacheFile.remove(false);
-			return JSON.parse(content);
+			return await Zotero.PDFWorker.getRecognizerData(itemID, true);
 		}
 		catch (e) {
 			Zotero.logError(e);
-			try {
-				cacheFile.remove(false);
-			}
-			catch (e) {
-				Zotero.logError(e);
-			}
 			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
 		}
 	}
@ -416,7 +393,7 @@ Zotero.RecognizePDF = new function () {
 		
 		if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');

-		let json = await extractJSON(filePath, MAX_PAGES);
+		let json = await extractJSON(item.id);
 		json.fileName = OS.Path.basename(filePath);
 		
 		let containingTextPages = 0;
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 4456d0eeacb9ef8a276adba61410b2c4620bc00d
+Subproject commit caa9f27a000e3a17fb59f86ca2736f035f296267
--- a/scripts/pdf-worker.js
+++ b/scripts/pdf-worker.js
@ -35,7 +35,8 @@ async function getPDFWorker(signatures) {
 		catch (e) {
 			await exec('npm ci', { cwd: modulePath });
 			await exec('npm run build', { cwd: modulePath });
-			await fs.copy(path.join(modulePath, 'build', 'worker.js'), path.join(targetDir, 'worker.js'));
+			// TODO: Don't copy 'cmaps' and 'standard_fonts' directories once pdf-reader is updated
+			await fs.copy(path.join(modulePath, 'build'), targetDir);
 		}
 		signatures['pdf-worker'] = { hash };
 	}