From bd9a40562fe341b6891a7458bb1e7efb0fee6ea5 Mon Sep 17 00:00:00 2001 From: Martynas Bagdonas Date: Fri, 31 Mar 2023 12:48:05 +0100 Subject: [PATCH] Replace pdftotext and pdfinfo with pdf-worker --- .gitmodules | 2 +- chrome/content/zotero/xpcom/fulltext.js | 113 ++++-------------- .../content/zotero/xpcom/pdfWorker/manager.js | 112 ++++++++++++++++- chrome/content/zotero/xpcom/recognizePDF.js | 35 +----- pdf-worker | 2 +- scripts/pdf-worker.js | 3 +- 6 files changed, 142 insertions(+), 125 deletions(-) diff --git a/.gitmodules b/.gitmodules index 06d169ba53..9883f028bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -36,7 +36,7 @@ [submodule "pdf-worker"] path = pdf-worker url = https://github.com/zotero/pdf-worker.git - branch = master + branch = worker2 [submodule "note-editor"] path = note-editor url = https://github.com/zotero/note-editor.git diff --git a/chrome/content/zotero/xpcom/fulltext.js b/chrome/content/zotero/xpcom/fulltext.js index 83aee5d3ea..1be619968d 100644 --- a/chrome/content/zotero/xpcom/fulltext.js +++ b/chrome/content/zotero/xpcom/fulltext.js @@ -24,9 +24,8 @@ */ Zotero.Fulltext = Zotero.FullText = new function(){ - this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; }); - this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; }); - + this.__defineGetter__("fulltextCacheFile", function () { return '.zotero-ft-cache'; }); + this.INDEX_STATE_UNAVAILABLE = 0; this.INDEX_STATE_UNINDEXED = 1; this.INDEX_STATE_PARTIAL = 2; @@ -354,89 +353,50 @@ Zotero.Fulltext = Zotero.FullText = new function(){ ); }); - + /** - * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info - * and .zotero-ft-cache, and pass the text file to indexString() + * Index PDF file and store the fulltext content in a file * - * @param {nsIFile} file + * @param {nsIFile} filePath * @param {Number} itemID * @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages * @return {Promise} */ - this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) { + this.indexPDF = async function (filePath, itemID, allPages) { var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages'); if (maxPages == 0) { return false; } - - var item = yield Zotero.Items.getAsync(itemID); + var item = await Zotero.Items.getAsync(itemID); var linkMode = item.attachmentLinkMode; // If file is stored outside of Zotero, create a directory for the item // in the storage directory and save the cache file there if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) { - var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item); + var parentDirPath = await Zotero.Attachments.createDirectoryForItem(item); } else { var parentDirPath = OS.Path.dirname(filePath); } - var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile); - var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile); - - - var args = [filePath, infoFilePath]; - + var cacheFilePath = OS.Path.join(parentDirPath, this.fulltextCacheFile); try { - yield Zotero.Utilities.Internal.exec(_pdfInfo, args); - var totalPages = yield getTotalPagesFromFile(itemID); + var { + text, + extractedPages, + totalPages + } = await Zotero.PDFWorker.getFullText(itemID, allPages ? null : maxPages); } catch (e) { - Zotero.debug("Error running " + _pdfInfo.path, 1); - Zotero.logError(e); - } - - - var {exec, args} = this.getPDFConverterExecAndArgs(); - // Keep in sync with Item::attachmentText - args.push('-nopgbrk'); - - if (allPages) { - if (totalPages) { - var indexedPages = totalPages; - } - } - else { - args.push('-l', maxPages); - var indexedPages = Math.min(maxPages, totalPages); - } - args.push(filePath, cacheFilePath); - - try { - yield Zotero.Utilities.Internal.exec(exec, args); - } - catch (e) { - Zotero.debug("Error running " + exec.path, 1); Zotero.logError(e); return false; } - - if (!(yield OS.File.exists(cacheFilePath))) { - let fileName = OS.Path.basename(filePath); - let msg = fileName + " was not indexed"; - if (!fileName.match(/^[\u0000-\u007F]+$/)) { - msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation"; - } - Zotero.debug(msg, 2); - Components.utils.reportError(msg); + if (!text || !extractedPages) { return false; } - - var text = Zotero.File.getContentsAsync(cacheFilePath); - var stats = { indexedPages, totalPages }; - yield indexString(text, itemID, stats); - + await Zotero.File.putContentsAsync(cacheFilePath, text); + var stats = { indexedPages: extractedPages, totalPages }; + await indexString(text, itemID, stats); return true; - }); + }; /** @@ -1211,35 +1171,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){ + "FROM fulltextItems WHERE itemID=?"; return Zotero.DB.rowQueryAsync(sql, itemID); } - - - /** - * Gets the number of pages from the PDF info cache file - * - * @private - * @return {Promise} - */ - var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) { - var file = OS.Path.join( - Zotero.Attachments.getStorageDirectoryByID(itemID).path, - Zotero.Fulltext.pdfInfoCacheFile - ); - if (!(yield OS.File.exists(file))) { - return false; - } - var contents = yield Zotero.File.getContentsAsync(file); - try { - // Parse pdfinfo output - var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1]; - } - catch (e) { - Zotero.debug(e); - return false; - } - return pages; - }); - - + + /** * @return {Promise} */ @@ -1261,7 +1194,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){ case 'application/pdf': var file = OS.Path.join( Zotero.Attachments.getStorageDirectory(item).path, - this.pdfConverterCacheFile + this.fulltextCacheFile ); if (!(yield OS.File.exists(file))) { return false; @@ -1412,7 +1345,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){ this.getItemCacheFile = function (item) { var cacheFile = Zotero.Attachments.getStorageDirectory(item); - cacheFile.append(this.pdfConverterCacheFile); + cacheFile.append(this.fulltextCacheFile); return cacheFile; } diff --git a/chrome/content/zotero/xpcom/pdfWorker/manager.js b/chrome/content/zotero/xpcom/pdfWorker/manager.js index 14dadc16ac..bcd6785219 100644 --- a/chrome/content/zotero/xpcom/pdfWorker/manager.js +++ b/chrome/content/zotero/xpcom/pdfWorker/manager.js @@ -24,7 +24,8 @@ */ const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js'; -const CMAPS_URL = 'resource://zotero/pdf-reader/cmaps/'; +const CMAPS_URL = 'chrome://zotero/content/xpcom/pdfWorker/cmaps/'; +const STANDARD_FONTS_URL = 'chrome://zotero/content/xpcom/pdfWorker/standard_fonts/'; const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html'; class PDFWorker { @@ -55,8 +56,8 @@ class PDFWorker { } } this._processingQueue = false; - this._worker.terminate(); - this._worker = null; + // this._worker.terminate(); + // this._worker = null; } async _enqueue(fn, isPriority) { @@ -114,6 +115,20 @@ class PDFWorker { Zotero.debug('Failed to fetch CMap data:'); Zotero.debug(e); } + try { + if (message.action === 'FetchStandardFontData') { + let response = await Zotero.HTTP.request( + 'GET', + STANDARD_FONTS_URL + message.data, + { responseType: 'arraybuffer' } + ); + respData = new Uint8Array(response.response); + } + } + catch (e) { + Zotero.debug('Failed to fetch standard font data:'); + Zotero.debug(e); + } this._worker.postMessage({ responseID: event.data.id, data: respData }); } }); @@ -578,6 +593,97 @@ class PDFWorker { Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`); }, isPriority); } + + /** + * Get fulltext + * + * @param {Integer} itemID Attachment item id + * @param {Integer|null} maxPages Pages count to extract, or all pages if 'null' + * @param {Boolean} [isPriority] + * @param {String} [password] + * @returns {Promise} + */ + async getFullText(itemID, maxPages, isPriority, password) { + return this._enqueue(async () => { + let attachment = await Zotero.Items.getAsync(itemID); + + Zotero.debug(`Getting fulltext content from item ${attachment.libraryKey}`); + let t = new Date(); + + if (!attachment.isPDFAttachment()) { + throw new Error('Item must be a PDF attachment'); + } + + let path = await attachment.getFilePathAsync(); + let buf = await OS.File.read(path, {}); + buf = new Uint8Array(buf).buffer; + + try { + var result = await this._query('getFulltext', { + buf, maxPages, password + }, [buf]); + } + catch (e) { + let error = new Error(`Worker 'getFullText' failed: ${JSON.stringify({ error: e.message })}`); + try { + error.name = JSON.parse(e.message).name; + } + catch (e) { + Zotero.logError(e); + } + Zotero.logError(error); + throw error; + } + + Zotero.debug(`Extracted full text for item ${attachment.libraryKey} in ${new Date() - t} ms`); + + return result; + }, isPriority); + } + + /** + * Get data for recognizer-server + * + * @param {Integer} itemID Attachment item id + * @param {Boolean} [isPriority] + * @param {String} [password] + * @returns {Promise} + */ + async getRecognizerData(itemID, isPriority, password) { + return this._enqueue(async () => { + let attachment = await Zotero.Items.getAsync(itemID); + + Zotero.debug(`Getting PDF recognizer data from item ${attachment.libraryKey}`); + let t = new Date(); + + if (!attachment.isPDFAttachment()) { + throw new Error('Item must be a PDF attachment'); + } + + let path = await attachment.getFilePathAsync(); + let buf = await OS.File.read(path, {}); + buf = new Uint8Array(buf).buffer; + + try { + var result = await this._query('getRecognizerData', { buf, password }, [buf]); + } + catch (e) { + let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`); + try { + error.name = JSON.parse(e.message).name; + } + catch (e) { + Zotero.logError(e); + } + Zotero.logError(error); + throw error; + } + + Zotero.debug(`Extracted PDF recognizer data for item ${attachment.libraryKey} in ${new Date() - t} ms`); + + return result; + }, isPriority); + } } Zotero.PDFWorker = new PDFWorker(); diff --git a/chrome/content/zotero/xpcom/recognizePDF.js b/chrome/content/zotero/xpcom/recognizePDF.js index 12f89e3026..1094bacd39 100644 --- a/chrome/content/zotero/xpcom/recognizePDF.js +++ b/chrome/content/zotero/xpcom/recognizePDF.js @@ -223,7 +223,7 @@ Zotero.RecognizePDF = new function () { } var version = Zotero.version; - var json = await extractJSON(filePath, MAX_PAGES); + var json = await extractJSON(attachment.id); var metadata = item.toJSON(); var data = { description, version, json, metadata }; @@ -323,39 +323,16 @@ Zotero.RecognizePDF = new function () { } /** - * Get json from a PDF - * @param {String} filePath PDF file path - * @param {Number} pages Number of pages to extract + * Get recognizer data from PDF file + * @param {Number} itemID Attachment item id * @return {Promise} */ - async function extractJSON(filePath, pages) { - let cacheFile = Zotero.getTempDirectory(); - cacheFile.append("recognizePDFcache.txt"); - if (cacheFile.exists()) { - cacheFile.remove(false); - } - - let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs(); - args.push('-json', '-l', pages, filePath, cacheFile.path); - - Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" ")); - + async function extractJSON(itemID) { try { - await Zotero.Utilities.Internal.exec(exec, args); - let content = await Zotero.File.getContentsAsync(cacheFile.path); - Zotero.debug("RecognizePDF: Extracted JSON:"); - Zotero.debug(content); - cacheFile.remove(false); - return JSON.parse(content); + return await Zotero.PDFWorker.getRecognizerData(itemID, true); } catch (e) { Zotero.logError(e); - try { - cacheFile.remove(false); - } - catch (e) { - Zotero.logError(e); - } throw new Zotero.Exception.Alert("recognizePDF.couldNotRead"); } } @@ -416,7 +393,7 @@ Zotero.RecognizePDF = new function () { if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound'); - let json = await extractJSON(filePath, MAX_PAGES); + let json = await extractJSON(item.id); json.fileName = OS.Path.basename(filePath); let containingTextPages = 0; diff --git a/pdf-worker b/pdf-worker index 4456d0eeac..caa9f27a00 160000 --- a/pdf-worker +++ b/pdf-worker @@ -1 +1 @@ -Subproject commit 4456d0eeacb9ef8a276adba61410b2c4620bc00d +Subproject commit caa9f27a000e3a17fb59f86ca2736f035f296267 diff --git a/scripts/pdf-worker.js b/scripts/pdf-worker.js index 9aef082022..484fc4ad4c 100644 --- a/scripts/pdf-worker.js +++ b/scripts/pdf-worker.js @@ -35,7 +35,8 @@ async function getPDFWorker(signatures) { catch (e) { await exec('npm ci', { cwd: modulePath }); await exec('npm run build', { cwd: modulePath }); - await fs.copy(path.join(modulePath, 'build', 'worker.js'), path.join(targetDir, 'worker.js')); + // TODO: Don't copy 'cmaps' and 'standard_fonts' directories once pdf-reader is updated + await fs.copy(path.join(modulePath, 'build'), targetDir); } signatures['pdf-worker'] = { hash }; }