Replace pdftotext and pdfinfo with pdf-worker

This commit is contained in:
Martynas Bagdonas 2023-03-31 12:48:05 +01:00
parent 3a0731a024
commit bd9a40562f
6 changed files with 142 additions and 125 deletions

2
.gitmodules vendored
View file

@ -36,7 +36,7 @@
[submodule "pdf-worker"] [submodule "pdf-worker"]
path = pdf-worker path = pdf-worker
url = https://github.com/zotero/pdf-worker.git url = https://github.com/zotero/pdf-worker.git
branch = master branch = worker2
[submodule "note-editor"] [submodule "note-editor"]
path = note-editor path = note-editor
url = https://github.com/zotero/note-editor.git url = https://github.com/zotero/note-editor.git

View file

@ -24,9 +24,8 @@
*/ */
Zotero.Fulltext = Zotero.FullText = new function(){ Zotero.Fulltext = Zotero.FullText = new function(){
this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; }); this.__defineGetter__("fulltextCacheFile", function () { return '.zotero-ft-cache'; });
this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
this.INDEX_STATE_UNAVAILABLE = 0; this.INDEX_STATE_UNAVAILABLE = 0;
this.INDEX_STATE_UNINDEXED = 1; this.INDEX_STATE_UNINDEXED = 1;
this.INDEX_STATE_PARTIAL = 2; this.INDEX_STATE_PARTIAL = 2;
@ -354,89 +353,50 @@ Zotero.Fulltext = Zotero.FullText = new function(){
); );
}); });
/** /**
* Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info * Index PDF file and store the fulltext content in a file
* and .zotero-ft-cache, and pass the text file to indexString()
* *
* @param {nsIFile} file * @param {nsIFile} filePath
* @param {Number} itemID * @param {Number} itemID
* @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages * @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
* @return {Promise} * @return {Promise}
*/ */
this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) { this.indexPDF = async function (filePath, itemID, allPages) {
var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages'); var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
if (maxPages == 0) { if (maxPages == 0) {
return false; return false;
} }
var item = await Zotero.Items.getAsync(itemID);
var item = yield Zotero.Items.getAsync(itemID);
var linkMode = item.attachmentLinkMode; var linkMode = item.attachmentLinkMode;
// If file is stored outside of Zotero, create a directory for the item // If file is stored outside of Zotero, create a directory for the item
// in the storage directory and save the cache file there // in the storage directory and save the cache file there
if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) { if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item); var parentDirPath = await Zotero.Attachments.createDirectoryForItem(item);
} }
else { else {
var parentDirPath = OS.Path.dirname(filePath); var parentDirPath = OS.Path.dirname(filePath);
} }
var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile); var cacheFilePath = OS.Path.join(parentDirPath, this.fulltextCacheFile);
var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile);
var args = [filePath, infoFilePath];
try { try {
yield Zotero.Utilities.Internal.exec(_pdfInfo, args); var {
var totalPages = yield getTotalPagesFromFile(itemID); text,
extractedPages,
totalPages
} = await Zotero.PDFWorker.getFullText(itemID, allPages ? null : maxPages);
} }
catch (e) { catch (e) {
Zotero.debug("Error running " + _pdfInfo.path, 1);
Zotero.logError(e);
}
var {exec, args} = this.getPDFConverterExecAndArgs();
// Keep in sync with Item::attachmentText
args.push('-nopgbrk');
if (allPages) {
if (totalPages) {
var indexedPages = totalPages;
}
}
else {
args.push('-l', maxPages);
var indexedPages = Math.min(maxPages, totalPages);
}
args.push(filePath, cacheFilePath);
try {
yield Zotero.Utilities.Internal.exec(exec, args);
}
catch (e) {
Zotero.debug("Error running " + exec.path, 1);
Zotero.logError(e); Zotero.logError(e);
return false; return false;
} }
if (!text || !extractedPages) {
if (!(yield OS.File.exists(cacheFilePath))) {
let fileName = OS.Path.basename(filePath);
let msg = fileName + " was not indexed";
if (!fileName.match(/^[\u0000-\u007F]+$/)) {
msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation";
}
Zotero.debug(msg, 2);
Components.utils.reportError(msg);
return false; return false;
} }
await Zotero.File.putContentsAsync(cacheFilePath, text);
var text = Zotero.File.getContentsAsync(cacheFilePath); var stats = { indexedPages: extractedPages, totalPages };
var stats = { indexedPages, totalPages }; await indexString(text, itemID, stats);
yield indexString(text, itemID, stats);
return true; return true;
}); };
/** /**
@ -1211,35 +1171,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
+ "FROM fulltextItems WHERE itemID=?"; + "FROM fulltextItems WHERE itemID=?";
return Zotero.DB.rowQueryAsync(sql, itemID); return Zotero.DB.rowQueryAsync(sql, itemID);
} }
/**
* Gets the number of pages from the PDF info cache file
*
* @private
* @return {Promise}
*/
var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) {
var file = OS.Path.join(
Zotero.Attachments.getStorageDirectoryByID(itemID).path,
Zotero.Fulltext.pdfInfoCacheFile
);
if (!(yield OS.File.exists(file))) {
return false;
}
var contents = yield Zotero.File.getContentsAsync(file);
try {
// Parse pdfinfo output
var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1];
}
catch (e) {
Zotero.debug(e);
return false;
}
return pages;
});
/** /**
* @return {Promise} * @return {Promise}
*/ */
@ -1261,7 +1194,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
case 'application/pdf': case 'application/pdf':
var file = OS.Path.join( var file = OS.Path.join(
Zotero.Attachments.getStorageDirectory(item).path, Zotero.Attachments.getStorageDirectory(item).path,
this.pdfConverterCacheFile this.fulltextCacheFile
); );
if (!(yield OS.File.exists(file))) { if (!(yield OS.File.exists(file))) {
return false; return false;
@ -1412,7 +1345,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
this.getItemCacheFile = function (item) { this.getItemCacheFile = function (item) {
var cacheFile = Zotero.Attachments.getStorageDirectory(item); var cacheFile = Zotero.Attachments.getStorageDirectory(item);
cacheFile.append(this.pdfConverterCacheFile); cacheFile.append(this.fulltextCacheFile);
return cacheFile; return cacheFile;
} }

View file

@ -24,7 +24,8 @@
*/ */
const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js'; const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js';
const CMAPS_URL = 'resource://zotero/pdf-reader/cmaps/'; const CMAPS_URL = 'chrome://zotero/content/xpcom/pdfWorker/cmaps/';
const STANDARD_FONTS_URL = 'chrome://zotero/content/xpcom/pdfWorker/standard_fonts/';
const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html'; const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html';
class PDFWorker { class PDFWorker {
@ -55,8 +56,8 @@ class PDFWorker {
} }
} }
this._processingQueue = false; this._processingQueue = false;
this._worker.terminate(); // this._worker.terminate();
this._worker = null; // this._worker = null;
} }
async _enqueue(fn, isPriority) { async _enqueue(fn, isPriority) {
@ -114,6 +115,20 @@ class PDFWorker {
Zotero.debug('Failed to fetch CMap data:'); Zotero.debug('Failed to fetch CMap data:');
Zotero.debug(e); Zotero.debug(e);
} }
try {
if (message.action === 'FetchStandardFontData') {
let response = await Zotero.HTTP.request(
'GET',
STANDARD_FONTS_URL + message.data,
{ responseType: 'arraybuffer' }
);
respData = new Uint8Array(response.response);
}
}
catch (e) {
Zotero.debug('Failed to fetch standard font data:');
Zotero.debug(e);
}
this._worker.postMessage({ responseID: event.data.id, data: respData }); this._worker.postMessage({ responseID: event.data.id, data: respData });
} }
}); });
@ -578,6 +593,97 @@ class PDFWorker {
Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`); Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`);
}, isPriority); }, isPriority);
} }
/**
* Get fulltext
*
* @param {Integer} itemID Attachment item id
* @param {Integer|null} maxPages Pages count to extract, or all pages if 'null'
* @param {Boolean} [isPriority]
* @param {String} [password]
* @returns {Promise}
*/
async getFullText(itemID, maxPages, isPriority, password) {
return this._enqueue(async () => {
let attachment = await Zotero.Items.getAsync(itemID);
Zotero.debug(`Getting fulltext content from item ${attachment.libraryKey}`);
let t = new Date();
if (!attachment.isPDFAttachment()) {
throw new Error('Item must be a PDF attachment');
}
let path = await attachment.getFilePathAsync();
let buf = await OS.File.read(path, {});
buf = new Uint8Array(buf).buffer;
try {
var result = await this._query('getFulltext', {
buf, maxPages, password
}, [buf]);
}
catch (e) {
let error = new Error(`Worker 'getFullText' failed: ${JSON.stringify({ error: e.message })}`);
try {
error.name = JSON.parse(e.message).name;
}
catch (e) {
Zotero.logError(e);
}
Zotero.logError(error);
throw error;
}
Zotero.debug(`Extracted full text for item ${attachment.libraryKey} in ${new Date() - t} ms`);
return result;
}, isPriority);
}
/**
* Get data for recognizer-server
*
* @param {Integer} itemID Attachment item id
* @param {Boolean} [isPriority]
* @param {String} [password]
* @returns {Promise}
*/
async getRecognizerData(itemID, isPriority, password) {
return this._enqueue(async () => {
let attachment = await Zotero.Items.getAsync(itemID);
Zotero.debug(`Getting PDF recognizer data from item ${attachment.libraryKey}`);
let t = new Date();
if (!attachment.isPDFAttachment()) {
throw new Error('Item must be a PDF attachment');
}
let path = await attachment.getFilePathAsync();
let buf = await OS.File.read(path, {});
buf = new Uint8Array(buf).buffer;
try {
var result = await this._query('getRecognizerData', { buf, password }, [buf]);
}
catch (e) {
let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`);
try {
error.name = JSON.parse(e.message).name;
}
catch (e) {
Zotero.logError(e);
}
Zotero.logError(error);
throw error;
}
Zotero.debug(`Extracted PDF recognizer data for item ${attachment.libraryKey} in ${new Date() - t} ms`);
return result;
}, isPriority);
}
} }
Zotero.PDFWorker = new PDFWorker(); Zotero.PDFWorker = new PDFWorker();

View file

@ -223,7 +223,7 @@ Zotero.RecognizePDF = new function () {
} }
var version = Zotero.version; var version = Zotero.version;
var json = await extractJSON(filePath, MAX_PAGES); var json = await extractJSON(attachment.id);
var metadata = item.toJSON(); var metadata = item.toJSON();
var data = { description, version, json, metadata }; var data = { description, version, json, metadata };
@ -323,39 +323,16 @@ Zotero.RecognizePDF = new function () {
} }
/** /**
* Get json from a PDF * Get recognizer data from PDF file
* @param {String} filePath PDF file path * @param {Number} itemID Attachment item id
* @param {Number} pages Number of pages to extract
* @return {Promise} * @return {Promise}
*/ */
async function extractJSON(filePath, pages) { async function extractJSON(itemID) {
let cacheFile = Zotero.getTempDirectory();
cacheFile.append("recognizePDFcache.txt");
if (cacheFile.exists()) {
cacheFile.remove(false);
}
let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
args.push('-json', '-l', pages, filePath, cacheFile.path);
Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
try { try {
await Zotero.Utilities.Internal.exec(exec, args); return await Zotero.PDFWorker.getRecognizerData(itemID, true);
let content = await Zotero.File.getContentsAsync(cacheFile.path);
Zotero.debug("RecognizePDF: Extracted JSON:");
Zotero.debug(content);
cacheFile.remove(false);
return JSON.parse(content);
} }
catch (e) { catch (e) {
Zotero.logError(e); Zotero.logError(e);
try {
cacheFile.remove(false);
}
catch (e) {
Zotero.logError(e);
}
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead"); throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
} }
} }
@ -416,7 +393,7 @@ Zotero.RecognizePDF = new function () {
if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound'); if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
let json = await extractJSON(filePath, MAX_PAGES); let json = await extractJSON(item.id);
json.fileName = OS.Path.basename(filePath); json.fileName = OS.Path.basename(filePath);
let containingTextPages = 0; let containingTextPages = 0;

@ -1 +1 @@
Subproject commit 4456d0eeacb9ef8a276adba61410b2c4620bc00d Subproject commit caa9f27a000e3a17fb59f86ca2736f035f296267

View file

@ -35,7 +35,8 @@ async function getPDFWorker(signatures) {
catch (e) { catch (e) {
await exec('npm ci', { cwd: modulePath }); await exec('npm ci', { cwd: modulePath });
await exec('npm run build', { cwd: modulePath }); await exec('npm run build', { cwd: modulePath });
await fs.copy(path.join(modulePath, 'build', 'worker.js'), path.join(targetDir, 'worker.js')); // TODO: Don't copy 'cmaps' and 'standard_fonts' directories once pdf-reader is updated
await fs.copy(path.join(modulePath, 'build'), targetDir);
} }
signatures['pdf-worker'] = { hash }; signatures['pdf-worker'] = { hash };
} }