Replace pdftotext and pdfinfo with pdf-worker
This commit is contained in:
parent
3a0731a024
commit
bd9a40562f
6 changed files with 142 additions and 125 deletions
2
.gitmodules
vendored
2
.gitmodules
vendored
|
@ -36,7 +36,7 @@
|
||||||
[submodule "pdf-worker"]
|
[submodule "pdf-worker"]
|
||||||
path = pdf-worker
|
path = pdf-worker
|
||||||
url = https://github.com/zotero/pdf-worker.git
|
url = https://github.com/zotero/pdf-worker.git
|
||||||
branch = master
|
branch = worker2
|
||||||
[submodule "note-editor"]
|
[submodule "note-editor"]
|
||||||
path = note-editor
|
path = note-editor
|
||||||
url = https://github.com/zotero/note-editor.git
|
url = https://github.com/zotero/note-editor.git
|
||||||
|
|
|
@ -24,8 +24,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
Zotero.Fulltext = Zotero.FullText = new function(){
|
Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
|
this.__defineGetter__("fulltextCacheFile", function () { return '.zotero-ft-cache'; });
|
||||||
this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
|
|
||||||
|
|
||||||
this.INDEX_STATE_UNAVAILABLE = 0;
|
this.INDEX_STATE_UNAVAILABLE = 0;
|
||||||
this.INDEX_STATE_UNINDEXED = 1;
|
this.INDEX_STATE_UNINDEXED = 1;
|
||||||
|
@ -356,87 +355,48 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
|
* Index a PDF file and store its extracted full-text content in the cache file
|
||||||
* and .zotero-ft-cache, and pass the text file to indexString()
|
|
||||||
*
|
*
|
||||||
* @param {nsIFile} file
|
* @param {String} filePath
|
||||||
* @param {Number} itemID
|
* @param {Number} itemID
|
||||||
* @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
|
* @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
|
||||||
* @return {Promise}
|
* @return {Promise}
|
||||||
*/
|
*/
|
||||||
this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) {
|
this.indexPDF = async function (filePath, itemID, allPages) {
|
||||||
var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
|
var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
|
||||||
if (maxPages == 0) {
|
if (maxPages == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
var item = await Zotero.Items.getAsync(itemID);
|
||||||
var item = yield Zotero.Items.getAsync(itemID);
|
|
||||||
var linkMode = item.attachmentLinkMode;
|
var linkMode = item.attachmentLinkMode;
|
||||||
// If file is stored outside of Zotero, create a directory for the item
|
// If file is stored outside of Zotero, create a directory for the item
|
||||||
// in the storage directory and save the cache file there
|
// in the storage directory and save the cache file there
|
||||||
if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
|
if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
|
||||||
var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item);
|
var parentDirPath = await Zotero.Attachments.createDirectoryForItem(item);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
var parentDirPath = OS.Path.dirname(filePath);
|
var parentDirPath = OS.Path.dirname(filePath);
|
||||||
}
|
}
|
||||||
var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile);
|
var cacheFilePath = OS.Path.join(parentDirPath, this.fulltextCacheFile);
|
||||||
var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile);
|
|
||||||
|
|
||||||
|
|
||||||
var args = [filePath, infoFilePath];
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
yield Zotero.Utilities.Internal.exec(_pdfInfo, args);
|
var {
|
||||||
var totalPages = yield getTotalPagesFromFile(itemID);
|
text,
|
||||||
|
extractedPages,
|
||||||
|
totalPages
|
||||||
|
} = await Zotero.PDFWorker.getFullText(itemID, allPages ? null : maxPages);
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.debug("Error running " + _pdfInfo.path, 1);
|
|
||||||
Zotero.logError(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
var {exec, args} = this.getPDFConverterExecAndArgs();
|
|
||||||
// Keep in sync with Item::attachmentText
|
|
||||||
args.push('-nopgbrk');
|
|
||||||
|
|
||||||
if (allPages) {
|
|
||||||
if (totalPages) {
|
|
||||||
var indexedPages = totalPages;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
args.push('-l', maxPages);
|
|
||||||
var indexedPages = Math.min(maxPages, totalPages);
|
|
||||||
}
|
|
||||||
args.push(filePath, cacheFilePath);
|
|
||||||
|
|
||||||
try {
|
|
||||||
yield Zotero.Utilities.Internal.exec(exec, args);
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
Zotero.debug("Error running " + exec.path, 1);
|
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (!text || !extractedPages) {
|
||||||
if (!(yield OS.File.exists(cacheFilePath))) {
|
|
||||||
let fileName = OS.Path.basename(filePath);
|
|
||||||
let msg = fileName + " was not indexed";
|
|
||||||
if (!fileName.match(/^[\u0000-\u007F]+$/)) {
|
|
||||||
msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation";
|
|
||||||
}
|
|
||||||
Zotero.debug(msg, 2);
|
|
||||||
Components.utils.reportError(msg);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
await Zotero.File.putContentsAsync(cacheFilePath, text);
|
||||||
var text = Zotero.File.getContentsAsync(cacheFilePath);
|
var stats = { indexedPages: extractedPages, totalPages };
|
||||||
var stats = { indexedPages, totalPages };
|
await indexString(text, itemID, stats);
|
||||||
yield indexString(text, itemID, stats);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
});
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1213,33 +1173,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the number of pages from the PDF info cache file
|
|
||||||
*
|
|
||||||
* @private
|
|
||||||
* @return {Promise}
|
|
||||||
*/
|
|
||||||
var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) {
|
|
||||||
var file = OS.Path.join(
|
|
||||||
Zotero.Attachments.getStorageDirectoryByID(itemID).path,
|
|
||||||
Zotero.Fulltext.pdfInfoCacheFile
|
|
||||||
);
|
|
||||||
if (!(yield OS.File.exists(file))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
var contents = yield Zotero.File.getContentsAsync(file);
|
|
||||||
try {
|
|
||||||
// Parse pdfinfo output
|
|
||||||
var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1];
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
Zotero.debug(e);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return pages;
|
|
||||||
});
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return {Promise}
|
* @return {Promise}
|
||||||
*/
|
*/
|
||||||
|
@ -1261,7 +1194,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
case 'application/pdf':
|
case 'application/pdf':
|
||||||
var file = OS.Path.join(
|
var file = OS.Path.join(
|
||||||
Zotero.Attachments.getStorageDirectory(item).path,
|
Zotero.Attachments.getStorageDirectory(item).path,
|
||||||
this.pdfConverterCacheFile
|
this.fulltextCacheFile
|
||||||
);
|
);
|
||||||
if (!(yield OS.File.exists(file))) {
|
if (!(yield OS.File.exists(file))) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -1412,7 +1345,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
|
|
||||||
this.getItemCacheFile = function (item) {
|
this.getItemCacheFile = function (item) {
|
||||||
var cacheFile = Zotero.Attachments.getStorageDirectory(item);
|
var cacheFile = Zotero.Attachments.getStorageDirectory(item);
|
||||||
cacheFile.append(this.pdfConverterCacheFile);
|
cacheFile.append(this.fulltextCacheFile);
|
||||||
return cacheFile;
|
return cacheFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,8 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js';
|
const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js';
|
||||||
const CMAPS_URL = 'resource://zotero/pdf-reader/cmaps/';
|
const CMAPS_URL = 'chrome://zotero/content/xpcom/pdfWorker/cmaps/';
|
||||||
|
const STANDARD_FONTS_URL = 'chrome://zotero/content/xpcom/pdfWorker/standard_fonts/';
|
||||||
const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html';
|
const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html';
|
||||||
|
|
||||||
class PDFWorker {
|
class PDFWorker {
|
||||||
|
@ -55,8 +56,8 @@ class PDFWorker {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this._processingQueue = false;
|
this._processingQueue = false;
|
||||||
this._worker.terminate();
|
// this._worker.terminate();
|
||||||
this._worker = null;
|
// this._worker = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async _enqueue(fn, isPriority) {
|
async _enqueue(fn, isPriority) {
|
||||||
|
@ -114,6 +115,20 @@ class PDFWorker {
|
||||||
Zotero.debug('Failed to fetch CMap data:');
|
Zotero.debug('Failed to fetch CMap data:');
|
||||||
Zotero.debug(e);
|
Zotero.debug(e);
|
||||||
}
|
}
|
||||||
|
try {
|
||||||
|
if (message.action === 'FetchStandardFontData') {
|
||||||
|
let response = await Zotero.HTTP.request(
|
||||||
|
'GET',
|
||||||
|
STANDARD_FONTS_URL + message.data,
|
||||||
|
{ responseType: 'arraybuffer' }
|
||||||
|
);
|
||||||
|
respData = new Uint8Array(response.response);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
Zotero.debug('Failed to fetch standard font data:');
|
||||||
|
Zotero.debug(e);
|
||||||
|
}
|
||||||
this._worker.postMessage({ responseID: event.data.id, data: respData });
|
this._worker.postMessage({ responseID: event.data.id, data: respData });
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -578,6 +593,97 @@ class PDFWorker {
|
||||||
Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||||
}, isPriority);
|
}, isPriority);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get fulltext
|
||||||
|
*
|
||||||
|
* @param {Integer} itemID Attachment item id
|
||||||
|
* @param {Integer|null} maxPages Number of pages to extract, or null to extract all pages
|
||||||
|
* @param {Boolean} [isPriority]
|
||||||
|
* @param {String} [password]
|
||||||
|
* @returns {Promise}
|
||||||
|
*/
|
||||||
|
async getFullText(itemID, maxPages, isPriority, password) {
|
||||||
|
return this._enqueue(async () => {
|
||||||
|
let attachment = await Zotero.Items.getAsync(itemID);
|
||||||
|
|
||||||
|
Zotero.debug(`Getting fulltext content from item ${attachment.libraryKey}`);
|
||||||
|
let t = new Date();
|
||||||
|
|
||||||
|
if (!attachment.isPDFAttachment()) {
|
||||||
|
throw new Error('Item must be a PDF attachment');
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = await attachment.getFilePathAsync();
|
||||||
|
let buf = await OS.File.read(path, {});
|
||||||
|
buf = new Uint8Array(buf).buffer;
|
||||||
|
|
||||||
|
try {
|
||||||
|
var result = await this._query('getFulltext', {
|
||||||
|
buf, maxPages, password
|
||||||
|
}, [buf]);
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
let error = new Error(`Worker 'getFullText' failed: ${JSON.stringify({ error: e.message })}`);
|
||||||
|
try {
|
||||||
|
error.name = JSON.parse(e.message).name;
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
Zotero.logError(e);
|
||||||
|
}
|
||||||
|
Zotero.logError(error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
|
||||||
|
Zotero.debug(`Extracted full text for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}, isPriority);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get data for recognizer-server
|
||||||
|
*
|
||||||
|
* @param {Integer} itemID Attachment item id
|
||||||
|
* @param {Boolean} [isPriority]
|
||||||
|
* @param {String} [password]
|
||||||
|
* @returns {Promise}
|
||||||
|
*/
|
||||||
|
async getRecognizerData(itemID, isPriority, password) {
|
||||||
|
return this._enqueue(async () => {
|
||||||
|
let attachment = await Zotero.Items.getAsync(itemID);
|
||||||
|
|
||||||
|
Zotero.debug(`Getting PDF recognizer data from item ${attachment.libraryKey}`);
|
||||||
|
let t = new Date();
|
||||||
|
|
||||||
|
if (!attachment.isPDFAttachment()) {
|
||||||
|
throw new Error('Item must be a PDF attachment');
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = await attachment.getFilePathAsync();
|
||||||
|
let buf = await OS.File.read(path, {});
|
||||||
|
buf = new Uint8Array(buf).buffer;
|
||||||
|
|
||||||
|
try {
|
||||||
|
var result = await this._query('getRecognizerData', { buf, password }, [buf]);
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`);
|
||||||
|
try {
|
||||||
|
error.name = JSON.parse(e.message).name;
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
Zotero.logError(e);
|
||||||
|
}
|
||||||
|
Zotero.logError(error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
|
||||||
|
Zotero.debug(`Extracted PDF recognizer data for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}, isPriority);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Zotero.PDFWorker = new PDFWorker();
|
Zotero.PDFWorker = new PDFWorker();
|
||||||
|
|
|
@ -223,7 +223,7 @@ Zotero.RecognizePDF = new function () {
|
||||||
}
|
}
|
||||||
|
|
||||||
var version = Zotero.version;
|
var version = Zotero.version;
|
||||||
var json = await extractJSON(filePath, MAX_PAGES);
|
var json = await extractJSON(attachment.id);
|
||||||
var metadata = item.toJSON();
|
var metadata = item.toJSON();
|
||||||
|
|
||||||
var data = { description, version, json, metadata };
|
var data = { description, version, json, metadata };
|
||||||
|
@ -323,39 +323,16 @@ Zotero.RecognizePDF = new function () {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get json from a PDF
|
* Get recognizer data from PDF file
|
||||||
* @param {String} filePath PDF file path
|
* @param {Number} itemID Attachment item id
|
||||||
* @param {Number} pages Number of pages to extract
|
|
||||||
* @return {Promise}
|
* @return {Promise}
|
||||||
*/
|
*/
|
||||||
async function extractJSON(filePath, pages) {
|
async function extractJSON(itemID) {
|
||||||
let cacheFile = Zotero.getTempDirectory();
|
|
||||||
cacheFile.append("recognizePDFcache.txt");
|
|
||||||
if (cacheFile.exists()) {
|
|
||||||
cacheFile.remove(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
|
|
||||||
args.push('-json', '-l', pages, filePath, cacheFile.path);
|
|
||||||
|
|
||||||
Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await Zotero.Utilities.Internal.exec(exec, args);
|
return await Zotero.PDFWorker.getRecognizerData(itemID, true);
|
||||||
let content = await Zotero.File.getContentsAsync(cacheFile.path);
|
|
||||||
Zotero.debug("RecognizePDF: Extracted JSON:");
|
|
||||||
Zotero.debug(content);
|
|
||||||
cacheFile.remove(false);
|
|
||||||
return JSON.parse(content);
|
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
try {
|
|
||||||
cacheFile.remove(false);
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
Zotero.logError(e);
|
|
||||||
}
|
|
||||||
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
|
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -416,7 +393,7 @@ Zotero.RecognizePDF = new function () {
|
||||||
|
|
||||||
if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
|
if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
|
||||||
|
|
||||||
let json = await extractJSON(filePath, MAX_PAGES);
|
let json = await extractJSON(item.id);
|
||||||
json.fileName = OS.Path.basename(filePath);
|
json.fileName = OS.Path.basename(filePath);
|
||||||
|
|
||||||
let containingTextPages = 0;
|
let containingTextPages = 0;
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 4456d0eeacb9ef8a276adba61410b2c4620bc00d
|
Subproject commit caa9f27a000e3a17fb59f86ca2736f035f296267
|
|
@ -35,7 +35,8 @@ async function getPDFWorker(signatures) {
|
||||||
catch (e) {
|
catch (e) {
|
||||||
await exec('npm ci', { cwd: modulePath });
|
await exec('npm ci', { cwd: modulePath });
|
||||||
await exec('npm run build', { cwd: modulePath });
|
await exec('npm run build', { cwd: modulePath });
|
||||||
await fs.copy(path.join(modulePath, 'build', 'worker.js'), path.join(targetDir, 'worker.js'));
|
// TODO: Don't copy 'cmaps' and 'standard_fonts' directories once pdf-reader is updated
|
||||||
|
await fs.copy(path.join(modulePath, 'build'), targetDir);
|
||||||
}
|
}
|
||||||
signatures['pdf-worker'] = { hash };
|
signatures['pdf-worker'] = { hash };
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue