Replace pdftotext and pdfinfo with pdf-worker
This commit is contained in:
parent
3a0731a024
commit
bd9a40562f
6 changed files with 142 additions and 125 deletions
2
.gitmodules
vendored
2
.gitmodules
vendored
|
@ -36,7 +36,7 @@
|
|||
[submodule "pdf-worker"]
|
||||
path = pdf-worker
|
||||
url = https://github.com/zotero/pdf-worker.git
|
||||
branch = master
|
||||
branch = worker2
|
||||
[submodule "note-editor"]
|
||||
path = note-editor
|
||||
url = https://github.com/zotero/note-editor.git
|
||||
|
|
|
@ -24,9 +24,8 @@
|
|||
*/
|
||||
|
||||
Zotero.Fulltext = Zotero.FullText = new function(){
|
||||
this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
|
||||
this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
|
||||
|
||||
this.__defineGetter__("fulltextCacheFile", function () { return '.zotero-ft-cache'; });
|
||||
|
||||
this.INDEX_STATE_UNAVAILABLE = 0;
|
||||
this.INDEX_STATE_UNINDEXED = 1;
|
||||
this.INDEX_STATE_PARTIAL = 2;
|
||||
|
@ -354,89 +353,50 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
|||
);
|
||||
});
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
|
||||
* and .zotero-ft-cache, and pass the text file to indexString()
|
||||
* Index a PDF file and store its full-text content in a cache file
|
||||
*
|
||||
* @param {nsIFile} file
|
||||
* @param {String} filePath - Path to the PDF file
|
||||
* @param {Number} itemID
|
||||
* @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
|
||||
* @return {Promise}
|
||||
*/
|
||||
this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) {
|
||||
this.indexPDF = async function (filePath, itemID, allPages) {
|
||||
var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
|
||||
if (maxPages == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var item = yield Zotero.Items.getAsync(itemID);
|
||||
var item = await Zotero.Items.getAsync(itemID);
|
||||
var linkMode = item.attachmentLinkMode;
|
||||
// If file is stored outside of Zotero, create a directory for the item
|
||||
// in the storage directory and save the cache file there
|
||||
if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
|
||||
var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item);
|
||||
var parentDirPath = await Zotero.Attachments.createDirectoryForItem(item);
|
||||
}
|
||||
else {
|
||||
var parentDirPath = OS.Path.dirname(filePath);
|
||||
}
|
||||
var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile);
|
||||
var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile);
|
||||
|
||||
|
||||
var args = [filePath, infoFilePath];
|
||||
|
||||
var cacheFilePath = OS.Path.join(parentDirPath, this.fulltextCacheFile);
|
||||
try {
|
||||
yield Zotero.Utilities.Internal.exec(_pdfInfo, args);
|
||||
var totalPages = yield getTotalPagesFromFile(itemID);
|
||||
var {
|
||||
text,
|
||||
extractedPages,
|
||||
totalPages
|
||||
} = await Zotero.PDFWorker.getFullText(itemID, allPages ? null : maxPages);
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug("Error running " + _pdfInfo.path, 1);
|
||||
Zotero.logError(e);
|
||||
}
|
||||
|
||||
|
||||
var {exec, args} = this.getPDFConverterExecAndArgs();
|
||||
// Keep in sync with Item::attachmentText
|
||||
args.push('-nopgbrk');
|
||||
|
||||
if (allPages) {
|
||||
if (totalPages) {
|
||||
var indexedPages = totalPages;
|
||||
}
|
||||
}
|
||||
else {
|
||||
args.push('-l', maxPages);
|
||||
var indexedPages = Math.min(maxPages, totalPages);
|
||||
}
|
||||
args.push(filePath, cacheFilePath);
|
||||
|
||||
try {
|
||||
yield Zotero.Utilities.Internal.exec(exec, args);
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug("Error running " + exec.path, 1);
|
||||
Zotero.logError(e);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!(yield OS.File.exists(cacheFilePath))) {
|
||||
let fileName = OS.Path.basename(filePath);
|
||||
let msg = fileName + " was not indexed";
|
||||
if (!fileName.match(/^[\u0000-\u007F]+$/)) {
|
||||
msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation";
|
||||
}
|
||||
Zotero.debug(msg, 2);
|
||||
Components.utils.reportError(msg);
|
||||
if (!text || !extractedPages) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var text = Zotero.File.getContentsAsync(cacheFilePath);
|
||||
var stats = { indexedPages, totalPages };
|
||||
yield indexString(text, itemID, stats);
|
||||
|
||||
await Zotero.File.putContentsAsync(cacheFilePath, text);
|
||||
var stats = { indexedPages: extractedPages, totalPages };
|
||||
await indexString(text, itemID, stats);
|
||||
return true;
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
|
@ -1211,35 +1171,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
|||
+ "FROM fulltextItems WHERE itemID=?";
|
||||
return Zotero.DB.rowQueryAsync(sql, itemID);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the number of pages from the PDF info cache file
|
||||
*
|
||||
* @private
|
||||
* @return {Promise}
|
||||
*/
|
||||
var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) {
|
||||
var file = OS.Path.join(
|
||||
Zotero.Attachments.getStorageDirectoryByID(itemID).path,
|
||||
Zotero.Fulltext.pdfInfoCacheFile
|
||||
);
|
||||
if (!(yield OS.File.exists(file))) {
|
||||
return false;
|
||||
}
|
||||
var contents = yield Zotero.File.getContentsAsync(file);
|
||||
try {
|
||||
// Parse pdfinfo output
|
||||
var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1];
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug(e);
|
||||
return false;
|
||||
}
|
||||
return pages;
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @return {Promise}
|
||||
*/
|
||||
|
@ -1261,7 +1194,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
|||
case 'application/pdf':
|
||||
var file = OS.Path.join(
|
||||
Zotero.Attachments.getStorageDirectory(item).path,
|
||||
this.pdfConverterCacheFile
|
||||
this.fulltextCacheFile
|
||||
);
|
||||
if (!(yield OS.File.exists(file))) {
|
||||
return false;
|
||||
|
@ -1412,7 +1345,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
|||
|
||||
this.getItemCacheFile = function (item) {
|
||||
var cacheFile = Zotero.Attachments.getStorageDirectory(item);
|
||||
cacheFile.append(this.pdfConverterCacheFile);
|
||||
cacheFile.append(this.fulltextCacheFile);
|
||||
return cacheFile;
|
||||
}
|
||||
|
||||
|
|
|
@ -24,7 +24,8 @@
|
|||
*/
|
||||
|
||||
const WORKER_URL = 'chrome://zotero/content/xpcom/pdfWorker/worker.js';
|
||||
const CMAPS_URL = 'resource://zotero/pdf-reader/cmaps/';
|
||||
const CMAPS_URL = 'chrome://zotero/content/xpcom/pdfWorker/cmaps/';
|
||||
const STANDARD_FONTS_URL = 'chrome://zotero/content/xpcom/pdfWorker/standard_fonts/';
|
||||
const RENDERER_URL = 'resource://zotero/pdf-renderer/renderer.html';
|
||||
|
||||
class PDFWorker {
|
||||
|
@ -55,8 +56,8 @@ class PDFWorker {
|
|||
}
|
||||
}
|
||||
this._processingQueue = false;
|
||||
this._worker.terminate();
|
||||
this._worker = null;
|
||||
// this._worker.terminate();
|
||||
// this._worker = null;
|
||||
}
|
||||
|
||||
async _enqueue(fn, isPriority) {
|
||||
|
@ -114,6 +115,20 @@ class PDFWorker {
|
|||
Zotero.debug('Failed to fetch CMap data:');
|
||||
Zotero.debug(e);
|
||||
}
|
||||
try {
|
||||
if (message.action === 'FetchStandardFontData') {
|
||||
let response = await Zotero.HTTP.request(
|
||||
'GET',
|
||||
STANDARD_FONTS_URL + message.data,
|
||||
{ responseType: 'arraybuffer' }
|
||||
);
|
||||
respData = new Uint8Array(response.response);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug('Failed to fetch standard font data:');
|
||||
Zotero.debug(e);
|
||||
}
|
||||
this._worker.postMessage({ responseID: event.data.id, data: respData });
|
||||
}
|
||||
});
|
||||
|
@ -578,6 +593,97 @@ class PDFWorker {
|
|||
Zotero.debug(`Rotated pages for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||
}, isPriority);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the full-text content of a PDF attachment
|
||||
*
|
||||
* @param {Integer} itemID Attachment item id
|
||||
* @param {Integer|null} maxPages Number of pages to extract, or null to extract all pages
|
||||
* @param {Boolean} [isPriority]
|
||||
* @param {String} [password]
|
||||
* @returns {Promise}
|
||||
*/
|
||||
async getFullText(itemID, maxPages, isPriority, password) {
|
||||
return this._enqueue(async () => {
|
||||
let attachment = await Zotero.Items.getAsync(itemID);
|
||||
|
||||
Zotero.debug(`Getting fulltext content from item ${attachment.libraryKey}`);
|
||||
let t = new Date();
|
||||
|
||||
if (!attachment.isPDFAttachment()) {
|
||||
throw new Error('Item must be a PDF attachment');
|
||||
}
|
||||
|
||||
let path = await attachment.getFilePathAsync();
|
||||
let buf = await OS.File.read(path, {});
|
||||
buf = new Uint8Array(buf).buffer;
|
||||
|
||||
try {
|
||||
var result = await this._query('getFulltext', {
|
||||
buf, maxPages, password
|
||||
}, [buf]);
|
||||
}
|
||||
catch (e) {
|
||||
let error = new Error(`Worker 'getFullText' failed: ${JSON.stringify({ error: e.message })}`);
|
||||
try {
|
||||
error.name = JSON.parse(e.message).name;
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
}
|
||||
Zotero.logError(error);
|
||||
throw error;
|
||||
}
|
||||
|
||||
Zotero.debug(`Extracted full text for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||
|
||||
return result;
|
||||
}, isPriority);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get data for recognizer-server
|
||||
*
|
||||
* @param {Integer} itemID Attachment item id
|
||||
* @param {Boolean} [isPriority]
|
||||
* @param {String} [password]
|
||||
* @returns {Promise}
|
||||
*/
|
||||
async getRecognizerData(itemID, isPriority, password) {
|
||||
return this._enqueue(async () => {
|
||||
let attachment = await Zotero.Items.getAsync(itemID);
|
||||
|
||||
Zotero.debug(`Getting PDF recognizer data from item ${attachment.libraryKey}`);
|
||||
let t = new Date();
|
||||
|
||||
if (!attachment.isPDFAttachment()) {
|
||||
throw new Error('Item must be a PDF attachment');
|
||||
}
|
||||
|
||||
let path = await attachment.getFilePathAsync();
|
||||
let buf = await OS.File.read(path, {});
|
||||
buf = new Uint8Array(buf).buffer;
|
||||
|
||||
try {
|
||||
var result = await this._query('getRecognizerData', { buf, password }, [buf]);
|
||||
}
|
||||
catch (e) {
|
||||
let error = new Error(`Worker 'getRecognizerData' failed: ${JSON.stringify({ error: e.message })}`);
|
||||
try {
|
||||
error.name = JSON.parse(e.message).name;
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
}
|
||||
Zotero.logError(error);
|
||||
throw error;
|
||||
}
|
||||
|
||||
Zotero.debug(`Extracted PDF recognizer data for item ${attachment.libraryKey} in ${new Date() - t} ms`);
|
||||
|
||||
return result;
|
||||
}, isPriority);
|
||||
}
|
||||
}
|
||||
|
||||
Zotero.PDFWorker = new PDFWorker();
|
||||
|
|
|
@ -223,7 +223,7 @@ Zotero.RecognizePDF = new function () {
|
|||
}
|
||||
|
||||
var version = Zotero.version;
|
||||
var json = await extractJSON(filePath, MAX_PAGES);
|
||||
var json = await extractJSON(attachment.id);
|
||||
var metadata = item.toJSON();
|
||||
|
||||
var data = { description, version, json, metadata };
|
||||
|
@ -323,39 +323,16 @@ Zotero.RecognizePDF = new function () {
|
|||
}
|
||||
|
||||
/**
|
||||
* Get json from a PDF
|
||||
* @param {String} filePath PDF file path
|
||||
* @param {Number} pages Number of pages to extract
|
||||
* Get recognizer data from PDF file
|
||||
* @param {Number} itemID Attachment item id
|
||||
* @return {Promise}
|
||||
*/
|
||||
async function extractJSON(filePath, pages) {
|
||||
let cacheFile = Zotero.getTempDirectory();
|
||||
cacheFile.append("recognizePDFcache.txt");
|
||||
if (cacheFile.exists()) {
|
||||
cacheFile.remove(false);
|
||||
}
|
||||
|
||||
let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
|
||||
args.push('-json', '-l', pages, filePath, cacheFile.path);
|
||||
|
||||
Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
|
||||
|
||||
async function extractJSON(itemID) {
|
||||
try {
|
||||
await Zotero.Utilities.Internal.exec(exec, args);
|
||||
let content = await Zotero.File.getContentsAsync(cacheFile.path);
|
||||
Zotero.debug("RecognizePDF: Extracted JSON:");
|
||||
Zotero.debug(content);
|
||||
cacheFile.remove(false);
|
||||
return JSON.parse(content);
|
||||
return await Zotero.PDFWorker.getRecognizerData(itemID, true);
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
try {
|
||||
cacheFile.remove(false);
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
}
|
||||
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
|
||||
}
|
||||
}
|
||||
|
@ -416,7 +393,7 @@ Zotero.RecognizePDF = new function () {
|
|||
|
||||
if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
|
||||
|
||||
let json = await extractJSON(filePath, MAX_PAGES);
|
||||
let json = await extractJSON(item.id);
|
||||
json.fileName = OS.Path.basename(filePath);
|
||||
|
||||
let containingTextPages = 0;
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 4456d0eeacb9ef8a276adba61410b2c4620bc00d
|
||||
Subproject commit caa9f27a000e3a17fb59f86ca2736f035f296267
|
|
@ -35,7 +35,8 @@ async function getPDFWorker(signatures) {
|
|||
catch (e) {
|
||||
await exec('npm ci', { cwd: modulePath });
|
||||
await exec('npm run build', { cwd: modulePath });
|
||||
await fs.copy(path.join(modulePath, 'build', 'worker.js'), path.join(targetDir, 'worker.js'));
|
||||
// TODO: Don't copy 'cmaps' and 'standard_fonts' directories once pdf-reader is updated
|
||||
await fs.copy(path.join(modulePath, 'build'), targetDir);
|
||||
}
|
||||
signatures['pdf-worker'] = { hash };
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue