// zotero/chrome/content/zotero/xpcom/recognizePDF.js
// 2018-10-05 01:56:46 -04:00
//
// 572 lines
// 15 KiB
// JavaScript

/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2018 Center for History and New Media
George Mason University, Fairfax, Virginia, USA
http://zotero.org
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/
Zotero.RecognizePDF = new function () {
// Delay (ms) between connectivity rechecks while the browser is offline
const OFFLINE_RECHECK_DELAY = 60 * 1000;
// Page count passed to extractJSON() when reading a PDF
const MAX_PAGES = 5;
// Window (ms) after recognition during which canUnrecognize() can still succeed
const UNRECOGNIZE_TIMEOUT = 86400 * 1000;

// Parent items created by recognition, mapped to the data needed to undo it
const _newItems = new WeakMap();

// Queue state: pending attachment IDs, whether the processing loop is running,
// and the ID of the attachment currently being processed
let _queue = [];
let _queueProcessing = false;
let _processingItemID = null;

const _progressQueue = Zotero.ProgressQueues.create({
	id: 'recognize',
	title: 'recognizePDF.title',
	columns: [
		'recognizePDF.pdfName.label',
		'recognizePDF.itemName.label'
	]
});

// Drop every pending item when the progress queue emits 'cancel'
_progressQueue.addListener('cancel', () => {
	_queue = [];
});
/**
 * Triggers queue processing and returns when all items in the queue are processed
 *
 * Only one processing loop runs at a time; a call made while another loop is
 * active returns immediately. While the browser is offline and items are
 * pending, connectivity is rechecked every OFFLINE_RECHECK_DELAY milliseconds.
 * Per-item failures are logged and shown in the progress queue row and do not
 * stop the loop.
 *
 * @return {Promise}
 */
async function _processQueue() {
	await Zotero.Schema.schemaUpdatePromise;
	if (_queueProcessing) return;
	_queueProcessing = true;
	try {
		// Stop as soon as nothing is pending, so that an empty (or cancelled)
		// queue doesn't keep polling for connectivity while offline
		while (_queue.length) {
			// While all current progress queue usages are related with
			// online APIs, check internet connectivity here
			if (Zotero.HTTP.browserIsOffline()) {
				await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
				continue;
			}
			// New IDs are added at the front of the queue, so taking from the
			// end processes items in the order they were added
			let itemID = _queue.pop();
			if (!itemID) break;
			_processingItemID = itemID;
			_progressQueue.updateRow(itemID, Zotero.ProgressQueue.ROW_PROCESSING, Zotero.getString('general.processing'));
			try {
				let item = await Zotero.Items.getAsync(itemID);
				if (!item) {
					throw new Error('Item ' + itemID + ' not found');
				}
				await _processItem(item);
				_progressQueue.updateRow(itemID, Zotero.ProgressQueue.ROW_SUCCEEDED, item.getField('title'));
			}
			catch (e) {
				Zotero.logError(e);
				// Only alert messages are shown as-is; anything else gets a generic error
				_progressQueue.updateRow(
					itemID,
					Zotero.ProgressQueue.ROW_FAILED,
					e instanceof Zotero.Exception.Alert
						? e.message
						: Zotero.getString('general.error')
				);
			}
		}
	}
	finally {
		// Always release the processing flag, even if something outside the
		// per-item handler throws -- otherwise no later call could ever start
		// a new processing loop
		_queueProcessing = false;
		_processingItemID = null;
	}
}
/**
 * Adds items to the queue and triggers processing
 *
 * Items that are currently being processed, are already queued, or fail
 * canRecognize() are skipped. Processing runs in the background; this
 * function does not wait for it.
 *
 * @param {Zotero.Item[]} items
 */
this.recognizeItems = function (items) {
	for (let item of items) {
		let alreadyTracked = _processingItemID === item.id || _queue.includes(item.id);
		if (alreadyTracked || !this.canRecognize(item)) {
			continue;
		}
		_queue.unshift(item.id);
		_progressQueue.addRow(item);
	}
	// Nothing awaits the processing promise, so log a failure here rather
	// than leaving an unhandled rejection
	_processQueue().catch(e => Zotero.logError(e));
};
/**
 * Checks whether a given PDF could theoretically be recognized
 *
 * An item qualifies when its attachment content type is 'application/pdf'
 * and it is a top-level item.
 *
 * @param {Zotero.Item} item
 * @return {Boolean} True if the PDF can be recognized, false if it cannot be
 */
this.canRecognize = function (item) {
	// Coerce to a strict boolean so the result matches the documented return
	// type even when the content type is undefined or empty
	return item.attachmentContentType === 'application/pdf'
		&& !!item.isTopLevelItem();
};
/**
 * Queues the PDF file attachments among the given items for recognition and
 * opens the progress dialog, provided the 'autoRecognizeFiles' pref is set
 *
 * Does nothing if the pref is off or none of the items is a PDF file attachment.
 *
 * @param {Zotero.Item[]} items - May contain empty entries, which are ignored
 */
this.autoRecognizeItems = function (items) {
	if (!Zotero.Prefs.get('autoRecognizeFiles')) return;
	const isPDFAttachment = item => item
		&& item.isFileAttachment()
		&& item.attachmentContentType == 'application/pdf';
	const candidates = items.filter(item => isPDFAttachment(item));
	if (candidates.length) {
		this.recognizeItems(candidates);
		Zotero.ProgressQueues.get('recognize').getDialog().open();
	}
};
/**
 * Checks whether the recognition that created a parent item can be undone
 *
 * When the answer is no, the undo record kept for the item is discarded.
 *
 * @param {Zotero.Item} item - Parent item
 * @return {Boolean}
 */
this.canUnrecognize = function (item) {
	const record = _newItems.get(item) || {};
	const recognizedModified = record.dateModified;
	const forget = () => {
		_newItems.delete(item);
		return false;
	};
	
	// Item must have been recognized recently...
	if (!recognizedModified) return forget();
	if (Zotero.Date.sqlToDate(recognizedModified, true) < new Date() - UNRECOGNIZE_TIMEOUT) {
		return forget();
	}
	// ...must not have been modified since it was created...
	if (item.dateModified != recognizedModified) return forget();
	// ...and must have only one attachment and no other children
	if (item.numAttachments(true) != 1) return forget();
	if (item.numChildren(true) != 1) return forget();
	
	// Child attachment must not be in the trash and must be a PDF
	const children = Zotero.Items.get(item.getAttachments());
	if (!children.length) return forget();
	if (children[0].attachmentContentType != 'application/pdf') return forget();
	
	return true;
};
/**
 * Undoes a recognition: restores the attachment's original filename and title,
 * moves the attachment back to the top level (into the parent's collections)
 * and erases the parent item
 *
 * @param {Zotero.Item} item - Parent item created by recognition
 * @return {Promise}
 * @throws {Error} If there is no undo record for the item
 */
this.unrecognize = async function (item) {
	var undoData = _newItems.get(item);
	if (!undoData) {
		// Fail with a clear message instead of a TypeError from destructuring undefined
		throw new Error("No recognition data available for item");
	}
	var { originalTitle, originalFilename } = undoData;
	var attachment = Zotero.Items.get(item.getAttachments()[0]);
	
	// Restoring the filename is best-effort: a failure is logged and the
	// attachment is still detached below
	try {
		let currentFilename = attachment.attachmentFilename;
		if (currentFilename != originalFilename) {
			let renamed = await attachment.renameAttachmentFile(originalFilename);
			// _processItem() in this file treats any result other than `true`
			// as a failed rename; apply the same rule here so that a truthy
			// failure code can't be mistaken for success
			if (renamed === true) {
				attachment.setField('title', originalTitle);
			}
		}
	}
	catch (e) {
		Zotero.logError(e);
	}
	
	return Zotero.DB.executeTransaction(async function () {
		let collections = item.getCollections();
		attachment.parentItemID = null;
		attachment.setCollections(collections);
		await attachment.save();
		await item.erase();
	});
};
/**
 * Reports recognized metadata to the recognizer server
 *
 * POSTs the description, the Zotero version, the JSON extracted from the
 * item's first attachment and the item's own JSON to the 'report' endpoint.
 *
 * @param {Zotero.Item} item - Parent item whose first attachment is the PDF
 * @param {String} description
 * @return {Promise} Result of the HTTP request
 * @throws {Error} If the attachment file can't be found
 */
this.report = async function (item, description) {
	const attachment = Zotero.Items.get(item.getAttachments()[0]);
	const filePath = attachment.getFilePath();
	const fileExists = filePath && await OS.File.exists(filePath);
	if (!fileExists) {
		throw new Error("File not found when reporting metadata");
	}
	
	const payload = {
		description,
		version: Zotero.version,
		json: await extractJSON(filePath, MAX_PAGES),
		metadata: item.toJSON()
	};
	
	return Zotero.HTTP.request(
		"POST",
		ZOTERO_CONFIG.RECOGNIZE_URL + 'report',
		{
			successCodes: [200, 204],
			headers: {
				'Content-Type': 'application/json'
			},
			body: JSON.stringify(payload)
		}
	);
};
/**
 * Recognizes a PDF attachment and makes it a child of the newly created item
 *
 * The new parent is added to the attachment's collections. If the
 * 'autoRenameFiles' pref is set, the attachment file and title are renamed
 * after the parent's metadata. The data needed to undo the recognition is
 * remembered for the parent item.
 *
 * @param {Zotero.Item} attachment
 * @return {Promise<Zotero.Item>} The new parent item
 * @throws {Zotero.Exception.Alert} If no metadata could be found
 * @throws {Error} If the attachment already has a parent or renaming fails
 */
async function _processItem(attachment) {
	// Make sure the attachment still doesn't have a parent
	if (attachment.parentItemID) {
		throw new Error('Already has parent');
	}
	
	// If only the PDF was selected, select the parent when we're done
	let pane = Zotero.getActiveZoteroPane();
	let selectParent = false;
	if (pane) {
		const selection = pane.getSelectedItems();
		selectParent = selection.length == 1 && selection[0] == attachment;
	}
	
	const parentItem = await _recognize(attachment);
	if (!parentItem) {
		throw new Zotero.Exception.Alert("recognizePDF.noMatches");
	}
	
	// Put new item in same collections as the old one
	const collectionIDs = attachment.getCollections();
	await Zotero.DB.executeTransaction(async () => {
		if (collectionIDs.length) {
			for (const id of collectionIDs) {
				parentItem.addToCollection(id);
			}
			await parentItem.save();
		}
		// Put old item as a child of the new item
		attachment.parentID = parentItem.id;
		await attachment.save();
	});
	
	// Remember the pre-rename title and filename for the undo record
	const originalTitle = attachment.getField('title');
	const path = attachment.getFilePath();
	const originalFilename = OS.Path.basename(path);
	
	// Rename attachment file to match new metadata
	if (Zotero.Prefs.get('autoRenameFiles')) {
		const ext = Zotero.File.getExtension(path);
		const baseName = Zotero.Attachments.getFileBaseNameFromItem(parentItem);
		const newName = `${baseName}${ext ? '.' + ext : ''}`;
		const renamed = await attachment.renameAttachmentFile(newName, false, true);
		if (renamed !== true) {
			throw new Error("Error renaming " + path);
		}
		// Rename attachment title
		attachment.setField('title', newName);
		await attachment.saveTx();
	}
	
	// Selecting the parent is cosmetic, so a failure here is only logged
	try {
		pane = Zotero.getActiveZoteroPane();
		if (pane && selectParent) {
			await pane.selectItem(parentItem.id);
		}
	}
	catch (e) {
		Zotero.logError(e);
	}
	
	_newItems.set(parentItem, {
		originalTitle,
		originalFilename,
		dateModified: parentItem.dateModified
	});
	return parentItem;
}
/**
 * Get json from a PDF
 *
 * Runs the PDF converter with '-json' output into a cache file in the Zotero
 * temp directory, parses the result and removes the cache file again.
 *
 * @param {String} filePath PDF file path
 * @param {Number} pages Number of pages to extract
 * @return {Promise<Object>} Parsed converter output
 * @throws {Zotero.Exception.Alert} If conversion, reading or parsing fails
 */
async function extractJSON(filePath, pages) {
	let cacheFile = Zotero.getTempDirectory();
	cacheFile.append("recognizePDFcache.txt");
	// Remove anything left over from a previous run
	if (cacheFile.exists()) {
		cacheFile.remove(false);
	}
	let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
	args.push('-json', '-l', pages, filePath, cacheFile.path);
	Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
	try {
		await Zotero.Utilities.Internal.exec(exec, args);
		let content = await Zotero.File.getContentsAsync(cacheFile.path);
		Zotero.debug("RecognizePDF: Extracted JSON:");
		Zotero.debug(content);
		return JSON.parse(content);
	}
	catch (e) {
		Zotero.logError(e);
		throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
	}
	finally {
		// Clean up exactly once, whether or not extraction succeeded. Checking
		// exists() first avoids logging a spurious error when the converter
		// never produced the file; a cleanup failure is logged but must not
		// replace the result or the original error.
		try {
			if (cacheFile.exists()) {
				cacheFile.remove(false);
			}
		}
		catch (e) {
			Zotero.logError(e);
		}
	}
}
/**
 * Attach appropriate handlers to a Zotero.Translate instance and begin translation
 *
 * If the translator asks for a selection, the first candidate is chosen.
 *
 * @param {Zotero.Translate} translate
 * @param {Integer} libraryID - Passed through to translate()
 * @return {Promise<Zotero.Item>} First item returned by the translation
 * @throws {Error} If the translation returned no items
 */
async function _promiseTranslate(translate, libraryID) {
	const chooseFirst = (translateObj, candidates, done) => {
		// Leave after the first key; with no candidates `done` is never called
		for (const key in candidates) {
			done({ [key]: candidates[key] });
			return;
		}
	};
	translate.setHandler('select', chooseFirst);
	
	const results = await translate.translate({
		libraryID,
		saveAttachments: false
	});
	if (!results.length) {
		throw new Error('No items found');
	}
	return results[0];
}
/**
 * Sends extracted PDF JSON to the recognizer server
 *
 * The base URL is the 'api.url' pref if set, otherwise
 * ZOTERO_CONFIG.RECOGNIZE_URL. The request is made without an API key.
 *
 * @param {Object} json - Output of extractJSON()
 * @return {Promise<Object>} Parsed JSON response
 */
async function _query(json) {
	// TODO: Use main API URL for recognizer server
	//let uri = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.API_URL;
	const baseURL = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.RECOGNIZE_URL;
	const endpoint = (baseURL.endsWith('/') ? baseURL : baseURL + '/') + 'recognize';
	
	const apiClient = Zotero.Sync.Runner.getAPIClient();
	const response = await apiClient.makeRequest('POST', endpoint, {
		successCodes: [200],
		headers: {
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(json),
		noAPIKey: true
	});
	return JSON.parse(response.responseText);
}
/**
 * Retrieves metadata for a PDF and saves it as an item
 *
 * JSON extracted from the PDF is sent to the recognizer server. The
 * identifiers in the response are then tried in order -- arXiv ID, DOI,
 * ISBN -- and, if none of them produces an item, an item is built from the
 * bare metadata (title, authors, ...) in the response itself.
 *
 * @param {Zotero.Item} item - PDF attachment
 * @return {Promise<Zotero.Item|null>} The saved item, or null if the server
 *     returned nothing usable
 * @throws {Zotero.Exception.Alert} If the file is missing or no page has any content
 */
async function _recognize(item) {
	let filePath = await item.getFilePath();
	if (!filePath || !await OS.File.exists(filePath)) {
		throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
	}
	
	let json = await extractJSON(filePath, MAX_PAGES);
	// NOTE(review): page[2] is assumed to hold the page's extracted text
	// content -- confirm against the converter's JSON format
	if (!json.pages.some(page => page[2].length)) {
		throw new Zotero.Exception.Alert('recognizePDF.noOCR');
	}
	
	let libraryID = item.libraryID;
	let res = await _query(json);
	if (!res) return null;
	
	if (res.arxiv) {
		Zotero.debug('RecognizePDF: Getting metadata by arXiv');
		let newItem = await _recognizeByIdentifier({arXiv: res.arxiv}, res, libraryID);
		if (newItem) return newItem;
	}
	
	if (res.doi) {
		Zotero.debug('RecognizePDF: Getting metadata by DOI');
		let newItem = await _recognizeByIdentifier({DOI: res.doi}, res, libraryID);
		if (newItem) return newItem;
	}
	
	if (res.isbn) {
		Zotero.debug('RecognizePDF: Getting metadata by ISBN');
		let newItem = await _recognizeByISBN(res, libraryID);
		if (newItem) return newItem;
	}
	
	if (res.title) {
		return _createItemFromMetadata(res, libraryID);
	}
	
	return null;
}

/**
 * Fills in the abstract and language from the recognizer response, but only
 * where the item doesn't already have a value
 */
function _fillMissingFromResponse(newItem, res) {
	// Field values have to be read with getField(); they aren't exposed as
	// properties, so a property check would always overwrite existing values
	if (!newItem.getField('abstractNote') && res.abstract) {
		newItem.setField('abstractNote', res.abstract);
	}
	if (!newItem.getField('language') && res.language) {
		newItem.setField('language', res.language);
	}
}

/**
 * Looks up an identifier (e.g. {DOI: ...}) via search translators and saves
 * the resulting item; returns null if translation or saving fails
 */
async function _recognizeByIdentifier(identifier, res, libraryID) {
	let translate = new Zotero.Translate.Search();
	translate.setIdentifier(identifier);
	let translators = await translate.getTranslators();
	translate.setTranslator(translators);
	try {
		let newItem = await _promiseTranslate(translate, libraryID);
		_fillMissingFromResponse(newItem, res);
		// Wait for the save so the caller never receives a half-saved item
		// and a save failure is handled here instead of being unhandled
		await newItem.saveTx();
		return newItem;
	}
	catch (e) {
		// Not fatal: the caller falls back to the next identifier
		Zotero.debug('RecognizePDF: ' + e);
		return null;
	}
}

/**
 * Looks up res.isbn via search translators and saves a new item built from
 * the first result; returns null if nothing was found or an error occurred
 */
async function _recognizeByISBN(res, libraryID) {
	let translate = new Zotero.Translate.Search();
	translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
	try {
		let translatedItems = await translate.translate({
			libraryID: false,
			saveAttachments: false
		});
		Zotero.debug('RecognizePDF: Translated items:');
		Zotero.debug(translatedItems);
		if (!translatedItems.length) {
			return null;
		}
		let newItem = new Zotero.Item;
		newItem.libraryID = libraryID;
		// Convert tags to automatic. For other items this is done automatically in
		// translate.js, but for ISBNs we just get the data (libraryID=false) and do
		// the saving manually.
		translatedItems[0].tags = (translatedItems[0].tags || []).map(tag => {
			if (typeof tag == 'string') {
				return {
					tag,
					type: 1
				};
			}
			tag.type = 1;
			return tag;
		});
		newItem.fromJSON(translatedItems[0]);
		_fillMissingFromResponse(newItem, res);
		await newItem.saveTx();
		return newItem;
	}
	catch (e) {
		// Not fatal: the caller falls back to the bare metadata
		Zotero.debug('RecognizePDF: ' + e);
		return null;
	}
}

/**
 * Creates and saves an item from the bare metadata in the recognizer
 * response (used when no identifier lookup succeeded)
 */
async function _createItemFromMetadata(res, libraryID) {
	let type = res.type === 'book-chapter' ? 'bookSection' : 'journalArticle';
	let newItem = new Zotero.Item(type);
	newItem.libraryID = libraryID;
	newItem.setField('title', res.title);
	
	// The response may not include any authors
	let creators = (res.authors || []).map(author => ({
		firstName: author.firstName,
		lastName: author.lastName,
		creatorType: 'author'
	}));
	newItem.setCreators(creators);
	
	if (res.abstract) newItem.setField('abstractNote', res.abstract);
	if (res.year) newItem.setField('date', res.year);
	if (res.pages) newItem.setField('pages', res.pages);
	if (res.volume) newItem.setField('volume', res.volume);
	if (res.url) newItem.setField('url', res.url);
	if (res.language) newItem.setField('language', res.language);
	
	if (type === 'journalArticle') {
		if (res.issue) newItem.setField('issue', res.issue);
		// Response keys are lowercase like the others read here (the uppercase
		// variant is accepted as well, just in case); the item field is 'ISSN'
		let issn = res.issn || res.ISSN;
		if (issn) newItem.setField('ISSN', issn);
		if (res.container) newItem.setField('publicationTitle', res.container);
	}
	else if (type === 'bookSection') {
		if (res.container) newItem.setField('bookTitle', res.container);
		if (res.publisher) newItem.setField('publisher', res.publisher);
	}
	
	newItem.setField('libraryCatalog', 'Zotero');
	await newItem.saveTx();
	return newItem;
}
};