572 lines
15 KiB
JavaScript
572 lines
15 KiB
JavaScript
/*
	***** BEGIN LICENSE BLOCK *****
	
	Copyright © 2018 Center for History and New Media
	                 George Mason University, Fairfax, Virginia, USA
	                 http://zotero.org
	
	This file is part of Zotero.
	
	Zotero is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.
	
	Zotero is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.
	
	You should have received a copy of the GNU Affero General Public License
	along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
	
	***** END LICENSE BLOCK *****
*/
|
|
|
|
Zotero.RecognizePDF = new function () {
|
|
// Delay (ms) between connectivity re-checks while the browser is offline
const OFFLINE_RECHECK_DELAY = 60 * 1000;
// Page limit handed to the PDF converter when extracting JSON
const MAX_PAGES = 5;
// Age limit (ms) checked by canUnrecognize() against the recognized item's dateModified
const UNRECOGNIZE_TIMEOUT = 86400 * 1000;

// Parent items created by recognition -> data needed to undo the recognition
// (original attachment title/filename and the parent's dateModified)
let _newItems = new WeakMap();

// IDs of attachments waiting to be recognized; new IDs are unshifted onto the
// front and the processing loop pops from the end
let _queue = [];
// True while a processing loop is running
let _queueProcessing = false;
// ID of the attachment currently being recognized, or null
let _processingItemID = null;

let _progressQueue = Zotero.ProgressQueues.create({
	id: 'recognize',
	title: 'recognizePDF.title',
	columns: ['recognizePDF.pdfName.label', 'recognizePDF.itemName.label']
});

// Cancelling the progress queue drops everything that hasn't started yet
_progressQueue.addListener('cancel', () => {
	_queue = [];
});
|
|
|
|
|
|
/**
 * Triggers queue processing and returns when all items in the queue are processed
 *
 * Only one processing loop runs at a time: if a loop is already active, this
 * call returns right after the schema update promise resolves and leaves the
 * queued items to the running loop.
 *
 * Each item's progress row is updated to "processing" and then to either
 * "succeeded" (showing the title of the newly created parent item) or
 * "failed" (showing the Alert message, or a generic error string).
 *
 * @return {Promise}
 */
async function _processQueue() {
	await Zotero.Schema.schemaUpdatePromise;

	if (_queueProcessing) return;
	_queueProcessing = true;

	try {
		while (true) {
			// While all current progress queue usages are related with
			// online APIs, check internet connectivity here
			if (Zotero.HTTP.browserIsOffline()) {
				await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
				continue;
			}

			let itemID = _queue.pop();
			if (!itemID) break;

			_processingItemID = itemID;

			_progressQueue.updateRow(
				itemID,
				Zotero.ProgressQueue.ROW_PROCESSING,
				Zotero.getString('general.processing')
			);

			try {
				let item = await Zotero.Items.getAsync(itemID);
				if (!item) {
					throw new Error('Item ' + itemID + ' not found');
				}

				// The second progress column is the item name, so show the title of
				// the recognized parent item rather than the attachment's own title
				let parentItem = await _processItem(item);
				_progressQueue.updateRow(
					itemID,
					Zotero.ProgressQueue.ROW_SUCCEEDED,
					(parentItem || item).getField('title')
				);
			}
			catch (e) {
				Zotero.logError(e);

				// Alert exceptions carry a message meant for display; anything
				// else is reported with a generic error string
				_progressQueue.updateRow(
					itemID,
					Zotero.ProgressQueue.ROW_FAILED,
					e instanceof Zotero.Exception.Alert
						? e.message
						: Zotero.getString('general.error')
				);
			}
		}
	}
	finally {
		// Always release the processing state, even if the loop itself throws,
		// so that later calls are not blocked forever
		_queueProcessing = false;
		_processingItemID = null;
	}
}
|
|
|
|
|
|
/**
 * Queues the given items for recognition and kicks off queue processing
 *
 * An item is skipped if it is currently being processed, is already in the
 * queue, or fails canRecognize(). Every queued item also gets a row in the
 * progress queue.
 *
 * @param {Zotero.Item[]} items
 */
this.recognizeItems = function (items) {
	for (const candidate of items) {
		const alreadyHandled = _processingItemID === candidate.id
			|| _queue.includes(candidate.id);
		if (alreadyHandled || !this.canRecognize(candidate)) {
			continue;
		}

		// New entries go to the front; the processing loop pops from the end
		_queue.unshift(candidate.id);
		_progressQueue.addRow(candidate);
	}

	// Not awaited: processing continues in the background
	_processQueue();
};
|
|
|
|
|
|
/**
 * Tells whether recognition can be attempted on the given item
 *
 * The item must have the content type 'application/pdf' and must be a
 * top-level item.
 *
 * @param {Zotero.Item} item
 * @return {Boolean} Truthy if the PDF can be recognized, falsy if it cannot be
 *     (an empty/missing content type is returned as-is)
 */
this.canRecognize = function (item) {
	const contentType = item.attachmentContentType;
	if (!contentType) {
		return contentType;
	}
	if (contentType !== 'application/pdf') {
		return false;
	}
	return item.isTopLevelItem();
};
|
|
|
|
|
|
/**
 * Recognizes the PDF file attachments among the given items and opens the
 * progress dialog, provided the 'autoRecognizeFiles' pref is enabled
 *
 * Does nothing if the pref is off or if none of the items is a PDF file
 * attachment.
 *
 * @param {Zotero.Item[]} items - May contain empty entries, which are ignored
 */
this.autoRecognizeItems = function (items) {
	const enabled = Zotero.Prefs.get('autoRecognizeFiles');
	if (!enabled) return;

	const isPDFFileAttachment = (item) => {
		if (!item) return false;
		if (!item.isFileAttachment()) return false;
		return item.attachmentContentType == 'application/pdf';
	};

	var pdfs = items.filter(item => isPDFFileAttachment(item));
	if (pdfs.length) {
		this.recognizeItems(pdfs);
		Zotero.ProgressQueues.get('recognize').getDialog().open();
	}
};
|
|
|
|
|
|
/**
 * Tells whether a recognized parent item can still be reverted with unrecognize()
 *
 * The item must have been created by recognition within UNRECOGNIZE_TIMEOUT,
 * must not have been modified since, must have exactly one attachment and no
 * other children, and that attachment must be a PDF. When any check fails,
 * the stored undo data for the item is discarded.
 *
 * @param {Zotero.Item} item
 * @return {Boolean}
 */
this.canUnrecognize = function (item) {
	// Forget the undo data and report failure
	const refuse = () => {
		_newItems.delete(item);
		return false;
	};

	var { dateModified } = _newItems.get(item) || {};

	// Not created by recognition (or already forgotten)
	if (!dateModified) return refuse();
	// Recognized too long ago
	if (Zotero.Date.sqlToDate(dateModified, true) < new Date() - UNRECOGNIZE_TIMEOUT) return refuse();
	// Modified since it was created
	if (item.dateModified != dateModified) return refuse();
	// Must have exactly one attachment and no other children
	if (item.numAttachments(true) != 1) return refuse();
	if (item.numChildren(true) != 1) return refuse();

	// Child attachment must not be in the trash and must be a PDF
	var attachments = Zotero.Items.get(item.getAttachments());
	var hasPDF = attachments.length
		&& attachments[0].attachmentContentType == 'application/pdf';
	if (!hasPDF) return refuse();

	return true;
};
|
|
|
|
|
|
/**
 * Reverts a recognition: restores the attachment's original filename and title,
 * makes the attachment a top-level item in the parent's collections, and erases
 * the parent item
 *
 * Expects undo data for the item to be present (see canUnrecognize()).
 * A failure while renaming the file is logged and does not stop the rest.
 *
 * @param {Zotero.Item} item - Parent item created by recognition
 * @return {Promise}
 */
this.unrecognize = async function (item) {
	const { originalTitle, originalFilename } = _newItems.get(item);
	const attachment = Zotero.Items.get(item.getAttachments()[0]);

	// Best effort: put the original filename (and, if that worked, title) back
	try {
		if (attachment.attachmentFilename != originalFilename) {
			const renamed = await attachment.renameAttachmentFile(originalFilename);
			if (renamed) {
				attachment.setField('title', originalTitle);
			}
		}
	}
	catch (e) {
		Zotero.logError(e);
	}

	return Zotero.DB.executeTransaction(async () => {
		// Detach the attachment and move it into the parent's collections
		// before the parent is erased
		const collections = item.getCollections();
		attachment.parentItemID = null;
		attachment.setCollections(collections);
		await attachment.save();

		await item.erase();
	});
};
|
|
|
|
|
|
/**
 * Reports the metadata of a recognized item to the recognizer server
 *
 * Posts the description, the Zotero version, the JSON extracted from the first
 * attachment's PDF, and the item's JSON to the 'report' endpoint.
 *
 * @param {Zotero.Item} item - Parent item whose first attachment is the PDF
 * @param {String} description
 * @return {Promise} Result of the HTTP request
 * @throws {Error} If the attachment file can't be found
 */
this.report = async function (item, description) {
	const attachment = Zotero.Items.get(item.getAttachments()[0]);
	const filePath = attachment.getFilePath();
	const fileFound = filePath && await OS.File.exists(filePath);
	if (!fileFound) {
		throw new Error("File not found when reporting metadata");
	}

	const version = Zotero.version;
	const json = await extractJSON(filePath, MAX_PAGES);
	const metadata = item.toJSON();

	const payload = JSON.stringify({ description, version, json, metadata });
	const requestOptions = {
		successCodes: [200, 204],
		headers: {
			'Content-Type': 'application/json'
		},
		body: payload
	};
	return Zotero.HTTP.request("POST", ZOTERO_CONFIG.RECOGNIZE_URL + 'report', requestOptions);
};
|
|
|
|
|
|
/**
 * Recognizes an attachment and makes it a child of the newly created item
 *
 * The new parent is added to the attachment's collections. If the
 * 'autoRenameFiles' pref is set, the attachment file and title are renamed to
 * match the new metadata. If the attachment was the only selected item when
 * processing started, the parent is selected afterwards. The data needed to
 * undo the recognition is remembered for the parent.
 *
 * @param {Zotero.Item} attachment - Top-level attachment item
 * @return {Promise<Zotero.Item>} The new parent item
 * @throws {Error} If the attachment already has a parent or the rename fails
 * @throws {Zotero.Exception.Alert} If no metadata could be found
 */
async function _processItem(attachment) {
	// Make sure the attachment still doesn't have a parent
	if (attachment.parentItemID) {
		throw new Error('Already has parent');
	}

	// If only the PDF was selected, select the parent when we're done
	let selectParent = false;
	const paneBefore = Zotero.getActiveZoteroPane();
	if (paneBefore) {
		const selection = paneBefore.getSelectedItems();
		selectParent = selection.length == 1 && selection[0] == attachment;
	}

	const parentItem = await _recognize(attachment);
	if (!parentItem) {
		throw new Zotero.Exception.Alert("recognizePDF.noMatches");
	}

	// Put new item in same collections as the old one
	const collections = attachment.getCollections();
	await Zotero.DB.executeTransaction(async () => {
		if (collections.length) {
			collections.forEach(collectionID => parentItem.addToCollection(collectionID));
			await parentItem.save();
		}

		// Put old item as a child of the new item
		attachment.parentID = parentItem.id;
		await attachment.save();
	});

	// Captured before any rename so that the recognition can be undone
	const originalTitle = attachment.getField('title');
	const path = attachment.getFilePath();
	const originalFilename = OS.Path.basename(path);

	// Rename attachment file to match new metadata
	if (Zotero.Prefs.get('autoRenameFiles')) {
		const ext = Zotero.File.getExtension(path);
		const fileBaseName = Zotero.Attachments.getFileBaseNameFromItem(parentItem);
		const newName = ext ? fileBaseName + '.' + ext : fileBaseName + '';
		const renamed = await attachment.renameAttachmentFile(newName, false, true);
		if (renamed !== true) {
			throw new Error("Error renaming " + path);
		}
		// Rename attachment title
		attachment.setField('title', newName);
		await attachment.saveTx();
	}

	// Selection is cosmetic, so a failure here is only logged
	try {
		const paneAfter = Zotero.getActiveZoteroPane();
		if (paneAfter && selectParent) {
			await paneAfter.selectItem(parentItem.id);
		}
	}
	catch (e) {
		Zotero.logError(e);
	}

	const undoData = {
		originalTitle,
		originalFilename,
		dateModified: parentItem.dateModified
	};
	_newItems.set(parentItem, undoData);
	return parentItem;
}
|
|
|
|
/**
 * Get json from a PDF
 *
 * Runs the PDF converter with '-json' on the first `pages` pages, writing to a
 * cache file in the temp directory, then reads and parses that file. The cache
 * file is removed afterwards whether or not extraction succeeded; a failure to
 * remove it is only logged.
 *
 * @param {String} filePath PDF file path
 * @param {Number} pages Number of pages to extract
 * @return {Promise<Object>} Parsed JSON produced by the converter
 * @throws {Zotero.Exception.Alert} 'recognizePDF.couldNotRead' if the converter
 *     fails or its output can't be read or parsed
 */
async function extractJSON(filePath, pages) {
	let cacheFile = Zotero.getTempDirectory();
	cacheFile.append("recognizePDFcache.txt");
	// Start from a clean slate in case an earlier run left the file behind
	if (cacheFile.exists()) {
		cacheFile.remove(false);
	}

	let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
	args.push('-json', '-l', pages, filePath, cacheFile.path);

	Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));

	try {
		await Zotero.Utilities.Internal.exec(exec, args);
		let content = await Zotero.File.getContentsAsync(cacheFile.path);
		Zotero.debug("RecognizePDF: Extracted JSON:");
		Zotero.debug(content);
		return JSON.parse(content);
	}
	catch (e) {
		Zotero.logError(e);
		throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
	}
	finally {
		// Single, best-effort cleanup. Checking for existence first avoids a
		// spurious logged error when the converter never created the file, and
		// keeps a cleanup failure from masking a successfully parsed result.
		try {
			if (cacheFile.exists()) {
				cacheFile.remove(false);
			}
		}
		catch (cleanupError) {
			Zotero.logError(cleanupError);
		}
	}
}
|
|
|
|
/**
 * Attach appropriate handlers to a Zotero.Translate instance and begin translation
 *
 * A 'select' handler is installed that automatically picks the first candidate
 * offered by the translator. Translation is run without saving attachments.
 *
 * @param {Zotero.Translate} translate
 * @param {Integer} libraryID - Passed through to translate() as `libraryID`
 * @return {Promise} Resolves with the first item produced by the translation
 * @throws {Error} 'No items found' if the translation produced no items
 */
async function _promiseTranslate(translate, libraryID) {
	translate.setHandler('select', function (translate, items, callback) {
		// Choose the first candidate only. As before, the callback is not
		// invoked when there are no candidates at all.
		let keys = Object.keys(items || {});
		if (!keys.length) return;
		let firstKey = keys[0];
		callback({ [firstKey]: items[firstKey] });
	});

	let newItems = await translate.translate({
		libraryID,
		saveAttachments: false
	});
	// Treat a missing result the same way as an empty one
	if (newItems && newItems.length) {
		return newItems[0];
	}
	throw new Error('No items found');
}
|
|
|
|
/**
 * Sends the extracted PDF JSON to the recognizer server
 *
 * The base URL is the 'api.url' pref if set, otherwise
 * ZOTERO_CONFIG.RECOGNIZE_URL; 'recognize' is appended to it. The request is
 * made through the sync API client without an API key.
 *
 * @param {Object} json - Data to post, serialized with JSON.stringify()
 * @return {Promise<Object>} Parsed JSON response
 */
async function _query(json) {
	// TODO: Use main API URL for recognizer server
	const baseURL = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.RECOGNIZE_URL;
	const uri = (baseURL.endsWith('/') ? baseURL : baseURL + '/') + 'recognize';

	const client = Zotero.Sync.Runner.getAPIClient();
	const requestOptions = {
		successCodes: [200],
		headers: {
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(json),
		noAPIKey: true
	};
	const req = await client.makeRequest('POST', uri, requestOptions);

	return JSON.parse(req.responseText);
}
|
|
|
|
/**
 * Copies the abstract and language from the recognizer response onto an item
 * that doesn't already have them
 *
 * NOTE(review): the "already has" checks read `newItem.abstractNote` and
 * `newItem.language` as plain properties, as the original code did; confirm
 * that items expose field values this way.
 *
 * @param {Zotero.Item} newItem
 * @param {Object} res - Recognizer response
 */
function _applyRecognizerFallbacks(newItem, res) {
	if (!newItem.abstractNote && res.abstract) {
		newItem.setField('abstractNote', res.abstract);
	}
	if (!newItem.language && res.language) {
		newItem.setField('language', res.language);
	}
}

/**
 * Looks up metadata for an identifier with a search translator and saves the
 * resulting item
 *
 * @param {Object} identifier - Passed to Zotero.Translate.Search#setIdentifier()
 * @param {Integer} libraryID
 * @param {Object} res - Recognizer response, used to fill in missing fields
 * @return {Promise<Zotero.Item|null>} Saved item, or null if translation or
 *     saving failed (the error is written to the debug log)
 */
async function _recognizeByIdentifier(identifier, libraryID, res) {
	let translate = new Zotero.Translate.Search();
	translate.setIdentifier(identifier);
	let translators = await translate.getTranslators();
	translate.setTranslator(translators);

	try {
		let newItem = await _promiseTranslate(translate, libraryID);
		_applyRecognizerFallbacks(newItem, res);
		// Wait for the save so the caller gets a fully saved item and a save
		// failure is handled here instead of becoming an unhandled rejection
		await newItem.saveTx();
		return newItem;
	}
	catch (e) {
		Zotero.debug('RecognizePDF: ' + e);
		return null;
	}
}

/**
 * Retrieves metadata for a PDF and saves it as an item
 *
 * Extracts JSON from the PDF, sends it to the recognizer server, and then tries
 * the returned arXiv ID, DOI, and ISBN in that order. If none of them yields an
 * item, an item is built directly from the title and other fields in the
 * response.
 *
 * @param {Zotero.Item} item - PDF attachment
 * @return {Promise<Zotero.Item|null>} New saved item, or null if nothing usable
 *     was returned by the recognizer
 * @throws {Zotero.Exception.Alert} If the file is missing, can't be read, or
 *     contains no text on the extracted pages
 */
async function _recognize(item) {
	let filePath = await item.getFilePath();

	if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');

	let json = await extractJSON(filePath, MAX_PAGES);

	// At least one extracted page must have a non-empty third element;
	// otherwise the PDF is reported as having no OCR text
	let hasText = json.pages.some(page => page[2].length);
	if (!hasText) {
		throw new Zotero.Exception.Alert('recognizePDF.noOCR');
	}

	let libraryID = item.libraryID;

	let res = await _query(json);
	if (!res) return null;

	if (res.arxiv) {
		Zotero.debug('RecognizePDF: Getting metadata by arXiv');
		let newItem = await _recognizeByIdentifier({arXiv: res.arxiv}, libraryID, res);
		if (newItem) return newItem;
	}

	if (res.doi) {
		Zotero.debug('RecognizePDF: Getting metadata by DOI');
		let newItem = await _recognizeByIdentifier({DOI: res.doi}, libraryID, res);
		if (newItem) return newItem;
	}

	if (res.isbn) {
		Zotero.debug('RecognizePDF: Getting metadata by ISBN');
		let translate = new Zotero.Translate.Search();
		translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
		try {
			let translatedItems = await translate.translate({
				libraryID: false,
				saveAttachments: false
			});
			Zotero.debug('RecognizePDF: Translated items:');
			Zotero.debug(translatedItems);
			if (translatedItems.length) {
				let newItem = new Zotero.Item;
				newItem.libraryID = libraryID;
				// Convert tags to automatic. For other items this is done automatically in
				// translate.js, but for ISBNs we just get the data (libraryID=false) and do
				// the saving manually.
				translatedItems[0].tags = (translatedItems[0].tags || []).map(tag => {
					if (typeof tag == 'string') {
						return {
							tag,
							type: 1
						};
					}
					tag.type = 1;
					return tag;
				});
				newItem.fromJSON(translatedItems[0]);
				_applyRecognizerFallbacks(newItem, res);
				await newItem.saveTx();
				return newItem;
			}
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
		}
	}

	if (res.title) {
		let type = res.type === 'book-chapter' ? 'bookSection' : 'journalArticle';

		let newItem = new Zotero.Item(type);
		newItem.libraryID = libraryID;
		newItem.setField('title', res.title);

		// The response may come without an author list
		let creators = (res.authors || []).map(author => ({
			firstName: author.firstName,
			lastName: author.lastName,
			creatorType: 'author'
		}));
		newItem.setCreators(creators);

		if (res.abstract) newItem.setField('abstractNote', res.abstract);
		if (res.year) newItem.setField('date', res.year);
		if (res.pages) newItem.setField('pages', res.pages);
		if (res.volume) newItem.setField('volume', res.volume);
		if (res.url) newItem.setField('url', res.url);
		if (res.language) newItem.setField('language', res.language);

		if (type === 'journalArticle') {
			if (res.issue) newItem.setField('issue', res.issue);
			// Accept either spelling of the response key and write the value
			// that was actually tested to the 'ISSN' field
			let issn = res.ISSN || res.issn;
			if (issn) newItem.setField('ISSN', issn);
			if (res.container) newItem.setField('publicationTitle', res.container);
		}
		else if (type === 'bookSection') {
			if (res.container) newItem.setField('bookTitle', res.container);
			if (res.publisher) newItem.setField('publisher', res.publisher);
		}

		newItem.setField('libraryCatalog', 'Zotero');

		await newItem.saveTx();
		return newItem;
	}

	return null;
}
|
|
};
|
|
|