New PDF recognizer
This commit is contained in:
parent
ce0dd5cc5e
commit
991a50d090
12 changed files with 735 additions and 989 deletions
|
@ -125,8 +125,7 @@ var Zotero_DownloadOverlay = new function() {
|
||||||
try {
|
try {
|
||||||
if (item && item.getFile()) {
|
if (item && item.getFile()) {
|
||||||
timer.cancel();
|
timer.cancel();
|
||||||
var recognizer = new win.Zotero_RecognizePDF.ItemRecognizer();
|
Zotero.RecognizePDF.recognizeItems([item]);
|
||||||
recognizer.recognizeItems([item]);
|
|
||||||
}
|
}
|
||||||
} catch(e) { dump(e.toSource()) };
|
} catch(e) { dump(e.toSource()) };
|
||||||
}, 1000, Components.interfaces.nsITimer.TYPE_REPEATING_SLACK);
|
}, 1000, Components.interfaces.nsITimer.TYPE_REPEATING_SLACK);
|
||||||
|
|
|
@ -1,938 +0,0 @@
|
||||||
/*
|
|
||||||
***** BEGIN LICENSE BLOCK *****
|
|
||||||
|
|
||||||
Copyright © 2009 Center for History and New Media
|
|
||||||
George Mason University, Fairfax, Virginia, USA
|
|
||||||
http://zotero.org
|
|
||||||
|
|
||||||
This file is part of Zotero.
|
|
||||||
|
|
||||||
Zotero is free software: you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU Affero General Public License as published by
|
|
||||||
the Free Software Foundation, either version 3 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
Zotero is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU Affero General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Affero General Public License
|
|
||||||
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
|
|
||||||
|
|
||||||
***** END LICENSE BLOCK *****
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @fileOverview Tools for automatically retrieving a citation for the given PDF
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Front end for recognizing PDFs
|
|
||||||
* @namespace
|
|
||||||
*/
|
|
||||||
var Zotero_RecognizePDF = new function() {
|
|
||||||
var _progressWindow, _progressIndicator;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks whether a given PDF could theoretically be recognized
|
|
||||||
* @returns {Boolean} True if the PDF can be recognized, false if it cannot be
|
|
||||||
*/
|
|
||||||
this.canRecognize = function(/**Zotero.Item*/ item) {
	// Equivalent to `type && type == "application/pdf" && item.isTopLevelItem()`:
	// the first falsy operand is returned as-is
	var contentType = item.attachmentContentType;
	if (!contentType) {
		return contentType;
	}
	if (!(contentType == "application/pdf")) {
		return false;
	}
	// Only attachments that are not already filed under a parent item qualify
	return item.isTopLevelItem();
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves metadata for the PDF(s) selected in the Zotero Pane, placing the PDFs as children
|
|
||||||
* of the new items
|
|
||||||
*/
|
|
||||||
this.recognizeSelected = function() {
	var selection = ZoteroPane_Local.getSelectedItems();
	if (selection) {
		// A fresh recognizer per invocation; it opens its own progress dialog
		var recognizer = new Zotero_RecognizePDF.ItemRecognizer();
		recognizer.recognizeItems(selection);
	}
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves metadata for a PDF and saves it as an item
|
|
||||||
*
|
|
||||||
* @param {nsIFile} file The PDF file to retrieve metadata for
|
|
||||||
* @param {Integer} libraryID The library in which to save the PDF
|
|
||||||
* @param {Function} stopCheckCallback Function that returns true if the
|
|
||||||
* process is to be interrupted
|
|
||||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
|
||||||
*/
|
|
||||||
this.recognize = Zotero.Promise.coroutine(function* (file, libraryID, stopCheckCallback) {
	// Lookup order: DOI (incl. JSTOR stable URLs) -> ISBN -> Google Scholar full-text search.
	// A failed DOI or ISBN translation is logged and falls through to the next strategy.
	const MAX_PAGES = 15;
	
	var lines = yield _extractText(file, MAX_PAGES);
	// Look for DOI - Use only first 80 lines to avoid catching article references
	var allText = lines.join("\n"),
		firstChunk = lines.slice(0,80).join('\n'),
		doi = Zotero.Utilities.cleanDOI(firstChunk);
	Zotero.debug(allText);
	
	if (!doi) {
		// Look for a JSTOR stable URL, which can be converted to a DOI by prepending 10.2307.
		// The dot after "www" is escaped so that only a literal "www.jstor.org" matches.
		var jstorMatch = firstChunk.match(/www\.jstor\.org\/stable\/(\S+)/i);
		if (jstorMatch) {
			doi = Zotero.Utilities.cleanDOI(
				jstorMatch[1].indexOf('10.') == 0 ? jstorMatch[1] : '10.2307/' + jstorMatch[1]
			);
		}
	}
	
	var newItem;
	if (doi) {
		// Look up DOI
		Zotero.debug("RecognizePDF: Found DOI: "+doi);
		
		var translateDOI = new Zotero.Translate.Search();
		translateDOI.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
		translateDOI.setSearch({"itemType":"journalArticle", "DOI":doi});
		try {
			newItem = yield _promiseTranslate(translateDOI, libraryID);
			return newItem;
		}
		catch (e) {
			// DOI lookup failed; keep going with the ISBN search
			Zotero.debug("RecognizePDF: " + e);
		}
	}
	else {
		Zotero.debug("RecognizePDF: No DOI found in text");
	}
	
	// Look for ISBNs if no DOI
	var isbns = _findISBNs(allText);
	if (isbns.length) {
		Zotero.debug("RecognizePDF: Found ISBNs: " + isbns);
		
		var translate = new Zotero.Translate.Search();
		translate.setSearch({"itemType":"book", "ISBN":isbns[0]});
		try {
			newItem = yield _promiseTranslate(translate, libraryID);
			return newItem;
		}
		catch (e) {
			// ISBN lookup failed; keep going with Google Scholar
			Zotero.debug("RecognizePDF: " + e);
		}
	}
	else {
		Zotero.debug("RecognizePDF: No ISBN found in text");
	}
	
	// If no DOI or ISBN, query Google Scholar
	return this.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback);
});
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get text from a PDF
|
|
||||||
* @param {nsIFile} file PDF
|
|
||||||
* @param {Number} pages Number of pages to extract
|
|
||||||
* @return {Promise}
|
|
||||||
*/
|
|
||||||
function _extractText(file, pages) {
	var Cc = Components.classes,
		Ci = Components.interfaces;
	
	// The converter writes into a fixed temp file; start from a clean slate
	var cacheFile = Zotero.getTempDirectory();
	cacheFile.append("recognizePDFcache.txt");
	if (cacheFile.exists()) {
		cacheFile.remove(false);
	}
	
	var {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
	args.push('-nopgbrk', '-layout', '-l', pages, file.path, cacheFile.path);
	
	Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
	
	// Reads the converter output line by line, dropping blank lines
	function readConvertedText() {
		if (!cacheFile.exists()) {
			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
		}
		
		var collected;
		try {
			var fileStream = Cc["@mozilla.org/network/file-input-stream;1"]
				.createInstance(Ci.nsIFileInputStream);
			fileStream.init(cacheFile, 0x01, 0o664, 0);
			try {
				var textStream = Cc["@mozilla.org/intl/converter-input-stream;1"]
					.createInstance(Ci.nsIConverterInputStream);
				textStream.init(fileStream, "UTF-8", 65535,
					Ci.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
				textStream.QueryInterface(Ci.nsIUnicharLineInputStream);
				
				// get the lines in this sample
				collected = [];
				var holder = {};
				while (textStream.readLine(holder)) {
					var trimmed = holder.value.trim();
					if (trimmed) {
						collected.push(trimmed);
					}
				}
			}
			finally {
				fileStream.close();
			}
		}
		finally {
			// The temp file is removed whether or not reading succeeded
			cacheFile.remove(false);
		}
		
		return collected;
	}
	
	// Any converter failure is reported as the same "could not read" alert
	function conversionFailed() {
		throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
	}
	
	return Zotero.Utilities.Internal.exec(exec, args).then(readConvertedText, conversionFailed);
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Attach appropriate handlers to a Zotero.Translate instance and begin translation
|
|
||||||
* @return {Promise}
|
|
||||||
*/
|
|
||||||
var _promiseTranslate = Zotero.Promise.coroutine(function* (translate, libraryID) {
	// When the translator offers several candidates, pick the first one
	translate.setHandler("select", function(translate, items, callback) {
		for (var key in items) {
			var firstOnly = {};
			firstOnly[key] = items[key];
			callback(firstOnly);
			break;
		}
	});
	
	var saveOptions = {
		libraryID: libraryID,
		saveAttachments: false
	};
	var newItems = yield translate.translate(saveOptions);
	if (!newItems.length) {
		throw new Error("No items found");
	}
	return newItems[0];
});
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Search ISBNs in text
|
|
||||||
* @private
|
|
||||||
* @return {String[]} Array of ISBNs
|
|
||||||
*/
|
|
||||||
function _findISBNs(x) {
	if (typeof x != "string") {
		// Throw a real Error (not a bare string) so that a stack trace is available
		throw new Error("findISBNs: argument must be a string");
	}
	
	// Match lines saying "isbn: " or "ISBN-10:" or similar, consider m-dashes and n-dashes as well
	var pattern = /(SBN|sbn)[ \u2014\u2013\u2012-]?(10|13)?[: ]*([0-9X][0-9X \u2014\u2013\u2012-]+)/g;
	var candidates = [];
	var match;
	
	while ((match = pattern.exec(x))) {
		// Drop spaces and all dash variants, leaving only digits and "X"
		var isbn = match[3].replace(/[ \u2014\u2013\u2012-]/g, '');
		if (isbn.length == 20 || isbn.length == 26) {
			// Handle the case of two isbns (e.g. paper+hardback) next to each other
			candidates.push(isbn.slice(0, isbn.length / 2), isbn.slice(isbn.length / 2));
		}
		else if (isbn.length == 23) {
			// Handle the case of two isbns (10+13) next to each other
			candidates.push(isbn.slice(0, 10), isbn.slice(10));
		}
		else if (isbn.length == 10 || isbn.length == 13) {
			candidates.push(isbn);
		}
	}
	
	// Validate ISBNs; keep only those for which cleanISBN returns a truthy value
	var validIsbns = [];
	for (var i = 0; i < candidates.length; i++) {
		var cleanISBN = Zotero.Utilities.cleanISBN(candidates[i]);
		if (cleanISBN) {
			validIsbns.push(cleanISBN);
		}
	}
	return validIsbns;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @class Handles UI, etc. for recognizing multiple items
|
|
||||||
*/
|
|
||||||
this.ItemRecognizer = function () {
	// Items still waiting to be recognized; replaced by recognizeItems()
	this._items = [];
}

this.ItemRecognizer.prototype = {
	"_stopped": false,
	// Number of items handed to recognizeItems(); drives the progress bar and scrolling.
	// (Named to match the property that the methods below actually read and write.)
	"_itemTotal": 0,
	"_progressWindow": null,
	"_progressIndicator": null,
	
	/**
	 * Retrieves metadata for the PDF items passed, displaying a progress dialog during conversion
	 * and placing the PDFs as children of the new items
	 * @param {Zotero.Item[]} items
	 */
	"recognizeItems": function(items) {
		var me = this;
		this._items = items.slice();
		this._itemTotal = items.length;
		
		// The dialog is also stored in the enclosing closure's _progressWindow
		_progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
		// Recognition starts once the dialog has been shown
		this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false);
	},
	
	/**
	 * Halts recognition of PDFs
	 */
	"stop": function() {
		this._stopped = true;
	},
	
	/**
	 * Halts recognition and closes window
	 */
	"close": function() {
		this.stop();
		this._progressWindow.close();
	},
	
	/**
	 * Called when the progress window has been opened; adds items to the tree and begins recognizing
	 * @return {Promise} The promise returned by _recognizeItem()
	 */
	"_onWindowLoaded": function() {
		var me = this,
			doc = this._progressWindow.document;
		
		// populate progress window: one row per item with icon, original title and result cells
		var treechildren = doc.getElementById("treechildren");
		this._rowIDs = [];
		for (var i = 0; i < this._items.length; i++) {
			var treeitem = doc.createElement('treeitem');
			var treerow = doc.createElement('treerow');
			this._rowIDs.push(this._items[i].id);
			
			var treecell = doc.createElement('treecell');
			treecell.setAttribute("id", "item-"+this._items[i].id+"-icon");
			treerow.appendChild(treecell);
			
			treecell = doc.createElement('treecell');
			treecell.setAttribute("label", this._items[i].getField("title"));
			treerow.appendChild(treecell);
			
			treecell = doc.createElement('treecell');
			treecell.setAttribute("id", "item-"+this._items[i].id+"-title");
			treerow.appendChild(treecell);
			
			treeitem.appendChild(treerow);
			treechildren.appendChild(treeitem);
		}
		
		// `this` inside the listener is the tree element, which _onDblClick needs
		doc.getElementById("tree").addEventListener(
			"dblclick", function(event) { me._onDblClick(event, this); });
		
		// Handlers are kept on the instance so that _done() can remove them again
		this._cancelHandler = function() { me.stop() };
		this._keypressCancelHandler = function(e) {
			if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop();
		};
		
		_progressIndicator = this._progressIndicator = doc.getElementById("progress-indicator");
		doc.getElementById("cancel-button")
			.addEventListener("command", this._cancelHandler, false);
		// Also cancel if the user presses Esc
		this._progressWindow.addEventListener("keypress", this._keypressCancelHandler);
		this._progressWindow.addEventListener("close", this._cancelHandler, false);
		Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit();
		return this._recognizeItem();
	},
	
	/**
	 * Shifts an item off of this._items and recognizes it, then calls itself again if there are more
	 * @private
	 */
	"_recognizeItem": Zotero.Promise.coroutine(function* () {
		const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
		const FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
		const LOADING_IMAGE = "chrome://global/skin/icons/loading_16.png";
		
		if(!this._items.length) {
			this._done();
			return;
		}
		
		// Order here matters. Otherwise we may show an incorrect label
		if(this._stopped) {
			this._done(true);
			return;
		}
		
		this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;
		
		var item = this._items.shift(),
			itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"),
			itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"),
			rowNumber = this._rowIDs.indexOf(item.id);
		itemIcon.setAttribute("src", LOADING_IMAGE);
		itemTitle.setAttribute("label", "");
		
		var file = item.getFile();
		
		try {
			if (file) {
				let newItem = yield Zotero_RecognizePDF.recognize(
					file,
					item.libraryID,
					() => this._stopped
				);
				
				// If already stopped, delete
				if (this._stopped) {
					yield Zotero.Items.erase(newItem.id);
					throw new Zotero.Exception.Alert('recognizePDF.stopped');
				}
				
				// put new item in same collections as the old one
				let itemCollections = item.getCollections();
				yield Zotero.DB.executeTransaction(function* () {
					for (let i = 0; i < itemCollections.length; i++) {
						let collection = Zotero.Collections.get(itemCollections[i]);
						yield collection.addItem(newItem.id);
					}
					
					// put old item as a child of the new item
					item.parentID = newItem.id;
					yield item.save();
				});
				
				itemTitle.setAttribute("label", newItem.getField("title"));
				itemIcon.setAttribute("src", SUCCESS_IMAGE);
				// From now on a double-click on this row selects the new parent item
				this._rowIDs[rowNumber] = newItem.id;
				
				return this._recognizeItem();
			}
			else {
				throw new Zotero.Exception.Alert("recognizePDF.fileNotFound");
			}
		}
		catch (e) {
			Zotero.logError(e);
			
			// Alerts carry a user-presentable message; anything else gets a generic label
			itemTitle.setAttribute(
				"label",
				e instanceof Zotero.Exception.Alert
					? e.message
					: Zotero.getString("recognizePDF.error")
			);
			itemIcon.setAttribute("src", FAILURE_IMAGE);
			
			// Don't show "completed" label if stopped on last item
			if (this._stopped && !this._items.length) {
				this._done(true);
			} else {
				return this._recognizeItem();
			}
		}
		finally {
			// scroll to this item
			this._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(
				Math.max(0, this._itemTotal - this._items.length - 4)
			);
		}
	}),
	
	/**
	 * Cleans up after items are recognized, disabling the cancel button and
	 * making the progress window close on blur.
	 * @param {Boolean} cancelled Whether the process was cancelled
	 */
	"_done": function(cancelled) {
		this._progressIndicator.value = 100;
		// Switch out cancel for close
		var cancelButton = this._progressWindow.document.getElementById("cancel-button"),
			me = this;
		cancelButton.label = Zotero.getString("recognizePDF.close.label");
		cancelButton.removeEventListener("command", this._cancelHandler, false);
		cancelButton.addEventListener("command", function() { me.close() }, false);
		// After completion any key press closes the window
		this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler);
		this._progressWindow.addEventListener("keypress", function() { me.close() });
		
		if(Zotero.isMac) {
			// On MacOS X, the windows are not always on top, so we hide them on
			// blur to avoid clutter
			this._setCloseTimer();
		}
		this._progressWindow.document.getElementById("label").value =
			cancelled ? Zotero.getString("recognizePDF.cancelled.label")
				: Zotero.getString("recognizePDF.complete.label");
	},
	
	/**
	 * Set a timer after which the window will close automatically. If the
	 * window is refocused, clear the timer and do not attempt to auto-close
	 * any more
	 * @private
	 */
	"_setCloseTimer": function() {
		var win = this._progressWindow;
		var focusListener = function() {
			if(!win.zoteroCloseTimeoutID) return;
			
			win.clearTimeout(win.zoteroCloseTimeoutID);
			delete win.zoteroCloseTimeoutID;
			
			win.removeEventListener('blur', blurListener, false);
			win.removeEventListener('focus', focusListener, false);
		};
		var blurListener = function() {
			// Close window after losing focus for 5 seconds
			win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000);
			// Prevent auto-close if we gain focus again
			win.addEventListener("focus", focusListener, false);
		};
		win.addEventListener("blur", blurListener, false);
	},
	
	/**
	 * Focus items in Zotero library when double-clicking them in the Retrieve
	 * metadata window.
	 * @param {Event} event
	 * @param {tree} tree XUL tree object
	 * @private
	 */
	"_onDblClick": function(event, tree) {
		if (event && tree && event.type == "dblclick") {
			var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)];
			if(!itemID) return;
			
			// Get the right window. In tab mode, it's the container window
			var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window);
			
			if (lastWin.ZoteroOverlay) {
				lastWin.ZoteroOverlay.toggleDisplay(true);
			}
			
			lastWin.ZoteroPane.selectItem(itemID, false, true);
			lastWin.focus();
		}
	}
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Singleton for querying Google Scholar. Ensures that all queries are
|
|
||||||
* sequential and respect the delay in between queries.
|
|
||||||
* @namespace
|
|
||||||
*/
|
|
||||||
this.GSFullTextSearch = new function() {
|
|
||||||
const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms
|
|
||||||
var queryLimitReached = false,
|
|
||||||
inProgress = false,
|
|
||||||
queue = [],
|
|
||||||
stopCheckCallback; // As long as we process one query at a time, this is ok
|
|
||||||
// Load nsICookieManager2
|
|
||||||
Components.utils.import("resource://gre/modules/Services.jsm");
|
|
||||||
var cookieService = Services.cookies;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reset "Query Limit Reached" flag, so that we attempt to query Google again
|
|
||||||
*/
|
|
||||||
this.resetQueryLimit = function() {
	// Clearing the flag lets findItem() and _processQueue() issue queries again
	queryLimitReached = false;
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Queue up item for Google Scholar query
|
|
||||||
* @param {String[]} lines Lines of text to use for full-text query
|
|
||||||
* @param {Integer | null} libraryID Library to save the item to
|
|
||||||
* @param {Function} stopCheckCallback Function that returns true if the
|
|
||||||
* process is to be interrupted
|
|
||||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
|
||||||
*/
|
|
||||||
this.findItem = function(lines, libraryID, stopCheckCallback) {
	var idle = !inProgress;
	if (idle && queryLimitReached) {
		// There's no queue, so we can reject immediately
		return Zotero.Promise.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
	}
	
	// Otherwise enqueue the request; _processQueue() settles the deferred later
	var deferred = Zotero.Promise.defer();
	var request = { deferred, lines, libraryID, stopCheckCallback };
	queue.push(request);
	_processQueue();
	return deferred.promise;
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Process Google Scholar queue
|
|
||||||
* @private
|
|
||||||
* @param {Boolean} proceed Whether we should pop the next item off the queue
|
|
||||||
* This should not be true unless being called after processing
|
|
||||||
* another item
|
|
||||||
*/
|
|
||||||
function _processQueue(proceed) {
	// only one at a time, unless we are continuing after a finished entry
	if (!proceed && inProgress) return;
	
	if (queue.length == 0) {
		inProgress = false;
		return;
	}
	
	inProgress = true;
	
	if (queryLimitReached) {
		// Irreversibly blocked. Reject remaining items in queue
		for (var blocked = queue.shift(); blocked; blocked = queue.shift()) {
			blocked.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
		}
		_processQueue(true); // Wrap it up
		return;
	}
	
	var next = queue.shift();
	
	// Published in the enclosing scope so the CAPTCHA helpers can poll it too
	stopCheckCallback = next.stopCheckCallback;
	if (stopCheckCallback && stopCheckCallback()) {
		next.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped'));
		_processQueue(true);
		return;
	}
	
	var lookup = Zotero.Promise.try(function () {
		var candidateLines = getGoodLines(next.lines);
		return queryGoogle(candidateLines, next.libraryID, 3); // Try querying 3 times
	})
	.finally(function () {
		// Move on to the next entry whether this one succeeded or failed
		_processQueue(true);
	});
	next.deferred.resolve(lookup);
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Select lines that are good candidates for Google Scholar query
|
|
||||||
* @private
|
|
||||||
* @param {String[]} lines
|
|
||||||
* @return {String[]}
|
|
||||||
*/
|
|
||||||
function getGoodLines(lines) {
	// Use only first column from multi-column lines
	const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
	
	// Collect up to 100 lines that contain more than three single-space-separated words
	var cleanedLines = [];
	for (var i = 0; i < lines.length && cleanedLines.length < 100; i++) {
		var m = lineRe.exec(
			lines[i]
				// Replace non-breaking spaces
				.replace(/\xA0/g, ' ')
		);
		if (m && m[1].split(' ').length > 3) {
			cleanedLines.push(m[1]);
		}
	}
	
	// Too little text (or the Google Books boilerplate page) means there is nothing to query with
	if (cleanedLines.length < 20
			|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
		throw new Zotero.Exception.Alert("recognizePDF.noOCR");
	}
	
	// Get (not quite) median length. Sort a separate array of lengths with a
	// numeric comparator: the default sort compares as strings ("100" < "20"),
	// and sorting in place would break the pairing between lines and lengths.
	var sortedLengths = cleanedLines
		.map(line => line.length)
		.sort((a, b) => a - b);
	var medianLength = sortedLengths[Math.floor(sortedLengths.length / 2)];
	
	// Pick lines within 6 chars of the median (this is completely arbitrary)
	var uBound = medianLength + 6,
		lBound = medianLength - 6;
	var goodLines = [];
	for (var j = 0; j < cleanedLines.length; j++) {
		var length = cleanedLines[j].length;
		if (length > lBound && length < uBound) {
			// Strip all quotation marks so they don't mess up search query quoting
			goodLines.push(cleanedLines[j].replace(/"/g, ''));
		}
	}
	return goodLines;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Query Google Scholar
|
|
||||||
* @private
|
|
||||||
* @param {String[]} goodLines
|
|
||||||
* @param {Integer | null} libraryID
|
|
||||||
* @param {Integer} tries Number of queries to attempt before giving up
|
|
||||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
|
||||||
*/
|
|
||||||
var queryGoogle = Zotero.Promise.coroutine(function* (goodLines, libraryID, tries) {
	if (tries <= 0) {
		throw new Zotero.Exception.Alert("recognizePDF.noMatches");
	}
	
	// Take the relevant parts of some lines (exclude hyphenated word).
	// Lines are removed from goodLines as they are consumed, so a retry uses different text.
	var phrases = [], wordCount = 0, lineIndex = 0;
	while (wordCount < 25) {
		if (!goodLines.length) {
			throw new Zotero.Exception.Alert("recognizePDF.noMatches");
		}
		
		var words = goodLines.splice(lineIndex, 1)[0].split(/\s+/);
		// Try to avoid picking adjacent strings so the odds of them appearing in another
		// document quoting our document is low. Every 7th line is a magic value
		lineIndex = (lineIndex + 7) % goodLines.length;
		
		// Get rid of first and last words
		words.shift();
		words.pop();
		
		// Make sure there are no long words (probably OCR mistakes)
		var hasLongWord = words.some(word => word.length > 20);
		
		// Add words to query
		if (!hasLongWord && words.length) {
			wordCount += words.length;
			phrases.push('"' + words.join(" ") + '" ');
		}
	}
	var queryString = phrases.join("");
	
	Zotero.debug("RecognizePDF: Query string " + queryString);
	
	var url = "https://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
	var delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
	
	// Wait out whatever is left of the minimum interval between queries
	if (delay > 0) {
		yield Zotero.Promise.delay(delay);
	}
	Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
	
	try {
		let xmlhttp = yield Zotero.HTTP.request("GET", url, { "responseType": "document" })
			.then(
				response => _checkCaptchaOK(response, 3),
				error => _checkCaptchaError(error, 3)
			);
		
		let doc = xmlhttp.response;
		let deferred = Zotero.Promise.defer();
		let translate = new Zotero.Translate.Web();
		
		translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
		translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
		translate.setHandler("translators", function (translate, detected) {
			// Translate the results page if it was recognized; otherwise retry with other lines
			var outcome;
			if (detected.length) {
				outcome = _promiseTranslate(translate, libraryID);
			}
			else {
				outcome = Zotero.Promise.try(function () {
					return queryGoogle(goodLines, libraryID, tries-1);
				});
			}
			deferred.resolve(outcome);
		});
		translate.getTranslators();
		
		return deferred.promise;
	}
	catch (e) {
		// Remember a rate-limit error so that later queue entries are rejected up front
		if (e.name == "recognizePDF.limit") {
			queryLimitReached = true;
		}
		throw e;
	}
});
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check for CAPTCHA on a page with HTTP 200 status
|
|
||||||
* @private
|
|
||||||
* @param {XMLHttpRequest} xmlhttp
|
|
||||||
* @param {Integer} tries Number of queries to attempt before giving up
|
|
||||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
|
||||||
*/
|
|
||||||
function _checkCaptchaOK(xmlhttp, tries) {
	// Bail out early if the user cancelled while the request was in flight
	if (stopCheckCallback && stopCheckCallback()) {
		throw new Zotero.Exception.Alert('recognizePDF.stopped');
	}
	
	Zotero.debug("RecognizePDF: (" + xmlhttp.status + ") Got page with title " + xmlhttp.response.title);
	
	var captchaForms = Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']");
	if (!captchaForms.length) {
		// Regular page: hand the response back unchanged
		return xmlhttp;
	}
	
	Zotero.debug("RecognizePDF: Found CAPTCHA on page.");
	return _solveCaptcha(xmlhttp, tries);
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check for CAPTCHA on an error page. Handle 403 and 503 pages
|
|
||||||
* @private
|
|
||||||
* @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object
|
|
||||||
* @param {Integer} tries Number of queries to attempt before giving up
|
|
||||||
* @param {Boolean} dontClearCookies If true, do not attempt to clear cookies
|
|
||||||
* in order to get CAPTCHA to show up
|
|
||||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
|
||||||
*/
|
|
||||||
var _checkCaptchaError = Zotero.Promise.coroutine(function* (e, tries, dontClearCookies) {
|
|
||||||
if(stopCheckCallback && stopCheckCallback()) {
|
|
||||||
throw new Zotero.Exception.Alert('recognizePDF.stopped');
|
|
||||||
}
|
|
||||||
|
|
||||||
Zotero.debug("RecognizePDF: Checking for CAPTCHA on Google Scholar error page (" + e.status + ")");
|
|
||||||
|
|
||||||
// Check for captcha on error page
|
|
||||||
if(e instanceof Zotero.HTTP.UnexpectedStatusException
|
|
||||||
&& (e.status == 403 || e.status == 503) && e.xmlhttp.response) {
|
|
||||||
if(_extractCaptchaFormData(e.xmlhttp.response)) {
|
|
||||||
Zotero.debug("RecognizePDF: CAPTCHA found");
|
|
||||||
return _solveCaptcha(e.xmlhttp, tries);
|
|
||||||
} else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL
|
|
||||||
// AFAICT, for 403 errors, GS just says "sorry, try later",
|
|
||||||
// but if you clear cookies, you get a CAPTCHA
|
|
||||||
Zotero.debug("RecognizePDF: No CAPTCHA detected on page. Clearing cookies.");
|
|
||||||
if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) {
|
|
||||||
//user said no or no cookies removed
|
|
||||||
throw new Zotero.Exception.Alert('recognizePDF.limit');
|
|
||||||
}
|
|
||||||
// Redo GET request
|
|
||||||
Zotero.debug("RecognizePDF: Reloading page after clearing cookies.");
|
|
||||||
return Zotero.HTTP.request(
|
|
||||||
"GET", e.xmlhttp.channel.originalURI.spec, { "responseType": "document" }
|
|
||||||
)
|
|
||||||
.then(
|
|
||||||
function (xmlhttp) {
|
|
||||||
return _checkCaptchaOK(xmlhttp, tries);
|
|
||||||
},
|
|
||||||
function (e) {
|
|
||||||
return _checkCaptchaError(e, tries, true); // Don't try this again
|
|
||||||
}
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page"
|
|
||||||
+ " with status " + e.status);
|
|
||||||
throw new Zotero.Exception.Alert('recognizePDF.limit');
|
|
||||||
}
|
|
||||||
throw e;
|
|
||||||
});
|
|
||||||
|
|
||||||
/**
 * Prompt the user to enter the CAPTCHA and submit the answer
 * @private
 * @param {XMLHttpRequest} xmlhttp Request whose response document holds the CAPTCHA form
 * @param {Integer} [tries] Number of queries to attempt before giving up
 *     (defaults to 3 when undefined)
 * @return {Promise} Promise for the outcome of the follow-up request, as
 *     processed by _checkCaptchaOK or _checkCaptchaError
 * @throws {Zotero.Exception.Alert} 'recognizePDF.limit' (synchronously) when no
 *     attempts remain, no CAPTCHA form is found, or the user enters nothing
 */
function _solveCaptcha(xmlhttp, tries) {
	var captchaPage = xmlhttp.response;
	
	if(tries === undefined) {
		tries = 3;
	}
	
	if(!tries) {
		Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts.");
		throw new Zotero.Exception.Alert('recognizePDF.limit');
	}
	
	// This attempt is used up whether or not it succeeds
	tries--;
	
	var formData = captchaPage && _extractCaptchaFormData(captchaPage);
	if(!formData) {
		Zotero.debug("RecognizePDF: Could not find CAPTCHA on page.");
		throw new Zotero.Exception.Alert('recognizePDF.limit');
	}
	
	var dialogArgs = {
		dataIn: {
			title: Zotero.getString("recognizePDF.captcha.title"),
			description: Zotero.getString("recognizePDF.captcha.description"),
			imgUrl: formData.img
		}
	};
	
	// Modal dialog: dialogArgs.dataOut is filled in by the time this call returns
	_progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
		"chrome,modal,resizable=no,centerscreen", dialogArgs);
	
	if(!dialogArgs.dataOut) {
		Zotero.debug("RecognizePDF: No CAPTCHA entered");
		throw new Zotero.Exception.Alert('recognizePDF.limit');
	}
	
	Zotero.debug('RecognizePDF: User entered "' + dialogArgs.dataOut.captcha + '" for CAPTCHA');
	formData.input.captcha = dialogArgs.dataOut.captcha;
	
	// Serialize every form input as a URL-encoded query string
	var pairs = [];
	for(var inputName in formData.input) {
		pairs.push(encodeURIComponent(inputName) + '='
			+ encodeURIComponent(formData.input[inputName]));
	}
	var submitURL = formData.action + '?' + pairs.join('&');
	
	return Zotero.HTTP.promise("GET", submitURL, {"responseType":"document"})
		.then(
			function(response) {
				return _checkCaptchaOK(response, tries);
			},
			function(error) {
				return _checkCaptchaError(error, tries);
			}
		);
}
|
|
||||||
|
|
||||||
/**
 * Collect what is needed to display and submit the CAPTCHA from its page
 * @private
 * @param {Document} doc DOM document object for the CAPTCHA page
 * @return {Object|undefined} Object with `img` (src of the first image),
 *     `action` (action of the first form), `input` (name -> value of the form's
 *     named inputs) and `continue`; undefined if the page lacks an image or a form
 */
function _extractCaptchaFormData(doc) {
	var captchaImage = doc.getElementsByTagName('img')[0];
	if(!captchaImage) {
		return;
	}
	
	var captchaForm = doc.forms[0];
	if(!captchaForm) {
		return;
	}
	
	// Unnamed inputs are skipped
	var fields = {};
	var inputs = captchaForm.getElementsByTagName('input');
	var index = 0;
	while(index < inputs.length) {
		var field = inputs[index];
		if(field.name) {
			fields[field.name] = field.value;
		}
		index++;
	}
	
	return {
		img: captchaImage.src,
		action: captchaForm.action,
		input: fields,
		// NOTE(review): this sits next to `input`, not inside it, and nothing in
		// the visible code reads it - possibly meant to be a form parameter; confirm
		continue: "https://scholar.google.com"
	};
}
|
|
||||||
|
|
||||||
/**
 * Remove the Google cookies that keep the CAPTCHA page from appearing
 * @private
 * @param {String} host Host of the Google Scholar page (in case it's proxied)
 * @return {Boolean} Whether any cookies were cleared
 */
function _clearGSCookies(host) {
	// A confirmation prompt (shown outside of Standalone) used to precede this;
	// it was disabled because deleting GDSESS did not seem to have any
	// negative effects.
	
	var clearedAny = false;
	var enumerator = cookieService.getCookiesFromHost(host);
	
	while(enumerator.hasMoreElements()) {
		var cookie = enumerator.getNext().QueryInterface(Components.interfaces.nsICookie2);
		
		// GDSESS doesn't seem to always be enough
		var isTarget = cookie.name === "GDSESS" || cookie.name === "PREF";
		if(!isTarget) {
			continue;
		}
		
		Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host "
			+ cookie.host + " and path " + cookie.path);
		cookieService.remove(cookie.host, cookie.name, cookie.path, false);
		clearedAny = true;
	}
	
	if(!clearedAny) {
		Zotero.debug("RecognizePDF: No cookies removed");
	}
	
	return clearedAny;
}
|
|
||||||
};
|
|
||||||
}
|
|
212
chrome/content/zotero/recognizePDFDialog.js
Normal file
212
chrome/content/zotero/recognizePDFDialog.js
Normal file
|
@ -0,0 +1,212 @@
|
||||||
|
/*
|
||||||
|
***** BEGIN LICENSE BLOCK *****
|
||||||
|
|
||||||
|
Copyright © 2018 Center for History and New Media
|
||||||
|
George Mason University, Fairfax, Virginia, USA
|
||||||
|
http://zotero.org
|
||||||
|
|
||||||
|
This file is part of Zotero.
|
||||||
|
|
||||||
|
Zotero is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
Zotero is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
***** END LICENSE BLOCK *****
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @fileOverview Tools for automatically retrieving a citation for the given PDF
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Front end for recognizing PDFs
|
||||||
|
* @namespace
|
||||||
|
*/
|
||||||
|
|
||||||
|
let Zotero_RecognizePDF_Dialog = new function () {
|
||||||
|
const SUCCESS_IMAGE = 'chrome://zotero/skin/tick.png';
|
||||||
|
const FAILURE_IMAGE = 'chrome://zotero/skin/cross.png';
|
||||||
|
const LOADING_IMAGE = 'chrome://zotero/skin/arrow_refresh.png';
|
||||||
|
|
||||||
|
let _progressWindow = null;
|
||||||
|
let _progressIndicator = null;
|
||||||
|
let _rowIDs = [];
|
||||||
|
|
||||||
|
/**
 * Show the progress dialog: focus the existing window if one is already
 * open, otherwise open a new one and populate it once it fires 'pageshow'
 */
this.open = function() {
	if (!_progressWindow) {
		_progressWindow = window.openDialog(
			'chrome://zotero/content/recognizePDFDialog.xul',
			'',
			'chrome,close=yes,resizable=yes,dependent,dialog,centerscreen'
		);
		_progressWindow.addEventListener('pageshow', _onWindowLoaded.bind(this), false);
		return;
	}
	
	_progressWindow.focus();
};
|
||||||
|
|
||||||
|
/**
 * Close the progress dialog, if open: unregister the row listeners from
 * Zotero.RecognizePDF, close the window and reset the dialog state
 */
function close() {
	if (!_progressWindow) {
		return;
	}
	
	for (let eventName of ['rowadded', 'rowupdated', 'rowdeleted']) {
		Zotero.RecognizePDF.removeListener(eventName);
	}
	
	_progressWindow.close();
	
	_progressWindow = null;
	_progressIndicator = null;
	_rowIDs = [];
}
|
||||||
|
|
||||||
|
/**
 * Map a row status to the icon shown for it
 * @param {Number} status One of the Zotero.RecognizePDF.ROW_* constants
 * @return {String} Icon URL for processing, failed and succeeded rows;
 *     empty string for any other status
 */
function _getImageByStatus(status) {
	switch (status) {
		case Zotero.RecognizePDF.ROW_PROCESSING:
			return LOADING_IMAGE;
		
		case Zotero.RecognizePDF.ROW_FAILED:
			return FAILURE_IMAGE;
		
		case Zotero.RecognizePDF.ROW_SUCCEEDED:
			return SUCCESS_IMAGE;
		
		default:
			return '';
	}
}
|
||||||
|
|
||||||
|
/**
 * Build the <treeitem> for a row: one <treerow> with an icon cell, a file
 * name cell and a message cell. The treeitem, icon cell and message cell get
 * IDs derived from the row ID so they can be looked up later.
 * @param {Object} row Row with `id`, `status`, `fileName` and `message`
 * @return {Element} The new, not yet attached, treeitem
 */
function _rowToTreeItem(row) {
	let doc = _progressWindow.document;
	let baseID = 'item-' + row.id;
	
	// Create a treecell carrying the given attributes
	let makeCell = function (attributes) {
		let cell = doc.createElement('treecell');
		for (let attrName in attributes) {
			cell.setAttribute(attrName, attributes[attrName]);
		}
		return cell;
	};
	
	let treerow = doc.createElement('treerow');
	treerow.appendChild(makeCell({
		id: baseID + '-icon',
		src: _getImageByStatus(row.status)
	}));
	treerow.appendChild(makeCell({
		label: row.fileName
	}));
	treerow.appendChild(makeCell({
		id: baseID + '-title',
		label: row.message
	}));
	
	let treeitem = doc.createElement('treeitem');
	treeitem.setAttribute('id', baseID);
	treeitem.appendChild(treerow);
	return treeitem;
}
|
||||||
|
|
||||||
|
function _onWindowLoaded() {
|
||||||
|
let rows = Zotero.RecognizePDF.getRows();
|
||||||
|
_rowIDs = [];
|
||||||
|
let treechildren = _progressWindow.document.getElementById('treechildren');
|
||||||
|
|
||||||
|
for (let row of rows) {
|
||||||
|
_rowIDs.push(row.id);
|
||||||
|
let treeitem = _rowToTreeItem(row);
|
||||||
|
treechildren.appendChild(treeitem);
|
||||||
|
}
|
||||||
|
|
||||||
|
_progressWindow.document.getElementById('tree').addEventListener('dblclick',
|
||||||
|
function (event) {
|
||||||
|
_onDblClick(event, this);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
_progressIndicator = _progressWindow.document.getElementById('progress-indicator');
|
||||||
|
_progressWindow.document.getElementById('cancel-button')
|
||||||
|
.addEventListener('command', function () {
|
||||||
|
close();
|
||||||
|
Zotero.RecognizePDF.cancel();
|
||||||
|
}, false);
|
||||||
|
|
||||||
|
_progressWindow.document.getElementById('minimize-button')
|
||||||
|
.addEventListener('command', function () {
|
||||||
|
close();
|
||||||
|
}, false);
|
||||||
|
|
||||||
|
_progressWindow.document.getElementById('close-button')
|
||||||
|
.addEventListener('command', function () {
|
||||||
|
close();
|
||||||
|
Zotero.RecognizePDF.cancel();
|
||||||
|
}, false);
|
||||||
|
|
||||||
|
_progressWindow.addEventListener('keypress', function (e) {
|
||||||
|
if (e.keyCode === KeyEvent.DOM_VK_ESCAPE) close();
|
||||||
|
});
|
||||||
|
_progressWindow.addEventListener('close', close.bind(this), false);
|
||||||
|
|
||||||
|
_updateProgress();
|
||||||
|
|
||||||
|
Zotero.RecognizePDF.addListener('rowadded', function (row) {
|
||||||
|
_rowIDs.push(row.id);
|
||||||
|
let treeitem = _rowToTreeItem(row);
|
||||||
|
treechildren.appendChild(treeitem);
|
||||||
|
_updateProgress();
|
||||||
|
});
|
||||||
|
|
||||||
|
Zotero.RecognizePDF.addListener('rowupdated', function (row) {
|
||||||
|
let itemIcon = _progressWindow.document.getElementById('item-' + row.id + '-icon');
|
||||||
|
let itemTitle = _progressWindow.document.getElementById('item-' + row.id + '-title');
|
||||||
|
|
||||||
|
itemIcon.setAttribute('src', _getImageByStatus(row.status));
|
||||||
|
itemTitle.setAttribute('label', row.message);
|
||||||
|
_updateProgress();
|
||||||
|
});
|
||||||
|
|
||||||
|
Zotero.RecognizePDF.addListener('rowdeleted', function (row) {
|
||||||
|
_rowIDs.splice(_rowIDs.indexOf(row.id), 1);
|
||||||
|
let treeitem = _progressWindow.document.getElementById('item-' + row.id);
|
||||||
|
treeitem.parentNode.removeChild(treeitem);
|
||||||
|
_updateProgress();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Refresh the progress meter, the buttons and the label from the current
 * row counts. Does nothing while the dialog is closed.
 *
 * While rows are still pending, Cancel and Minimize are shown; once every
 * row is processed only Close is shown and the label switches to "complete".
 */
function _updateProgress() {
	if (!_progressWindow) return;
	
	let doc = _progressWindow.document;
	let total = Zotero.RecognizePDF.getTotal();
	let processed = Zotero.RecognizePDF.getProcessedTotal();
	let finished = processed === total;
	
	// With no rows at all, 0 / 0 would put NaN into the meter; an empty
	// queue is reported as complete below, so show a full bar to match
	_progressIndicator.value = total > 0 ? processed * 100 / total : 100;
	
	doc.getElementById("cancel-button").hidden = finished;
	doc.getElementById("minimize-button").hidden = finished;
	doc.getElementById("close-button").hidden = !finished;
	doc.getElementById("label").value = Zotero.getString(
		finished ? 'recognizePDF.complete.label' : 'recognizePDF.recognizing.label'
	);
}
|
||||||
|
|
||||||
|
/**
 * Select the corresponding item in the Zotero pane when a row of the
 * Retrieve Metadata window is double-clicked. If the item has a parent,
 * the parent is selected instead.
 * @param {Event} event
 * @param {tree} tree XUL tree object
 * @private
 */
async function _onDblClick(event, tree) {
	if (!event || !tree || event.type !== 'dblclick') {
		return;
	}
	
	// Tree rows and _rowIDs share the same order
	let rowIndex = tree.treeBoxObject.getRowAt(event.clientX, event.clientY);
	let targetID = _rowIDs[rowIndex];
	if (!targetID) {
		return;
	}
	
	let item = await Zotero.Items.getAsync(targetID);
	if (!item) {
		return;
	}
	
	if (item.parentItemID) {
		targetID = item.parentItemID;
	}
	
	if (window.ZoteroOverlay) {
		window.ZoteroOverlay.toggleDisplay(true);
	}
	
	window.ZoteroPane.selectItem(targetID, false, true);
	window.focus();
}
|
||||||
|
};
|
|
@ -6,10 +6,12 @@
|
||||||
title="&zotero.progress.title;" width="550" height="230"
|
title="&zotero.progress.title;" width="550" height="230"
|
||||||
id="zotero-progress">
|
id="zotero-progress">
|
||||||
<vbox style="padding:10px" flex="1">
|
<vbox style="padding:10px" flex="1">
|
||||||
<label id="label" control="progress-indicator" value="&zotero.recognizePDF.recognizing.label;"/>
|
<label id="label" control="progress-indicator" value=""/>
|
||||||
<hbox align="center">
|
<hbox align="center">
|
||||||
<progressmeter id="progress-indicator" mode="determined" flex="1"/>
|
<progressmeter id="progress-indicator" mode="determined" flex="1"/>
|
||||||
<button id="cancel-button" label="&zotero.recognizePDF.cancel.label;"/>
|
<button id="cancel-button" label="&zotero.general.cancel;"/>
|
||||||
|
<button id="minimize-button" label="&zotero.general.minimize;"/>
|
||||||
|
<button id="close-button" label="&zotero.general.close;"/>
|
||||||
</hbox>
|
</hbox>
|
||||||
<tree flex="1" id="tree" hidecolumnpicker="true">
|
<tree flex="1" id="tree" hidecolumnpicker="true">
|
||||||
<treecols>
|
<treecols>
|
479
chrome/content/zotero/xpcom/recognizePDF.js
Normal file
479
chrome/content/zotero/xpcom/recognizePDF.js
Normal file
|
@ -0,0 +1,479 @@
|
||||||
|
/*
|
||||||
|
***** BEGIN LICENSE BLOCK *****
|
||||||
|
|
||||||
|
Copyright © 2018 Center for History and New Media
|
||||||
|
George Mason University, Fairfax, Virginia, USA
|
||||||
|
http://zotero.org
|
||||||
|
|
||||||
|
This file is part of Zotero.
|
||||||
|
|
||||||
|
Zotero is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU Affero General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
Zotero is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU Affero General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Affero General Public License
|
||||||
|
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
***** END LICENSE BLOCK *****
|
||||||
|
*/
|
||||||
|
|
||||||
|
Zotero.RecognizePDF = new function () {
|
||||||
|
const OFFLINE_RECHECK_DELAY = 60 * 1000;
|
||||||
|
const MAX_PAGES = 5;
|
||||||
|
|
||||||
|
this.ROW_QUEUED = 1;
|
||||||
|
this.ROW_PROCESSING = 2;
|
||||||
|
this.ROW_FAILED = 3;
|
||||||
|
this.ROW_SUCCEEDED = 4;
|
||||||
|
|
||||||
|
let _listeners = {};
|
||||||
|
let _rows = [];
|
||||||
|
let _queue = [];
|
||||||
|
let _queueProcessing = false;
|
||||||
|
|
||||||
|
/**
 * Register the callback for an event. Only one callback is kept per event
 * name; registering again under the same name replaces the previous one.
 * @param {String} name Event name ('rowadded', 'rowupdated', 'rowdeleted',
 *     'empty' or 'nonempty')
 * @param {Function} callback Function invoked when the event fires
 */
this.addListener = function (name, callback) {
	_listeners[name] = callback;
};
|
||||||
|
|
||||||
|
/**
 * Unregister whatever callback is currently stored for an event
 * @param {String} name Event name
 */
this.removeListener = function (name) {
	delete _listeners[name];
};
|
||||||
|
|
||||||
|
/**
 * Checks whether a given item could theoretically be recognized: it must
 * have the PDF content type and be a top-level item
 * @param {Zotero.Item} item
 * @return {Boolean} Truthy if the PDF can be recognized, falsy if it cannot be
 */
this.canRecognize = function (item) {
	let contentType = item.attachmentContentType;
	if (!contentType) {
		// Same falsy value the plain && chain would yield
		return contentType;
	}
	if (item.attachmentContentType !== 'application/pdf') {
		return false;
	}
	return item.isTopLevelItem();
};
|
||||||
|
|
||||||
|
/**
 * Queue the given items for recognition and kick off queue processing.
 * Processing runs in the background; this function does not wait for it.
 * @param {Zotero.Item[]} items
 */
this.recognizeItems = function (items) {
	for (let candidate of items) {
		_addItem(candidate);
	}
	
	_processQueue();
};
|
||||||
|
|
||||||
|
/**
 * Returns the internal row list (the live array, not a copy)
 * @return {Array} Rows, each with `id`, `status`, `fileName` and `message`
 */
this.getRows = function () {
	return _rows;
};
|
||||||
|
|
||||||
|
/**
 * Returns how many rows exist, regardless of their status
 * @return {Number}
 */
this.getTotal = function () {
	return _rows.length;
};
|
||||||
|
|
||||||
|
/**
 * Returns how many rows are done, i.e. have a status past ROW_PROCESSING
 * (failed or succeeded)
 * @return {Number}
 */
this.getProcessedTotal = function () {
	let finished = 0;
	for (let row of _rows) {
		if (row.status > Zotero.RecognizePDF.ROW_PROCESSING) {
			finished++;
		}
	}
	return finished;
};
|
||||||
|
|
||||||
|
/**
 * Stop processing items: drop everything still queued, forget all rows and
 * notify the 'empty' listener, if any. No 'rowdeleted' events are fired.
 */
this.cancel = function () {
	_queue = [];
	_rows = [];
	
	if (!_listeners.empty) {
		return;
	}
	_listeners.empty();
};
|
||||||
|
|
||||||
|
/**
 * Add an item for processing.
 *
 * An item that already has a queued or in-progress row is ignored. An item
 * whose previous row is finished (failed or succeeded) gets that row deleted
 * and is queued again. New rows and queue entries go to the front.
 *
 * Fires 'rowadded' with the new row, and 'nonempty' when the new row is the
 * only one.
 * @param {Zotero.Item} item
 * @return {null|undefined} null if the item was ignored, otherwise nothing
 */
function _addItem(item) {
	let existing = _rows.find(row => row.id === item.id);
	if (existing) {
		let isFinished = existing.status > Zotero.RecognizePDF.ROW_PROCESSING;
		if (!isFinished) {
			return null;
		}
		_deleteRow(existing.id);
	}
	
	let newRow = {
		id: item.id,
		status: Zotero.RecognizePDF.ROW_QUEUED,
		fileName: item.getField('title'),
		message: ''
	};
	
	_rows.unshift(newRow);
	_queue.unshift(item.id);
	
	if (_listeners.rowadded) {
		_listeners.rowadded(newRow);
	}
	
	if (_listeners.nonempty && _rows.length === 1) {
		_listeners.nonempty();
	}
}
|
||||||
|
|
||||||
|
/**
 * Update the status and message of the row belonging to an item and fire
 * 'rowupdated'. Does nothing if the item has no row.
 * @param {Number} itemID
 * @param {Number} status One of the ROW_* constants
 * @param {String} [message] Stored as given; the listener receives '' in
 *     place of a falsy message
 */
function _updateRow(itemID, status, message) {
	let target = _rows.find(row => row.id === itemID);
	if (!target) {
		return;
	}
	
	target.status = status;
	target.message = message;
	
	if (!_listeners.rowupdated) {
		return;
	}
	
	// The listener gets a snapshot, not the row object itself
	_listeners.rowupdated({
		id: target.id,
		status: status,
		message: message || ''
	});
}
|
||||||
|
|
||||||
|
/**
 * Delete the row belonging to an item and fire 'rowdeleted' with its ID.
 * Does nothing if the item has no row.
 * @param {Number} itemID
 */
function _deleteRow(itemID) {
	let index = _rows.findIndex(row => row.id === itemID);
	if (index === -1) {
		return;
	}
	
	let removed = _rows.splice(index, 1)[0];
	
	if (_listeners.rowdeleted) {
		_listeners.rowdeleted({
			id: removed.id
		});
	}
}
|
||||||
|
|
||||||
|
/**
 * Triggers queue processing and returns when all items in the queue are processed
 *
 * Only one run is active at a time: a call made while another run is in
 * progress returns right away (after the schema update promise settles),
 * and the active run picks up whatever was queued in the meantime.
 *
 * While the browser is offline the loop waits OFFLINE_RECHECK_DELAY and
 * checks again, without taking anything off the queue.
 * @return {Promise}
 */
async function _processQueue() {
	await Zotero.Schema.schemaUpdatePromise;
	
	if (_queueProcessing) return;
	_queueProcessing = true;
	
	try {
		while (true) {
			if (Zotero.HTTP.browserIsOffline()) {
				await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
				continue;
			}
			
			let itemID = _queue.shift();
			if (!itemID) break;
			
			_updateRow(itemID, Zotero.RecognizePDF.ROW_PROCESSING, Zotero.getString('recognizePDF.processing'));
			
			try {
				let newItem = await _processItem(itemID);
				
				if (newItem) {
					_updateRow(itemID, Zotero.RecognizePDF.ROW_SUCCEEDED, newItem.getField('title'));
				}
				else {
					_updateRow(itemID, Zotero.RecognizePDF.ROW_FAILED, Zotero.getString('recognizePDF.noMatches'));
				}
			}
			catch (e) {
				Zotero.logError(e);
				
				// Alert messages are shown as-is; anything else gets the generic error text
				_updateRow(
					itemID,
					Zotero.RecognizePDF.ROW_FAILED,
					e instanceof Zotero.Exception.Alert
						? e.message
						: Zotero.getString('recognizePDF.error')
				);
			}
		}
	}
	finally {
		// Must be reset even if something outside the per-item try throws,
		// otherwise every later call would return early and the queue
		// would never be processed again
		_queueProcessing = false;
	}
}
|
||||||
|
|
||||||
|
/**
 * Processes the item and places it as a child of the newly created item.
 * The new item is also added to every collection the original item is in.
 * @param {Number} itemID ID of a top-level attachment item
 * @return {Promise<Zotero.Item|null>} The new parent item, or null if
 *     nothing was recognized
 * @throws {Zotero.Exception.Alert} 'recognizePDF.fileNotFound' if the item
 *     does not exist or already has a parent
 */
async function _processItem(itemID) {
	let attachment = await Zotero.Items.getAsync(itemID);
	
	let unusable = !attachment || attachment.parentItemID;
	if (unusable) {
		throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
	}
	
	let parentItem = await _recognize(attachment);
	if (!parentItem) {
		return null;
	}
	
	let collectionIDs = attachment.getCollections();
	await Zotero.DB.executeTransaction(async function () {
		// put new item in same collections as the old one
		for (let collectionID of collectionIDs) {
			await Zotero.Collections.get(collectionID).addItem(parentItem.id);
		}
		
		// put old item as a child of the new item
		attachment.parentID = parentItem.id;
		await attachment.save();
	});
	
	return parentItem;
}
|
||||||
|
|
||||||
|
/**
 * Get JSON from a PDF by running the PDF converter with `-json` and
 * reading its output from a cache file in the temp directory
 *
 * The cache file is removed before the converter runs and again when this
 * function finishes, whether it succeeds or fails.
 * @param {String} filePath PDF file path
 * @param {Number} pages Number of pages to extract (passed to the
 *     converter's `-l` option)
 * @return {Promise<Object>} The parsed JSON produced by the converter
 * @throws {Zotero.Exception.Alert} 'recognizePDF.couldNotRead' if the
 *     converter fails or its output cannot be read or parsed
 */
async function extractJSON(filePath, pages) {
	let cacheFile = Zotero.getTempDirectory();
	cacheFile.append("recognizePDFcache.txt");
	if (cacheFile.exists()) {
		cacheFile.remove(false);
	}
	
	let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
	args.push('-json', '-l', pages, filePath, cacheFile.path);
	
	Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
	
	try {
		await Zotero.Utilities.Internal.exec(exec, args);
		let content = await Zotero.File.getContentsAsync(cacheFile.path);
		Zotero.debug("RecognizePDF: Extracted JSON:");
		Zotero.debug(content);
		return JSON.parse(content);
	}
	catch (e) {
		Zotero.logError(e);
		throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
	}
	finally {
		// Single cleanup point. Guarded by exists() so a file that was never
		// written does not produce a second, misleading error in the log, and
		// a failed removal does not discard a result that was read fine.
		try {
			if (cacheFile.exists()) {
				cacheFile.remove(false);
			}
		}
		catch (e) {
			Zotero.logError(e);
		}
	}
}
|
||||||
|
|
||||||
|
/**
 * Attach a 'select' handler to a Zotero.Translate instance and run the
 * translation. When the translator offers several candidates, the first
 * one is chosen automatically.
 * @param {Zotero.Translate} translate
 * @param {Number} libraryID Library to save the translated item into
 * @return {Promise<Zotero.Item>} The first item produced by the translation
 * @throws {Error} 'No items found' if the translation yields no items
 */
async function _promiseTranslate(translate, libraryID) {
	translate.setHandler('select', function (translateInstance, items, callback) {
		// Answer with an object holding only the first enumerated candidate
		for (let key in items) {
			callback({ [key]: items[key] });
			break;
		}
	});
	
	let translated = await translate.translate({
		libraryID: libraryID,
		saveAttachments: false
	});
	
	if (!translated.length) {
		throw new Error('No items found');
	}
	return translated[0];
}
|
||||||
|
|
||||||
|
/**
 * Send the extracted PDF JSON to the `recognize` endpoint of the Zotero API
 * (the "api.url" pref, falling back to ZOTERO_CONFIG.API_URL) and return
 * the parsed response. The request is made without an API key and only a
 * 200 status counts as success.
 * @param {Object} json Data extracted from the PDF
 * @return {Promise<*>} Parsed JSON body of the response
 */
async function _query(json) {
	let baseURL = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.API_URL;
	let endpoint = (baseURL.endsWith('/') ? baseURL : baseURL + '/') + 'recognize';
	
	let client = Zotero.Sync.Runner.getAPIClient();
	
	let requestOptions = {
		successCodes: [200],
		headers: {
			'Content-Type': 'application/json'
		},
		body: JSON.stringify(json),
		noAPIKey: true
	};
	let response = await client.makeRequest('POST', endpoint, requestOptions);
	
	return JSON.parse(response.responseText);
}
|
||||||
|
|
||||||
|
/**
 * Retrieves metadata for a PDF and saves it as an item
 *
 * The first MAX_PAGES pages are extracted and sent to the recognizer
 * service. Depending on what the service returns, the item is created, in
 * order of preference:
 *   1. by a DOI lookup through the search translator set below,
 *   2. by an ISBN lookup through the search translators,
 *   3. directly from the title/author/etc. fields of the response.
 * A failed lookup is logged and the next strategy is tried.
 *
 * @param {Zotero.Item} item Attachment item whose file should be recognized
 * @return {Promise<Zotero.Item|null>} The saved new item, or null if the
 *     service returned nothing usable
 * @throws {Zotero.Exception.Alert} 'recognizePDF.fileNotFound' if the file
 *     is missing, 'recognizePDF.noOCR' if no extracted page contains text
 */
async function _recognize(item) {
	let filePath = await item.getFilePath();
	
	if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
	
	let json = await extractJSON(filePath, MAX_PAGES);
	
	// NOTE(review): page[2] is presumably the list of text blocks on the
	// page - confirm against the converter's JSON output format
	let containingTextPages = json.pages.filter(page => page[2].length).length;
	if (!containingTextPages) {
		throw new Zotero.Exception.Alert('recognizePDF.noOCR');
	}
	
	let libraryID = item.libraryID;
	
	let res = await _query(json);
	if (!res) return null;
	
	if (res.doi) {
		Zotero.debug('RecognizePDF: Getting metadata by DOI');
		let translateDOI = new Zotero.Translate.Search();
		translateDOI.setTranslator('11645bd1-0420-45c1-badb-53fb41eeb753');
		translateDOI.setSearch({'itemType': 'journalArticle', 'DOI': res.doi});
		try {
			let newItem = await _promiseTranslate(translateDOI, libraryID);
			// Item fields are read through getField(), as elsewhere in this file.
			// Only fill in the service's abstract when the lookup gave none.
			if (!newItem.getField('abstractNote') && res.abstract) {
				newItem.setField('abstractNote', res.abstract);
			}
			// Await the save so failures land in the catch below rather than
			// becoming an unhandled rejection
			await newItem.saveTx();
			return newItem;
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
		}
	}
	
	if (res.isbn) {
		Zotero.debug('RecognizePDF: Getting metadata by ISBN');
		let translate = new Zotero.Translate.Search();
		translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
		try {
			// libraryID: false -> get the translated data back without saving it
			let translatedItems = await translate.translate({
				libraryID: false,
				saveAttachments: false
			});
			Zotero.debug('RecognizePDF: Translated items:');
			Zotero.debug(translatedItems);
			if (translatedItems.length) {
				let newItem = new Zotero.Item();
				newItem.fromJSON(translatedItems[0]);
				newItem.libraryID = libraryID;
				if (!newItem.getField('abstractNote') && res.abstract) {
					newItem.setField('abstractNote', res.abstract);
				}
				// The item is new, so it has no id until the save has finished;
				// the caller needs that id to re-parent the attachment
				await newItem.saveTx();
				return newItem;
			}
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
		}
	}
	
	if (res.title) {
		let type = res.type === 'book-chapter' ? 'bookSection' : 'journalArticle';
		
		let newItem = new Zotero.Item(type);
		newItem.setField('title', res.title);
		
		// The response may come without an author list
		let creators = (res.authors || []).map(author => ({
			firstName: author.firstName,
			lastName: author.lastName,
			creatorType: 'author'
		}));
		newItem.setCreators(creators);
		
		if (res.abstract) newItem.setField('abstractNote', res.abstract);
		if (res.year) newItem.setField('date', res.year);
		if (res.pages) newItem.setField('pages', res.pages);
		if (res.volume) newItem.setField('volume', res.volume);
		if (res.url) newItem.setField('url', res.url);
		
		if (type === 'journalArticle') {
			if (res.issue) newItem.setField('issue', res.issue);
			// The old line tested res.ISSN but wrote res.issn, so the value set
			// never matched the value tested. Accept either spelling.
			// NOTE(review): field name 'ISSN' follows the 'DOI'/'ISBN' casing
			// used above - confirm against the item field list
			let issn = res.ISSN || res.issn;
			if (issn) newItem.setField('ISSN', issn);
			if (res.container) newItem.setField('publicationTitle', res.container);
		}
		else if (type === 'bookSection') {
			if (res.container) newItem.setField('bookTitle', res.container);
			if (res.publisher) newItem.setField('publisher', res.publisher);
		}
		
		newItem.setField('libraryCatalog', 'Zotero');
		
		await newItem.saveTx();
		return newItem;
	}
	
	return null;
}
|
||||||
|
};
|
||||||
|
|
|
@ -85,6 +85,14 @@ var ZoteroPane = new function()
|
||||||
// Set key down handler
|
// Set key down handler
|
||||||
document.getElementById('appcontent').addEventListener('keydown', ZoteroPane_Local.handleKeyDown, true);
|
document.getElementById('appcontent').addEventListener('keydown', ZoteroPane_Local.handleKeyDown, true);
|
||||||
|
|
||||||
|
Zotero.RecognizePDF.addListener('empty', function (row) {
|
||||||
|
document.getElementById('zotero-tb-recognize').hidden = true;
|
||||||
|
});
|
||||||
|
|
||||||
|
Zotero.RecognizePDF.addListener('nonempty', function (row) {
|
||||||
|
document.getElementById('zotero-tb-recognize').hidden = false;
|
||||||
|
});
|
||||||
|
|
||||||
_loaded = true;
|
_loaded = true;
|
||||||
|
|
||||||
var zp = document.getElementById('zotero-pane');
|
var zp = document.getElementById('zotero-pane');
|
||||||
|
@ -2783,7 +2791,7 @@ var ZoteroPane = new function()
|
||||||
canIndex = false;
|
canIndex = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (canRecognize && !Zotero_RecognizePDF.canRecognize(item)) {
|
if (canRecognize && !Zotero.RecognizePDF.canRecognize(item)) {
|
||||||
canRecognize = false;
|
canRecognize = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2876,7 +2884,7 @@ var ZoteroPane = new function()
|
||||||
if (item.isAttachment()) {
|
if (item.isAttachment()) {
|
||||||
var showSep4 = false;
|
var showSep4 = false;
|
||||||
|
|
||||||
if (Zotero_RecognizePDF.canRecognize(item)) {
|
if (Zotero.RecognizePDF.canRecognize(item)) {
|
||||||
show.push(m.recognizePDF);
|
show.push(m.recognizePDF);
|
||||||
showSep4 = true;
|
showSep4 = true;
|
||||||
}
|
}
|
||||||
|
@ -4908,6 +4916,11 @@ var ZoteroPane = new function()
|
||||||
if(_beforeReloadFunctions.indexOf(func) === -1) _beforeReloadFunctions.push(func);
|
if(_beforeReloadFunctions.indexOf(func) === -1) _beforeReloadFunctions.push(func);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.recognizeSelected = function() {
|
||||||
|
Zotero.RecognizePDF.recognizeItems(ZoteroPane.getSelectedItems());
|
||||||
|
Zotero_RecognizePDF_Dialog.open();
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implements nsIObserver for Zotero reload
|
* Implements nsIObserver for Zotero reload
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
<script src="fileInterface.js"/>
|
<script src="fileInterface.js"/>
|
||||||
<script src="reportInterface.js"/>
|
<script src="reportInterface.js"/>
|
||||||
<script src="timelineInterface.js"/>
|
<script src="timelineInterface.js"/>
|
||||||
<script src="recognizePDF.js"/>
|
<script src="recognizePDFDialog.js"/>
|
||||||
<script src="browser.js" type="application/javascript;version=1.8"/>
|
<script src="browser.js" type="application/javascript;version=1.8"/>
|
||||||
<script src="lookup.js"/>
|
<script src="lookup.js"/>
|
||||||
<script src="locateMenu.js" type="application/javascript;version=1.8"/>
|
<script src="locateMenu.js" type="application/javascript;version=1.8"/>
|
||||||
|
@ -242,6 +242,10 @@
|
||||||
</tooltip>
|
</tooltip>
|
||||||
</hbox>
|
</hbox>
|
||||||
</hbox>
|
</hbox>
|
||||||
|
|
||||||
|
<toolbarbutton id="zotero-tb-recognize" hidden="true"
|
||||||
|
oncommand="Zotero_RecognizePDF_Dialog.open()"/>
|
||||||
|
|
||||||
<toolbarbutton id="zotero-tb-sync-error" hidden="true"/>
|
<toolbarbutton id="zotero-tb-sync-error" hidden="true"/>
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
|
@ -322,7 +326,7 @@
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-create-bibliography" oncommand="Zotero_File_Interface.bibliographyFromItems();"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-create-bibliography" oncommand="Zotero_File_Interface.bibliographyFromItems();"/>
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-create-report" oncommand="Zotero_Report_Interface.loadItemReport(event)"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-create-report" oncommand="Zotero_Report_Interface.loadItemReport(event)"/>
|
||||||
<menuseparator/>
|
<menuseparator/>
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-retrieve-metadata" oncommand="Zotero_RecognizePDF.recognizeSelected();"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-retrieve-metadata" oncommand="ZoteroPane.recognizeSelected();"/>
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-create-parent" oncommand="ZoteroPane_Local.createParentItemsFromSelected();"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-create-parent" oncommand="ZoteroPane_Local.createParentItemsFromSelected();"/>
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-rename-from-parent" oncommand="ZoteroPane_Local.renameSelectedAttachmentsFromParents()"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-rename-from-parent" oncommand="ZoteroPane_Local.renameSelectedAttachmentsFromParents()"/>
|
||||||
<menuitem class="menuitem-iconic zotero-menuitem-reindex" oncommand="ZoteroPane_Local.reindexItem();"/>
|
<menuitem class="menuitem-iconic zotero-menuitem-reindex" oncommand="ZoteroPane_Local.reindexItem();"/>
|
||||||
|
|
|
@ -14,6 +14,9 @@
|
||||||
<!ENTITY zotero.general.tools "Tools">
|
<!ENTITY zotero.general.tools "Tools">
|
||||||
<!ENTITY zotero.general.more "More">
|
<!ENTITY zotero.general.more "More">
|
||||||
<!ENTITY zotero.general.loading "Loading…">
|
<!ENTITY zotero.general.loading "Loading…">
|
||||||
|
<!ENTITY zotero.general.close "Close">
|
||||||
|
<!ENTITY zotero.general.minimize "Minimize">
|
||||||
|
|
||||||
|
|
||||||
<!ENTITY zotero.errorReport.title "Zotero Error Report">
|
<!ENTITY zotero.errorReport.title "Zotero Error Report">
|
||||||
<!ENTITY zotero.errorReport.submissionInProgress "Please wait while the error report is submitted.">
|
<!ENTITY zotero.errorReport.submissionInProgress "Please wait while the error report is submitted.">
|
||||||
|
@ -281,9 +284,6 @@
|
||||||
<!ENTITY zotero.feedSettings.cleanupReadAfter.label1 "Remove read feed items after">
|
<!ENTITY zotero.feedSettings.cleanupReadAfter.label1 "Remove read feed items after">
|
||||||
<!ENTITY zotero.feedSettings.cleanupReadAfter.label2 "day(s)">
|
<!ENTITY zotero.feedSettings.cleanupReadAfter.label2 "day(s)">
|
||||||
|
|
||||||
|
|
||||||
<!ENTITY zotero.recognizePDF.recognizing.label "Retrieving Metadata…">
|
|
||||||
<!ENTITY zotero.recognizePDF.cancel.label "Cancel">
|
|
||||||
<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name">
|
<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name">
|
||||||
<!ENTITY zotero.recognizePDF.itemName.label "Item Name">
|
<!ENTITY zotero.recognizePDF.itemName.label "Item Name">
|
||||||
|
|
||||||
|
|
|
@ -63,6 +63,8 @@ general.tryLater = Try Later
|
||||||
general.showDirectory = Show Directory
|
general.showDirectory = Show Directory
|
||||||
general.continue = Continue
|
general.continue = Continue
|
||||||
general.copyToClipboard = Copy to Clipboard
|
general.copyToClipboard = Copy to Clipboard
|
||||||
|
general.cancel = Cancel
|
||||||
|
general.clear = Clear
|
||||||
|
|
||||||
general.operationInProgress = A Zotero operation is currently in progress.
|
general.operationInProgress = A Zotero operation is currently in progress.
|
||||||
general.operationInProgress.waitUntilFinished = Please wait until it has finished.
|
general.operationInProgress.waitUntilFinished = Please wait until it has finished.
|
||||||
|
@ -1051,14 +1053,10 @@ recognizePDF.noOCR = PDF does not contain OCRed text.
|
||||||
recognizePDF.couldNotRead = Could not read text from PDF.
|
recognizePDF.couldNotRead = Could not read text from PDF.
|
||||||
recognizePDF.noMatches = No matching references found
|
recognizePDF.noMatches = No matching references found
|
||||||
recognizePDF.fileNotFound = File not found
|
recognizePDF.fileNotFound = File not found
|
||||||
recognizePDF.limit = Google Scholar query limit reached. Try again later.
|
recognizePDF.error = An unexpected error occurred.
|
||||||
recognizePDF.error = An unexpected error occurred.
|
recognizePDF.recognizing.label = Retrieving Metadata…
|
||||||
recognizePDF.stopped = Cancelled
|
|
||||||
recognizePDF.complete.label = Metadata Retrieval Complete
|
recognizePDF.complete.label = Metadata Retrieval Complete
|
||||||
recognizePDF.cancelled.label = Metadata Retrieval Cancelled
|
recognizePDF.processing = Processing
|
||||||
recognizePDF.close.label = Close
|
|
||||||
recognizePDF.captcha.title = Please enter CAPTCHA
|
|
||||||
recognizePDF.captcha.description = Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below.
|
|
||||||
|
|
||||||
rtfScan.openTitle = Select a file to scan
|
rtfScan.openTitle = Select a file to scan
|
||||||
rtfScan.scanning.label = Scanning RTF Document…
|
rtfScan.scanning.label = Scanning RTF Document…
|
||||||
|
|
|
@ -622,6 +622,10 @@
|
||||||
text-align: right;
|
text-align: right;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#zotero-tb-recognize {
|
||||||
|
list-style-image: url(chrome://zotero/skin/document-search-result.png);
|
||||||
|
}
|
||||||
|
|
||||||
/* Sync error icon */
|
/* Sync error icon */
|
||||||
#zotero-tb-sync-error {
|
#zotero-tb-sync-error {
|
||||||
list-style-image: url(chrome://zotero/skin/error.png);
|
list-style-image: url(chrome://zotero/skin/error.png);
|
||||||
|
|
|
@ -101,6 +101,7 @@ const xpcomFilesLocal = [
|
||||||
'mime',
|
'mime',
|
||||||
'notifier',
|
'notifier',
|
||||||
'quickCopy',
|
'quickCopy',
|
||||||
|
'recognizePDF',
|
||||||
'report',
|
'report',
|
||||||
'router',
|
'router',
|
||||||
'schema',
|
'schema',
|
||||||
|
|
|
@ -16,7 +16,7 @@ describe("PDF Recognition", function() {
|
||||||
});
|
});
|
||||||
|
|
||||||
afterEach(function() {
|
afterEach(function() {
|
||||||
for(let win of getWindows("chrome://zotero/content/pdfProgress.xul")) {
|
for(let win of getWindows("chrome://zotero/content/recognizePDFDialog.xul")) {
|
||||||
win.close();
|
win.close();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -27,34 +27,7 @@ describe("PDF Recognition", function() {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should recognize a PDF with a DOI within a collection", function* () {
|
it("should recognize a PDF", function* () {
|
||||||
this.timeout(30000);
|
|
||||||
// Import the PDF
|
|
||||||
var testdir = getTestDataDirectory();
|
|
||||||
testdir.append("recognizePDF_test_DOI.pdf");
|
|
||||||
|
|
||||||
var col = yield createDataObject('collection');
|
|
||||||
yield waitForItemsLoad(win);
|
|
||||||
|
|
||||||
var attachment = yield Zotero.Attachments.importFromFile({
|
|
||||||
file: testdir,
|
|
||||||
collections: [col.id]
|
|
||||||
});
|
|
||||||
|
|
||||||
// Recognize the PDF
|
|
||||||
win.Zotero_RecognizePDF.recognizeSelected();
|
|
||||||
|
|
||||||
var ids = yield waitForItemEvent("add");
|
|
||||||
yield waitForNotifierEvent('add', 'collection-item')
|
|
||||||
|
|
||||||
var item = Zotero.Items.get(ids[0]);
|
|
||||||
assert.equal(item.getField("title"), "Shaping the Research Agenda");
|
|
||||||
assert.equal(item.getField("libraryCatalog"), "CrossRef");
|
|
||||||
assert.equal(attachment.parentID, item.id);
|
|
||||||
assert.isTrue(col.hasItem(item.id));
|
|
||||||
});
|
|
||||||
|
|
||||||
it("should recognize a PDF without a DOI", function* () {
|
|
||||||
this.timeout(30000);
|
this.timeout(30000);
|
||||||
// Import the PDF
|
// Import the PDF
|
||||||
var testdir = getTestDataDirectory();
|
var testdir = getTestDataDirectory();
|
||||||
|
@ -64,19 +37,18 @@ describe("PDF Recognition", function() {
|
||||||
});
|
});
|
||||||
|
|
||||||
// Recognize the PDF
|
// Recognize the PDF
|
||||||
win.Zotero_RecognizePDF.recognizeSelected();
|
win.ZoteroPane.recognizeSelected();
|
||||||
|
|
||||||
var addedIDs = yield waitForItemEvent("add");
|
var addedIDs = yield waitForItemEvent("add");
|
||||||
var modifiedIDs = yield waitForItemEvent("modify");
|
var modifiedIDs = yield waitForItemEvent("modify");
|
||||||
assert.lengthOf(addedIDs, 1);
|
assert.lengthOf(addedIDs, 1);
|
||||||
var item = Zotero.Items.get(addedIDs[0]);
|
var item = Zotero.Items.get(addedIDs[0]);
|
||||||
assert.equal(item.getField("title"), "Scaling study of an improved fermion action on quenched lattices");
|
assert.equal(item.getField("title"), "Scaling study of an improved fermion action on quenched lattices");
|
||||||
assert.equal(item.getField("libraryCatalog"), "Google Scholar");
|
|
||||||
assert.lengthOf(modifiedIDs, 2);
|
assert.lengthOf(modifiedIDs, 2);
|
||||||
|
|
||||||
yield Zotero.Promise.delay(0);
|
yield Zotero.Promise.delay(0);
|
||||||
|
|
||||||
var progressWindow = getWindows("chrome://zotero/content/pdfProgress.xul")[0];
|
var progressWindow = getWindows("chrome://zotero/content/recognizePDFDialog.xul")[0];
|
||||||
assert.equal(
|
assert.equal(
|
||||||
progressWindow.document.getElementById("label").value,
|
progressWindow.document.getElementById("label").value,
|
||||||
Zotero.getString("recognizePDF.complete.label")
|
Zotero.getString("recognizePDF.complete.label")
|
||||||
|
|
Loading…
Reference in a new issue