Add feature to retrieve metadata for PDFs, currently accessible as a contextual menu item.

The feature grabs relevant fragments from the document and then searches them on Google Scholar. This will only work with OCRed PDFs, so it doesn't work with JSTOR, but it should work fairly well with everything else.
This commit is contained in:
Simon Kornblith 2008-08-22 05:35:44 +00:00
parent f20ed8507e
commit 4cf79691ed
5 changed files with 290 additions and 20 deletions

View file

@ -1482,7 +1482,8 @@ var ZoteroPane = new function()
createBib: 11,
loadReport: 12,
sep4: 13,
reindexItem: 14
reindexItem: 14,
recognizePDF: 15
};
var menu = document.getElementById('zotero-itemmenu');
@ -1507,20 +1508,37 @@ var ZoteroPane = new function()
hide.push(m.showInLibrary, m.sep1, m.addNote, m.attachSnapshot,
m.attachLink, m.sep2, m.duplicateItem);
// If all items can be reindexed, show option
// If all items can be reindexed, or all items can be recognized, show option
var items = this.getSelectedItems();
var canIndex = true;
var canRecognize = true;
for (var i=0; i<items.length; i++) {
if (!Zotero.Fulltext.canReindex()) {
if (!Zotero.Fulltext.canReindex(items[i].id)) {
canIndex = false;
}
if(!Zotero_RecognizePDF.canRecognize(items[i])) {
canRecognize = false;
}
if(!canIndex && !canRecognize) {
break;
}
}
if (canIndex) {
show.push(m.sep4, m.reindexItem);
show.push(m.reindexItem);
} else {
hide.push(m.reindexItem);
}
else {
hide.push(m.sep4, m.reindexItem);
if (canRecognize) {
show.push(m.recognizePDF);
} else {
hide.push(m.recognizePDF);
}
if(canIndex || canRecognize) {
show.push(m.sep4);
} else {
hide.push(m.sep4);
}
}
// Single item selected
@ -1551,15 +1569,28 @@ var ZoteroPane = new function()
hide.push(m.duplicateItem);
// If not linked URL, show reindex line
if (Zotero.Fulltext.canReindex(item.id)) {
show.push(m.sep4, m.reindexItem);
show.push(m.reindexItem);
showSep4 = true;
} else {
hide.push(m.reindexItem);
}
else {
hide.push(m.sep4, m.reindexItem);
if (Zotero_RecognizePDF.canRecognize(item)) {
show.push(m.recognizePDF);
showSep4 = true;
} else {
hide.push(m.recognizePDF);
}
if(showSep4) {
show.push(m.sep4);
} else {
hide.push(m.sep4);
}
}
else {
show.push(m.duplicateItem);
hide.push(m.sep4, m.reindexItem);
hide.push(m.sep4, m.reindexItem, m.recognizePDF);
}
}
}
@ -1576,7 +1607,8 @@ var ZoteroPane = new function()
disable.push(m.showInLibrary, m.duplicateItem, m.deleteItem,
m.deleteFromLibrary, m.exportItems, m.createBib, m.loadReport);
hide.push(m.addNote, m.attachSnapshot, m.attachLink, m.sep2, m.sep4, m.reindexItem);
hide.push(m.addNote, m.attachSnapshot, m.attachLink, m.sep2, m.sep4, m.reindexItem,
m.recognizePDF);
}
// Remove from collection
@ -1596,6 +1628,7 @@ var ZoteroPane = new function()
menu.childNodes[m.createBib].setAttribute('label', Zotero.getString('pane.items.menu.createBib' + multiple));
menu.childNodes[m.loadReport].setAttribute('label', Zotero.getString('pane.items.menu.generateReport' + multiple));
menu.childNodes[m.reindexItem].setAttribute('label', Zotero.getString('pane.items.menu.reindexItem' + multiple));
menu.childNodes[m.recognizePDF].setAttribute('label', Zotero.getString('pane.items.menu.recognizePDF' + multiple));
for (var i in disable)
{

View file

@ -37,6 +37,7 @@
<script src="fileInterface.js"/>
<script src="reportInterface.js"/>
<script src="timelineInterface.js"/>
<script src="recognizePDF.js"/>
<script src="browser.js"/>
<script src="chrome://global/content/nsDragAndDrop.js"/>
<script src="chrome://global/content/nsTransferable.js"/>
@ -108,6 +109,7 @@
<menuitem oncommand="Zotero_Report_Interface.loadItemReport()"/>
<menuseparator/>
<menuitem oncommand="ZoteroPane.reindexItem();"/>
<menuitem oncommand="Zotero_RecognizePDF.recognizeSelected();"/>
</popup>
</popupset>

View file

@ -0,0 +1,234 @@
/*
***** BEGIN LICENSE BLOCK *****
Copyright (c) 2006 Center for History and New Media
George Mason University, Fairfax, Virginia, USA
http://chnm.gmu.edu
Licensed under the Educational Community License, Version 1.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.opensource.org/licenses/ecl1.php
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
***** END LICENSE BLOCK *****
*/
/**
* @fileOverview Tools for automatically retrieving a citation for the given PDF
*/
/**
 * Value passed to the PDF-to-text converter's "-l" option when sampling text from a PDF
 * (presumably the last page to convert -- confirm against the pdftotext documentation)
 */
const MAX_PAGES = 2;
/**
 * Front end for recognizing PDFs
 * @namespace
 */
var Zotero_RecognizePDF = new function() {
	/**
	 * Checks whether a given PDF could theoretically be recognized: the item must be a PDF
	 * attachment that does not already have a source (parent) item
	 *
	 * @param {Zotero.Item} item The item to test
	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
	 */
	this.canRecognize = function(/**Zotero.Item*/ item) {
		// the comparison already fails for a missing MIME type, and the result is always a
		// real boolean, as documented
		return item.attachmentMIMEType == "application/pdf" && !item.getSource();
	};
	
	/**
	 * Retrieves metadata for the PDF(s) selected in the Zotero pane, placing the PDFs as children
	 * of the new items. Does nothing when there is no selection.
	 */
	this.recognizeSelected = function() {
		var items = ZoteroPane.getSelectedItems();
		if(!items || !items.length) {
			return;
		}
		this.recognizeItems(items);
	};
	
	/**
	 * Retrieves metadata for the PDF items passed, placing the PDFs as children of the new items.
	 *
	 * Items are handled one at a time: the next item is only started from the completion
	 * callback of the previous one. Items without a file are skipped. The array passed in is
	 * not modified.
	 *
	 * NOTE(review): the callback is registered as the recognizer's "itemDone" handler, so when
	 * recognition of one item fails (Recognizer#_error) the remaining items are not processed.
	 *
	 * @param {Zotero.Item[]} items Attachment items to recognize
	 */
	this.recognizeItems = function(/**Zotero.Item[]*/ items) {
		// an empty queue means there is nothing (left) to do
		if(!items || !items.length) {
			return;
		}
		
		var itemsCopy = items.slice();
		var item = itemsCopy.shift();
		var file = item.getFile();
		
		if(!file) {
			// no file to convert for this attachment; move on to the next one
			Zotero_RecognizePDF.recognizeItems(itemsCopy);
			return;
		}
		
		var recognizer = new Zotero_RecognizePDF.Recognizer();
		recognizer.recognize(file, item.getField("title"),
			function(translate, newItem) {
				// put new item in same collections as the old one
				var itemCollections = item.getCollections();
				for(var j=0; j<itemCollections.length; j++) {
					var collection = Zotero.Collections.get(itemCollections[j]);
					collection.addItem(newItem.id);
				}
				
				// put old item as a child of the new item
				item.setSource(newItem.id);
				item.save();
				
				// continue recognizing
				Zotero_RecognizePDF.recognizeItems(itemsCopy);
			});
	};
};
/**
 * @class Backend that performs the recognition of a single PDF. The constructor takes no
 * arguments and sets no state; everything is set up by the prototype methods.
 */
Zotero_RecognizePDF.Recognizer = function() {};
/**
* Retrieves metadata for a PDF and saves it as an item
*
* @param {nsIFile} file The PDF file to retrieve metadata for
* @param {String} pdfTitle The title of the PDF
* @param {Function} callback The function to be executed when recognition is complete
*/
Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, callback) {
this._pdfTitle = pdfTitle;
this._callback = callback;
const whitespaceRe = /^\s*$/;
var cacheFile = Zotero.getZoteroDirectory();
cacheFile.append("recognizePDFcache.txt");
Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
+ '-l ' + MAX_PAGES + ' "' + file.path + '" "'
+ cacheFile.path + '"');
var proc = Components.classes["@mozilla.org/process/util;1"].
createInstance(Components.interfaces.nsIProcess);
var exec = Zotero.getZoteroDirectory();
exec.append(Zotero.Fulltext.pdfConverterFileName);
proc.init(exec);
var args = ['-enc', 'UTF-8', '-nopgbrk', '-raw', '-l', MAX_PAGES];
args.push(file.path, cacheFile.path);
proc.run(true, args, args.length);
var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
.createInstance(Components.interfaces.nsIFileInputStream);
inputStream.init(cacheFile, 0x01, 0664, 0);
var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
intlStream.init(inputStream, "UTF-8", 65535,
Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
// get the lines in this sample
var lines = [];
var lineLengths = [];
var str = {};
while(intlStream.readLine(str)) {
if(!whitespaceRe.test(str.value)) {
lines.push(str.value);
lineLengths.push(str.value.length);
}
}
// get (not quite) median length
var lineLengthsLength = lineLengths.length;
if(lineLengthsLength < 20) {
this._error();
return;
}
var sortedLengths = lineLengths.sort();
var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
// pick lines within 4 chars of the median
this._goodLines = [];
var uBound = medianLength + 4;
var lBound = medianLength - 4;
for (var i=0; i<lineLengthsLength; i++) {
if(lineLengths[i] >= lBound && lineLengths[i] <= uBound) this._goodLines.push(lines[i]);
}
this._startLine = this._iteration = 0;
this._queryGoogle();
}
/**
 * Queries Google Scholar for metadata for this PDF
 *
 * Builds a query from the next unused "good" lines (each line quoted, with its first and last
 * word dropped), loads the result page in a hidden browser and runs a web translator on it.
 * When translation does not succeed, the method is invoked again with the following lines.
 * Gives up with the error prompt after four attempts, or when the lines are used up.
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
		this._error();
		return;
	}
	// count this attempt; without the increment the "> 3" cap above can never trigger
	this._iteration++;
	
	// take the relevant parts of some lines (exclude hyphenated word)
	var queryStringWords = 0;
	var queryString = "";
	while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
		var words = this._goodLines[this._startLine].split(/\s+/);
		// the first and last word of a line may be a fragment of a hyphenated word
		words.shift();
		words.pop();
		if(words.length) {
			queryStringWords += words.length;
			queryString += '"'+words.join(" ")+'" ';
		}
		this._startLine++;
	}
	
	if(!queryString) {
		// the remaining lines were too short to contribute any words; a blank query
		// cannot identify the PDF
		this._error();
		return;
	}
	
	Zotero.debug("RecognizePDF: Query string "+queryString);
	
	// pass query string to Google Scholar and translate
	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString);
	this.hiddenBrowser = Zotero.Browser.createHiddenBrowser();
	
	var me = this;
	// second argument: save items; third argument: do not save attachments
	var translate = new Zotero.Translate("web", true, false);
	// translator ID, presumably that of the Google Scholar translator -- confirm
	translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
	translate.setHandler("itemDone", this._callback);
	translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
	// on failure, retry with the next batch of lines
	translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle() });
	
	// NOTE: _scrape() unregisters this listener via Function.caller, so it must keep
	// calling _scrape() directly
	this.hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
	
	this.hiddenBrowser.loadURI(url);
}
/**
 * Callback to be executed when Google Scholar is loaded: unregisters the "pageshow" listener
 * from the hidden browser, hands the loaded document to the translator and starts translation
 *
 * @param {Zotero.Translate} translate The translate instance to run on the loaded document
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
// Function.caller is the function that invoked _scrape, i.e. the anonymous "pageshow"
// listener registered in _queryGoogle; this only works while that listener calls _scrape
// directly (NOTE(review): .caller is non-standard and unavailable in strict mode)
this.hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true);
translate.setDocument(this.hiddenBrowser.contentDocument);
translate.translate();
}
/**
 * Callback to pick first item in the Google Scholar item list
 *
 * @param {Zotero.Translate} translate The translate instance asking for a selection (unused)
 * @param {Object} items The candidate items offered for selection
 * @returns {Object} An object holding only the first enumerated entry of items, or undefined
 *                   if items has no enumerable properties
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Translate*/ translate, /**Object*/ items) {
	for(var i in items) {
		var obj = {};
		// copy the entry's own value, not the whole candidate map
		obj[i] = items[i];
		return obj;
	}
}
/**
 * Tells the user, through a modal alert, that no metadata could be found for the PDF whose
 * title was stored by recognize()
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._error = function() {
	var prompts = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
		.getService(Components.interfaces.nsIPromptService);
	
	var alertTitle = Zotero.getString('recognizePDF.couldNotRecognize.title');
	var alertText = Zotero.getString('recognizePDF.couldNotRecognize.message', this._pdfTitle);
	
	prompts.alert(window, alertTitle, alertText);
}

View file

@ -117,7 +117,7 @@ const BOMs = {
*
* output - export output (if no location has been specified)
*/
Zotero.Translate = function(type, saveItem) {
Zotero.Translate = function(type, saveItem, saveAttachments) {
this.type = type;
// import = 0001 = 1
@ -147,12 +147,8 @@ Zotero.Translate = function(type, saveItem) {
}
this._numericTypes = this._numericTypes.substr(1);
if(saveItem === false) { // three equals signs means if it's left
// undefined, this.saveItem will still be true
this.saveItem = false;
} else {
this.saveItem = true;
}
this.saveItem = !(saveItem === false);
this.saveAttachments = !(saveAttachments === false);
this._handlers = new Array();
this._streams = new Array();
@ -1320,7 +1316,7 @@ Zotero.Translate.prototype._itemDone = function(item, attachedTo) {
var downloadAssociatedFiles = Zotero.Prefs.get("downloadAssociatedFiles");
// handle attachments
if(item.attachments && (automaticSnapshots || downloadAssociatedFiles)) {
if(item.attachments && this.saveAttachments && (automaticSnapshots || downloadAssociatedFiles)) {
for each(var attachment in item.attachments) {
if(this.type == "web") {
if(!attachment.url && !attachment.document) {

View file

@ -95,6 +95,8 @@ pane.items.menu.generateReport = Generate Report from Selected Item...
pane.items.menu.generateReport.multiple = Generate Report from Selected Items...
pane.items.menu.reindexItem = Reindex Item
pane.items.menu.reindexItem.multiple = Reindex Items
pane.items.menu.recognizePDF = Retrieve Metadata for PDF
pane.items.menu.recognizePDF.multiple = Retrieve Metadata for PDFs
pane.items.letter.oneParticipant = Letter to %S
pane.items.letter.twoParticipants = Letter to %S and %S
@ -508,4 +510,7 @@ proxies.error.scheme.noPath = A valid proxy scheme must contain either the pat
proxies.recognized.message = Adding this proxy will allow Zotero to recognize items from its pages and will automatically redirect future requests to %1$S through %2$S.
proxies.recognized.add = Add Proxy
proxies.enableTransparentWarning.title = Warning
proxies.enableTransparentWarning.description = Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk when transparent redirection is enabled.
proxies.enableTransparentWarning.description = Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk.
recognizePDF.couldNotRecognize.title = Could Not Retrieve Metadata
recognizePDF.couldNotRecognize.message = Zotero could not retrieve metadata for "%1$S".