Add feature to retrieve metadata for PDFs, currently accessible as a contextual menu item.

The feature grabs relevant fragments from the document and then searches them on Google Scholar. This will only work with OCRed PDFs, so it doesn't work with JSTOR, but it should work fairly well with everything else.
2008-08-22 05:35:44 +00:00 · 2008-08-22 05:35:44 +00:00 · 4cf79691ed
commit 4cf79691ed
parent f20ed8507e
5 changed files with 290 additions and 20 deletions
--- a/chrome/content/zotero/overlay.js
+++ b/chrome/content/zotero/overlay.js
@ -1482,7 +1482,8 @@ var ZoteroPane = new function()
 			createBib: 11,
 			loadReport: 12,
 			sep4: 13,
-			reindexItem: 14
+			reindexItem: 14,
+			recognizePDF: 15
 		};
 		
 		var menu = document.getElementById('zotero-itemmenu');
@ -1507,20 +1508,37 @@ var ZoteroPane = new function()
 				hide.push(m.showInLibrary, m.sep1, m.addNote, m.attachSnapshot,
 					m.attachLink, m.sep2, m.duplicateItem);
 				
-				// If all items can be reindexed, show option
+				// If all items can be reindexed, or all items can be recognized, show option
 				var items = this.getSelectedItems();
 				var canIndex = true;
+				var canRecognize = true;
 				for (var i=0; i<items.length; i++) {
-					if (!Zotero.Fulltext.canReindex()) {
+					if (!Zotero.Fulltext.canReindex(items[i].id)) {
 						canIndex = false;
+					}
+					
+					if(!Zotero_RecognizePDF.canRecognize(items[i])) {
+						canRecognize = false;
+					}
+					
+					if(!canIndex && !canRecognize) {
 						break;
 					}
 				}
 				if (canIndex) {
-					show.push(m.sep4, m.reindexItem);
+					show.push(m.reindexItem);
+				} else {
+					hide.push(m.reindexItem);
 				}
-				else {
-					hide.push(m.sep4, m.reindexItem);
+				if (canRecognize) {
+					show.push(m.recognizePDF);
+				} else {
+					hide.push(m.recognizePDF);
+				}
+				if(canIndex || canRecognize) {
+					show.push(m.sep4);
+				} else {
+					hide.push(m.sep4);
 				}
 			}
 			// Single item selected
@ -1551,15 +1569,28 @@ var ZoteroPane = new function()
 					hide.push(m.duplicateItem);
 					// If not linked URL, show reindex line
 					if (Zotero.Fulltext.canReindex(item.id)) {
-						show.push(m.sep4, m.reindexItem);
+						show.push(m.reindexItem);
+						showSep4 = true;
+					} else {
+						hide.push(m.reindexItem);
 					}
-					else {
-						hide.push(m.sep4, m.reindexItem);
+					
+					if (Zotero_RecognizePDF.canRecognize(item)) {
+						show.push(m.recognizePDF);
+						showSep4 = true;
+					} else {
+						hide.push(m.recognizePDF);
+					}
+					
+					if(showSep4) {
+						show.push(m.sep4);
+					} else {
+						hide.push(m.sep4);
 					}
 				}
 				else {
 					show.push(m.duplicateItem);
-					hide.push(m.sep4, m.reindexItem);
+					hide.push(m.sep4, m.reindexItem, m.recognizePDF);
 				}
 			}
 		}
@ -1576,7 +1607,8 @@ var ZoteroPane = new function()
 			
 			disable.push(m.showInLibrary, m.duplicateItem, m.deleteItem,
 				m.deleteFromLibrary, m.exportItems, m.createBib, m.loadReport);
-			hide.push(m.addNote, m.attachSnapshot, m.attachLink, m.sep2, m.sep4, m.reindexItem);
+			hide.push(m.addNote, m.attachSnapshot, m.attachLink, m.sep2, m.sep4, m.reindexItem,
+				m.recognizePDF);
 		}
 		
 		// Remove from collection
@ -1596,6 +1628,7 @@ var ZoteroPane = new function()
 		menu.childNodes[m.createBib].setAttribute('label', Zotero.getString('pane.items.menu.createBib' + multiple));
 		menu.childNodes[m.loadReport].setAttribute('label', Zotero.getString('pane.items.menu.generateReport' + multiple));
 		menu.childNodes[m.reindexItem].setAttribute('label', Zotero.getString('pane.items.menu.reindexItem' + multiple));
+		menu.childNodes[m.recognizePDF].setAttribute('label', Zotero.getString('pane.items.menu.recognizePDF' + multiple));
 		
 		for (var i in disable)
 		{
--- a/chrome/content/zotero/overlay.xul
+++ b/chrome/content/zotero/overlay.xul
@ -37,6 +37,7 @@
    <script src="fileInterface.js"/>
 	<script src="reportInterface.js"/>
 	<script src="timelineInterface.js"/>
+	<script src="recognizePDF.js"/>
 	<script src="browser.js"/>
 	<script src="chrome://global/content/nsDragAndDrop.js"/>
 	<script src="chrome://global/content/nsTransferable.js"/>
@ -108,6 +109,7 @@
 					<menuitem oncommand="Zotero_Report_Interface.loadItemReport()"/>
 					<menuseparator/>
 					<menuitem oncommand="ZoteroPane.reindexItem();"/>
+					<menuitem oncommand="Zotero_RecognizePDF.recognizeSelected();"/>
 				</popup>
 			</popupset>
 			
--- a/chrome/content/zotero/recognizePDF.js
+++ b/chrome/content/zotero/recognizePDF.js
@ -0,0 +1,234 @@
+/*
+    ***** BEGIN LICENSE BLOCK *****
+    
+    Copyright (c) 2006  Center for History and New Media
+                        George Mason University, Fairfax, Virginia, USA
+                        http://chnm.gmu.edu
+    
+    Licensed under the Educational Community License, Version 1.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+    
+    http://www.opensource.org/licenses/ecl1.php
+    
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+    
+    ***** END LICENSE BLOCK *****
+*/
+
+/**
+ * @fileOverview Tools for automatically retrieving a citation for the given PDF
+ */
+// Page limit for text extraction; handed to the PDF converter as the value of its -l argument
+const MAX_PAGES = 2;
+
+/**
+ * Front end for recognizing PDFs
+ * @namespace
+ */
+var Zotero_RecognizePDF = new function() {
+	/**
+	 * Checks whether a given PDF could theoretically be recognized: the item must have
+	 * the PDF MIME type and must not already have a source (parent) item
+	 * @param {Zotero.Item} item The item to check
+	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
+	 */
+	this.canRecognize = function(/**Zotero.Item*/ item) {
+		// coerce so callers always receive a real Boolean, as documented
+		return !!(item.attachmentMIMEType == "application/pdf" && !item.getSource());
+	}
+	
+	/**
+	 * Retrieves metadata for the PDF(s) selected in the Zotero Pane, placing the PDFs as
+	 * children of the new items
+	 */
+	this.recognizeSelected = function() {
+		var items = ZoteroPane.getSelectedItems();
+		if (!items || !items.length) {
+			return;
+		}
+		this.recognizeItems(items);
+	}
+	
+	/**
+	 * Retrieves metadata for the PDF items passed, placing the PDFs as children of the new items
+	 *
+	 * Items are processed one at a time: the next item is started from the completion
+	 * callback of the previous one. Items without a file are skipped.
+	 * @param {Zotero.Item[]} items The PDF attachment items to recognize; not modified
+	 */
+	this.recognizeItems = function(/**Zotero.Item[]*/ items) {
+		// nothing to do for a missing or empty list (shift() below would yield undefined)
+		if(!items || !items.length) return;
+		
+		// work on a copy so that the caller's array is not consumed
+		var itemsCopy = items.slice();
+		var item = itemsCopy.shift();
+		var file = item.getFile();
+		if(file) {
+			var recognizer = new Zotero_RecognizePDF.Recognizer();
+			recognizer.recognize(file, item.getField("title"),
+				function(translate, newItem) {
+					// put new item in same collections as the old one
+					var itemCollections = item.getCollections();
+					for(var j=0; j<itemCollections.length; j++) {
+						var collection = Zotero.Collections.get(itemCollections[j]);
+						collection.addItem(newItem.id);
+					}
+					
+					// put old item as a child of the new item
+					item.setSource(newItem.id);
+					item.save();
+					
+					// continue recognizing
+					if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
+				});
+		} else {
+			// no file for this item; move on to the remaining ones
+			if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
+		}
+	}
+}
+
+/**
+ * @class PDF recognizer backend
+ *
+ * The constructor takes no arguments and initializes nothing itself; the state for a
+ * run is assigned when recognize() is called
+ */
+Zotero_RecognizePDF.Recognizer = function () {}
+
+/**
+ * Retrieves metadata for a PDF and saves it as an item
+ *
+ * Converts the first MAX_PAGES pages of the PDF to text, keeps the non-blank lines whose
+ * length is within 4 characters of the (approximate) median line length, and starts
+ * querying Google Scholar with them. If fewer than 20 non-blank lines were extracted,
+ * the "could not recognize" error is shown and no query is made.
+ *
+ * @param {nsIFile} file The PDF file to retrieve metadata for
+ * @param {String} pdfTitle The title of the PDF
+ * @param {Function} callback The function to be executed when recognition is complete
+ */
+Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, callback) {
+	this._pdfTitle = pdfTitle;
+	this._callback = callback;
+	
+	const whitespaceRe = /^\s*$/;
+	
+	var cacheFile = Zotero.getZoteroDirectory();
+	cacheFile.append("recognizePDFcache.txt");
+	
+	Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
+				+ '-l ' + MAX_PAGES + ' "' + file.path + '" "'
+				+ cacheFile.path + '"');
+	
+	var proc = Components.classes["@mozilla.org/process/util;1"].
+			createInstance(Components.interfaces.nsIProcess);
+	var exec = Zotero.getZoteroDirectory();
+	exec.append(Zotero.Fulltext.pdfConverterFileName);
+	proc.init(exec);
+	
+	var args = ['-enc', 'UTF-8', '-nopgbrk', '-raw', '-l', MAX_PAGES];
+	args.push(file.path, cacheFile.path);
+	// first argument true: block until the converter has finished writing the cache file
+	proc.run(true, args, args.length);
+	
+	var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
+		.createInstance(Components.interfaces.nsIFileInputStream);
+	inputStream.init(cacheFile, 0x01, 0664, 0);
+	var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
+		.createInstance(Components.interfaces.nsIConverterInputStream);
+	intlStream.init(inputStream, "UTF-8", 65535,
+		Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+	intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
+	
+	// get the lines in this sample
+	var lines = [];
+	var lineLengths = [];
+	try {
+		var str = {};
+		var more;
+		// readLine() returns false for the final line but still fills str.value,
+		// so the line has to be handled before the return value is tested
+		do {
+			more = intlStream.readLine(str);
+			if(typeof str.value == "string" && !whitespaceRe.test(str.value)) {
+				lines.push(str.value);
+				lineLengths.push(str.value.length);
+			}
+		} while(more);
+	} finally {
+		// always release the cache file, even if reading fails
+		intlStream.close();
+	}
+	
+	// get (not quite) median length
+	var lineLengthsLength = lineLengths.length;
+	if(lineLengthsLength < 20) {
+		this._error();
+		return;
+	}
+	
+	// sort a copy, numerically: the default sort() compares elements as strings and
+	// sorts in place, which would break the lineLengths[i] <-> lines[i] pairing below
+	var sortedLengths = lineLengths.slice().sort(function(a, b) { return a - b });
+	var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
+	
+	// pick lines within 4 chars of the median
+	this._goodLines = [];
+	var uBound = medianLength + 4;
+	var lBound = medianLength - 4;
+	for (var i=0; i<lineLengthsLength; i++) {
+		if(lineLengths[i] >= lBound && lineLengths[i] <= uBound) this._goodLines.push(lines[i]);
+	}
+	
+	this._startLine = this._iteration = 0;
+	this._queryGoogle();
+}
+
+/**
+ * Queries Google Scholar for metadata for this PDF
+ *
+ * Each call consumes further lines from this._goodLines until at least 25 words have
+ * been collected, searches for them as quoted phrases, and runs the translator on the
+ * result page once it has loaded. When translation reports failure, the next batch of
+ * lines is tried. After four queries, or once the lines are used up, the error is shown.
+ * @private
+ */
+Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
+	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
+		this._error();
+		return;
+	}
+	// count this attempt; without this the cap above could never take effect
+	this._iteration++;
+	
+	// take the relevant parts of some lines (exclude hyphenated word)
+	var queryStringWords = 0;
+	var queryString = "";
+	while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
+		var words = this._goodLines[this._startLine].split(/\s+/);
+		// drop the first and last word of the line, which may be fragments of a word
+		// split across lines
+		words.shift();
+		words.pop();
+		if(words.length) {
+			queryStringWords += words.length;
+			queryString += '"'+words.join(" ")+'" ';
+		}
+		this._startLine++;
+	}
+	Zotero.debug("RecognizePDF: Query string "+queryString);
+	
+	// pass query string to Google Scholar and translate
+	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString);
+	// NOTE(review): a new hidden browser is created for every query and nothing here
+	// disposes of the previous one -- confirm whether explicit cleanup is needed
+	this.hiddenBrowser = Zotero.Browser.createHiddenBrowser();
+	
+	var me = this;
+	var translate = new Zotero.Translate("web", true, false);
+	translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
+	translate.setHandler("itemDone", this._callback);
+	translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
+	// on failure, retry with the next batch of lines
+	translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle() });
+	
+	this.hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
+	this.hiddenBrowser.loadURI(url);
+}
+
+/**
+ * Runs once the Google Scholar page has been shown in the hidden browser: detaches the
+ * "pageshow" listener and hands the loaded document to the translator
+ * @private
+ */
+Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
+	var browser = this.hiddenBrowser;
+	// the function to detach is whichever function invoked us, obtained via .caller
+	var listener = this._scrape.caller;
+	browser.removeEventListener("pageshow", listener, true);
+	
+	translate.setDocument(browser.contentDocument);
+	translate.translate();
+}
+
+/**
+ * Callback to pick first item in the Google Scholar item list
+ * @private
+ * @param {Zotero.Translate} translate The translate instance asking for a selection (unused)
+ * @param {Object} items The candidate items offered for selection
+ * @returns {Object|undefined} An object containing only the first enumerated entry of
+ *     items, or undefined if items has no enumerable properties
+ */
+Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Translate*/ translate, /**Object*/ items) {
+	for(var i in items) {
+		// return after the first key; copy that entry's own value rather than
+		// nesting the whole items object under it
+		var obj = {};
+		obj[i] = items[i];
+		return obj;
+	}
+}
+
+/**
+ * Shows an alert telling the user that metadata could not be retrieved for this PDF;
+ * the message includes the title that was passed to recognize()
+ * @private
+ */
+Zotero_RecognizePDF.Recognizer.prototype._error = function() {
+	var prompts = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
+			.getService(Components.interfaces.nsIPromptService);
+	var alertTitle = Zotero.getString('recognizePDF.couldNotRecognize.title');
+	var alertText = Zotero.getString('recognizePDF.couldNotRecognize.message', this._pdfTitle);
+	prompts.alert(window, alertTitle, alertText);
+}
--- a/chrome/content/zotero/xpcom/translate.js
+++ b/chrome/content/zotero/xpcom/translate.js
@ -117,7 +117,7 @@ const BOMs = {
 *
 * output - export output (if no location has been specified)
 */
-Zotero.Translate = function(type, saveItem) {
+Zotero.Translate = function(type, saveItem, saveAttachments) {
 	this.type = type;
 	
 	// import = 0001 = 1
@ -147,12 +147,8 @@ Zotero.Translate = function(type, saveItem) {
 	}
 	this._numericTypes = this._numericTypes.substr(1);
 	
-	if(saveItem === false) {	// three equals signs means if it's left
-								// undefined, this.saveItem will still be true
-		this.saveItem = false;
-	} else {
-		this.saveItem = true;
-	}
+	this.saveItem = !(saveItem === false);
+	this.saveAttachments = !(saveAttachments === false);
 	
 	this._handlers = new Array();
 	this._streams = new Array();
@ -1320,7 +1316,7 @@ Zotero.Translate.prototype._itemDone = function(item, attachedTo) {
 		var downloadAssociatedFiles = Zotero.Prefs.get("downloadAssociatedFiles");
 		
 		// handle attachments
-		if(item.attachments && (automaticSnapshots || downloadAssociatedFiles)) {
+		if(item.attachments && this.saveAttachments && (automaticSnapshots || downloadAssociatedFiles)) {
 			for each(var attachment in item.attachments) {
 				if(this.type == "web") {
 					if(!attachment.url && !attachment.document) {
--- a/chrome/locale/en-US/zotero/zotero.properties
+++ b/chrome/locale/en-US/zotero/zotero.properties
@ -95,6 +95,8 @@ pane.items.menu.generateReport	= Generate Report from Selected Item...
 pane.items.menu.generateReport.multiple	= Generate Report from Selected Items...
 pane.items.menu.reindexItem					= Reindex Item
 pane.items.menu.reindexItem.multiple		= Reindex Items
+pane.items.menu.recognizePDF				= Retrieve Metadata for PDF
+pane.items.menu.recognizePDF.multiple		= Retrieve Metadata for PDFs

 pane.items.letter.oneParticipant		= Letter to %S
 pane.items.letter.twoParticipants		= Letter to %S and %S
@ -508,4 +510,7 @@ proxies.error.scheme.noPath			= A valid proxy scheme must contain either the pat
 proxies.recognized.message			= Adding this proxy will allow Zotero to recognize items from its pages and will automatically redirect future requests to %1$S through %2$S.
 proxies.recognized.add				= Add Proxy
 proxies.enableTransparentWarning.title			= Warning
-proxies.enableTransparentWarning.description	= Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk when transparent redirection is enabled.
+proxies.enableTransparentWarning.description	= Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk.
+
+recognizePDF.couldNotRecognize.title	= Could Not Retrieve Metadata
+recognizePDF.couldNotRecognize.message	= Zotero could not retrieve metadata for "%1$S".