Merge pull request #433 from aurimasv/retrieve-meta

Retrieve Metadata query limit fixes
2014-01-31 00:16:09 -08:00 · 2014-01-31 00:16:09 -08:00 · 71a3751179
commit 71a3751179
parent 121b75ef6c 3c21e7c999
7 changed files with 669 additions and 177 deletions
--- a/chrome/content/zotero/captcha.js
+++ b/chrome/content/zotero/captcha.js
@ -0,0 +1,73 @@
+/*
+    ***** BEGIN LICENSE BLOCK *****
+    
+    Copyright © 2009 Center for History and New Media
+                     George Mason University, Fairfax, Virginia, USA
+                     http://zotero.org
+    
+    This file is part of Zotero.
+    
+    Zotero is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    
+    Zotero is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+    
+    You should have received a copy of the GNU Affero General Public License
+    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
+    
+    ***** END LICENSE BLOCK *****
+*/
+
+/**
+ * Controller for the CAPTCHA dialog (captcha.xul). The opener passes an io
+ * object as the first window argument: io.dataIn supplies the optional title,
+ * description and error texts plus the image URL, and io.dataOut receives the
+ * text typed by the user.
+ */
+var Zotero_Captcha = new function() {
+	/**
+	 * Show the text in the given element, or hide the element if there is
+	 * no text to show
+	 */
+	function showOptionalText(element, text) {
+		if(!text) {
+			element.hidden = true;
+			return;
+		}
+		element.textContent = text;
+		element.hidden = false;
+	}
+	
+	/**
+	 * Populate the dialog from the data handed over by the opener
+	 */
+	this.onLoad = function() {
+		this._io = window.arguments[0];
+		var dataIn = this._io.dataIn;
+		
+		if(dataIn.title) {
+			document.title = dataIn.title;
+		}
+		
+		showOptionalText(document.getElementById('zotero-captcha-description'), dataIn.description);
+		showOptionalText(document.getElementById('zotero-captcha-error'), dataIn.error);
+		
+		document.getElementById('zotero-captcha-image').src = dataIn.imgUrl;
+		document.getElementById('zotero-captcha-input').focus();
+	};
+	
+	/**
+	 * Resize the dialog to fit its content once the CAPTCHA image has loaded
+	 */
+	this.imageOnLoad = function() {
+		window.sizeToContent();
+	};
+	
+	/**
+	 * Hand the typed text back to the opener and close the dialog. Does
+	 * nothing while the input is empty.
+	 */
+	this.resolve = function() {
+		var answer = document.getElementById('zotero-captcha-input').value;
+		if(!answer) return;
+		
+		this._io.dataOut = { captcha: answer };
+		window.close();
+	};
+	
+	/**
+	 * Close the dialog without setting io.dataOut
+	 */
+	this.cancel = function() {
+		window.close();
+	};
+}
--- a/chrome/content/zotero/captcha.xul
+++ b/chrome/content/zotero/captcha.xul
@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+
+<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
+<?xml-stylesheet href="chrome://zotero/skin/zotero.css" type="text/css"?>
+
+<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd">
+
+<!-- CAPTCHA prompt dialog. Scripted by Zotero_Captcha (captcha.js); pressing Esc cancels. -->
+<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+	onload="Zotero_Captcha.onLoad();"
+	id="zotero-captcha"
+	onkeypress="if(event.keyCode === KeyEvent.DOM_VK_ESCAPE) Zotero_Captcha.cancel();">
+	
+	<script src="include.js"/>
+	<script src="captcha.js"/>
+	
+	<vbox style="padding:10px" align="center" flex="1">
+		<description id="zotero-captcha-description"></description>
+		<!-- The window is resized to fit its content once the image has loaded -->
+		<image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" />
+		<description id="zotero-captcha-error"></description>
+		<!-- Pressing Return in the textbox submits, same as the OK button -->
+		<textbox id="zotero-captcha-input"
+			onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" />
+		<hbox>
+			<button label="&zotero.general.ok;" default="true" oncommand="Zotero_Captcha.resolve();" />
+			<button label="&zotero.general.cancel;" oncommand="Zotero_Captcha.cancel();" />
+		</hbox>
+	</vbox>
+</window>
--- a/chrome/content/zotero/pdfProgress.xul
+++ b/chrome/content/zotero/pdfProgress.xul
@ -14,7 +14,9 @@
 		<tree flex="1" id="tree" hidecolumnpicker="true">
 			<treecols>
 				<treecol id="success-col" style="width:20px;"/>
+				<splitter class="tree-splitter" hidden="true"/>
 				<treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/>
+				<splitter class="tree-splitter"/>
 				<treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/>
 			</treecols>
 			<treechildren id="treechildren"/>
--- a/chrome/content/zotero/recognizePDF.js
+++ b/chrome/content/zotero/recognizePDF.js
@ -65,19 +65,32 @@ var Zotero_RecognizePDF = new function() {
 	 *
 	 * @param {nsIFile} file The PDF file to retrieve metadata for
 	 * @param {Integer|null} libraryID The library in which to save the PDF
+	 * @param {Function} stopCheckCallback Function that returns true if the
+	 *                   process is to be interrupted
 	 * @return {Promise} A promise resolved when PDF metadata has been retrieved
 	 */
-	this.recognize = function(file, libraryID) {
+	this.recognize = function(file, libraryID, stopCheckCallback) {
 		const MAX_PAGES = 7;
-		const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms
+		var me = this;
 		
 		return _extractText(file, MAX_PAGES).then(function(lines) {
 			// Look for DOI - Use only first 80 lines to avoid catching article references
 			var allText = lines.join("\n"),
-				doi = Zotero.Utilities.cleanDOI(lines.slice(0,80).join('\n')),
+				firstChunk = lines.slice(0,80).join('\n'),
+				doi = Zotero.Utilities.cleanDOI(firstChunk),
 				promise;
 			Zotero.debug(allText);
 			
+			if(!doi) {
+				// Look for a JSTOR stable URL, which can be converted to a DOI by
+				// prepending 10.2307 (unless the stable ID already starts with "10.").
+				// The dots in the host name are escaped so they only match literal dots.
+				var jstorMatch = firstChunk.match(/www\.jstor\.org\/stable\/(\S+)/i);
+				if(jstorMatch) {
+					var stableID = jstorMatch[1];
+					doi = Zotero.Utilities.cleanDOI(
+						stableID.indexOf('10.') == 0 ? stableID : '10.2307/' + stableID
+					);
+				}
+			}
+			
 			if(doi) {
 				// Look up DOI
 				Zotero.debug("RecognizePDF: Found DOI: "+doi);
@ -104,118 +117,7 @@ var Zotero_RecognizePDF = new function() {
 			// If no DOI or ISBN, query Google Scholar
 			return promise.fail(function(error) {
 				Zotero.debug("RecognizePDF: "+error);
-				
-				// Use only first column from multi-column lines
-				const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
-				var cleanedLines = [], cleanedLineLengths = [];
-				for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
-					var m = lineRe.exec(lines[i]);
-					if(m && m[1].split(' ').length > 3) {
-						cleanedLines.push(m[1]);
-						cleanedLineLengths.push(m[1].length);
-					}
-				}
-				
-				// get (not quite) median length
-				var lineLengthsLength = cleanedLineLengths.length;
-				if(lineLengthsLength < 20
-						|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
-					throw new Zotero.Exception.Alert("recognizePDF.noOCR");
-				}
-				
-				var sortedLengths = cleanedLineLengths.sort(),
-					medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
-				
-				// pick lines within 6 chars of the median (this is completely arbitrary)
-				var goodLines = [],
-					uBound = medianLength + 6,
-					lBound = medianLength - 6;
-				for (var i=0; i<lineLengthsLength; i++) {
-					if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
-						// Strip quotation marks so they don't mess up search query quoting
-						var line = cleanedLines[i].replace('"', '');
-						goodLines.push(line);
-					}
-				}
-				
-				var nextLine = 0,
-				limited = false,
-				queryGoogle = function() {
-					// Once we hit the CAPTCHA once, don't keep trying
-					if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit");
-
-					// Take the relevant parts of some lines (exclude hyphenated word)
-					var queryString = "", queryStringWords = 0;
-					while(queryStringWords < 25) {
-						if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
-				
-						var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
-						// Try to avoid picking adjacent strings so the odds of them appearing in another
-						// document quoting our document is low. Every 7th line is a magic value
-						nextLine = (nextLine + 7) % goodLines.length;
-				
-						// get rid of first and last words
-						words.shift();
-						words.pop();
-						// make sure there are no long words (probably OCR mistakes)
-						var skipLine = false;
-						for(var i=0; i<words.length; i++) {
-							if(words[i].length > 20) {
-								skipLine = true;
-								break;
-							}
-						}
-						// add words to query
-						if(!skipLine && words.length) {
-							queryStringWords += words.length;
-							queryString += '"'+words.join(" ")+'" ';
-						}
-					}
-					
-					Zotero.debug("RecognizePDF: Query string "+queryString);
-					
-					var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
-						delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
-
-					// Delay 
-					return (delay > 0 ? Q.delay(delay) : Q.when())
-					.then(function() {
-						Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
-						return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
-					})
-					.then(function(xmlhttp) {
-						var doc = xmlhttp.response,
-							deferred = Q.defer(),
-							translate = new Zotero.Translate.Web();
-
-						if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) {
-							// Hit CAPTCHA
-							limited = true;
-							throw new Zotero.Exception.Alert("recognizePDF.limit");
-						}
-						
-						translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
-						translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
-						translate.setHandler("translators", function(translate, detected) {
-							if(detected.length) {
-								deferred.resolve(_promiseTranslate(translate, libraryID));
-							} else {
-								deferred.reject(new Zotero.Exception.Alert("recognizePDF.noMatches"));
-							}
-						});
-						translate.getTranslators();
-						
-						return deferred.promise;
-					}, function(e) {
-						if(e instanceof Zotero.HTTP.UnexpectedStatusException && e.status == 403) {
-							// Hit hard block
-							throw new Zotero.Exception.Alert("recognizePDF.limit");
-						}
-						throw e;
-					});
-				};
-
-				return queryGoogle().fail(queryGoogle).fail(queryGoogle);
+				return me.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback);
 			});
 		});
 	}
@ -331,40 +233,14 @@ var Zotero_RecognizePDF = new function() {
 		}
 	
 		// Validate ISBNs
-		var validIsbns = [];
+		var validIsbns = [], cleanISBN;
 		for (var i =0; i < isbns.length; i++) {
-			if(_isValidISBN(isbns[i])) validIsbns.push(isbns[i]);
+			cleanISBN = Zotero.Utilities.cleanISBN(isbns[i]);
+			if(cleanISBN) validIsbns.push(cleanISBN);
 		}
 		return validIsbns;
 	}
 	
-	/**
-	 * Check whether an ISBNs is valid
-	 * @private
-	 * @return {Boolean}
-	 */
-	function _isValidISBN(isbn) {
-		if(isbn.length == 13) {
-			// ISBN-13 should start with 978 or 979 i.e. GS1 for book publishing industry
-			var prefix = isbn.slice(0,3);
-			if (prefix != "978" && prefix != "979") return false;
-			// Verify check digit
-			var check = 0;
-			for (var i = 0; i < 13; i+=2) check += isbn[i]*1;
-			for (i = 1; i < 12; i+=2) check += 3 * isbn[i]*1;
-			return (check % 10 == 0);
-		} else if(isbn.length == 10) {
-			// Verify ISBN-10 check digit
-			var check = 0;
-			for (var i = 0; i < 9; i++) check += isbn[i]*1 * (10-i);
-			// last number might be 'X'
-			if (isbn[9] == 'X' || isbn[9] == 'x') check += 10;
-			else check += isbn[i]*1;
-			return (check % 11 == 0);
-		}
-		return false;
-	}
-
 	/**
 	 * @class Handles UI, etc. for recognizing multiple items
 	 */
@ -388,7 +264,7 @@ var Zotero_RecognizePDF = new function() {
 			this._items = items.slice();
 			this._itemTotal = items.length;
 			
-			this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
+			_progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
 			this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false);
 		},

@ -398,7 +274,15 @@ var Zotero_RecognizePDF = new function() {
 		"stop": function() {
 			this._stopped = true;	
 		},
-
+		
+		/**
+		 * Halts recognition (by calling stop(), which sets the stopped flag) and
+		 * closes the progress window
+		 */
+		"close": function() {
+			this.stop();
+			this._progressWindow.close();
+		},
+		
 		/**
 		 * Called when the progress window has been opened; adds items to the tree and begins recognizing
 		 * @param
@ -406,9 +290,11 @@ var Zotero_RecognizePDF = new function() {
 		"_onWindowLoaded": function() {
 			// populate progress window
 			var treechildren = this._progressWindow.document.getElementById("treechildren");
+			this._rowIDs = [];
 			for(var i in this._items) {
 				var treeitem = this._progressWindow.document.createElement('treeitem');
 				var treerow = this._progressWindow.document.createElement('treerow');
+				this._rowIDs.push(this._items[i].id);
 				
 				var treecell = this._progressWindow.document.createElement('treecell');
 				treecell.setAttribute("id", "item-"+this._items[i].id+"-icon");
@ -427,12 +313,22 @@ var Zotero_RecognizePDF = new function() {
 			}
 			
 			var me = this;
-			this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
-			this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() {
-				me.stop();
-				me._progressWindow.close();
-			}, false);
-			this._progressWindow.addEventListener("close", function() { me.stop() }, false);
+			
+			this._progressWindow.document.getElementById("tree").addEventListener(
+				"dblclick", function(event) { me._onDblClick(event, this); });
+			
+			this._cancelHandler = function() { me.stop() };
+			this._keypressCancelHandler = function(e) {
+				if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop();
+			};
+			
+			_progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
+			this._progressWindow.document.getElementById("cancel-button")
+				.addEventListener("command", this._cancelHandler, false);
+			// Also cancel if the user presses Esc
+			this._progressWindow.addEventListener("keypress", this._keypressCancelHandler);
+			this._progressWindow.addEventListener("close", this._cancelHandler, false);
+			Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit();
 			this._recognizeItem();
 		},

@ -452,23 +348,31 @@ var Zotero_RecognizePDF = new function() {
 				return;
 			}
 			
+			// Order here matters. Otherwise we may show an incorrect label
+			if(this._stopped) {
+				this._done(true);
+				return;
+			}
+			
 			this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;
 			
 			var item = this._items.shift(),
 				itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"),
-				itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title");
+				itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"),
+				rowNumber = this._rowIDs.indexOf(item.id);
 			itemIcon.setAttribute("src", LOADING_IMAGE);
+			itemTitle.setAttribute("label", "");
 			
 			var file = item.getFile(), me = this;
 			
 			(file
-			? Zotero_RecognizePDF.recognize(file, item.libraryID)
+			? Zotero_RecognizePDF.recognize(file, item.libraryID, function() { return me._stopped; })
 			: Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound")))
 			.then(function(newItem) {
 				// If already stopped, delete
 				if(me._stopped) {
-					Zotero.Items.erase(item.id);
-					return;
+					Zotero.Items.erase(newItem.id);
+					throw new Zotero.Exception.Alert('recognizePDF.stopped');
 				}
 				
 				// put new item in same collections as the old one
@ -484,37 +388,504 @@ var Zotero_RecognizePDF = new function() {
 				
 				itemTitle.setAttribute("label", newItem.getField("title"));
 				itemIcon.setAttribute("src", SUCCESS_IMAGE);
+				me._rowIDs[rowNumber] = newItem.id;
 				
 				me._recognizeItem();
-			}, function(error) {
+			})
+			.catch(function(error) {
 				Zotero.debug(error);
 				Zotero.logError(error);

 				itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error"));
 				itemIcon.setAttribute("src", FAILURE_IMAGE);
 				
-				if(error instanceof Zotero.Exception.Alert && error.name === "recognizePDF.limit") {
-					me._done();
+				// Don't show "completed" label if stopped on last item
+				if(me._stopped && !me._items.length) {
+					me._done(true);
 				} else {
 					me._recognizeItem();
 				}
-			}).fin(function() {
+			}).finally(function() {
 				// scroll to this item
-				me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5));
+				me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-4));
 			}).done();
 		},

 		/**
-		 * Cleans up after items are recognized, disabling the cancel button and making the progress window
-		 * close on blur
+		 * Cleans up after items are recognized: fills the progress bar, turns the
+		 * Cancel button into a Close button, rebinds keypress to close the window,
+		 * arms the close-on-blur timer on Mac, and updates the status label.
+		 * @param {Boolean} cancelled Whether the process was cancelled
 		 */
-		"_done": function() {
+		"_done": function(cancelled) {
 			this._progressIndicator.value = 100;
-			this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");
-			var me = this;
-			this._progressWindow.addEventListener("blur",
-				function() { me._progressWindow.setTimeout(function() { me._progressWindow.close() }, 2000) }, false);
-			this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
+			// Switch out cancel for close
+			var cancelButton = this._progressWindow.document.getElementById("cancel-button"),
+				me = this;
+			cancelButton.label = Zotero.getString("recognizePDF.close.label");
+			cancelButton.removeEventListener("command", this._cancelHandler, false);
+			cancelButton.addEventListener("command", function() { me.close() }, false);
+			this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler);
+			// NOTE(review): unlike the Esc-only handler removed above, this closes
+			// the window on any key press - confirm that this is intended
+			this._progressWindow.addEventListener("keypress", function() { me.close() });
+			
+			if(Zotero.isMac) {
+				// On MacOS X, the windows are not always on top, so we hide them on
+				// blur to avoid clutter
+				this._setCloseTimer();
+			}
+			// Status label tells a cancelled run apart from a completed one
+			this._progressWindow.document.getElementById("label").value = 
+				cancelled ? Zotero.getString("recognizePDF.cancelled.label")
+					: Zotero.getString("recognizePDF.complete.label");
+		},
+		
+		/**
+		 * Arrange for the progress window to close itself 5 seconds after it
+		 * loses focus. If the window regains focus while a close is pending, the
+		 * pending close is cancelled and auto-closing is switched off for good.
+		 * @private
+		 */
+		"_setCloseTimer": function() {
+			var progressWin = this._progressWindow,
+				AUTO_CLOSE_DELAY = 5000; // in ms
+			
+			function closeWindow() {
+				progressWin.close();
+			}
+			
+			function onBlur() {
+				// Schedule the close and watch for the window being refocused
+				progressWin.zoteroCloseTimeoutID = progressWin.setTimeout(closeWindow, AUTO_CLOSE_DELAY);
+				progressWin.addEventListener("focus", onFocus, false);
+			}
+			
+			function onFocus() {
+				var pendingTimeout = progressWin.zoteroCloseTimeoutID;
+				if(!pendingTimeout) return;
+				
+				// Abort the pending close
+				progressWin.clearTimeout(pendingTimeout);
+				delete progressWin.zoteroCloseTimeoutID;
+				
+				// Stop auto-closing from now on
+				progressWin.removeEventListener('blur', onBlur, false);
+				progressWin.removeEventListener('focus', onFocus, false);
+			}
+			
+			progressWin.addEventListener("blur", onBlur, false);
+		},
+		
+		/**
+		 * Select an item in the Zotero pane when its row is double-clicked in the
+		 * Retrieve Metadata window. Rows without an item ID are ignored.
+		 * @param {Event} event
+		 * @param {tree} tree XUL tree object
+		 * @private
+		 */
+		"_onDblClick": function(event, tree) {
+			if (!event || !tree || event.type != "dblclick") return;
+			
+			var clickedRow = tree.treeBoxObject.getRowAt(event.clientX, event.clientY),
+				itemID = this._rowIDs[clickedRow];
+			if (!itemID) return;
+			
+			// Get the right window. In tab mode, it's the container window
+			var targetWin = window;
+			if (window.ZoteroTab) {
+				targetWin = window.ZoteroTab.containerWindow;
+			}
+			
+			var overlay = targetWin.ZoteroOverlay;
+			if (overlay) {
+				overlay.toggleDisplay(true);
+			}
+			
+			targetWin.ZoteroPane.selectItem(itemID, false, true);
+			targetWin.focus();
+		}
-	}
+	};
+	
+	/**
+	 * Singleton for querying Google Scholar. Ensures that all queries are
+	 * sequential and respect the delay in between queries.
+	 * @namespace
+	 */
+	this.GSFullTextSearch = new function() {
+		const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms
+		var queryLimitReached = false,
+			inProgress = false,
+			queue = [],
+			stopCheckCallback; // As long as we process one query at a time, this is ok
+		// Load nsICookieManager2
+		Components.utils.import("resource://gre/modules/Services.jsm");
+		var cookieService = Services.cookies;
+		
+		/**
+		 * Reset "Query Limit Reached" flag, so that we attempt to query Google again.
+		 * While the flag is set, requests are rejected with a "recognizePDF.limit"
+		 * alert instead of being sent to Google Scholar.
+		 */
+		this.resetQueryLimit = function() {
+			queryLimitReached = false;
+		};
+		
+		/**
+		 * Add a full-text lookup request to the Google Scholar queue and start
+		 * processing the queue if it is idle
+		 * @param {String[]} lines Lines of text to use for full-text query
+		 * @param {Integer | null} libraryID Library to save the item to
+		 * @param {Function} stopCheckCallback Function that returns true if the
+		 *                   process is to be interrupted
+		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
+		 */
+		this.findItem = function(lines, libraryID, stopCheckCallback) {
+			// Already blocked and nothing is running, so there is no queue to
+			// wait for: fail right away
+			var blockedAndIdle = queryLimitReached && !inProgress;
+			if(blockedAndIdle) {
+				return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
+			}
+			
+			var request = {
+				deferred: Q.defer(),
+				lines: lines,
+				libraryID: libraryID,
+				stopCheckCallback: stopCheckCallback
+			};
+			queue.push(request);
+			_processQueue();
+			return request.deferred.promise;
+		};
+		
+		/**
+		 * Process Google Scholar queue, one request at a time. Each request's
+		 * deferred is settled here: rejected if the query limit has been reached
+		 * or the request was stopped, otherwise resolved with the query promise.
+		 * @private
+		 * @param {Boolean} proceed Whether we should pop the next item off the queue
+		 *                  This should not be true unless being called after processing
+		 *                  another item
+		 */
+		function _processQueue(proceed) {
+			if(inProgress && !proceed) return; //only one at a time
+			
+			// Queue drained: mark the processor as idle
+			if(!queue.length) {
+				inProgress = false;
+				return;
+			}
+			
+			inProgress = true;
+			if(queryLimitReached) {
+				// Irreversibly blocked. Reject remaining items in queue
+				var item;
+				while(item = queue.shift()) {
+					item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
+				}
+				_processQueue(true); // Wrap it up
+			} else {
+				var item = queue.shift();
+				
+				// Publish this request's callback to the shared variable read by
+				// the CAPTCHA checks while the request is in flight
+				stopCheckCallback = item.stopCheckCallback;
+				if(stopCheckCallback && stopCheckCallback()) {
+					item.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped'));
+					_processQueue(true);
+					return;
+				}
+				
+				// Move on to the next request once this one settles, whether it
+				// succeeded or failed
+				item.deferred.resolve(
+					Q.try(getGoodLines, item.lines)
+					.then(function(lines) {
+						return queryGoogle(lines, item.libraryID, 3); // Try querying 3 times
+					})
+					.finally(function() { _processQueue(true); })
+				);
+			}
+		}
+		
+		/**
+		 * Select lines that are good candidates for Google Scholar query:
+		 * first-column text of lines with more than 3 words whose length is
+		 * within 6 characters of the median line length. At most the first 100
+		 * usable lines are considered.
+		 * @private
+		 * @param {String[]} lines
+		 * @return {String[]} Candidate lines with quotation marks removed
+		 * @throws {Zotero.Exception.Alert} "recognizePDF.noOCR" if fewer than 20
+		 *         usable lines are found or the text is the Google Books preamble
+		 */
+		function getGoodLines(lines) {
+			// Use only first column from multi-column lines
+			const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
+			var cleanedLines = [], cleanedLineLengths = [];
+			for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
+				var m = lineRe.exec(lines[i]);
+				if(m && m[1].split(' ').length > 3) {
+					cleanedLines.push(m[1]);
+					cleanedLineLengths.push(m[1].length);
+				}
+			}
+			
+			// Get (not quite) median length
+			var lineLengthsLength = cleanedLineLengths.length;
+			if(lineLengthsLength < 20
+					|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
+				throw new Zotero.Exception.Alert("recognizePDF.noOCR");
+			}
+			
+			// Sort a copy, so that cleanedLineLengths stays aligned with
+			// cleanedLines, and compare numerically (the default sort compares
+			// numbers as strings, e.g. 100 < 9)
+			var sortedLengths = cleanedLineLengths.slice().sort(function(a, b) { return a - b; }),
+				medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
+			
+			// Pick lines within 6 chars of the median (this is completely arbitrary)
+			var goodLines = [],
+				uBound = medianLength + 6,
+				lBound = medianLength - 6;
+			for (var i=0; i<lineLengthsLength; i++) {
+				if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
+					// Strip all quotation marks so they don't mess up search query quoting
+					var line = cleanedLines[i].replace(/"/g, '');
+					goodLines.push(line);
+				}
+			}
+			return goodLines;
+		}
+		
+		/**
+		 * Query Google Scholar with a quoted full-text query built from some of
+		 * the given lines. If the results page yields no translators, the query
+		 * is retried with the remaining lines until tries runs out.
+		 * @private
+		 * @param {String[]} goodLines Candidate lines. Lines used for a query are
+		 *                   removed from this array.
+		 * @param {Integer | null} libraryID
+		 * @param {Integer} tries Number of queries to attempt before giving up
+		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
+		 * @throws {Zotero.Exception.Alert} "recognizePDF.noMatches" if tries or
+		 *         the candidate lines are exhausted
+		 */
+		function queryGoogle(goodLines, libraryID, tries) {
+			if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
+			
+			// Take the relevant parts of some lines (exclude hyphenated word)
+			var queryString = "", queryStringWords = 0, nextLine = 0;
+			while(queryStringWords < 25) {
+				if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
+		
+				var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
+				// Try to avoid picking adjacent strings so the odds of them appearing in another
+				// document quoting our document is low. Every 7th line is a magic value
+				nextLine = (nextLine + 7) % goodLines.length;
+		
+				// Get rid of first and last words
+				words.shift();
+				words.pop();
+				// Make sure there are no long words (probably OCR mistakes)
+				var skipLine = false;
+				for(var i=0; i<words.length; i++) {
+					if(words[i].length > 20) {
+						skipLine = true;
+						break;
+					}
+				}
+				// Add words to query
+				if(!skipLine && words.length) {
+					queryStringWords += words.length;
+					queryString += '"'+words.join(" ")+'" ';
+				}
+			}
+			
+			Zotero.debug("RecognizePDF: Query string " + queryString);
+			
+			var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
+				delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
+
+			// Delay 
+			return (delay > 0 ? Q.delay(delay) : Q())
+			.then(function() {
+				Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
+				return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
+			})
+			// Both successful and failed responses are checked for a CAPTCHA
+			.then(function(xmlhttp) {
+				return _checkCaptchaOK(xmlhttp, 3);
+			},
+			function(e) {
+				return _checkCaptchaError(e, 3);
+			})
+			.then(function(xmlhttp) {
+				var doc = xmlhttp.response,
+					deferred = Q.defer(),
+					translate = new Zotero.Translate.Web();
+				
+				translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
+				translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
+				translate.setHandler("translators", function(translate, detected) {
+					if(detected.length) {
+						deferred.resolve(_promiseTranslate(translate, libraryID));
+					} else {
+						// Nothing detected on this page: retry with the remaining
+						// lines. Q.try turns a synchronous throw into a rejection.
+						deferred.resolve(Q.try(function() {
+							return queryGoogle(goodLines, libraryID, tries-1);
+						}));
+					}
+				});
+				translate.getTranslators();
+				
+				return deferred.promise;
+			})
+			.catch(function(e) {
+				// Remember that we are blocked, so that further requests are
+				// rejected without querying
+				if(e.name == "recognizePDF.limit") {
+					queryLimitReached = true;
+				}
+				throw e;
+			});
+		}
+		
+		/**
+		 * Check a page returned with HTTP 200 status for a CAPTCHA form and, if
+		 * there is one, ask the user to solve it
+		 * @private
+		 * @param {XMLHttpRequest} xmlhttp
+		 * @param {Integer} tries Number of queries to attempt before giving up
+		 * @return {XMLHttpRequest|Promise} The request itself if the page has no
+		 *         CAPTCHA form, otherwise the result of _solveCaptcha
+		 * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" if the stop
+		 *         check callback reports an interruption
+		 */
+		function _checkCaptchaOK(xmlhttp, tries) {
+			var interrupted = stopCheckCallback && stopCheckCallback();
+			if(interrupted) {
+				throw new Zotero.Exception.Alert('recognizePDF.stopped');
+			}
+			
+			var captchaForms = Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']");
+			return captchaForms.length ? _solveCaptcha(xmlhttp, tries) : xmlhttp;
+		}
+		
+		/**
+		 * Check for CAPTCHA on an error page. Handle 403 and 503 pages; any other
+		 * error is rethrown unchanged.
+		 * @private
+		 * @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object
+		 * @param {Integer} tries Number of queries to attempt before giving up
+		 * @param {Boolean} dontClearCookies If true, do not attempt to clear cookies
+		 *                  in order to get CAPTCHA to show up
+		 * @return {Promise} A promise for the outcome of solving the CAPTCHA or of
+		 *         repeating the request after clearing cookies
+		 * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" if interrupted,
+		 *         "recognizePDF.limit" if no CAPTCHA can be obtained
+		 */
+		function _checkCaptchaError(e, tries, dontClearCookies) {
+			if(stopCheckCallback && stopCheckCallback()) {
+				throw new Zotero.Exception.Alert('recognizePDF.stopped');
+			}
+			
+			// Check for captcha on error page
+			if(e instanceof Zotero.HTTP.UnexpectedStatusException
+				&& (e.status == 403 || e.status == 503) && e.xmlhttp.response) {
+				if(_extractCaptchaFormData(e.xmlhttp.response)) {
+					return _solveCaptcha(e.xmlhttp, tries);
+				} else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL
+					// AFAICT, for 403 errors, GS just says "sorry, try later",
+					// but if you clear cookies, you get a CAPTCHA
+					if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) {
+						//user said no or no cookies removed
+						throw new Zotero.Exception.Alert('recognizePDF.limit');
+					}
+					// Redo GET request
+					return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"})
+						.then(function(xmlhttp) {
+							return _checkCaptchaOK(xmlhttp, tries);
+						},
+						function(e) {
+							return _checkCaptchaError(e, tries, true); // Don't try this again
+						});
+				}
+				
+				// No CAPTCHA on the page and clearing cookies is not an option
+				Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page"
+					+ " with status " + e.status);
+				throw new Zotero.Exception.Alert('recognizePDF.limit');
+			}
+			throw e;
+		}
+		
+		/**
+		 * Prompt user to enter CAPTCHA in a modal dialog and submit the answer
+		 * @private
+		 * @param {XMLHttpRequest} xmlhttp Request whose response is the CAPTCHA page
+		 * @param {Integer} [tries] Number of queries to attempt before giving up
+		 *                  (defaults to 3)
+		 * @return {Promise} A promise for the response to the submitted CAPTCHA,
+		 *         after it has itself been checked for a further CAPTCHA
+		 * @throws {Zotero.Exception.Alert} "recognizePDF.limit" if tries are used
+		 *         up, no CAPTCHA form is found, or the user enters nothing
+		 */
+		function _solveCaptcha(xmlhttp, tries) {
+			var doc = xmlhttp.response;
+			
+			if(tries === undefined) tries = 3;
+			
+			if(!tries) {
+				Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts.");
+				throw new Zotero.Exception.Alert('recognizePDF.limit');
+			}
+			
+			tries--;
+			var formData = doc && _extractCaptchaFormData(doc);
+			if(!formData) {
+				Zotero.debug("RecognizePDF: Could not find CAPTCHA on page.");
+				throw new Zotero.Exception.Alert('recognizePDF.limit');
+			}
+	
+			var io = { dataIn: {
+				title: Zotero.getString("recognizePDF.captcha.title"),
+				description: Zotero.getString("recognizePDF.captcha.description"),
+				imgUrl: formData.img
+			}};
+			
+			// The dialog is modal, so io.dataOut is already set (or not) when
+			// openDialog returns
+			_progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
+				"chrome,modal,resizable=no,centerscreen", io);
+			
+			if(!io.dataOut) {
+				Zotero.debug("RecognizePDF: No CAPTCHA entered");
+				throw new Zotero.Exception.Alert('recognizePDF.limit');
+			}
+			
+			// Submit the form's inputs plus the typed answer as a GET query string
+			formData.input.captcha = io.dataOut.captcha;
+			var url = '', prop;
+			for(prop in formData.input) {
+				url += '&' + encodeURIComponent(prop) + '='
+					+ encodeURIComponent(formData.input[prop]);
+			}
+			
+			url = formData.action + '?' + url.substr(1);
+			
+			return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
+				.then(function(xmlhttp) {
+					return _checkCaptchaOK(xmlhttp, tries);
+				},
+				function(e) {
+					return _checkCaptchaError(e, tries);
+				});
+		}
+		
+		/**
+		 * Collect what is needed to display and submit a CAPTCHA from a page:
+		 * the source of the first image, and the action and named inputs of the
+		 * first form
+		 * @private
+		 * @param {Document} doc DOM document object for the CAPTCHA page
+		 * @return {Object|undefined} Object with img, action, input and continue
+		 *         properties, or undefined if the page has no image or no form
+		 */
+		function _extractCaptchaFormData(doc) {
+			var captchaImage = doc.getElementsByTagName('img')[0];
+			if(!captchaImage) return;
+			var imageURL = captchaImage.src;
+			
+			var captchaForm = doc.forms[0];
+			if(!captchaForm) return;
+			var action = captchaForm.action;
+			
+			// Unnamed inputs cannot be submitted, so leave them out
+			var fields = {},
+				inputs = captchaForm.getElementsByTagName('input');
+			for(var i=0, n=inputs.length; i<n; i++) {
+				var input = inputs[i];
+				if(input.name) {
+					fields[input.name] = input.value;
+				}
+			}
+			
+			return {
+				img: imageURL,
+				action: action,
+				input: fields,
+				// NOTE(review): this sits next to "input", not inside it, so it
+				// is not among the submitted fields - confirm that this is intended
+				"continue": "http://scholar.google.com"
+			};
+		}
+		
+		/**
+		 * Remove the GDSESS and PREF cookies stored for the given host, which
+		 * gets the CAPTCHA page to appear
+		 * @private
+		 * @param {String} host Host of the Google Scholar page (in case it's proxied)
+		 * @return {Boolean} Whether any cookies were cleared
+		 */
+		function _clearGSCookies(host) {
+			/* There don't seem to be any negative effects of deleting GDSESS
+			if(!Zotero.isStandalone) {
+				//ask user first
+				var response = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
+					.getService(Components.interfaces.nsIPromptService)
+					.confirm(null, "Clear Google Scholar cookies?",
+						"Google Scholar is attempting to block further queries. We can "
+						+ "clear certain cookies and try again. This may affect some "
+						+ "temporary Google preferences or it may log you out. May we clear"
+						+ " your Google Scholar cookies?");
+				if(!response) return;
+			}*/
+			
+			// GDSESS doesn't seem to always be enough
+			var cookieNamesToClear = ["GDSESS", "PREF"],
+				removedCount = 0,
+				hostCookies = cookieService.getCookiesFromHost(host);
+			
+			while(hostCookies.hasMoreElements()) {
+				var cookie = hostCookies.getNext().QueryInterface(Components.interfaces.nsICookie2);
+				if(cookieNamesToClear.indexOf(cookie.name) === -1) continue;
+				
+				Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host "
+					+ cookie.host + " and path " + cookie.path);
+				cookieService.remove(cookie.host, cookie.name, cookie.path, false);
+				removedCount++;
+			}
+			
+			if(removedCount === 0) {
+				Zotero.debug("RecognizePDF: No cookies removed");
+			}
+			
+			return removedCount > 0;
+		}
+	};
 }
--- a/chrome/locale/en-US/zotero/zotero.dtd
+++ b/chrome/locale/en-US/zotero/zotero.dtd
@ -4,6 +4,8 @@
 <!ENTITY zotero.general.deselectAll						"Deselect All">
 <!ENTITY zotero.general.edit								"Edit">
 <!ENTITY zotero.general.delete 								"Delete">
+<!ENTITY zotero.general.ok								"OK">
+<!ENTITY zotero.general.cancel								"Cancel">

 <!ENTITY zotero.errorReport.title							"Zotero Error Report">
 <!ENTITY zotero.errorReport.unrelatedMessages				"The error log may include messages unrelated to Zotero.">
@ -253,7 +255,6 @@
 <!ENTITY zotero.recognizePDF.cancel.label					"Cancel">
 <!ENTITY zotero.recognizePDF.pdfName.label				"PDF Name">
 <!ENTITY zotero.recognizePDF.itemName.label				"Item Name">
-<!ENTITY zotero.recognizePDF.captcha.label				"Type the text below to continue retrieving metadata.">

 <!ENTITY zotero.rtfScan.title		                    "RTF Scan">
 <!ENTITY zotero.rtfScan.cancel.label					"Cancel">
@ -282,4 +283,4 @@

 <!ENTITY zotero.downloadManager.label			"Save to Zotero">
 <!ENTITY zotero.downloadManager.saveToLibrary.description	"Attachments cannot be saved to the currently selected library. This item will be saved to your library instead.">
-<!ENTITY zotero.downloadManager.noPDFTools.description	"To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences.">
+<!ENTITY zotero.downloadManager.noPDFTools.description	"To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences.">
--- a/chrome/locale/en-US/zotero/zotero.properties
+++ b/chrome/locale/en-US/zotero/zotero.properties
@ -895,12 +895,16 @@ proxies.recognized.add				= Add Proxy

 recognizePDF.noOCR					= PDF does not contain OCRed text.
 recognizePDF.couldNotRead			= Could not read text from PDF.
-recognizePDF.noMatches				= No matching references found.
-recognizePDF.fileNotFound			= File not found.
-recognizePDF.limit					= Query limit reached. Try again later.
+recognizePDF.noMatches				= No matching references found
+recognizePDF.fileNotFound			= File not found
+recognizePDF.limit					= Google Scholar query limit reached. Try again later.
 recognizePDF.error				= An unexpected error occurred.
-recognizePDF.complete.label			= Metadata Retrieval Complete.
+recognizePDF.stopped					= Cancelled
+recognizePDF.complete.label			= Metadata Retrieval Complete
+recognizePDF.cancelled.label		= Metadata Retrieval Cancelled
 recognizePDF.close.label			= Close
+recognizePDF.captcha.title		= Please enter CAPTCHA
+recognizePDF.captcha.description		= Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below.

 rtfScan.openTitle					= Select a file to scan
 rtfScan.scanning.label				= Scanning RTF Document…
--- a/chrome/skin/default/zotero/zotero.css
+++ b/chrome/skin/default/zotero/zotero.css
@ -303,7 +303,6 @@ label.zotero-text-link {
  margin-bottom: 1em;
 }

-
 .zotero-small-progress-indicator {
 	list-style-image: url(chrome://global/skin/icons/notloading_16.png);
 	margin-left: -2px;
@ -316,4 +315,19 @@ label.zotero-text-link {

 #zotero-note-window {
 	padding-bottom: 4px;
+}
+
+/* Description text of the CAPTCHA dialog: justified, capped at 300px wide */
+#zotero-captcha-description {
+	text-align: justify;
+	padding-bottom: 4px;
+	max-width: 300px;
+}
+
+/* Error message of the CAPTCHA dialog: bold, red, centered, capped at 300px wide */
+#zotero-captcha-error {
+	color: red;
+	font-weight: bold;
+	text-align: center;
+	max-width: 300px;
+	padding-top: 4px;
+	padding-bottom: 4px;
 }