zotero/chrome/content/zotero/recognizePDF.js

/*
    ***** BEGIN LICENSE BLOCK *****

    Copyright © 2009 Center for History and New Media
                     George Mason University, Fairfax, Virginia, USA
                     http://zotero.org

    This file is part of Zotero.

    Zotero is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Zotero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.

    ***** END LICENSE BLOCK *****
*/

/**
 * @fileOverview Tools for automatically retrieving a citation for the given PDF
 */

/**
 * Front end for recognizing PDFs
 * @namespace
 */
var Zotero_RecognizePDF = new function() {
	// Handles to the progress dialog and its progress meter. Both are assigned by
	// ItemRecognizer (recognizeItems / _onWindowLoaded); _progressWindow is also
	// used by GSFullTextSearch as the opener of the CAPTCHA dialog.
	var _progressWindow, _progressIndicator;

	/**
	 * Checks whether a given item could theoretically be recognized: its
	 * attachment MIME type must be "application/pdf" and item.isTopLevelItem()
	 * must report true.
	 * @param {Zotero.Item} item Item to test
	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
	 */
	this.canRecognize = function(/**Zotero.Item*/ item) {
		// Always hand back a real Boolean. The plain && chain used to leak
		// undefined/"" for items without a MIME type.
		return item.attachmentMIMEType === "application/pdf"
			&& !!item.isTopLevelItem();
	};

	/**
	 * Looks up metadata for every PDF currently selected in the Zotero pane. Each
	 * PDF ends up as a child of the item created for it.
	 */
	this.recognizeSelected = function() {
		// Nothing to do unless the PDF converter check passes
		if (!ZoteroPane_Local.checkPDFConverter()) return;

		var selection = ZoteroPane_Local.getSelectedItems();
		if (selection) {
			new Zotero_RecognizePDF.ItemRecognizer().recognizeItems(selection);
		}
	}

	/**
	 * Retrieves metadata for a PDF and saves it as an item
	 *
	 * Lookups are tried in order, falling through to the next one on failure:
	 * a DOI (or JSTOR stable URL) found in the first 80 lines of text, then the
	 * first valid ISBN found anywhere in the extracted text, and finally a
	 * Google Scholar full-text query.
	 *
	 * @param {nsIFile} file The PDF file to retrieve metadata for
	 * @param {Integer} libraryID The library in which to save the PDF
	 * @param {Function} stopCheckCallback Function that returns true if the
	 *                   process is to be interrupted
	 * @return {Promise} A promise resolved when PDF metadata has been retrieved
	 */
	this.recognize = Zotero.Promise.coroutine(function* (file, libraryID, stopCheckCallback) {
		const MAX_PAGES = 15;

		var lines = yield _extractText(file, MAX_PAGES);
		// Look for DOI - Use only first 80 lines to avoid catching article references
		var allText = lines.join("\n"),
			firstChunk = lines.slice(0,80).join('\n'),
			doi = Zotero.Utilities.cleanDOI(firstChunk);
		Zotero.debug(allText);

		if(!doi) {
			// Look for a JSTOR stable URL, which can be converted to a DOI by prepending 10.2307.
			// The dots are escaped so that only the literal host name matches.
			var jstorMatch = firstChunk.match(/www\.jstor\.org\/stable\/(\S+)/i);
			if(jstorMatch) {
				var stableID = jstorMatch[1];
				doi = Zotero.Utilities.cleanDOI(
					stableID.indexOf('10.') == 0 ? stableID : '10.2307/' + stableID
				);
			}
		}

		if (doi) {
			// Look up DOI
			Zotero.debug("RecognizePDF: Found DOI: "+doi);

			var translateDOI = new Zotero.Translate.Search();
			translateDOI.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
			translateDOI.setSearch({"itemType":"journalArticle", "DOI":doi});
			try {
				return yield _promiseTranslate(translateDOI, libraryID);
			}
			catch (e) {
				// A failed DOI lookup is not fatal; fall through to the ISBN search
				Zotero.debug("RecognizePDF: " + e);
			}
		}
		else {
			Zotero.debug("RecognizePDF: No DOI found in text");
		}

		// Look for ISBNs if no DOI
		var isbns = _findISBNs(allText);
		if (isbns.length) {
			Zotero.debug("RecognizePDF: Found ISBNs: " + isbns);

			// Only the first ISBN is looked up
			var translate = new Zotero.Translate.Search();
			translate.setSearch({"itemType":"book", "ISBN":isbns[0]});
			try {
				return yield _promiseTranslate(translate, libraryID);
			}
			catch (e) {
				// If no DOI or ISBN, query Google Scholar
				Zotero.debug("RecognizePDF: " + e);
			}
		}
		else {
			Zotero.debug("RecognizePDF: No ISBN found in text");
		}

		return this.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback);
	});

	/**
	 * Get text from a PDF
	 *
	 * Runs the converter returned by Zotero.Fulltext.getPDFConverterExecAndArgs()
	 * with a temporary output file in the data directory, then reads that file
	 * back and deletes it.
	 *
	 * @param {nsIFile} file PDF
	 * @param {Number} pages Number of pages to extract
	 * @return {Promise} Resolves with the trimmed, non-empty lines of text
	 *         (String[]); rejects with a "recognizePDF.couldNotRead" alert if the
	 *         converter fails or leaves no output file
	 */
	function _extractText(file, pages) {
		var cacheFile = Zotero.File.pathToFile(Zotero.DataDirectory.dir);
		cacheFile.append("recognizePDFcache.txt");
		// Drop any stale output left behind by an earlier run
		if(cacheFile.exists()) {
			cacheFile.remove(false);
		}

		var {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
		args.push('-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', pages, file.path, cacheFile.path);

		Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));

		return Zotero.Utilities.Internal.exec(exec, args).then(function() {
			if(!cacheFile.exists()) {
				throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
			}

			var lines = [];
			try {
				var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
					.createInstance(Components.interfaces.nsIFileInputStream);
				inputStream.init(cacheFile, 0x01, 0o664, 0);
				try {
					var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
						.createInstance(Components.interfaces.nsIConverterInputStream);
					intlStream.init(inputStream, "UTF-8", 65535,
						Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
					intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);

					// get the lines in this sample. readLine() reports whether MORE lines
					// follow, so the line it just returned must be handled before the
					// result is tested; otherwise a final line without a trailing
					// newline is lost.
					var str = {}, hasMore;
					do {
						hasMore = intlStream.readLine(str);
						var line = str.value.trim();
						if(line) lines.push(line);
					} while(hasMore);
				} finally {
					inputStream.close();
				}
			} finally {
				// The cache file is only a hand-off from the converter; never keep it
				cacheFile.remove(false);
			}

			return lines;
		}, function(e) {
			// Keep the converter's actual failure in the debug log; the caller
			// only sees the generic alert below
			Zotero.debug("RecognizePDF: " + e);
			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
		});
	}

	/**
	 * Runs a prepared Zotero.Translate instance and hands back the first item it
	 * produces. If the translator asks which items to select, the first offered
	 * entry is chosen automatically.
	 * @param {Zotero.Translate} translate Configured translate instance
	 * @param {Integer} libraryID Library passed through to translate()
	 * @return {Promise} Resolves with the first new item; rejects with
	 *         "No items found" when translation yields nothing
	 */
	var _promiseTranslate = Zotero.Promise.coroutine(function* (translate, libraryID) {
		translate.setHandler("select", function(translate, items, callback) {
			// Answer the prompt with only the first enumerable entry
			for (var key in items) {
				callback({ [key]: items[key] });
				break;
			}
		});

		var results = yield translate.translate({
			libraryID: libraryID,
			saveAttachments: false
		});
		if (!results.length) {
			throw new Error("No items found");
		}
		return results[0];
	});

	/**
	 * Scans free text for ISBN-10/ISBN-13 candidates and returns those accepted
	 * by Zotero.Utilities.cleanISBN, in the order they appear.
	 * @private
	 * @param {String} x Text to search
	 * @return {String[]} Array of ISBNs
	 */
	function _findISBNs(x) {
		if (typeof x !== "string") {
			throw "findISBNs: argument must be a string";
		}

		// Matches "isbn: ", "ISBN-10:" and similar; m-dashes and n-dashes count as separators
		var isbnRe = /(SBN|sbn)[ \u2014\u2013\u2012-]?(10|13)?[: ]*([0-9X][0-9X \u2014\u2013\u2012-]+)/g;
		var candidates = [];
		var found;

		while ((found = isbnRe.exec(x)) !== null) {
			var digits = found[3].replace(/[ \u2014\u2013\u2012-]/g, '');
			switch (digits.length) {
				case 20:
				case 26:
					// Two ISBNs of equal length run together (e.g. paper+hardback)
					var half = digits.length / 2;
					candidates.push(digits.slice(0, half), digits.slice(half));
					break;
				case 23:
					// An ISBN-10 immediately followed by an ISBN-13
					candidates.push(digits.slice(0, 10), digits.slice(10));
					break;
				case 10:
				case 13:
					candidates.push(digits);
					break;
			}
		}

		// Keep only what the validator accepts, in its cleaned form
		return candidates
			.map(function (candidate) { return Zotero.Utilities.cleanISBN(candidate); })
			.filter(function (cleaned) { return cleaned; });
	}

	/**
	 * @class Handles UI, etc. for recognizing multiple items
	 */
	this.ItemRecognizer = function () {
		// Items still waiting to be recognized; recognizeItems() replaces this
		// with a copy of the list it is given
		this._items = [];
	}

	this.ItemRecognizer.prototype = {
		"_stopped": false,
		"_itemsTotal": 0,
		"_progressWindow": null,
		"_progressIndicator": null,

		/**
		 * Retreives metadata for the PDF items passed, displaying a progress dialog during conversion
		 * and placing the PDFs as a children of the new items
		 * @param {Zotero.Item[]} items
		 */
		"recognizeItems": function(items) {
			var me = this;
			this._items = items.slice();
			this._itemTotal = items.length;

			_progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
			this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false);
		},

		/**
		 * Halts recognition of PDFs
		 */
		"stop": function() {
			// Only raises a flag; _recognizeItem() and the stop-check callback it
			// passes to Zotero_RecognizePDF.recognize() read it
			this._stopped = true;
		},

		/**
		 * Halts recognition and closes window
		 */
		"close": function() {
			// Flag the run as stopped first so any in-flight work winds down
			this.stop();
			this._progressWindow.close();
		},

		/**
		 * Called when the progress window has been opened; adds items to the tree and begins recognizing
		 * @param
		 */
		"_onWindowLoaded": function() {
			// populate progress window
			var treechildren = this._progressWindow.document.getElementById("treechildren");
			this._rowIDs = [];
			for(var i in this._items) {
				var treeitem = this._progressWindow.document.createElement('treeitem');
				var treerow = this._progressWindow.document.createElement('treerow');
				this._rowIDs.push(this._items[i].id);

				var treecell = this._progressWindow.document.createElement('treecell');
				treecell.setAttribute("id", "item-"+this._items[i].id+"-icon");
				treerow.appendChild(treecell);

				treecell = this._progressWindow.document.createElement('treecell');
				treecell.setAttribute("label", this._items[i].getField("title"));
				treerow.appendChild(treecell);

				treecell = this._progressWindow.document.createElement('treecell');
				treecell.setAttribute("id", "item-"+this._items[i].id+"-title");
				treerow.appendChild(treecell);

				treeitem.appendChild(treerow);
				treechildren.appendChild(treeitem);
			}

			var me = this;

			this._progressWindow.document.getElementById("tree").addEventListener(
				"dblclick", function(event) { me._onDblClick(event, this); });

			this._cancelHandler = function() { me.stop() };
			this._keypressCancelHandler = function(e) {
				if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop();
			};

			_progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
			this._progressWindow.document.getElementById("cancel-button")
				.addEventListener("command", this._cancelHandler, false);
			// Also cancel if the user presses Esc
			this._progressWindow.addEventListener("keypress", this._keypressCancelHandler);
			this._progressWindow.addEventListener("close", this._cancelHandler, false);
			Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit();
			return this._recognizeItem();
		},

		/**
		 * Shifts an item off of this._items and recognizes it, then calls itself again if there are more
		 *
		 * On success the recognized item is added to the same collections as the PDF
		 * and the PDF is re-parented under it. On failure the row shows the error and
		 * processing moves on to the next item.
		 * @return {Promise}
		 * @private
		 */
		"_recognizeItem": Zotero.Promise.coroutine(function* () {
			const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
			const FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
			const LOADING_IMAGE = "chrome://global/skin/icons/loading_16.png";

			// Queue exhausted: finish normally
			if(!this._items.length) {
				this._done();
				return;
			}

			// Order here matters. Otherwise we may show an incorrect label
			if(this._stopped) {
				this._done(true);
				return;
			}

			// Percentage of items that have already been taken off the queue
			this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;

			var item = this._items.shift(),
				itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"),
				itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"),
				// Position of this item's row, so the row can later point at the new item
				rowNumber = this._rowIDs.indexOf(item.id);
			itemIcon.setAttribute("src", LOADING_IMAGE);
			itemTitle.setAttribute("label", "");

			var file = item.getFile(), me = this;

			try {
				if (file) {
					// The arrow function is the stop-check callback: it reports the
					// current value of this._stopped whenever it is called
					let newItem = yield Zotero_RecognizePDF.recognize(
						file,
						item.libraryID,
						() => this._stopped
					);

					// If already stopped, delete
					if (this._stopped) {
						yield Zotero.Items.erase(newItem.id);
						throw new Zotero.Exception.Alert('recognizePDF.stopped');
					}

					// put new item in same collections as the old one
					let itemCollections = item.getCollections();
					yield Zotero.DB.executeTransaction(function* () {
						for (let i = 0; i < itemCollections.length; i++) {
							let collection = Zotero.Collections.get(itemCollections[i]);
							yield collection.addItem(newItem.id);
						}

						// put old item as a child of the new item
						item.parentID = newItem.id;
						yield item.save();
					});

					itemTitle.setAttribute("label", newItem.getField("title"));
					itemIcon.setAttribute("src", SUCCESS_IMAGE);
					// From now on a double-click on this row selects the new parent item
					this._rowIDs[rowNumber] = newItem.id;

					return this._recognizeItem();
				}
				else {
					throw new Zotero.Exception.Alert("recognizePDF.fileNotFound");
				}
			}
			catch (e) {
				Zotero.logError(e);

				// Alerts supply their own message; anything else gets the generic error text
				itemTitle.setAttribute(
					"label",
					e instanceof Zotero.Exception.Alert
						? e.message
						: Zotero.getString("recognizePDF.error")
				);
				itemIcon.setAttribute("src", FAILURE_IMAGE);

				// Don't show "completed" label if stopped on last item
				if (this._stopped && !this._items.length) {
					this._done(true);
				} else {
					return this._recognizeItem();
				}
			}
			finally {
				// scroll to this item
				// (runs on every exit path; on the `return this._recognizeItem()` paths
				// the recursive call has already been started by the time this executes)
				this._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(
					Math.max(0, this._itemTotal - this._items.length - 4)
				);
			}
		}),

		/**
		 * Cleans up after items are recognized, disabling the cancel button and
		 * making the progress window close on blur.
		 * @param {Boolean} cancelled Whether the process was cancelled
		 */
		"_done": function(cancelled) {
			this._progressIndicator.value = 100;
			// Switch out cancel for close
			var cancelButton = this._progressWindow.document.getElementById("cancel-button"),
				me = this;
			cancelButton.label = Zotero.getString("recognizePDF.close.label");
			cancelButton.removeEventListener("command", this._cancelHandler, false);
			cancelButton.addEventListener("command", function() { me.close() }, false);
			this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler);
			this._progressWindow.addEventListener("keypress", function() { me.close() });

			if(Zotero.isMac) {
				// On MacOS X, the windows are not always on top, so we hide them on
				// blur to avoid clutter
				this._setCloseTimer();
			}
			this._progressWindow.document.getElementById("label").value =
				cancelled ? Zotero.getString("recognizePDF.cancelled.label")
					: Zotero.getString("recognizePDF.complete.label");
		},

		/**
		 * Set a timer after which the window will close automatically. If the
		 * window is refocused, clear the timer and do not attempt to auto-close
		 * any more
		 * @private
		 */
		"_setCloseTimer": function() {
			var me = this, win = this._progressWindow;
			var focusListener = function() {
				if(!win.zoteroCloseTimeoutID) return;

				win.clearTimeout(win.zoteroCloseTimeoutID);
				delete win.zoteroCloseTimeoutID;

				win.removeEventListener('blur', blurListener, false);
				win.removeEventListener('focus', focusListener, false);
			};
			var blurListener = function() {
				// Close window after losing focus for 5 seconds
				win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000);
				// Prevent auto-close if we gain focus again
				win.addEventListener("focus", focusListener, false);
			};
			win.addEventListener("blur", blurListener, false);
		},

		/**
		 * Focus items in Zotero library when double-clicking them in the Retrieve
		 * metadata window.
		 * @param {Event} event
		 * @param {tree} tree XUL tree object
		 * @private
		 */
		"_onDblClick": function(event, tree) {
			if (event && tree && event.type == "dblclick") {
				var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)];
				if(!itemID) return;

				// Get the right window. In tab mode, it's the container window
				var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window);

				if (lastWin.ZoteroOverlay) {
					lastWin.ZoteroOverlay.toggleDisplay(true);
				}

				lastWin.ZoteroPane.selectItem(itemID, false, true);
				lastWin.focus();
			}
		}
	};

	/**
	 * Singleton for querying Google Scholar. Ensures that all queries are
	 * sequential and respect the delay in between queries.
	 * @namespace
	 */
	this.GSFullTextSearch = new function() {
		// Minimum spacing between two Google Scholar requests (see queryGoogle)
		const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms
		var queryLimitReached = false, // set when a query fails with "recognizePDF.limit"
			inProgress = false, // true while _processQueue() is working through the queue
			queue = [], // pending requests pushed by findItem()
			stopCheckCallback; // As long as we process one query at a time, this is ok
		// Load nsICookieManager2
		Components.utils.import("resource://gre/modules/Services.jsm");
		// Used by _clearGSCookies() to enumerate and remove cookies
		var cookieService = Services.cookies;

		/**
		 * Reset "Query Limit Reached" flag, so that we attempt to query Google again
		 *
		 * While the flag is set, findItem() and _processQueue() reject requests
		 * with a "recognizePDF.limit" alert instead of querying.
		 */
		this.resetQueryLimit = function() {
			queryLimitReached = false;
		};

		/**
		 * Adds a request to the Google Scholar queue and starts processing the
		 * queue if it is idle
		 * @param {String[]} lines Lines of text to use for full-text query
		 * @param {Integer | null} libraryID Library to save the item to
		 * @param {Function} stopCheckCallback Function that returns true if the
		 *                   process is to be interrupted
		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
		 */
		this.findItem = function(lines, libraryID, stopCheckCallback) {
			// With the limit hit and nothing running there is no queue to wait
			// for, so fail right away
			if (queryLimitReached && !inProgress) {
				return Zotero.Promise.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
			}

			var pending = Zotero.Promise.defer();
			queue.push({ deferred: pending, lines, libraryID, stopCheckCallback });
			_processQueue();
			return pending.promise;
		};

		/**
		 * Works through the Google Scholar queue one request at a time
		 * @private
		 * @param {Boolean} proceed Whether we should pop the next item off the queue
		 *                  This should not be true unless being called after processing
		 *                  another item
		 */
		function _processQueue(proceed) {
			// A query is already running and we are not its continuation
			if (inProgress && !proceed) return;

			if (queue.length == 0) {
				inProgress = false;
				return;
			}

			inProgress = true;

			if (queryLimitReached) {
				// Irreversibly blocked. Reject remaining items in queue
				while (queue.length) {
					queue.shift().deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
				}
				// The queue is empty now, so this call just clears inProgress
				_processQueue(true);
				return;
			}

			var job = queue.shift();

			// Shared with the CAPTCHA helpers for the duration of this query
			stopCheckCallback = job.stopCheckCallback;
			if (stopCheckCallback && stopCheckCallback()) {
				job.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped'));
				_processQueue(true);
				return;
			}

			var lookup = Zotero.Promise.try(function () {
				var candidateLines = getGoodLines(job.lines);
				return queryGoogle(candidateLines, job.libraryID, 3); // Try querying 3 times
			})
			.finally(function () { _processQueue(true); });

			job.deferred.resolve(lookup);
		}

		/**
		 * Select lines that are good candidates for Google Scholar query
		 *
		 * Keeps the first column of up to 100 lines that contain more than three
		 * words, then returns those whose length is within 6 characters of the
		 * (upper) median length, with double quotes removed.
		 * @private
		 * @param {String[]} lines
		 * @return {String[]}
		 * @throws {Zotero.Exception.Alert} "recognizePDF.noOCR" if fewer than 20
		 *         usable lines are found or the text starts with the Google Books
		 *         scan notice
		 */
		function getGoodLines(lines) {
			// Use only first column from multi-column lines
			const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
			var cleanedLines = [], cleanedLineLengths = [];
			for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
				var m = lineRe.exec(
					lines[i]
					// Replace non-breaking spaces
					.replace(/\xA0/g, ' ')
				);
				if(m && m[1].split(' ').length > 3) {
					cleanedLines.push(m[1]);
					cleanedLineLengths.push(m[1].length);
				}
			}

			// Get (not quite) median length
			var lineLengthsLength = cleanedLineLengths.length;
			if(lineLengthsLength < 20
					|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
				throw new Zotero.Exception.Alert("recognizePDF.noOCR");
			}

			// Sort a COPY, numerically. sort() works in place and compares as strings
			// by default; sorting cleanedLineLengths itself would break its pairing
			// with cleanedLines and rank e.g. 100 before 30.
			var sortedLengths = cleanedLineLengths.slice().sort(function (a, b) { return a - b; }),
				medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];

			// Pick lines within 6 chars of the median (this is completely arbitrary)
			var goodLines = [],
				uBound = medianLength + 6,
				lBound = medianLength - 6;
			for (var i=0; i<lineLengthsLength; i++) {
				if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
					// Strip ALL quotation marks so they don't mess up search query quoting
					var line = cleanedLines[i].replace(/"/g, '');
					goodLines.push(line);
				}
			}
			return goodLines;
		}

		/**
		 * Query Google Scholar
		 *
		 * Builds a quoted-phrase query of at least 25 words from goodLines, waits
		 * out the inter-query delay, fetches the results page (going through the
		 * CAPTCHA helpers) and hands the page to a web translator. If no translator
		 * is detected for the page, the query is retried with the remaining lines.
		 * @private
		 * @param {String[]} goodLines Candidate lines; lines are removed from this
		 *                   array as they are used
		 * @param {Integer | null} libraryID
		 * @param {Integer} tries Number of queries to attempt before giving up
		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
		 */
		var queryGoogle = Zotero.Promise.coroutine(function* (goodLines, libraryID, tries) {
			if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches");

			// Take the relevant parts of some lines (exclude hyphenated word)
			var queryString = "", queryStringWords = 0, nextLine = 0;
			while(queryStringWords < 25) {
				// Ran out of lines before collecting enough words
				if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");

				// splice() removes the line, so a retry never reuses it
				var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
				// Try to avoid picking adjacent strings so the odds of them appearing in another
				// document quoting our document is low. Every 7th line is a magic value
				nextLine = (nextLine + 7) % goodLines.length;

				// Get rid of first and last words
				words.shift();
				words.pop();
				// Make sure there are no long words (probably OCR mistakes)
				var skipLine = false;
				for(var i=0; i<words.length; i++) {
					if(words[i].length > 20) {
						skipLine = true;
						break;
					}
				}
				// Add words to query
				if(!skipLine && words.length) {
					queryStringWords += words.length;
					queryString += '"'+words.join(" ")+'" ';
				}
			}

			Zotero.debug("RecognizePDF: Query string " + queryString);

			var url = "https://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
				// Time still to wait so that queries are at least GOOGLE_SCHOLAR_QUERY_DELAY apart
				delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);

			// Delay
			if (delay > 0) {
				yield Zotero.Promise.delay(delay);
			}
			Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
			try {
				// Both a successful and a failed response are checked for a CAPTCHA
				let xmlhttp = yield Zotero.HTTP.request("GET", url, { "responseType": "document" })
					.then(
						function (xmlhttp) {
							return _checkCaptchaOK(xmlhttp, 3);
						},
						function (e) {
							return _checkCaptchaError(e, 3);
						}
					);

				let doc = xmlhttp.response,
					deferred = Zotero.Promise.defer(),
					translate = new Zotero.Translate.Web();

				translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
				translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
				translate.setHandler("translators", function(translate, detected) {
					if(detected.length) {
						deferred.resolve(_promiseTranslate(translate, libraryID));
					} else {
						// Nothing detected on this results page: try again with the remaining lines
						deferred.resolve(Zotero.Promise.try(function() {
							return queryGoogle(goodLines, libraryID, tries-1);
						}));
					}
				});
				translate.getTranslators();

				// Returned, not yielded: a later rejection of this promise does not
				// pass through the catch block below
				return deferred.promise;
			}
			catch (e) {
				// Remember the block so that the queue rejects everything that follows
				if(e.name == "recognizePDF.limit") {
					queryLimitReached = true;
				}
				throw e;
			}
		});

		/**
		 * Inspects a successfully loaded page and, if it contains a CAPTCHA form,
		 * passes it on to _solveCaptcha
		 * @private
		 * @param {XMLHttpRequest} xmlhttp
		 * @param {Integer} tries Number of queries to attempt before giving up
		 * @return {XMLHttpRequest|Promise} xmlhttp itself when there is no CAPTCHA,
		 *         otherwise the result of _solveCaptcha
		 */
		function _checkCaptchaOK(xmlhttp, tries) {
			var interrupted = stopCheckCallback && stopCheckCallback();
			if (interrupted) {
				throw new Zotero.Exception.Alert('recognizePDF.stopped');
			}

			Zotero.debug("RecognizePDF: (" + xmlhttp.status + ") Got page with title " + xmlhttp.response.title);

			var captchaForms = Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']");
			if (!captchaForms.length) {
				return xmlhttp;
			}

			Zotero.debug("RecognizePDF: Found CAPTCHA on page.");
			return _solveCaptcha(xmlhttp, tries);
		}

		/**
		 * Handles a failed Google Scholar request. For 403 and 503 responses that
		 * carry a page, looks for a CAPTCHA on it. If there is none, clears the
		 * Google Scholar cookies once and reloads the page.
		 * @private
		 * @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object
		 * @param {Integer} tries Number of queries to attempt before giving up
		 * @param {Boolean} dontClearCookies Whether to skip clearing cookies in
		 *                  order to get the CAPTCHA to show up
		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
		 */
		var _checkCaptchaError = Zotero.Promise.coroutine(function* (e, tries, dontClearCookies) {
			if (stopCheckCallback && stopCheckCallback()) {
				throw new Zotero.Exception.Alert('recognizePDF.stopped');
			}

			Zotero.debug("RecognizePDF: Checking for CAPTCHA on Google Scholar error page (" + e.status + ")");

			// Only 403/503 responses that carry a page are handled here;
			// everything else is passed along untouched
			var isBlockPage = e instanceof Zotero.HTTP.UnexpectedStatusException
				&& (e.status == 403 || e.status == 503) && e.xmlhttp.response;
			if (!isBlockPage) {
				throw e;
			}

			// Check for captcha on error page
			if (_extractCaptchaFormData(e.xmlhttp.response)) {
				Zotero.debug("RecognizePDF: CAPTCHA found");
				return _solveCaptcha(e.xmlhttp, tries);
			}

			// The original URL is needed for a reload, and cookies are cleared only once
			if (dontClearCookies || !e.xmlhttp.channel) {
				Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page"
					+ " with status " + e.status);
				throw new Zotero.Exception.Alert('recognizePDF.limit');
			}

			// AFAICT, for 403 errors, GS just says "sorry, try later",
			// but if you clear cookies, you get a CAPTCHA
			Zotero.debug("RecognizePDF: No CAPTCHA detected on page. Clearing cookies.");
			var originalURI = e.xmlhttp.channel.originalURI;
			if (!_clearGSCookies(originalURI.host)) {
				//user said no or no cookies removed
				throw new Zotero.Exception.Alert('recognizePDF.limit');
			}

			// Redo GET request
			Zotero.debug("RecognizePDF: Reloading page after clearing cookies.");
			return Zotero.HTTP.request("GET", originalURI.spec, { "responseType": "document" })
				.then(
					function (reloaded) {
						return _checkCaptchaOK(reloaded, tries);
					},
					function (reloadError) {
						return _checkCaptchaError(reloadError, tries, true); // Don't try this again
					}
				);
		});

		/**
		 * Prompt user to enter CAPTCHA
		 *
		 * Shows the CAPTCHA image in a modal dialog, submits the form with the
		 * user's answer and runs the response through the CAPTCHA checks again.
		 * @private
		 * @param {XMLHttpRequest} xmlhttp
		 * @param {Integer} [tries] Number of queries to attempt before giving up
		 * @return {Promise} A promise resolved when PDF metadata has been retrieved
		 * @throws {Zotero.Exception.Alert} "recognizePDF.limit" if no tries are left,
		 *         no CAPTCHA form is found or the user enters nothing
		 */
		function _solveCaptcha(xmlhttp, tries) {
			var doc = xmlhttp.response;

			var triesLeft = tries === undefined ? 3 : tries;

			if(!triesLeft) {
				Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts.");
				throw new Zotero.Exception.Alert('recognizePDF.limit');
			}

			// This attempt uses up one try
			triesLeft--;
			var formData = doc && _extractCaptchaFormData(doc);
			if(!formData) {
				Zotero.debug("RecognizePDF: Could not find CAPTCHA on page.");
				throw new Zotero.Exception.Alert('recognizePDF.limit');
			}

			var io = { dataIn: {
				title: Zotero.getString("recognizePDF.captcha.title"),
				description: Zotero.getString("recognizePDF.captcha.description"),
				imgUrl: formData.img
			}};

			// The dialog is opened with the "modal" feature; its answer comes back in io.dataOut
			_progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
				"chrome,modal,resizable=no,centerscreen", io);

			if(!io.dataOut) {
				Zotero.debug("RecognizePDF: No CAPTCHA entered");
				throw new Zotero.Exception.Alert('recognizePDF.limit');
			}

			Zotero.debug('RecognizePDF: User entered "' + io.dataOut.captcha + '" for CAPTCHA');
			formData.input.captcha = io.dataOut.captcha;

			// Re-submit every named form field, now including the answer
			var query = Object.keys(formData.input)
				.map(function (prop) {
					return encodeURIComponent(prop) + '=' + encodeURIComponent(formData.input[prop]);
				})
				.join('&');
			var url = formData.action + '?' + query;

			// Same request helper as the other Google Scholar requests in this file
			return Zotero.HTTP.request("GET", url, {"responseType":"document"})
				.then(function(xmlhttp) {
					return _checkCaptchaOK(xmlhttp, triesLeft);
				},
				function(e) {
					return _checkCaptchaError(e, triesLeft);
				});
		}

		/**
		 * Collects what is needed to answer a CAPTCHA page: the source of the first
		 * image plus the action and named input values of the first form.
		 * @private
		 * @param {Document} doc DOM document object for the CAPTCHA page
		 * @return {Object|undefined} { img, action, input, continue }, or undefined
		 *         when the page has no image or no form
		 */
		function _extractCaptchaFormData(doc) {
			var firstImage = doc.getElementsByTagName('img')[0];
			if (!firstImage) return;
			var imgSrc = firstImage.src;

			var firstForm = doc.forms[0];
			if (!firstForm) return;
			var formAction = firstForm.action;

			// Name/value pairs of all named inputs; unnamed ones are skipped
			var fields = {};
			var inputs = firstForm.getElementsByTagName('input');
			for (var idx = 0, total = inputs.length; idx < total; idx++) {
				var field = inputs[idx];
				if (field.name) {
					fields[field.name] = field.value;
				}
			}

			return {
				img: imgSrc,
				action: formAction,
				input: fields,
				// NOTE(review): this lives beside `input`, not inside it, so it is not
				// among the submitted form fields - confirm that is intended
				continue: "https://scholar.google.com"
			};
		}

		/**
		 * Removes the "GDSESS" and "PREF" cookies returned for the given host so
		 * that Google Scholar shows a CAPTCHA page. The user is not asked first.
		 * @private
		 * @param {String} host Host of the Google Scholar page (in case it's proxied)
		 * @return {Boolean} Whether any cookies were cleared
		 */
		function _clearGSCookies(host) {
			// GDSESS doesn't seem to always be enough
			var targetNames = ["GDSESS", "PREF"];
			var removedAny = false;

			var cookieEnum = cookieService.getCookiesFromHost(host);
			while (cookieEnum.hasMoreElements()) {
				var cookie = cookieEnum.getNext().QueryInterface(Components.interfaces.nsICookie2);
				if (targetNames.indexOf(cookie.name) === -1) continue;

				Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host "
					+ cookie.host + " and path " + cookie.path);
				cookieService.remove(cookie.host, cookie.name, cookie.path, false);
				removedAny = true;
			}

			if (!removedAny) {
				Zotero.debug("RecognizePDF: No cookies removed");
			}

			return removedAny;
		}
	};
}