From 3c21e7c999a63e5f556dc8d4c0ae27c7e31ab921 Mon Sep 17 00:00:00 2001 From: aurimasv Date: Tue, 28 Jan 2014 03:59:31 -0600 Subject: [PATCH] [Retrieve Metadata] Add/fix comments --- chrome/content/zotero/recognizePDF.js | 148 +++++++++++++++++++++----- 1 file changed, 119 insertions(+), 29 deletions(-) diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js index 125327e036..9b50eac23a 100644 --- a/chrome/content/zotero/recognizePDF.js +++ b/chrome/content/zotero/recognizePDF.js @@ -65,6 +65,8 @@ var Zotero_RecognizePDF = new function() { * * @param {nsIFile} file The PDF file to retrieve metadata for * @param {Integer|null} libraryID The library in which to save the PDF + * @param {Function} stopCheckCallback Function that returns true if the + * process is to be interrupted * @return {Promise} A promise resolved when PDF metadata has been retrieved */ this.recognize = function(file, libraryID, stopCheckCallback) { @@ -273,6 +275,9 @@ var Zotero_RecognizePDF = new function() { this._stopped = true; }, + /** + * Halts recognition and closes window + */ "close": function() { this.stop(); this._progressWindow.close(); @@ -308,7 +313,7 @@ var Zotero_RecognizePDF = new function() { } var me = this; - // Tree double-click handler + this._progressWindow.document.getElementById("tree").addEventListener( "dblclick", function(event) { me._onDblClick(event, this); }); @@ -390,7 +395,7 @@ var Zotero_RecognizePDF = new function() { .catch(function(error) { Zotero.debug(error); Zotero.logError(error); - + itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error")); itemIcon.setAttribute("src", FAILURE_IMAGE); @@ -407,8 +412,9 @@ var Zotero_RecognizePDF = new function() { }, /** - * Cleans up after items are recognized, disabling the cancel button and making the progress window - * close on blur + * Cleans up after items are recognized, disabling the cancel button and + * making the progress window close on blur. + * @param {Boolean} cancelled Whether the process was cancelled */ "_done": function(cancelled) { this._progressIndicator.value = 100; @@ -422,8 +428,8 @@ var Zotero_RecognizePDF = new function() { this._progressWindow.addEventListener("keypress", function() { me.close() }); if(Zotero.isMac) { - //on MacOS X, the windows are not always on top, so we hide them on blur - // to avoid clutter + // On MacOS X, the windows are not always on top, so we hide them on + // blur to avoid clutter this._setCloseTimer(); } this._progressWindow.document.getElementById("label").value = @@ -431,6 +437,12 @@ var Zotero_RecognizePDF = new function() { : Zotero.getString("recognizePDF.complete.label"); }, + /** + * Set a timer after which the window will close automatically. If the + * window is refocused, clear the timer and do not attempt to auto-close + * any more + * @private + */ "_setCloseTimer": function() { var me = this, win = this._progressWindow; var focusListener = function() { @@ -443,19 +455,27 @@ var Zotero_RecognizePDF = new function() { win.removeEventListener('focus', focusListener, false); }; var blurListener = function() { - //close window after losing focus for 5 seconds + // Close window after losing focus for 5 seconds win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000); - //re-set timer if we gain focus again + // Prevent auto-close if we gain focus again win.addEventListener("focus", focusListener, false); }; win.addEventListener("blur", blurListener, false); }, + /** + * Focus items in Zotero library when double-clicking them in the Retrieve + * metadata window. + * @param {Event} event + * @param {tree} tree XUL tree object + * @private + */ "_onDblClick": function(event, tree) { if (event && tree && event.type == "dblclick") { var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)]; if(!itemID) return; + // Get the right window. In tab mode, it's the container window var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window); if (lastWin.ZoteroOverlay) { @@ -468,23 +488,39 @@ var Zotero_RecognizePDF = new function() { } }; + /** + * Singleton for querying Google Scholar. Ensures that all queries are + * sequential and respect the delay inbetween queries. + * @namespace + */ this.GSFullTextSearch = new function() { - const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms + const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms var queryLimitReached = false, inProgress = false, queue = [], stopCheckCallback; // As long as we process one query at a time, this is ok - //load nsICookieManager2 + // Load nsICookieManager2 Components.utils.import("resource://gre/modules/Services.jsm"); var cookieService = Services.cookies; + /** + * Reset "Query Limit Reached" flag, so that we attempt to query Google again + */ this.resetQueryLimit = function() { queryLimitReached = false; }; + /** + * Queue up item for Google Scholar query + * @param {String[]} lines Lines of text to use for full-text query + * @param {Integer | null} libraryID Library to save the item to + * @param {Function} stopCheckCallback Function that returns true if the + * process is to be interrupted + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ this.findItem = function(lines, libraryID, stopCheckCallback) { if(!inProgress && queryLimitReached) { - //there's no queue, so we can reject immediately + // There's no queue, so we can reject immediately return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit")); } @@ -499,6 +535,13 @@ var Zotero_RecognizePDF = new function() { return deferred.promise; }; + /** + * Process Google Scholar queue + * @private + * @param {Boolean} proceed Whether we should pop the next item off the queue + * This should not be true unless being called after processing + * another item + */ function _processQueue(proceed) { if(inProgress && !proceed) return; //only one at a time @@ -509,12 +552,12 @@ var Zotero_RecognizePDF = new function() { inProgress = true; if(queryLimitReached) { - //irreversibly blocked. Reject remaining items in queue + // Irreversibly blocked. Reject remaining items in queue var item; while(item = queue.shift()) { item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit")); } - _processQueue(true); //wrap it up + _processQueue(true); // Wrap it up } else { var item = queue.shift(); @@ -528,13 +571,19 @@ var Zotero_RecognizePDF = new function() { item.deferred.resolve( Q.try(getGoodLines, item.lines) .then(function(lines) { - return queryGoogle(lines, item.libraryID, 3); //try querying 3 times + return queryGoogle(lines, item.libraryID, 3); // Try querying 3 times }) .finally(function() { _processQueue(true); }) ); } } + /** + * Select lines that are good candidates for Google Scholar query + * @private + * @param {String[]} lines + * @return {String[]} + */ function getGoodLines(lines) { // Use only first column from multi-column lines const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; @@ -547,7 +596,7 @@ var Zotero_RecognizePDF = new function() { } } - // get (not quite) median length + // Get (not quite) median length var lineLengthsLength = cleanedLineLengths.length; if(lineLengthsLength < 20 || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") { @@ -557,7 +606,7 @@ var Zotero_RecognizePDF = new function() { var sortedLengths = cleanedLineLengths.sort(), medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; - // pick lines within 6 chars of the median (this is completely arbitrary) + // Pick lines within 6 chars of the median (this is completely arbitrary) var goodLines = [], uBound = medianLength + 6, lBound = medianLength - 6; @@ -571,6 +620,14 @@ var Zotero_RecognizePDF = new function() { return goodLines; } + /** + * Query Google Scholar + * @private + * @param {String[]} goodLines + * @param {Integer | null} libraryID + * @param {Integer} tries Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ function queryGoogle(goodLines, libraryID, tries) { if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); @@ -584,10 +641,10 @@ var Zotero_RecognizePDF = new function() { // document quoting our document is low. Every 7th line is a magic value nextLine = (nextLine + 7) % goodLines.length; - // get rid of first and last words + // Get rid of first and last words words.shift(); words.pop(); - // make sure there are no long words (probably OCR mistakes) + // Make sure there are no long words (probably OCR mistakes) var skipLine = false; for(var i=0; i 20) { @@ -595,7 +652,7 @@ var Zotero_RecognizePDF = new function() { break; } } - // add words to query + // Add words to query if(!skipLine && words.length) { queryStringWords += words.length; queryString += '"'+words.join(" ")+'" '; @@ -647,42 +704,57 @@ var Zotero_RecognizePDF = new function() { }); } + /** + * Check for CAPTCHA on a page with HTTP 200 status + * @private + * @param {XMLHttpRequest} xmlhttp + * @param {Integer} tries Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ function _checkCaptchaOK(xmlhttp, tries) { if(stopCheckCallback && stopCheckCallback()) { throw new Zotero.Exception.Alert('recognizePDF.stopped'); } - //check for captcha on page with HTTP 200 status if(Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']").length) { return _solveCaptcha(xmlhttp, tries); } return xmlhttp; } + /** + * Check for CAPTCHA on an error page. Handle 403 and 503 pages + * @private + * @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object + * @param {Integer} tries Number of queries to attempt before giving up + * @param {Boolean} dontClearCookies Whether to attempt to clear cookies in + * in order to get CAPTCHA to show up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ function _checkCaptchaError(e, tries, dontClearCookies) { if(stopCheckCallback && stopCheckCallback()) { throw new Zotero.Exception.Alert('recognizePDF.stopped'); } - //check for captcha on error page + // Check for captcha on error page if(e instanceof Zotero.HTTP.UnexpectedStatusException && (e.status == 403 || e.status == 503) && e.xmlhttp.response) { if(_extractCaptchaFormData(e.xmlhttp.response)) { return _solveCaptcha(e.xmlhttp, tries); - } else if(!dontClearCookies && e.xmlhttp.channel) { //make sure we can obtain original URL - //AFAICT, for 403 errors, GS just says "sorry, try later", - // but if you clear cookies, you get a captcha + } else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL + // AFAICT, for 403 errors, GS just says "sorry, try later", + // but if you clear cookies, you get a CAPTCHA if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) { //user said no or no cookies removed throw new Zotero.Exception.Alert('recognizePDF.limit'); } - //redo GET request + // Redo GET request return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"}) .then(function(xmlhttp) { - return _checkCaptchaOK(xmlhttp, tries, true); //don't try this again + return _checkCaptchaOK(xmlhttp, tries); }, function(e) { - return _checkCaptchaError(e, tries, true); //don't try this again + return _checkCaptchaError(e, tries, true); // Don't try this again }); } @@ -693,6 +765,13 @@ var Zotero_RecognizePDF = new function() { throw e; } + /** + * Prompt user to enter CPATCHA + * @private + * @param {XMLHttpRequest} xmlhttp + * @param {Integer} [tries] Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ function _solveCaptcha(xmlhttp, tries) { var doc = xmlhttp.response; @@ -742,6 +821,12 @@ var Zotero_RecognizePDF = new function() { }); } + /** + * Extract CAPTCHA form-related data from the CAPTCHA page + * @private + * @param {Document} doc DOM document object for the CAPTCHA page + * @return {Object} Object containing data describing CAPTCHA form + */ function _extractCaptchaFormData(doc) { var formData = {}; @@ -765,6 +850,12 @@ var Zotero_RecognizePDF = new function() { return formData; } + /** + * Clear Google cookies to get the CAPTCHA page to appear + * @private + * @param {String} host Host of the Google Scholar page (in case it's proxied) + * @return {Boolean} Whether any cookies were cleared + */ function _clearGSCookies(host) { /* There don't seem to be any negative effects of deleting GDSESS if(!Zotero.isStandalone) { @@ -779,11 +870,10 @@ var Zotero_RecognizePDF = new function() { if(!response) return; }*/ - //find GDSESS cookie var removed = false, cookies = cookieService.getCookiesFromHost(host); while(cookies.hasMoreElements()) { var cookie = cookies.getNext().QueryInterface(Components.interfaces.nsICookie2); - if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { + if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { // GDSESS doesn't seem to always be enough Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host " + cookie.host + " and path " + cookie.path); cookieService.remove(cookie.host, cookie.name, cookie.path, false);