[Retrieve Metadata] Add/fix comments
This commit is contained in:
parent
859e99f020
commit
3c21e7c999
1 changed files with 119 additions and 29 deletions
|
@ -65,6 +65,8 @@ var Zotero_RecognizePDF = new function() {
|
|||
*
|
||||
* @param {nsIFile} file The PDF file to retrieve metadata for
|
||||
* @param {Integer|null} libraryID The library in which to save the PDF
|
||||
* @param {Function} stopCheckCallback Function that returns true if the
|
||||
* process is to be interrupted
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
this.recognize = function(file, libraryID, stopCheckCallback) {
|
||||
|
@ -273,6 +275,9 @@ var Zotero_RecognizePDF = new function() {
|
|||
this._stopped = true;
|
||||
},
|
||||
|
||||
/**
|
||||
* Halts recognition and closes window
|
||||
*/
|
||||
"close": function() {
|
||||
this.stop();
|
||||
this._progressWindow.close();
|
||||
|
@ -308,7 +313,7 @@ var Zotero_RecognizePDF = new function() {
|
|||
}
|
||||
|
||||
var me = this;
|
||||
// Tree double-click handler
|
||||
|
||||
this._progressWindow.document.getElementById("tree").addEventListener(
|
||||
"dblclick", function(event) { me._onDblClick(event, this); });
|
||||
|
||||
|
@ -390,7 +395,7 @@ var Zotero_RecognizePDF = new function() {
|
|||
.catch(function(error) {
|
||||
Zotero.debug(error);
|
||||
Zotero.logError(error);
|
||||
|
||||
|
||||
itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error"));
|
||||
itemIcon.setAttribute("src", FAILURE_IMAGE);
|
||||
|
||||
|
@ -407,8 +412,9 @@ var Zotero_RecognizePDF = new function() {
|
|||
},
|
||||
|
||||
/**
|
||||
* Cleans up after items are recognized, disabling the cancel button and making the progress window
|
||||
* close on blur
|
||||
* Cleans up after items are recognized, disabling the cancel button and
|
||||
* making the progress window close on blur.
|
||||
* @param {Boolean} cancelled Whether the process was cancelled
|
||||
*/
|
||||
"_done": function(cancelled) {
|
||||
this._progressIndicator.value = 100;
|
||||
|
@ -422,8 +428,8 @@ var Zotero_RecognizePDF = new function() {
|
|||
this._progressWindow.addEventListener("keypress", function() { me.close() });
|
||||
|
||||
if(Zotero.isMac) {
|
||||
//on MacOS X, the windows are not always on top, so we hide them on blur
|
||||
// to avoid clutter
|
||||
// On MacOS X, the windows are not always on top, so we hide them on
|
||||
// blur to avoid clutter
|
||||
this._setCloseTimer();
|
||||
}
|
||||
this._progressWindow.document.getElementById("label").value =
|
||||
|
@ -431,6 +437,12 @@ var Zotero_RecognizePDF = new function() {
|
|||
: Zotero.getString("recognizePDF.complete.label");
|
||||
},
|
||||
|
||||
/**
|
||||
* Set a timer after which the window will close automatically. If the
|
||||
* window is refocused, clear the timer and do not attempt to auto-close
|
||||
* any more
|
||||
* @private
|
||||
*/
|
||||
"_setCloseTimer": function() {
|
||||
var me = this, win = this._progressWindow;
|
||||
var focusListener = function() {
|
||||
|
@ -443,19 +455,27 @@ var Zotero_RecognizePDF = new function() {
|
|||
win.removeEventListener('focus', focusListener, false);
|
||||
};
|
||||
var blurListener = function() {
|
||||
//close window after losing focus for 5 seconds
|
||||
// Close window after losing focus for 5 seconds
|
||||
win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000);
|
||||
//re-set timer if we gain focus again
|
||||
// Prevent auto-close if we gain focus again
|
||||
win.addEventListener("focus", focusListener, false);
|
||||
};
|
||||
win.addEventListener("blur", blurListener, false);
|
||||
},
|
||||
|
||||
/**
|
||||
* Focus items in Zotero library when double-clicking them in the Retrieve
|
||||
* metadata window.
|
||||
* @param {Event} event
|
||||
* @param {tree} tree XUL tree object
|
||||
* @private
|
||||
*/
|
||||
"_onDblClick": function(event, tree) {
|
||||
if (event && tree && event.type == "dblclick") {
|
||||
var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)];
|
||||
if(!itemID) return;
|
||||
|
||||
// Get the right window. In tab mode, it's the container window
|
||||
var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window);
|
||||
|
||||
if (lastWin.ZoteroOverlay) {
|
||||
|
@ -468,23 +488,39 @@ var Zotero_RecognizePDF = new function() {
|
|||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Singleton for querying Google Scholar. Ensures that all queries are
|
||||
* sequential and respect the delay inbetween queries.
|
||||
* @namespace
|
||||
*/
|
||||
this.GSFullTextSearch = new function() {
|
||||
const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms
|
||||
const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms
|
||||
var queryLimitReached = false,
|
||||
inProgress = false,
|
||||
queue = [],
|
||||
stopCheckCallback; // As long as we process one query at a time, this is ok
|
||||
//load nsICookieManager2
|
||||
// Load nsICookieManager2
|
||||
Components.utils.import("resource://gre/modules/Services.jsm");
|
||||
var cookieService = Services.cookies;
|
||||
|
||||
/**
|
||||
* Reset "Query Limit Reached" flag, so that we attempt to query Google again
|
||||
*/
|
||||
this.resetQueryLimit = function() {
|
||||
queryLimitReached = false;
|
||||
};
|
||||
|
||||
/**
|
||||
* Queue up item for Google Scholar query
|
||||
* @param {String[]} lines Lines of text to use for full-text query
|
||||
* @param {Integer | null} libraryID Library to save the item to
|
||||
* @param {Function} stopCheckCallback Function that returns true if the
|
||||
* process is to be interrupted
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
this.findItem = function(lines, libraryID, stopCheckCallback) {
|
||||
if(!inProgress && queryLimitReached) {
|
||||
//there's no queue, so we can reject immediately
|
||||
// There's no queue, so we can reject immediately
|
||||
return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
|
||||
}
|
||||
|
||||
|
@ -499,6 +535,13 @@ var Zotero_RecognizePDF = new function() {
|
|||
return deferred.promise;
|
||||
};
|
||||
|
||||
/**
|
||||
* Process Google Scholar queue
|
||||
* @private
|
||||
* @param {Boolean} proceed Whether we should pop the next item off the queue
|
||||
* This should not be true unless being called after processing
|
||||
* another item
|
||||
*/
|
||||
function _processQueue(proceed) {
|
||||
if(inProgress && !proceed) return; //only one at a time
|
||||
|
||||
|
@ -509,12 +552,12 @@ var Zotero_RecognizePDF = new function() {
|
|||
|
||||
inProgress = true;
|
||||
if(queryLimitReached) {
|
||||
//irreversibly blocked. Reject remaining items in queue
|
||||
// Irreversibly blocked. Reject remaining items in queue
|
||||
var item;
|
||||
while(item = queue.shift()) {
|
||||
item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
|
||||
}
|
||||
_processQueue(true); //wrap it up
|
||||
_processQueue(true); // Wrap it up
|
||||
} else {
|
||||
var item = queue.shift();
|
||||
|
||||
|
@ -528,13 +571,19 @@ var Zotero_RecognizePDF = new function() {
|
|||
item.deferred.resolve(
|
||||
Q.try(getGoodLines, item.lines)
|
||||
.then(function(lines) {
|
||||
return queryGoogle(lines, item.libraryID, 3); //try querying 3 times
|
||||
return queryGoogle(lines, item.libraryID, 3); // Try querying 3 times
|
||||
})
|
||||
.finally(function() { _processQueue(true); })
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Select lines that are good candidates for Google Scholar query
|
||||
* @private
|
||||
* @param {String[]} lines
|
||||
* @return {String[]}
|
||||
*/
|
||||
function getGoodLines(lines) {
|
||||
// Use only first column from multi-column lines
|
||||
const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
|
||||
|
@ -547,7 +596,7 @@ var Zotero_RecognizePDF = new function() {
|
|||
}
|
||||
}
|
||||
|
||||
// get (not quite) median length
|
||||
// Get (not quite) median length
|
||||
var lineLengthsLength = cleanedLineLengths.length;
|
||||
if(lineLengthsLength < 20
|
||||
|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
|
||||
|
@ -557,7 +606,7 @@ var Zotero_RecognizePDF = new function() {
|
|||
var sortedLengths = cleanedLineLengths.sort(),
|
||||
medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
|
||||
|
||||
// pick lines within 6 chars of the median (this is completely arbitrary)
|
||||
// Pick lines within 6 chars of the median (this is completely arbitrary)
|
||||
var goodLines = [],
|
||||
uBound = medianLength + 6,
|
||||
lBound = medianLength - 6;
|
||||
|
@ -571,6 +620,14 @@ var Zotero_RecognizePDF = new function() {
|
|||
return goodLines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Query Google Scholar
|
||||
* @private
|
||||
* @param {String[]} goodLines
|
||||
* @param {Integer | null} libraryID
|
||||
* @param {Integer} tries Number of queries to attempt before giving up
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
function queryGoogle(goodLines, libraryID, tries) {
|
||||
if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
|
||||
|
||||
|
@ -584,10 +641,10 @@ var Zotero_RecognizePDF = new function() {
|
|||
// document quoting our document is low. Every 7th line is a magic value
|
||||
nextLine = (nextLine + 7) % goodLines.length;
|
||||
|
||||
// get rid of first and last words
|
||||
// Get rid of first and last words
|
||||
words.shift();
|
||||
words.pop();
|
||||
// make sure there are no long words (probably OCR mistakes)
|
||||
// Make sure there are no long words (probably OCR mistakes)
|
||||
var skipLine = false;
|
||||
for(var i=0; i<words.length; i++) {
|
||||
if(words[i].length > 20) {
|
||||
|
@ -595,7 +652,7 @@ var Zotero_RecognizePDF = new function() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
// add words to query
|
||||
// Add words to query
|
||||
if(!skipLine && words.length) {
|
||||
queryStringWords += words.length;
|
||||
queryString += '"'+words.join(" ")+'" ';
|
||||
|
@ -647,42 +704,57 @@ var Zotero_RecognizePDF = new function() {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for CAPTCHA on a page with HTTP 200 status
|
||||
* @private
|
||||
* @param {XMLHttpRequest} xmlhttp
|
||||
* @param {Integer} tries Number of queries to attempt before giving up
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
function _checkCaptchaOK(xmlhttp, tries) {
|
||||
if(stopCheckCallback && stopCheckCallback()) {
|
||||
throw new Zotero.Exception.Alert('recognizePDF.stopped');
|
||||
}
|
||||
|
||||
//check for captcha on page with HTTP 200 status
|
||||
if(Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']").length) {
|
||||
return _solveCaptcha(xmlhttp, tries);
|
||||
}
|
||||
return xmlhttp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for CAPTCHA on an error page. Handle 403 and 503 pages
|
||||
* @private
|
||||
* @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object
|
||||
* @param {Integer} tries Number of queries to attempt before giving up
|
||||
* @param {Boolean} dontClearCookies Whether to attempt to clear cookies in
|
||||
* in order to get CAPTCHA to show up
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
function _checkCaptchaError(e, tries, dontClearCookies) {
|
||||
if(stopCheckCallback && stopCheckCallback()) {
|
||||
throw new Zotero.Exception.Alert('recognizePDF.stopped');
|
||||
}
|
||||
|
||||
//check for captcha on error page
|
||||
// Check for captcha on error page
|
||||
if(e instanceof Zotero.HTTP.UnexpectedStatusException
|
||||
&& (e.status == 403 || e.status == 503) && e.xmlhttp.response) {
|
||||
if(_extractCaptchaFormData(e.xmlhttp.response)) {
|
||||
return _solveCaptcha(e.xmlhttp, tries);
|
||||
} else if(!dontClearCookies && e.xmlhttp.channel) { //make sure we can obtain original URL
|
||||
//AFAICT, for 403 errors, GS just says "sorry, try later",
|
||||
// but if you clear cookies, you get a captcha
|
||||
} else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL
|
||||
// AFAICT, for 403 errors, GS just says "sorry, try later",
|
||||
// but if you clear cookies, you get a CAPTCHA
|
||||
if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) {
|
||||
//user said no or no cookies removed
|
||||
throw new Zotero.Exception.Alert('recognizePDF.limit');
|
||||
}
|
||||
//redo GET request
|
||||
// Redo GET request
|
||||
return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"})
|
||||
.then(function(xmlhttp) {
|
||||
return _checkCaptchaOK(xmlhttp, tries, true); //don't try this again
|
||||
return _checkCaptchaOK(xmlhttp, tries);
|
||||
},
|
||||
function(e) {
|
||||
return _checkCaptchaError(e, tries, true); //don't try this again
|
||||
return _checkCaptchaError(e, tries, true); // Don't try this again
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -693,6 +765,13 @@ var Zotero_RecognizePDF = new function() {
|
|||
throw e;
|
||||
}
|
||||
|
||||
/**
|
||||
* Prompt user to enter CPATCHA
|
||||
* @private
|
||||
* @param {XMLHttpRequest} xmlhttp
|
||||
* @param {Integer} [tries] Number of queries to attempt before giving up
|
||||
* @return {Promise} A promise resolved when PDF metadata has been retrieved
|
||||
*/
|
||||
function _solveCaptcha(xmlhttp, tries) {
|
||||
var doc = xmlhttp.response;
|
||||
|
||||
|
@ -742,6 +821,12 @@ var Zotero_RecognizePDF = new function() {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract CAPTCHA form-related data from the CAPTCHA page
|
||||
* @private
|
||||
* @param {Document} doc DOM document object for the CAPTCHA page
|
||||
* @return {Object} Object containing data describing CAPTCHA form
|
||||
*/
|
||||
function _extractCaptchaFormData(doc) {
|
||||
var formData = {};
|
||||
|
||||
|
@ -765,6 +850,12 @@ var Zotero_RecognizePDF = new function() {
|
|||
return formData;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear Google cookies to get the CAPTCHA page to appear
|
||||
* @private
|
||||
* @param {String} host Host of the Google Scholar page (in case it's proxied)
|
||||
* @return {Boolean} Whether any cookies were cleared
|
||||
*/
|
||||
function _clearGSCookies(host) {
|
||||
/* There don't seem to be any negative effects of deleting GDSESS
|
||||
if(!Zotero.isStandalone) {
|
||||
|
@ -779,11 +870,10 @@ var Zotero_RecognizePDF = new function() {
|
|||
if(!response) return;
|
||||
}*/
|
||||
|
||||
//find GDSESS cookie
|
||||
var removed = false, cookies = cookieService.getCookiesFromHost(host);
|
||||
while(cookies.hasMoreElements()) {
|
||||
var cookie = cookies.getNext().QueryInterface(Components.interfaces.nsICookie2);
|
||||
if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) {
|
||||
if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { // GDSESS doesn't seem to always be enough
|
||||
Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host "
|
||||
+ cookie.host + " and path " + cookie.path);
|
||||
cookieService.remove(cookie.host, cookie.name, cookie.path, false);
|
||||
|
|
Loading…
Add table
Reference in a new issue