[Retrieve Metadata] Use a single queue to query Google Scholar. Window closing tweaks.

* Close window on blur after completion on Mac (revert previous change)
* Don't close window when canceling
* Add Esc handler to cancel/close window
* Allow columns to be resized
* Fixes #445
* Fixes #444
This commit is contained in:
aurimasv 2014-01-14 02:17:58 -06:00
parent 4bedb61aa2
commit 57350fae1e
7 changed files with 465 additions and 249 deletions

View file

@ -28,6 +28,27 @@ var Zotero_Captcha = new function() {
this.onLoad = function() { this.onLoad = function() {
this._io = window.arguments[0]; this._io = window.arguments[0];
var description = document.getElementById('zotero-captcha-description'),
errorMsg = document.getElementById('zotero-captcha-error');
if(this._io.dataIn.title) {
document.title = this._io.dataIn.title;
}
if(this._io.dataIn.description) {
description.textContent = this._io.dataIn.description;
description.hidden = false;
} else {
description.hidden = true;
}
if(this._io.dataIn.error) {
errorMsg.textContent = this._io.dataIn.error;
errorMsg.hidden = false;
} else {
errorMsg.hidden = true;
}
document.getElementById('zotero-captcha-image').src = this._io.dataIn.imgUrl; document.getElementById('zotero-captcha-image').src = this._io.dataIn.imgUrl;
document.getElementById('zotero-captcha-input').focus(); document.getElementById('zotero-captcha-input').focus();
} }

View file

@ -1,16 +1,22 @@
<?xml version="1.0" ?> <?xml version="1.0"?>
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?> <?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
<?xml-stylesheet href="chrome://zotero/skin/zotero.css" type="text/css"?>
<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd"> <!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd">
<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" <window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
title="&zotero.captcha.title;"
onload="Zotero_Captcha.onLoad();" onload="Zotero_Captcha.onLoad();"
id="zotero-captcha"> id="zotero-captcha"
onkeypress="if(event.keyCode === KeyEvent.DOM_VK_ESCAPE) Zotero_Captcha.cancel();">
<script src="include.js"/>
<script src="captcha.js"/> <script src="captcha.js"/>
<vbox style="padding:10px" align="center" flex="1"> <vbox style="padding:10px" align="center" flex="1">
<description id="zotero-captcha-description"></description>
<image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" /> <image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" />
<description id="zotero-captcha-error"></description>
<textbox id="zotero-captcha-input" <textbox id="zotero-captcha-input"
onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" /> onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" />
<hbox> <hbox>

View file

@ -14,7 +14,9 @@
<tree flex="1" id="tree" hidecolumnpicker="true"> <tree flex="1" id="tree" hidecolumnpicker="true">
<treecols> <treecols>
<treecol id="success-col" style="width:20px;"/> <treecol id="success-col" style="width:20px;"/>
<splitter class="tree-splitter" hidden="true"/>
<treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/> <treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/>
<splitter class="tree-splitter"/>
<treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/> <treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/>
</treecols> </treecols>
<treechildren id="treechildren"/> <treechildren id="treechildren"/>

View file

@ -33,7 +33,7 @@
*/ */
var Zotero_RecognizePDF = new function() { var Zotero_RecognizePDF = new function() {
Components.utils.import("resource://zotero/q.js"); Components.utils.import("resource://zotero/q.js");
var _progressWindow, _progressIndicator, itemRecognizer; var _progressWindow, _progressIndicator;
/** /**
* Checks whether a given PDF could theoretically be recognized * Checks whether a given PDF could theoretically be recognized
@ -56,7 +56,7 @@ var Zotero_RecognizePDF = new function() {
var items = ZoteroPane_Local.getSelectedItems(); var items = ZoteroPane_Local.getSelectedItems();
if (!items) return; if (!items) return;
itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer();
itemRecognizer.recognizeItems(items); itemRecognizer.recognizeItems(items);
} }
@ -67,9 +67,8 @@ var Zotero_RecognizePDF = new function() {
* @param {Integer|null} libraryID The library in which to save the PDF * @param {Integer|null} libraryID The library in which to save the PDF
* @return {Promise} A promise resolved when PDF metadata has been retrieved * @return {Promise} A promise resolved when PDF metadata has been retrieved
*/ */
this.recognize = function(file, libraryID) { this.recognize = function(file, libraryID, stopCheckCallback) {
const MAX_PAGES = 7; const MAX_PAGES = 7;
const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms
var me = this; var me = this;
return _extractText(file, MAX_PAGES).then(function(lines) { return _extractText(file, MAX_PAGES).then(function(lines) {
@ -116,129 +115,7 @@ var Zotero_RecognizePDF = new function() {
// If no DOI or ISBN, query Google Scholar // If no DOI or ISBN, query Google Scholar
return promise.fail(function(error) { return promise.fail(function(error) {
Zotero.debug("RecognizePDF: "+error); Zotero.debug("RecognizePDF: "+error);
return me.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback);
// Don't try Google Scholar if we already reached query limit
if(itemRecognizer._gsQueryLimitReached) throw new Zotero.Exception.Alert("recognizePDF.limit");
// Use only first column from multi-column lines
const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
var cleanedLines = [], cleanedLineLengths = [];
for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
var m = lineRe.exec(lines[i]);
if(m && m[1].split(' ').length > 3) {
cleanedLines.push(m[1]);
cleanedLineLengths.push(m[1].length);
}
}
// get (not quite) median length
var lineLengthsLength = cleanedLineLengths.length;
if(lineLengthsLength < 20
|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
throw new Zotero.Exception.Alert("recognizePDF.noOCR");
}
var sortedLengths = cleanedLineLengths.sort(),
medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
// pick lines within 6 chars of the median (this is completely arbitrary)
var goodLines = [],
uBound = medianLength + 6,
lBound = medianLength - 6;
for (var i=0; i<lineLengthsLength; i++) {
if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
// Strip quotation marks so they don't mess up search query quoting
var line = cleanedLines[i].replace('"', '');
goodLines.push(line);
}
}
var nextLine = 0,
limited = false,
queryGoogle = function() {
// If the users fails (or chooses not) to solve the CAPTCHA, don't keep trying
if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit");
// Take the relevant parts of some lines (exclude hyphenated word)
var queryString = "", queryStringWords = 0;
while(queryStringWords < 25) {
if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
// Try to avoid picking adjacent strings so the odds of them appearing in another
// document quoting our document is low. Every 7th line is a magic value
nextLine = (nextLine + 7) % goodLines.length;
// get rid of first and last words
words.shift();
words.pop();
// make sure there are no long words (probably OCR mistakes)
var skipLine = false;
for(var i=0; i<words.length; i++) {
if(words[i].length > 20) {
skipLine = true;
break;
}
}
// add words to query
if(!skipLine && words.length) {
queryStringWords += words.length;
queryString += '"'+words.join(" ")+'" ';
}
}
Zotero.debug("RecognizePDF: Query string "+queryString);
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
// Delay
return (delay > 0 ? Q.delay(delay) : Q.when())
.then(function() {
Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
})
.then(function(xmlhttp) {
var doc = xmlhttp.response,
deferred = Q.defer(),
translate = new Zotero.Translate.Web();
if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) {
return _solveCaptcha(xmlhttp, 3);
}
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
translate.setHandler("translators", function(translate, detected) {
if(detected.length) {
deferred.resolve(_promiseTranslate(translate, libraryID));
} else {
deferred.reject(new Zotero.Exception.Alert("recognizePDF.noMatches"));
}
});
translate.getTranslators();
return deferred.promise;
}, function(e) {
if(e instanceof Zotero.HTTP.UnexpectedStatusException
&& (e.status == 403 || e.status == 503)) {
return _solveCaptcha(e.xmlhttp, 3); // Give the user 3 chances to get it right
}
throw e;
});
};
var retryCount = 2;
var retryGS = function(e) {
if(!retryCount--) throw e;
// Only retry if we can't find matches
if(e instanceof Zotero.Exception.Alert && e.name == "recognizePDF.noMatches") {
return queryGoogle().catch(retryGS);
}
throw e;
}
return queryGoogle().catch(retryGS);
}); });
}); });
} }
@ -362,80 +239,6 @@ var Zotero_RecognizePDF = new function() {
return validIsbns; return validIsbns;
} }
function _extractCaptchaFormData(doc) {
var formData = {};
var img = doc.getElementsByTagName('img')[0];
if(!img) return;
formData.img = img.src;
var form = doc.forms[0];
if(!form) return;
formData.action = form.action;
formData.input = {};
var inputs = form.getElementsByTagName('input');
for(var i=0, n=inputs.length; i<n; i++) {
if(!inputs[i].name) continue;
formData.input[inputs[i].name] = inputs[i].value;
}
formData.continue = "http://scholar.google.com";
return formData;
}
function _solveCaptcha(xmlhttp, tries) {
var doc = xmlhttp.response;
if(tries === undefined) tries = 3;
if(!tries) throw new Zotero_RecognizePDF.CaptchaResult(false);
tries--;
var formData = doc && _extractCaptchaFormData(doc);
if(!formData) throw new Zotero.Exception.Alert('recognizePDF.limit');
var io = { dataIn: {
imgUrl: formData.img
}};
_progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
"chrome,modal,resizable=no,centerscreen", io);
if(!io.dataOut) {
return Q.reject(new Zotero_RecognizePDF.CaptchaResult(false));
}
formData.input.captcha = io.dataOut.captcha;
var url = '', prop;
for(prop in formData.input) {
url += '&' + encodeURIComponent(prop) + '='
+ encodeURIComponent(formData.input[prop]);
}
url = formData.action + '?' + url.substr(1);
return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
.then(function() {
throw new Zotero_RecognizePDF.CaptchaResult(true);
})
.catch(function(e) {
if(e instanceof Zotero.HTTP.UnexpectedStatusException
&& (e.status == 403 || e.status == 503)) {
return _solveCaptcha(e.xmlhttp, tries);
}
throw e;
});
}
this.CaptchaResult = function(success) {
this.success = success;
};
this.CaptchaResult.prototype.toString = function() {
return this.success ? "CAPTCHA successful" : "CAPTCHA failed";
};
/** /**
* @class Handles UI, etc. for recognizing multiple items * @class Handles UI, etc. for recognizing multiple items
*/ */
@ -448,7 +251,6 @@ var Zotero_RecognizePDF = new function() {
"_itemsTotal": 0, "_itemsTotal": 0,
"_progressWindow": null, "_progressWindow": null,
"_progressIndicator": null, "_progressIndicator": null,
"_gsQueryLimitReached": false,
/** /**
* Retreives metadata for the PDF items passed, displaying a progress dialog during conversion * Retreives metadata for the PDF items passed, displaying a progress dialog during conversion
@ -470,7 +272,12 @@ var Zotero_RecognizePDF = new function() {
"stop": function() { "stop": function() {
this._stopped = true; this._stopped = true;
}, },
/**
 * Stops recognition (via stop()) and then closes the progress window.
 */
"close": function() {
this.stop();
this._progressWindow.close();
},
/** /**
* Called when the progress window has been opened; adds items to the tree and begins recognizing * Called when the progress window has been opened; adds items to the tree and begins recognizing
* @param * @param
@ -499,13 +306,18 @@ var Zotero_RecognizePDF = new function() {
} }
var me = this; var me = this;
this._cancelHandler = function() { me.stop() };
this._keypressCancelHandler = function(e) {
if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop();
};
_progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); _progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() { this._progressWindow.document.getElementById("cancel-button")
me.stop(); .addEventListener("command", this._cancelHandler, false);
me._progressWindow.close(); // Also cancel if the user presses Esc
}, false); this._progressWindow.addEventListener("keypress", this._keypressCancelHandler);
this._progressWindow.addEventListener("close", function() { me.stop() }, false); this._progressWindow.addEventListener("close", this._cancelHandler, false);
this._gsQueryLimitReached = false; // Clear query limit flag Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit();
this._recognizeItem(); this._recognizeItem();
}, },
@ -514,8 +326,6 @@ var Zotero_RecognizePDF = new function() {
* @private * @private
*/ */
"_recognizeItem": function() { "_recognizeItem": function() {
if(this._stopped) return;
Components.utils.import("resource://zotero/q.js"); Components.utils.import("resource://zotero/q.js");
const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
@ -527,6 +337,12 @@ var Zotero_RecognizePDF = new function() {
return; return;
} }
// Order here matters. Otherwise we may show an incorrect label
if(this._stopped) {
this._done(true);
return;
}
this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100; this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;
var item = this._items.shift(), var item = this._items.shift(),
@ -538,13 +354,13 @@ var Zotero_RecognizePDF = new function() {
var file = item.getFile(), me = this; var file = item.getFile(), me = this;
(file (file
? Zotero_RecognizePDF.recognize(file, item.libraryID) ? Zotero_RecognizePDF.recognize(file, item.libraryID, function() { return me._stopped; })
: Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound"))) : Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound")))
.then(function(newItem) { .then(function(newItem) {
// If already stopped, delete // If already stopped, delete
if(me._stopped) { if(me._stopped) {
Zotero.Items.erase(item.id); Zotero.Items.erase(newItem.id);
return; throw new Zotero.Exception.Alert('recognizePDF.stopped');
} }
// put new item in same collections as the old one // put new item in same collections as the old one
@ -562,32 +378,23 @@ var Zotero_RecognizePDF = new function() {
itemIcon.setAttribute("src", SUCCESS_IMAGE); itemIcon.setAttribute("src", SUCCESS_IMAGE);
me._recognizeItem(); me._recognizeItem();
}, function(error) { })
if(error instanceof Zotero_RecognizePDF.CaptchaResult && error.success) { .catch(function(error) {
// Redo last item
me._items.unshift(item);
me._recognizeItem();
return;
}
Zotero.debug(error); Zotero.debug(error);
Zotero.logError(error); Zotero.logError(error);
if(error instanceof Zotero_RecognizePDF.CaptchaResult && !error.success) {
error = new Zotero.Exception.Alert("recognizePDF.limit");
}
if(error instanceof Zotero.Exception.Alert && error.name === "recognizePDF.limit") {
this._gsQueryLimitReached = true;;
}
itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error")); itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error"));
itemIcon.setAttribute("src", FAILURE_IMAGE); itemIcon.setAttribute("src", FAILURE_IMAGE);
me._recognizeItem(); // Don't show "completed" label if stopped on last item
}).fin(function() { if(me._stopped && !me._items.length) {
me._done(true);
} else {
me._recognizeItem();
}
}).finally(function() {
// scroll to this item // scroll to this item
me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5)); me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-4));
}).done(); }).done();
}, },
@ -595,11 +402,376 @@ var Zotero_RecognizePDF = new function() {
* Cleans up after items are recognized, disabling the cancel button and making the progress window * Cleans up after items are recognized, disabling the cancel button and making the progress window
* close on blur * close on blur
*/ */
"_done": function() { "_done": function(cancelled) {
this._progressIndicator.value = 100; this._progressIndicator.value = 100;
this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label"); // Switch out cancel for close
var me = this; var cancelButton = this._progressWindow.document.getElementById("cancel-button"),
this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label"); me = this;
cancelButton.label = Zotero.getString("recognizePDF.close.label");
cancelButton.removeEventListener("command", this._cancelHandler, false);
cancelButton.addEventListener("command", function() { me.close() }, false);
this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler);
this._progressWindow.addEventListener("keypress", function() { me.close() });
if(Zotero.isMac) {
//on MacOS X, the windows are not always on top, so we hide them on blur
// to avoid clutter
this._setCloseTimer();
}
this._progressWindow.document.getElementById("label").value =
cancelled ? Zotero.getString("recognizePDF.cancelled.label")
: Zotero.getString("recognizePDF.complete.label");
},
"_setCloseTimer": function() {
var me = this, win = this._progressWindow;
var focusListener = function() {
if(!win.zoteroCloseTimeoutID) return;
win.clearTimeout(win.zoteroCloseTimeoutID);
delete win.zoteroCloseTimeoutID;
win.removeEventListener('blur', blurListener, false);
win.removeEventListener('focus', focusListener, false);
};
var blurListener = function() {
//close window after losing focus for 5 seconds
win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000);
//re-set timer if we gain focus again
win.addEventListener("focus", focusListener, false);
};
win.addEventListener("blur", blurListener, false);
} }
} };
this.GSFullTextSearch = new function() {
const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms
var queryLimitReached = false,
inProgress = false,
queue = [],
stopCheckCallback; // As long as we process one query at a time, this is ok
//load nsICookieManager2
Components.utils.import("resource://gre/modules/Services.jsm");
var cookieService = Services.cookies;
/**
 * Clears the "query limit reached" flag so that subsequent findItem() calls
 * are queued again instead of being rejected outright.
 */
this.resetQueryLimit = function() {
queryLimitReached = false;
};
/**
 * Queues a Google Scholar full-text lookup for the given text lines.
 *
 * @param {String[]} lines Text lines passed on to the query builder
 * @param {Integer|null} libraryID Passed through to the translation step
 * @param {Function} [stopCheckCallback] Returns true once the caller has cancelled
 * @return {Promise} Promise settled by the queue processor; rejected right away
 *   with a "recognizePDF.limit" alert if the query limit was already reached
 *   and nothing is currently being processed
 */
this.findItem = function(lines, libraryID, stopCheckCallback) {
	var queueIsIdle = !inProgress;
	if(queueIsIdle && queryLimitReached) {
		// Nothing is running that could still change the outcome, so fail fast
		return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
	}

	var request = {
		deferred: Q.defer(),
		lines: lines,
		libraryID: libraryID,
		stopCheckCallback: stopCheckCallback
	};
	queue.push(request);
	_processQueue();
	return request.deferred.promise;
};
/**
 * Works through the lookup queue, handling a single entry at a time.
 *
 * @param {Boolean} [proceed] Pass true to continue with the next entry even though
 *   inProgress is already set; the function uses this when calling itself after
 *   an entry has been dealt with. Calls without it return immediately while a
 *   lookup is in progress.
 */
function _processQueue(proceed) {
if(inProgress && !proceed) return; //only one at a time
// Queue drained: mark idle so the next findItem() call restarts processing
if(!queue.length) {
inProgress = false;
return;
}
inProgress = true;
if(queryLimitReached) {
//irreversibly blocked. Reject remaining items in queue
var item;
while(item = queue.shift()) {
item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
}
_processQueue(true); //wrap it up
} else {
var item = queue.shift();
// Shared with the CAPTCHA helpers; safe only because one entry is handled at a time
stopCheckCallback = item.stopCheckCallback;
// Skip entries whose caller has already cancelled
if(stopCheckCallback && stopCheckCallback()) {
item.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped'));
_processQueue(true);
return;
}
// Settle the caller's promise with the outcome of the lookup chain and move
// on to the next entry once that chain finishes, whether it succeeded or failed
item.deferred.resolve(
Q.try(getGoodLines, item.lines)
.then(function(lines) {
return queryGoogle(lines, item.libraryID, 3); //try querying 3 times
})
.finally(function() { _processQueue(true); })
);
}
}
/**
 * Picks the text lines best suited for building a search query: lines whose
 * first column has more than three words and whose length is close to the
 * (approximate) median line length.
 *
 * @param {String[]} lines Raw text lines; at most the first 100 usable lines are considered
 * @return {String[]} Selected lines, in their original order, with all double
 *   quotation marks removed
 * @throws {Zotero.Exception.Alert} "recognizePDF.noOCR" if fewer than 20 usable
 *   lines are found or the text starts with the Google Books scan notice
 */
function getGoodLines(lines) {
	// Use only first column from multi-column lines
	const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
	var cleanedLines = [], cleanedLineLengths = [];
	for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
		var m = lineRe.exec(lines[i]);
		if(m && m[1].split(' ').length > 3) {
			cleanedLines.push(m[1]);
			cleanedLineLengths.push(m[1].length);
		}
	}

	// get (not quite) median length
	var lineLengthsLength = cleanedLineLengths.length;
	if(lineLengthsLength < 20
			|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
		throw new Zotero.Exception.Alert("recognizePDF.noOCR");
	}

	// Sort a copy, numerically. Sorting in place would break the index
	// correspondence between cleanedLines and cleanedLineLengths that the
	// selection loop below depends on, and the default comparator orders
	// numbers as strings (e.g. 100 before 30).
	var sortedLengths = cleanedLineLengths.slice().sort(function(a, b) { return a - b; }),
		medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];

	// pick lines within 6 chars of the median (this is completely arbitrary)
	var goodLines = [],
		uBound = medianLength + 6,
		lBound = medianLength - 6;
	for (var i=0; i<lineLengthsLength; i++) {
		if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
			// Strip every quotation mark so they don't mess up search query quoting
			var line = cleanedLines[i].replace(/"/g, '');
			goodLines.push(line);
		}
	}
	return goodLines;
}
/**
 * Builds a quoted-phrase query from some of the given lines, sends it to Google
 * Scholar and hands the result page to the translator.
 *
 * @param {String[]} goodLines Candidate lines; consumed (spliced) as phrases are
 *   taken, so a retry continues with the lines that are left
 * @param {Integer|null} libraryID Passed through to _promiseTranslate
 * @param {Integer} tries Remaining query attempts; when no translator is detected
 *   for the result page the function calls itself with tries-1
 * @return {Promise} Resolved with the result of _promiseTranslate
 * @throws {Zotero.Exception.Alert} "recognizePDF.noMatches" when tries is used up
 *   or the lines run out before the query is long enough
 */
function queryGoogle(goodLines, libraryID, tries) {
if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
// Take the relevant parts of some lines (exclude hyphenated word)
var queryString = "", queryStringWords = 0, nextLine = 0;
// Keep adding phrases until the query holds at least 25 words
while(queryStringWords < 25) {
if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
// Try to avoid picking adjacent strings so the odds of them appearing in another
// document quoting our document is low. Every 7th line is a magic value
nextLine = (nextLine + 7) % goodLines.length;
// get rid of first and last words
words.shift();
words.pop();
// make sure there are no long words (probably OCR mistakes)
var skipLine = false;
for(var i=0; i<words.length; i++) {
if(words[i].length > 20) {
skipLine = true;
break;
}
}
// add words to query
if(!skipLine && words.length) {
queryStringWords += words.length;
queryString += '"'+words.join(" ")+'" ';
}
}
Zotero.debug("RecognizePDF: Query string " + queryString);
// delay is the time still missing until GOOGLE_SCHOLAR_QUERY_DELAY ms have
// passed since the last recorded Google Scholar query
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
// Delay
return (delay > 0 ? Q.delay(delay) : Q())
.then(function() {
Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
})
// Both the success page and an error response may carry a CAPTCHA; each
// handler starts the CAPTCHA handling with 3 attempts
.then(function(xmlhttp) {
return _checkCaptchaOK(xmlhttp, 3);
},
function(e) {
return _checkCaptchaError(e, 3);
})
.then(function(xmlhttp) {
var doc = xmlhttp.response,
deferred = Q.defer(),
translate = new Zotero.Translate.Web();
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
translate.setHandler("translators", function(translate, detected) {
if(detected.length) {
deferred.resolve(_promiseTranslate(translate, libraryID));
} else {
// Nothing detected on this page: retry with the remaining lines.
// Q.try turns a synchronous throw from the retry into a rejection.
deferred.resolve(Q.try(function() {
return queryGoogle(goodLines, libraryID, tries-1);
}));
}
});
translate.getTranslators();
return deferred.promise;
})
.catch(function(e) {
// Remember that the limit was hit so that queued lookups are rejected
if(e.name == "recognizePDF.limit") {
queryLimitReached = true;
}
throw e;
});
}
/**
 * Inspects a successfully loaded page for a CAPTCHA form.
 *
 * @param {Object} xmlhttp Request object whose response is the loaded document
 * @param {Integer} tries Remaining CAPTCHA attempts, passed on to _solveCaptcha
 * @return {Object|Promise} The same xmlhttp when no CAPTCHA form is present,
 *   otherwise whatever _solveCaptcha returns
 * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" if the current
 *   stop-check callback reports cancellation
 */
function _checkCaptchaOK(xmlhttp, tries) {
	var cancelled = stopCheckCallback && stopCheckCallback();
	if(cancelled) {
		throw new Zotero.Exception.Alert('recognizePDF.stopped');
	}

	// A CAPTCHA can be served on a page with HTTP 200 status as well
	var captchaForms = Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']");
	return captchaForms.length ? _solveCaptcha(xmlhttp, tries) : xmlhttp;
}
/**
 * Handles a failed Google Scholar request, looking for a CAPTCHA on the error page.
 *
 * @param {Error} e The error the request was rejected with
 * @param {Integer} tries Remaining CAPTCHA attempts, passed on to _solveCaptcha
 * @param {Boolean} [dontClearCookies] When true, the clear-cookies-and-retry
 *   step is skipped (set on the retry so that it happens only once)
 * @return {Promise} Result of _solveCaptcha, or of the repeated request
 * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" when cancelled;
 *   "recognizePDF.limit" when a 403/503 page offers no way forward
 * @throws The original error if it is not a 403/503 response with a body
 */
function _checkCaptchaError(e, tries, dontClearCookies) {
if(stopCheckCallback && stopCheckCallback()) {
throw new Zotero.Exception.Alert('recognizePDF.stopped');
}
//check for captcha on error page
if(e instanceof Zotero.HTTP.UnexpectedStatusException
&& (e.status == 403 || e.status == 503) && e.xmlhttp.response) {
if(_extractCaptchaFormData(e.xmlhttp.response)) {
return _solveCaptcha(e.xmlhttp, tries);
} else if(!dontClearCookies && e.xmlhttp.channel) { //make sure we can obtain original URL
//AFAICT, for 403 errors, GS just says "sorry, try later",
// but if you clear cookies, you get a captcha
if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) {
//user said no or no cookies removed
throw new Zotero.Exception.Alert('recognizePDF.limit');
}
//redo GET request
// NOTE(review): _checkCaptchaOK declares only (xmlhttp, tries); the third
// argument passed below appears to be ignored by it
return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"})
.then(function(xmlhttp) {
return _checkCaptchaOK(xmlhttp, tries, true); //don't try this again
},
function(e) {
return _checkCaptchaError(e, tries, true); //don't try this again
});
}
// 403/503 page without a CAPTCHA and no (further) cookie retry possible
Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page"
+ " with status " + e.status);
throw new Zotero.Exception.Alert('recognizePDF.limit');
}
throw e;
}
/**
 * Shows the CAPTCHA from the given response to the user and submits the answer.
 *
 * @param {Object} xmlhttp Request object whose response document holds the CAPTCHA
 * @param {Integer} [tries=3] Remaining attempts; decremented before each prompt
 * @return {Promise} Outcome of checking the page returned for the submitted
 *   answer (which may prompt again while attempts remain)
 * @throws {Zotero.Exception.Alert} "recognizePDF.limit" when no attempts are left,
 *   no CAPTCHA data can be extracted, or the dialog yields no answer
 */
function _solveCaptcha(xmlhttp, tries) {
var doc = xmlhttp.response;
if(tries === undefined) tries = 3;
if(!tries) {
Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts.");
throw new Zotero.Exception.Alert('recognizePDF.limit');
}
tries--;
var formData = doc && _extractCaptchaFormData(doc);
if(!formData) {
Zotero.debug("RecognizePDF: Could not find CAPTCHA on page.");
throw new Zotero.Exception.Alert('recognizePDF.limit');
}
var io = { dataIn: {
title: Zotero.getString("recognizePDF.captcha.title"),
description: Zotero.getString("recognizePDF.captcha.description"),
imgUrl: formData.img
}};
// Opened with the "modal" feature; io.dataOut is read right after the call
// (presumably filled in by the dialog when the user submits — confirm in captcha.js)
_progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
"chrome,modal,resizable=no,centerscreen", io);
if(!io.dataOut) {
Zotero.debug("RecognizePDF: No CAPTCHA entered");
throw new Zotero.Exception.Alert('recognizePDF.limit');
}
formData.input.captcha = io.dataOut.captcha;
// Rebuild the form submission as a GET query string from the collected inputs
var url = '', prop;
for(prop in formData.input) {
url += '&' + encodeURIComponent(prop) + '='
+ encodeURIComponent(formData.input[prop]);
}
// substr(1) drops the leading '&'
url = formData.action + '?' + url.substr(1);
return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
.then(function(xmlhttp) {
return _checkCaptchaOK(xmlhttp, tries);
},
function(e) {
return _checkCaptchaError(e, tries);
});
}
/**
 * Collects what is needed to answer a CAPTCHA from a document: the URL of the
 * first image, the action of the first form and that form's named inputs.
 *
 * @param {Document} doc Document to inspect
 * @return {Object|undefined} Object with img, action, input (name -> value) and
 *   continue properties, or undefined if the document has no image or no form
 */
function _extractCaptchaFormData(doc) {
	var firstImage = doc.getElementsByTagName('img')[0];
	if(!firstImage) return;
	var imageUrl = firstImage.src;

	var firstForm = doc.forms[0];
	if(!firstForm) return;
	var formAction = firstForm.action;

	// Inputs without a name cannot be submitted, so they are left out
	var fields = {};
	var inputElements = firstForm.getElementsByTagName('input');
	for(var idx=0, count=inputElements.length; idx<count; idx++) {
		var field = inputElements[idx];
		if(field.name) {
			fields[field.name] = field.value;
		}
	}

	return {
		img: imageUrl,
		action: formAction,
		input: fields,
		"continue": "http://scholar.google.com"
	};
}
/**
 * Removes the "GDSESS" and "PREF" cookies stored for the given host.
 *
 * The user is not asked for confirmation: a prompt for non-standalone builds
 * was drafted here but left disabled, since deleting GDSESS did not seem to
 * have any negative effects.
 *
 * @param {String} host Host whose cookies are examined
 * @return {Boolean} true if at least one cookie was removed
 */
function _clearGSCookies(host) {
	var targetNames = ["GDSESS", "PREF"],
		removedAny = false,
		enumerator = cookieService.getCookiesFromHost(host);

	while(enumerator.hasMoreElements()) {
		var entry = enumerator.getNext().QueryInterface(Components.interfaces.nsICookie2);
		if(targetNames.indexOf(entry.name) === -1) continue;

		Zotero.debug("RecognizePDF: Removing cookie " + entry.name + " for host "
			+ entry.host + " and path " + entry.path);
		cookieService.remove(entry.host, entry.name, entry.path, false);
		removedAny = true;
	}

	if(!removedAny) {
		Zotero.debug("RecognizePDF: No cookies removed");
	}
	return removedAny;
}
};
} }

View file

@ -255,7 +255,6 @@
<!ENTITY zotero.recognizePDF.cancel.label "Cancel"> <!ENTITY zotero.recognizePDF.cancel.label "Cancel">
<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name"> <!ENTITY zotero.recognizePDF.pdfName.label "PDF Name">
<!ENTITY zotero.recognizePDF.itemName.label "Item Name"> <!ENTITY zotero.recognizePDF.itemName.label "Item Name">
<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata.">
<!ENTITY zotero.rtfScan.title "RTF Scan"> <!ENTITY zotero.rtfScan.title "RTF Scan">
<!ENTITY zotero.rtfScan.cancel.label "Cancel"> <!ENTITY zotero.rtfScan.cancel.label "Cancel">
@ -284,6 +283,4 @@
<!ENTITY zotero.downloadManager.label "Save to Zotero"> <!ENTITY zotero.downloadManager.label "Save to Zotero">
<!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. This item will be saved to your library instead."> <!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. This item will be saved to your library instead.">
<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> <!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences.">
<!ENTITY zotero.captcha.title "Please enter CAPTCHA">

View file

@ -895,12 +895,16 @@ proxies.recognized.add = Add Proxy
recognizePDF.noOCR = PDF does not contain OCRed text. recognizePDF.noOCR = PDF does not contain OCRed text.
recognizePDF.couldNotRead = Could not read text from PDF. recognizePDF.couldNotRead = Could not read text from PDF.
recognizePDF.noMatches = No matching references found. recognizePDF.noMatches = No matching references found
recognizePDF.fileNotFound = File not found. recognizePDF.fileNotFound = File not found
recognizePDF.limit = Google Scholar query limit reached. Try again later. recognizePDF.limit = Google Scholar query limit reached. Try again later.
recognizePDF.error = An unexpected error occurred. recognizePDF.error = An unexpected error occurred.
recognizePDF.complete.label = Metadata Retrieval Complete. recognizePDF.stopped = Cancelled
recognizePDF.complete.label = Metadata Retrieval Complete
recognizePDF.cancelled.label = Metadata Retrieval Cancelled
recognizePDF.close.label = Close recognizePDF.close.label = Close
recognizePDF.captcha.title = Please enter CAPTCHA
recognizePDF.captcha.description = Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below.
rtfScan.openTitle = Select a file to scan rtfScan.openTitle = Select a file to scan
rtfScan.scanning.label = Scanning RTF Document… rtfScan.scanning.label = Scanning RTF Document…

View file

@ -303,7 +303,6 @@ label.zotero-text-link {
margin-bottom: 1em; margin-bottom: 1em;
} }
.zotero-small-progress-indicator { .zotero-small-progress-indicator {
list-style-image: url(chrome://global/skin/icons/notloading_16.png); list-style-image: url(chrome://global/skin/icons/notloading_16.png);
margin-left: -2px; margin-left: -2px;
@ -316,4 +315,19 @@ label.zotero-text-link {
#zotero-note-window { #zotero-note-window {
padding-bottom: 4px; padding-bottom: 4px;
}
#zotero-captcha-description {
max-width: 300px;
padding-bottom: 4px;
text-align: justify;
}
#zotero-captcha-error {
max-width: 300px;
padding-bottom: 4px;
padding-top: 4px;
font-weight: bold;
color: red;
text-align: center;
} }