Merge pull request #217 from aurimasv/recognizePDF
Tweak recognizePDF to avoid false positives
This commit is contained in:
commit
3c6b0d99c3
1 changed files with 130 additions and 93 deletions
|
@ -243,7 +243,7 @@ Zotero_RecognizePDF.Recognizer = function () {}
|
|||
* (function will be passed image as URL and must return text of CAPTCHA)
|
||||
*/
|
||||
Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) {
|
||||
const MAX_PAGES = 3;
|
||||
const MAX_PAGES = 5;
|
||||
|
||||
this._libraryID = libraryID;
|
||||
this._callback = callback;
|
||||
|
@ -293,10 +293,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
|
|||
intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
|
||||
|
||||
// get the lines in this sample
|
||||
var lines = [],
|
||||
cleanedLines = [],
|
||||
cleanedLineLengths = [],
|
||||
str = {};
|
||||
var lines = [], str = {};
|
||||
while(intlStream.readLine(str)) {
|
||||
var line = str.value.trim();
|
||||
if(line) lines.push(line);
|
||||
|
@ -305,19 +302,59 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
|
|||
inputStream.close();
|
||||
cacheFile.remove(false);
|
||||
|
||||
var me = this;
|
||||
|
||||
// look for DOI
|
||||
var allText = lines.join("\n");
|
||||
Zotero.debug(allText);
|
||||
var m = Zotero.Utilities.cleanDOI(allText);
|
||||
if(m) {
|
||||
this._DOI = m;
|
||||
this._queryDOI(m, function() {
|
||||
me._cleanLines(lines, me._queryGoogle);
|
||||
});
|
||||
} else {
|
||||
this._cleanLines(lines, me._queryGoogle);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up item by DOI using a Zotero.Translate.Search translator; each item found is passed to this._callback
|
||||
* @private
|
||||
* @param {String} doi DOI to search for
|
||||
* @param {Function} onFail Callback function to call if a DOI is not found (invoked with this Recognizer as `this` when translation finishes without success)
|
||||
*/
|
||||
Zotero_RecognizePDF.Recognizer.prototype._queryDOI = function(doi, onFail) {
|
||||
var me = this;
|
||||
var translate = new Zotero.Translate.Search();
|
||||
// Hard-coded search translator ID; presumably the CrossRef translator -- TODO confirm
translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
|
||||
var item = {"itemType":"journalArticle", "DOI":doi};
|
||||
translate.setSearch(item);
|
||||
translate.setHandler("itemDone", function(translate, item) {
|
||||
me._callback(item);
|
||||
});
|
||||
translate.setHandler("select", function(translate, items, callback) {
|
||||
return me._selectItems(translate, items, callback);
|
||||
});
|
||||
// Only the failure case is handled here; success is reported through the "itemDone" handler above
translate.setHandler("done", function(translate, success) {
|
||||
if(!success) onFail.call(me);
|
||||
});
|
||||
translate.translate(this._libraryID, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepares a list of lines that can be used for querying
|
||||
* The lines are stored in this._goodLines
|
||||
* @private
|
||||
* @param {String[]} lines Array of lines
|
||||
* @param {Function} callback A callback function to be called on completing
|
||||
*/
|
||||
Zotero_RecognizePDF.Recognizer.prototype._cleanLines = function(lines, callback) {
|
||||
// Use only first column from multi-column lines
|
||||
const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
|
||||
for(var i=0; i<lines.length; i++) {
|
||||
var cleanedLines = [], cleanedLineLengths = [];
|
||||
for(var i=0; i<lines.length && cleanedLines.length<30; i++) {
|
||||
var m = lineRe.exec(lines[i]);
|
||||
if(m) {
|
||||
if(m && m[1].split(' ').length > 3) {
|
||||
cleanedLines.push(m[1]);
|
||||
cleanedLineLengths.push(m[1].length);
|
||||
}
|
||||
|
@ -334,8 +371,8 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
|
|||
|
||||
// pick lines within 6 chars of the median (this is completely arbitrary)
|
||||
this._goodLines = [];
|
||||
var uBound = medianLength + 4;
|
||||
var lBound = medianLength - 4;
|
||||
var uBound = medianLength + 6;
|
||||
var lBound = medianLength - 6;
|
||||
for (var i=0; i<lineLengthsLength; i++) {
|
||||
if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
|
||||
// Strip quotation marks so they don't mess up search query quoting
|
||||
|
@ -344,49 +381,51 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
|
|||
}
|
||||
}
|
||||
|
||||
this._startLine = this._iteration = 0;
|
||||
this._queryGoogle();
|
||||
this._nextLine = this._iteration = 0;
|
||||
callback.call(this);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Deletes the hidden browser (if one is set) and sends a failure message to this._callback
|
||||
* @private
|
||||
* @param {String} msg Message to be sent to this._callback (passed as the second argument, after `false`)
|
||||
*/
|
||||
Zotero_RecognizePDF.Recognizer.prototype._deleteBrowserAndFail = function(msg) {
|
||||
var me = this;
|
||||
try {
|
||||
// Any error thrown while deleting the hidden browser is ignored, so the failure callback below still runs
if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
|
||||
} catch(e) {}
|
||||
this._callback(false, msg);
|
||||
}
|
||||
|
||||
/**
|
||||
* Queries Google Scholar for metadata for this PDF
|
||||
* @private
|
||||
*/
|
||||
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
|
||||
if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
|
||||
try {
|
||||
if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
|
||||
} catch(e) {}
|
||||
this._callback(false, "recognizePDF.noMatches");
|
||||
if(this._iteration > 3 || !this._goodLines.length) {
|
||||
this._deleteBrowserAndFail("recognizePDF.noMatches");
|
||||
return;
|
||||
}
|
||||
this._iteration++;
|
||||
|
||||
var queryString = "";
|
||||
var me = this;
|
||||
if(this._DOI) {
|
||||
// use CrossRef to look for DOI
|
||||
var translate = new Zotero.Translate.Search();
|
||||
translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
|
||||
var item = {"itemType":"journalArticle", "DOI":this._DOI};
|
||||
translate.setSearch(item);
|
||||
translate.setHandler("itemDone", function(translate, item) {
|
||||
me._callback(item);
|
||||
});
|
||||
translate.setHandler("select", function(translate, items, callback) {
|
||||
return me._selectItems(translate, items, callback);
|
||||
});
|
||||
translate.setHandler("done", function(translate, success) {
|
||||
if(!success) me._queryGoogle();
|
||||
});
|
||||
translate.translate(this._libraryID, false);
|
||||
delete this._DOI;
|
||||
} else {
|
||||
|
||||
// take the relevant parts of some lines (exclude hyphenated word)
|
||||
var queryStringWords = 0;
|
||||
while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
|
||||
var words = this._goodLines[this._startLine].split(/\s+/);
|
||||
while(queryStringWords < 25) {
|
||||
if(!this._goodLines.length) {
|
||||
this._deleteBrowserAndFail("recognizePDF.noMatches");
|
||||
return;
|
||||
}
|
||||
|
||||
var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/);
|
||||
// Try to avoid picking adjacent strings so the odds of them appearing in another
|
||||
// document quoting our document are low. The step of 7 lines is a magic value
|
||||
this._nextLine = (this._nextLine + 7) % this._goodLines.length;
|
||||
|
||||
// get rid of first and last words
|
||||
words.shift();
|
||||
words.pop();
|
||||
|
@ -403,7 +442,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
|
|||
queryStringWords += words.length;
|
||||
queryString += '"'+words.join(" ")+'" ';
|
||||
}
|
||||
this._startLine++;
|
||||
}
|
||||
|
||||
Zotero.debug("RecognizePDF: Query string "+queryString);
|
||||
|
@ -442,7 +480,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
|
|||
this._hiddenBrowser.loadURIWithFlags(url,
|
||||
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To be executed when Google Scholar is loaded
|
||||
|
|
Loading…
Reference in a new issue