Merge pull request #217 from aurimasv/recognizePDF

Tweak recognizePDF to avoid false positives
2013-01-01 06:21:25 -08:00 · 2013-01-01 06:21:25 -08:00 · 3c6b0d99c3
commit 3c6b0d99c3
parent 11a83e5df3 ea6a1098a6
1 changed files with 130 additions and 93 deletions
--- a/chrome/content/zotero/recognizePDF.js
+++ b/chrome/content/zotero/recognizePDF.js
@ -243,7 +243,7 @@ Zotero_RecognizePDF.Recognizer = function () {}
 *	(function will be passed image as URL and must return text of CAPTCHA)
 */
 Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) {
-	const MAX_PAGES = 3;
+	const MAX_PAGES = 5;
 	
 	this._libraryID = libraryID;
 	this._callback = callback;
@ -293,10 +293,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 	intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);
 	
 	// get the lines in this sample
-	var lines = [],
-		cleanedLines = [],
-		cleanedLineLengths = [],
-		str = {};
+	var lines = [], str = {};
 	while(intlStream.readLine(str)) {
 		var line = str.value.trim();
 		if(line) lines.push(line);
@ -305,19 +302,59 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 	inputStream.close();
 	cacheFile.remove(false);

+	var me = this;
+
 	// look for DOI
 	var allText = lines.join("\n");
 	Zotero.debug(allText);
 	var m = Zotero.Utilities.cleanDOI(allText);
 	if(m) {
-		this._DOI = m;
+		this._queryDOI(m, function() {
+			me._cleanLines(lines, me._queryGoogle);
+		});
+	} else {
+		this._cleanLines(lines, me._queryGoogle);
+	}
 }

+/**
+ * Looks up item by DOI
+ * @private
+ * @param {String} doi DOI to search for
+ * @param {Function} onFail Callback function to call if the DOI lookup does not succeed
+ */
+Zotero_RecognizePDF.Recognizer.prototype._queryDOI = function(doi, onFail) {
+	var me = this;
+	var translate = new Zotero.Translate.Search();
+	translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
+	var item = {"itemType":"journalArticle", "DOI":doi};
+	translate.setSearch(item);
+	translate.setHandler("itemDone", function(translate, item) {
+		me._callback(item);
+	});
+	translate.setHandler("select", function(translate, items, callback) {
+		return me._selectItems(translate, items, callback);
+	});
+	translate.setHandler("done", function(translate, success) {
+		if(!success) onFail.call(me);
+	});
+	translate.translate(this._libraryID, false);
+}
+
+/**
+ * Prepares a list of lines that can be used for querying
+ * The lines are stored in this._goodLines
+ * @private
+ * @param {String[]} lines Array of lines
+ * @param {Function} callback A callback function to be called on completing
+ */
+Zotero_RecognizePDF.Recognizer.prototype._cleanLines = function(lines, callback) {
 	// Use only first column from multi-column lines
 	const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
-	for(var i=0; i<lines.length; i++) {
+	var cleanedLines = [], cleanedLineLengths = [];
+	for(var i=0; i<lines.length && cleanedLines.length<30; i++) {
 		var m = lineRe.exec(lines[i]);
-		if(m) {
+		if(m && m[1].split(' ').length > 3) {
 			cleanedLines.push(m[1]);
 			cleanedLineLengths.push(m[1].length);
 		}
@ -334,8 +371,8 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 		
 		// pick lines within 4 chars of the median (this is completely arbitrary)
 		this._goodLines = [];
-		var uBound = medianLength + 4;
-		var lBound = medianLength - 4;
+		var uBound = medianLength + 6;
+		var lBound = medianLength - 6;
 		for (var i=0; i<lineLengthsLength; i++) {
 			if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
 				// Strip quotation marks so they don't mess up search query quoting
@ -344,49 +381,51 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c
 			}
 		}
 		
-		this._startLine = this._iteration = 0;
-		this._queryGoogle();
+		this._nextLine = this._iteration = 0;
+		callback.call(this);
 	}
 }

+/**
+ * Deletes hidden browser and sends a failure message to this._callback
+ * @private
+ * @param {String} msg Message to be sent to this._callback
+ */
+Zotero_RecognizePDF.Recognizer.prototype._deleteBrowserAndFail = function(msg) {
+	var me = this;
+	try {
+		if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
+	} catch(e) {}
+	this._callback(false, msg);
+}
+
 /**
 * Queries Google Scholar for metadata for this PDF
 * @private
 */
 Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
-	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
-		try {
-			if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
-		} catch(e) {}
-		this._callback(false, "recognizePDF.noMatches");
+	if(this._iteration > 3 || !this._goodLines.length) {
+		this._deleteBrowserAndFail("recognizePDF.noMatches");
 		return;
 	}
 	this._iteration++;

 	var queryString = "";
 	var me = this;
-	if(this._DOI) {
-		// use CrossRef to look for DOI
-		var translate = new Zotero.Translate.Search();
-		translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
-		var item = {"itemType":"journalArticle", "DOI":this._DOI};
-		translate.setSearch(item);
-		translate.setHandler("itemDone", function(translate, item) {
-			me._callback(item);
-		});
-		translate.setHandler("select", function(translate, items, callback) {
-			return me._selectItems(translate, items, callback);
-		});
-		translate.setHandler("done", function(translate, success) {
-			if(!success) me._queryGoogle();
-		});
-		translate.translate(this._libraryID, false);
-		delete this._DOI;
-	} else {
+
 	// take the relevant parts of some lines (exclude hyphenated word)
 	var queryStringWords = 0;
-		while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
-			var words = this._goodLines[this._startLine].split(/\s+/);
+	while(queryStringWords < 25) {
+		if(!this._goodLines.length) {
+			this._deleteBrowserAndFail("recognizePDF.noMatches");
+			return;
+		}
+
+		var words = this._goodLines.splice(this._nextLine,1)[0].split(/\s+/);
+		// Try to avoid picking adjacent strings, so that the odds of them appearing in another
+		// document quoting our document are low. Every 7th line is a magic value
+		this._nextLine = (this._nextLine + 7) % this._goodLines.length;
+
 		// get rid of first and last words
 		words.shift();
 		words.pop();
@ -403,7 +442,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 			queryStringWords += words.length;
 			queryString += '"'+words.join(" ")+'" ';
 		}
-			this._startLine++;
 	}
 	
 	Zotero.debug("RecognizePDF: Query string "+queryString);
@ -442,7 +480,6 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 	this._hiddenBrowser.loadURIWithFlags(url,
 		Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
 }
-}

 /**
 * To be executed when Google Scholar is loaded