From 22eebc6cdfb5f8fb837f65c4980397279c05e9d8 Mon Sep 17 00:00:00 2001
From: Simon Kornblith <simon@simonster.com>
Date: Sun, 25 Jun 2006 04:30:43 +0000
Subject: [PATCH] Addresses #68, figure out way to have scrapers work for gated
 resources behind proxies. We can now access pages through an EZProxy. We need
 to know what alternatives to EZProxy exist in order to support them. Also,
 fixes some spacing issues in browser.js.

---
 .../content/scholar/ingester/browser.js       |  48 +++---
 .../content/scholar/xpcom/ingester.js         | 161 ++++++++++++++++--
 2 files changed, 172 insertions(+), 37 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
index c98d8534fc..7a4c3d496f 100644
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -27,8 +27,8 @@ Scholar_Ingester_Interface.init = function() {
 	Scholar_Ingester_Interface.browserDocuments = new Object();
 	Scholar_Ingester_Interface.browserUris = new Array();
 	
-    window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false);
-    window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false);
+	window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false);
+	window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false);
 }
 
 /*
@@ -39,12 +39,12 @@ Scholar_Ingester_Interface.chromeLoad = function() {
 	Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
 	Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
 	
-	// this gives us onLocationChange
+	// this gives us onLocationChange, for updating when tabs are switched/created
 	Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
 		Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
-	// let's use load instead of DOMContentLoaded
-    Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
-    	Scholar_Ingester_Interface.contentLoad, true);
+	// this is for pageshow, for updating the status of the book icon
+	Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
+		Scholar_Ingester_Interface.contentLoad, true);
 }
 
 /*
@@ -132,29 +132,29 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
  * appropriate status indicator for the current tab, and to free useless objects
  */
 Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
-    var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
+	var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
 
-    // Remove document object of any browser that no longer exists
-    for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) {
-        var browser = Scholar_Ingester_Interface.browsers[i];
-        var exists = false;
+	// Remove closed browsers; walk backwards so splice() cannot skip an entry
+	for (var i = Scholar_Ingester_Interface.browsers.length - 1; i >= 0; i--) {
+		var browser = Scholar_Ingester_Interface.browsers[i];
+		var exists = false;
 
-        for (var j = 0; j < browsers.length; j++) {
-            if (browser == browsers[j]) {
-                exists = true;
-                break;
-            }
-        }
+		for (var j = 0; j < browsers.length; j++) {
+			if (browser == browsers[j]) {
+				exists = true;
+				break;
+			}
+		}
 
-        if (!exists) {
-            Scholar_Ingester_Interface.browsers.splice(i,1);
+		if (!exists) {
+			Scholar_Ingester_Interface.browsers.splice(i,1);
 
-        	// To execute if document object does not exist
-            Scholar_Ingester_Interface._deleteDocument(browser);
-        }
-    }
+			// Remove the document object of the browser that no longer exists
+			Scholar_Ingester_Interface._deleteDocument(browser);
+		}
+	}
 
-    Scholar_Ingester_Interface.updateStatus();
+	Scholar_Ingester_Interface.updateStatus();
 }
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
index e9c579094f..63b14510d2 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -19,6 +19,116 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
 	Scholar.debug("deleted hidden browser");
 }
 
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Ingester.ProxyMonitor
+//
+/////////////////////////////////////////////////////////////////
+
+// A singleton for recognizing EZProxies and converting URLs such that databases
+// will work from outside them. Unfortunately, this only works with the ($495)
+// EZProxy software. If there are open source alternatives, we should support
+// them too.
+
+/*
+ * Precompile proxy regexps
+ */
+Scholar.Ingester.ProxyMonitor = new Object();
+// EZProxy login URL: m[1] login URL, m[2] host, m[3] "url" or "qurl", m[4] target
+Scholar.Ingester.ProxyMonitor._ezProxyRe = new RegExp("(https?://([^/:]+)(?::[0-9]+)?/login)\\?(?:.+&)?(url|qurl)=([^&]+)");
+// Start of any URL: m[1] host plus port, m[2] host, m[3] ":port" when present
+Scholar.Ingester.ProxyMonitor._hostRe = new RegExp("^https?://(([^/:]+)(:[0-9]+)?)");
+
+/*
+ * Returns a page's proper url, adjusting for proxying. The next url passed in
+ * after an EZProxy login url is used to learn the proxy <-> proper host mapping.
+ * This is a bit of a hack, in that it offers an opportunity for spoofing. Not
+ * really any way around this, but our scrapers should be sufficiently sandboxed
+ * that it won't be a problem.
+ */
+Scholar.Ingester.ProxyMonitor.proxyToProper = function(url) {
+	var m = Scholar.Ingester.ProxyMonitor._ezProxyRe.exec(url);
+	if(m) {
+		// EZProxy detected
+		var loginURL = m[1];
+		var host = m[2];
+		var arg = m[3];
+		var url = m[4];
+		
+		if(arg == "qurl") {
+			url = unescape(url);
+		}
+		
+		// FIXME - potential memory leak
+		Scholar.Ingester.ProxyMonitor._now = true;
+		Scholar.Ingester.ProxyMonitor._url = url;
+		Scholar.Ingester.ProxyMonitor._host = host;
+		Scholar.Ingester.ProxyMonitor._loginURL = loginURL;
+	} else if(Scholar.Ingester.ProxyMonitor._now) {
+		// EZProxying something
+		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
+		
+		// Landing on the login URL again is treated as a failed login; keep waiting
+		if(url == Scholar.Ingester.ProxyMonitor._loginURL) {
+			Scholar.debug("EZProxy: detected wrong password; won't disable monitoring yet");
+		} else {
+			if(m) {
+				var hostAndPort = m[1];
+				var host = m[2];
+				var port = m[3];
+				// EZProxy always runs on a higher port, so require an explicit port
+				if(port) {
+					// Make sure our host is the same who we logged in under
+					if(host == Scholar.Ingester.ProxyMonitor._host) {
+						// Extract host information from the URL we're proxying
+						var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(Scholar.Ingester.ProxyMonitor._url);
+						if(m) {
+							var properHostAndPort = m[1];
+							if(!Scholar.Ingester.ProxyMonitor._mapFromProxy) {
+								Scholar.Ingester.ProxyMonitor._mapFromProxy = new Object();
+								Scholar.Ingester.ProxyMonitor._mapToProxy = new Object();
+							}
+							Scholar.debug("EZProxy: host "+hostAndPort+" is really "+properHostAndPort);
+							Scholar.Ingester.ProxyMonitor._mapFromProxy[hostAndPort] = properHostAndPort;
+							Scholar.Ingester.ProxyMonitor._mapToProxy[properHostAndPort] = hostAndPort;
+							url = url.replace(hostAndPort, properHostAndPort);
+						}
+					}
+				}
+			}
+			Scholar.Ingester.ProxyMonitor._now = false;
+		}
+	} else if(Scholar.Ingester.ProxyMonitor._mapFromProxy) {
+		// EZProxy detection is active
+		
+		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
+		if(m && Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]) {
+			url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]);
+			Scholar.debug("EZProxy: proper url is "+url);
+		}
+	}
+	
+	return url;
+}
+
+/*
+ * Returns a page's proxied url from the proper url (unchanged if no mapping applies)
+ */
+Scholar.Ingester.ProxyMonitor.properToProxy = function(url) {
+	if(Scholar.Ingester.ProxyMonitor._mapToProxy) {
+		// EZProxy detection is active
+		// urls that _hostRe cannot parse (m is null) are returned untouched
+		var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
+		if(m && Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]) {
+			// Actually need to map
+			url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]);
+			Scholar.debug("EZProxy: proxied url is "+url);
+		}
+	}
+	
+	return url;
+}
+
 /////////////////////////////////////////////////////////////////
 //
 // Scholar.Ingester.Model
@@ -63,8 +173,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
 /////////////////////////////////////////////////////////////////
 // Scholar.Ingester.Utilities class, a set of methods to assist in data
 // extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function(myWindow) {
+Scholar.Ingester.Utilities = function(myWindow, proxiedURL) {
 	this.window = myWindow;
+	this.proxiedURL = proxiedURL;
 }
 
 // Adapter for Piggy Bank function to print debug messages; log level is
@@ -149,8 +260,11 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
 		var doLoad = function() {
 			urlIndex++;
 			if (urlIndex < urls.length) {
+				url = urls[urlIndex];
+				if(this.proxiedURL) {
+					url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
+				}
 				try {
-					url = urls[urlIndex];
 					Scholar.debug("loading "+url);
 					hiddenBrowser.loadURI(url);
 				} catch (e) {
@@ -477,11 +591,16 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
 // These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
 // accessed outside the sandbox, and even if it could, it wouldn't let scripts
 // access across domains, so everything's replicated here.
-Scholar.Ingester.HTTPUtilities = function(contentWindow) {
+Scholar.Ingester.HTTPUtilities = function(contentWindow, proxiedURL) {
 	this.window = contentWindow;
+	this.proxiedURL = proxiedURL;
 }
 
 Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
+	if(this.proxiedURL) {
+		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
+	}
+	
 	var xmlhttp = new this.window.XMLHttpRequest();
 	
 	xmlhttp.open('GET', url, true);
@@ -495,6 +614,10 @@ Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone)
 }
 
 Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
+	if(this.proxiedURL) {
+		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
+	}
+	
 	var xmlhttp = new this.window.XMLHttpRequest();
 	
 	xmlhttp.open('POST', url, true);
@@ -508,6 +631,10 @@ Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus,
 }
 	
 Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
+	if(this.proxiedURL) {
+		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
+	}
+	
 	var xmlhttp = new this.window.XMLHttpRequest();
   
 	xmlhttp.open('OPTIONS', url, true);
@@ -519,9 +646,7 @@ Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatu
 	};
 	xmlhttp.send(body);
 }
-	
-// Possible point of failure; for some reason, this used to be a separate
-// class, so make sure it works
+
 Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
 	switch (xmlhttp.readyState) {
 
@@ -564,6 +689,7 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
 		break;
 	}
 }
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // Scholar.Ingester.Document
@@ -597,6 +723,13 @@ Scholar.Ingester.Document = function(browserWindow, myWindow){
 	this.browser = browserWindow;
 	this.window = myWindow;
 	this.model = new Scholar.Ingester.Model();
+	
+	// Create separate URL to account for proxies
+	this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href);
+	if(this.url != this.browser.contentDocument.location.href) {
+		this.proxiedURL = true;
+	}
+	
 	this.items = new Array();
 	this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
 	             .getService(Ci.nsIAppShellService);
@@ -607,7 +740,8 @@ Scholar.Ingester.Document = function(browserWindow, myWindow){
  * Retrieves the best scraper to scrape a given page
  */
 Scholar.Ingester.Document.prototype.retrieveScraper = function() {
-	Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
+	Scholar.debug("Retrieving scrapers for "+this.url);
+	
 	var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
 	var scrapers = Scholar.DB.query(sql);
 	for(var i=0; i<scrapers.length; i++) {
@@ -625,14 +759,14 @@ Scholar.Ingester.Document.prototype.retrieveScraper = function() {
  * Check to see if _scraper_ can scrape this document
  */
 Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
-		var canScrape = false;
+	var canScrape = false;
 	
 	// Test with regular expression
 	// If this is slow, we could preload all scrapers and compile regular
 	// expressions, so each check will be faster
 	if(currentScraper.urlPattern) {
 		var regularExpression = new RegExp(currentScraper.urlPattern, "i");
-		if(regularExpression.test(this.browser.contentDocument.location.href)) {
+		if(regularExpression.test(this.url)) {
 			canScrape = true;
 		}
 	}
@@ -672,7 +806,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
 		this._scrapeCallback = callback;
 	}
 	
-	Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
+	Scholar.debug("Scraping "+this.url);
 	
 	var scraperSandbox = this._sandbox;
 	try {
@@ -739,9 +873,10 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue)
 Scholar.Ingester.Document.prototype._generateSandbox = function() {
 	this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
 	this._sandbox.browser = this.browser;
-	this._sandbox.doc = this._sandbox.browser.contentDocument;
-	this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window);
-	this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
+	this._sandbox.doc = this.browser.contentDocument;
+	this._sandbox.url = this.url;
+	this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window, this.proxiedURL);
+	this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow, this.proxiedURL);
 	this._sandbox.window = this.window;
 	this._sandbox.model = this.model;
 	this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;