Closes #68: figure out a way to have scrapers work for gated resources behind proxies. Most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). This implementation is significantly better than the old one, which refused to work after you'd already logged in once, and it is also simpler, because it's stateless. It has to observe every HTTP request, but there's no noticeable speed hit. It also still doesn't work when there's a link from one gated site to another gated site, but as far as I can tell, this only happens on the Gale Group site.
This commit is contained in:
parent
19504e6746
commit
257ed8f69b
3 changed files with 123 additions and 88 deletions
|
@ -26,6 +26,7 @@ Scholar_Ingester_Interface.init = function() {
|
|||
Scholar_Ingester_Interface.browsers = new Array();
|
||||
Scholar_Ingester_Interface.browserDocuments = new Object();
|
||||
Scholar_Ingester_Interface.browserUris = new Array();
|
||||
Scholar.Ingester.ProxyMonitor.init();
|
||||
|
||||
window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false);
|
||||
window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false);
|
||||
|
@ -249,9 +250,9 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
|
|||
}
|
||||
|
||||
// Save items
|
||||
/*for(i in obj.items) {
|
||||
for(i in obj.items) {
|
||||
obj.items[i].save();
|
||||
}*/
|
||||
}
|
||||
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2500);
|
||||
} else if(returnValue) {
|
||||
Scholar_Ingester_Interface.scrapeProgress.kill();
|
||||
|
|
|
@ -75,99 +75,127 @@ Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
|
|||
/*
|
||||
* Precompile proxy regexps
|
||||
*/
|
||||
Scholar.Ingester.ProxyMonitor = new Object();
|
||||
Scholar.Ingester.ProxyMonitor._ezProxyRe = new RegExp();
|
||||
Scholar.Ingester.ProxyMonitor._ezProxyRe.compile("(https?://([^/:]+)(?:\:[0-9])?/login)\\?(?:.+&)?(url|qurl)=([^&]+)");
|
||||
Scholar.Ingester.ProxyMonitor._hostRe = new RegExp();
|
||||
Scholar.Ingester.ProxyMonitor._hostRe.compile("^https?://(([^/:]+)(\:[0-9]+)?)");
|
||||
|
||||
/*
|
||||
* Returns a page's proper url, adjusting for proxying
|
||||
*
|
||||
* This is a bit of a hack, in that it offers an opportunity for spoofing. Not
|
||||
* really any way around this, but our scrapers should be sufficiently sandboxed
|
||||
* that it won't be a problem.
|
||||
*/
|
||||
Scholar.Ingester.ProxyMonitor.proxyToProper = function(url) {
|
||||
var m = Scholar.Ingester.ProxyMonitor._ezProxyRe.exec(url);
|
||||
if(m) {
|
||||
// EZProxy detected
|
||||
var loginURL = m[1];
|
||||
var host = m[2];
|
||||
var arg = m[3];
|
||||
var url = m[4];
|
||||
|
||||
if(arg == "qurl") {
|
||||
url = unescape(url);
|
||||
Scholar.Ingester.ProxyMonitor = new function() {
|
||||
var _ezProxyRe = new RegExp();
|
||||
_ezProxyRe.compile("\\?(?:.+&)?(url|qurl)=([^&]+)", "i");
|
||||
/*var _hostRe = new RegExp();
|
||||
_hostRe.compile("^https?://(([^/:]+)(?:\:([0-9]+))?)");*/
|
||||
var ioService = Components.classes["@mozilla.org/network/io-service;1"]
|
||||
.getService(Components.interfaces.nsIIOService);
|
||||
var on = false;
|
||||
var _mapFromProxy = null;
|
||||
var _mapToProxy = null;
|
||||
|
||||
this.init = init;
|
||||
this.proxyToProper = proxyToProper;
|
||||
this.properToProxy = properToProxy;
|
||||
this.observe = observe;
|
||||
|
||||
function init() {
|
||||
if(!on) {
|
||||
var observerService = Components.classes["@mozilla.org/observer-service;1"]
|
||||
.getService(Components.interfaces.nsIObserverService);
|
||||
observerService.addObserver(this, "http-on-examine-response", false);
|
||||
}
|
||||
|
||||
Scholar.Ingester.ProxyMonitor._now = true;
|
||||
Scholar.Ingester.ProxyMonitor._url = url;
|
||||
Scholar.Ingester.ProxyMonitor._host = host;
|
||||
Scholar.Ingester.ProxyMonitor._loginURL = loginURL;
|
||||
} else if(Scholar.Ingester.ProxyMonitor._now) {
|
||||
// EZProxying something
|
||||
var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
|
||||
|
||||
// EZProxy always runs on a higher port
|
||||
if(url == Scholar.Ingester.ProxyMonitor._loginURL) {
|
||||
Scholar.debug("EZProxy: detected wrong password; won't disable monitoring yet");
|
||||
} else {
|
||||
if(m) {
|
||||
var hostAndPort = m[1];
|
||||
var host = m[2];
|
||||
var port = m[3];
|
||||
|
||||
if(port) {
|
||||
// Make sure our host is the same who we logged in under
|
||||
if(host == Scholar.Ingester.ProxyMonitor._host) {
|
||||
// Extract host information from the URL we're proxying
|
||||
var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(Scholar.Ingester.ProxyMonitor._url);
|
||||
var properHostAndPort = m[1];
|
||||
if(m) {
|
||||
if(!Scholar.Ingester.ProxyMonitor._mapFromProxy) {
|
||||
Scholar.Ingester.ProxyMonitor._mapFromProxy = new Object();
|
||||
Scholar.Ingester.ProxyMonitor._mapToProxy = new Object();
|
||||
}
|
||||
Scholar.debug("EZProxy: host "+hostAndPort+" is really "+properHostAndPort);
|
||||
Scholar.Ingester.ProxyMonitor._mapFromProxy[hostAndPort] = properHostAndPort;
|
||||
Scholar.Ingester.ProxyMonitor._mapToProxy[properHostAndPort] = hostAndPort;
|
||||
url = url.replace(hostAndPort, properHostAndPort);
|
||||
}
|
||||
}
|
||||
}
|
||||
on = true;
|
||||
}
|
||||
|
||||
function observe(channel) {
|
||||
channel.QueryInterface(Components.interfaces.nsIHttpChannel);
|
||||
if(channel.getResponseHeader("Server") == "EZproxy") {
|
||||
// We're connected to an EZproxy
|
||||
if(channel.responseStatus != "302") {
|
||||
return;
|
||||
}
|
||||
|
||||
Scholar.debug(channel.URI.spec);
|
||||
// We should be able to scrape the URL out of this
|
||||
var m = _ezProxyRe.exec(channel.URI.spec);
|
||||
if(!m) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Found URL
|
||||
var variable = m[1];
|
||||
var properURL = m[2];
|
||||
if(variable.toLowerCase() == "qurl") {
|
||||
properURL = unescape(properURL);
|
||||
}
|
||||
var properURI = _parseURL(properURL);
|
||||
if(!properURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the new URL
|
||||
var newURL = channel.getResponseHeader("Location");
|
||||
if(!newURL) {
|
||||
return;
|
||||
}
|
||||
var newURI = _parseURL(newURL);
|
||||
if(!newURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
|
||||
// Different ports but the same server means EZproxy active
|
||||
|
||||
Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
|
||||
// Initialize variables here so people who never use EZProxies
|
||||
// don't get the (very very minor) speed hit
|
||||
if(!_mapFromProxy) {
|
||||
_mapFromProxy = new Object();
|
||||
_mapToProxy = new Object();
|
||||
}
|
||||
_mapFromProxy[newURI.hostPort] = properURI.hostPort;
|
||||
_mapToProxy[properURI.hostPort] = newURI.hostPort;
|
||||
}
|
||||
Scholar.Ingester.ProxyMonitor._now = false;
|
||||
}
|
||||
} else if(Scholar.Ingester.ProxyMonitor._mapFromProxy) {
|
||||
// EZProxy detection is active
|
||||
|
||||
var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
|
||||
if(m && Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]) {
|
||||
url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapFromProxy[m[1]]);
|
||||
Scholar.debug("EZProxy: proper url is "+url);
|
||||
}
|
||||
}
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a page's proxied url from the proper url
|
||||
*/
|
||||
Scholar.Ingester.ProxyMonitor.properToProxy = function(url) {
|
||||
if(Scholar.Ingester.ProxyMonitor._mapToProxy) {
|
||||
// EZProxy detection is active
|
||||
|
||||
var m = Scholar.Ingester.ProxyMonitor._hostRe.exec(url);
|
||||
if(Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]) {
|
||||
// Actually need to map
|
||||
url = url.replace(m[1], Scholar.Ingester.ProxyMonitor._mapToProxy[m[1]]);
|
||||
Scholar.debug("EZProxy: proxied url is "+url);
|
||||
/*
 * Maps a proxied url back to the proper (un-proxied) url
 *
 * The url is returned unchanged when no proxy mappings have been recorded,
 * when it cannot be parsed, or when its host:port has no recorded mapping.
 */
function proxyToProper(url) {
	// nothing has been mapped yet, so there is nothing to translate
	if(!_mapFromProxy) {
		return url;
	}
	
	var parsed = _parseURL(url);
	var properHostPort = parsed && _mapFromProxy[parsed.hostPort];
	if(properHostPort) {
		// swap the proxy's host:port for the one it stands in for
		url = url.replace(parsed.hostPort, properHostPort);
		Scholar.debug("EZProxy: proper url is "+url);
	}
	
	return url;
}
|
||||
|
||||
return url;
|
||||
/*
 * Maps a proper url to the proxied url that should be requested instead
 *
 * The url is returned unchanged when no proxy mappings have been recorded,
 * when it cannot be parsed, or when its host:port has no recorded mapping.
 */
function properToProxy(url) {
	// nothing has been mapped yet, so there is nothing to translate
	if(!_mapToProxy) {
		return url;
	}
	
	var parsed = _parseURL(url);
	var proxyHostPort = parsed && _mapToProxy[parsed.hostPort];
	if(proxyHostPort) {
		// swap the proper host:port for the proxy's host:port
		url = url.replace(parsed.hostPort, proxyHostPort);
		Scholar.debug("EZProxy: proxied url is "+url);
	}
	
	return url;
}
|
||||
|
||||
/*
 * Parses a url into an nsIURI (exposing hostPort, port, host, and spec)
 *
 * Returns null when the url cannot be parsed, so that callers can guard with
 * a plain "if(!uri)" check.
 */
function _parseURL(url) {
	// create an nsIURI rather than using a regular expression. newURI throws
	// on malformed input instead of returning null, so translate the
	// exception into the null result the callers already test for
	try {
		return ioService.newURI(url, null, null);
	} catch(e) {
		return null;
	}
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -381,14 +381,20 @@ Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, mo
|
|||
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
|
||||
|
||||
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
	// only translate to the proxied address when this instance has a
	// proxiedURL set; the browser argument is not used here
	var target = this.proxiedURL
		? Scholar.Ingester.ProxyMonitor.properToProxy(url)
		: url;
	
	// single-document load: no first document, and nothing to do on "done"
	var noop = function() {};
	Scholar.Utilities.HTTP.processDocuments(null, [ target ], succeeded, noop, failed);
}
|
||||
/*
 * Adapter for Scholar.Utilities.HTTP.processDocuments that rewrites every
 * entry of urls to its proxied equivalent before handing the list on
 *
 * urls is rewritten in place; the browser argument is not used here
 */
Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
	// tolerate a missing list; it is passed through untouched
	var count = urls ? urls.length : 0;
	
	// map each entry individually (the index is a local, not a global, and
	// the value passed is the entry itself rather than an undeclared "url")
	for(var i=0; i<count; i++) {
		urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
	}
	
	Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception);
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) {
|
||||
this.proxiedURL = proxiedURL
|
||||
this.proxiedURL = proxiedURL;
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
|
||||
|
|
Loading…
Reference in a new issue