zotero/chrome/chromeFiles/content/scholar/xpcom/ingester.js

// Scholar for Firefox Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new Object();

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.ProxyMonitor
//
/////////////////////////////////////////////////////////////////

// A singleton for recognizing EZProxies and converting URLs such that databases
// will work from outside them. Unfortunately, this only works with the ($495)
// EZProxy software. If there are open source alternatives, we should support
// them too.

/*
 * Precompile proxy regexps
 */
Scholar.Ingester.ProxyMonitor = new function() {
	var _ezProxyRe = new RegExp();
	_ezProxyRe.compile("\\?(?:.+&)?(url|qurl)=([^&]+)", "i");
	/*var _hostRe = new RegExp();
	_hostRe.compile("^https?://(([^/:]+)(?:\:([0-9]+))?)");*/
	var ioService = Components.classes["@mozilla.org/network/io-service;1"]
							  .getService(Components.interfaces.nsIIOService);
	var on = false;
	var _mapFromProxy = null;
	var _mapToProxy = null;
	
	this.init = init;
	this.proxyToProper = proxyToProper;
	this.properToProxy = properToProxy;
	this.observe = observe;
	
	function init() {
		if(!on) {
			var observerService = Components.classes["@mozilla.org/observer-service;1"]
										.getService(Components.interfaces.nsIObserverService);
			observerService.addObserver(this, "http-on-examine-response", false);
		}
		on = true;
	}
	
	function observe(channel) {
		channel.QueryInterface(Components.interfaces.nsIHttpChannel);
		try {
			if(channel.getResponseHeader("Server") == "EZproxy") {
				// We're connected to an EZproxy
				if(channel.responseStatus != "302") {
					return;
				}
				
				Scholar.debug(channel.URI.spec);
				// We should be able to scrape the URL out of this
				var m = _ezProxyRe.exec(channel.URI.spec);
				if(!m) {
					return;
				}
				
				// Found URL
				var variable = m[1];
				var properURL = m[2];
				if(variable.toLowerCase() == "qurl") {
					properURL = unescape(properURL);
				}
				var properURI = _parseURL(properURL);
				if(!properURI) {
					return;
				}
				
				// Get the new URL
				var newURL = channel.getResponseHeader("Location");
				if(!newURL) {
					return;
				}
				var newURI = _parseURL(newURL);
				if(!newURI) {
					return;
				}
				
				if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
					// Different ports but the same server means EZproxy active
					
					Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
					// Initialize variables here so people who never use EZProxies
					// don't get the (very very minor) speed hit
					if(!_mapFromProxy) {
						_mapFromProxy = new Object();
						_mapToProxy = new Object();
					}
					_mapFromProxy[newURI.hostPort] = properURI.hostPort;
					_mapToProxy[properURI.hostPort] = newURI.hostPort;
				}
			}
		} catch(e) {}
	}
	
	/*
	 * Returns a page's proper url, adjusting for proxying
	 */
	function proxyToProper(url) {
		if(_mapFromProxy) {
			// EZProxy detection is active
			
			var uri = _parseURL(url);
			if(uri && _mapFromProxy[uri.hostPort]) {
				url = url.replace(uri.hostPort, _mapFromProxy[uri.hostPort]);
				Scholar.debug("EZProxy: proper url is "+url);
			}
		}
		
		return url;
	}
	
	/*
	 * Returns a page's proxied url from the proper url
	 */
	function properToProxy(url) {
		if(_mapToProxy) {
			// EZProxy detection is active
			
			var uri = _parseURL(url);
			if(uri && _mapToProxy[uri.hostPort]) {
				// Actually need to map
				url = url.replace(uri.hostPort, _mapToProxy[uri.hostPort]);
				Scholar.debug("EZProxy: proxied url is "+url);
			}
		}
		
		return url;
	}
	
	/*
	 * Parses a url into components (hostPort, port, host, and spec)
	 */
	function _parseURL(url) {
		// create an nsIURI (not sure if this is faster than the regular
		// expression, but it's at least more kosher)
		var uri = ioService.newURI(url, null, null);
		return uri;
	}
}
Replaced all instances of "Firefox Scholar" (not counting the repository URL) with "Scholar for Firefox" for now 2006-06-24 09:08:12 +00:00			`// Scholar for Firefox Ingester`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)`
			`// This code is licensed according to the GPL`

make generic Scholar.Utilities class and HTTP-dependent Scholar.Utilities.Ingester and Scholar.Utilities.HTTP classes in preparation for import/export filters; split off into separate javascript file 2006-06-26 14:46:57 +00:00			`Scholar.Ingester = new Object();`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`/////////////////////////////////////////////////////////////////`
			`//`
			`// Scholar.Ingester.ProxyMonitor`
			`//`
			`/////////////////////////////////////////////////////////////////`

			`// A singleton for recognizing EZProxies and converting URLs such that databases`
			`// will work from outside them. Unfortunately, this only works with the ($495)`
			`// EZProxy software. If there are open source alternatives, we should support`
			`// them too.`

			`/*`
			`* Precompile proxy regexps`
			`*/`
closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00			`Scholar.Ingester.ProxyMonitor = new function() {`
			`var _ezProxyRe = new RegExp();`
			`_ezProxyRe.compile("\\?(?:.+&)?(url\|qurl)=([^&]+)", "i");`
			`/*var _hostRe = new RegExp();`
			`_hostRe.compile("^https?://(([^/:]+)(?:\:([0-9]+))?)");*/`
			`var ioService = Components.classes["@mozilla.org/network/io-service;1"]`
			`.getService(Components.interfaces.nsIIOService);`
			`var on = false;`
			`var _mapFromProxy = null;`
			`var _mapToProxy = null;`

			`this.init = init;`
			`this.proxyToProper = proxyToProper;`
			`this.properToProxy = properToProxy;`
			`this.observe = observe;`

			`function init() {`
			`if(!on) {`
			`var observerService = Components.classes["@mozilla.org/observer-service;1"]`
			`.getService(Components.interfaces.nsIObserverService);`
			`observerService.addObserver(this, "http-on-examine-response", false);`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`}`
closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00			`on = true;`
			`}`

			`function observe(channel) {`
			`channel.QueryInterface(Components.interfaces.nsIHttpChannel);`
closes #78, figure out import/export architecture closes #100, migrate ingester to Scholar.Translate closes #88, migrate scrapers away from RDF closes #9, pull out LC subject heading tags references #87, add fromArray() and toArray() methods to item objects API changes: all translation (import/export/web) now goes through Scholar.Translate all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion scrapers now longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone()) scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively) scrapers now contain functions (doImport, doExport, doWeb) rather than loose code scrapers can call functions in other scrapers or just call the function to translate itself export accesses items item-by-item, rather than accepting a huge array of items MARC functions are now in the MARC import translator, and accessed by the web translators new features: import now works rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment) items appear as they are scraped MARC import translator pulls out tags, although this seems to slow things down no icon appears next to a the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing apologizes for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right. 2006-07-17 04:06:58 +00:00			`try {`
			`if(channel.getResponseHeader("Server") == "EZproxy") {`
			`// We're connected to an EZproxy`
			`if(channel.responseStatus != "302") {`
			`return;`
			`}`

			`Scholar.debug(channel.URI.spec);`
			`// We should be able to scrape the URL out of this`
			`var m = _ezProxyRe.exec(channel.URI.spec);`
			`if(!m) {`
			`return;`
			`}`

			`// Found URL`
			`var variable = m[1];`
			`var properURL = m[2];`
			`if(variable.toLowerCase() == "qurl") {`
			`properURL = unescape(properURL);`
			`}`
			`var properURI = _parseURL(properURL);`
			`if(!properURI) {`
			`return;`
			`}`

			`// Get the new URL`
			`var newURL = channel.getResponseHeader("Location");`
			`if(!newURL) {`
			`return;`
			`}`
			`var newURI = _parseURL(newURL);`
			`if(!newURI) {`
			`return;`
			`}`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00
closes #78, figure out import/export architecture closes #100, migrate ingester to Scholar.Translate closes #88, migrate scrapers away from RDF closes #9, pull out LC subject heading tags references #87, add fromArray() and toArray() methods to item objects API changes: all translation (import/export/web) now goes through Scholar.Translate all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion scrapers now longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone()) scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively) scrapers now contain functions (doImport, doExport, doWeb) rather than loose code scrapers can call functions in other scrapers or just call the function to translate itself export accesses items item-by-item, rather than accepting a huge array of items MARC functions are now in the MARC import translator, and accessed by the web translators new features: import now works rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment) items appear as they are scraped MARC import translator pulls out tags, although this seems to slow things down no icon appears next to a the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing apologizes for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right. 2006-07-17 04:06:58 +00:00			`if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {`
			`// Different ports but the same server means EZproxy active`

			`Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);`
			`// Initialize variables here so people who never use EZProxies`
			`// don't get the (very very minor) speed hit`
			`if(!_mapFromProxy) {`
			`_mapFromProxy = new Object();`
			`_mapToProxy = new Object();`
			`}`
			`_mapFromProxy[newURI.hostPort] = properURI.hostPort;`
			`_mapToProxy[properURI.hostPort] = newURI.hostPort;`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`}`
			`}`
closes #78, figure out import/export architecture closes #100, migrate ingester to Scholar.Translate closes #88, migrate scrapers away from RDF closes #9, pull out LC subject heading tags references #87, add fromArray() and toArray() methods to item objects API changes: all translation (import/export/web) now goes through Scholar.Translate all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion scrapers now longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone()) scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively) scrapers now contain functions (doImport, doExport, doWeb) rather than loose code scrapers can call functions in other scrapers or just call the function to translate itself export accesses items item-by-item, rather than accepting a huge array of items MARC functions are now in the MARC import translator, and accessed by the web translators new features: import now works rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment) items appear as they are scraped MARC import translator pulls out tags, although this seems to slow things down no icon appears next to a the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing apologizes for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right. 2006-07-17 04:06:58 +00:00			`} catch(e) {}`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`}`

closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00			`/*`
			`* Returns a page's proper url, adjusting for proxying`
			`*/`
			`function proxyToProper(url) {`
			`if(_mapFromProxy) {`
			`// EZProxy detection is active`

			`var uri = _parseURL(url);`
			`if(uri && _mapFromProxy[uri.hostPort]) {`
			`url = url.replace(uri.hostPort, _mapFromProxy[uri.hostPort]);`
			`Scholar.debug("EZProxy: proper url is "+url);`
			`}`
			`}`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00
closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00			`return url;`
			`}`

			`/*`
			`* Returns a page's proxied url from the proper url`
			`*/`
			`function properToProxy(url) {`
			`if(_mapToProxy) {`
			`// EZProxy detection is active`

			`var uri = _parseURL(url);`
			`if(uri && _mapToProxy[uri.hostPort]) {`
			`// Actually need to map`
			`url = url.replace(uri.hostPort, _mapToProxy[uri.hostPort]);`
			`Scholar.debug("EZProxy: proxied url is "+url);`
			`}`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`}`
closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00
			`return url;`
Addresses #68, figure out way to have scrapers work for gated resources behind proxies. We can now access pages through an EZProxy. We need to know what alternatives to EZProxy exist in order to support them. Also, fixes some spacing issues in browser.js. 2006-06-25 04:30:43 +00:00			`}`

closes #68, figure out way to have scrapers work for gated resources behind proxies. most institutions use EZProxy for their proxy needs (or a more transparent proxy, which we support natively). this implementation is significantly better than the old one, which refused to work after you'd already logged in once, and is also simpler, because it's stateless. it has to observe every HTTP request, but there's no noticeable speed hit. it also still doesn't work when there's a link from one gated site to another gated site, but as far as i can tell, this only happens on the Gale Group site. 2006-06-27 04:08:21 +00:00			`/*`
			`* Parses a url into components (hostPort, port, host, and spec)`
			`*/`
			`function _parseURL(url) {`
			`// create an nsIURI (not sure if this is faster than the regular`
			`// expression, but it's at least more kosher)`
			`var uri = ioService.newURI(url, null, null);`
			`return uri;`
			`}`
Still getting the hang of Subversion...the rest of the ingester code 2006-06-01 06:53:39 +00:00			`}`