zotero/chrome/chromeFiles/content/scholar/xpcom/ingester.js

// Scholar for Firefox Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

// namespace object that the ingester components below are attached to
Scholar.Ingester = {};

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.ProxyMonitor
//
/////////////////////////////////////////////////////////////////

// A singleton for recognizing EZProxies and converting URLs such that databases
// will work from outside them. Unfortunately, this only works with the ($495)
// EZProxy software. If there are open source alternatives, we should support
// them too.

/*
 * Precompile proxy regexps
 */
Scholar.Ingester.ProxyMonitor = new function() {
	// Matches the query string of an EZproxy login URL. Group 1 is the
	// parameter name ("url" or "qurl", case-insensitive) and group 2 is the
	// target URL it carries. Built with the constructor because
	// RegExp.prototype.compile is deprecated.
	var _ezProxyRe = new RegExp("\\?(?:.+&)?(url|qurl)=([^&]+)", "i");
	var ioService = Components.classes["@mozilla.org/network/io-service;1"]
							  .getService(Components.interfaces.nsIIOService);
	// true once init() has registered the HTTP response observer
	var on = false;
	// host:port lookup tables; observe() creates them the first time an
	// EZproxy is detected, so they stay null for users who never hit one
	var _mapFromProxy = null;
	var _mapToProxy = null;
	
	// public interface of the singleton
	this.init = init;
	this.proxyToProper = proxyToProper;
	this.properToProxy = properToProxy;
	this.observe = observe;
	
	/*
	 * Registers this object as an observer of HTTP responses. Calling it
	 * again after a successful registration does nothing.
	 */
	function init() {
		if(on) {
			return;
		}
		
		var observers = Components.classes["@mozilla.org/observer-service;1"]
								  .getService(Components.interfaces.nsIObserverService);
		observers.addObserver(this, "http-on-examine-response", false);
		on = true;
	}
	
	/*
	 * Observer callback for HTTP responses. When an EZproxy server answers
	 * with a 302 redirect, records which proxy host:port stands for which
	 * real host:port so URLs can be translated in both directions later.
	 */
	function observe(channel) {
		channel.QueryInterface(Components.interfaces.nsIHttpChannel);
		try {
			// only redirects issued by an EZproxy are of interest
			if(channel.getResponseHeader("Server") != "EZproxy") {
				return;
			}
			if(channel.responseStatus != "302") {
				return;
			}
			
			var requestedSpec = channel.URI.spec;
			Scholar.debug(requestedSpec);
			// the target URL is carried in the url= or qurl= parameter
			var match = _ezProxyRe.exec(requestedSpec);
			if(!match) {
				return;
			}
			
			// qurl values are escaped; url values are taken as they are
			var isEscaped = (match[1].toLowerCase() == "qurl");
			var properURI = _parseURL(isEscaped ? unescape(match[2]) : match[2]);
			if(!properURI) {
				return;
			}
			
			// where the proxy is sending us
			var redirectURL = channel.getResponseHeader("Location");
			if(!redirectURL) {
				return;
			}
			var redirectURI = _parseURL(redirectURL);
			if(!redirectURI) {
				return;
			}
			
			// same server on a different port means EZproxy is active
			if(channel.URI.host != redirectURI.host || channel.URI.port == redirectURI.port) {
				return;
			}
			
			Scholar.debug("EZProxy: host "+redirectURI.hostPort+" is really "+properURI.hostPort);
			// the maps are created lazily so people who never use EZProxies
			// don't pay for them
			if(!_mapFromProxy) {
				_mapFromProxy = new Object();
				_mapToProxy = new Object();
			}
			_mapFromProxy[redirectURI.hostPort] = properURI.hostPort;
			_mapToProxy[properURI.hostPort] = redirectURI.hostPort;
		} catch(e) {
			// best effort: any failure above (for instance a header lookup that
			// throws) just means this response is not recorded
		}
	}
	
	/*
	 * Translates a proxied url into the proper url of the page. The url is
	 * returned unchanged when no proxy has been detected or when its
	 * host:port is not a known proxy address.
	 */
	function proxyToProper(url) {
		// the map only exists once EZProxy detection has fired
		if(!_mapFromProxy) {
			return url;
		}
		
		var parsed = _parseURL(url);
		var properHostPort = parsed && _mapFromProxy[parsed.hostPort];
		if(properHostPort) {
			url = url.replace(parsed.hostPort, properHostPort);
			Scholar.debug("EZProxy: proper url is "+url);
		}
		
		return url;
	}
	
	/*
	 * Translates a proper url into the url it has when accessed through the
	 * proxy. The url is returned unchanged when no proxy has been detected
	 * or when its host:port has no known proxy address.
	 */
	function properToProxy(url) {
		// the map only exists once EZProxy detection has fired
		if(!_mapToProxy) {
			return url;
		}
		
		var parsed = _parseURL(url);
		var proxyHostPort = parsed && _mapToProxy[parsed.hostPort];
		if(proxyHostPort) {
			// actually need to map
			url = url.replace(parsed.hostPort, proxyHostPort);
			Scholar.debug("EZProxy: proxied url is "+url);
		}
		
		return url;
	}
	
	/*
	 * Parses a url into the URI object created by ioService.newURI (the
	 * callers read hostPort, port, host and spec from it).
	 *
	 * Returns null when the string cannot be parsed. Every caller tests for
	 * a falsy result, so a malformed url must not surface as an exception.
	 */
	function _parseURL(url) {
		// create an nsIURI (not sure if this is faster than the regular
		// expression, but it's at least more kosher)
		try {
			return ioService.newURI(url, null, null);
		} catch(e) {
			return null;
		}
	}
}

Scholar.OpenURL = new function() {
	// Public interface of the Scholar.OpenURL singleton. Each name refers to
	// a function declared further down in this closure; the underscore-
	// prefixed helpers are not attached and stay private.
	this.resolve = resolve;
	this.discoverResolvers = discoverResolvers;
	this.createContextObject = createContextObject;
	this.parseContextObject = parseContextObject;
	this.lookupContextObject = lookupContextObject;
	
	/*
	 * Builds the URL that looks an item up in the configured OpenURL
	 * resolver (the "openURL.resolver" and "openURL.version" preferences).
	 * Returns false when no ContextObject can be generated for the item.
	 */
	function resolve(itemObject) {
		var version = Scholar.Prefs.get("openURL.version");
		var contextObject = createContextObject(itemObject, version);
		if(!contextObject) {
			return false;
		}
		return Scholar.Prefs.get("openURL.resolver")+"?"+contextObject;
	}
	
	/*
	 * Queries OCLC's OpenURL resolver registry (synchronously) for the
	 * resolvers registered for the requesting IP address.
	 *
	 * Returns an array used as a map: each institution name is a property
	 * whose value is [baseURL, version], with version "1.0" or "0.1".
	 * Resolvers lacking a name, a base URL or a recognized version are
	 * left out. Throws a string if the registry returns no XML.
	 */
	function discoverResolvers() {
		var request = Components.classes["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance();
		request.open("GET", "http://worldcatlibraries.org/registry/lookup?IP=requestor", false);
		request.send(null);
		
		var registry = request.responseXML;
		if(!registry) {
			throw "Could not access resolver registry";
		}
		
		var found = new Array();
		var resolverNodes = registry.getElementsByTagName("resolver");
		for(var idx=0; idx<resolverNodes.length; idx++) {
			var node = resolverNodes[idx];
			
			// the institution name lives next to the resolver, under its parent
			var nameNodes = node.parentNode.getElementsByTagName("institutionName");
			if(!nameNodes.length) {
				continue;
			}
			var institution = nameNodes[0].textContent;
			
			var urlNodes = node.getElementsByTagName("baseURL");
			if(!urlNodes.length) {
				continue;
			}
			var baseURL = urlNodes[0].textContent;
			
			// prefer 1.0 when the resolver advertises both versions
			var supported;
			if(node.getElementsByTagName("Z39.88-2004").length > 0) {
				supported = "1.0";
			} else if(node.getElementsByTagName("OpenUrl 0.1").length > 0) {
				supported = "0.1";
			} else {
				continue;
			}
			
			found[institution] = [baseURL, supported];
		}
		
		return found;
	}
	
	/*
	 * Generates an OpenURL ContextObject from an item
	 *
	 * itemObject - item exposing toArray(), which returns its fields
	 * version    - "0.1" for OpenURL 0.1 keys; anything else produces
	 *              version 1.0 (Z39.88-2004) "rft."-prefixed keys
	 *
	 * Returns the ContextObject as a query string, or false if the item type
	 * cannot be expressed (only journalArticle, book, bookSection and, for
	 * version 1.0, thesis are supported).
	 */
	function createContextObject(itemObject, version) {
		var item = itemObject.toArray();
		var isV01 = (version == "0.1");
		
		var identifiers = new Array();
		if(item.DOI) {
			identifiers.push(item.DOI);
		}
		if(item.ISBN) {
			identifiers.push("urn:isbn:"+item.ISBN);
		}
		
		// encode ctx_ver (if available) and identifiers
		var co = isV01 ? "" : "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004";
		var identifierKey = isV01 ? "&id=" : "&rft_id=";
		for(var i=0; i<identifiers.length; i++) {
			// encodeURIComponent, unlike escape, encodes "+" and emits UTF-8
			co += identifierKey+encodeURIComponent(identifiers[i]);
		}
		
		// encode genre and item-specific data
		if(item.itemType == "journalArticle") {
			if(isV01) {
				co += "&genre=article";
			} else {
				co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article";
			}
			co += _mapTag(item.title, "atitle", version);
			co += _mapTag(item.publicationTitle, (isV01 ? "title" : "jtitle"), version);
			co += _mapTag(item.journalAbbreviation, "stitle", version);
			co += _mapTag(item.volume, "volume", version);
			co += _mapTag(item.issue, "issue", version);
		} else if(item.itemType == "book" || item.itemType == "bookSection"
				|| item.itemType == "bookitem") {
			// "bookSection" is the type name used by the rest of this file;
			// "bookitem" is still accepted for older callers
			var isWholeBook = (item.itemType == "book");
			var genre = isWholeBook ? "book" : "bookitem";
			
			// the genre is written exactly once, in the form the version needs
			if(isV01) {
				co += "&genre="+genre;
			} else {
				co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre="+genre;
			}
			
			var bookTitleTag = isV01 ? "title" : "btitle";
			if(isWholeBook) {
				co += _mapTag(item.title, bookTitleTag, version);
			} else {
				co += _mapTag(item.title, "atitle", version);
				co += _mapTag(item.publicationTitle, bookTitleTag, version);
			}
			
			co += _mapTag(item.place, "place", version);
			// parseContextObject reads the publisher from "rft.pub"
			co += _mapTag(item.publisher, (isV01 ? "publisher" : "pub"), version);
			co += _mapTag(item.edition, "edition", version);
			co += _mapTag(item.seriesTitle, "series", version);
		} else if(item.itemType == "thesis" && version == "1.0") {
			co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation";
			
			co += _mapTag(item.title, "title", version);
			co += _mapTag(item.publisher, "inst", version);
			co += _mapTag(item.thesisType, "degree", version);
		} else {
			return false;
		}
		
		// encode fields on all items
		var creators = item.creators || [];
		for(var j=0; j<creators.length; j++) {
			var creator = creators[j];
			if(creator.firstName) {
				co += _mapTag(creator.firstName, "aufirst", version);
				co += _mapTag(creator.lastName, "aulast", version);
			} else {
				co += _mapTag(creator.lastName, "aucorp", version);
			}
		}
		
		if(item.date) {
			co += _mapTag(item.date, "date", version);
		} else {
			co += _mapTag(item.year, "date", version);
		}
		co += _mapTag(item.pages, "pages", version);
		// keys are lower case, matching what parseContextObject looks for
		co += _mapTag(item.ISBN, "isbn", version);
		co += _mapTag(item.ISSN, "issn", version);
		
		if(isV01) {
			// chop off leading & sign if version is 0.1
			co = co.substr(1);
		}
		
		return co;
	}
	
	/*
	 * Generates an item in the format returned by item.fromArray() given an
	 * OpenURL version 1.0 contextObject
	 *
	 * co - the ContextObject as a query string ("key=value&key=value")
	 *
	 * Returns the item (an array carrying named fields plus a creators
	 * array), or false if _determineResourceType does not recognize the
	 * ContextObject's format.
	 */
	function parseContextObject(co) {
		var coParts = co.split("&");
		
		var item = new Array();
		item.creators = new Array();
		
		// get type
		item.itemType = _determineResourceType(coParts);
		if(!item.itemType) {
			return false;
		}
		
		// which of rft.pages/rft.spage/rft.epage last wrote item.pages
		var pagesKey = "";
		var lastCreator;
		
		for(var i=0; i<coParts.length; i++) {
			var part = coParts[i];
			// split on the first "=" only; parts without one carry no value
			var separator = part.indexOf("=");
			if(separator == -1) {
				continue;
			}
			var key = part.substr(0, separator);
			// both "+" and "%2B" are turned into spaces, as before
			var value = _decodeContextValue(part.substr(separator+1).replace(/\+|%2[bB]/g, " "));
			if(!value) {
				continue;
			}
			
			if(key == "rft_id") {
				var firstEight = value.substr(0, 8).toLowerCase();
				if(firstEight == "info:doi") {
					item.DOI = value;
				} else if(firstEight == "urn:isbn") {
					item.ISBN = value.substr(9);
				}
			} else if(key == "rft.btitle") {
				if(item.itemType == "book") {
					item.title = value;
				} else if(item.itemType == "bookSection") {
					item.publicationTitle = value;
				}
			} else if(key == "rft.atitle" && item.itemType != "book") {
				item.title = value;
			} else if(key == "rft.jtitle" && item.itemType == "journal") {
				item.publicationTitle = value;
			} else if(key == "rft.stitle" && item.itemType == "journal") {
				item.journalAbbreviation = value;
			} else if(key == "rft.date") {
				item.date = value;
			} else if(key == "rft.volume") {
				item.volume = value;
			} else if(key == "rft.issue") {
				item.issue = value;
			} else if(key == "rft.pages") {
				// an explicit page range always wins over spage/epage
				pagesKey = key;
				item.pages = value;
			} else if(key == "rft.spage") {
				if(pagesKey != "rft.pages") {
					// make pages look like start-end
					if(pagesKey == "rft.epage") {
						if(value != item.pages) {
							item.pages = value+"-"+item.pages;
						}
					} else {
						item.pages = value;
					}
					// recorded only after the previous key has been examined
					pagesKey = key;
				}
			} else if(key == "rft.epage") {
				if(pagesKey != "rft.pages") {
					// make pages look like start-end
					if(pagesKey == "rft.spage") {
						if(value != item.pages) {
							item.pages = item.pages+"-"+value;
						}
					} else {
						item.pages = value;
					}
					pagesKey = key;
				}
			} else if(key == "rft.issn" || key == "issn"
					|| ((key == "rft.eissn" || key == "eissn") && !item.ISSN)) {
				// the print ISSN takes precedence over the electronic one
				item.ISSN = value;
			} else if(key == "rft.aulast") {
				// pair with a preceding rft.aufirst if there is one
				lastCreator = item.creators[item.creators.length-1];
				if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) {
					lastCreator.lastName = value;
				} else {
					item.creators.push({lastName:value});
				}
			} else if(key == "rft.aufirst") {
				// pair with a preceding rft.aulast if there is one
				lastCreator = item.creators[item.creators.length-1];
				if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) {
					lastCreator.firstName = value;
				} else {
					item.creators.push({firstName:value});
				}
			} else if(key == "rft.au") {
				// NOTE(review): _processCrossRef calls
				// Scholar.Utilities.cleanAuthor instead; confirm which of the
				// two names actually exists
				item.creators.push(Scholar.cleanAuthor(value, "author", true));
			} else if(key == "rft.aucorp") {
				item.creators.push({lastName:value, institutional:true});
			} else if(key == "rft.isbn" && !item.ISBN) {
				item.ISBN = value;
			} else if(key == "rft.pub") {
				item.publisher = value;
			} else if(key == "rft.place") {
				item.place = value;
			} else if(key == "rft.edition") {
				item.edition = value;
			} else if(key == "rft.series") {
				item.seriesTitle = value;
			}
		}
		
		return item;
	}
	
	/*
	 * Percent-decodes a ContextObject value as UTF-8, falling back to
	 * unescape() for input that is not valid UTF-8 (e.g. Latin-1 "%E9")
	 */
	function _decodeContextValue(encoded) {
		try {
			return decodeURIComponent(encoded);
		} catch(e) {
			return unescape(encoded);
		}
	}
	
	/*
	 * Looks up additional information on an item in the format returned by
	 * item.fromArray() in CrossRef or Open WorldCat given an OpenURL version
	 * 1.0 contextObject
	 *
	 * co    - the ContextObject as a query string
	 * done  - called with the result: an array of items, or whatever
	 *         _processCrossRef returns (false when nothing resolved)
	 * error - called without arguments if an Open WorldCat page fails to load
	 *
	 * Returns false immediately, without calling either callback, if the
	 * type of the ContextObject cannot be determined.
	 */
	function lookupContextObject(co, done, error) {
		// CrossRef requires a url_ver to work right
		if(co.indexOf("url_ver=Z39.88-2004") == -1) {
			co = "url_ver=Z39.88-2004&"+co;
		}
		
		var type = _determineResourceType(co.split("&"));
		if(!type) {
			return false;
		}
		
		if(type == "journal") {
			// look up journals in CrossRef
			Scholar.Utilities.HTTP.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(req) {
				var items = _processCrossRef(req.responseText);
				done(items);
			});
		} else {
			// look up books in Open WorldCat
			Scholar.Utilities.HTTP.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) {
				var doc = browser.contentDocument;
				// find new COinS in the Open WorldCat page
				var items = _processOWC(doc);
				if(items) {	// we got a single item page; return the item
					done(items);
					return;
				}
				
				// assume we have a search results page
				items = new Array();
				
				var namespace = doc.documentElement.namespaceURI;
				var nsResolver = namespace ? function(prefix) {
					if (prefix == 'x') return namespace; else return null;
				} : null;
				
				var resultRows = '//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr';
				var titleLink = '/td/div[@class="title"]/a';
				var anyType = Components.interfaces.nsIDOMXPathResult.ANY_TYPE;
				
				// first try to get only books
				var elmts = doc.evaluate(resultRows+'[td/img[@alt="Book"]]'+titleLink, doc, nsResolver, anyType, null);
				var elmt = elmts.iterateNext();
				if(!elmt) {
					// if that fails, accept result rows of any kind
					elmts = doc.evaluate(resultRows+titleLink, doc, nsResolver, anyType, null);
					elmt = elmts.iterateNext();
				}
				
				var urlsToProcess = new Array();
				while(elmt) {
					urlsToProcess.push(elmt.href);
					elmt = elmts.iterateNext();
				}
				
				if(!urlsToProcess.length) {
					// no result links at all: report an empty result instead
					// of failing on a null element
					done(items);
					return;
				}
				
				Scholar.Utilities.HTTP.processDocuments(null, urlsToProcess, function(browser) {
					// per URL
					var newItems = _processOWC(browser.contentDocument);
					if(newItems) {
						items = items.concat(newItems);
					}
				}, function() {	// done
					done(items);
				}, function() {	// error
					error();
				});
			}, null, function() {
				error();
			});
		}
	}
	
	/*
	 * Processes the XML format returned by CrossRef
	 *
	 * xmlOutput - the response body as a string
	 *
	 * Returns a one-element array holding the item built from the query
	 * result, or false if the XML cannot be parsed or the query status is
	 * neither "resolved" nor "multiresolved".
	 */
	function _processCrossRef(xmlOutput) {
		// strip the XML declaration before handing the text to the E4X parser
		xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
		
		// parse XML with E4X
		var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
		try {
			var xml = new XML(xmlOutput);
		} catch(e) {
			// unparseable response
			return false;
		}
		
		// ensure status is valid
		var status = xml.qr::body.qr::query.@status.toString();
		if(status != "resolved" && status != "multiresolved") {
			return false;
		}
		
		var query = xml.qr::body.qr::query;
		var item = new Array();
		item.creators = new Array();
		
		// try to get a DOI, preferring journal_article, then book_title, then
		// book_content
		item.DOI = query.qr::doi.(@type=="journal_article").toString();
		if(!item.DOI) {
			item.DOI = query.qr::doi.(@type=="book_title").toString();
		}
		if(!item.DOI) {
			item.DOI = query.qr::doi.(@type=="book_content").toString();
		}
		
		// try to get an ISSN (no print/electronic preferences)
		item.ISSN = query.qr::issn.toString();
		// get title
		item.title = query.qr::article_title.toString();
		// get publicationTitle
		item.publicationTitle = query.qr::journal_title.toString();
		// get author
		// NOTE(review): parseContextObject calls Scholar.cleanAuthor rather than
		// Scholar.Utilities.cleanAuthor; confirm which of the two names exists
		item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.toString(), "author", true));
		// get volume
		item.volume = query.qr::volume.toString();
		// get issue
		item.issue = query.qr::issue.toString();
		// get year
		item.date = query.qr::year.toString();
		// get edition
		item.edition = query.qr::edition_number.toString();
		// get first page (only the first page is stored in pages)
		item.pages = query.qr::first_page.toString();
		
		return [item];
	}
	
	/*
	 * Searches a document referring to an Open WorldCat entry for the first
	 * span carrying the class "Z3988" and parses the OpenURL contextObject
	 * held in its title attribute.
	 *
	 * Returns a one-element array with the parsed item, or false if there is
	 * no such span or its contextObject cannot be parsed.
	 */
	function _processOWC(doc) {
		var spans = doc.getElementsByTagName("span");
		for(var idx=0; idx<spans.length; idx++) {
			var span = spans[idx];
			
			var classAttr = span.getAttribute("class");
			if(!classAttr) {
				continue;
			}
			if(!Scholar.inArray("Z3988", classAttr.split(" "))) {
				continue;
			}
			
			// only the first matching span is ever considered
			var parsed = parseContextObject(span.getAttribute("title"));
			return parsed ? [parsed] : false;
		}
		
		return false;
	}
	
	/*
	 * Determines the type of an OpenURL contextObject
	 *
	 * coParts - the contextObject split on "&" into "key=value" strings
	 *
	 * Returns "journal", "book" or "bookSection" according to the first
	 * rft_val_fmt entry naming a supported format (a book format counts as
	 * a book section when "rft.genre=bookitem" is present), or false if no
	 * supported format is found.
	 */
	function _determineResourceType(coParts) {
		// index loop rather than for...in, so enumerable additions to
		// Array.prototype are never mistaken for contextObject parts
		for(var i=0; i<coParts.length; i++) {
			var part = coParts[i];
			if(part.substr(0, 12) != "rft_val_fmt=") {
				continue;
			}
			
			var format = unescape(part.substr(12));
			if(format == "info:ofi/fmt:kev:mtx:journal") {
				return "journal";
			}
			if(format == "info:ofi/fmt:kev:mtx:book") {
				return Scholar.inArray("rft.genre=bookitem", coParts) ? "bookSection" : "book";
			}
			// unsupported format; keep looking
		}
		return false;
	}
	
	/*
	 * Used to map tags for generating OpenURL contextObjects
	 *
	 * data    - the field value; nothing is emitted for a falsy value
	 * tag     - the key name without any prefix
	 * version - "0.1" emits "&tag=value"; anything else emits the version
	 *           1.0 form "&rft.tag=value"
	 *
	 * Returns the encoded "&key=value" pair, or "" if there is no data.
	 */
	function _mapTag(data, tag, version) {
		if(!data) {
			return "";
		}
		
		// escape() leaves "+" unencoded (parseContextObject would read it
		// back as a space) and emits non-UTF-8 sequences for non-ASCII
		// text, so encode as UTF-8 instead
		var encoded;
		try {
			encoded = encodeURIComponent(data);
		} catch(e) {
			// strings that are not well-formed UTF-16 cannot be encoded as
			// UTF-8; fall back to the old behaviour rather than failing
			encoded = escape(data);
		}
		
		var prefix = (version == "0.1") ? "&" : "&rft.";
		return prefix+tag+"="+encoded;
	}
}