// Scholar for Firefox Ingester // Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) // This code is licensed according to the GPL Scholar.Ingester = new Object(); ///////////////////////////////////////////////////////////////// // // Scholar.Ingester.ProxyMonitor // ///////////////////////////////////////////////////////////////// // A singleton for recognizing EZProxies and converting URLs such that databases // will work from outside them. Unfortunately, this only works with the ($495) // EZProxy software. If there are open source alternatives, we should support // them too. /* * Precompile proxy regexps */ Scholar.Ingester.ProxyMonitor = new function() { var _ezProxyRe = new RegExp(); _ezProxyRe.compile("\\?(?:.+&)?(url|qurl)=([^&]+)", "i"); /*var _hostRe = new RegExp(); _hostRe.compile("^https?://(([^/:]+)(?:\:([0-9]+))?)");*/ var ioService = Components.classes["@mozilla.org/network/io-service;1"] .getService(Components.interfaces.nsIIOService); var on = false; var _mapFromProxy = null; var _mapToProxy = null; this.init = init; this.proxyToProper = proxyToProper; this.properToProxy = properToProxy; this.observe = observe; function init() { if(!on) { var observerService = Components.classes["@mozilla.org/observer-service;1"] .getService(Components.interfaces.nsIObserverService); observerService.addObserver(this, "http-on-examine-response", false); } on = true; } function observe(channel) { channel.QueryInterface(Components.interfaces.nsIHttpChannel); try { if(channel.getResponseHeader("Server") == "EZproxy") { // We're connected to an EZproxy if(channel.responseStatus != "302") { return; } // We should be able to scrape the URL out of this var m = _ezProxyRe.exec(channel.URI.spec); if(!m) { return; } // Found URL var variable = m[1]; var properURL = m[2]; if(variable.toLowerCase() == "qurl") { properURL = unescape(properURL); } var properURI = _parseURL(properURL); if(!properURI) { return; } // Get the new URL var newURL = channel.getResponseHeader("Location"); if(!newURL) { return; } var newURI = _parseURL(newURL); if(!newURI) { return; } if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) { // Different ports but the same server means EZproxy active Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort); // Initialize variables here so people who never use EZProxies // don't get the (very very minor) speed hit if(!_mapFromProxy) { _mapFromProxy = new Object(); _mapToProxy = new Object(); } _mapFromProxy[newURI.hostPort] = properURI.hostPort; _mapToProxy[properURI.hostPort] = newURI.hostPort; } } } catch(e) {} } /* * Returns a page's proper url, adjusting for proxying */ function proxyToProper(url) { if(_mapFromProxy) { // EZProxy detection is active var uri = _parseURL(url); if(uri && _mapFromProxy[uri.hostPort]) { url = url.replace(uri.hostPort, _mapFromProxy[uri.hostPort]); Scholar.debug("EZProxy: proper url is "+url); } } return url; } /* * Returns a page's proxied url from the proper url */ function properToProxy(url) { if(_mapToProxy) { // EZProxy detection is active var uri = _parseURL(url); if(uri && _mapToProxy[uri.hostPort]) { // Actually need to map url = url.replace(uri.hostPort, _mapToProxy[uri.hostPort]); Scholar.debug("EZProxy: proxied url is "+url); } } return url; } /* * Parses a url into components (hostPort, port, host, and spec) */ function _parseURL(url) { // create an nsIURI (not sure if this is faster than the regular // expression, but it's at least more kosher) var uri = ioService.newURI(url, null, null); return uri; } } Scholar.OpenURL = new function() { this.resolve = resolve; this.discoverResolvers = discoverResolvers; this.createContextObject = createContextObject; this.parseContextObject = parseContextObject; /* * Returns a URL to look up an item in the OpenURL resolver */ function resolve(itemObject) { var co = createContextObject(itemObject, Scholar.Prefs.get("openURL.version")); if(co) { return Scholar.Prefs.get("openURL.resolver")+"?"+co; } return false; } /* * Queries OCLC's OpenURL resolver registry and returns an address and version */ function discoverResolvers() { var req = Components.classes["@mozilla.org/xmlextras/xmlhttprequest;1"].createInstance(); req.open("GET", "http://worldcatlibraries.org/registry/lookup?IP=requestor", false); req.send(null); if(!req.responseXML) { throw "Could not access resolver registry"; } var resolverArray = new Array(); var resolvers = req.responseXML.getElementsByTagName("resolver"); for(var i=0; i 0) { var version = "1.0"; } else if(resolver.getElementsByTagName("OpenUrl 0.1").length > 0) { var version = "0.1"; } else { continue; } resolverArray.push({name:name, url:url, version:version}); } return resolverArray; } /* * Generates an OpenURL ContextObject from an item */ function createContextObject(item, version) { if(item.toArray) { item = item.toArray(); } var identifiers = new Array(); if(item.DOI) { identifiers.push(item.DOI); } if(item.ISBN) { identifiers.push("urn:isbn:"); } // encode ctx_ver (if available) and identifiers if(version == "0.1") { var co = ""; for each(identifier in identifiers) { co += "&id="+escape(identifier); } } else { var co = "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004"; for each(identifier in identifiers) { co += "&rft_id="+escape(identifier); } } // encode genre and item-specific data if(item.itemType == "journalArticle") { if(version == "0.1") { co += "&genre=article"; } else { co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article"; } co += _mapTag(item.title, "atitle", version) co += _mapTag(item.publicationTitle, (version == "0.1" ? "title" : "jtitle"), version) co += _mapTag(item.journalAbbreviation, "stitle", version); co += _mapTag(item.volume, "volume", version); co += _mapTag(item.issue, "issue", version); } else if(item.itemType == "book" || item.itemType == "bookitem") { if(version == "0.1") { co += "&genre=book"; } else { co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book"; } if(item.itemType == "book") { co += "&rft.genre=book"; co += _mapTag(item.title, (version == "0.1" ? "title" : "btitle"), version); } else { co += "&rft.genre=bookitem"; co += _mapTag(item.title, "atitle", version) co += _mapTag(item.publicationTitle, (version == "0.1" ? "title" : "btitle"), version); } co += _mapTag(item.place, "place", version); co += _mapTag(item.publisher, "publisher", version) co += _mapTag(item.edition, "edition", version); co += _mapTag(item.seriesTitle, "series", version); } else if(item.itemType == "thesis" && version == "1.0") { co += "&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation"; _mapTag(item.title, "title", version); _mapTag(item.publisher, "inst", version); _mapTag(item.thesisType, "degree", version); } else { return false; } // encode fields on all items for each(creator in item.creators) { if(creator.firstName) { co += _mapTag(creator.firstName, "aufirst", version); co += _mapTag(creator.lastName, "aulast", version); } else { co += _mapTag(creator.lastName, "aucorp", version); } } if(item.date) { co += _mapTag(item.date, "date", version); } else { co += _mapTag(item.year, "date", version); } co += _mapTag(item.pages, "pages", version); co += _mapTag(item.ISBN, "ISBN", version); co += _mapTag(item.ISSN, "ISSN", version); if(version == "0.1") { // chop off leading & sign if version is 0.1 co = co.substr(1); } return co; } /* * Generates an item in the format returned by item.fromArray() given an * OpenURL version 1.0 contextObject * * accepts an item array to fill, or creates and returns a new item array */ function parseContextObject(co, item) { if(!item) { var item = new Array(); item.creators = new Array(); } var coParts = co.split("&"); // get type for each(var part in coParts) { if(part.substr(0, 12) == "rft_val_fmt=") { var format = unescape(part.substr(12)); if(format == "info:ofi/fmt:kev:mtx:journal") { item.itemType = "journalArticle"; } else if(format == "info:ofi/fmt:kev:mtx:book") { if(Scholar.inArray("rft.genre=bookitem", coParts)) { item.itemType = "bookSection"; } else { item.itemType = "book"; } break; } } } if(!item.itemType) { return false; } var pagesKey = ""; for each(var part in coParts) { var keyVal = part.split("="); var key = keyVal[0]; var value = unescape(keyVal[1].replace(/\+|%2[bB]/g, " ")); if(!value) { continue; } if(key == "rft_id") { var firstEight = value.substr(0, 8).toLowerCase(); if(firstEight == "info:doi") { item.DOI = value; } else if(firstEight == "urn:isbn") { item.ISBN = value.substr(9); } } else if(key == "rft.btitle") { if(item.itemType == "book") { item.title = value; } else if(item.itemType == "bookSection") { item.publicationTitle = value; } } else if(key == "rft.atitle" && item.itemType != "book") { item.title = value; } else if(key == "rft.jtitle" && item.itemType == "journal") { item.publcation = value; } else if(key == "rft.stitle" && item.itemType == "journal") { item.journalAbbreviation = value; } else if(key == "rft.date") { item.date = value; } else if(key == "rft.volume") { item.volume = value; } else if(key == "rft.issue") { item.issue = value; } else if(key == "rft.pages") { pagesKey = key; item.pages = value; } else if(key == "rft.spage") { if(pagesKey != "rft.pages") { pagesKey = key; // make pages look like start-end if(pagesKey == "rft.epage") { if(value != item.pages) { item.pages = value+"-"+item.pages; } } else { item.pages = value; } } } else if(key == "rft.epage") { if(pagesKey != "rft.pages") { pagesKey = key; // make pages look like start-end if(pagesKey == "rft.spage") { if(value != item.pages) { item.pages = +item.pages+"-"+value; } } else { item.pages = value; } } } else if(key == "issn" || (key == "eissn" && !item.ISSN)) { item.ISSN = value; } else if(key == "rft.aulast") { var lastCreator = item.creators[item.creators.length-1]; if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) { lastCreator.lastName = value; } else { item.creators.push({lastName:value}); } } else if(key == "rft.aufirst") { var lastCreator = item.creators[item.creators.length-1]; if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) { lastCreator.firstName = value; } else { item.creators.push({firstName:value}); } } else if(key == "rft.au") { item.creators.push(Scholar.cleanAuthor(value, "author", true)); } else if(key == "rft.aucorp") { item.creators.push({lastName:value, institutional:true}); } else if(key == "rft.isbn" && !item.ISBN) { item.ISBN = value; } else if(key == "rft.pub") { item.publisher = value; } else if(key == "rft.place") { item.place = value; } else if(key == "rft.edition") { item.edition = value; } else if(key == "rft.series") { item.seriesTitle = value; } } return item; } /* * Used to map tags for generating OpenURL contextObjects */ function _mapTag(data, tag, version) { if(data) { if(version == "0.1") { return "&"+tag+"="+escape(data); } else { return "&rft."+tag+"="+escape(data); } } else { return ""; } } } Scholar.Ingester.MIMEHandler = new function() { var on = false; this.init = init; /* * registers URIContentListener to handle MIME types */ function init() { var prefStatus = Scholar.Prefs.get("parseEndNoteMIMETypes"); if(!on && prefStatus) { var uriLoader = Components.classes["@mozilla.org/uriloader;1"]. getService(Components.interfaces.nsIURILoader); uriLoader.registerContentListener(Scholar.Ingester.MIMEHandler.URIContentListener); on = true; } else if(on && !prefStatus) { var uriLoader = Components.classes["@mozilla.org/uriloader;1"]. getService(Components.interfaces.nsIURILoader); uriLoader.unRegisterContentListener(Scholar.Ingester.MIMEHandler.URIContentListener); on = false; } } } /* * Scholar.Ingester.MIMEHandler.URIContentListener: implements * nsIURIContentListener interface to grab MIME types */ Scholar.Ingester.MIMEHandler.URIContentListener = new function() { var _desiredContentTypes = ["application/x-endnote-refer", "application/x-research-info-systems"]; this.QueryInterface = QueryInterface; this.canHandleContent = canHandleContent; this.doContent = doContent; this.isPreferred = isPreferred; this.onStartURIOpen = onStartURIOpen; function QueryInterface(iid) { if(iid.equals(Components.interfaces.nsISupports) || iid.equals(Components.interfaces.nsISupportsWeakReference) || iid.equals(Components.interfaces.nsIURIContentListener)) { return this; } throw Components.results.NS_ERROR_NO_INTERFACE; } function canHandleContent(contentType, isContentPreferred, desiredContentType) { if(Scholar.inArray(contentType, _desiredContentTypes)) { return true; } return false; } function doContent(contentType, isContentPreferred, request, contentHandler) { Scholar.debug("doing content for "+request.name); contentHandler.value = new Scholar.Ingester.MIMEHandler.StreamListener(request, contentType); return false; } function isPreferred(contentType, desiredContentType) { if(Scholar.inArray(contentType, _desiredContentTypes)) { return true; } return false; } function onStartURIOpen(URI) { return true; } } /* * Scholar.Ingester.MIMEHandler.StreamListener: implements nsIStreamListener and * nsIRequestObserver interfaces to download MIME types we've grabbed */ Scholar.Ingester.MIMEHandler.StreamListener = function(request, contentType) { this._request = request; this._contentType = contentType this._readString = ""; this._scriptableStream = null; this._scriptableStreamInput = null // get front window var windowWatcher = Components.classes["@mozilla.org/embedcomp/window-watcher;1"]. getService(Components.interfaces.nsIWindowWatcher); this._frontWindow = windowWatcher.activeWindow; this._frontWindow.Scholar_Ingester_Interface.Progress.show(); } Scholar.Ingester.MIMEHandler.StreamListener.prototype.QueryInterface = function(iid) { if(iid.equals(Components.interfaces.nsISupports) || iid.equals(Components.interfaces.nsIRequestObserver) || iid.equals(Components.interfaces.nsIStreamListener)) { return this; } throw Components.results.NS_ERROR_NO_INTERFACE; } Scholar.Ingester.MIMEHandler.StreamListener.prototype.onStartRequest = function(channel, context) {} /* * called when there's data available; basicallly, we just want to collect this data */ Scholar.Ingester.MIMEHandler.StreamListener.prototype.onDataAvailable = function(request, context, inputStream, offset, count) { Scholar.debug(count+" bytes available"); if(inputStream != this._scriptableStreamInput) { // get storage stream // if there's not one this._scriptableStream = Components.classes["@mozilla.org/scriptableinputstream;1"]. createInstance(Components.interfaces.nsIScriptableInputStream); this._scriptableStream.init(inputStream); this._scriptableStreamInput = inputStream; } this._readString += this._scriptableStream.read(count); } /* * called when the request is done */ Scholar.Ingester.MIMEHandler.StreamListener.prototype.onStopRequest = function(channel, context, status) { Scholar.debug("request finished"); var externalHelperAppService = Components.classes["@mozilla.org/uriloader/external-helper-app-service;1"]. getService(Components.interfaces.nsIExternalHelperAppService); // attempt to import through Scholar.Translate var translation = new Scholar.Translate("import"); translation.setLocation(this._request.name); translation.setString(this._readString); translation.setHandler("itemDone", this._frontWindow.Scholar_Ingester_Interface._itemDone); translation.setHandler("done", this._frontWindow.Scholar_Ingester_Interface._finishScraping); // attempt to retrieve translators var translators = translation.getTranslators(); if(!translators.length) { // we lied. we can't really translate this file. call // nsIExternalHelperAppService with the data this._frontWindow.Scholar_Ingester_Interface.Progress.kill(); var streamListener; if(streamListener = externalHelperAppService.doContent(this._contentType, this._request, this._frontWindow)) { // create a string input stream var inputStream = Components.classes["@mozilla.org/io/string-input-stream;1"]. createInstance(Components.interfaces.nsIStringInputStream); inputStream.setData(this._readString, this._readString.length); streamListener.onStartRequest(channel, context); streamListener.onDataAvailable(this._request, context, inputStream, 0, this._readString.length); streamListener.onStopRequest(channel, context, status); } return false; } // translate using first available translation.setTranslator(translators[0]); translation.translate(); }