// Scholar for Firefox Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new Object();

/*
 * Creates a <browser> element and appends it to the first <window> element
 * of the given window's document
 *
 * myWindow - window whose document will host the new browser
 *
 * Returns the newly created browser element; throws an Error if the document
 * contains no <window> element to attach it to
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	// Create a hidden browser
	var newHiddenBrowser = myWindow.document.createElement("browser");
	var windows = myWindow.document.getElementsByTagName("window");
	if(!windows.length) {
		throw new Error("Scholar.Ingester.createHiddenBrowser: no window element to attach the browser to");
	}
	windows[0].appendChild(newHiddenBrowser);
	Scholar.debug("created hidden browser");
	return newHiddenBrowser;
}

/*
 * Detaches a hidden browser from the document it was appended to
 *
 * myBrowser - browser element to remove; a null or already detached browser
 *             is tolerated
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Delete a hidden browser. The element has to be removed from its parent:
	// applying the delete operator to the local variable leaves the element in
	// the document (and is a syntax error in strict code)
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}

/*
 * Operates the ingester given only a URL
 * url - URL to scrape
 * complete - callback function to be executed if page grab completes
 *            (will be passed document object; obj.items contains array of
 *            *unsaved* items scraped; empty array indicates unscrapable page)
 * error - callback function to be executed if an error occurred loading page
 *         (skipped if no function is supplied)
 * myWindow - optional argument indicating window to attach a dialog to. if no
 *            window is given, Firefox Scholar uses the hidden DOM window and
 *            will simply avoid scraping multiple pages
 */
Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
	var isHidden = false;
	if(!myWindow) {
		// Assign to the parameter directly; re-declaring it with var adds nothing
		myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"]
		                     .getService(Components.interfaces.nsIAppShellService)
		                     .hiddenDOMWindow;
		isHidden = true;
	}
	
	var succeeded = function(browser) {
		var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden);
		myDoc.retrieveScraper();
		if(myDoc.scraper) {
			// The browser must stay alive until scraping has finished
			myDoc.scrapePage(function(scrapedDoc) {
				Scholar.Ingester.deleteHiddenBrowser(browser);
				complete(scrapedDoc);
			});
		} else {
			// No scraper matched: report the document with its empty items array
			Scholar.Ingester.deleteHiddenBrowser(browser);
			complete(myDoc);
		}
	}
	
	var failed = function() {
		Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url);
		if(error) {
			error();
		}
	}
	
	Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true);
}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.ProxyMonitor
//
/////////////////////////////////////////////////////////////////

// A singleton for recognizing EZProxies and converting URLs such that databases
// will work from outside them. Unfortunately, this only works with the ($495)
// EZProxy software. If there are open source alternatives, we should support
// them too.
/*
 * Precompile proxy regexps
 */
Scholar.Ingester.ProxyMonitor = new function() {
	// Captures the name ("url" or "qurl") and the value of the query parameter
	// in the request URL. Built with the constructor instead of the deprecated
	// RegExp.prototype.compile()
	var _ezProxyRe = new RegExp("\\?(?:.+&)?(url|qurl)=([^&]+)", "i");
	var ioService = Components.classes["@mozilla.org/network/io-service;1"]
	                          .getService(Components.interfaces.nsIIOService);
	// true once the observer has been registered
	var on = false;
	// hostPort of proxied host -> hostPort of proper host, and the reverse;
	// both stay null until the first EZproxy redirect has been seen
	var _mapFromProxy = null;
	var _mapToProxy = null;
	
	this.init = init;
	this.proxyToProper = proxyToProper;
	this.properToProxy = properToProxy;
	this.observe = observe;
	
	/*
	 * Registers this object as an observer of "http-on-examine-response";
	 * calls after the first one do not register again
	 */
	function init() {
		if(!on) {
			var observerService = Components.classes["@mozilla.org/observer-service;1"]
			                                .getService(Components.interfaces.nsIObserverService);
			observerService.addObserver(this, "http-on-examine-response", false);
		}
		on = true;
	}
	
	/*
	 * Observer callback. Records a proxy <-> proper host mapping when the
	 * response is a 302 from a server identifying itself as "EZproxy", the
	 * request URL carries a url/qurl parameter, and the redirect goes to the
	 * same host on a different port
	 *
	 * channel - the observed subject; queried for nsIHttpChannel
	 */
	function observe(channel) {
		channel.QueryInterface(Components.interfaces.nsIHttpChannel);
		
		if(_getResponseHeader(channel, "Server") != "EZproxy") {
			return;
		}
		
		// We're connected to an EZproxy
		if(channel.responseStatus != "302") {
			return;
		}
		
		Scholar.debug(channel.URI.spec);
		// We should be able to scrape the URL out of this
		var m = _ezProxyRe.exec(channel.URI.spec);
		if(!m) {
			return;
		}
		
		// Found URL
		var variable = m[1];
		var properURL = m[2];
		if(variable.toLowerCase() == "qurl") {
			properURL = unescape(properURL);
		}
		var properURI = _parseURL(properURL);
		if(!properURI) {
			return;
		}
		
		// Get the new URL
		var newURL = _getResponseHeader(channel, "Location");
		if(!newURL) {
			return;
		}
		var newURI = _parseURL(newURL);
		if(!newURI) {
			return;
		}
		
		if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
			// Different ports but the same server means EZproxy active
			
			Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
			// Initialize variables here so people who never use EZProxies
			// don't get the (very very minor) speed hit
			if(!_mapFromProxy) {
				_mapFromProxy = new Object();
				_mapToProxy = new Object();
			}
			_mapFromProxy[newURI.hostPort] = properURI.hostPort;
			_mapToProxy[properURI.hostPort] = newURI.hostPort;
		}
	}
	
	/*
	 * Returns a page's proper url, adjusting for proxying
	 */
	function proxyToProper(url) {
		return _swapHost(url, _mapFromProxy, "proper");
	}
	
	/*
	 * Returns a page's proxied url from the proper url
	 */
	function properToProxy(url) {
		return _swapHost(url, _mapToProxy, "proxied");
	}
	
	/*
	 * Replaces the host:port part of url with its counterpart from map and
	 * logs the result; returns url untouched when map is still null, the url
	 * cannot be parsed, or its host:port has no entry in map
	 *
	 * label - word used in the debug message ("proper" or "proxied")
	 */
	function _swapHost(url, map, label) {
		if(!map) {
			// EZProxy detection has not recorded anything yet
			return url;
		}
		
		var uri = _parseURL(url);
		if(!uri) {
			return url;
		}
		
		var hostPort;
		try {
			hostPort = uri.hostPort;
		} catch(e) {
			// NOTE(review): nsIURI is expected to throw here for URIs without
			// a host part (about:, data:, ...); such URLs cannot be proxied
			return url;
		}
		
		// hasOwnProperty keeps inherited Object properties from being
		// mistaken for recorded hosts
		if(map.hasOwnProperty(hostPort) && map[hostPort]) {
			url = url.replace(hostPort, map[hostPort]);
			Scholar.debug("EZProxy: "+label+" url is "+url);
		}
		return url;
	}
	
	/*
	 * Reads a response header, returning null instead of throwing when the
	 * header cannot be read (getResponseHeader raises an exception for a
	 * header that was not sent)
	 */
	function _getResponseHeader(channel, name) {
		try {
			return channel.getResponseHeader(name);
		} catch(e) {
			return null;
		}
	}
	
	/*
	 * Parses a url into components (hostPort, port, host, and spec)
	 *
	 * Returns an nsIURI, or null if the url cannot be parsed; every caller
	 * checks for a false value, so the exception newURI raises for a
	 * malformed url is converted here
	 */
	function _parseURL(url) {
		// create an nsIURI (not sure if this is faster than a regular
		// expression, but it's at least more kosher)
		try {
			return ioService.newURI(url, null, null);
		} catch(e) {
			return null;
		}
	}
}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but we don't really want an enormous web server running with FS, and we
// don't actually need one, so this version is much simpler.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// Statements are stored on this object by addStatement()
	this.data = new Object();
}

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
/*
 * NOTE(review): the three physical lines below each hold several collapsed
 * source lines and are reproduced unchanged. In order, they contain:
 *
 * Scholar.Ingester.Model.prototype.addStatement(uri, rdfUri, literal)
 *   Appends literal to the array this.data[uri][rdfUri], creating the object
 *   for uri and the array for rdfUri on first use, then logs the statement
 *   through Scholar.debug.
 *
 * Scholar.Ingester.Model.prototype.addTag / getRepository / detachRepository
 *   Empty functions: they do nothing and return undefined.
 *
 * Scholar.Ingester.Document(myBrowser, myWindow, isHidden)
 *   Constructor. Stores its arguments as this.browser, this.window and
 *   this.isHidden, sets scraper and type to null, creates a new
 *   Scholar.Ingester.Model, sets this.url to the result of
 *   ProxyMonitor.proxyToProper() for the browser document's location.href,
 *   sets this.proxiedURL to true only when that result differs from
 *   location.href, sets this.items to an empty array and calls
 *   this._generateSandbox().
 *
 * Scholar.Ingester.Document.prototype.retrieveScraper()
 *   Logs the url and queries every row of the scrapers table, rows with a
 *   NULL scraperDetectCode first. THE TEXT IS TRUNCATED inside the loop
 *   header: "for(var i=0; i" is followed directly by "= 4) {". The rest of
 *   this method, whatever definitions followed it, and the beginning of the
 *   function described next are missing from this copy and have to be
 *   restored from another copy before the file can parse.
 *
 * Tail of the function whose catch block reports
 * 'Error in Scholar.Ingester.Document._updateDatabase'
 *   - sets the "year" field from the first prefixDC+'date' value: its first
 *     four characters if the value is a YYYY-MM-DD date, otherwise four
 *     digits at the end of the value, if present;
 *   - for each prefixDC+'identifier' value, takes the text before the first
 *     space as a prefix, stores the text after "ISSN " / "ISBN " in the
 *     matching field the first time each is seen and valid for typeID, and
 *     picks a call number according to the prefix's position in callNumbers;
 *   - passes publication, volume, number, pages, publisher, date, edition
 *     (from hasVersion), series and place to this._associateRDF();
 *   - appends newItem to this.items.
 *   uri, newItem, typeID, prefixDC, prefixDummy and callNumbers are not
 *   defined in the visible text; presumably in the missing part - TODO confirm.
 *
 * NOTE(review): "if(newIndex && newIndex > oldIndex)" rejects a newIndex of
 * 0. If Scholar.arraySearch returns a zero-based index, the first entry of
 * callNumbers can never be chosen - confirm against arraySearch.
 * NOTE(review): "i" and "prefix" are assigned in the identifier loop without
 * a visible var; unless the missing part declares them they are implicit
 * globals - confirm.
 */
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) { if(!this.data[uri]) this.data[uri] = new Object(); if(!this.data[uri][rdfUri]) { this.data[uri][rdfUri] = new Array(); } this.data[uri][rdfUri].push(literal); Scholar.debug(rdfUri+" for "+uri+" is "+literal); } // Additional functions added for compatibility purposes only // No idea if any scraper actually uses these, but just in case, they're // implemented so as not to throw an exception Scholar.Ingester.Model.prototype.addTag = function() {} Scholar.Ingester.Model.prototype.getRepository = function() {} Scholar.Ingester.Model.prototype.detachRepository = function() {} ////////////////////////////////////////////////////////////////////////////// // // Scholar.Ingester.Document // ////////////////////////////////////////////////////////////////////////////// /* Public properties: * browser - browser window object of document * model - data model for semantic scrapers * scraper - best scraper to use to scrape page * items - items returned after page is scraped * window - window, for creating new hidden browsers * url - url, as passed through proxy system * type - type of item that will be scraped (set after retrieveScraper() is * called) * * Private properties: * _sandbox - sandbox for code execution * _scrapeCallback - callback function to be executed when scraping is complete */ ////////////////////////////////////////////////////////////////////////////// // // Public Scholar.Ingester.Document methods // ////////////////////////////////////////////////////////////////////////////// /* * Constructor for Document object */ Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) { this.browser = myBrowser; this.window = myWindow; this.isHidden = isHidden; this.scraper = this.type = null; this.model = new Scholar.Ingester.Model(); // Create separate URL to account for proxies this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href); 
if(this.url != this.browser.contentDocument.location.href) { this.proxiedURL = true; } this.items = new Array(); this._generateSandbox(); } /* * Retrieves the best scraper to scrape a given page */ Scholar.Ingester.Document.prototype.retrieveScraper = function() { Scholar.debug("Retrieving scrapers for "+this.url); var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC'; var scrapers = Scholar.DB.query(sql); for(var i=0; i= 4) { var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) { newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); } else { var m; var yearRe = /[0-9]{4}$/; if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) { newItem.setField("year", m[0]); } } } } // Handle ISBNs/ISSNs/Call Numbers if(this.model.data[uri][prefixDC + 'identifier']) { var oldIndex = -1; var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID); var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID); for(i in this.model.data[uri][prefixDC + 'identifier']) { prefix = this.model.data[uri][prefixDC + 'identifier'][i].substr(0, this.model.data[uri][prefixDC + 'identifier'][i].indexOf(" ")); if(needISSN && prefix == 'ISSN') { newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); needISSN = false; } if(needISBN && prefix == 'ISBN') { newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); needISBN = false; } var newIndex = Scholar.arraySearch(prefix, callNumbers); if(newIndex && newIndex > oldIndex) { oldIndex = newIndex; var callNumber = this.model.data[uri][prefixDC + 'identifier'][i].substring(prefix.length+1); } } if(callNumber) { newItem.setField("callNumber", callNumber); } } this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID); this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID); 
this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID); this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID); this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID); this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID); this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID); this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID); this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID); this.items.push(newItem); } } catch(ex) { Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex); } }