// Scholar for Firefox Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new Object();

/*
 * Creates a <browser> element and appends it to the first <window> element
 * of the given window's document
 *
 * myWindow - window whose document will own the new browser
 *
 * Returns the newly created browser element
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	// Create a hidden browser
	var newHiddenBrowser = myWindow.document.createElement("browser");
	var windows = myWindow.document.getElementsByTagName("window");
	windows[0].appendChild(newHiddenBrowser);
	Scholar.debug("created hidden browser");
	return newHiddenBrowser;
}

/*
 * Removes a browser element created by createHiddenBrowser from its document
 *
 * myBrowser - browser element to remove; a missing or already detached
 *             element is ignored
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Delete a hidden browser
	// Applying the delete operator to a local variable does nothing (and is
	// a syntax error in strict mode), so detach the element from its parent
	// to actually release it
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.ProxyMonitor
//
/////////////////////////////////////////////////////////////////

// A singleton for recognizing EZProxies and converting URLs such that databases
// will work from outside them. Unfortunately, this only works with the ($495)
// EZProxy software. If there are open source alternatives, we should support
// them too.

/*
 * Precompile proxy regexps
 */
Scholar.Ingester.ProxyMonitor = new Object();
// Matches an EZProxy login URL. Groups: 1 = login URL up to and including
// "/login", 2 = proxy host, 3 = "url" or "qurl", 4 = target URL.
// The optional port may have any number of digits.
Scholar.Ingester.ProxyMonitor._ezProxyRe = new RegExp("(https?://([^/:]+)(?::[0-9]+)?/login)\\?(?:.+&)?(url|qurl)=([^&]+)");
// Matches the start of an http(s) URL. Groups: 1 = host plus optional port,
// 2 = host, 3 = ":port" when a port is present.
Scholar.Ingester.ProxyMonitor._hostRe = new RegExp("^https?://(([^/:]+)(:[0-9]+)?)");

/*
 * Returns a page's proper url, adjusting for proxying
 *
 * This is a bit of a hack, in that it offers an opportunity for spoofing. Not
 * really any way around this, but our scrapers should be sufficiently sandboxed
 * that it won't be a problem.
 *
 * url - URL of the page as seen by the browser
 *
 * Returns the target URL when url is an EZProxy login URL, the URL with the
 * proxy host replaced by the proper host when a mapping is known or has just
 * been learned, and url unchanged otherwise
 */
Scholar.Ingester.ProxyMonitor.proxyToProper = function(url) {
	var monitor = Scholar.Ingester.ProxyMonitor;
	
	var loginMatch = monitor._ezProxyRe.exec(url);
	if(loginMatch) {
		// EZProxy detected
		url = loginMatch[4];
		if(loginMatch[3] == "qurl") {
			url = unescape(url);
		}
		
		// Remember the login so that the next page load can be matched to it
		monitor._now = true;
		monitor._url = url;
		monitor._host = loginMatch[2];
		monitor._loginURL = loginMatch[1];
	} else if(monitor._now) {
		// EZProxying something
		if(url == monitor._loginURL) {
			Scholar.debug("EZProxy: detected wrong password; won't disable monitoring yet");
		} else {
			var proxiedMatch = monitor._hostRe.exec(url);
			
			// EZProxy always runs on a higher port, so only URLs with an
			// explicit port qualify; also make sure our host is the same
			// one we logged in under
			if(proxiedMatch && proxiedMatch[3] && proxiedMatch[2] == monitor._host) {
				var hostAndPort = proxiedMatch[1];
				
				// Extract host information from the URL we're proxying.
				// The target may not be an http(s) URL, so check the match
				// before reading from it.
				var properMatch = monitor._hostRe.exec(monitor._url);
				if(properMatch) {
					var properHostAndPort = properMatch[1];
					
					if(!monitor._mapFromProxy) {
						monitor._mapFromProxy = new Object();
						monitor._mapToProxy = new Object();
					}
					Scholar.debug("EZProxy: host "+hostAndPort+" is really "+properHostAndPort);
					monitor._mapFromProxy[hostAndPort] = properHostAndPort;
					monitor._mapToProxy[properHostAndPort] = hostAndPort;
					url = url.replace(hostAndPort, properHostAndPort);
				}
			}
			
			// Only the first page after a login is examined
			monitor._now = false;
		}
	} else if(monitor._mapFromProxy) {
		// EZProxy detection is active
		var hostMatch = monitor._hostRe.exec(url);
		if(hostMatch && monitor._mapFromProxy[hostMatch[1]]) {
			url = url.replace(hostMatch[1], monitor._mapFromProxy[hostMatch[1]]);
			Scholar.debug("EZProxy: proper url is "+url);
		}
	}
	
	return url;
}

/*
 * Returns a page's proxied url from the proper url
 *
 * url - proper (unproxied) URL
 *
 * Returns the URL with its host replaced by the proxy host when a mapping
 * for that host is known, and url unchanged otherwise (including when url is
 * not an http(s) URL)
 */
Scholar.Ingester.ProxyMonitor.properToProxy = function(url) {
	var monitor = Scholar.Ingester.ProxyMonitor;
	
	if(monitor._mapToProxy) {
		// EZProxy detection is active
		var hostMatch = monitor._hostRe.exec(url);
		// Guard against URLs that do not match at all, as proxyToProper does
		if(hostMatch && monitor._mapToProxy[hostMatch[1]]) {
			// Actually need to map
			url = url.replace(hostMatch[1], monitor._mapToProxy[hostMatch[1]]);
			Scholar.debug("EZProxy: proxied url is "+url);
		}
	}
	
	return url;
}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but we don't want an enormous web server running with FS and don't
// actually need one, so this version is much simpler.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// Statements are stored on this object; it starts out empty
	this.data = new Object();
}

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
/*
 * Records a statement in the model
 *
 * uri - key of the resource the statement is about
 * rdfUri - key of the property being set
 * literal - value to record
 *
 * Values are kept in an array at this.data[uri][rdfUri], so repeated calls
 * with the same uri and rdfUri accumulate rather than overwrite
 */
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
	if(!this.data[uri]) this.data[uri] = new Object();
	if(!this.data[uri][rdfUri]) {
		this.data[uri][rdfUri] = new Array();
	}
	this.data[uri][rdfUri].push(literal);
	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
}

// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}

//////////////////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Document
//
//////////////////////////////////////////////////////////////////////////////

/* Public properties:
 * browser - browser window object of document
 * model - data model for semantic scrapers
 * scraper - best scraper to use to scrape page
 * items - items returned after page is scraped
 *
 * Private properties:
 * _sandbox - sandbox for code execution
 * _appSvc - AppShellService instance
 * _hiddenBrowser - hidden browser object
 * _scrapeCallback - callback function to be executed when scraping is complete
 */

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Constructor for Document object
 *
 * browserWindow - stored as this.browser; its contentDocument.location.href
 *                 supplies the page URL
 * myWindow - stored as this.window
 */
Scholar.Ingester.Document = function(browserWindow, myWindow){
	this.scraper = null;
	this.browser = browserWindow;
	this.window = myWindow;
	this.model = new Scholar.Ingester.Model();
	// Create separate URL to account for proxies
	this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href);
	// proxiedURL is only ever set to true here; it stays undefined when the
	// URL was not changed by proxyToProper
	if(this.url != this.browser.contentDocument.location.href) {
		this.proxiedURL = true;
	}
	this.items = new Array();
	this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
	                 .getService(Ci.nsIAppShellService);
	this._generateSandbox();
}

/*
 * Retrieves the best scraper to scrape a given page
 */
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
	Scholar.debug("Retrieving scrapers for "+this.url);
	// Rows whose scraperDetectCode is NULL sort first
	var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
	var scrapers = Scholar.DB.query(sql);
	// NOTE(review): the source text is damaged on the next line. Everything
	// between the loop condition of retrieveScraper and a later ">= 4"
	// comparison appears to be missing, including the rest of this function
	// and the beginning of the function that the remaining code belongs to
	// (presumably _updateDatabase, judging by the error message in the catch
	// block below). Restore the missing text from version control before
	// relying on anything below this point.
	for(var i=0; i= 4) {
					// A date that is exactly YYYY-MM-DD contributes its
					// first four characters as the year
					var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
					if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
						newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
					} else {
						// Otherwise use four digits at the very end of the
						// date string, if there are any
						var m;
						var yearRe = /[0-9]{4}$/;
						if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
							newItem.setField("year", m[0]);
						}
					}
				}
			}
			
			// Handle ISBNs/ISSNs/Call Numbers
			if(this.model.data[uri][prefixDC + 'identifier']) {
				var oldIndex = -1;
				var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID);
				var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID);
				// NOTE(review): no var declaration for i or prefix is visible
				// in this part of the function; confirm they are declared in
				// the missing text, otherwise they become globals
				for(i in this.model.data[uri][prefixDC + 'identifier']) {
					// The prefix is the text before the first space
					prefix = this.model.data[uri][prefixDC + 'identifier'][i].substr(0, this.model.data[uri][prefixDC + 'identifier'][i].indexOf(" "));
					// Only the first ISSN and the first ISBN are used; the
					// value starts after the five-character "ISSN "/"ISBN "
					// prefix
					if(needISSN && prefix == 'ISSN') {
						newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5));
						needISSN = false;
					}
					if(needISBN && prefix == 'ISBN') {
						newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5));
						needISBN = false;
					}
					// Keep the identifier whose prefix has the highest index
					// in callNumbers (defined outside the visible text)
					// NOTE(review): a newIndex of 0 fails the truthiness test
					// below; confirm what Scholar.arraySearch returns for a
					// match at the first position and for no match
					var newIndex = Scholar.arraySearch(prefix, callNumbers);
					if(newIndex && newIndex > oldIndex) {
						oldIndex = newIndex;
						var callNumber = this.model.data[uri][prefixDC + 'identifier'][i].substring(prefix.length+1);
					}
				}
				if(callNumber) {
					newItem.setField("callNumber", callNumber);
				}
			}
			
			// Hand the remaining properties to _associateRDF, pairing each
			// model property with the item field name given next to it
			this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID);
			
			this.items.push(newItem);
		}
	} catch(ex) {
		// Errors are logged rather than rethrown
		Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
	}
}