// Firefox Scholar Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but seeing as we don't really want an enormous web server running with FS,
// and we don't actually need that, it's much simpler here.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// data maps subject URI -> predicate (RDF) URI -> array of literal values
	this.data = {};
}

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
// Records the statement (uri, rdfUri, literal) in the model, creating the
// per-URI and per-predicate containers lazily.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
	// Object/array literals instead of new Object()/new Array()
	if(!this.data[uri]) this.data[uri] = {};
	if(!this.data[uri][rdfUri]) {
		this.data[uri][rdfUri] = [];
	}
	this.data[uri][rdfUri].push(literal);
	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
}

// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Utilities
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function(hiddenBrowser) {
	this.hiddenBrowser = hiddenBrowser;
}

// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
	Scholar.debug(msg, 4);
}

// Trims a string, chopping off leading/trailing newlines, carriage returns,
// tabs, spaces, and non-breaking spaces (U+00A0)
Scholar.Ingester.Utilities.prototype.trimString = function(s) {
	var i = 0;
	var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */;
	// Advance past leading whitespace
	while (i < s.length) {
		var c = s.charAt(i);
		if (spaceChars.indexOf(c) < 0) {
			break;
		}
		i++;
	}
	s = s.substring(i);
	// Back up over trailing whitespace
	i = s.length;
	while (i > 0) {
		var c = s.charAt(i - 1);
		if (spaceChars.indexOf(c) < 0) {
			break;
		}
		i--;
	}
	return s.substring(0, i);
}

// Takes an XPath query and returns the results as an array of DOM nodes
Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
	var elmts = [];
	var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var elmt = iterator.iterateNext();
	var i = 0;
	while (elmt) {
		elmts[i++] = elmt;
		elmt = iterator.iterateNext();
	}
	return elmts;
}

// Loads a single document for a scraper, running succeeded() on success or
// failed() on failure
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
	Scholar.debug("loadDocument called");
	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
}

// Downloads and processes documents with processor()
// browser - a browser object
// firstDoc - the first document to process with the processor (if null,
//            first document is processed without processor)
// urls - an array of URLs to load
// processor - a function to execute to process each document
// done - a function to execute when all document processing is complete
// exception - a function to execute if an exception occurs (exceptions are
//             also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
	var hiddenBrowser = this.hiddenBrowser;
	Scholar.debug("processDocuments called");
	try {
		// Nothing queued: process the first document (if any) and finish
		if (urls.length == 0) {
			if (firstDoc) {
				processor(firstDoc, done);
			} else {
				done();
			}
			return;
		}
		var urlIndex = -1;
		// Loads the next URL from the queue into the hidden browser, or
		// schedules done() once the queue is exhausted
		var doLoad = function() {
			urlIndex++;
			if (urlIndex < urls.length) {
				try {
					var url = urls[urlIndex];
					Scholar.debug("loading "+url);
					hiddenBrowser.loadURI(url);
				} catch (e) {
					Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
					exception(e);
				}
			} else {
				// NOTE(review): setTimeout is invoked on the browser element
				// itself rather than a window — confirm this element exposes it
				hiddenBrowser.setTimeout(done, 10);
			}
		};
		// Fired on DOMContentLoaded of the hidden browser; hands the loaded
		// document to processor().
		// NOTE(review): the listener is removed after the first load and
		// processor() is called without a continuation here, unlike the
		// firstDoc path above — verify multi-URL queues advance as intended
		var onLoad = function() {
			Scholar.debug("onLoad called");
			if(hiddenBrowser.id == "scholar-hidden-browser") {
				hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true);
				try {
					// Wrap the hidden browser's document/window in a plain
					// object before handing it to the processor
					var newHiddenBrowser = {};
					Scholar.debug("new hidden browser");
					newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
					newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
					Scholar.debug("added attributes");
					processor(newHiddenBrowser);
					Scholar.debug("called processor");
				} catch (e) {
					Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
					exception(e);
				}
			}
		};
		var init = function() {
			Scholar.debug("init called");
			hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
			if (firstDoc) {
				Scholar.debug("processing");
				processor(firstDoc, doLoad);
			} else {
				Scholar.debug("doing load");
				doLoad();
			}
		}
		init();
	} catch (e) {
		Scholar.debug("processDocuments: " + e);
		exception(e);
	}
}

// Looks for links in a document whose href contains a certain substring;
// returns the matching hrefs, de-duplicated, in reverse document order
// (unshift prepends)
Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
	var urls = [];
	// FIX: use a plain object (not an array) as the seen-href lookup table
	var addedURLs = {};
	var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var aElement = aElements.iterateNext();
	while (aElement) {
		var href = aElement.href;
		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
			urls.unshift(href);
			addedURLs[href] = true;
		}
		aElement = aElements.iterateNext();
	}
	return urls;
}

// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on getLLsFromAddresses) since these are probably not
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.

// These functions are for use by importMARCRecord.
// They're private, because, while they are useful, it's also nice if as many
// of our scrapers as possible are PiggyBank compatible, and if our scrapers
// used these functions, that would break compatibility

// Strips leading and trailing whitespace/punctuation ( . , / [ ] : ) from a
// string and collapses runs of spaces; returns the cleaned string
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
	// FIX: global flag so every run of spaces is collapsed, not just the first
	return author.replace(/ +/g, ' ');
}

// Cleans a MARC author field: strips punctuation, restores the period after a
// trailing single-letter initial, and converts "Last, First" to "First Last"
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
	// FIX: global flag so every run of spaces is collapsed, not just the first
	author = author.replace(/ +/g, ' ');
	// Add period for initials (next-to-last character is a space, so the
	// name ends in a lone letter whose period was stripped above)
	if(author.substring(author.length-2, author.length-1) == " ") {
		author += ".";
	}
	var splitNames = author.split(', ');
	if(splitNames.length > 1) {
		author = splitNames[1]+' '+splitNames[0];
	}
	return author;
}

// Strips punctuation and returns the first space-delimited token (e.g. the
// bare ISBN/ISSN number); returns undefined if the regexp somehow fails
Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
	var regexp = /^[^ ]*/;
	var m = regexp.exec(author);
	if(m) {
		return m[0];
	}
}

// Copies subfield `part` (default 'a') of every instance of MARC field
// fieldNo found in record into model under predicate rdfUri, optionally
// passing each value through execMe() and prepending prefix; for field 245
// the subtitle (subfield b) is appended to the title. Returns the model.
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
	if(!part) {
		part = 'a';
	}
	var field = record.get_field_subfields(fieldNo);
	if(field) {
		// FIX: log after the null check so a missing field can't throw here
		Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
		// FIX: declare the loop variable (was an implicit global)
		for(var i in field) {
			if(field[i][part]) {
				var value = field[i][part];
				Scholar.debug(value);
				if(fieldNo == '245') {
					// special case - title + subtitle
					if(field[i]['b']) {
						value += ' '+field[i]['b'];
					}
				}
				if(execMe) {
					value = execMe(value);
				}
				if(prefix) {
					value = prefix + value;
				}
				model.addStatement(uri, rdfUri, value);
			}
		}
	}
	return model;
}

// This is an extension to PiggyBank's architecture.
// It's here so that we don't need an enormous library for each scraper that
// wants to use MARC records

// Maps a parsed MARC record's fields onto Dublin Core (and a few dummy) RDF
// predicates in model under the given uri. The model is mutated in place via
// _MARCAssociateField/addStatement; nothing is returned.
Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
	var prefixDC = 'http://purl.org/dc/elements/1.1/';
	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
	// Extract ISBNs
	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
	// Extract ISSNs
	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
	// Extract creators (personal, corporate, meeting, and uniform-title
	// main entries; 7xx equivalents become contributors)
	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
	model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
	model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) {
		// some LOC entries have no listed author, but have the author
		// in the person subject field as the first entry
		var field = record.get_field_subfields('600');
		// FIX: guard against a missing 600 field as well as an empty one
		if(field && field[0]) {
			model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
		}
	}
	// Extract title (245; _MARCAssociateField appends subfield b, the subtitle)
	model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString);
	// Extract edition
	model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'edition', this._MARCCleanString);
	// Extract place info
	model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
	// Extract publisher info
	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
	// Extract series
	model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
}

// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
// accessed outside the sandbox, and even if it could, it wouldn't let scripts
// access across domains, so everything's replicated here.
Scholar.Ingester.HTTPUtilities = function(contentWindow) {
	this.window = contentWindow;
}

// Shared implementation behind doGet/doPost/doOptions (private): fires an
// async XMLHttpRequest from the content window, forcing a text/plain MIME
// type, and routes every readyState change to stateChange()
Scholar.Ingester.HTTPUtilities.prototype._doRequest = function(method, url, body, onStatus, onDone) {
	var xmlhttp = new this.window.XMLHttpRequest();
	xmlhttp.open(method, url, true);
	xmlhttp.overrideMimeType("text/plain");
	var me = this;
	xmlhttp.onreadystatechange = function() {
		me.stateChange(xmlhttp, onStatus, onDone);
	};
	xmlhttp.send(body);
}

Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
	this._doRequest('GET', url, null, onStatus, onDone);
}

Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
	this._doRequest('POST', url, body, onStatus, onDone);
}

Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
	this._doRequest('OPTIONS', url, body, onStatus, onDone);
}

// Possible point of failure; for some reason, this used to be a separate
// class, so make sure it works
// Dispatches XMLHttpRequest readyState changes to the callbacks supplied to
// doGet/doPost/doOptions: onStatus(status, statusText, xmlhttp) on a
// non-200 response (after which the request is aborted), and
// onDone(responseText, xmlhttp) when the download completes. Exceptions
// thrown by either callback are logged at level 2, not rethrown.
Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
	switch (xmlhttp.readyState) {
		// Request not yet made
		case 1:
			break;
		// Contact established with server but nothing downloaded yet
		case 2:
			try {
				// Check for HTTP status 200; anything else is reported to
				// onStatus and the request is aborted
				if (xmlhttp.status != 200) {
					if (onStatus) {
						onStatus(
							xmlhttp.status,
							xmlhttp.statusText,
							xmlhttp
						);
						xmlhttp.abort();
					}
				}
			} catch (e) {
				Scholar.debug(e, 2);
			}
			break;
		// Called multiple times while download is in progress
		case 3:
			break;
		// Download complete
		case 4:
			try {
				if (onDone) {
					onDone(xmlhttp.responseText, xmlhttp);
				}
			} catch (e) {
				Scholar.debug(e, 2);
			}
			break;
	}
}

//////////////////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Document
//
//////////////////////////////////////////////////////////////////////////////

/* Public properties:
 * browser - browser window object of document
 * model - data model for semantic scrapers
 * scraper - best scraper to use to scrape page
 *
 * Private properties:
 * _sandbox - sandbox for code execution
 */

//////////////////////////////////////////////////////////////////////////////
//
// Public Scholar.Ingester.Document methods
//
//////////////////////////////////////////////////////////////////////////////

/*
 * Constructor for Document object
 */
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
	this.browser = browserWindow;
	// Fresh RDF model for this page's scraped data
	this.model = new Scholar.Ingester.Model();
	this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
		.getService(Ci.nsIAppShellService);
	this.scraper = null;
	this.hiddenBrowser = hiddenBrowser;
	// Build the sandbox scraper code will execute in
	this._generateSandbox();
}

/*
 * Retrieves the best scraper to scrape a given page
 * NOTE(review): definition continues beyond this view and is truncated here
 */
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
	Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href);
	// Scrapers with detect code sort first so they are tried before
	// fallback scrapers
	var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC';
	var scrapers = Scholar.DB.query(sql);
	for(var i=0; i