// Firefox Scholar Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java
// behind an enormous embedded web server; we don't actually need any of
// that, so this version is a much simpler in-memory map of
// subject URI -> { predicate URI -> literal value }.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// data[uri][rdfUri] = literal value
	this.data = {};
}

// Adds a statement (subject URI, predicate URI, literal value) to the model.
//
// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
	if(!this.data[uri]) this.data[uri] = {};
	this.data[uri][rdfUri] = literal;
	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
}

// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Utilities
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction.
Most code here was stolen directly from the Piggy Bank project. Scholar.Ingester.Utilities = function() {} // Adapter for Piggy Bank function to print debug messages; log level is // fixed at 4 (could change this) Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) { Scholar.debug(msg, 4); } // Appears to trim a string, chopping of newlines/spacing Scholar.Ingester.Utilities.prototype.trimString = function(s) { var i = 0; var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */; while (i < s.length) { var c = s.charAt(i); if (spaceChars.indexOf(c) < 0) { break; } i++; } s = s.substring(i); i = s.length; while (i > 0) { var c = s.charAt(i - 1); if (spaceChars.indexOf(c) < 0) { break; } i--; } return s.substring(0, i); } // Takes an XPath query and returns the results Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { var elmts = []; var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); var elmt = iterator.iterateNext(); var i = 0; while (elmt) { elmts[i++] = elmt; elmt = iterator.iterateNext(); } return elmts; } // Loads a single document for a scraper, running succeeded() on success or // failed() on failure Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) { this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); } // Downloads and processes documents with processor() // browser - a browser object // firstDoc - the first document to process with the processor (if null, // first document is processed without processor) // urls - an array of URLs to load // processor - a function to execute to process each document // done - a function to execute when all document processing is complete // exception - a function to execute if an exception occurs (exceptions are // also logged in the Firefox Scholar log) Scholar.Ingester.Utilities.prototype.processDocuments = 
function(browser, firstDoc, urls, processor, done, exception) { try { if (urls.length == 0) { if (firstDoc) { processor(firstDoc, done); } else { done(); } return; } var urlIndex = -1; var doLoad = function() { urlIndex++; if (urlIndex < urls.length) { try { var url = urls[urlIndex]; var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser"); b.loadURI(url); } catch (e) { exception(e); Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2); } } else { window.setTimeout(done, 10); } }; var onLoad = function() { try { var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser; processor(b.contentDocument, doLoad); } catch (e) { exception(e); Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); } }; var init = function() { var listener; listener.onStateChange = function(webProgress, request, stateFlags, status) { if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 && request.name == urls[urlIndex]) { try { Scholar.Ingester.progressDialog.setTimeout(onLoad, 10); } catch (e) { exception(e); Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2); } } }; var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser"); tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS); if (firstDoc) { processor(firstDoc, doLoad); } else { doLoad(); } } w.addEventListener("load", init, false); } catch (e) { exception(e); PB_Debug.print("processDocuments: " + e); } } // Appears to look for links in a document containing a certain substring Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) { var urls = []; var addedURLs = []; var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); var aElement = aElements.iterateNext(); while (aElement) { var href = aElement.href; if 
(href.indexOf(substring) >= 0 && !(addedURLs[href])) { urls.unshift(href); addedURLs[href] = true; } aElement = aElements.iterateNext(); } return urls; } // For now, we're going to skip the getLLsFromAddresses function (which gets // latitude and longitude pairs from a series of addresses, but requires the // big mess of Java code that is the Piggy Bank server) and the geoHelper // tools (which rely on getLLsFromAddresses) since these are probably not // essential components for Scholar and would take a great deal of effort to // implement. We can, however, always implement them later. // It looks like these are simple front-ends for XMLHttpRequest. They're a // component of the Piggy Bank API, so they're implemented here. Scholar.Ingester.Utilities.HTTPUtilities = function() {} Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) { var xmlhttp = new XMLHttpRequest(); xmlhttp.open('GET', url, true); xmlhttp.overrideMimeType("text/xml"); xmlhttp.onreadystatechange = function() { Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone); }; xmlhttp.send(null); } Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) { var xmlhttp = new XMLHttpRequest(); xmlhttp.open('POST', url, true); xmlhttp.overrideMimeType("text/xml"); xmlhttp.onreadystatechange = function() { Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone); }; xmlhttp.send(body); } Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) { var xmlhttp = new XMLHttpRequest(); xmlhttp.open('OPTIONS', url, true); xmlhttp.overrideMimeType("text/xml"); xmlhttp.onreadystatechange = function() { Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone); }; xmlhttp.send(body); } // Possible point of failure; for some reason, this used to be a separate // class, so make sure it works 
Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) { switch (xmlhttp.readyState) { // Request not yet made case 1: break; // Contact established with server but nothing downloaded yet case 2: try { // Check for HTTP status 200 if (xmlhttp.status != 200) { if (onStatus) { onStatus( xmlhttp.status, xmlhttp.statusText, xmlhttp ); xmlhttp.abort(); } } } catch (e) { Scholar.debug(e, 2); } break; // Called multiple while downloading in progress case 3: break; // Download complete case 4: try { if (onDone) { onDone(xmlhttp.responseText, xmlhttp); } } catch (e) { Scholar.debug(e, 2); } break; } } ////////////////////////////////////////////////////////////////////////////// // // Scholar.Ingester.Document // ////////////////////////////////////////////////////////////////////////////// /* Public properties: * browser - browser window object of document * model - data model for semantic scrapers * scraper - best scraper to use to scrape page * * Private properties: * _sandbox - sandbox for code execution */ ////////////////////////////////////////////////////////////////////////////// // // Public Scholar.Ingester.Document methods // ////////////////////////////////////////////////////////////////////////////// /* * Constructor for Document object */ Scholar.Ingester.Document = function(browserWindow){ this.browser = browserWindow; this.scraper = null this.model = new Scholar.Ingester.Model(); this._generateSandbox(); } /* * Retrieves the best scraper to scrape a given page */ Scholar.Ingester.Document.prototype.retrieveScraper = function() { Scholar.debug("Retrieving scrapers for "+this.browser.contentDocument.location.href); var sql = 'SELECT * FROM scrapers ORDER BY scraperDetectCode IS NULL DESC'; var scrapers = Scholar.DB.query(sql); for(var i=0; i