// Scholar for Firefox Ingester
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

Scholar.Ingester = new function() {}

/*
 * Creates a hidden <browser> element for off-screen page loading (used by
 * Scholar.Ingester.Utilities.processDocuments) and appends it to the first
 * <window> element of myWindow's document.
 *
 * Returns the new browser element. Callers are responsible for disposing of
 * it with Scholar.Ingester.deleteHiddenBrowser() when finished.
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	// Create a hidden browser
	var newHiddenBrowser = myWindow.document.createElement("browser");
	var windows = myWindow.document.getElementsByTagName("window");
	windows[0].appendChild(newHiddenBrowser);
	Scholar.debug("created hidden browser");
	return newHiddenBrowser;
}

/*
 * Disposes of a hidden browser created by createHiddenBrowser().
 *
 * FIX: the previous implementation used "delete myBrowser", which only
 * deletes the local parameter binding and leaves the <browser> element
 * attached to the window's DOM — leaking one element (and its loaded
 * document) per call. The element must be detached from its parent node
 * for it to actually go away.
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Detach the hidden browser from the DOM so it can be destroyed
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}

/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
//
/////////////////////////////////////////////////////////////////

// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but seeing as we don't really want an enormous web server running with FS,
// but we don't actually need that, so it's much simpler.
//
// The Java version of this class can be viewed at
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
Scholar.Ingester.Model = function() {
	// Maps subject URI -> predicate URI -> array of literal values
	this.data = new Object();
}

// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) { if(!this.data[uri]) this.data[uri] = new Object(); if(!this.data[uri][rdfUri]) { this.data[uri][rdfUri] = new Array(); } this.data[uri][rdfUri].push(literal); Scholar.debug(rdfUri+" for "+uri+" is "+literal); } // Additional functions added for compatibility purposes only // No idea if any scraper actually uses these, but just in case, they're // implemented so as not to throw an exception Scholar.Ingester.Model.prototype.addTag = function() {} Scholar.Ingester.Model.prototype.getRepository = function() {} Scholar.Ingester.Model.prototype.detachRepository = function() {} ///////////////////////////////////////////////////////////////// // // Scholar.Ingester.Utilities // ///////////////////////////////////////////////////////////////// // Scholar.Ingester.Utilities class, a set of methods to assist in data // extraction. Most code here was stolen directly from the Piggy Bank project. Scholar.Ingester.Utilities = function(myWindow) { this.window = myWindow; } // Adapter for Piggy Bank function to print debug messages; log level is // fixed at 4 (could change this) Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) { Scholar.debug(msg, 4); } // Appears to trim a string, chopping of newlines/spacing Scholar.Ingester.Utilities.prototype.trimString = function(s) { var i = 0; var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */; while (i < s.length) { var c = s.charAt(i); if (spaceChars.indexOf(c) < 0) { break; } i++; } s = s.substring(i); i = s.length; while (i > 0) { var c = s.charAt(i - 1); if (spaceChars.indexOf(c) < 0) { break; } i--; } return s.substring(0, i); } // Takes an XPath query and returns the results Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { var elmts = []; var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); var elmt = 
iterator.iterateNext(); var i = 0; while (elmt) { elmts[i++] = elmt; elmt = iterator.iterateNext(); } return elmts; } // Loads a single document for a scraper, running succeeded() on success or // failed() on failure Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) { Scholar.debug("loadDocument called"); this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); } // Downloads and processes documents with processor() // browser - a browser object // firstDoc - the first document to process with the processor (if null, // first document is processed without processor) // urls - an array of URLs to load // processor - a function to execute to process each document // done - a function to execute when all document processing is complete // exception - a function to execute if an exception occurs (exceptions are // also logged in the Scholar for Firefox log) Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); var myWindow = this.window; var prevUrl, url; Scholar.debug("processDocuments called"); try { if (urls.length == 0) { if(firstDoc) { processor(firstDoc, done); } else { done(); } return; } var urlIndex = -1; var doLoad = function() { urlIndex++; if (urlIndex < urls.length) { try { url = urls[urlIndex]; Scholar.debug("loading "+url); hiddenBrowser.loadURI(url); } catch (e) { Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2); exception(e); } } else { hiddenBrowser.removeEventListener("load", onLoad, true); Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); done(); } }; var onLoad = function() { Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times prevUrl = hiddenBrowser.contentDocument.location.href; try { var 
newHiddenBrowser = new Object(); newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; processor(newHiddenBrowser); } catch (e) { Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); exception(e); } doLoad(); } }; var init = function() { Scholar.debug("init called"); hiddenBrowser.addEventListener("load", onLoad, true); if (firstDoc) { Scholar.debug("processing"); processor(firstDoc, doLoad); } else { Scholar.debug("doing load"); doLoad(); } } init(); } catch (e) { Scholar.debug("processDocuments: " + e); exception(e); } } // Appears to look for links in a document containing a certain substring Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) { var urls = []; var addedURLs = []; var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); var aElement = aElements.iterateNext(); while (aElement) { var href = aElement.href; if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { urls.unshift(href); addedURLs[href] = true; } aElement = aElements.iterateNext(); } return urls; } // For now, we're going to skip the getLLsFromAddresses function (which gets // latitude and longitude pairs from a series of addresses, but requires the // big mess of Java code that is the Piggy Bank server) and the geoHelper // tools (which rely on getLLsFromAddresses) since these are probably not // essential components for Scholar and would take a great deal of effort to // implement. We can, however, always implement them later. /* * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS * Functions below this point are extensions to the utilities provided by * Piggy Bank. When used in external code, the repository will need to add * a function definition when exporting in Piggy Bank format. 
*/

/*
 * Converts a JavaScript date object to an ISO-style date (YYYY-MM-DD),
 * zero-padding the year to four digits and month/day to two.
 * Note: uses the local-time accessors (getFullYear/getMonth/getDate).
 */
Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) {
	var date = "";
	var year = jsDate.getFullYear().toString();
	var month = (jsDate.getMonth()+1).toString();
	var day = jsDate.getDate().toString();
	
	for(var i = year.length; i<4; i++) {
		date += "0";
	}
	date += year+"-";
	
	if(month.length == 1) {
		date += "0";
	}
	date += month+"-";
	
	if(day.length == 1) {
		date += "0";
	}
	date += day;
	
	return date;
}

/*
 * Gets a given node (assumes only one value): returns the first node matching
 * the XPath expression, or null if there is no match.
 */
Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
	return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
}

/*
 * Gets a given node as a string containing all child nodes
 *
 * NOTE(review): this file has been mangled by markup-stripping — everything
 * between the loop condition below ("i") and the "1)" that follows was lost
 * (presumably "i<elmts.length; ..." onward). The statements after the gap
 * appear to be the tail of a separate author-name helper (it swaps
 * "Last, First" into "First Last" via splitNames) whose comment header and
 * signature were also destroyed. Recover this span from the upstream
 * repository before using it; as written it is not valid JavaScript.
 */
Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
	var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
	var returnVar = "";
	for(var i=0; i /* <-- truncated here; the code below belongs to a different function */ 1) {
		author = splitNames[1]+' '+splitNames[0];
	}
	return author;
}

/*
 * Cleans whitespace off a string and replaces multiple spaces with one
 */
Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
	s = this.trimString(s);
	// Collapse runs of spaces/non-breaking spaces into a single space
	return s.replace(/[ \xA0]+/g, " ");
}

/*
 * Cleans any non-word non-parenthesis characters off the ends of a string
 */
Scholar.Ingester.Utilities.prototype.superCleanString = function(x) {
	// (redundant "var" redeclares the parameter; harmless in this scope)
	var x = x.replace(/^[^\w(]+/, "");
	return x.replace(/[^\w)]+$/, "");
}

/*
 * Eliminates HTML tags, replacing
s with /ns */
Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
	// NOTE(review): this first pattern looks mangled by the same
	// markup-stripping that damaged the rest of the file — it was
	// presumably /<br[^>]*>/gi (replace <br> tags with newlines).
	// As written, /]*>/gi matches runs of "]" followed by ">".
	// Confirm against the upstream source.
	x = x.replace(/]*>/gi, "\n");
	// Strip all remaining tags
	return x.replace(/<[^>]+>/g, "");
}

/*
 * Allows a user to select which items to scrape
 * Opens a modal XUL dialog; dataIn is the candidate item list shown to the
 * user and dataOut is filled in by the dialog before openDialog returns.
 */
Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
	// mozillazine made me do it! honest!
	var io = { dataIn:itemList, dataOut:null }
	var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	return io.dataOut;
}

/*
 * Grabs items based on URLs
 *
 * NOTE(review): a large span of this file was destroyed by markup-stripping,
 * starting at the loop condition below (presumably "j<inHere.length" onward)
 * and running through the opening of what — per the error message in the
 * catch block at the end of this file — is
 * Scholar.Ingester.Document.prototype._updateDatabase. Everything from the
 * "= 4) {" fragment onward belongs to that function, and its brace structure
 * is unbalanced here because the openers were in the lost span. Recover the
 * missing code from the upstream repository; do not attempt to run this
 * as-is.
 */
Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = new Object(); // Technically, associative arrays are objects
	
	// Require link to match this
	if(urlRe) {
		var urlRegexp = new RegExp();
		urlRegexp.compile(urlRe, "i");
	}
	// Do not allow text to match this
	if(rejectRe) {
		var rejectRegexp = new RegExp();
		rejectRegexp.compile(rejectRe, "i");
	}
	
	// Wrap a single node in an array
	if(!inHere.length) {
		inHere = new Array(inHere);
	}
	
	for(var j=0; j
	/* <-- truncated here; the remainder below is the interior of
	   Scholar.Ingester.Document._updateDatabase (see NOTE above) */
	= 4) {
		// Extract a year from the Dublin Core date: prefer the leading
		// YYYY of an ISO date, otherwise a trailing 4-digit run
		var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
		if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
			newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
		} else {
			var m;
			var yearRe = /[0-9]{4}$/;
			if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
				newItem.setField("year", m[0]);
			}
		}
	}
	}
	
	// Handle ISBNs/ISSNs
	if(this.model.data[uri][prefixDC + 'identifier']) {
		var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID);
		var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID);
		if(needISSN || needISBN) {
			// NOTE(review): "i" and "firstFour" are never declared here —
			// implicit globals (or declared in the lost span); and the loop
			// tests element [i] but stores element [0].substring(5), which
			// looks like a bug (should presumably be [i]) — confirm before
			// fixing, since the surrounding code is incomplete.
			for(i in this.model.data[uri][prefixDC + 'identifier']) {
				firstFour = this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4);
				if(needISSN && firstFour == 'ISSN') {
					newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
					break;
				}
				if(needISBN && firstFour == 'ISBN') {
					newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
					break;
				}
			}
		}
	}
	
	// Map remaining RDF predicates onto item fields (typeID-dependent)
	this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID);
	this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID);
	this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID);
	this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID);
	this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID);
	this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID);
	this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID);
	this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID);
	this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID);
	
	this.items.push(newItem);
	}
	} catch(ex) {
		Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
	}
}