// Scholar for Firefox Utilities
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL

/////////////////////////////////////////////////////////////////
//
// Scholar.Utilities
//
/////////////////////////////////////////////////////////////////
// Scholar.Utilities class, a set of methods to assist in data
// extraction. Some of the code here was stolen directly from the Piggy Bank
// project.

Scholar.Utilities = function () {};

// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
Scholar.Utilities.prototype.debugPrint = function(msg) {
	Scholar.debug(msg, 4);
};

// Trims a string, chopping off leading and trailing spaces, newlines,
// carriage returns, tabs and non-breaking spaces (character code 160).
// Returns the trimmed string; whitespace inside the string is untouched.
Scholar.Utilities.prototype.trimString = function(s) {
	var spaceChars = " \n\r\t" + String.fromCharCode(160) /* nbsp */;

	// skip over leading whitespace
	var start = 0;
	while (start < s.length && spaceChars.indexOf(s.charAt(start)) >= 0) {
		start++;
	}

	// back up over trailing whitespace, never crossing the start index
	var end = s.length;
	while (end > start && spaceChars.indexOf(s.charAt(end - 1)) >= 0) {
		end--;
	}

	return s.substring(start, end);
};

/*
 * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
 * Functions below this point are extensions to the utilities provided by
 * Piggy Bank. When used in external code, the repository will need to add
 * a function definition when exporting in Piggy Bank format.
 */

/*
 * Converts a JavaScript date object to an ISO-style date (YYYY-MM-DD)
 *
 * The year, month and day are read with the local-time getters of jsDate.
 * The year is left-padded with zeros to four digits; month and day are
 * left-padded to two digits.
 */
Scholar.Utilities.prototype.dateToISO = function(jsDate) {
	var date = "";
	var year = jsDate.getFullYear().toString();
	var month = (jsDate.getMonth()+1).toString();	// getMonth() is zero-based
	var day = jsDate.getDate().toString();

	for(var i = year.length; i<4; i++) {
		date += "0";
	}
	date += year+"-";

	if(month.length == 1) {
		date += "0";
	}
	date += month+"-";

	if(day.length == 1) {
		date += "0";
	}
	date += day;

	return date;
};

/*
 * Cleans extraneous punctuation off an author name
 *
 * Strips whitespace, periods, commas, slashes, square brackets and colons
 * from both ends, collapses every run of spaces into a single space, appends
 * a period when the name ends in a space followed by one character (an
 * initial), and finally turns "Last, First" into "First Last". Only the
 * first two ", "-separated parts are used for the swap.
 */
Scholar.Utilities.prototype.cleanAuthor = function(author) {
	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
	// global flag so that every run of spaces is collapsed, not just the first
	author = author.replace(/ +/g, ' ');
	// Add period for initials
	if(author.substring(author.length-2, author.length-1) == " ") {
		author += ".";
	}
	var splitNames = author.split(', ');
	if(splitNames.length > 1) {
		author = splitNames[1]+' '+splitNames[0];
	}
	return author;
};

/*
 * Cleans whitespace off a string and replaces multiple spaces with one
 *
 * Runs of spaces and non-breaking spaces become a single space; the result
 * is then passed through trimString.
 */
Scholar.Utilities.prototype.cleanString = function(s) {
	s = s.replace(/[ \xA0]+/g, " ");
	return this.trimString(s);
};

/*
 * Cleans any non-word non-parenthesis characters off the ends of a string
 *
 * An opening parenthesis is kept at the start and a closing parenthesis is
 * kept at the end.
 */
Scholar.Utilities.prototype.superCleanString = function(x) {
	x = x.replace(/^[^\w(]+/, "");
	return x.replace(/[^\w)]+$/, "");
};

/*
 * Eliminates HTML tags, replacing <br>s with newlines
 */
Scholar.Utilities.prototype.cleanTags = function(x) {
	// line breaks first, so that they survive the general tag removal below
	x = x.replace(/<br[^>]*>/gi, "\n");
	return x.replace(/<[^>]+>/g, "");
};

/*
 * END SCHOLAR FOR FIREFOX EXTENSIONS
 */

/////////////////////////////////////////////////////////////////
//
// Scholar.Utilities.Ingester
//
/////////////////////////////////////////////////////////////////
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents.

// Stores the three constructor arguments on the new instance as window,
// proxiedURL and isHidden
Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) {
	this.window = myWindow;
	this.proxiedURL = proxiedURL;
	this.isHidden = isHidden;
};

Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();

// Takes an XPath query and returns the results
//
// Evaluates xpath against doc with parentNode as the context node and
// nsResolver as the namespace resolver, and returns every node produced by
// the result iterator as an array (empty when nothing matches)
Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
	var elmts = [];

	var iterator = doc.evaluate(xpath, parentNode, nsResolver,
		Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var elmt = iterator.iterateNext();
	while (elmt) {
		elmts.push(elmt);
		elmt = iterator.iterateNext();
	}
	return elmts;
};

// Looks for links in a document containing a certain substring (kind
// of like getItemArray, only with NO REGEXP FUNCTIONALITY)
//
// Returns an array of the distinct href values of all "a" elements in doc
// that contain substring. Each newly found URL is added to the front of the
// array, so the result is in reverse order of first appearance.
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
	var urls = [];
	// set of hrefs already collected (keys are the href strings)
	var addedURLs = {};

	var aElements = doc.evaluate("//a", doc, null,
		Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	var aElement = aElements.iterateNext();
	while (aElement) {
		var href = aElement.href;
		// skip elements whose href is not a plain string instead of throwing
		if (typeof href === "string"
				&& href.indexOf(substring) >= 0
				&& addedURLs[href] !== true) {
			urls.unshift(href);
			addedURLs[href] = true;
		}
		aElement = aElements.iterateNext();
	}
	return urls;
};

// For now, we're going to skip the getLLsFromAddresses function (which gets
// latitude and longitude pairs from a series of addresses, but requires the
// big mess of Java code that is the Piggy Bank server) and the geoHelper
// tools (which rely on
getLLsFromAddresses) since these are probably not // essential components for Scholar and would take a great deal of effort to // implement. We can, however, always implement them later. /* * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS */ /* * Gets a given node (assumes only one value) */ Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); } /* * Gets a given node as a string containing all child nodes */ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); var returnVar = ""; for(var i=0; i