diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 197cf98811..0e5e591604 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -2,7 +2,7 @@ // Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) // This code is licensed according to the GPL -Scholar.Ingester = new function() {} +Scholar.Ingester = new Object(); Scholar.Ingester.createHiddenBrowser = function(myWindow) { // Create a hidden browser @@ -165,545 +165,6 @@ Scholar.Ingester.Model.prototype.addTag = function() {} Scholar.Ingester.Model.prototype.getRepository = function() {} Scholar.Ingester.Model.prototype.detachRepository = function() {} -///////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Utilities -// -///////////////////////////////////////////////////////////////// -// Scholar.Ingester.Utilities class, a set of methods to assist in data -// extraction. Most code here was stolen directly from the Piggy Bank project. -Scholar.Ingester.Utilities = function(myWindow, proxiedURL) { - this.window = myWindow; - this.proxiedURL = proxiedURL; -} - -// Adapter for Piggy Bank function to print debug messages; log level is -// fixed at 4 (could change this) -Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) { - Scholar.debug(msg, 4); -} - -// Appears to trim a string, chopping of newlines/spacing -Scholar.Ingester.Utilities.prototype.trimString = function(s) { - var i = 0; - var spaceChars = " \n\r\t" + String.fromCharCode(160) /*   */; - while (i < s.length) { - var c = s.charAt(i); - if (spaceChars.indexOf(c) < 0) { - break; - } - i++; - } - - s = s.substring(i); - - i = s.length; - while (i > 0) { - var c = s.charAt(i - 1); - if (spaceChars.indexOf(c) < 0) { - break; - } - i--; - } - - return s.substring(0, i); -} - -// Takes an XPath query and returns the results -Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { - var elmts = []; - - var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var elmt = iterator.iterateNext(); - var i = 0; - while (elmt) { - elmts[i++] = elmt; - elmt = iterator.iterateNext(); - } - return elmts; -} - -// Loads a single document for a scraper, running succeeded() on success or -// failed() on failure -Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) { - Scholar.debug("loadDocument called"); - this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); -} - -// Downloads and processes documents with processor() -// browser - a browser object -// firstDoc - the first document to process with the processor (if null, -// first document is processed without processor) -// urls - an array of URLs to load -// processor - a function to execute to process each document -// done - a function to execute when all document processing is complete -// exception - a function to execute if an exception occurs (exceptions are -// also logged in the Scholar for Firefox log) -Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { - var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); - var myWindow = this.window; - var prevUrl, url; - Scholar.debug("processDocuments called"); - - try { - if (urls.length == 0) { - if(firstDoc) { - processor(firstDoc, done); - } else { - done(); - } - return; - } - - var urlIndex = -1; - var doLoad = function() { - urlIndex++; - if (urlIndex < urls.length) { - url = urls[urlIndex]; - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - try { - Scholar.debug("loading "+url); - hiddenBrowser.loadURI(url); - } catch (e) { - Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2); - exception(e); - } - } else { - hiddenBrowser.removeEventListener("load", onLoad, true); - Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); - done(); - } - }; - var onLoad = function() { - Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); - if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times - prevUrl = hiddenBrowser.contentDocument.location.href; - try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); - } catch (e) { - Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); - exception(e); - } - doLoad(); - } - }; - var init = function() { - Scholar.debug("init called"); - hiddenBrowser.addEventListener("load", onLoad, true); - - if (firstDoc) { - Scholar.debug("processing"); - processor(firstDoc, doLoad); - } else { - Scholar.debug("doing load"); - doLoad(); - } - } - - init(); - } catch (e) { - Scholar.debug("processDocuments: " + e); - exception(e); - } -} - -// Appears to look for links in a document containing a certain substring -Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) { - var urls = []; - var addedURLs = []; - - var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var aElement = aElements.iterateNext(); - while (aElement) { - var href = aElement.href; - if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { - urls.unshift(href); - addedURLs[href] = true; - } - aElement = aElements.iterateNext(); - } - return urls; -} - -// For now, we're going to skip the getLLsFromAddresses function (which gets -// latitude and longitude pairs from a series of addresses, but requires the -// big mess of Java code that is the Piggy Bank server) and the geoHelper -// tools (which rely on getLLsFromAddresses) since these are probably not -// essential components for Scholar and would take a great deal of effort to -// implement. We can, however, always implement them later. - -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - * Functions below this point are extensions to the utilities provided by - * Piggy Bank. When used in external code, the repository will need to add - * a function definition when exporting in Piggy Bank format. - */ - -/* - * Converts a JavaScript date object to an ISO-style date - */ -Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { - var date = ""; - var year = jsDate.getFullYear().toString(); - var month = (jsDate.getMonth()+1).toString(); - var day = jsDate.getDate().toString(); - - for(var i = year.length; i<4; i++) { - date += "0"; - } - date += year+"-"; - - if(month.length == 1) { - date += "0"; - } - date += month+"-"; - - if(day.length == 1) { - date += "0"; - } - date += day; - - return date; -} - -/* - * Gets a given node (assumes only one value) - */ -Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); -} - -/* - * Gets a given node as a string containing all child nodes - */ -Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { - var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); - var returnVar = ""; - for(var i=0; i 1) { - author = splitNames[1]+' '+splitNames[0]; - } - return author; -} - -/* - * Cleans whitespace off a string and replaces multiple spaces with one - */ -Scholar.Ingester.Utilities.prototype.cleanString = function(s) { - s = s.replace(/[ \xA0]+/g, " "); - return this.trimString(s); -} - -/* - * Cleans any non-world non-parenthesis characters off the ends of a string - */ -Scholar.Ingester.Utilities.prototype.superCleanString = function(x) { - var x = x.replace(/^[^\w(]+/, ""); - return x.replace(/[^\w)]+$/, ""); -} - -/* - * Eliminates HTML tags, replacing
s with /ns - */ -Scholar.Ingester.Utilities.prototype.cleanTags = function(x) { - x = x.replace(/]*>/gi, "\n"); - return x.replace(/<[^>]+>/g, ""); -} - -/* - * Allows a user to select which items to scrape - */ -Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) { - // mozillazine made me do it! honest! - var io = { dataIn:itemList, dataOut:null } - var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", - "_blank","chrome,modal,centerscreen,resizable=yes", io); - return io.dataOut; -} - -/* - * Grabs items based on URLs - */ -Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) { - var availableItems = new Object(); // Technically, associative arrays are objects - - // Require link to match this - if(urlRe) { - var urlRegexp = new RegExp(); - urlRegexp.compile(urlRe, "i"); - } - // Do not allow text to match this - if(rejectRe) { - var rejectRegexp = new RegExp(); - rejectRegexp.compile(rejectRe, "i"); - } - - if(!inHere.length) { - inHere = new Array(inHere); - } - - for(var j=0; j 0) { + var c = s.charAt(i - 1); + if (spaceChars.indexOf(c) < 0) { + break; + } + i--; + } + + return s.substring(0, i); +} + +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + * Functions below this point are extensions to the utilities provided by + * Piggy Bank. When used in external code, the repository will need to add + * a function definition when exporting in Piggy Bank format. + */ + +/* + * Converts a JavaScript date object to an ISO-style date + */ +Scholar.Utilities.prototype.dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; +} + +/* + * Cleans extraneous punctuation off an author name + */ +Scholar.Utilities.prototype.cleanAuthor = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); + author = author.replace(/ +/, ' '); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split(', '); + if(splitNames.length > 1) { + author = splitNames[1]+' '+splitNames[0]; + } + return author; +} + +/* + * Cleans whitespace off a string and replaces multiple spaces with one + */ +Scholar.Utilities.prototype.cleanString = function(s) { + s = s.replace(/[ \xA0]+/g, " "); + return this.trimString(s); +} + +/* + * Cleans any non-word non-parenthesis characters off the ends of a string + */ +Scholar.Utilities.prototype.superCleanString = function(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +/* + * Eliminates HTML tags, replacing
s with /ns + */ +Scholar.Utilities.prototype.cleanTags = function(x) { + x = x.replace(/]*>/gi, "\n"); + return x.replace(/<[^>]+>/g, ""); +} + +// These functions are for use by importMARCRecord. They're private, because, +// while they are useful, it's also nice if as many of our scrapers as possible +// are PiggyBank compatible, and if our scrapers used functions, that would +// break compatibility +Scholar.Utilities.prototype._MARCCleanString = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + return author.replace(/ +/, ' '); +} + +Scholar.Utilities.prototype._MARCCleanNumber = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + var regexp = /^[^ ]*/; + var m = regexp.exec(author); + if(m) { + return m[0]; + } +} +Scholar.Utilities.prototype._MARCPullYear = function(text) { + var pullRe = /[0-9]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { + if(!part) { + part = 'a'; + } + var field = record.get_field_subfields(fieldNo); + Scholar.debug('Found '+field.length+' matches for '+fieldNo+part); + if(field) { + for(i in field) { + var value; + for(var j=0; j= 0 && !(addedURLs[href])) { + urls.unshift(href); + addedURLs[href] = true; + } + aElement = aElements.iterateNext(); + } + return urls; +} + +// For now, we're going to skip the getLLsFromAddresses function (which gets +// latitude and longitude pairs from a series of addresses, but requires the +// big mess of Java code that is the Piggy Bank server) and the geoHelper +// tools (which rely on getLLsFromAddresses) since these are probably not +// essential components for Scholar and would take a great deal of effort to +// implement. We can, however, always implement them later. + +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + */ + +/* + * Gets a given node (assumes only one value) + */ +Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); +} + +/* + * Gets a given node as a string containing all child nodes + */ +Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; i