From 7d3deb5b9ffda685851ea2210666a34d25f88723 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 21 Jun 2006 01:41:07 +0000 Subject: [PATCH] - Make Scholar.Ingester.Utilities.loadDocument() attach an event handler to load rather than DOMContentLoaded to resolve an issue with the Ex Libris/Aleph scraper (VCU) - When possible, corporate creators/contributors are categorized with their own RDF types (prefixDummy + "corporateCreator/corporateContributor) - Remove extraneous debug code in extensions --- .../content/scholar/xpcom/ingester.js | 41 +++++++++++++++++-- scrapers.sql | 35 ++++------------ 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 35cc748df7..4c22de5108 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -164,7 +164,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD }; var init = function() { Scholar.debug("init called"); - hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true); + hiddenBrowser.addEventListener("load", onLoad, true); if (firstDoc) { Scholar.debug("processing"); @@ -213,6 +213,10 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su * Piggy Bank. When used in external code, the repository will need to add * a function definition when exporting in Piggy Bank format. */ + +/* + * Converts a JavaScript date object to an ISO-style date + */ Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { var date = ""; var year = jsDate.getFullYear().toString(); @@ -237,10 +241,28 @@ Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { return date; } +/* + * Gets a given node (assumes only one value) + */ Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); } +/* + * Gets a given node as a string containing all child nodes + */ +Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; is with /ns + */ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) { x = x.replace(/]*>/gi, "\n"); return x.replace(/<[^>]+>/g, ""); @@ -555,6 +586,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { Scholar.debug("Scraping "+this.browser.contentDocument.location.href); + Scholar.debug(this.scraper.scraperJavaScript); + var scraperSandbox = this._sandbox; try { Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox); @@ -563,6 +596,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { this._scrapePageComplete(); } + Scholar.debug("scraping complete"); + // If synchronous, call _scrapePageComplete(); if(!this._waitForCompletion) { this._scrapePageComplete(); @@ -694,13 +729,13 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { } if(this.model.data[uri][prefixDummy + 'corporateCreator']) { for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) { - newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1); + newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateCreator'][i], 1); creatorIndex++; } } if(this.model.data[uri][prefixDummy + 'corporateContributor']) { for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) { - newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2); + newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateContributor'][i], 2); creatorIndex++; } } diff --git a/scrapers.sql b/scrapers.sql index 271f1b5efb..806c732aa3 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -247,7 +247,6 @@ if(!elmts.length) { var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); } -utilities.debugPrint(elmts.length); if(elmts && elmts.length) { return true; } @@ -333,7 +332,6 @@ utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null, data[prefixDummy + "series"].push(fieldContent); } else if(fieldCode == "DA") { var date = new Date(fieldContent.replace(".", "")); - utilities.debugPrint(date.valueOf()); if(isNaN(date.valueOf())) { data[prefixDC + "date"].push(fieldContent); } else { @@ -540,7 +538,7 @@ for (var i = 0; i < elmts.length; i++) { rdfUri = prefixDC + ''contributor''; value = utilities.cleanAuthor(node.nodeValue); } else if(field == "corporate author") { - rdfUri = prefixDC + ''creator''; + rdfUri = prefixDummy + ''corporateCreator''; } if(rdfUri) { var insert = true; @@ -807,7 +805,6 @@ if(m) { var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); if(m) { - utilities.debugPrint(m[1].substring(0, 3).toLowerCase()); if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } @@ -835,7 +832,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace("&format=999", "&format=001"); -utilities.debugPrint(newUri); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -847,11 +843,11 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); + var record = new MARC_Record(); for(var i=0; i