- Make Scholar.Ingester.Utilities.loadDocument() attach an event handler to load rather than DOMContentLoaded to resolve an issue with the Ex Libris/Aleph scraper (VCU)
- When possible, corporate creators/contributors are categorized with their own RDF types (prefixDummy + "corporateCreator"/"corporateContributor")
- Remove extraneous debug code in extensions
This commit is contained in:
parent
6c89acbe0d
commit
7d3deb5b9f
2 changed files with 45 additions and 31 deletions
|
@ -164,7 +164,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
};
|
||||
var init = function() {
|
||||
Scholar.debug("init called");
|
||||
hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
|
||||
hiddenBrowser.addEventListener("load", onLoad, true);
|
||||
|
||||
if (firstDoc) {
|
||||
Scholar.debug("processing");
|
||||
|
@ -213,6 +213,10 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
|
|||
* Piggy Bank. When used in external code, the repository will need to add
|
||||
* a function definition when exporting in Piggy Bank format.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Converts a JavaScript date object to an ISO-style date
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) {
|
||||
var date = "";
|
||||
var year = jsDate.getFullYear().toString();
|
||||
|
@ -237,10 +241,28 @@ Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) {
|
|||
return date;
|
||||
}
|
||||
|
||||
/*
|
||||
* Gets a given node (assumes only one value)
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
|
||||
return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
|
||||
}
|
||||
|
||||
/*
|
||||
* Gets a given node as a string containing all child nodes
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
|
||||
var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
|
||||
var returnVar = "";
|
||||
for(var i=0; i<elmts.length; i++) {
|
||||
returnVar += elmts[i].nodeValue;
|
||||
}
|
||||
return returnVar;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleans extraneous punctuation off an author name
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
|
||||
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
||||
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
|
||||
|
@ -256,16 +278,25 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
|
|||
return author;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleans whitespace off a string and replaces multiple spaces with one
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
|
||||
s = this.trimString(s);
|
||||
return s.replace(/ +/g, " ");
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleans any non-word, non-parenthesis characters off the ends of a string
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.superCleanString = function(x) {
|
||||
var x = x.replace(/^[^\w(]+/, "");
|
||||
return x.replace(/[^\w)]+$/, "");
|
||||
}
|
||||
|
||||
/*
|
||||
* Eliminates HTML tags, replacing <br>s with newlines
|
||||
*/
|
||||
Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
|
||||
x = x.replace(/<br[^>]*>/gi, "\n");
|
||||
return x.replace(/<[^>]+>/g, "");
|
||||
|
@ -555,6 +586,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
|||
|
||||
Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
|
||||
|
||||
Scholar.debug(this.scraper.scraperJavaScript);
|
||||
|
||||
var scraperSandbox = this._sandbox;
|
||||
try {
|
||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||
|
@ -563,6 +596,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
|||
this._scrapePageComplete();
|
||||
}
|
||||
|
||||
Scholar.debug("scraping complete");
|
||||
|
||||
// If synchronous, call _scrapePageComplete();
|
||||
if(!this._waitForCompletion) {
|
||||
this._scrapePageComplete();
|
||||
|
@ -694,13 +729,13 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
|||
}
|
||||
if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||
for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
|
||||
newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateCreator'][i], 1);
|
||||
creatorIndex++;
|
||||
}
|
||||
}
|
||||
if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||
for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
|
||||
newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateContributor'][i], 2);
|
||||
creatorIndex++;
|
||||
}
|
||||
}
|
||||
|
|
33
scrapers.sql
33
scrapers.sql
|
@ -247,7 +247,6 @@ if(!elmts.length) {
|
|||
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
}
|
||||
utilities.debugPrint(elmts.length);
|
||||
if(elmts && elmts.length) {
|
||||
return true;
|
||||
}
|
||||
|
@ -333,7 +332,6 @@ utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null,
|
|||
data[prefixDummy + "series"].push(fieldContent);
|
||||
} else if(fieldCode == "DA") {
|
||||
var date = new Date(fieldContent.replace(".", ""));
|
||||
utilities.debugPrint(date.valueOf());
|
||||
if(isNaN(date.valueOf())) {
|
||||
data[prefixDC + "date"].push(fieldContent);
|
||||
} else {
|
||||
|
@ -540,7 +538,7 @@ for (var i = 0; i < elmts.length; i++) {
|
|||
rdfUri = prefixDC + ''contributor'';
|
||||
value = utilities.cleanAuthor(node.nodeValue);
|
||||
} else if(field == "corporate author") {
|
||||
rdfUri = prefixDC + ''creator'';
|
||||
rdfUri = prefixDummy + ''corporateCreator'';
|
||||
}
|
||||
if(rdfUri) {
|
||||
var insert = true;
|
||||
|
@ -807,7 +805,6 @@ if(m) {
|
|||
var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/;
|
||||
var m = bylineRegexp.exec(citationData);
|
||||
if(m) {
|
||||
utilities.debugPrint(m[1].substring(0, 3).toLowerCase());
|
||||
if(m[1].substring(0, 3).toLowerCase() == "by ") {
|
||||
m[1] = m[1].substring(3);
|
||||
}
|
||||
|
@ -835,7 +832,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|||
var uri = doc.location.href;
|
||||
|
||||
var newUri = uri.replace("&format=999", "&format=001");
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||
newDoc = newBrowser.contentDocument;
|
||||
|
@ -851,7 +847,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|||
for(var i=0; i<elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue);
|
||||
var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue;
|
||||
var value = utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver);
|
||||
var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
|
||||
|
||||
if(field != "FMT" && field != "LDR") {
|
||||
|
@ -868,9 +864,10 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|||
}
|
||||
}
|
||||
|
||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||
model = utilities.importMARCRecord(record, uri, model);
|
||||
done();
|
||||
}, function() {})
|
||||
}, function() {});
|
||||
|
||||
wait();');
|
||||
|
||||
|
@ -882,16 +879,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|||
|
||||
var uri = doc.location.href;
|
||||
var newUri = uri+''&fullmarc=true'';
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
var utilities.getNodeString = function(doc, contextNode, xpath, nsResolver) {
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
|
||||
var returnVar = "";
|
||||
for(var i=0; i<elmts.length; i++) {
|
||||
returnVar += elmts[i].nodeValue;
|
||||
}
|
||||
return returnVar;
|
||||
}
|
||||
|
||||
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||
newDoc = newBrowser.contentDocument;
|
||||
|
@ -906,8 +893,8 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|||
var record = new MARC_Record();
|
||||
for(var i=0; i<elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue);
|
||||
var value = utilities.getNodeString(doc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
|
||||
var field = utilities.superCleanString(utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue);
|
||||
var value = utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
|
||||
value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
|
||||
|
||||
if(field != "FMT" && field != "LDR") {
|
||||
|
@ -940,7 +927,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
|||
|
||||
var uri = doc.location.href;
|
||||
var newUri = uri.replace(/function=[A-Z]{7}/, "function=MARCSCR");
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||
newDoc = newBrowser.contentDocument;
|
||||
|
@ -992,8 +978,6 @@ if(uri.indexOf("authority_hits") < 0) {
|
|||
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
|
||||
}
|
||||
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
||||
var record = new MARC_Record();
|
||||
record.load(text, "binary");
|
||||
|
@ -1070,7 +1054,6 @@ var nsResolver = namespace ? function(prefix) {
|
|||
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver);
|
||||
for(i in elmts) {
|
||||
utilities.debugPrint(elmts[i].nodeValue);
|
||||
if(elmts[i].nodeValue == "\n\nViewing record\n") {
|
||||
return true;
|
||||
}
|
||||
|
@ -1090,7 +1073,6 @@ var uri = doc.location.href;
|
|||
var uriRegexp = /^(.*)(\/[0-9]+)$/;
|
||||
var m = uriRegexp.exec(uri);
|
||||
var newUri = m[1]+"/40";
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
|
||||
for(i in elmts) {
|
||||
|
@ -1154,7 +1136,6 @@ var nsResolver = namespace ? function(prefix) {
|
|||
|
||||
var uri = doc.location.href;
|
||||
var newUri = uri.replace("LabelDisplay", "MARCDisplay");
|
||||
utilities.debugPrint(newUri);
|
||||
|
||||
utilities.loadDocument(newUri, browser, function(newBrowser) {
|
||||
newDoc = newBrowser.contentDocument;
|
||||
|
@ -1206,7 +1187,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
|
|||
}
|
||||
|
||||
record.add_field(tag, ind1, ind2, content);
|
||||
utilities.debugPrint("tag:"+tag+" ind1:"+ind1+" ind2:"+ind2+" content:"+content);
|
||||
}
|
||||
|
||||
model = utilities.importMARCRecord(record, uri, model);
|
||||
|
@ -1304,7 +1284,6 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
|||
var xml = new XML(text);
|
||||
|
||||
for(var i=0; i<xml.PubmedArticle.length(); i++) {
|
||||
utilities.debugPrint("one article...");
|
||||
var citation = xml.PubmedArticle[i].MedlineCitation;
|
||||
|
||||
if(citation.PMID.length()) {
|
||||
|
|
Loading…
Reference in a new issue