diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
index 3907562a8d..1896891fa5 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -207,17 +207,41 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
// essential components for Scholar and would take a great deal of effort to
// implement. We can, however, always implement them later.
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- return author.replace(/ +/, ' ');
+/*
+ * BEGIN FIREFOX SCHOLAR EXTENSIONS
+ * Functions below this point are extensions to the utilities provided by
+ * Piggy Bank. When used in external code, the repository will need to add
+ * a function definition when exporting in Piggy Bank format.
+ */
+/*
+ * Formats a JavaScript Date as a "YYYY-MM-DD" string built from the date's
+ * local-time year, month and day. The year is left-padded with zeros to four
+ * characters; the month and day are left-padded to two.
+ */
+Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) {
+	// Prepends zeros until the text is at least the requested width
+	var zeroPad = function(text, width) {
+		while(text.length < width) {
+			text = "0" + text;
+		}
+		return text;
+	};
+
+	var parts = [
+		zeroPad(jsDate.getFullYear().toString(), 4),
+		// getMonth() is zero-based, so shift it to the 1-12 range
+		zeroPad((jsDate.getMonth()+1).toString(), 2),
+		zeroPad(jsDate.getDate().toString(), 2)
+	];
+
+	return parts.join("-");
}
-Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
+/*
+ * Evaluates an XPath expression against contextNode within doc and returns
+ * the first node yielded by the result iterator.
+ */
+Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
+	var resultType = Components.interfaces.nsIDOMXPathResult.ANY_TYPE;
+	var matches = doc.evaluate(xpath, contextNode, nsResolver, resultType, null);
+	return matches.iterateNext();
+}
+
+Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
author = author.replace(/ +/, ' ');
@@ -232,6 +256,31 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
return author;
}
+/*
+ * Trims a string with trimString, then collapses every run of consecutive
+ * spaces into a single space.
+ */
+Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
+	var trimmed = this.trimString(s);
+	var spaceRuns = / +/g;
+	return trimmed.replace(spaceRuns, " ");
+}
+
+/*
+ * Strips leading characters that are neither word characters nor "(" and
+ * trailing characters that are neither word characters nor ")".
+ */
+Scholar.Ingester.Utilities.prototype.superCleanString = function(x) {
+	var leadingJunk = /^[^\w(]+/;
+	var trailingJunk = /[^\w)]+$/;
+	return x.replace(leadingJunk, "").replace(trailingJunk, "");
+}
+
+/*
+ * Converts an HTML fragment to plain text: every <br> tag (with or without
+ * attributes, in any letter case) becomes a newline, and all remaining tags
+ * are removed.
+ */
+Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
+	// Line breaks are converted first so they survive the generic tag removal
+	x = x.replace(/<br[^>]*>/gi, "\n");
+	return x.replace(/<[^>]+>/g, "");
+}
+
+// These functions are for use by importMARCRecord. They're private because,
+// while they are useful, it's also nice if as many of our scrapers as possible
+// are Piggy Bank compatible, and if our scrapers called these functions
+// directly, that would break compatibility.
+/*
+ * Cleans a MARC field value: strips leading and trailing whitespace and
+ * punctuation (. , / [ ] :), then collapses runs of spaces to a single space.
+ */
+Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	// Global flag so every run of spaces is collapsed, not only the first one
+	return author.replace(/ +/g, ' ');
+}
+
Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
@@ -283,11 +332,11 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
// Extract ISSNs
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
- model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
@@ -295,7 +344,7 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
// in the person subject field as the first entry
var field = record.get_field_subfields('600');
if(field[0]) {
- model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
+ model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
}
}
// Extract title
@@ -312,6 +361,9 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
}
+/*
+ * END FIREFOX SCHOLAR EXTENSIONS
+ */
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
// accessed outside the sandbox, and even if it could, it wouldn't let scripts
@@ -573,6 +625,19 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox.done = function(){ me._scrapePageComplete(); };
}
+/*
+ * Copies the first value stored under an RDF predicate into an item field.
+ *
+ * rdfUri - predicate URI to look up in this.model.data[uri]
+ * field  - name of the item field to set
+ * uri    - subject URI whose statements are being imported
+ * item   - item that receives the value through setField
+ * typeID - item type ID used to check that the field applies
+ *
+ * Nothing is set when no values exist for the predicate, when the field name
+ * has no ID, or when the field is not valid for the item type. The latter two
+ * cases are reported through Scholar.debug.
+ */
+Scholar.Ingester.Document.prototype._associateRDF = function(rdfUri, field, uri, item, typeID) {
+	var values = this.model.data[uri][rdfUri];
+	// No data was scraped for this predicate, so there is nothing to discard
+	if(!values) {
+		return;
+	}
+
+	var fieldID = Scholar.ItemFields.getID(field);
+	if(!fieldID) {
+		Scholar.debug("discarded scraper " + field + " data: no field in database");
+		return;
+	}
+
+	if(!Scholar.ItemFields.isValidForType(fieldID, typeID)) {
+		Scholar.debug("discarded scraper " + field + " data: not valid for item type "+typeID);
+		return;
+	}
+
+	item.setField(field, values[0]);
+}
+
/*
* Add data ingested using RDF to database
* (Ontologies are hard-coded until we have a real way of dealing with them)
@@ -585,17 +650,27 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
+ var typeToTypeID = new Object();
+ typeToTypeID[prefixDummy + 'book'] = 1;
+ typeToTypeID[prefixDummy + 'journal'] = 2;
+ typeToTypeID[prefixDummy + 'newspaper'] = 2;
+
try {
for(var uri in this.model.data) {
- if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
- var newItem = Scholar.Items.getNewItemByType(2);
- } else {
- var newItem = Scholar.Items.getNewItemByType(1);
+ var typeID = typeToTypeID[this.model.data[uri][prefixRDF + 'type']];
+ if(!typeID) {
+ var typeID = 1;
}
+
+ var newItem = Scholar.Items.getNewItemByType(typeID);
+
+ // Handle source and title
newItem.setField("source", uri);
if(this.model.data[uri][prefixDC + 'title']) {
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
}
+
+ // Handle creators and contributors
var creatorIndex = 0;
if(this.model.data[uri][prefixDC + 'creator']) {
for(i in this.model.data[uri][prefixDC + 'creator']) {
@@ -619,54 +694,45 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
creatorIndex++;
}
}
- if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
- if(this.model.data[uri][prefixDummy + 'publication']) {
- newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'volume']) {
- newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'number']) {
- newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'pages']) {
- newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
- for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
- newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
- break;
- }
- }
- }
- } else {
- if(this.model.data[uri][prefixDC + 'publisher']) {
- newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
- }
+
+ // Handle years, extracting from date if necessary
+ if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
if(this.model.data[uri][prefixDC + 'year']) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
}
- if(this.model.data[uri][prefixDC + 'hasVersion']) {
- newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'series']) {
- newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'place']) {
- newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
+ }
+
+ // Handle ISBNs/ISSNs. Identifiers carry a four-character type prefix
+ // ("ISSN" or "ISBN"); the value itself starts at offset 5.
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID);
+ var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID);
+ if(needISSN || needISBN) {
for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
+ // Declared with var so the prefix is not leaked as an implicit global
+ var firstFour = this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4);
+ // Store the identifier that actually matched (index i), not the first one
+ if(needISSN && firstFour == 'ISSN') {
+ newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5));
+ break;
+ }
+ if(needISBN && firstFour == 'ISBN') {
- newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+ newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5));
break;
}
}
}
}
+
+ this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID);
+ this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID);
+ this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID);
+ this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID);
+ this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID);
+ this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID);
+ this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID);
+ this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID);
+ this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID);
+
this.items.push(newItem);
}
} catch(ex) {
diff --git a/scrapers.sql b/scrapers.sql
index ae14f1e3aa..3547026705 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,9 +1,9 @@
--- 5
+-- 6
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-12 20:00:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-18 11:19:00'));
-REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-12 20:00:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@@ -13,50 +13,15 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
-}
-
-var cleanString = function(s) {
- s = utilities.trimString(s);
- return s.replace(/ +/g, " ");
-}
-
-var dateToISO = function(jsDate) {
- var date = "";
- var year = jsDate.getFullYear().toString();
- var month = (jsDate.getMonth()+1).toString();
- var day = jsDate.getDate().toString();
-
- for(var i = year.length; i<4; i++) {
- date += "0";
- }
- date += year+"-";
-
- if(month.length == 1) {
- date += "0";
- }
- date += month+"-";
-
- if(day.length == 1) {
- date += "0";
- }
- date += day;
-
- return date;
-}
-
var uri = doc.location.href;
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
-
// Retrieve authors
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
- model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
+ model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
}
// Retrieve data from "Product Details" box
@@ -64,14 +29,14 @@ var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
- var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
- if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
- var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
+ var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
+ if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
+ var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
if(attribute == "Publisher:") {
if(value.lastIndexOf("(") != -1) {
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
jsDate = new Date(jsDate);
- var date = dateToISO(jsDate);
+ var date = utilities.dateToISO(jsDate);
value = value.substring(0, value.lastIndexOf("(")-1);
}
@@ -95,13 +60,14 @@ for (var i = 0; i < elmts.length; i++) {
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
+var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
title = title.substring(0, title.lastIndexOf("(")-1);
}
-model.addStatement(uri, prefixDC + ''title'', title);');
+model.addStatement(uri, prefixDC + ''title'', title);
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);');
-REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-12 20:00:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
+REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
return true;
}
@@ -139,22 +105,6 @@ if(rMatch) {
var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0'';
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
-
-function cleanAuthor(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
- author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
- // Add period for initials
- if(author.substring(author.length-2, author.length-1) == " ") {
- author += ".";
- }
- var splitNames = author.split('', '');
- if(splitNames.length > 1) {
- author = splitNames[1]+'' ''+splitNames[0];
- }
- return author;
-}
-
utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) {
var lines = text.split(''\n'');
for(var i=0;i 1) {
- author = splitNames[1]+'' ''+splitNames[0];
- }
- return author;
-}
-
var uri = doc.location.href;
var data = new Object();
@@ -557,23 +459,29 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
try {
- var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+ var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
if(!node) {
- var node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+ var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
}
if(node) {
- var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
+ var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
field = field.toLowerCase();
- var value = stringTrimmer(node.nodeValue);
+ var value = utilities.superCleanString(node.nodeValue);
var rdfUri = null;
if(field == "publisher") {
rdfUri = prefixDC + ''publisher'';
} else if(field == "pub date") {
- rdfUri = prefixDC + ''date'';
- value = getAnyNumber(value);
+ rdfUri = prefixDC + ''year'';
+
+ var re = /[0-9]+/;
+ var m = re.exec(value);
+ value = m[0];
} else if(field == "isbn") {
rdfUri = prefixDC + ''identifier'';
- value = ''ISBN ''+getISBN(value);
+
+ var re = /^[0-9](?:[0-9X]+)/;
+ var m = re.exec(value);
+ value = m[0];
} else if(field == "title") {
rdfUri = prefixDC + ''title'';
var titleParts = value.split(" / ");
@@ -584,10 +492,10 @@ for (var i = 0; i < elmts.length; i++) {
value = pubParts[0];
} else if(field == "personal author") {
rdfUri = prefixDC + ''creator'';
- value = cleanAuthor(node.nodeValue);
+ value = utilities.cleanAuthor(node.nodeValue);
} else if(field == "added author") {
rdfUri = prefixDC + ''contributor'';
- value = cleanAuthor(node.nodeValue);
+ value = utilities.cleanAuthor(node.nodeValue);
} else if(field == "corporate author") {
rdfUri = prefixDC + ''creator'';
}
@@ -611,10 +519,12 @@ for (var i = 0; i < elmts.length; i++) {
}
} catch (e) {}
-}
+}
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
');
-REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-12 20:00:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
+REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
@@ -625,37 +535,6 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
-}
-
-function stringTrimmer(x) {
- var x = x.replace(/^[^\w(]+/, "");
- return x.replace(/[^\w)]+$/, "");
-}
-
-function getPageRange(x) {
- var re = /[0-9\-]+/;
- var m = re.exec(x);
- if(m) {
- return m[0];
- }
-}
-
-function cleanAuthor(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
- author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
- // Add period for initials
- if(author.substring(author.length-2, author.length-1) == " ") {
- author += ".";
- }
- var splitNames = author.split('', '');
- if(splitNames.length > 1) {
- author = splitNames[1]+'' ''+splitNames[0];
- }
- return author;
-}
-
var uri = doc.location.href;
var data = new Object();
@@ -685,7 +564,7 @@ for (var i = 0; i < elmts.length; i++) {
var authorElmt = authorElmts[j];
author += authorElmt.nodeValue;
}
- model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true);
+ model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true);
}
// Other info
@@ -693,23 +572,24 @@ var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
- var field = stringTrimmer(getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
+ var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
if(field == "publication title") {
- var publication = getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
+ var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
if(publication.nodeValue) {
- model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication.nodeValue), true);
+ model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true);
}
- var place = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
if(place.nodeValue) {
- model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place.nodeValue), true);
+ model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true);
}
- var date = getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
+ var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
if(date.nodeValue) {
- model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date.nodeValue), true);
+ var jsDate = new Date(utilities.superCleanString(date.nodeValue));
+ model.addStatement(uri, prefixDC + ''date'', utilities.dateToISO(jsDate), true);
}
- var moreInfo = getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
+ var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
if(moreInfo.nodeValue) {
- moreInfo = stringTrimmer(moreInfo.nodeValue);
+ moreInfo = utilities.superCleanString(moreInfo.nodeValue);
var parts = moreInfo.split(";\xA0");
var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/
@@ -718,34 +598,38 @@ for (var i = 0; i < elmts.length; i++) {
var m = issueRegexp.exec(issueInfo[j]);
var info = m[1].toLowerCase();
if(info == "vol") {
- model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true);
+ model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true);
} else if(info == "iss" || info == "no") {
- model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true);
+ model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true);
}
}
- if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
- var pages = getPageRange(parts[1]);
- if(pages) {
- model.addStatement(uri, prefixDummy + ''pages'', pages, true);
+ if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
+ var re = /[0-9\-]+/;
+ var m = re.exec(parts[1]);
+
+ if(m) {
+ model.addStatement(uri, prefixDummy + ''pages'', m[0], true);
}
}
}
} else if(field == "source type") {
- var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
if(value.nodeValue) {
- value = stringTrimmer(value.nodeValue).toLowerCase();
+ value = utilities.superCleanString(value.nodeValue).toLowerCase();
- if(value == "newspaper" || value == "periodical") {
+ if(value == "periodical") {
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+ } else if(value == "newspaper") {
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
} else {
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
}
}
} else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
- var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
if(value) {
var type;
- value = stringTrimmer(value.nodeValue);
+ value = utilities.superCleanString(value.nodeValue);
if(value.length == 10 || value.length == 13) {
type = "ISBN";
} else if(value.length == 8) {
@@ -758,7 +642,7 @@ for (var i = 0; i < elmts.length; i++) {
}
}');
-REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-12 20:00:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
+REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-18 11:19:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
'if(doc.title.substring(0, 8) == "Article ") {
return true;
}
@@ -773,20 +657,6 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-function cleanAuthor(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
- author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
- // Add period for initials
- if(author.substring(author.length-2, author.length-1) == " ") {
- author += ".";
- }
- var splitNames = author.split('', '');
- if(splitNames.length > 1) {
- author = splitNames[1]+'' ''+splitNames[0];
- }
- return author;
-}
-
var uri = doc.location.href;
var xpath = ''/html/body//comment()'';
@@ -837,22 +707,17 @@ for (var i = 0; i < elmts.length; i++) {
model.addStatement(uri, prefixDC + "date", date.substring(1), false);
}
} else if(field == "author") {
- model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false);
+ model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), false);
}
}
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
-REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-12 20:00:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
+REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-18 10:13:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
-function clearTags(x) {
- x = x.replace(/<br[^>]*>/gi, "\n");
- return x.replace(/<[^>]+>/g, "");
-}
-
var uri = doc.location.href;
var citationDataDiv;
@@ -868,10 +733,11 @@ centerElements = citationDataDiv.getElementsByTagName("center");
var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi);
model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true);
-var dateRegexp = /<br[^>]*><b>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/;
+var dateRegexp = /<br[^>]*>(?:<b>)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/;
var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML);
if(m) {
- model.addStatement(uri, prefixDC + "date", m[1]+" "+m[2], true);
+ var jsDate = new Date(m[1]+" "+m[2]);
+ model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true);
} else {
var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi);
model.addStatement(uri, prefixDC + "date", elementParts[1], true);
@@ -887,12 +753,12 @@ if(cutIndex > 0) {
citationData = citationDataDiv.innerHTML;
}
-citationData = clearTags(citationData);
+citationData = utilities.cleanTags(citationData);
var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/;
var m = headlineRegexp.exec(citationData);
if(m) {
- model.addStatement(uri, prefixDC + "title", clearTags(m[1]), true);
+ model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true);
}
var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/;
@@ -903,6 +769,9 @@ if(m) {
m[1] = m[1].substring(3);
}
model.addStatement(uri, prefixDC + "creator", m[1], true);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false);
+} else {
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
}
var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
@@ -912,13 +781,9 @@ if(m) {
for(i in authors) {
model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true);
}
-}
+}');
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
-
-utilities.debugPrint(citationData);');
-
-REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-12 20:00:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
+REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-18 11:19:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
@@ -929,11 +794,6 @@ var uri = doc.location.href;
var newUri = uri.replace("&format=999", "&format=001");
utilities.debugPrint(newUri);
-function stringTrimmer(x) {
- var x = x.replace(/^[^\w(]+/, "");
- return x.replace(/[^\w)]+$/, "");
-}
-
utilities.loadDocument(newUri, browser, function(newBrowser) {
newDoc = newBrowser.contentDocument;
@@ -942,17 +802,13 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
if (prefix == ''x'') return namespace; else return null;
} : null;
- var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
- }
-
var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]'';
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
var record = new MARC_Record();
for(var i=0; i 0) {
var body = doc.getElementsByTagName("body");
if(body[0].innerHTML.indexOf("ISBN") < 0) {
@@ -1117,7 +960,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
wait();');
-REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-12 20:00:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL,
+REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-18 11:19:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
@@ -1126,10 +969,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
var uri = doc.location.href;
var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html");
newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html");
-
-var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
-}
utilities.loadDocument(newUri, browser, function(newBrowser) {
newDoc = newBrowser.contentDocument;
@@ -1180,7 +1019,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
wait();');
-REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-12 20:00:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-18 11:19:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
@@ -1204,10 +1043,6 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
-}
-
var uri = doc.location.href;
var uriRegexp = /^(.*)(\/[0-9]+)$/;
var m = uriRegexp.exec(uri);
@@ -1217,9 +1052,9 @@ utilities.debugPrint(newUri);
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
for(i in elmts) {
var elmt = elmts[i];
- var initialText = getNode(doc, elmt, ''./text()[1]'', nsResolver);
+ var initialText = utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver);
if(initialText.nodeValue == "\n\nViewing record\n") {
- var recNumber = getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue;
+ var recNumber = utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue;
}
}
@@ -1263,7 +1098,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
})
wait();');
-REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-12 20:00:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL,
+REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-18 11:19:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
@@ -1274,10 +1109,6 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
-}
-
var uri = doc.location.href;
var newUri = uri.replace("LabelDisplay", "MARCDisplay");
utilities.debugPrint(newUri);
@@ -1298,8 +1129,8 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
for(var i=0; i