From 20369f41b316164189c5a6d5aaff804abb843a38 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Sun, 18 Jun 2006 19:04:32 +0000 Subject: [PATCH] - Move commonly used scraper functions to ingester.js, rather than re-defining them in each scraper. This breaks Piggy Bank compatibility in our scrapers, but we will still be able to export our scrapers in a Piggy Bank compatible form. - Better handling of scraper RDF to item mapping. - Improved date handling. All scrapers now return ISO-style dates when possible. --- .../content/scholar/xpcom/ingester.js | 170 +++++--- scrapers.sql | 402 +++++------------- 2 files changed, 232 insertions(+), 340 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 3907562a8d..1896891fa5 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -207,17 +207,41 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su // essential components for Scholar and would take a great deal of effort to // implement. We can, however, always implement them later. -// These functions are for use by importMARCRecord. They're private, because, -// while they are useful, it's also nice if as many of our scrapers as possible -// are PiggyBank compatible, and if our scrapers used functions, that would -// break compatibility -Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - return author.replace(/ +/, ' '); +/* + * BEGIN FIREFOX SCHOLAR EXTENSIONS + * Functions below this point are extensions to the utilities provided by + * Piggy Bank. When used in external code, the repository will need to add + * a function definition when exporting in Piggy Bank format. + */ +Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; } -Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) { +Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); +} + +Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); author = author.replace(/ +/, ' '); @@ -232,6 +256,31 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) { return author; } +Scholar.Ingester.Utilities.prototype.cleanString = function(s) { + s = this.trimString(s); + return s.replace(/ +/g, " "); +} + +Scholar.Ingester.Utilities.prototype.superCleanString = function(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +Scholar.Ingester.Utilities.prototype.cleanTags = function(x) { + x = x.replace(/]*>/gi, "\n"); + return x.replace(/<[^>]+>/g, ""); +} + +// These functions are for use by importMARCRecord. They're private, because, +// while they are useful, it's also nice if as many of our scrapers as possible +// are PiggyBank compatible, and if our scrapers used functions, that would +// break compatibility +Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + return author.replace(/ +/, ' '); +} + Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); @@ -283,11 +332,11 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo // Extract ISSNs model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); // Extract creators - model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor); + model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor); + model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor); model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString); @@ -295,7 +344,7 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo // in the person subject field as the first entry var field = record.get_field_subfields('600'); if(field[0]) { - model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a'])); + model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); } } // Extract title @@ -312,6 +361,9 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString); } +/* + * END FIREFOX SCHOLAR EXTENSIONS + */ // These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be // accessed outside the sandbox, and even if it could, it wouldn't let scripts @@ -573,6 +625,19 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() { this._sandbox.done = function(){ me._scrapePageComplete(); }; } +Scholar.Ingester.Document.prototype._associateRDF = function(rdfUri, field, uri, item, typeID) { + var fieldID; + if(fieldID = Scholar.ItemFields.getID(field)) { + if(this.model.data[uri][rdfUri] && Scholar.ItemFields.isValidForType(fieldID, typeID)) { + item.setField(field, this.model.data[uri][rdfUri][0]); + } else { + Scholar.debug("discarded scraper " + field + " data: not valid for item type "+typeID); + } + } else { + Scholar.debug("discarded scraper " + field + " data: no field in database"); + } +} + /* * Add data ingested using RDF to database * (Ontologies are hard-coded until we have a real way of dealing with them) @@ -585,17 +650,27 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { var prefixDCMI = 'http://purl.org/dc/dcmitype/'; var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; + var typeToTypeID = new Object(); + typeToTypeID[prefixDummy + 'book'] = 1; + typeToTypeID[prefixDummy + 'journal'] = 2; + typeToTypeID[prefixDummy + 'newspaper'] = 2; + try { for(var uri in this.model.data) { - if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { - var newItem = Scholar.Items.getNewItemByType(2); - } else { - var newItem = Scholar.Items.getNewItemByType(1); + var typeID = typeToTypeID[this.model.data[uri][prefixRDF + 'type']]; + if(!typeID) { + var typeID = 1; } + + var newItem = Scholar.Items.getNewItemByType(typeID); + + // Handle source and title newItem.setField("source", uri); if(this.model.data[uri][prefixDC + 'title']) { newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]); } + + // Handle creators and contributors var creatorIndex = 0; if(this.model.data[uri][prefixDC + 'creator']) { for(i in this.model.data[uri][prefixDC + 'creator']) { @@ -619,54 +694,45 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { creatorIndex++; } } - if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { - if(this.model.data[uri][prefixDummy + 'publication']) { - newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]); - } - if(this.model.data[uri][prefixDummy + 'volume']) { - newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]); - } - if(this.model.data[uri][prefixDummy + 'number']) { - newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]); - } - if(this.model.data[uri][prefixDummy + 'pages']) { - newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]); - } - if(this.model.data[uri][prefixDC + 'identifier']) { - for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') { - newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); - break; - } - } - } - } else { - if(this.model.data[uri][prefixDC + 'publisher']) { - newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]); - } + + // Handle years, extracting from date if necessary + if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) { if(this.model.data[uri][prefixDC + 'year']) { newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); } else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) { newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); } - if(this.model.data[uri][prefixDC + 'hasVersion']) { - newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]); - } - if(this.model.data[uri][prefixDummy + 'series']) { - newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); - } - if(this.model.data[uri][prefixDummy + 'place']) { - newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); - } - if(this.model.data[uri][prefixDC + 'identifier']) { + } + + // Handle ISBNs/ISSNs + if(this.model.data[uri][prefixDC + 'identifier']) { + var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID); + var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID); + if(needISSN || needISBN) { for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { + firstFour = this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4); + if(needISSN && firstFour == 'ISSN') { + newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } + if(needISBN && firstFour == 'ISBN') { newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); break; } } } } + + this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID); + this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID); + this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID); + this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID); + this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID); + this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID); + this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID); + this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID); + this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID); + this.items.push(newItem); } } catch(ex) { diff --git a/scrapers.sql b/scrapers.sql index ae14f1e3aa..3547026705 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,9 +1,9 @@ --- 5 +-- 6 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-12 20:00:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-18 11:19:00')); -REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-12 20:00:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -13,50 +13,15 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); -} - -var cleanString = function(s) { - s = utilities.trimString(s); - return s.replace(/ +/g, " "); -} - -var dateToISO = function(jsDate) { - var date = ""; - var year = jsDate.getFullYear().toString(); - var month = (jsDate.getMonth()+1).toString(); - var day = jsDate.getDate().toString(); - - for(var i = year.length; i<4; i++) { - date += "0"; - } - date += year+"-"; - - if(month.length == 1) { - date += "0"; - } - date += month+"-"; - - if(day.length == 1) { - date += "0"; - } - date += day; - - return date; -} - var uri = doc.location.href; -model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - // Retrieve authors var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here + model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here } // Retrieve data from "Product Details" box @@ -64,14 +29,14 @@ var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"] var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); + if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { + var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1); jsDate = new Date(jsDate); - var date = dateToISO(jsDate); + var date = utilities.dateToISO(jsDate); value = value.substring(0, value.lastIndexOf("(")-1); } @@ -95,13 +60,14 @@ for (var i = 0; i < elmts.length; i++) { var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); +var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } -model.addStatement(uri, prefixDC + ''title'', title);'); +model.addStatement(uri, prefixDC + ''title'', title); +model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);'); -REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-12 20:00:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', +REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return true; } @@ -139,22 +105,6 @@ if(rMatch) { var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; -model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - -function cleanAuthor(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); - author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split('', ''); - if(splitNames.length > 1) { - author = splitNames[1]+'' ''+splitNames[0]; - } - return author; -} - utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) { var lines = text.split(''\n''); for(var i=0;i 1) { - author = splitNames[1]+'' ''+splitNames[0]; - } - return author; -} - var uri = doc.location.href; var data = new Object(); @@ -557,23 +459,29 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; try { - var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); + var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); if(!node) { - var node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); + var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); } if(node) { - var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); + var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); field = field.toLowerCase(); - var value = stringTrimmer(node.nodeValue); + var value = utilities.superCleanString(node.nodeValue); var rdfUri = null; if(field == "publisher") { rdfUri = prefixDC + ''publisher''; } else if(field == "pub date") { - rdfUri = prefixDC + ''date''; - value = getAnyNumber(value); + rdfUri = prefixDC + ''year''; + + var re = /[0-9]+/; + var m = re.exec(value); + value = m[0]; } else if(field == "isbn") { rdfUri = prefixDC + ''identifier''; - value = ''ISBN ''+getISBN(value); + + var re = /^[0-9](?:[0-9X]+)/; + var m = re.exec(value); + value = m[0]; } else if(field == "title") { rdfUri = prefixDC + ''title''; var titleParts = value.split(" / "); @@ -584,10 +492,10 @@ for (var i = 0; i < elmts.length; i++) { value = pubParts[0]; } else if(field == "personal author") { rdfUri = prefixDC + ''creator''; - value = cleanAuthor(node.nodeValue); + value = utilities.cleanAuthor(node.nodeValue); } else if(field == "added author") { rdfUri = prefixDC + ''contributor''; - value = cleanAuthor(node.nodeValue); + value = utilities.cleanAuthor(node.nodeValue); } else if(field == "corporate author") { rdfUri = prefixDC + ''creator''; } @@ -611,10 +519,12 @@ for (var i = 0; i < elmts.length; i++) { } } catch (e) {} -} +} + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); '); -REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-12 20:00:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '', +REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -625,37 +535,6 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); -} - -function stringTrimmer(x) { - var x = x.replace(/^[^\w(]+/, ""); - return x.replace(/[^\w)]+$/, ""); -} - -function getPageRange(x) { - var re = /[0-9\-]+/; - var m = re.exec(x); - if(m) { - return m[0]; - } -} - -function cleanAuthor(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); - author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split('', ''); - if(splitNames.length > 1) { - author = splitNames[1]+'' ''+splitNames[0]; - } - return author; -} - var uri = doc.location.href; var data = new Object(); @@ -685,7 +564,7 @@ for (var i = 0; i < elmts.length; i++) { var authorElmt = authorElmts[j]; author += authorElmt.nodeValue; } - model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true); + model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true); } // Other info @@ -693,23 +572,24 @@ var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var field = stringTrimmer(getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); + var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); if(field == "publication title") { - var publication = getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); + var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); if(publication.nodeValue) { - model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication.nodeValue), true); + model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true); } - var place = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(place.nodeValue) { - model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place.nodeValue), true); + model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true); } - var date = getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); + var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); if(date.nodeValue) { - model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date.nodeValue), true); + var jsDate = new Date(utilities.superCleanString(date.nodeValue)); + model.addStatement(uri, prefixDC + ''date'', utilities.dateToISO(jsDate), true); } - var moreInfo = getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); + var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); if(moreInfo.nodeValue) { - moreInfo = stringTrimmer(moreInfo.nodeValue); + moreInfo = utilities.superCleanString(moreInfo.nodeValue); var parts = moreInfo.split(";\xA0"); var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/ @@ -718,34 +598,38 @@ for (var i = 0; i < elmts.length; i++) { var m = issueRegexp.exec(issueInfo[j]); var info = m[1].toLowerCase(); if(info == "vol") { - model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true); + model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true); } else if(info == "iss" || info == "no") { - model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true); + model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true); } } - if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") { - var pages = getPageRange(parts[1]); - if(pages) { - model.addStatement(uri, prefixDummy + ''pages'', pages, true); + if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") { + var re = /[0-9\-]+/; + var m = re.exec(parts[1]); + + if(m) { + model.addStatement(uri, prefixDummy + ''pages'', m[0], true); } } } } else if(field == "source type") { - var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value.nodeValue) { - value = stringTrimmer(value.nodeValue).toLowerCase(); + value = utilities.superCleanString(value.nodeValue).toLowerCase(); - if(value == "newspaper" || value == "periodical") { + if(value == "periodical") { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); + } else if(value == "newspaper") { + model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false); } else { model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); } } } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { - var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value) { var type; - value = stringTrimmer(value.nodeValue); + value = utilities.superCleanString(value.nodeValue); if(value.length == 10 || value.length == 13) { type = "ISBN"; } else if(value.length == 8) { @@ -758,7 +642,7 @@ for (var i = 0; i < elmts.length; i++) { } }'); -REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-12 20:00:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-18 11:19:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', 'if(doc.title.substring(0, 8) == "Article ") { return true; } @@ -773,20 +657,6 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -function cleanAuthor(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); - author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split('', ''); - if(splitNames.length > 1) { - author = splitNames[1]+'' ''+splitNames[0]; - } - return author; -} - var uri = doc.location.href; var xpath = ''/html/body//comment()''; @@ -837,22 +707,17 @@ for (var i = 0; i < elmts.length; i++) { model.addStatement(uri, prefixDC + "date", date.substring(1), false); } } else if(field == "author") { - model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false); + model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), false); } } model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); -REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-12 20:00:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL, +REPLACE INTO "scrapers" VALUES('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-18 10:13:00', 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; -function clearTags(x) { - x = x.replace(/]*>/gi, "\n"); - return x.replace(/<[^>]+>/g, ""); -} - var uri = doc.location.href; var citationDataDiv; @@ -868,10 +733,11 @@ centerElements = citationDataDiv.getElementsByTagName("center"); var elementParts = centerElements[0].innerHTML.split(/]*>/gi); model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); -var dateRegexp = /]*>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/; +var dateRegexp = /]*>(?:)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { - model.addStatement(uri, prefixDC + "date", m[1]+" "+m[2], true); + var jsDate = new Date(m[1]+" "+m[2]); + model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); model.addStatement(uri, prefixDC + "date", elementParts[1], true); @@ -887,12 +753,12 @@ if(cutIndex > 0) { citationData = citationDataDiv.innerHTML; } -citationData = clearTags(citationData); +citationData = utilities.cleanTags(citationData); var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; var m = headlineRegexp.exec(citationData); if(m) { - model.addStatement(uri, prefixDC + "title", clearTags(m[1]), true); + model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true); } var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; @@ -903,6 +769,9 @@ if(m) { m[1] = m[1].substring(3); } model.addStatement(uri, prefixDC + "creator", m[1], true); + model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false); +} else { + model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); } var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; @@ -912,13 +781,9 @@ if(m) { for(i in authors) { model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); } -} +}'); -model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); - -utilities.debugPrint(citationData);'); - -REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-12 20:00:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL, +REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-18 11:19:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -929,11 +794,6 @@ var uri = doc.location.href; var newUri = uri.replace("&format=999", "&format=001"); utilities.debugPrint(newUri); -function stringTrimmer(x) { - var x = x.replace(/^[^\w(]+/, ""); - return x.replace(/[^\w)]+$/, ""); -} - utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -942,17 +802,13 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { if (prefix == ''x'') return namespace; else return null; } : null; - var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); - } - var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var record = new MARC_Record(); for(var i=0; i 0) { var body = doc.getElementsByTagName("body"); if(body[0].innerHTML.indexOf("ISBN") < 0) { @@ -1117,7 +960,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { wait();'); -REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-12 20:00:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL, +REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-18 11:19:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -1126,10 +969,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); - -var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); -} utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -1180,7 +1019,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-12 20:00:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-18 11:19:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1204,10 +1043,6 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); -} - var uri = doc.location.href; var uriRegexp = /^(.*)(\/[0-9]+)$/; var m = uriRegexp.exec(uri); @@ -1217,9 +1052,9 @@ utilities.debugPrint(newUri); var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); for(i in elmts) { var elmt = elmts[i]; - var initialText = getNode(doc, elmt, ''./text()[1]'', nsResolver); + var initialText = utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); if(initialText.nodeValue == "\n\nViewing record\n") { - var recNumber = getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue; + var recNumber = utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue; } } @@ -1263,7 +1098,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format= }) wait();'); -REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-12 20:00:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL, +REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-18 11:19:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -1274,10 +1109,6 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); -} - var uri = doc.location.href; var newUri = uri.replace("LabelDisplay", "MARCDisplay"); utilities.debugPrint(newUri); @@ -1298,8 +1129,8 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { for(var i=0; i