From 152c9bf9e7ee421d8e56db29adef7680b392c096 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Tue, 6 Jun 2006 18:25:45 +0000 Subject: [PATCH] - Small changes to MARC record support - Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background - Added scraper code to SVN repository (now includes 12 scrapers, see Writeboard for details) To update to the latest versions of all scrapers, ensure you have an up-to-date version of sqlite3, then run: sqlite3 ~/Library/Application\ Support/Firefox/Profiles/profileName/scholar.sqlite < scrapers.sql --- .../content/scholar/ingester/browser.js | 3 +- .../content/scholar/ingester/browser.xul | 3 + .../content/scholar/xpcom/ingester.js | 217 ++-- .../chromeFiles/content/scholar/xpcom/marc.js | 53 +- scrapers.sql | 1014 +++++++++++++++++ 5 files changed, 1205 insertions(+), 85 deletions(-) create mode 100644 scrapers.sql diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 6f062bccc8..ad768d915f 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() { */ Scholar.Ingester.Interface.chromeLoad = function() { Scholar.Ingester.Interface.tabBrowser = document.getElementById("content"); + Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser"); Scholar.Ingester.Interface.appContent = document.getElementById("appcontent"); Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image"); @@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) { browser.setAttribute("scholar-key", key); } } - Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser); + Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, 
Scholar.Ingester.Interface.hiddenBrowser); Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper(); } diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul index 649a123718..d252a04165 100755 --- a/chrome/chromeFiles/content/scholar/ingester/browser.xul +++ b/chrome/chromeFiles/content/scholar/ingester/browser.xul @@ -19,4 +19,7 @@ + + + diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index 8404ddbbc0..627f137d0e 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} ///////////////////////////////////////////////////////////////// // Scholar.Ingester.Utilities class, a set of methods to assist in data // extraction. Most code here was stolen directly from the Piggy Bank project. -Scholar.Ingester.Utilities = function() {} +Scholar.Ingester.Utilities = function(hiddenBrowser) { + this.hiddenBrowser = hiddenBrowser; +} // Adapter for Piggy Bank function to print debug messages; log level is // fixed at 4 (could change this) @@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren // Loads a single document for a scraper, running succeeded() on success or // failed() on failure Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) { + Scholar.debug("loadDocument called"); this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); } @@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe // exception - a function to execute if an exception occurs (exceptions are // also logged in the Firefox Scholar log) Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, 
exception) { + var hiddenBrowser = this.hiddenBrowser; + Scholar.debug("processDocuments called"); + try { if (urls.length == 0) { if (firstDoc) { @@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD if (urlIndex < urls.length) { try { var url = urls[urlIndex]; - var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser"); - b.loadURI(url); + Scholar.debug("loading "+url); + hiddenBrowser.loadURI(url); } catch (e) { - exception(e); Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2); + exception(e); } } else { - window.setTimeout(done, 10); + hiddenBrowser.setTimeout(done, 10); } }; var onLoad = function() { - try { - var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser; - processor(b.contentDocument, doLoad); - } catch (e) { - exception(e); - Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); + Scholar.debug("onLoad called"); + if(hiddenBrowser.id == "scholar-hidden-browser") { + hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true); + try { + var newHiddenBrowser = new Object(); + Scholar.debug("new hidden browser"); + newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; + newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; + Scholar.debug("added attributes"); + processor(newHiddenBrowser); + Scholar.debug("called processor"); + } catch (e) { + Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); + exception(e); + } } }; var init = function() { - var listener; - listener.onStateChange = function(webProgress, request, stateFlags, status) { - if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 && - request.name == urls[urlIndex]) { - try { - Scholar.Ingester.progressDialog.setTimeout(onLoad, 10); - } catch (e) { - exception(e); - Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 
2); - } - } - }; - - var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser"); - tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS); + Scholar.debug("init called"); + hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true); if (firstDoc) { + Scholar.debug("processing"); processor(firstDoc, doLoad); } else { + Scholar.debug("doing load"); doLoad(); } } - w.addEventListener("load", init, false); + init(); } catch (e) { + Scholar.debug("processDocuments: " + e); exception(e); - PB_Debug.print("processDocuments: " + e); } } @@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su // break compatibility Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - return author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + return author.replace(/ +/, ' '); } Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); + author = author.replace(/ +/, ' '); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } var splitNames = author.split(', '); if(splitNames.length > 1) { author = splitNames[1]+' '+splitNames[0]; @@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) { return author; } +Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + var regexp = /^[^ ]*/; + var m = regexp.exec(author); + if(m) { + return m[0]; + } +} + Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { 
if(!part) { part = 'a'; @@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, // This is an extension to PiggyBank's architecture. It's here so that we don't // need an enormous library for each scraper that wants to use MARC records -Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) { +Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) { var prefixDC = 'http://purl.org/dc/elements/1.1/'; var prefixDCMI = 'http://purl.org/dc/dcmitype/'; var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; - var record = new Scholar.Ingester.MARC_Record(); - record.load(text, format); - // Extract ISBNs - model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN '); + model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN '); // Extract ISSNs - model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN '); + model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); // Extract creators model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor); model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString); - if(!model.data[uri][prefixDC + 'creator']) { + model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor); + model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '711', 
prefixDC + 'contributor', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString); + if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author + // in the person subject field as the first entry var field = record.get_field_subfields('600'); - if(field) { - model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a'])); + if(field[0]) { + model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a'])); } } // Extract title @@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu /* * Constructor for Document object */ -Scholar.Ingester.Document = function(browserWindow){ +Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){ this.browser = browserWindow; + this.model = new Scholar.Ingester.Model(); this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"] .getService(Ci.nsIAppShellService); - this.scraper = null - this.model = new Scholar.Ingester.Model(); + this.scraper = null; + this.hiddenBrowser = hiddenBrowser; this._generateSandbox(); } @@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() { this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); this.sandbox.browser = this.browser; this.sandbox.doc = this.sandbox.browser.contentDocument; - this.sandbox.utilities = new Scholar.Ingester.Utilities; + this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser); this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow); this.sandbox.window = this.window; this.sandbox.model = this.model; this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; + this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record; + 
this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); var me = this; this.sandbox.wait = function(){ me._waitForCompletion = true; }; @@ -552,50 +577,90 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; for(var uri in this.model.data) { - var newItem = Scholar.Items.getNewItemByType(1); + if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { + var newItem = Scholar.Items.getNewItemByType(2); + } else { + var newItem = Scholar.Items.getNewItemByType(1); + } newItem.setField("source", uri); if(this.model.data[uri][prefixDC + 'title']) { newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]); } - if(this.model.data[uri][prefixDC + 'publisher']) { - newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]); - } - if(this.model.data[uri][prefixDC + 'year']) { - if(this.model.data[uri][prefixDC + 'year'].length == 4) { - newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); - } else { - try { - newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring( - this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1, - this.model.data[uri][prefixDC + 'year'][0].length)); - } catch(e) {} - } - } - if(this.model.data[uri][prefixDC + 'edition']) { - newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]); - } - if(this.model.data[uri][prefixDummy + 'series']) { - newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); - } - if(this.model.data[uri][prefixDummy + 'place']) { - newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); - } - if(this.model.data[uri][prefixDC + 'identifier']) { - for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { - newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); - break; - } - } - } 
+ var creatorIndex = 0; if(this.model.data[uri][prefixDC + 'creator']) { for(i in this.model.data[uri][prefixDC + 'creator']) { var creator = this.model.data[uri][prefixDC + 'creator'][i]; var spaceIndex = creator.lastIndexOf(" "); var lastName = creator.substring(spaceIndex+1, creator.length); var firstName = creator.substring(0, spaceIndex); + + newItem.setCreator(creatorIndex, firstName, lastName, 1); + creatorIndex++; + } + } + if(this.model.data[uri][prefixDC + 'contributor']) { + for(i in this.model.data[uri][prefixDC + 'contributor']) { + var creator = this.model.data[uri][prefixDC + 'contributor'][i]; + var spaceIndex = creator.lastIndexOf(" "); + var lastName = creator.substring(spaceIndex+1, creator.length); + var firstName = creator.substring(0, spaceIndex); - newItem.setCreator(i, firstName, lastName); + newItem.setCreator(creatorIndex, firstName, lastName, 2); + creatorIndex++; + } + } + if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { + if(this.model.data[uri][prefixDummy + 'publication']) { + newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]); + } + if(this.model.data[uri][prefixDummy + 'volume']) { + newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]); + } + if(this.model.data[uri][prefixDummy + 'number']) { + newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]); + } + if(this.model.data[uri][prefixDummy + 'pages']) { + newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]); + } + if(this.model.data[uri][prefixDC + 'identifier']) { + for(i in this.model.data[uri][prefixDC + 'identifier']) { + if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') { + newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } + } + } + } else { + if(this.model.data[uri][prefixDC + 'publisher']) { + newItem.setField("publisher", this.model.data[uri][prefixDC + 
'publisher'][0]); + } + if(this.model.data[uri][prefixDC + 'year']) { + if(this.model.data[uri][prefixDC + 'year'].length == 4) { + newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); + } else { + try { + newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring( + this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1, + this.model.data[uri][prefixDC + 'year'][0].length)); + } catch(e) {} + } + } + if(this.model.data[uri][prefixDC + 'edition']) { + newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]); + } + if(this.model.data[uri][prefixDummy + 'series']) { + newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); + } + if(this.model.data[uri][prefixDummy + 'place']) { + newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); + } + if(this.model.data[uri][prefixDC + 'identifier']) { + for(i in this.model.data[uri][prefixDC + 'identifier']) { + if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { + newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } + } } } newItem.save(); diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js index 13ae7cfb2a..6cf46d1469 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/marc.js +++ b/chrome/chromeFiles/content/scholar/xpcom/marc.js @@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s } this.add_field(tag,ind1,ind2,value); } - } - if (f == 'MARC_Harvard') { + } else if (f == 'MARC_Harvard') { var linee = s.split('\n'); for (var i=0; i '008' && tag < '899') { // jumps low and high tags + if (tag != '040') this.add_field(tag,ind1,ind2,value); + } + } + this.add_field_005(); } this.update_record_length(); @@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen return false; } -function 
MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield +Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield this.tag = tag; this.occ = rec.count_occ(tag)+1; // occurrence order no. this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' '; @@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { // Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record if (tag.length != 3) { return false; } - var F = new MARC_field(this,tag,ind1,ind2,value); + var F = new this.MARC_field(this,tag,ind1,ind2,value); // adds pointer to list of fields this.variable_fields[this.variable_fields.length] = F; // adds the entry to the directory diff --git a/scrapers.sql b/scrapers.sql new file mode 100644 index 0000000000..abb6c123f3 --- /dev/null +++ b/scrapers.sql @@ -0,0 +1,1014 @@ +BEGIN TRANSACTION; +DELETE FROM scrapers; +INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); +} + +var cleanString = function(s) { + s = utilities.trimString(s); + return s.replace(/ +/g, " "); +} + +var uri = doc.location.href; + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + +// Retrieve authors +var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + + model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here +} + +// Retrieve data from "Product Details" box +var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); + if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) { + var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + + if(attribute == "Publisher:") { + if(value.lastIndexOf("(") != -1) { + var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + value = value.substring(0, value.lastIndexOf("(")-1); + } + if(value.lastIndexOf(";") != -1) { + var edition = value.substring(value.lastIndexOf(";")+2, value.length); + value = value.substring(0, value.lastIndexOf(";")); + } + model.addStatement(uri, prefixDC + ''publisher'', value); + model.addStatement(uri, prefixDC + ''date'', date); + model.addStatement(uri, prefixDC + ''hasVersion'', edition); + } else if(attribute == "Language:") { + 
model.addStatement(uri, prefixDC + ''language'', value); + } else if(attribute == "ISBN:") { + model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); + } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { + model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); + model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); + } + } +} + +var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); +if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { + title = title.substring(0, title.lastIndexOf("(")-1); +} +model.addStatement(uri, prefixDC + ''title'', title);'); + +INSERT INTO "scrapers" VALUES(2, NULL, NULL, 20060603002000, 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', +'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { + return true; +} +return false;', +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; +var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; +var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; +var lineRegexp = /^([\w() ]+): *(.*)$/; +var publisherRegexp = /^(.*), (.*?),?$/; + +var uri = doc.location.href; + +var sMatch = sessionRegexp.exec(uri); +var sessionid = sMatch[1]; + +var nMatch = numberRegexp.exec(uri); +if(nMatch) { + var number = nMatch[1]; +} else { + number = 1; +} + +var rMatch = resultsetRegexp.exec(uri); +if(rMatch) { + var resultset = rMatch[1]; +} else { + // It''s in an XPCNativeWrapper, so we have 
to do this black magic + resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; +} + +var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + +function cleanAuthor(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split('', ''); + if(splitNames.length > 1) { + author = splitNames[1]+'' ''+splitNames[0]; + } + return author; +} + +utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) { + var lines = text.split(''\n''); + for(var i=0;i") { + haveStarted = true; + } + } + + // Loop through again so that we can add with the stableURL + model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false); + for(i in data) { + if(data[i].length) { + for(j in data[i]) { + model.addStatement(stableURL, i, data[i][j]); + } + } + } + + done(); + }) + }) +}); + +wait();'); + +INSERT INTO "scrapers" VALUES(5, NULL, NULL, 20060603002000, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var uri = doc.location.href; + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); + +var month, year; + +var metaTags = doc.getElementsByTagName("meta"); + +function associateMeta(field, rdfUri) { + 
var field = metaTags.namedItem(field); + if(field) { + model.addStatement(uri, rdfUri, field.getAttribute("content"), false); + } +} + +associateMeta("Title", prefixDC + "title"); +associateMeta("Journal", prefixDummy + "publication"); +associateMeta("Volume", prefixDummy + "volume"); +associateMeta("Issue", prefixDummy + "number"); + +var author = metaTags.namedItem("Author"); +if(author) { + var authors = author.getAttribute("content").split(" and "); + for(j in authors) { + model.addStatement(uri, prefixDC + "creator", authors[j], false); + } +} + +var month = metaTags.namedItem("PublicationMonth"); +var year = metaTags.namedItem("PublicationYear"); +if(month && year) { + model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); +} +'); + +INSERT INTO "scrapers" VALUES(6, NULL, NULL, 20060603002000, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset\&FF=', NULL, +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var uri = doc.location.href; + +var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); +var m = matchRegexp.exec(uri); +var newUri = m[1]+''marc''+m[2]; + +utilities.loadDocument(newUri, browser, function(newBrowser) { + newDoc = newBrowser.contentDocument; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); + } + + var xpath = ''//pre''; + var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + + var text = getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; + + var record = new MARC_Record(); + record.load(text, "MARC_PAC"); + model = utilities.importMARCRecord(record, uri, model); + done(); +}, function() {}) + +wait();'); + +INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +'var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +if(elmts.length) { + return true; +} +return false;', +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); +} + +function stringTrimmer(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +function getAnyNumber(x) { + var re = /[0-9]+/; + var m = re.exec(x); + if(m) { + return m[0]; + } +} + +function getISBN(x) { + var re = /^[0-9](?:[0-9X]+)/; + var m = re.exec(x); + if(m) { + return m[0]; + } +} + +function cleanAuthor(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split('', ''); + if(splitNames.length > 1) { + author = splitNames[1]+'' ''+splitNames[0]; + } + return author; +} + +var uri = doc.location.href; +var data = new Object(); + +var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + try { + var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); + if(!node) { + var node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); + } + if(node) { + var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); + field = field.toLowerCase(); + var value = stringTrimmer(node.nodeValue); + var rdfUri = null; + if(field == "publisher") { + rdfUri = prefixDC + ''publisher''; + } else if(field == "pub date") { + rdfUri = prefixDC + ''date''; + value = getAnyNumber(value); + } else if(field == "isbn") { + rdfUri = prefixDC + ''identifier''; + value = ''ISBN ''+getISBN(value); + } else if(field == "title") { + rdfUri = prefixDC + ''title''; + var titleParts = value.split(" / "); + 
value = titleParts[0]; + } else if(field == "publication info") { + rdfUri = prefixDummy + ''place''; + var pubParts = value.split(" : "); + value = pubParts[0]; + } else if(field == "personal author") { + rdfUri = prefixDC + ''creator''; + value = cleanAuthor(node.nodeValue); + } else if(field == "added author") { + rdfUri = prefixDC + ''contributor''; + value = cleanAuthor(node.nodeValue); + } else if(field == "corporate author") { + rdfUri = prefixDC + ''creator''; + } + if(rdfUri) { + var insert = true; + if(data && data[rdfUri]) { + for(j in data[rdfUri]) { + if(data[rdfUri][j] == value) { + insert = false; + break; + } + } + } else if(!data[rdfUri]) { + data[rdfUri] = new Array(); + } + if(insert) { + data[rdfUri].push(value); + model.addStatement(uri, rdfUri, value, true); + } + } + } + } catch (e) {} + +} +'); + +INSERT INTO "scrapers" VALUES(8, NULL, NULL, 20060603002000, 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '', +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); +} + +function stringTrimmer(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +function getPageRange(x) { + var re = /[0-9\-]+/; + var m = re.exec(x); + if(m) { + return m[0]; + } +} + +function cleanAuthor(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split('', ''); + if(splitNames.length > 1) { + author = splitNames[1]+'' ''+splitNames[0]; + } + return author; +} + +var uri = doc.location.href; +var data = new Object(); + +// Title +var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +var title = ""; +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + title += elmt.nodeValue; +} +if(title) { + model.addStatement(uri, prefixDC + ''title'', title, true); +} + +// Authors +var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + + // Dirty hack to fix highlighted words + var xpath = ''.//text()''; + var author = ""; + var authorElmts = utilities.gatherElementsOnXPath(doc, elmt, xpath, nsResolver); + for (var j = 0; j < authorElmts.length; j++) { + var authorElmt = authorElmts[j]; + author += authorElmt.nodeValue; + } + model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true); +} + +// Other info +var xpath = 
''/html/body/span[@class="textMedium"]/font/table/tbody/tr''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var field = stringTrimmer(getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); + if(field == "publication title") { + var publication = getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); + if(publication.nodeValue) { + model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication.nodeValue), true); + } + var place = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + if(place.nodeValue) { + model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place.nodeValue), true); + } + var date = getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); + if(date.nodeValue) { + model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date.nodeValue), true); + } + var moreInfo = getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); + if(moreInfo.nodeValue) { + moreInfo = stringTrimmer(moreInfo.nodeValue); + var parts = moreInfo.split(";\xA0"); + + var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/ + var issueInfo = parts[0].split(",\xA0"); + for(j in issueInfo) { + var m = issueRegexp.exec(issueInfo[j]); + var info = m[1].toLowerCase(); + if(info == "vol") { + model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true); + } else if(info == "iss" || info == "no") { + model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true); + } + } + if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") { + var pages = getPageRange(parts[1]); + if(pages) { + model.addStatement(uri, prefixDummy + ''pages'', pages, true); + } + } + } + } else if(field == "source type") { + var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + if(value.nodeValue) { + value = stringTrimmer(value.nodeValue).toLowerCase(); + + if(value == "newspaper" || value == "periodical") 
{ + model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); + } else { + model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + } + } + } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { + var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + if(value) { + var type; + value = stringTrimmer(value.nodeValue); + if(value.length == 10 || value.length == 13) { + type = "ISBN"; + } else if(value.length == 8) { + type = "ISSN"; + } + if(type) { + model.addStatement(uri, prefixDC + "identifier", type+" "+value, false); + } + } + } +}'); + +INSERT INTO "scrapers" VALUES(9, NULL, NULL, 20060603002000, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +'if(doc.title.substring(0, 8) == "Article ") { + return true; +} +return false;', +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + +function cleanAuthor(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''''); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split('', ''); + if(splitNames.length > 1) { + author = splitNames[1]+'' ''+splitNames[0]; + } + return author; +} + +var uri = doc.location.href; + +var xpath = ''/html/body//comment()''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var colon = elmt.nodeValue.indexOf(":"); + var field = elmt.nodeValue.substring(1, colon).toLowerCase(); + var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1); + if(field == "title") { + model.addStatement(uri, prefixDC + "title", value, false); + } else if(field == "journal") { + model.addStatement(uri, prefixDummy + "publication", value, false); + } else if(field == "pi") { + parts = value.split(" "); + var date = ""; + var isDate = true; + var rdfUri; + for(j in parts) { + firstChar = parts[j].substring(0, 1); + rdfUri = false; + + if(firstChar == "v") { + rdfUri = prefixDummy + "volume"; + } else if(firstChar == "i") { + rdfUri = prefixDummy + "issue"; + } else if(firstChar == "p") { + rdfUri = prefixDummy + "pages"; + var pagesRegexp = /p(\w+)\((\w+)\)/; + var match = pagesRegexp.exec(parts[j]); + if(match) { + var finalPage = parseInt(match[1])+parseInt(match[2]) + parts[j] = "p"+match[1]+"-"+finalPage.toString(); + } + } + + if(rdfUri) { + isDate = false; + if(parts[j] != "pNA") { // not a real page number + var content = parts[j].substring(1); + model.addStatement(uri, rdfUri, content, true); + } + } else if(isDate) { + date += " "+parts[j]; + } + } + if(date != "") { + model.addStatement(uri, prefixDC + "date", date.substring(1), false); + } + } 
else if(field == "author") { + model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false); + } +} +model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); + +INSERT INTO "scrapers" VALUES(10, NULL, NULL, 20060603002000, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL, +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +function clearTags(x) { + x = x.replace(/]*>/gi, "\n"); + return x.replace(/<[^>]+>/g, ""); +} + +var uri = doc.location.href; + +var citationDataDiv; +var divs = doc.getElementsByTagName("div"); +for(i in divs) { + if(divs[i].className == "bodytext") { + citationDataDiv = divs[i]; + break; + } +} + +centerElements = citationDataDiv.getElementsByTagName("center"); +var elementParts = centerElements[0].innerHTML.split(/]*>/gi); +model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); + +var dateRegexp = /]*>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/; +var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); +if(m) { + model.addStatement(uri, prefixDC + "date", m[1]+" "+m[2], true); +} else { + var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); + model.addStatement(uri, prefixDC + "date", elementParts[1], true); +} + +var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:"); +if(cutIndex < 0) { + cutIndex = citationDataDiv.innerHTML.indexOf("TEXT:"); +} +if(cutIndex > 0) { + citationData = citationDataDiv.innerHTML.substring(0, cutIndex); +} else { + citationData = citationDataDiv.innerHTML; +} + +citationData = clearTags(citationData); + +var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; +var m = headlineRegexp.exec(citationData); +if(m) { + model.addStatement(uri, prefixDC + 
"title", clearTags(m[1]), true); +} + +var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; +var m = bylineRegexp.exec(citationData); +if(m) { + utilities.debugPrint(m[1].substring(0, 3).toLowerCase()); + if(m[1].substring(0, 3).toLowerCase() == "by ") { + m[1] = m[1].substring(3); + } + model.addStatement(uri, prefixDC + "creator", m[1], true); +} + +var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; +var m = authorRegexp.exec(citationData); +if(m) { + var authors = m[1].split(/, (?:and )?/); + for(i in authors) { + model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); + } +} + +model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); + +utilities.debugPrint(citationData);'); + +INSERT INTO "scrapers" VALUES(11, NULL, NULL, 20060603002000, 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL, +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var uri = doc.location.href; + +var newUri = uri.replace("&format=999", "&format=001"); +utilities.debugPrint(newUri); + +function stringTrimmer(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +utilities.loadDocument(newUri, browser, function(newBrowser) { + newDoc = newBrowser.contentDocument; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + /* returns what the first iterateNext() call yields for xpath evaluated against contextNode */ var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); + } + + var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]''; + var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + var record = new MARC_Record(); + /* NOTE(review): the loop header below is incomplete as written; the text between the loop counter and the length comparison appears to be missing, including the statements that define field, code and value. Restore it from the original scrapers.sql before relying on this scraper. */ for(var i=0; i 3) { + var ind1 = field.charAt(3); + if(field.length > 4) { + var ind2 = field.charAt(4); + } + } + record.add_field(code, ind1, ind2, value); + } + } + + /* pass the assembled record to the MARC importer, then call done() */ model = utilities.importMARCRecord(record, uri, model); + done(); +}, function() {}) + +/* wait() is called right after the background load is started; done() is called from the load callback above */ wait();'); + + +INSERT INTO "scrapers" VALUES(12, NULL, NULL, 20060603002000, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL, +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +var prefixDC = ''http://purl.org/dc/elements/1.1/''; +var prefixDCMI = ''http://purl.org/dc/dcmitype/''; +var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; + +var uri = doc.location.href; +/* the current address with the fullmarc=true parameter appended */ var newUri = uri+''&fullmarc=true''; +utilities.debugPrint(newUri); + +/* drops leading characters that are neither word characters nor an opening parenthesis, and trailing characters that are neither word characters nor a closing parenthesis */ function stringTrimmer(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +var getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext(); +} + +/* NOTE(review): this definition continues past the end of the excerpt */ var getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; i