From f43791701679b2e9d2ed0b3c175ad5d72bee3e9a Mon Sep 17 00:00:00 2001
From: Simon Kornblith
Date: Wed, 7 Jun 2006 21:26:55 +0000
Subject: [PATCH] Add Project MUSE scraper

---
 scrapers.sql | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 3 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql
index cd690a37d8..5e33237bed 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1151,7 +1151,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
 
 
 wait();');
-
 
 INSERT INTO "scrapers" VALUES(16, NULL, NULL, 20060603002000, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI;
 var nsResolver = namespace ? function(prefix) {
@@ -1235,7 +1234,6 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
 })
 
 wait();');
-
 
 INSERT INTO "scrapers" VALUES(17, NULL, NULL, 20060603002000, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
 var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@@ -1312,4 +1310,82 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
 	done();
 }, function() {});
 
-wait();');
\ No newline at end of file
+wait();');
+
+INSERT INTO "scrapers" VALUES(18, NULL, NULL, 20060603002000, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/journals/[^/]+/[^/]+/[^/]+\.html$', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Strips every character that is not a digit from text
+function numbersOnly(text) {
+	return text.replace(/[^0-9]/g, "");
+}
+
+var uri = doc.location.href;
+
+// The article metadata is read from an HTML comment that starts with
+// "HeaderData"; the XML sits between that marker and "##EndHeaders". If no
+// such comment is found, headerData stays empty and no metadata is added.
+var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i;
+var headerData = "";
+var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver);
+for(var i in elmts) {
+	if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") {
+		var m = headerRegexp.exec(elmts[i].nodeValue);
+		if(m) {
+			headerData = m[1];
+		}
+	}
+}
+
+// Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to
+// expose DOM/XPath to sandboxed scripts
+var newDOM = new XML(headerData);
+
+// Adds the text of the child element named path as a literal value of rdfUri,
+// skipping elements that are missing or empty. E4X lists are objects and so
+// always truthy; the text is converted to a string before it is tested.
+// filter is an optional function applied to the text before it is stored.
+function mapDOM(path, rdfUri, filter) {
+	var text = newDOM[path].text().toString();
+	if(text) {
+		model.addStatement(uri, rdfUri, filter ? filter(text) : text, true);
+	}
+}
+
+mapDOM("journal", prefixDummy + "publication");
+// NOTE(review): issn is stored under the same property as journal; this looks
+// like a copy/paste slip, confirm the intended property
+mapDOM("issn", prefixDummy + "publication", numbersOnly);
+mapDOM("volume", prefixDummy + "volume");
+mapDOM("issue", prefixDummy + "issue");
+mapDOM("year", prefixDummy + "year");
+mapDOM("pubdate", prefixDC + "date");
+mapDOM("doctitle", prefixDC + "title");
+
+// Do pages (text is converted to strings so that missing elements are falsy)
+var fpage = newDOM.fpage.text().toString();
+var lpage = newDOM.lpage.text().toString();
+if(fpage) {
+	var pages = fpage;
+	if(lpage) {
+		pages += "-"+lpage;
+	}
+	model.addStatement(uri, prefixDummy + "pages", pages, true);
+}
+
+// Do authors
+var authors = newDOM.docauthor;
+for(var j in authors) {
+	var fname = authors[j].fname.text();
+	var surname = authors[j].surname.text();
+	model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true);
+}
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
\ No newline at end of file