-- 12

-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 12:17:00'));

-- Amazon.com scraper row.
-- Positional values: id, timestamp, label, author, URL pattern, NULL, JavaScript source.
-- (Column names are not visible in this file; TODO confirm against the scrapers table definition.)
-- The JavaScript is stored one statement per line on purpose: it contains // line comments,
-- which would otherwise comment out all the code that follows them inside the stored string.
-- The search-results branch tests m[1] (the captured path) rather than the whole exec() result,
-- so that gp/search pages get the searchresults XPath.
-- NOTE(review): the scraper source changed; if the timestamps in this file are what clients use
-- to detect updated scrapers, they need bumping when this ships. Confirm before release.
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL,
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';

var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
    if (prefix == ''x'') return namespace; else return null;
} : null;

function scrape(doc) {
    uri = doc.location.href;

    // Retrieve authors
    var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
    var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
    for (var i = 0; i < elmts.length; i++) {
        var elmt = elmts[i];

        model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
    }

    // Retrieve data from "Product Details" box
    var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
    var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
    for (var i = 0; i < elmts.length; i++) {
        var elmt = elmts[i];
        var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
        if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
            var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
            if(attribute == "Publisher:") {
                if(value.lastIndexOf("(") != -1) {
                    var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
                    var jsDate = new Date(date);
                    if(!isNaN(jsDate.valueOf())) {
                        date = utilities.dateToISO(jsDate);
                    }

                    value = value.substring(0, value.lastIndexOf("(")-1);
                }
                if(value.lastIndexOf(";") != -1) {
                    var edition = value.substring(value.lastIndexOf(";")+2, value.length);
                    value = value.substring(0, value.lastIndexOf(";"));
                }
                model.addStatement(uri, prefixDC + ''publisher'', value);
                // date and edition are only set when the publisher line carries them
                if(date) {
                    model.addStatement(uri, prefixDC + ''date'', date);
                }
                if(edition) {
                    model.addStatement(uri, prefixDC + ''hasVersion'', edition);
                }
            } else if(attribute == "Language:") {
                model.addStatement(uri, prefixDC + ''language'', value);
            } else if(attribute == "ISBN:") {
                model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
            } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
                model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
                model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
            }
        }
    }

    var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
    var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
    var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
    if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
        title = title.substring(0, title.lastIndexOf("(")-1);
    }
    model.addStatement(uri, prefixDC + ''title'', title);
    model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
}

var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
var m = searchRe.exec(doc.location.href);
if(m) {
    // Why can''t amazon use the same stylesheets
    var xpath;
    // exec() returns an array; the captured path segment is m[1]
    if(m[1] == "gp/search/") {
        xpath = ''//table[@class="searchresults"]'';
    } else {
        xpath = ''//table[@cellpadding="3"]'';
    }

    var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
    var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
    items = utilities.selectItems(items);

    if(!items) {
        return true;
    }

    var uris = new Array();
    for(var i in items) {
        uris.push(i);
    }

    utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
        function() { utilities.debugPrint("look, done"); done(); }, function() {});

    wait();
} else {
    scrape(doc);
}');

-- NOTE(review): the WorldCat row below continues on the following line and is left byte-for-byte
-- as found. Its JavaScript looks damaged (the for loop header after text.split is incomplete),
-- so it should be restored from version control rather than edited by hand.
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return true; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; var lineRegexp = /^([\w() ]+): *(.*)$/; var publisherRegexp = /^(.*), (.*?),?$/; var uri = doc.location.href; var sMatch = sessionRegexp.exec(uri); var sessionid = sMatch[1]; var nMatch = numberRegexp.exec(uri); if(nMatch) { var number = nMatch[1]; } else { number = 1; } var rMatch = resultsetRegexp.exec(uri); if(rMatch) { var resultset = rMatch[1]; } else { // It''s in an XPCNativeWrapper, so we have to do this black magic resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; } var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) { var lines = text.split(''\n''); for(var i=0;i") { haveStarted = true; 
} } // Loop through again so that we can add with the stableURL model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false); for(i in data) { if(data[i].length) { for(j in data[i]) { model.addStatement(stableURL, i, data[i][j]); } } } done(); }) }) }); wait();');
-- History Cooperative scraper row.
-- Positional values: id, timestamp, label, author, URL pattern, NULL, JavaScript source.
-- (Column names are not visible in this file; TODO confirm against the scrapers table definition.)
-- The stored JavaScript reads the meta tags of the page:
--   Title, Journal, Volume and Issue are added as the title, publication, volume and number statements;
--   Author is split on the text " and " and each part is added as a creator;
--   PublicationMonth and PublicationYear, when both exist, are joined with a space and added as the date.
-- It finally adds a type statement marking the item as a journal.
-- NOTE(review): the InnoPAC row that follows on this line continues onto the next line.
REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-18 11:02:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var month, year; var metaTags = doc.getElementsByTagName("meta"); function associateMeta(field, rdfUri) { var field = metaTags.namedItem(field); if(field) { model.addStatement(uri, rdfUri, field.getAttribute("content"), false); } } associateMeta("Title", prefixDC + "title"); associateMeta("Journal", prefixDummy + "publication"); associateMeta("Volume", prefixDummy + "volume"); associateMeta("Issue", prefixDummy + "number"); var author = metaTags.namedItem("Author"); if(author) { var authors = author.getAttribute("content").split(" and "); for(j in authors) { model.addStatement(uri, prefixDC + "creator", authors[j], false); } } var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); } model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); '); REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 10:11:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', '// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var 
matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); if(matchRegexp.test(doc.location.href)) { return true; } // Next, look for the MARC button var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var xpath = ''//a[img[@alt="MARC Display"]]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { return true; } // Also, check for links to an item display page var tags = doc.getElementsByTagName("a"); for(i=0; i]*>/gi); model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); var dateRegexp = /]*>(?:)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { var jsDate = new Date(m[1]+" "+m[2]); model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); model.addStatement(uri, prefixDC + "date", elementParts[1], true); } var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:"); if(cutIndex < 0) { cutIndex = citationDataDiv.innerHTML.indexOf("TEXT:"); } if(cutIndex > 0) { citationData = citationDataDiv.innerHTML.substring(0, cutIndex); } else { citationData = citationDataDiv.innerHTML; } citationData = utilities.cleanTags(citationData); var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; var m = headlineRegexp.exec(citationData); if(m) { model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true); } var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); if(m) { if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } model.addStatement(uri, prefixDC + "creator", m[1], true); model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaper", false); } else { 
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); } var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; var m = authorRegexp.exec(citationData); if(m) { var authors = m[1].split(/, (?:and )?/); for(i in authors) { model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); } }'); REPLACE INTO "scrapers" VALUES('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-21 09:55:00', 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace("&format=999", "&format=001"); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var record = new MARC_Record(); for(var i=0; i 3) { var ind1 = field.charAt(3); if(field.length > 4) { var ind2 = field.charAt(4); } } record.add_field(code, ind1, ind2, value); } } model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); utilities.importMARCRecord(record, uri, model); done(); }, function() {}); wait();'); REPLACE INTO "scrapers" VALUES('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-21 09:55:00', 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = 
''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri+''&fullmarc=true''; utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var record = new MARC_Record(); for(var i=0; i 0) { var body = doc.getElementsByTagName("body"); if(body[0].innerHTML.indexOf("ISBN") < 0) { return false; } } return true;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; var m = uriRegexp.exec(uri); if(uri.indexOf("authority_hits") < 0) { var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; } else { var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; } utilities.HTTPUtilities.doGet(newUri, null, function(text) { var record = new MARC_Record(); record.load(text, "binary"); utilities.importMARCRecord(record, uri, model); done(); }) wait();'); REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-18 11:19:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = 
''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var record = new MARC_Record(); var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver); var tag, ind1, ind2, content; for(var i=0; i 10) { ind1 = line.substring(4, 5); ind2 = line.substring(5, 6); content = line.substring(7); content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); } else { ind1 = ""; ind2 = ""; content = line.substring(4); } } utilities.importMARCRecord(record, uri, model); done(); }, function() {}); wait();'); REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-18 11:19:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); for(i in elmts) { if(elmts[i].nodeValue == "\n\nViewing record\n") { return true; } } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var uri = doc.location.href; var uriRegexp = /^(.*)(\/[0-9]+)$/; var m = uriRegexp.exec(uri); var newUri = m[1]+"/40"; var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); for(i in elmts) { var elmt = elmts[i]; var initialText = utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); if(initialText.nodeValue == "\n\nViewing record\n") { var recNumber = utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue; } } utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) { var texts = text.split("
");
	texts = texts[1].split("
"); text = texts[0]; var lines = text.split("\n"); var record = new MARC_Record(); var tag, ind1, ind2, content; for(var i=0; i 10) { ind1 = line.substr(6, 1); ind2 = line.substr(7, 1); content = line.substr(8); } else { ind1 = ""; ind2 = ""; content = line.substring(6); } } utilities.importMARCRecord(record, uri, model); done(); }) wait();'); REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-18 11:19:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var uri = doc.location.href; var newUri = uri.replace("LabelDisplay", "MARCDisplay"); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var record = new MARC_Record(); var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); var tag, ind1, ind2, content; for(var i=0; i]*>/, "").replace(/<\?xml[^>]*\?>/, ""); var xml = new XML(text); for(var i=0; i