Add SIRSI (old) scraper

This commit is contained in:
Simon Kornblith 2006-06-07 17:44:55 +00:00
parent 07dad8fae9
commit 1e48189c3b

View file

@ -1,4 +1,4 @@
-- 2
-- 3
DELETE FROM scrapers;
INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@ -459,7 +459,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
wait();');
INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
@ -1098,7 +1098,9 @@ var uri = doc.location.href;
var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html");
newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html");
utilities.debugPrint(newUri);
var getNode = function(doc, contextNode, xpath, nsResolver) {
return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
}
utilities.loadDocument(newUri, browser, function(newBrowser) {
newDoc = newBrowser.contentDocument;
@ -1108,11 +1110,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var getNode = function(doc, contextNode, xpath, nsResolver) {
return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
}
var record = new MARC_Record();
var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver);
@ -1123,6 +1120,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
if(line.substring(0, 6) == " ") {
content += " "+line.substring(6);
continue;
} else {
if(tag) {
record.add_field(tag, ind1, ind2, content);
@ -1145,7 +1143,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
content = line.substring(4);
}
utilities.debugPrint(''tag:''+tag+'' ind1:''+ind1+'' ind2:''+ind2+'' content:''+content);
}
model = utilities.importMARCRecord(record, uri, model);
@ -1153,3 +1150,87 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}, function() {});
wait();');
INSERT INTO "scrapers" VALUES(16, NULL, NULL, 20060603002000, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver);
for(i in elmts) {
utilities.debugPrint(elmts[i].nodeValue);
if(elmts[i].nodeValue == "\n\nViewing record\n") {
return true;
}
}
return false;',
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var getNode = function(doc, contextNode, xpath, nsResolver) {
return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
}
var uri = doc.location.href;
var uriRegexp = /^(.*)(\/[0-9]+)$/;
var m = uriRegexp.exec(uri);
var newUri = m[1]+"/40";
utilities.debugPrint(newUri);
var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
for(i in elmts) {
var elmt = elmts[i];
var initialText = getNode(doc, elmt, ''./text()[1]'', nsResolver);
if(initialText.nodeValue == "\n\nViewing record\n") {
var recNumber = getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue;
}
}
utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) {
var texts = text.split("<PRE>");
texts = texts[1].split("</PRE>");
text = texts[0];
var lines = text.split("\n");
var record = new MARC_Record();
var tag, ind1, ind2, content;
for(var i=0; i<lines.length; i++) {
var line = lines[i];
if(line.substr(0, 1) == "." && line.substr(4,2) == ". ") {
if(tag) {
content = content.replace(/\|([a-z])/g, record.subfield_delimiter+"$1");
record.add_field(tag, ind1, ind2, content);
}
} else {
content += " "+line.substring(6);
continue;
}
tag = line.substr(1, 3);
if(parseInt(tag) > 10) {
ind1 = line.substr(6, 1);
ind2 = line.substr(7, 1);
content = line.substr(8);
} else {
ind1 = "";
ind2 = "";
content = line.substring(6);
}
}
model = utilities.importMARCRecord(record, uri, model);
done();
})
wait();');