Scrapable search results for SIRSI 2003+ scraper
This commit is contained in:
parent
9742283389
commit
83c36f330d
1 changed files with 119 additions and 68 deletions
59
scrapers.sql
59
scrapers.sql
|
@ -1,7 +1,7 @@
|
||||||
-- 12
|
-- 12
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 10:11:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 12:17:00'));
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
|
@ -661,7 +661,7 @@ if(newUri) {
|
||||||
|
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-12 09:58:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-23 12:17:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
|
||||||
'var namespace = doc.documentElement.namespaceURI;
|
'var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
@ -672,6 +672,12 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
if(elmts.length) {
|
if(elmts.length) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
var xpath = ''//td[@class="searchsum"]/table'';
|
||||||
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
if(elmts.length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
return false;',
|
return false;',
|
||||||
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
|
@ -683,11 +689,16 @@ var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
var uri = doc.location.href;
|
|
||||||
var data = new Object();
|
var data = new Object();
|
||||||
|
|
||||||
|
function scrape(doc) {
|
||||||
|
var uri = doc.location.href;
|
||||||
|
|
||||||
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
if(!elmts.length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
for (var i = 0; i < elmts.length; i++) {
|
for (var i = 0; i < elmts.length; i++) {
|
||||||
var elmt = elmts[i];
|
var elmt = elmts[i];
|
||||||
try {
|
try {
|
||||||
|
@ -750,10 +761,50 @@ for (var i = 0; i < elmts.length; i++) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {}
|
} catch (e) {}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!scrape(doc)) {
|
||||||
|
var checkboxes = new Array();
|
||||||
|
var urls = new Array();
|
||||||
|
var availableItems = new Array();
|
||||||
|
|
||||||
|
var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver);
|
||||||
|
// Go through table rows
|
||||||
|
for(var i=1; i<tableRows.length; i++) {
|
||||||
|
var input = utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver);
|
||||||
|
checkboxes[i] = input.name;
|
||||||
|
var text = utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver);
|
||||||
|
if(text) {
|
||||||
|
availableItems[i] = text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var items = utilities.selectItems(availableItems);
|
||||||
|
|
||||||
|
if(!items) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var hostRe = new RegExp("^http://[^/]+");
|
||||||
|
var m = hostRe.exec(doc.location.href);
|
||||||
|
var hitlist = doc.forms.namedItem("hitlist");
|
||||||
|
var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value;
|
||||||
|
utilities.debugPrint(baseUrl);
|
||||||
|
|
||||||
|
var uris = new Array();
|
||||||
|
for(i in items) {
|
||||||
|
uris.push(baseUrl+"&"+checkboxes[i]+"=Details");
|
||||||
|
}
|
||||||
|
|
||||||
|
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
||||||
|
function() { done() }, function() {});
|
||||||
|
|
||||||
|
wait();
|
||||||
|
}
|
||||||
');
|
');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
|
REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue