From 974228338981254f9a3db8c43c8d144701a8c43e Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Fri, 23 Jun 2006 14:12:34 +0000 Subject: [PATCH] InnoPAC scraper now handles search results pages --- scrapers.sql | 129 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 20 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index f28cb2ee15..ca67b1edec 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,7 +1,7 @@ --- 11 +-- 12 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 10:11:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -74,7 +74,7 @@ function scrape(doc) { var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)''); var m = searchRe.exec(doc.location.href) if(m) { - // Why can''t amazon use standard stylesheets + // Why can''t amazon use the same stylesheets var xpath; if(m == "gp/search/") { xpath = ''//table[@class="searchresults"]''; @@ -511,7 +511,7 @@ if(month && year) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); '); -REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-18 16:55:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', +REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 10:11:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', '// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); if(matchRegexp.test(doc.location.href)) { @@ -528,6 +528,13 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { return true; } +// Also, check for links to an item display page +var tags = doc.getElementsByTagName("a"); +for(i=0; i