From 4535b220db89dfbb9616bfd11381c86cb5ba6b42 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 26 Jun 2006 18:05:23 +0000 Subject: [PATCH] Closes #84, make type icon in toolbar match item about to be scraped. It's not perfect, since to get everything right, we'd need to scrape the page as soon as it appears, but it provides a pretty good indication. Multiple items get the folder icon. If there's a better icon out there, it's pretty straightforward to implement. --- .../content/scholar/ingester/browser.js | 8 +- .../content/scholar/xpcom/ingester.js | 13 +- .../content/scholar/xpcom/utilities.js | 7 + scrapers.sql | 153 +++++++++++++----- 4 files changed, 134 insertions(+), 47 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 0475a11fcc..88a8ff4688 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -73,7 +73,13 @@ Scholar_Ingester_Interface.scrapeThisPage = function() { Scholar_Ingester_Interface.updateStatus = function() { var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); if(documentObject && documentObject.scraper) { - //Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+TYPE+".png"; + if(documentObject.type == "multiple") { + // Use folder icon for multiple types, for now + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png"; + } else { + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+documentObject.type+".png"; + } + Scholar.debug("status image is "+Scholar_Ingester_Interface.statusImage.src); Scholar_Ingester_Interface.statusImage.hidden = false; } else { Scholar_Ingester_Interface.statusImage.hidden = true; diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index ebd2ec6c93..3090f2b881 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -176,6 +176,10 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} * model - data model for semantic scrapers * scraper - best scraper to use to scrape page * items - items returned after page is scraped + * window - window, for creating new hidden browsers + * url - url, as passed through proxy system + * type - type of item that will be scraped (set after retrieveScraper() is + * called) * * Private properties: * _sandbox - sandbox for code execution @@ -192,7 +196,7 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} * Constructor for Document object */ Scholar.Ingester.Document = function(browserWindow, myWindow){ - this.scraper = null; + this.scraper = this.type = null; this.browser = browserWindow; this.window = myWindow; this.model = new Scholar.Ingester.Model(); @@ -258,11 +262,10 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) { } // scraperDetectCode returns an associative array (object) in the case of a search result - if(typeof(canScrape) == "object") { - Scholar.debug("scraperDetectCode returned a URL list"); - this.scrapeURLList = canScrape; + if(canScrape.toString() != "") { + this.type = canScrape; } else { - Scholar.debug("canScrape was a "+typeof(canScrape)); + this.type = "website"; } } return canScrape; diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index 547dc5d967..c5b0b7b061 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -348,6 +348,13 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, return availableItems; } +/* + * Handles OAI-PMH requests + */ +Scholar.Utilities.Ingester.prototype.importOAIPMH = function(uri, model) { + +} + // These functions are for use by importMARCRecord. They're private, because, // while they are useful, it's also nice if as many of our scrapers as possible // are PiggyBank compatible, and if our scrapers used functions, that would diff --git a/scrapers.sql b/scrapers.sql index 8ae009a4f2..ff1e80ade0 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,9 +1,16 @@ --- 25 +-- 26 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 21:15:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-26 16:01:00')); -REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-26 16:01:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', +'if(doc.title.indexOf("search") >= 0) { + return "multiple"; +} else { + return "book"; +} +', +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -103,9 +110,11 @@ if(m) { scrape(doc); }'); -REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-25 12:11:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', -'if(doc.title == ''FirstSearch: WorldCat Detailed Record'' || doc.title == ''FirstSearch: WorldCat List of Records'') { - return true; +REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { + return "book"; +} else if(doc.title == ''FirstSearch: WorldCat List of Records'') { + return "multiple"; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; @@ -261,7 +270,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exportt }) wait();'); -REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-22 16:51:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', +REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 16:01:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', 'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; for(i in export_options) { if(export_options[i].text == ''Latin1 MARC'' @@ -270,7 +279,11 @@ for(i in export_options) { || export_options[i].text == ''MARC (Unicode/UTF-8)'' || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { // We have an exportable single record - return true; + if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { + return "book"; + } else { + return "multiple"; + } } } return false;', @@ -384,7 +397,7 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) { }) wait();'); -REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-25 14:16:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', +REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -392,7 +405,7 @@ var nsResolver = namespace ? function(prefix) { // See if this is a seach results page if(doc.title == "JSTOR: Search Results") { - return true; + return "multiple"; } // If this is a view page, find the link to the citation @@ -403,7 +416,7 @@ if(!elmts.length) { var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); } if(elmts && elmts.length) { - return true; + return "journalArticle"; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -602,7 +615,12 @@ utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=remov wait();'); -REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-25 14:33:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', NULL, +REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +'if(doc.title == "History Cooperative: Search Results") { + return "multiple"; +} else { + return "journalArticle"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -662,11 +680,11 @@ if(doc.title == "History Cooperative: Search Results") { scrape(doc); }'); -REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 12:49:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', +REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-26 16:01:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', '// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); if(matchRegexp.test(doc.location.href)) { - return true; + return "book"; } // Next, look for the MARC button var namespace = doc.documentElement.namespaceURI; @@ -677,13 +695,13 @@ var nsResolver = namespace ? function(prefix) { var xpath = ''//a[img[@alt="MARC Display"]]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { - return true; + return "book"; } // Also, check for links to an item display page var tags = doc.getElementsByTagName("a"); for(i=0; i 0) { + return "multiple"; +} else { + return "book"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -1667,7 +1709,12 @@ for(i in uris) { wait();'); -REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-18 11:19:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', NULL, +REPLACE INTO "scrapers" VALUES('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +'if(doc.location.href.indexOf("/GeacQUERY") > 0) { + return "multiple"; +} else { + return "book"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -1746,7 +1793,7 @@ utilities.processDocuments(browser, null, uris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-24 11:22:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "scrapers" VALUES('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1755,13 +1802,13 @@ var nsResolver = namespace ? function(prefix) { var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); for(i in elmts) { if(utilities.superCleanString(elmts[i].nodeValue) == "Viewing record") { - return true; + return "book"; } } var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { - return true; + return "multiple"; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; @@ -1878,7 +1925,13 @@ utilities.HTTPUtilities.doGet(newUri+''?marks=''+recNumbers.join(",")+''&shadow= wait();'); -REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-18 11:19:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', NULL, +REPLACE INTO "scrapers" VALUES('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', +'var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); +if(detailRe.test(doc.location.href)) { + return "book"; +} else { + return "multiple"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; @@ -1966,7 +2019,13 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-25 17:11:00', 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +'var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); +if(searchRe.test(doc.location.href)) { + return "multiple"; +} else { + return "journalArticle"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -2137,7 +2196,12 @@ if(searchRe.test(doc.location.href)) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); }'); -REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-25 00:56:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', +'if(doc.location.href.indexOf("list_uids=") >= 0) { + return "journalArticle"; +} else { + return "multiple"; +}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; @@ -2261,7 +2325,8 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { wait();'); -REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-20 10:52:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', NULL, +REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:01:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', +'return "website";', 'var metaTags = doc.getElementsByTagName("meta"); if(metaTags) { @@ -2295,7 +2360,13 @@ for(var i=0; i