From a457cdb493334fb75a804ed962ea19f8b9ad4e8c Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Sat, 26 Aug 2006 07:27:02 +0000 Subject: [PATCH] added New York Times translator --- scrapers.sql | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 160 insertions(+), 6 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 759f73e5c7..8ba5aaeebf 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -703,17 +703,16 @@ function scrape(doc) { } } - newItem.attachments.push({document:doc, title:"History Cooperative Full Text", - downloadable:true}); - - newItem.complete(); - - // don''t actually need date info for a journal article var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { newItem.date = month.getAttribute("content")+" "+year.getAttribute("content"); } + + newItem.attachments.push({document:doc, title:"History Cooperative Full Text", + downloadable:true}); + + newItem.complete(); } function doWeb(doc, url) { @@ -3344,6 +3343,161 @@ function doWeb(doc, url) { Scholar.wait(); }'); +REPLACE INTO "translators" VALUES ('ce7a3727-d184-407f-ac12-52837f3361ff', '2006-08-26 14:21:00', 4, 'New York Times', 'Simon Kornblith', '^(?:http://query.nytimes.com/search/query|http://www.nytimes.com/.+)', +'function getList(urls, each, done) { + var url = urls.shift(); + Scholar.Utilities.HTTP.doGet(url, function(text) { + if(each) { + each(text, url); + } + + if(urls.length) { + getList(urls, each, done); + } else if(done) { + done(text); + } + }); +} + +function detectWeb(doc, url) { + if(doc.title.substr(0, 30) == "The New York Times: Search for") { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var result = doc.evaluate(''//div[@id="srchContent"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + if(result) { + return "multiple"; + } + } else { + var metaTags = doc.getElementsByTagName("meta"); + if(metaTags.namedItem("hdl") && metaTags.namedItem("byl")) { + return "newspaperArticle"; + } + } +}', +'function associateMeta(newItem, metaTags, field, scholarField) { + if(metaTags[field]) { + newItem[scholarField] = metaTags[field]; + } +} + +function scrape(doc, url) { + var newItem = new Scholar.Item("newspaperArticle"); + newItem.publicationTitle = "The New York Times"; + newItem.ISSN = "0362-4331"; + + var metaTags = new Object(); + if(url != undefined) { + newItem.url = url; + var metaTagRe = /]*>/gi; + var nameRe = /name="([^"]+)"/i; + var contentRe = /content="([^"]+)"/i; + var m = doc.match(metaTagRe); + + if(!m) { + return; + } + + for(var i=0; i