From 107632d97048314fb495dcd339bdf393acec95c7 Mon Sep 17 00:00:00 2001 From: Avram Lyon Date: Fri, 6 May 2011 17:04:42 +0000 Subject: [PATCH] Trans: Updated Globe translator, by Frank Bennett. --- translators/The Boston Globe.js | 282 ++++++++++++++++++++------------ 1 file changed, 180 insertions(+), 102 deletions(-) diff --git a/translators/The Boston Globe.js b/translators/The Boston Globe.js index 5f225abb24..1d39d910d5 100644 --- a/translators/The Boston Globe.js +++ b/translators/The Boston Globe.js @@ -1,110 +1,189 @@ { - "translatorID":"1f245496-4c1b-406a-8641-d286b3888231", - "translatorType":4, - "label":"The Boston Globe", - "creator":"Adam Crymble", - "target":"http://(www|search).boston.com/", - "minVersion":"1.0.0b4.r5", - "maxVersion":"", - "priority":100, - "inRepository":true, - "lastUpdated":"2008-06-06 08:45:00" + "translatorID": "1f245496-4c1b-406a-8641-d286b3888231", + "label": "The Boston Globe", + "creator": "Adam Crymble and Frank Bennett", + "target": "^http://(www|search|articles)\\.boston\\.com/", + "minVersion": "1.0.0b4.r5", + "maxVersion": "", + "priority": 100, + "inRepository": false, + "translatorType": 4, + "lastUpdated": "2011-05-06 20:57:16" } +/* + * Sample URLs + * + * [Original request -- uncommon page format, no embedded metadata of any kind] + * http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant + * + * [More common page formats, marginally reliable metadata in a comment block] + * http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html + * http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/ + * http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/ + + * Support for search results will require rewriting scrape(..) to use only regular expressions + */ + function detectWeb(doc, url) { - if (url.match("search.boston.com")) { - return "multiple"; - } else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - return "newspaperArticle"; - } else if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - return "blogPost"; - } -} - -//Boston Globe and Boston.com Translator. Code by Adam Crymble - -function scrape (doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { }: null; - - //sets variables that remain constant in both formats - - if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var xPathDateResults = doc.evaluate ('//span[@id="dateline"]', doc, nsResolver, XPathResult.ANY_TYPE, null); + + if (url.match("search.boston.com")) { + // Search disabled until cross-domain can be dealt with + return false; + var results = doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null); + if (results.iterateNext()) { + return "multiple"; + } else { + return false; } - - if (doc.evaluate('//span[@id="byline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var xPathAuthorResults= doc.evaluate ('//span[@id="byline"]', doc, nsResolver, XPathResult.ANY_TYPE, null); - } - - - //sets variables unique to the blog posts on Boston.com - - if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - - var newItem =new Zotero.Item("blogPost"); - newItem.publicationTitle = "Boston.com"; - - //title - var xPathTitle = '//div[@id="blogEntry"]/h1/a'; - - //date - var articleDate = xPathDateResults.iterateNext().textContent; - newItem.date = articleDate; - - //author - var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/Posted by /i, ''); - articleAuthor = articleAuthor.split(','); - var authorName = articleAuthor[0].split("and "); - - //else it sets the variables unique to the articles on the Boston Globe - - } else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - - var newItem = new Zotero.Item("newspaperArticle"); - newItem.publicationTitle = "The Boston Globe"; - - //title - var xPathTitle = '//div[@id="headTools"]/h1'; - - //date - if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var articleDate = xPathDateResults.iterateNext().textContent; - if (articleDate.match('/')) { - articleDate = articleDate.split('/'); - newItem.date = articleDate[1]; + } else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) { + return "newspaperArticle"; + } +} + +//Boston Globe and Boston.com Translator. Original code by Adam Crymble +// Rewritten by Frank Bennett, 2011 + +function sniffComment (elem) { + if (!elem) { + return elem; + } + for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) { + if (elem.childNodes[i].nodeName === "#comment") { + return elem.childNodes[i].nodeValue; + } + } + return false; +} + +function findMagicComment (doc) { + var hideMeElems = doc.getElementsByClassName("hideMe"); + for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) { + var elem = hideMeElems.item(i); + var sniff = sniffComment(elem); + if (sniff) { + return sniff; + } + } + var contentElem = doc.getElementById("content"); + return sniffComment(contentElem); +} + +function findAuthorString (doc, newItem) { + var authors = ""; + var bylineElem = false; + var bylineElems = doc.getElementsByClassName("byline"); + if (bylineElems.length) { + bylineElem = bylineElems.item(0); + } + if (!bylineElem) { + var bylineElem = doc.getElementById('byline'); + } + if (bylineElem) { + authors = bylineElem.textContent; + authors = authors.replace("\n", " ", "g"); + if (authors.match(/[Pp]osted\s+by\s+/)) { + newItem.itemType = "blogPost"; + } + authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1"); + } + return authors; +} + +function scrape (doc, url) { + // The site content is pretty chaotic, we do our best. + + // There are two independent blocks set-and-save blocks + // below. + + // Many pages seem to have metadata embedded in a comment + // The date and headline info look reliable, but + // the byline is a disaster, to be used only + // if absolutely necessary. + var magicComment = findMagicComment(doc); + if (magicComment) { + // Blind acceptance + var newItem =new Zotero.Item("newspaperArticle"); + newItem.publicationTitle = "Boston.com"; + // URL + newItem.url = doc.location.href; + // Attachment + newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"}); + // Now try to get some citation details (go ahead, try) + var info = magicComment.replace('\n','','g'); + newItem.title = Zotero.Utilities.unescapeHTML(info.replace(/.*(.*)<\/headline>.*/,"$1")); + newItem.date = info.replace(/.*(.*)<\/date>.*/,"$1"); + var authors = findAuthorString(doc, newItem); + if (!authors) { + var authors = info.replace(/.*(.*)<\/byline>.*/,"$1"); + if (authors.toLowerCase() === authors) { + authors = info.replace(/.*(.*)<\/teasetext>.*/, "$1"); + var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/); + if (m) { + authors = m[1]; } else { - newItem.date = articleDate; + authors = ""; } - - } - - //author(s) - var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/^\s*|\s*$/g, ''); - articleAuthor= articleAuthor.substr(3); - var authorName = articleAuthor.split("and "); - - - //byline - if (doc.evaluate('//div[@id="headTools"]/h2', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - newItem.abstractNote = doc.evaluate ('//div[@id="headTools"]/h2', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; } } - - //creates title using xPaths defined above - var xPathTitleResults = doc.evaluate (xPathTitle, doc, nsResolver, XPathResult.ANY_TYPE, null); - newItem.title = xPathTitleResults.iterateNext().textContent; - - //pushes author(s) - - for (var i=0; i