Trans: Updated Globe translator, by Frank Bennett.
This commit is contained in:
parent
fd209f98dc
commit
107632d970
1 changed files with 180 additions and 102 deletions
|
@ -1,110 +1,189 @@
|
|||
{
|
||||
"translatorID":"1f245496-4c1b-406a-8641-d286b3888231",
|
||||
"translatorType":4,
|
||||
"label":"The Boston Globe",
|
||||
"creator":"Adam Crymble",
|
||||
"target":"http://(www|search).boston.com/",
|
||||
"minVersion":"1.0.0b4.r5",
|
||||
"maxVersion":"",
|
||||
"priority":100,
|
||||
"inRepository":true,
|
||||
"lastUpdated":"2008-06-06 08:45:00"
|
||||
"translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
|
||||
"label": "The Boston Globe",
|
||||
"creator": "Adam Crymble and Frank Bennett",
|
||||
"target": "^http://(www|search|articles)\\.boston\\.com/",
|
||||
"minVersion": "1.0.0b4.r5",
|
||||
"maxVersion": "",
|
||||
"priority": 100,
|
||||
"inRepository": false,
|
||||
"translatorType": 4,
|
||||
"lastUpdated": "2011-05-06 20:57:16"
|
||||
}
|
||||
|
||||
/*
|
||||
* Sample URLs
|
||||
*
|
||||
* [Original request -- uncommon page format, no embedded metadata of any kind]
|
||||
* http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
|
||||
*
|
||||
* [More common page formats, marginally reliable metadata in a comment block]
|
||||
* http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
|
||||
* http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
|
||||
* http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
|
||||
|
||||
* Support for search results will require rewriting scrape(..) to use only regular expressions
|
||||
*/
|
||||
|
||||
function detectWeb(doc, url) {
|
||||
if (url.match("search.boston.com")) {
|
||||
return "multiple";
|
||||
} else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "newspaperArticle";
|
||||
} else if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "blogPost";
|
||||
}
|
||||
}
|
||||
|
||||
//Boston Globe and Boston.com Translator. Code by Adam Crymble
|
||||
|
||||
function scrape (doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
}: null;
|
||||
|
||||
//sets variables that remain constant in both formats
|
||||
|
||||
if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var xPathDateResults = doc.evaluate ('//span[@id="dateline"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
|
||||
if (url.match("search.boston.com")) {
|
||||
// Search disabled until cross-domain can be dealt with
|
||||
return false;
|
||||
var results = doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
if (results.iterateNext()) {
|
||||
return "multiple";
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (doc.evaluate('//span[@id="byline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var xPathAuthorResults= doc.evaluate ('//span[@id="byline"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
}
|
||||
|
||||
|
||||
//sets variables unique to the blog posts on Boston.com
|
||||
|
||||
if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
|
||||
var newItem =new Zotero.Item("blogPost");
|
||||
newItem.publicationTitle = "Boston.com";
|
||||
|
||||
//title
|
||||
var xPathTitle = '//div[@id="blogEntry"]/h1/a';
|
||||
|
||||
//date
|
||||
var articleDate = xPathDateResults.iterateNext().textContent;
|
||||
newItem.date = articleDate;
|
||||
|
||||
//author
|
||||
var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/Posted by /i, '');
|
||||
articleAuthor = articleAuthor.split(',');
|
||||
var authorName = articleAuthor[0].split("and ");
|
||||
|
||||
//else it sets the variables unique to the articles on the Boston Globe
|
||||
|
||||
} else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
|
||||
var newItem = new Zotero.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "The Boston Globe";
|
||||
|
||||
//title
|
||||
var xPathTitle = '//div[@id="headTools"]/h1';
|
||||
|
||||
//date
|
||||
if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var articleDate = xPathDateResults.iterateNext().textContent;
|
||||
if (articleDate.match('/')) {
|
||||
articleDate = articleDate.split('/');
|
||||
newItem.date = articleDate[1];
|
||||
} else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) {
|
||||
return "newspaperArticle";
|
||||
}
|
||||
}
|
||||
|
||||
//Boston Globe and Boston.com Translator. Original code by Adam Crymble
|
||||
// Rewritten by Frank Bennett, 2011
|
||||
|
||||
function sniffComment (elem) {
|
||||
if (!elem) {
|
||||
return elem;
|
||||
}
|
||||
for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) {
|
||||
if (elem.childNodes[i].nodeName === "#comment") {
|
||||
return elem.childNodes[i].nodeValue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
function findMagicComment (doc) {
|
||||
var hideMeElems = doc.getElementsByClassName("hideMe");
|
||||
for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) {
|
||||
var elem = hideMeElems.item(i);
|
||||
var sniff = sniffComment(elem);
|
||||
if (sniff) {
|
||||
return sniff;
|
||||
}
|
||||
}
|
||||
var contentElem = doc.getElementById("content");
|
||||
return sniffComment(contentElem);
|
||||
}
|
||||
|
||||
function findAuthorString (doc, newItem) {
|
||||
var authors = "";
|
||||
var bylineElem = false;
|
||||
var bylineElems = doc.getElementsByClassName("byline");
|
||||
if (bylineElems.length) {
|
||||
bylineElem = bylineElems.item(0);
|
||||
}
|
||||
if (!bylineElem) {
|
||||
var bylineElem = doc.getElementById('byline');
|
||||
}
|
||||
if (bylineElem) {
|
||||
authors = bylineElem.textContent;
|
||||
authors = authors.replace("\n", " ", "g");
|
||||
if (authors.match(/[Pp]osted\s+by\s+/)) {
|
||||
newItem.itemType = "blogPost";
|
||||
}
|
||||
authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
|
||||
}
|
||||
return authors;
|
||||
}
|
||||
|
||||
function scrape (doc, url) {
|
||||
// The site content is pretty chaotic, we do our best.
|
||||
|
||||
// There are two independent blocks set-and-save blocks
|
||||
// below.
|
||||
|
||||
// Many pages seem to have metadata embedded in a comment
|
||||
// The date and headline info look reliable, but
|
||||
// the byline is a disaster, to be used only
|
||||
// if absolutely necessary.
|
||||
var magicComment = findMagicComment(doc);
|
||||
if (magicComment) {
|
||||
// Blind acceptance
|
||||
var newItem =new Zotero.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "Boston.com";
|
||||
// URL
|
||||
newItem.url = doc.location.href;
|
||||
// Attachment
|
||||
newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
|
||||
// Now try to get some citation details (go ahead, try)
|
||||
var info = magicComment.replace('\n','','g');
|
||||
newItem.title = Zotero.Utilities.unescapeHTML(info.replace(/.*<headline>(.*)<\/headline>.*/,"$1"));
|
||||
newItem.date = info.replace(/.*<date>(.*)<\/date>.*/,"$1");
|
||||
var authors = findAuthorString(doc, newItem);
|
||||
if (!authors) {
|
||||
var authors = info.replace(/.*<byline>(.*)<\/byline>.*/,"$1");
|
||||
if (authors.toLowerCase() === authors) {
|
||||
authors = info.replace(/.*<teasetext>(.*)<\/teasetext>.*/, "$1");
|
||||
var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
|
||||
if (m) {
|
||||
authors = m[1];
|
||||
} else {
|
||||
newItem.date = articleDate;
|
||||
authors = "";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//author(s)
|
||||
var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/^\s*|\s*$/g, '');
|
||||
articleAuthor= articleAuthor.substr(3);
|
||||
var authorName = articleAuthor.split("and ");
|
||||
|
||||
|
||||
//byline
|
||||
if (doc.evaluate('//div[@id="headTools"]/h2', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
newItem.abstractNote = doc.evaluate ('//div[@id="headTools"]/h2', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||
}
|
||||
}
|
||||
|
||||
//creates title using xPaths defined above
|
||||
var xPathTitleResults = doc.evaluate (xPathTitle, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
newItem.title = xPathTitleResults.iterateNext().textContent;
|
||||
|
||||
//pushes author(s)
|
||||
|
||||
for (var i=0; i<authorName.length; i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorName[i], "author"));
|
||||
}
|
||||
|
||||
newItem.url = doc.location.href;
|
||||
|
||||
authors = authors.split(/,*\s+and\s+/);
|
||||
authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
|
||||
authors = authors.join(", ");
|
||||
authors = authors.split(/,\s+/);
|
||||
for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
|
||||
var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
|
||||
if (author.lastName) {
|
||||
newItem.creators.push(author);
|
||||
}
|
||||
}
|
||||
newItem.complete();
|
||||
}
|
||||
|
||||
|
||||
// Information block
|
||||
var infoElem = doc.getElementById("mod-article-byline");
|
||||
if (infoElem) {
|
||||
var newItem = new Zotero.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "Boston.com";
|
||||
// URL
|
||||
newItem.url = doc.location.href;
|
||||
newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
|
||||
|
||||
// Date
|
||||
var dateElem = infoElem.getElementsByClassName('pubdate');
|
||||
if (dateElem.length) {
|
||||
newItem.date = dateElem.textContent;
|
||||
}
|
||||
|
||||
// Authors
|
||||
for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
|
||||
var node = infoElem.childNodes.item(i);
|
||||
if (node.nodeName === 'SPAN') {
|
||||
if ('By' === node.textContent.slice(0,2)) {
|
||||
|
||||
var authors = node.textContent.slice(3);
|
||||
authors = authors.split(/(?:, |,*\s+and\s+)/);
|
||||
for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
|
||||
var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
|
||||
newItem.creators.push(author);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Title
|
||||
var headerElem = doc.getElementById('mod-article-header');
|
||||
if (headerElem) {
|
||||
var h = headerElem.getElementsByTagName('h1');
|
||||
if (h.length) {
|
||||
newItem.title = h[0].textContent;
|
||||
}
|
||||
}
|
||||
newItem.complete();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -116,10 +195,9 @@ function doWeb (doc, url) {
|
|||
var uris= new Array();
|
||||
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
var items = new Object();
|
||||
var items = {};
|
||||
var result = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var elmt = result.iterateNext();
|
||||
Zotero.debug(elmt);
|
||||
while (elmt) {
|
||||
//items.push(elmt.href);
|
||||
items[elmt.href] = elmt.textContent;
|
||||
|
@ -135,9 +213,9 @@ function doWeb (doc, url) {
|
|||
for (var i in items) {
|
||||
uris.push(i);
|
||||
}
|
||||
} else
|
||||
uris.push(url);
|
||||
Zotero.debug(uris);
|
||||
Zotero.Utilities.processDocuments(uris, scrape, function() {Zotero.done();});
|
||||
Zotero.wait();
|
||||
}
|
||||
Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
|
||||
Zotero.wait();
|
||||
} else {
|
||||
scrape(doc, url);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue