Sopheak's new NZ Herald translator
This commit is contained in:
parent
521ab94e35
commit
9aa227db6b
1 changed files with 134 additions and 90 deletions
|
@ -1,110 +1,154 @@
|
||||||
{
|
{
|
||||||
"translatorID":"c7830593-807e-48cb-99f2-c3bed2b148c2",
|
"translatorID" : "c7830593-807e-48cb-99f2-c3bed2b148c2",
|
||||||
|
"label" : "New Zealand Herald",
|
||||||
|
"creator" : "Sopheak Hean (University of Waikato, Faculty of Education, New Zealand)",
|
||||||
|
"target" : "^http://www\\.nzherald\\.co\\.nz",
|
||||||
|
"minVersion" : "1.0",
|
||||||
|
"maxVersion" : "",
|
||||||
|
"priority" : 100,
|
||||||
|
"inRepository" : "1",
|
||||||
"translatorType":4,
|
"translatorType":4,
|
||||||
"label":"New Zealand Herald",
|
"lastUpdated":"2010-08-03 10:49:18"
|
||||||
"creator":"Michael Berkowitz",
|
|
||||||
"target":"^http://(www|search).nzherald.co.nz/",
|
|
||||||
"minVersion":"1.0.0b4.r5",
|
|
||||||
"maxVersion":"",
|
|
||||||
"priority":100,
|
|
||||||
"inRepository":true,
|
|
||||||
"lastUpdated":"2007-08-14 22:15:00"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function detectWeb(doc, url) {
|
function detectWeb(doc, url) {
|
||||||
if (doc.title.indexOf("Search Results") != -1) {
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
if (prefix == "x" ) return namespace; else return null;
|
||||||
|
} : null;
|
||||||
|
|
||||||
|
/* If the address bar has /news in it then its a newspaper article*/
|
||||||
|
|
||||||
|
if (doc.location.href.indexOf("/search/results.cfm") !=-1){
|
||||||
return "multiple";
|
return "multiple";
|
||||||
} else if (doc.location.href.indexOf("story.cfm") != -1) {
|
} else if (doc.location.href.indexOf("/news/article.cfm") !=-1){
|
||||||
return "newspaperArticle";
|
return "newspaperArticle";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrape(url) {
|
function associateData (newItem, items, field, zoteroField) {
|
||||||
Zotero.Utilities.HTTP.doGet(url, function(text) {
|
if (items[field]){
|
||||||
var newItem = new Zotero.Item("newspaperArticle");
|
newItem[zoteroField] = items[field];
|
||||||
newItem.url = url;
|
}
|
||||||
newItem.publicationTitle = "New Zealand Herald";
|
}
|
||||||
|
|
||||||
//author?
|
function scrape(doc, url){
|
||||||
var aut = /<a href=\"\/author\/[^>]*>(.*)<\/a>/;
|
var authorTemp;
|
||||||
if (text.match(aut)) {
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var author = text.match(aut)[1];
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
if (prefix == 'x') return namespace; else return null;
|
||||||
|
} : null;
|
||||||
|
|
||||||
|
var articleLanguage = "English";
|
||||||
|
|
||||||
|
var newItem = new Zotero.Item('newspaperArticle');
|
||||||
|
newItem.url = doc.location.href;
|
||||||
|
|
||||||
|
newItem.publicationTitle = "New Zealand Herald";
|
||||||
|
newItem.ISSN = "1170-0777";
|
||||||
|
|
||||||
|
//Get title of the news via xpath
|
||||||
|
var myXPath = '//h1';
|
||||||
|
var myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||||
|
var headers;
|
||||||
|
var items = new Object();
|
||||||
|
var authorsTemp;
|
||||||
|
var blankCell;
|
||||||
|
var contents;
|
||||||
|
var authorArray = new Array();
|
||||||
|
|
||||||
|
/*
|
||||||
|
Get authors of the article
|
||||||
|
Remove "By " then replace "and " with ", "
|
||||||
|
|
||||||
|
Put the string into an array then split the array and loop all
|
||||||
|
authors then push author to Zotero. Possible with more than 1 author
|
||||||
|
on an article.
|
||||||
|
*/
|
||||||
|
var authorXPath = '//span[@class="credits"]';
|
||||||
|
var authorXPathObject = doc.evaluate(authorXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
|
||||||
|
if (authorXPathObject) {
|
||||||
|
var authorString = authorXPathObject.textContent.replace(/\bBy\W+/g, '');
|
||||||
|
if (authorString.match(/\W\band\W+/g)){
|
||||||
|
authorTemp = authorString.replace(/\W\band\W+/g, ', ');
|
||||||
|
authorArray = authorTemp.split(", ");
|
||||||
|
} else if (!authorString.match(/\W\band\W+/g)){
|
||||||
|
authorArray = authorString;
|
||||||
|
}
|
||||||
|
if( authorArray instanceof Array ) {
|
||||||
|
for (var i in authorArray){
|
||||||
|
var author;
|
||||||
|
author = authorArray[i];
|
||||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
|
newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (authorString.match(/\W\bof\W+/g)){
|
||||||
|
authorTemp = authorString.replace (/\W\bof\W(.*)/g, '');
|
||||||
|
authorArray = authorTemp;
|
||||||
|
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorTemp, "author"));
|
||||||
|
|
||||||
//abstract
|
} else {
|
||||||
var a = /meta name=\"description\" content=\"([^&]*)/;
|
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorArray, "author"));
|
||||||
newItem.abstractNote = text.match(a)[1];
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//date-Year
|
||||||
|
var dateXPath = '//div[@class="tools"]/span';
|
||||||
|
var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
|
||||||
|
|
||||||
//title and date
|
//If the original Xpath1 is equal to Updated then go to XPath2
|
||||||
var t = /<title>(.*)<\/title>/;
|
if ((dateXPathObject =="Updated")|| (dateXPathObject =="New")){
|
||||||
var result = text.match(t)[1].split(" - ");
|
var dateXPath = '//div[@class="tools"]/span[2]';
|
||||||
newItem.title = result[0];
|
var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
|
||||||
newItem.date = result[1];
|
newItem.date = dateXPathObject ;
|
||||||
|
} else { //great found the date just push it to Zotero.
|
||||||
//keywords
|
var dateXPath = '//div[@class="tools"]/span';
|
||||||
var k = /<meta name=\"keywords\" content=\"(.*)\"/;
|
var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
|
||||||
var kwords = Zotero.Utilities.cleanString(text.match(k)[1]).split(", ");
|
newItem.date = dateXPathObject ;
|
||||||
for (var i = 0 ; i < kwords.length ; i++) {
|
|
||||||
newItem.tags.push(kwords[i]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//section
|
//Get Section of the news
|
||||||
var s = /class=\"current\"><.*><span>(.*)<\/span>/;
|
var sectionXPath = '//div[@class="sectionHeader"]/span/a[1]';
|
||||||
newItem.section = text.match(s)[1];
|
var sectionXPathObject = doc.evaluate(sectionXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||||
|
newItem.section = sectionXPathObject;
|
||||||
|
|
||||||
|
//Get news title
|
||||||
|
headers =myXPathObject;
|
||||||
|
newItem.title = headers;
|
||||||
|
|
||||||
|
newItem.language= articleLanguage;
|
||||||
|
|
||||||
|
//grab abstract from meta data
|
||||||
|
var a= "//meta[@name='description']";
|
||||||
|
newItem.abstractNote = doc.evaluate(a, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().content;
|
||||||
newItem.complete();
|
newItem.complete();
|
||||||
Zotero.debug(newItem);
|
|
||||||
|
|
||||||
Zotero.done();
|
|
||||||
}, function() {});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function doWeb(doc, url) {
|
function doWeb(doc, url){
|
||||||
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
|
var nsResolver = namespace ? function(prefix){
|
||||||
|
if (prefix =='x')
|
||||||
|
return namespace; else return null;
|
||||||
|
} :null;
|
||||||
|
|
||||||
var articles = new Array();
|
var articles = new Array();
|
||||||
var names = new Array();
|
var items = new Object();
|
||||||
if (doc.title.indexOf("Search Results:") != -1) {
|
var nextTitle;
|
||||||
var URLS = new Array();
|
|
||||||
var titles = new Array();
|
|
||||||
var xpath = '//p[@class="g"]/a';
|
|
||||||
var links = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
|
|
||||||
var link = links.iterateNext();
|
|
||||||
|
|
||||||
while (link) {
|
if (detectWeb(doc, url) == "multiple"){
|
||||||
URLS.push(link.href);
|
var titles = doc.evaluate('//p[@class="results"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
titles.push(link.textContent);
|
while (nextTitle = titles.iterateNext()){
|
||||||
link = links.iterateNext();
|
items[nextTitle.href] = nextTitle.textContent;
|
||||||
}
|
}
|
||||||
|
items= Zotero.selectItems(items);
|
||||||
Zotero.debug(titles);
|
for (var i in items){
|
||||||
Zotero.debug(URLS);
|
|
||||||
|
|
||||||
var newItems = new Object();
|
|
||||||
|
|
||||||
for (var i = 0 ; i < titles.length ; i++) {
|
|
||||||
newItems[URLS[i]] = titles[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
newItems = Zotero.selectItems(newItems);
|
|
||||||
|
|
||||||
Zotero.debug(newItems);
|
|
||||||
|
|
||||||
for (var i in newItems) {
|
|
||||||
articles.push(i);
|
articles.push(i);
|
||||||
names.push(newItems[i]);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
articles.push(doc.location.href);
|
articles = [url];
|
||||||
names.push(Zotero.Utilities.cleanString(doc.title.split("-")[0]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Zotero.debug(articles);
|
Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});
|
||||||
|
|
||||||
Zotero.Utilities.HTTP.doPost(articles, "", function(text) {
|
|
||||||
for (var i = 0 ; i < articles.length ; i++) {
|
|
||||||
scrape(articles[i]);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
Zotero.wait();
|
Zotero.wait();
|
||||||
}
|
}
|
Loading…
Reference in a new issue