Sopheak's new NZ Herald translator

2010-08-08 22:42:03 +00:00 · 2010-08-08 22:42:03 +00:00 · 9aa227db6b
commit 9aa227db6b
parent 521ab94e35
1 changed files with 134 additions and 90 deletions
--- a/translators/New
+++ b/translators/New
@ -1,110 +1,154 @@
 {
-	"translatorID":"c7830593-807e-48cb-99f2-c3bed2b148c2",
+	"translatorID" : "c7830593-807e-48cb-99f2-c3bed2b148c2",
 	"label" : "New Zealand Herald",
 	"creator" : "Sopheak Hean (University of Waikato, Faculty of Education, New Zealand)",
 	"target" : "^http://www\\.nzherald\\.co\\.nz",
 	"minVersion" : "1.0",
 	"maxVersion" : "",
 	"priority" : 100,
 	"inRepository" : "1",
 	"translatorType":4,
-	"label":"New Zealand Herald",
+	"lastUpdated":"2010-08-03 10:49:18"
 	"creator":"Michael Berkowitz",
 	"target":"^http://(www|search).nzherald.co.nz/",
 	"minVersion":"1.0.0b4.r5",
 	"maxVersion":"",
 	"priority":100,
 	"inRepository":true,
 	"lastUpdated":"2007-08-14 22:15:00"
 }
 function detectWeb(doc, url) {
-	if (doc.title.indexOf("Search Results") != -1) {
+	var namespace = doc.documentElement.namespaceURI;
 	var nsResolver = namespace ? function(prefix) {
 	if (prefix == "x" ) return namespace; else return null;
 	} : null;
 /* If the address bar has /news in it then its a newspaper article*/
 	if (doc.location.href.indexOf("/search/results.cfm") !=-1){
 		return "multiple";
-	} else if (doc.location.href.indexOf("story.cfm") != -1) {
+	} else if (doc.location.href.indexOf("/news/article.cfm") !=-1){
 		return "newspaperArticle";
 	}
 }
-function scrape(url) {
+function associateData (newItem, items, field, zoteroField) {
-	Zotero.Utilities.HTTP.doGet(url, function(text) {
+	if (items[field]){
-		var newItem = new Zotero.Item("newspaperArticle");
+		newItem[zoteroField] = items[field];
-		newItem.url = url;
+	}
-		newItem.publicationTitle = "New Zealand Herald";
+}
-		//author?
+function scrape(doc, url){
-		var aut = /<a href=\"\/author\/[^>]*>(.*)<\/a>/;
+	var authorTemp;
-		if (text.match(aut)) {
+	var namespace = doc.documentElement.namespaceURI;
-			var author = text.match(aut)[1];
+	var nsResolver = namespace ? function(prefix) {
 		if (prefix == 'x') return namespace; else return null;
 	} : null;
 	var articleLanguage = "English";
 	var newItem = new Zotero.Item('newspaperArticle');
 	newItem.url = doc.location.href;
 	newItem.publicationTitle = "New Zealand Herald";
 	newItem.ISSN = "1170-0777";
 	//Get title of the news via xpath
 	var myXPath = '//h1';
 	var myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
 	var headers;
 	var items = new Object();
 	var authorsTemp;
 	var blankCell;
 	var contents;
 	var authorArray = new Array();
 	/*
 	 Get authors of the article
 	 Remove "By " then replace "and " with ", "
 	 Put the string into an array then split the array and loop all
     authors then push author to Zotero.  Possible with more than 1 author
     on an article.
 	*/
 	var authorXPath = '//span[@class="credits"]';
 	var authorXPathObject = doc.evaluate(authorXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
 	if (authorXPathObject) {
 		var authorString = authorXPathObject.textContent.replace(/\bBy\W+/g, '');
 		if (authorString.match(/\W\band\W+/g)){
 			authorTemp = authorString.replace(/\W\band\W+/g, ', ');
 			authorArray = authorTemp.split(", ");
 		} else if (!authorString.match(/\W\band\W+/g)){
 			authorArray = authorString;
 		}
 		if( authorArray instanceof Array ) {
 			for (var i in authorArray){
 				var author;
 				author = authorArray[i];
 				newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
 			}
 		} else {
 			if (authorString.match(/\W\bof\W+/g)){
 				authorTemp = authorString.replace (/\W\bof\W(.*)/g, '');
 				authorArray = authorTemp;
 				newItem.creators.push(Zotero.Utilities.cleanAuthor(authorTemp, "author"));
-		//abstract
+			}  else {
-		var a = /meta name=\"description\" content=\"([^&]*)/;
+				newItem.creators.push(Zotero.Utilities.cleanAuthor(authorArray, "author"));
-		newItem.abstractNote = text.match(a)[1];
+			}
 		}
 	}
 	//date-Year
 	var dateXPath = '//div[@class="tools"]/span';
 	var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
-		//title and date
+	//If the original Xpath1 is equal to Updated then go to XPath2
-		var t = /<title>(.*)<\/title>/;
+	if ((dateXPathObject =="Updated")|| (dateXPathObject =="New")){
-		var result = text.match(t)[1].split(" - ");
+		var dateXPath = '//div[@class="tools"]/span[2]';
-		newItem.title = result[0];
+		var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
-		newItem.date = result[1];
+		newItem.date = dateXPathObject ;
-		
+	} else { //great found the date just push it to Zotero.
-		//keywords
+		var dateXPath = '//div[@class="tools"]/span';
-		var k = /<meta name=\"keywords\" content=\"(.*)\"/;
+		var dateXPathObject = doc.evaluate(dateXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
-		var kwords = Zotero.Utilities.cleanString(text.match(k)[1]).split(", ");
+		newItem.date = dateXPathObject ;
 		for (var i = 0 ; i < kwords.length ; i++) {
 			newItem.tags.push(kwords[i]);
 	}
-		//section
+	//Get Section of the news
-		var s = /class=\"current\"><.*><span>(.*)<\/span>/;
+	var sectionXPath = '//div[@class="sectionHeader"]/span/a[1]';
-		newItem.section = text.match(s)[1];
+	var sectionXPathObject = doc.evaluate(sectionXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
 	newItem.section = sectionXPathObject;
 	//Get news title
 	headers =myXPathObject;
 	newItem.title = headers;
 	newItem.language= articleLanguage;
 	//grab abstract from meta data
 	var a= "//meta[@name='description']";
 	newItem.abstractNote = doc.evaluate(a, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().content;
 	newItem.complete();
 		Zotero.debug(newItem);
 		Zotero.done();
 	}, function() {});
 }
-function doWeb(doc, url) {
+function doWeb(doc, url){
 	var namespace = doc.documentElement.namespaceURI;
 	var nsResolver = namespace ? function(prefix){
 		if (prefix =='x')
 		return namespace; else return null;
 	} :null;
 	var articles = new Array();
-	var names = new Array();
+	var items = new Object();
-	if (doc.title.indexOf("Search Results:") != -1) {
+	var nextTitle;
 		var URLS = new Array();
 		var titles = new Array();
 		var xpath = '//p[@class="g"]/a';
 		var links = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
 		var link = links.iterateNext();
-		while (link) {
+	if (detectWeb(doc, url) == "multiple"){
-			URLS.push(link.href);
+		var titles = doc.evaluate('//p[@class="results"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
-			titles.push(link.textContent);
+		while (nextTitle = titles.iterateNext()){
-			link = links.iterateNext();
+			items[nextTitle.href] = nextTitle.textContent;
 		}
-		
+		items= Zotero.selectItems(items);
-		Zotero.debug(titles);
+		for (var i in items){
 		Zotero.debug(URLS);
 		var newItems = new Object();
 		for (var i = 0 ; i < titles.length ; i++) {
 			newItems[URLS[i]] = titles[i];
 		}
 		newItems = Zotero.selectItems(newItems);
 		Zotero.debug(newItems);
 		for (var i in newItems) {
 			articles.push(i);
 			names.push(newItems[i]);
 		}
 	} else {
-		articles.push(doc.location.href);
+		articles = [url];
 		names.push(Zotero.Utilities.cleanString(doc.title.split("-")[0]));
 	}
-	Zotero.debug(articles);
+	Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});
 	Zotero.Utilities.HTTP.doPost(articles, "", function(text) {
 		for (var i = 0 ; i < articles.length ; i++) {
 			scrape(articles[i]);
 		}
 	});
 	Zotero.wait();
 }