Rewritten Stuff.co.nz translator by Sopheak Hean.

2010-08-22 22:40:16 +00:00 · 2010-08-22 22:40:16 +00:00 · 903051e471
commit 903051e471
parent c373786a1e
1 changed files with 450 additions and 88 deletions
--- a/translators/Stuff.co.nz.js
+++ b/translators/Stuff.co.nz.js
@ -1,107 +1,469 @@
 {
-	"translatorID":"631ff0c7-2e64-4279-a9c9-ad9518d40f2b",
-	"translatorType":4,
-	"label":"Stuff.co.nz",
-	"creator":"Michael Berkowitz",
-	"target":"^http://(www.)?stuff.co.nz/",
-	"minVersion":"1.0.0b4.r5",
-	"maxVersion":"",
-	"priority":100,
-	"inRepository":true,
-	"lastUpdated":"2007-08-14 22:15:00"
+        "translatorID":"386c7e75-eef4-47b1-b5a6-0faa3cfa4f44",
+        "label":"Stuff.co.nz",
+        "creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
+        "target":"^http://(www\\.)?stuff\\.co\\.nz/",
+        "minVersion":"1.0",
+        "maxVersion":"",
+        "priority":100,
+        "inRepository":"1",
+        "translatorType":4,
+        "lastUpdated":"2010-08-23 00:34:34"
 }

+/*
+    Stuff.co.nz Translator - parses Stuff.co.nz articles and creates Zotero metadata
+   Copyright (C) 2010 Sopheak Hean, University of Waikato, Faculty of Education
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Stuff.co.nz has no ISSN because it is not itself a newspaper publisher; it is a collection of newspaper articles from around the country. */
+
 function detectWeb(doc, url) {
-	if ((doc.location.href.indexOf("search-results") != -1) || (doc.location.href.indexOf("/blogs/blogs/") != -1 )) {
-		return "multiple";
-	} else if ((doc.location.href.indexOf("blogs") != -1) && (url != "http://www.stuff.co.nz/blogs/blogs") && (url != "http://stuff.co.nz/blogs/blogs")) {
+	/*
+	 * Classifies the page by looking for a marker <div>:
+	 *   div.blog_content   -> "blogPost"
+	 *   div.story_landing  -> "newspaperArticle"
+	 * Returns undefined when neither marker is present. The url argument
+	 * is not consulted.
+	 */
+	var ns = doc.documentElement.namespaceURI;
+	var resolver = null;
+	if (ns) {
+		resolver = function(prefix) {
+			return (prefix == "x") ? ns : null;
+		};
+	}
+	// True when the XPath expression selects at least one node.
+	var hasNode = function(xpath) {
+		return !!doc.evaluate(xpath, doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+	};
+
+	if (hasNode('//div[@class="blog_content"]')) {
 		return "blogPost";
-	} else if (doc.location.href.indexOf("html") == (doc.location.href.length - 4)){
-		return "newspaperArticle";
 	}
+	// Only checked when the blog marker is absent.
+	if (hasNode('//div[@class="story_landing"]')) {
+		return "newspaperArticle";
+	}
 }

+function myUpperCaseFunction(input){
+		/* Placeholder with no implementation yet: the body is empty, so a call returns undefined and input is ignored. */
+}
+
+
 function scrape(doc, url) {
-	if (doc.location.href.indexOf("html") != -1) {
-		var newItem = new Zotero.Item("newspaperArticle");
-		newItem.url = doc.location.href;
-		newItem.publicationTitle = "Stuff.co.nz";
-		newItem.title = doc.title.split(" - ")[0];
-		
-		//abstract
-		var xpath = '//div[@id="leftcol_story"]/p/strong';
-		newItem.abstractNote = Zotero.Utilities.cleanString(doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent);
-		
-		//date and author
-		var xpath = '//div[@id="story_headline"]';
-		var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(/\n+/)[2].split(" | ");
-		
-		newItem.date = Zotero.Utilities.cleanString(info[1].split(",")[1]);
-		
-		var author = Zotero.Utilities.cleanString(info[0]);
-		if (author.substr(0,2).toLowerCase() == "by") {
-			author = author.substr(3);
-			if (author.indexOf(" - ") != -1) {
-				author = author.split(" - ")[0].split(" ");
-			} else {
-				author = author.split(" ");
-			}
-			for (var i = 0 ; i < author.length ; i++) {
-				author[i] = author[i][0] + author[i].substr(1).toLowerCase();
-				var creator = author.join(" ");
-			}
-			newItem.creators.push(Zotero.Utilities.cleanAuthor(creator, "author"));
-		} else {
-			newItem.extra = author;
-		}
-	} else if (doc.location.href.indexOf("blogs") != -1) {
-		var newItem = new Zotero.Item("blogPost");
-		newItem.url = doc.location.href;

-		//post title
-		var xpath = '//div[@class="post"]/h2[@class="storytitle"]/a';
-		newItem.title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+	/*
+	 * Builds and completes one Zotero item for the page in doc.
+	 *
+	 * The item type comes from detectWeb(): "blogPost" or "newspaperArticle".
+	 * Pages of any other kind produce no item. The url parameter is not used;
+	 * the address is always taken from doc.location.href.
+	 *
+	 * Title, short title, abstract and date are delegated to doTitle(),
+	 * doShortTitle(), doAbstract() and doDate().
+	 */
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == 'x') return namespace; else return null;
+	} : null;
+	var pageUrl = doc.location.href;
+
+	/* Returns the first node selected by xpath, or null when nothing matches. */
+	function firstNode(xpath) {
+		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	}
+
+	/* Re-cases every whitespace-separated word: "JOHN  SMITH" -> "John Smith". */
+	function toNameCase(text) {
+		var words = text.replace(/^\s+|\s+$/g, '').split(/\s+/);
+		var cased = [];
+		for (var w = 0; w < words.length; w++) {
+			if (words[w]) {
+				cased.push(words[w].substring(0, 1).toUpperCase() + words[w].substring(1).toLowerCase());
+			}
+		}
+		return cased.join(" ");
+	}
+
+	/* Pushes name as an author of item; blank names are skipped so that
+	   item.creators always stays an array of real creators. */
+	function addAuthor(item, name) {
+		var cased = toNameCase(name);
+		if (cased) {
+			item.creators.push(Zotero.Utilities.cleanAuthor(cased, "author"));
+		}
+	}
 	
-		//date and author
-		var xpath = '//div[@class="meta"][@id="postdate"]'
-		var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(" | ");
-		var byline = Zotero.Utilities.cleanString(info[0]).split(" in ");
-		newItem.creators.push(Zotero.Utilities.cleanAuthor(byline[0], "author"));
-		newItem.blogTitle = byline[1];
-		var date = Zotero.Utilities.cleanString(info[1]).split("m ");
-		newItem.date = date[1];
+	// Detect once; both branches below depend on the same answer.
+	var itemType = detectWeb(doc, pageUrl);
+	var newItem;
+
+	/*==========================Blog Post===========================*/
+	if (itemType == "blogPost") {
+		newItem = new Zotero.Item('blogPost');
+		newItem.url = pageUrl;
+		newItem.publicationTitle = "Stuff.co.nz";
+		newItem.language = "English";
+
+		// Byline: drop a trailing " - <source>" part and the word "By".
+		var blogAuthorNode = firstNode("//div[@id='left_col']/span");
+		if (blogAuthorNode) {
+			addAuthor(newItem, blogAuthorNode.textContent
+				.replace(/([\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*)/g, '')
+				.replace(/\bBy \b/g, ''));
+		}
+
+		// Name of the blog, taken from the headlines box without the "Headlines" suffix.
+		var blogTitleNode = firstNode("//span[@class='hbox_top_title headlines_title']/a");
+		if (blogTitleNode) {
+			newItem.blogTitle = blogTitleNode.textContent.replace(/\s+\bHeadlines\b/g, '');
+		}
+		newItem.shortTitle = doShortTitle(doc, pageUrl);
+		newItem.title = doTitle(doc, pageUrl);
+		newItem.date = doDate(doc, pageUrl);
+		newItem.abstractNote = doAbstract(doc, pageUrl);
+		newItem.websiteType = "Newspaper";
+		newItem.attachments.push({url:pageUrl, title:"Stuff.co.nz Snapshot", mimeType:"text/html"});
+		newItem.complete();
+	}
+
+	/* ======================Newspaper Article========================*/
+	else if (itemType == "newspaperArticle") {
+		newItem = new Zotero.Item('newspaperArticle');
+		newItem.url = pageUrl;
+		newItem.language = "English";
+		newItem.shortTitle = doShortTitle(doc, pageUrl);
+		newItem.abstractNote = doAbstract(doc, pageUrl);
+		newItem.title = doTitle(doc, pageUrl);
+
+		// The story credit carries both the author(s) and, after a dash,
+		// the originating publication ("By JOHN SMITH - The Press").
+		newItem.publicationTitle = "Stuff.co.nz";
+		var creditNode = firstNode('//span[@class="storycredit"]');
+		if (creditNode) {
+			var credit = creditNode.textContent;
+
+			// Publication: whatever follows "By <name> - "; keep the default when nothing is left.
+			var bylinePrefix = /\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g;
+			if (credit.match(bylinePrefix)) {
+				var publisher = credit.replace(bylinePrefix, '').replace(/^\s*|\s*$/g, '');
+				if (publisher) {
+					newItem.publicationTitle = publisher;
+				}
+			}
+
+			// Authors: a personal byline either has a " - <source>" tail or starts with "By".
+			if (credit.match(/[\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*|^\s*by\b/i)) {
+				var authorString = credit
+					.replace(/([\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*)|\b\.co\.nz|\b\.com|(-[a-zA-Z0-9]*)/g, '')
+					.replace(/^\s*by\b\s*/i, '');
+
+				// "A and B" (optionally "A, B and C") lists several authors.
+				var andPattern = /\W\band\W+/g;
+				var names = authorString.match(andPattern)
+					? authorString.replace(andPattern, ', ').split(", ")
+					: [authorString];
+				for (var n = 0; n < names.length; n++) {
+					// "<name> of <place>" keeps only the name.
+					addAuthor(newItem, names[n].replace(/\W\bof\W(.*)/g, ''));
+				}
+			} else {
+				// Agency-style credit ("The Press"): hyphenate it so that
+				// cleanAuthor does not split it into first and last name.
+				var agency = credit.replace(/^\s*|\s*$/g, '').replace(/\s+/g, '-');
+				if (agency) {
+					newItem.creators.push(Zotero.Utilities.cleanAuthor(agency, "author"));
+				}
+			}
+		}
+
+		// Section and place, derived from the highlighted navigation entries.
+		var nzSections = ["National", "Crime", "Education", "Health", "Politics", "Environment", "Business"];
+		// NOTE(review): "Opinion" is mapped to "Sport" exactly as before - confirm that this is intended.
+		var sportSections = ["Opinion", "Rugby", "Soccer", "Cricket", "Basketball", "Fishing", "League",
+			"Scoreboard", "Football", "Golf", "Motorsport", "Netball", "Tennis"];
+
+		var currentNode = firstNode('//li/a[@class="current"]');
+		var navNode = firstNode('//li[@class="current_nav_item"]/a');
+		// Test the node itself (not the XPath string) before reading its text.
+		var navText = navNode ? navNode.textContent : null;
+
+		if (currentNode) {
+			var currentText = currentNode.textContent;
+
+			var midNode = firstNode('//li[@class="mid_nav_item"]/a');
+			if (midNode) {
+				var midText = midNode.textContent;
+				if (["National", "Business", "Sport", "Politics"].indexOf(midText) != -1) {
+					newItem.place = "New Zealand";
+					newItem.section = currentText;
+				} else if (midText == "World") {
+					newItem.place = "World";
+					newItem.section = currentText;
+				} else {
+					newItem.section = midText;
+				}
+			}
+
+			// The top-level nav entry, when present, overrides the choice above.
+			if (navText !== null) {
+				if (nzSections.indexOf(navText) != -1) {
+					newItem.place = "New Zealand";
+					newItem.section = currentText;
+				} else if (sportSections.indexOf(navText) != -1) {
+					newItem.section = "Sport";
+				} else {
+					newItem.section = navText;
+				}
+			}
+		} else if (navText !== null) {
+			if (nzSections.indexOf(navText) != -1) {
+				newItem.place = "New Zealand";
+			}
+			newItem.section = navText;
+		}
+
+		// Snapshot of the web page.
+		newItem.attachments.push({url:pageUrl, title:"Stuff.co.nz Snapshot",
+		                          mimeType:"text/html"});
+
+		newItem.date = doDate(doc, pageUrl);
+		newItem.complete();
 	}
-	newItem.complete();
+	
 }

-function doWeb(doc, url) {
-	var URLS = new Array();
+
+function doShortTitle(doc, url){
 	
-	//multiple
-	if ((url.indexOf("search-results") != -1) || (url.indexOf("blogs/blogs/") != -1)) {
-		if (url.indexOf("search-results") != -1) {
-			var xpath = '//div[@id="leftcol_story"]/p/a';
-		} else if (url.indexOf("blogs/blogs/") != -1) {
-			var xpath = '//h2[@class="storytitle"]/a';
-		}
-	
-		var items = new Object();
-		var titles = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
-		var newTitle = titles.iterateNext();
-		while (newTitle) {
-			items[newTitle.href] = newTitle.textContent;
-			newTitle = titles.iterateNext();
-		}
-		
-		items = Zotero.selectItems(items);
-		
-		for (var i in items) {
-			URLS.push(i);
-		}
+	/*
+	 * Returns the text of the first <h2> directly under div#left_col with
+	 * leading and trailing whitespace removed, or "" when there is no such
+	 * heading. The url argument is not used.
+	 */
+	var ns = doc.documentElement.namespaceURI;
+	var resolver = ns ? function(prefix) {
+		return (prefix == 'x') ? ns : null;
+	} : null;
+	var headingNode = doc.evaluate('//div[@id="left_col"]/h2', doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if (headingNode) {
+		return headingNode.textContent.replace(/^\s*|\s*$/g, '');
 	} else {
-		URLS.push(url);
+		return "";
 	}
 	
-	Zotero.Utilities.processDocuments(URLS, scrape, function() {Zotero.done();});
+}
+
+function doAbstract(doc, url){
+	/*
+	 * Returns the content of the page's first <meta name="description">
+	 * element, or "" when the page has none. The url argument is not used.
+	 */
+	var ns = doc.documentElement.namespaceURI;
+	var resolver = null;
+	if (ns) {
+		resolver = function(prefix) {
+			return (prefix == 'x') ? ns : null;
+		};
+	}
+	var metaNode = doc.evaluate("//meta[@name='description']", doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if (!metaNode) {
+		return "";
+	}
+	return metaNode.content;
+}
+
+function doTitle(doc, url){
+	/*
+	 * Returns the text of the first <h1> directly under div#left_col with
+	 * leading and trailing whitespace removed, or "" when there is no such
+	 * heading. The url argument is not used.
+	 */
+	var ns = doc.documentElement.namespaceURI;
+	var resolver = null;
+	if (ns) {
+		resolver = function(prefix) {
+			return (prefix == 'x') ? ns : null;
+		};
+	}
+	var headlineNode = doc.evaluate('//div[@id="left_col"]/h1', doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+	return headlineNode ? headlineNode.textContent.replace(/^\s*|\s*$/g, '') : "";
+}
+
+function doDate(doc, url){
+	/*
+	 * Reads the article date from the third <div> inside div#toolbox and
+	 * returns it as "Mon D, YYYY" (for example "Aug 22, 2010").
+	 *
+	 * The element text is expected to look like
+	 * "Last updated HH:MM DD/MM/YYYY"; the "Last updated HH:MM " prefix is
+	 * removed and the rest is split on "/".
+	 *
+	 * Returns "" when the element is missing or its text is not a
+	 * day/month/year triple with a month between 1 and 12, so callers never
+	 * receive a string containing "undefined". The url argument is not used.
+	 */
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == 'x') return namespace; else return null;
+	} : null;
+
+	// "July" is deliberately left unabbreviated so that valid dates produce
+	// the same month name as they always have.
+	var monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", "Aug", "Sep", "Oct", "Nov", "Dec"];
+
+	var dateNode = doc.evaluate("//div[@id='toolbox']/div[3]", doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if (!dateNode) {
+		return "";
+	}
+
+	var parts = dateNode.textContent
+		.replace(/\b(Last updated )\d{0,9}:\d{0,9} /g, '')
+		.replace(/^\s+|\s+$/g, '')
+		.split('/');
+	if (parts.length != 3 || !/^\d{1,2}$/.test(parts[1])) {
+		return "";
+	}
+
+	// Explicit radix: "08" and "09" must not be read as octal.
+	var monthIndex = parseInt(parts[1], 10) - 1;
+	if (monthIndex < 0 || monthIndex >= monthNames.length) {
+		return "";
+	}
+	return monthNames[monthIndex] + " " + parts[0] + ", " + parts[2];
+}
+
+
+function doWeb(doc, url) {
+	/*
+	 * Translator entry point: hands the current URL to scrape() when the
+	 * page is a newspaper article or a blog post.
+	 *
+	 * Pages of any other kind are ignored, so processDocuments() is never
+	 * called with an undefined URL list.
+	 */
+	var pageType = detectWeb(doc, url);
+	if (pageType != "newspaperArticle" && pageType != "blogPost") {
+		return;
+	}
+
+	Zotero.Utilities.processDocuments([url], scrape, function() {Zotero.done();});
 	Zotero.wait();
-}
+	
+}
+