diff --git a/translators/Stuff.co.nz.js b/translators/Stuff.co.nz.js
index 4d03721a97..1f7495f47b 100644
--- a/translators/Stuff.co.nz.js
+++ b/translators/Stuff.co.nz.js
@@ -1,107 +1,469 @@
{
- "translatorID":"631ff0c7-2e64-4279-a9c9-ad9518d40f2b",
- "translatorType":4,
- "label":"Stuff.co.nz",
- "creator":"Michael Berkowitz",
- "target":"^http://(www.)?stuff.co.nz/",
- "minVersion":"1.0.0b4.r5",
- "maxVersion":"",
- "priority":100,
- "inRepository":true,
- "lastUpdated":"2007-08-14 22:15:00"
+ "translatorID":"386c7e75-eef4-47b1-b5a6-0faa3cfa4f44",
+ "label":"Stuff.co.nz",
+ "creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
+ "target":"^http://(www\\.)?stuff\\.co\\.nz/",
+ "minVersion":"1.0",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":"1",
+ "translatorType":4,
+ "lastUpdated":"2010-08-23 00:34:34"
}
+/*
+ Stuff.co.nz Translator- Parses Stuff.co.nz articles and creates Zotero-based metadata
+ Copyright (C) 2010 Sopheak Hean, University of Waikato, Faculty of Education
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Stuff.co.nz does not have an ISSN because it is not a newspaper publisher. Stuff.co.nz is a collection of newspaper articles from around the country. */
+
function detectWeb(doc, url) {
- if ((doc.location.href.indexOf("search-results") != -1) || (doc.location.href.indexOf("/blogs/blogs/") != -1 )) {
- return "multiple";
- } else if ((doc.location.href.indexOf("blogs") != -1) && (url != "http://www.stuff.co.nz/blogs/blogs") && (url != "http://stuff.co.nz/blogs/blogs")) {
+    // Page type is decided by marker elements: "blog_content" for blog posts, "story_landing" for news stories.
+    var ns = doc.documentElement.namespaceURI;
+    var resolver = null;
+    if (ns) resolver = function(prefix) { return (prefix == "x") ? ns : null; };
+    var firstMatch = function(xpath) { return doc.evaluate(xpath, doc, resolver, XPathResult.ANY_TYPE, null).iterateNext(); };
+    var blogNode = firstMatch('//div[@class="blog_content"]');
+    if (blogNode) {
return "blogPost";
- } else if (doc.location.href.indexOf("html") == (doc.location.href.length - 4)){
- return "newspaperArticle";
}
+
+    // The story marker is only looked up when the page is not a blog post.
+    var storyNode = firstMatch('//div[@class="story_landing"]');
+    if (storyNode) {
+        return "newspaperArticle";
+    }
+
+    // Neither marker present: fall through and return undefined.
+
}
+function myUpperCaseFunction(input){
+ /* Placeholder stub: the body is empty, so input is ignored and undefined is returned. Will define one later. */
+}
+// scrape(doc, url): builds one Zotero item (blogPost or newspaperArticle, as reported by detectWeb) from the
+// article page in doc and saves it with complete(). The url argument is overwritten with doc.location.href. Returns nothing.
function scrape(doc, url) {
- if (doc.location.href.indexOf("html") != -1) {
- var newItem = new Zotero.Item("newspaperArticle");
- newItem.url = doc.location.href;
- newItem.publicationTitle = "Stuff.co.nz";
- newItem.title = doc.title.split(" - ")[0];
-
- //abstract
- var xpath = '//div[@id="leftcol_story"]/p/strong';
- newItem.abstractNote = Zotero.Utilities.cleanString(doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent);
-
- //date and author
- var xpath = '//div[@id="story_headline"]';
- var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(/\n+/)[2].split(" | ");
-
- newItem.date = Zotero.Utilities.cleanString(info[1].split(",")[1]);
-
- var author = Zotero.Utilities.cleanString(info[0]);
- if (author.substr(0,2).toLowerCase() == "by") {
- author = author.substr(3);
- if (author.indexOf(" - ") != -1) {
- author = author.split(" - ")[0].split(" ");
- } else {
- author = author.split(" ");
- }
- for (var i = 0 ; i < author.length ; i++) {
- author[i] = author[i][0] + author[i].substr(1).toLowerCase();
- var creator = author.join(" ");
- }
- newItem.creators.push(Zotero.Utilities.cleanAuthor(creator, "author"));
- } else {
- newItem.extra = author;
- }
- } else if (doc.location.href.indexOf("blogs") != -1) {
- var newItem = new Zotero.Item("blogPost");
- newItem.url = doc.location.href;
- //post title
- var xpath = '//div[@class="post"]/h2[@class="storytitle"]/a';
- newItem.title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+ var url = doc.location.href;
+ var splitIntoArray;
+ var fullName="";
+ var emptyString =" ";
+ var firstName; var lastName;
+ /*==========================Blog Post===========================*/
- //date and author
- var xpath = '//div[@class="meta"][@id="postdate"]'
- var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(" | ");
- var byline = Zotero.Utilities.cleanString(info[0]).split(" in ");
- newItem.creators.push(Zotero.Utilities.cleanAuthor(byline[0], "author"));
- newItem.blogTitle = byline[1];
- var date = Zotero.Utilities.cleanString(info[1]).split("m ");
- newItem.date = date[1];
+ if (detectWeb(doc, url) =="blogPost"){
+
+ var newItem = new Zotero.Item('blogPost');
+ newItem.url = doc.location.href;
+ //newItem.title = "No Title Found";
+ newItem.publicationTitle = "Stuff.co.nz";
+ newItem.language = "English";
+
+ //Get Author
+ try { /* Byline parsing is best-effort: an error here must not stop the item from being saved */
+
+ var blogAuthor = "//div[@id='left_col']/span";
+ var blogAuthorObject = doc.evaluate(blogAuthor, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (blogAuthorObject) {
+
+ if (blogAuthorObject.textContent.replace(/\s*/g,'') ==""){
+ // Blank byline: leave newItem.creators as the empty array (assigning a string here would destroy it).
+ }
+
+ else{
+ blogAuthorObject = blogAuthorObject.textContent;
+ if(blogAuthorObject.match(/[\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*/g)){
+ blogAuthorObject = blogAuthorObject.replace(/([\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*)/g, '').replace(/\bBy \b/g,'');
+ splitIntoArray = blogAuthorObject.split (" ");
+ for (var i = 0; i < splitIntoArray.length; i++){
+ firstName = splitIntoArray[i].substring(0,1).toUpperCase();
+ lastName = splitIntoArray[i].substring(1).toLowerCase();
+ fullName += firstName + lastName + emptyString;
+
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName , "author"));
+ }
+
+ else {
+ splitIntoArray = blogAuthorObject.replace(/\bBy \b/g,'').split (" ");
+ for (var i = 0; i < splitIntoArray.length; i++){
+ firstName = splitIntoArray[i].substring(0,1).toUpperCase();
+ lastName = splitIntoArray[i].substring(1).toLowerCase();
+ fullName += firstName + lastName + emptyString;
+
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName , "author")); }
+ }
+ }
+ } catch (err) {
+ Zotero.debug("Stuff.co.nz: could not parse the blog byline: " + err);
+
+ }
+
+ //Title of the Article
+ var getBlogTitle = "//span[@class='hbox_top_title headlines_title']/a";
+ var getBlogTitleObject = doc.evaluate(getBlogTitle, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (getBlogTitleObject){
+ newItem.blogTitle =getBlogTitleObject.textContent.replace(/\s+\bHeadlines\b/g, '');
+ }
+ newItem.shortTitle = doShortTitle(doc,url);
+ newItem.title= doTitle(doc, url);
+ newItem.date = doDate(doc, url);
+ newItem.abstractNote = doAbstract(doc, url);
+ newItem.websiteType = "Newspaper";
+ newItem.attachments.push({url:url, title:"Stuff.co.nz Snapshot", mimeType:"text/html"});
+ newItem.complete();
+ }
+
+
+
+ /* ======================Newspaper Article========================*/
+
+ else if (detectWeb(doc, url) =="newspaperArticle"){
+
+ var newItem = new Zotero.Item('newspaperArticle');
+ newItem.url = doc.location.href;
+ //newItem.title = "No Title Found";
+
+ //Get extended publisher if there is any then replace with stuff.co.nz
+ var myPublisher = '//span[@class="storycredit"]';
+
+ var myPublisherObject = doc.evaluate(myPublisher , doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (myPublisherObject) {
+ var realPublisher = myPublisherObject.textContent;
+ if (realPublisher.match(/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g)){
+ realPublisher = realPublisher.replace (/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g, '').replace(/^\s*|\s*$/g, '');
+ newItem.publicationTitle = realPublisher;
+ } else {
+ newItem.publicationTitle = "Stuff.co.nz";
+ }
+
+ } else {
+ newItem.publicationTitle = "Stuff.co.nz";
+ }
+
+ newItem.language = "English";
+
+ //Short Title
+ newItem.shortTitle = doShortTitle(doc,url);
+
+
+ //get Abstract
+ newItem.abstractNote = doAbstract(doc, url);
+ var authorXPath = '//span[@class="storycredit"]';
+
+ var authorXPathObject = doc.evaluate(authorXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (authorXPathObject){
+ var authorArray = new Array("NZPA", "The Press", "The Dominion Post");
+ authorXPathObject = authorXPathObject.textContent;
+ var authorTemp; // declared locally so the assignments below do not create an implicit global
+ if(authorXPathObject.match(/[\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*|^\s+\bBy\s*/g)){
+ authorXPathObject = authorXPathObject.replace(/([\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*)|\b\.co\.nz|\b\.com|(-[a-zA-Z0-9]*)/g, '');
+ var authorString = authorXPathObject.replace(/^\s+\bBy\s*|^\s+\bBY\s*/g, '');
+ // Several authors joined by "and" become an array; a single author stays a lower-cased string.
+ if (authorString.match(/\W\band\W+/g)){
+ authorTemp = authorString.replace(/\W\band\W+/g, ', ');
+ authorArray = authorTemp.split(", ");
+
+ } else if (!authorString.match(/\W\band\W+/g))
+ {
+ authorArray = authorString.toLowerCase();
+ }
+ if( authorArray instanceof Array ) {
+ for (var a = 0; a < authorArray.length; a++){
+ // Reset the accumulator so each author gets only their own name.
+ fullName = "";
+ splitIntoArray = authorArray[a].split (" ");
+ for (var i = 0; i < splitIntoArray.length; i++){
+ firstName = splitIntoArray[i].substring(0,1).toUpperCase();
+ lastName = splitIntoArray[i].substring(1).toLowerCase();
+ fullName += firstName + lastName + emptyString;
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName, "author"));
+
+ }
+
+ } else {
+
+ // "Name of Organisation": keep only the part before " of ".
+ if (authorString.match(/\W\bof\W+/g)){
+ authorTemp = authorString.replace (/\W\bof\W(.*)/g, '');
+ splitIntoArray = authorTemp.split (" ");
+ for (var i = 0; i < splitIntoArray.length; i++){
+ firstName = splitIntoArray[i].substring(0,1).toUpperCase();
+ lastName = splitIntoArray[i].substring(1).toLowerCase();
+ fullName += firstName + lastName + emptyString;
+
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName, "author"));
+
+
+ } else {
+
+ splitIntoArray = authorArray.split (" ");
+ for (var i = 0; i < splitIntoArray.length; i++){
+ firstName = splitIntoArray[i].substring(0,1).toUpperCase();
+ lastName = splitIntoArray[i].substring(1).toLowerCase();
+ fullName += firstName+ lastName + emptyString;
+
+
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName, "author"));
+ }
+
+ }
+ } else {
+
+ if(authorXPathObject.match(/[\s\n\r]+/g)){
+
+ authorXPathObject = authorXPathObject.replace(/^\s*|\s*$/g, '').replace(/\s+/g, '-');
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(authorXPathObject, "author"));
+ }
+ else { newItem.creators.push(Zotero.Utilities.cleanAuthor(authorXPathObject , "author"));}
+
+ }
+
+ } else{
+ // No byline element: leave newItem.creators as the empty array.
+ }
+
+ //Title of the Article
+ newItem.title= doTitle(doc, url);
+
+
+ //Section of the Article
+
+ var current = '//li/a[@class="current"]';
+ var currentObject = doc.evaluate(current, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (currentObject){
+ currentObject = currentObject.textContent;
+
+ var articleSection = '//li[@class="mid_nav_item"]/a';
+ var articleSectionObject = doc.evaluate(articleSection , doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (articleSectionObject){
+ articleSectionObject = articleSectionObject .textContent;
+ switch (articleSectionObject){
+ case "National":
+ case "Business":
+ case "Sport":
+ case "Politics":
+ newItem.place= "New Zealand";
+ newItem.section = currentObject;
+ break;
+
+ case "World":
+ newItem.place= "World";
+ newItem.section = currentObject; break;
+
+ default:
+ newItem.section = articleSectionObject;break;
+ }
+ }
+ var SectionType = '//li[@class="current_nav_item"]/a';
+ var SectionTypeObject = doc.evaluate(SectionType, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (SectionTypeObject){
+
+ SectionTypeObject = SectionTypeObject.textContent;
+ switch (SectionTypeObject) {
+ case "National":
+ case "Crime":
+ case "Education":
+ case "Health":
+ case "Politics":
+ case "Environment":
+ case "Business":
+
+ newItem.place= "New Zealand";
+ newItem.section = currentObject; break;
+
+ case "Opinion":
+ case "Rugby":
+ case "Soccer":
+ case "Cricket":
+ case "Basketball":
+ case "Fishing":
+ case "League":
+ case "Scoreboard":
+ case "Football":
+ case "Golf":
+ case "Motorsport":
+ case "Netball":
+ case "Tennis":
+
+ newItem.section ="Sport"; break;
+ default:
+ newItem.section = SectionTypeObject; break;
+ }
+ }
+ }
+ else {
+ var SectionType = '//li[@class="current_nav_item"]/a';
+ var SectionTypeObject = doc.evaluate(SectionType, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (SectionTypeObject){
+
+ SectionTypeObject = SectionTypeObject.textContent;
+
+ switch (SectionTypeObject) {
+ case "National":
+ case "Crime":
+ case "Education":
+ case "Health":
+ case "Politics":
+ case "Environment":
+ case "Business":
+ newItem.place= "New Zealand";
+ newItem.section = SectionTypeObject; break;
+
+ default:
+ newItem.section =SectionTypeObject; break;
+ }
+
+ }
+ }
+ //Snapshot of the web page.
+ newItem.attachments.push({url:url, title:"Stuff.co.nz Snapshot",
+ mimeType:"text/html"});
+
+ //Call the doDate function to keep scrape cleaner. This way things are easier to follow.
+ newItem.date = doDate(doc,url);
+ newItem.complete();
+
}
- newItem.complete();
+
}
-function doWeb(doc, url) {
- var URLS = new Array();
+
+function doShortTitle(doc, url){
- //multiple
- if ((url.indexOf("search-results") != -1) || (url.indexOf("blogs/blogs/") != -1)) {
- if (url.indexOf("search-results") != -1) {
- var xpath = '//div[@id="leftcol_story"]/p/a';
- } else if (url.indexOf("blogs/blogs/") != -1) {
- var xpath = '//h2[@class="storytitle"]/a';
- }
-
- var items = new Object();
- var titles = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
- var newTitle = titles.iterateNext();
- while (newTitle) {
- items[newTitle.href] = newTitle.textContent;
- newTitle = titles.iterateNext();
- }
-
- items = Zotero.selectItems(items);
-
- for (var i in items) {
- URLS.push(i);
- }
+    // Short title = trimmed text of the first <h2> directly under #left_col; "" when there is none.
+    var ns = doc.documentElement.namespaceURI;
+    var resolver = null;
+    if (ns) {
+        resolver = function(prefix) { return (prefix == 'x') ? ns : null; };
+    }
+    var heading = doc.evaluate('//div[@id="left_col"]/h2', doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+    // Guard first: without a sub-heading there is nothing to trim.
+    if (!heading) {
+        return "";
} else {
- URLS.push(url);
+        return heading.textContent.replace(/^\s*|\s*$/g, '');
}
- Zotero.Utilities.processDocuments(URLS, scrape, function() {Zotero.done();});
+}
+
+function doAbstract(doc, url){
+    // Abstract = content of the page's <meta name="description"> element; "" when the element is absent.
+    var ns = doc.documentElement.namespaceURI;
+    var resolver = null;
+    if (ns) {
+        resolver = function(prefix) {
+            return (prefix == 'x') ? ns : null;
+        };
+    }
+
+    var metaXPath = "//meta[@name='description']";
+    var metaNode = doc.evaluate(metaXPath, doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+    if (!metaNode) {
+        return "";
+    }
+    return metaNode.content;
+}
+
+function doTitle(doc, url){
+    // Headline = trimmed text of the first <h1> directly under #left_col; "" when it is missing.
+    var ns = doc.documentElement.namespaceURI;
+    var resolver = null;
+    if (ns) {
+        resolver = function(prefix) {
+            return (prefix == 'x') ? ns : null;
+        };
+    }
+    var headline = doc.evaluate('//div[@id="left_col"]/h1', doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
+    if (!headline) {
+        return "";
+    }
+    return headline.textContent.replace(/^\s*|\s*$/g, '');
+}
+
+function doDate(doc, url){
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
+ var dateXpath = "//div[@id='toolbox']/div[3]";
+ var dateXpathObject = doc.evaluate(dateXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ try {
+ if (dateXpathObject){
+ var storeDateValue = dateXpathObject.textContent.replace(/\b(Last updated )\d{0,9}:\d{0,9} /g,'');
+
+ var ArrayDate = storeDateValue.split('/');
+ var emptyString = " ";
+ var comma = ", ";
+ var DateString;
+ var ArrayMonth = new Array("Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", "Aug", "Sep", "Oct", "Nov", "Dec");
+ var ArrayNumber = new Array("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12");
+ for (var i=0; i