New translator for NZ's Papers Past, by staplegun

2010-09-14 15:06:21 +00:00 · 2010-09-14 15:06:21 +00:00 · 2f64ff78ea
commit 2f64ff78ea
parent 536f1ee24e
1 changed files with 171 additions and 0 deletions
--- a/translators/Papers
+++ b/translators/Papers
@@ -0,0 +1,171 @@
+{
+        "translatorID":"1b052690-16dd-431d-9828-9dc675eb55f6",
+        "label":"Papers Past",
+        "creator":"staplegun",
+        "target":"^http://paperspast\\.natlib\\.govt\\.nz",
+        "minVersion":"1.0",
+        "maxVersion":"",
+        "priority":100,
+        "inRepository":"1",
+        "translatorType":4,
+        "lastUpdated":"2010-09-14 19:04:32"
+}
+
+/*
+    Papers Past Translator - Parses historic digitised newspaper articles and creates Zotero-based metadata
+    Copyright (C) 2010 staplegun
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+function detectWeb(doc, url) {
+
+  // a results parameter in URL means search hitlist
+  if (url.match(/results=/) ) {
+    return "multiple";
+
+  } else {
+
+    // init variables
+    var namespace = doc.documentElement.namespaceURI;
+    var nsResolver = namespace ? function(prefix) {
+      if (prefix == "x" ) return namespace; else return null;
+    } : null;
+    var myXPath;
+    var myXPathObject;
+
+    // publication title in meta tags means have an article
+    myXPath          = '//meta[@name="newsarticle_publication"]/@content';
+    myXPathObject    = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+    var meta = myXPathObject.iterateNext().textContent;
+    if (meta.length > 0) {
+      return "newspaperArticle";
+    }
+  }
+}
+
+function doWeb(doc, url) {
+
+  // init variables
+  var namespace = doc.documentElement.namespaceURI;
+  var nsResolver = namespace ? function(prefix) {
+    if (prefix == "x" ) return namespace; else return null;
+  } : null;
+
+  // hitlist page: compile hitlist titles, user selects which are wanted 
+  // (add &zto=1 to URL for usage tracking)
+  var articles = new Array();
+  if (detectWeb(doc, url) == "multiple") {
+    var titlesXPath = '//div[@class="search-results"]/p/a';
+    var titles      = doc.evaluate(titlesXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+    var nextTitle;
+    var items       = new Array();
+    while (nextTitle = titles.iterateNext()) {
+      items[nextTitle.href+"&zto=1"] = nextTitle.textContent;
+    }
+    // presented to user - who reduces list to those selected
+    items = Zotero.selectItems(items);
+    // transfer this list to articles array
+    for (var i in items) {
+      articles.push(i);
+    }
+
+  // article page: just continue with single (current) page URL
+  } else {
+    articles = [url+"&zto=1"];
+  }
+
+  // process each selected article page URL
+  Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});
+  Zotero.wait();
+}
+
+function scrape(doc) {
+
+  // init variables
+  var namespace = doc.documentElement.namespaceURI;
+  var nsResolver = namespace ? function(prefix) {
+    if (prefix == "x" ) return namespace; else return null;
+  } : null;
+  var myXPath;
+  var myXPathObject;
+  
+  // basic item details
+  var newItem     = new Zotero.Item('newspaperArticle');
+  newItem.url     = doc.location.href;
+  newItem.archive = 'Papers Past';
+
+  // publication title
+  myXPath       = '//meta[@name="newsarticle_publication"]/@content';
+  myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  newItem.publicationTitle = myXPathObject.iterateNext().textContent;
+  Zotero.debug(newItem.publicationTitle);
+
+  // article title (convert to sentence case)
+  // NB: THE CONVERSION SEEMS TO FAIL IF HAS SPECIAL CHARS
+  myXPath          = '//meta[@name="newsarticle_headline"]/@content';
+  myXPathObject    = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  var title   = myXPathObject.iterateNext().textContent;
+  var words = title.split(/\s/);
+  var titleFixed = '';
+  for (var i in words) {
+   words[i] = words[i][0].toUpperCase() + words[i].substr(1).toLowerCase();
+   titleFixed = titleFixed + words[i] + ' ';
+  }
+  titleFixed = Zotero.Utilities.trim(titleFixed);
+  newItem.title = titleFixed;
+
+  // publication date (is preformatted to ISO 8601)
+  myXPath          = '//meta[@name="dc_date"]/@content';
+  myXPathObject    = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  newItem.date = myXPathObject.iterateNext().textContent;
+
+  // pagination
+  myXPath          = '//meta[@name="newsarticle_firstpage"]/@content';
+  myXPathObject    = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  var pages = myXPathObject.iterateNext().textContent;
+
+  myXPath          = '//meta[@name="newsarticle_otherpages"]/@content';
+  myXPathObject    = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  pages = pages + ' ' + myXPathObject.iterateNext().textContent;
+
+  newItem.pages = Zotero.Utilities.trim(pages);
+
+  // save copy of entire web page as attachment
+	var attachments = new Array();
+  attachments.push({
+    title:titleFixed + " : Article webpage",
+    mimeType:"text/html",
+    url:doc.location.href
+  });
+
+  // find image scans and add as attachments
+  myXPath       = '//img[@class="veridianimage"]/@src';
+  myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+  var imgSrc;
+  var imgUrl;
+  var imgNo = 0;
+  while (imgSrc = myXPathObject.iterateNext() ) {
+	  imgUrl = "http://paperspast.natlib.govt.nz" + imgSrc.textContent;
+	  attachments.push({
+		  title: titleFixed + " : Scan image part " + ++imgNo,
+		  mimeType: "image/gif",
+		  url: imgUrl
+    });
+  }
+  newItem.attachments = attachments;
+
+  // finish
+  newItem.complete();
+}