New translator for NZ's Papers Past, by staplegun

This commit is contained in:
Avram Lyon 2010-09-14 15:06:21 +00:00
parent 536f1ee24e
commit 2f64ff78ea

171
translators/Papers Past.js Normal file
View file

@ -0,0 +1,171 @@
{
"translatorID":"1b052690-16dd-431d-9828-9dc675eb55f6",
"label":"Papers Past",
"creator":"staplegun",
"target":"^http://paperspast\\.natlib\\.govt\\.nz",
"minVersion":"1.0",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2010-09-14 19:04:32"
}
/*
Papers Past Translator - Parses historic digitised newspaper articles and creates Zotero-based metadata
Copyright (C) 2010 staplegun
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
function detectWeb(doc, url) {
// a results parameter in URL means search hitlist
if (url.match(/results=/) ) {
return "multiple";
} else {
// init variables
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == "x" ) return namespace; else return null;
} : null;
var myXPath;
var myXPathObject;
// publication title in meta tags means have an article
myXPath = '//meta[@name="newsarticle_publication"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var meta = myXPathObject.iterateNext().textContent;
if (meta.length > 0) {
return "newspaperArticle";
}
}
}
function doWeb(doc, url) {
// init variables
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == "x" ) return namespace; else return null;
} : null;
// hitlist page: compile hitlist titles, user selects which are wanted
// (add &zto=1 to URL for usage tracking)
var articles = new Array();
if (detectWeb(doc, url) == "multiple") {
var titlesXPath = '//div[@class="search-results"]/p/a';
var titles = doc.evaluate(titlesXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var nextTitle;
var items = new Array();
while (nextTitle = titles.iterateNext()) {
items[nextTitle.href+"&zto=1"] = nextTitle.textContent;
}
// presented to user - who reduces list to those selected
items = Zotero.selectItems(items);
// transfer this list to articles array
for (var i in items) {
articles.push(i);
}
// article page: just continue with single (current) page URL
} else {
articles = [url+"&zto=1"];
}
// process each selected article page URL
Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});
Zotero.wait();
}
function scrape(doc) {
// init variables
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == "x" ) return namespace; else return null;
} : null;
var myXPath;
var myXPathObject;
// basic item details
var newItem = new Zotero.Item('newspaperArticle');
newItem.url = doc.location.href;
newItem.archive = 'Papers Past';
// publication title
myXPath = '//meta[@name="newsarticle_publication"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
newItem.publicationTitle = myXPathObject.iterateNext().textContent;
Zotero.debug(newItem.publicationTitle);
// article title (convert to sentence case)
// NB: THE CONVERSION SEEMS TO FAIL IF HAS SPECIAL CHARS
myXPath = '//meta[@name="newsarticle_headline"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var title = myXPathObject.iterateNext().textContent;
var words = title.split(/\s/);
var titleFixed = '';
for (var i in words) {
words[i] = words[i][0].toUpperCase() + words[i].substr(1).toLowerCase();
titleFixed = titleFixed + words[i] + ' ';
}
titleFixed = Zotero.Utilities.trim(titleFixed);
newItem.title = titleFixed;
// publication date (is preformatted to ISO 8601)
myXPath = '//meta[@name="dc_date"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
newItem.date = myXPathObject.iterateNext().textContent;
// pagination
myXPath = '//meta[@name="newsarticle_firstpage"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var pages = myXPathObject.iterateNext().textContent;
myXPath = '//meta[@name="newsarticle_otherpages"]/@content';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
pages = pages + ' ' + myXPathObject.iterateNext().textContent;
newItem.pages = Zotero.Utilities.trim(pages);
// save copy of entire web page as attachment
var attachments = new Array();
attachments.push({
title:titleFixed + " : Article webpage",
mimeType:"text/html",
url:doc.location.href
});
// find image scans and add as attachments
myXPath = '//img[@class="veridianimage"]/@src';
myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var imgSrc;
var imgUrl;
var imgNo = 0;
while (imgSrc = myXPathObject.iterateNext() ) {
imgUrl = "http://paperspast.natlib.govt.nz" + imgSrc.textContent;
attachments.push({
title: titleFixed + " : Scan image part " + ++imgNo,
mimeType: "image/gif",
url: imgUrl
});
}
newItem.attachments = attachments;
// finish
newItem.complete();
}