diff --git a/translators/Potsdamer Neueste Nachrichten.js b/translators/Potsdamer Neueste Nachrichten.js index a2a296f9b7..e6e322b45f 100644 --- a/translators/Potsdamer Neueste Nachrichten.js +++ b/translators/Potsdamer Neueste Nachrichten.js @@ -8,11 +8,11 @@ "priority": 100, "inRepository": "1", "translatorType": 4, - "lastUpdated": "2011-03-26 13:42:35" + "lastUpdated": "2011-05-06 11:34:44" } /* -Potsdamer Neueste Nachrichten Translator +Potsdamer Neueste Nachrichten Translator 1.1 Copyright (C) 2011 Martin Meyerhoff This program is free software: you can redistribute it and/or modify @@ -30,10 +30,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* -The articles themselves are quite badly tagged, so that the translator sometimes doesn't capture the summary or the authors. Test it with: -http://www.pnn.de/archiv/?type=archiv&phrase=Krise +http://www.pnn.de/ http://www.pnn.de/zeitung/ http://www.pnn.de/zeitung/12.01.2011/ http://www.pnn.de/titelseite/364860/ @@ -48,8 +47,8 @@ function detectWeb(doc, url) { if (prefix == 'x') return namespace; else return null; } : null; - var PNN_Article_XPath = ".//a[contains(@class, 'print')]"; //only articles have a print button. - var PNN_Multiple_XPath = ".//ul/li/h2/a" + var PNN_Article_XPath = ".//div[contains (@class, 'um-article')]/h1"; //only articles have a print button. 
+ var PNN_Multiple_XPath = "//div[contains(@class, 'um-teaser')]/h2/a" if (doc.evaluate(PNN_Article_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext() ){ Zotero.debug("newspaperArticle"); @@ -70,76 +69,47 @@ function scrape(doc, url) { newItem.url = doc.location.href; // Title - var title_XPath = '//title' + var title_XPath = "//div[contains (@class, 'um-article')]/h1" var title = doc.evaluate(title_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - title = title.split("—")[0]; // split at mdash - title = title.replace(/\„|\“/g, '"'); // standard quotation marks - title = title.replace(/|^\s*|\s*$/, ''); // remove whitespace + title = title.replace(/\s+|\n/g, ' '); newItem.title = title; // Summary - var summary_XPath = ".//p[contains(@class, 'teaser')]"; - var summary = doc.evaluate(summary_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - summary=summary.replace(/\(.*\)/, ''); // No date in the summary. - summary=Zotero.Utilities.trimInternal(summary); //remove white space - newItem.abstractNote = summary; + var summary_XPath = "//div[contains (@class, 'um-article')]/p[@class='um-first']"; + if (doc.evaluate(summary_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext() ){ + var summary = doc.evaluate(summary_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.abstractNote = summary; + } // Date - var date_XPath = "//*[contains(@class, 'teaser')]/span[contains(@class, 'date')]"; + var date_XPath = "//div[contains (@class, 'um-article')]/div[@class='um-metabar']/ul/li[contains(@class, 'um-first')]"; var date = doc.evaluate(date_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - date = date.replace(/\(|\)|^\s*|\s*$/g, ''); // remove whitespace and braces - newItem.date = date; + newItem.date = date.replace(/(\d+)\.(\d+).(\d+)/, '$3-$2-$1');; - // Authors. Tricky. Actually, horrible. 
I hope they change their site at some point and this mess can be cleaned up. - var temp = new Array(); - temp[0] = "" - var author_XPath = ".//*[@id='teaser']/p/i"; // Sometimes, the author is in italics in the paragraph. Easy Case, really. - if (doc.evaluate(author_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { + // Authors + var author_XPath = "//div[contains (@class, 'um-article')]/span[@class='um-author']"; + if (doc.evaluate(author_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext() ){ var author = doc.evaluate(author_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - temp[0] = author; - } else { - author_XPath = ".//*[@id='teaser']"; // basically, grab the entire article. no other chance. - var author = doc.evaluate(author_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - author = author.replace(/\s\s\s*/g, "|"); // replace lots of white space (indicative of a line break / paragraph) - author = author.split("|"); - - - // Zotero.debug(author); - var author_searchpattern1 = /^Von(.*)/; // These three patterns capture the majority of authors. 
- var author_searchpattern2 = /^Das\sGespräch\sführte(.*)\.$/; - var author_searchpattern3 = /^Interview\:\s(.*)Foto:.*/; - - for (var i in author) { - if (temp[0] == "") { - if (author[i].match(author_searchpattern1)) { - var temp = author[i].match(author_searchpattern1); - temp[0] = temp[0].replace(author_searchpattern1, "$1"); - } - if (author[i].match(author_searchpattern2)) { - var temp = author[i].match(author_searchpattern2); - temp[0] = temp[0].replace(author_searchpattern2, "$1"); - } - if (author[i].match(author_searchpattern3)) { - var temp = author[i].match(author_searchpattern3); - temp[0] = temp[0].replace(author_searchpattern3, "$1"); - } + author =author.replace(/^von\s|^\s*|\s*$/g, ''); + author =author.split(/\sund\s|\su\.\s|\,\s/); + for (var i in author) { + if (author[i].match(/\s/)) { // only names that contain a space! + newItem.creators.push(Zotero.Utilities.cleanAuthor(author[i], "author")); } - } - } - var realauthor = temp[0].replace(/^\s*|\s*$/g, ''); - realauthor = realauthor.split(/\sund\s|\su\.\s|\,\s/); - for (var i in realauthor) { - if (realauthor[i].match(/\s/)) { // only names that contain a space! 
- newItem.creators.push(Zotero.Utilities.cleanAuthor(realauthor[i], "author")); } } + newItem.attachments.push({url:doc.location.href, title:doc.title, mimeType:"text/html"}); newItem.publicationTitle = "Potsdamer Neueste Nachrichten" + // section - var section_XPath = ".//*[@id='sidebar-left']/ul/li[contains(@class, 'active')]"; - var section = doc.evaluate(section_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; - newItem.section = section.replace(/^\s*|\s*$/g, ''); + var section_XPath = "//div[@class='um-mainnav']/ul/li[@class='um-selected']/a"; + if (doc.evaluate(section_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext() ){ + var section = doc.evaluate(section_XPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + newItem.section = section.replace(/^\s*|\s*$/g, ''); + } + newItem.complete(); } @@ -155,19 +125,19 @@ function doWeb(doc, url) { if (detectWeb(doc, url) == "multiple") { var items = new Object(); - var titles = doc.evaluate(".//ul/li/h2/a", doc, nsResolver, XPathResult.ANY_TYPE, null); + var titles = doc.evaluate("//div[contains(@class, 'um-teaser')]/h2/a", doc, nsResolver, XPathResult.ANY_TYPE, null); var next_title; while (next_title = titles.iterateNext()) { - items[next_title.href] = next_title.textContent; + items[next_title.href] = next_title.textContent.replace(/\s+/g, ' '); } items = Zotero.selectItems(items); for (var i in items) { articles.push(i); } Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();}); - Zotero.wait(); } else { scrape(doc, url); } + Zotero.wait(); }