diff --git a/translators/Japan Times Online.js b/translators/Japan Times Online.js
new file mode 100644
index 0000000000..8f7eab29c8
--- /dev/null
+++ b/translators/Japan Times Online.js
@@ -0,0 +1,193 @@
+{
+    "translatorID":"b56d756e-934e-4b46-bc58-d61dccc9f32f",
+    "translatorType":4,
+    "label":"Japan Times Online",
+    "creator":"Frank Bennett",
+    "target":"^http://(?:www|search)\\.japantimes\\.co\\.jp/(?:cgi-bin|gsearch|features|entertainment|sports|life|news)",
+    "minVersion":"2.0b7",
+    "maxVersion":"",
+    "priority":100,
+    "inRepository":true,
+    "lastUpdated":"2009-01-23 02:17:09"
+}
+
+// #################################
+// #### Local utility functions ####
+// #################################
+
+// Matches the URL of a single article page: two lowercase letters, eight
+// digits and two alphanumerics before ".html" under /cgi-bin/ on the search
+// host. Written as a regex literal so that "\." really escapes the dot
+// (inside a quoted string the backslash is dropped and "." matches anything).
+var itemRe = /^http:\/\/search\.japantimes\.co\.jp\/cgi-bin\/[a-z]{2}[0-9]{8}[a-z0-9]{2}\.html/;
+
+// Build a namespace resolver for XPath evaluation against doc.
+// Returns a function mapping the prefix "x" to the document element's
+// namespace URI, or null when the document has no namespace.
+var getResolver = function (doc) {
+    var namespace, resolver;
+    namespace = doc.documentElement.namespaceURI;
+    if (namespace) {
+        resolver = function (prefix) {
+            if (prefix == 'x') {
+                return namespace;
+            } else {
+                return null;
+            }
+        };
+    } else {
+        resolver = null;
+    }
+    return resolver;
+};
+
+// Return true if obj has at least one own enumerable property.
+// Portable stand-in for the non-standard __count__ property.
+var hasItems = function (obj) {
+    var key;
+    for (key in obj) {
+        if (obj.hasOwnProperty(key)) {
+            return true;
+        }
+    }
+    return false;
+};
+
+// Find the first tag in txt carrying attribute="value" and return the text
+// that follows it up to the next "<", or false when no such tag is found.
+// attribute and value are inserted into the pattern unescaped.
+var getTagContent = function (txt, attribute, value) {
+    var ret, m, rex;
+    ret = false;
+    rex = RegExp("<[^>]*" + attribute + "=\"" + value + "\"[^>]*>([^<]*)<");
+    m = rex.exec(txt);
+    if (m) {
+        ret = m[1];
+    }
+    return ret;
+};
+
+// Scan raw HTML in txt for <tag ...>content</tag> pairs and return an object
+// mapping each attribute value that looks like an article URL (see itemRe)
+// to the cleaned-up content of that tag.
+var getTagsWithAttributeAndContent = function (txt, tag, attribute) {
+    var ret, pos, len, lst, m, title, closeTag, tagsrex, attribrex;
+    ret = {};
+    closeTag = "</" + tag + ">";
+    // Splitting on a capturing group keeps the delimiters, so the list reads
+    // [text, open tag, content, close tag, text, open tag, ...]: open tags
+    // sit at positions 1, 5, 9, ... which is why the loop advances by 4.
+    tagsrex = RegExp("(<" + tag + "(?: [^>]*>|>)|" + closeTag + ")");
+    attribrex = RegExp(' ' + attribute + '="([^"]+)"');
+    lst = txt.split(tagsrex);
+    len = lst.length;
+    for (pos = 1; pos < len; pos += 4) {
+        // Only accept an open tag that is closed right after its content.
+        if (pos >= (len - 2) || lst[pos + 2] != closeTag) {
+            continue;
+        }
+        m = lst[pos].match(attribrex);
+        if (!m || !itemRe.test(m[1])) {
+            continue;
+        }
+        // Drop everything from the first "|" to the end of its line, then strip nested markup.
+        title = lst[pos + 1].replace(/\|.*/, "").replace(/<[^>]+>/g, "");
+        ret[m[1]] = Zotero.Utilities.unescapeHTML(title);
+    }
+    return ret;
+};
+
+// #########################
+// ##### API functions #####
+// #########################
+
+// Report "newspaperArticle" when the page URL matches itemRe, and
+// "multiple" for any other page.
+var detectWeb = function (doc, url) {
+    if (itemRe.test(doc.location.href)) {
+        return "newspaperArticle";
+    } else {
+        return "multiple";
+    }
+};
+
+// Entry point: save the current article, or on any other page collect the
+// article links, let the user choose, and save each chosen article.
+var doWeb = function (doc, url) {
+    var type, nsResolver, availableItems, xpath, found, nodes, headline, myurl, items, iframe;
+    nsResolver = getResolver(doc);
+    type = detectWeb(doc, url);
+    if (type === "multiple") {
+        availableItems = {};
+        if (url.match(/\/gsearch\//)) {
+            //
+            // For Google SafeSearch. Thanks, guys, it was an entertaining afternoon.
+            //
+            // Links are read from the googleSearchFrame iframe: fetch its
+            // source and scan the raw HTML for article links.
+            xpath = '//iframe[@name="googleSearchFrame"]';
+            iframe = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+            // No such frame on this page: nothing to offer.
+            if (iframe) {
+                availableItems = getTagsWithAttributeAndContent(Zotero.Utilities.retrieveSource(iframe.src), "a", "href");
+            }
+        } else {
+            xpath = '//a[contains(@href, "cgi-bin")]';
+            nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+            for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
+                if (!itemRe.test(found.href)) {
+                    continue;
+                }
+                //
+                // Some headlines have a weird structure that yields two
+                // entries, the second of which is blank. Nothing is lost
+                // by this construct.
+                //
+                // Collapse non-breaking spaces, newlines and runs of
+                // whitespace, then skip links left with no text.
+                headline = found.text.replace(/[\u00a0\n]/g, " ");
+                headline = headline.replace(/^\s+/, "").replace(/\s+$/, "").replace(/\s+/g, " ");
+                if (!headline) {
+                    continue;
+                }
+                availableItems[found.href] = headline;
+            }
+        }
+        if (hasItems(availableItems)) {
+            items = Zotero.selectItems(availableItems);
+            for (myurl in items) {
+                if (items.hasOwnProperty(myurl)) {
+                    scrapeAndParse(myurl);
+                }
+            }
+        }
+    } else if (type === "newspaperArticle") {
+        scrapeAndParse(url);
+    }
+};
+
+// ############################
+// ##### Scraper function #####
+// ############################
+
+// Fetch the article at url and save it as a newspaperArticle item, taking
+// the date and title from the tags with id "date" and id "headline".
+var scrapeAndParse = function (url) {
+    var item, mytxt, val;
+    item = new Zotero.Item("newspaperArticle");
+
+    mytxt = Zotero.Utilities.retrieveSource(url);
+
+    item.publicationTitle = "Japan Times Online";
+    item.url = url;
+    val = getTagContent(mytxt, "id", "date");
+    if (val) {
+        item.date = val;
+    }
+    val = getTagContent(mytxt, "id", "headline");
+    if (val) {
+        item.title = val;
+    }
+    item.attachments.push({title:"Japan Times Online snapshot", mimeType:"text/html", url:url});
+    item.complete();
+};
diff --git a/translators/Mainichi Daily News.js b/translators/Mainichi Daily News.js
new file mode 100644
index 0000000000..8c6f7441ed
--- /dev/null
+++ b/translators/Mainichi Daily News.js
@@ -0,0 +1,144 @@
+{
+    "translatorID":"b56f856e-934e-4b46-bc58-d61dccc9f32f",
+    "translatorType":4,
+    "label":"Mainichi Daily News",
+    "creator":"Frank Bennett",
+    "target":"^http://(?:search\\.)*mdn\\.mainichi\\.jp/(?:$|result\\?|mdnnews/|perspectives/|features/|arts/|travel/)",
+    "minVersion":"2.0b7",
+    "maxVersion":"",
+    "priority":100,
+    "inRepository":true,
+    "lastUpdated":"2009-01-23 02:17:09"
+}
+
+// #################################
+// #### Local utility functions ####
+// #################################
+
+// Matches an article URL and captures its leading eight digits, which
+// scrapeAndParse reads as YYYYMMDD. Written as a regex literal so that "\."
+// really escapes the dot.
+var itemRe = /.*\/([0-9]{8})[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{2}[a-z]{1}[0-9]{1}[a-z]{2}[0-9]{6}c\.html/;
+
+// Build a namespace resolver for XPath evaluation against doc.
+// Returns a function mapping the prefix "x" to the document element's
+// namespace URI, or null when the document has no namespace.
+var getResolver = function (doc) {
+    var namespace, resolver;
+    namespace = doc.documentElement.namespaceURI;
+    if (namespace) {
+        resolver = function (prefix) {
+            if (prefix == 'x') {
+                return namespace;
+            } else {
+                return null;
+            }
+        };
+    } else {
+        resolver = null;
+    }
+    return resolver;
+};
+
+// Return true if obj has at least one own enumerable property.
+// Portable stand-in for the non-standard __count__ property.
+var hasItems = function (obj) {
+    var key;
+    for (key in obj) {
+        if (obj.hasOwnProperty(key)) {
+            return true;
+        }
+    }
+    return false;
+};
+
+// Normalise a headline: turn non-breaking spaces and newlines into spaces,
+// trim and collapse whitespace, cut everything from the first "|" on, strip
+// markup tags, and pass the result through Zotero.Utilities.unescapeHTML.
+var cleanUp = function (str) {
+    var ret;
+    ret = str.replace(/[\u00a0\n]/g, " ");
+    ret = ret.replace(/^\s+/, "").replace(/\s+$/, "").replace(/\s+/g, " ");
+    ret = ret.replace(/\|.*/, "").replace(/<[^>]+>/g, "");
+    ret = Zotero.Utilities.unescapeHTML(ret);
+    return ret;
+};
+
+
+// #########################
+// ##### API functions #####
+// #########################
+
+// Report "newspaperArticle" when the page URL matches itemRe, and
+// "multiple" for any other page.
+var detectWeb = function (doc, url) {
+    if (itemRe.test(doc.location.href)) {
+        return "newspaperArticle";
+    } else {
+        return "multiple";
+    }
+};
+
+// Entry point: save the current article, or on any other page collect the
+// article links, let the user choose, and save each chosen article.
+var doWeb = function (doc, url) {
+    var type, nsResolver, availableItems, xpath, found, nodes, myurl, items, title;
+    nsResolver = getResolver(doc);
+    type = detectWeb(doc, url);
+    if (type === "multiple") {
+        availableItems = {};
+        // Search result pages use a different link XPath from other pages.
+        if (url.match(/^http:\/\/search\.mdn\.mainichi\.jp\/result\?/)) {
+            xpath = '//div[@class="ResultTitle"]/a[contains(@href, "mdn.mainichi.jp")]';
+        } else {
+            xpath = '//h2[@class="NewsTitle"]/a[@href]|//ul[@class="Mark"]/li/a[@href]';
+        }
+        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+        for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
+            // Keep only links whose href matches the article URL pattern.
+            if (itemRe.test(found.href)) {
+                availableItems[found.href] = cleanUp(found.textContent);
+            }
+        }
+        if (hasItems(availableItems)) {
+            items = Zotero.selectItems(availableItems);
+            for (myurl in items) {
+                if (items.hasOwnProperty(myurl)) {
+                    scrapeAndParse(myurl, availableItems[myurl]);
+                }
+            }
+        }
+    } else if (type === "newspaperArticle") {
+        // The title is taken from the first h2 with class "NewsTitle";
+        // nothing is saved when that element is missing.
+        xpath = '//h2[@class="NewsTitle"]';
+        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+        title = nodes.iterateNext();
+        if (title) {
+            title = cleanUp(title.textContent);
+            scrapeAndParse(url, title);
+        }
+    }
+};
+
+// ############################
+// ##### Scraper function #####
+// ############################
+
+// Save a newspaperArticle item for url with the given title. The date is
+// derived from the eight digits captured by itemRe; no page is fetched.
+var scrapeAndParse = function (url, title) {
+    var item, m;
+    item = new Zotero.Item("newspaperArticle");
+    item.title = title;
+    item.publicationTitle = "Mainichi Daily News";
+    item.edition = "online edition";
+    item.url = url;
+    m = itemRe.exec(url);
+    if (m) {
+        // m[1] is treated as YYYYMMDD and reformatted as YYYY-MM-DD.
+        item.date = [m[1].slice(0, 4), m[1].slice(4, 6), m[1].slice(6, 8)].join("-");
+    }
+    item.attachments.push({title:"Mainichi Daily News snapshot", mimeType:"text/html", url:url});
+    item.complete();
+};