Adding new translators from Frank.
parent ba381943e1
commit 88d987c6ff
2 changed files with 281 additions and 0 deletions
160
translators/Japan Times Online.js
Normal file
@@ -0,0 +1,160 @@
{
    "translatorID":"b56d756e-934e-4b46-bc58-d61dccc9f32f",
    "translatorType":4,
    "label":"Japan Times Online",
    "creator":"Frank Bennett",
    "target":"^http://(?:www|search)\\.japantimes\\.co\\.jp/(?:cgi-bin|gsearch|features|entertainment|sports|life|news)",
    "minVersion":"2.0b7",
    "maxVersion":"",
    "priority":100,
    "inRepository":true,
    "lastUpdated":"2009-01-23 02:17:09"
}

// #################################
// #### Local utility functions ####
// #################################

// Individual article pages live under /cgi-bin/ (e.g. nn20090123a1.html).
var itemRe = new RegExp('^http://search\\.japantimes\\.co\\.jp/cgi-bin/[a-z]{2}[0-9]{8}[a-z0-9]{2}\\.html');

// Build an XPath namespace resolver for XHTML documents.
var getResolver = function (doc) {
    var namespace, resolver;
    namespace = doc.documentElement.namespaceURI;
    if (namespace) {
        resolver = function(prefix) {
            if (prefix == 'x') {
                return namespace;
            } else {
                return null;
            }
        };
    } else {
        resolver = null;
    }
    return resolver;
};

// Return the text content of the first tag in txt whose attribute carries
// the given value, or false if no such tag is found.
var getTagContent = function (txt, attribute, value) {
    var ret, m, rex;
    ret = false;
    rex = RegExp("<[^>]*" + attribute + "=\"" + value + "\"[^>]*>([^<]*)<");
    m = rex.exec(txt);
    if (m) {
        ret = m[1];
    }
    return ret;
};

// Map article URLs to their cleaned link text for every <tag> in the raw
// HTML whose attribute value matches itemRe.
var getTagsWithAttributeAndContent = function (txt, tag, attribute) {
    var ret, pos, len, lst, m, tagsrex, attribrex;
    ret = {};
    tagsrex = RegExp("(<" + tag + "(?: [^>]*>|>)|</" + tag + ">)");
    attribrex = RegExp(' ' + attribute + '="([^"]+)"');
    lst = txt.split(tagsrex);
    if (lst.length > 1) {
        len = lst.length;
        for (pos = 1; pos < len; pos += 4) {
            if (pos < (len - 2) && lst[pos + 2] == ("</" + tag + ">")) {
                m = lst[pos].match(attribrex);
                if (m) {
                    if (!itemRe.exec(m[1])) {
                        continue;
                    }
                    var title = lst[pos + 1];
                    title = title.replace(/\|.*/, "").replace(/<[^>]+>/g, "");
                    ret[m[1]] = Zotero.Utilities.unescapeHTML(title);
                }
            }
        }
    }
    return ret;
};

// #########################
// ##### API functions #####
// #########################

var detectWeb = function (doc, url) {
    if (itemRe.test(doc.location.href)) {
        return "newspaperArticle";
    } else {
        return "multiple";
    }
};

var doWeb = function (doc, url) {
    var type, nsResolver, availableItems, xpath, found, nodes, headline, pos, myurl, m, items;
    nsResolver = getResolver(doc);
    type = detectWeb(doc, url);
    if (type === "multiple") {
        availableItems = {};
        if (url.match(/\/gsearch\//)) {
            //
            // For Google SafeSearch. Thanks, guys, it was an entertaining afternoon.
            //
            xpath = '//iframe[@name="googleSearchFrame"]';
            var iframe = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
            var address = iframe.src;
            var page = Zotero.Utilities.retrieveSource(address);
            availableItems = getTagsWithAttributeAndContent(page, "a", "href");
        } else {
            xpath = '//a[contains(@href, "cgi-bin")]';
            nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
            found = nodes.iterateNext();
            while (found) {
                if (!itemRe.test(found.href)) {
                    found = nodes.iterateNext();
                    continue;
                }
                headline = found.text;
                //
                // Some headlines have a weird structure that yields two
                // entries, the second of which is blank. Nothing is lost
                // by this construct.
                //
                if (!headline.replace(/\n/g, "")) {
                    found = nodes.iterateNext();
                    continue;
                }
                headline = headline.replace(/\u00a0/g, " ").replace(/\n/g, " ");
                headline = headline.replace(/^\s+/, "").replace(/\s+$/, "").replace(/\s+/g, " ");
                availableItems[found.href] = headline;
                found = nodes.iterateNext();
            }
        }
        // __count__ is a Mozilla-only shortcut for the number of entries.
        if (availableItems.__count__) {
            items = Zotero.selectItems(availableItems);
            for (myurl in items) {
                if (items.hasOwnProperty(myurl)) {
                    scrapeAndParse(myurl);
                }
            }
        }
    } else if (type === "newspaperArticle") {
        scrapeAndParse(url);
    }
};

// ############################
// ##### Scraper function #####
// ############################

var scrapeAndParse = function (url) {
    var item, mytxt, m, val;
    item = new Zotero.Item("newspaperArticle");

    mytxt = Zotero.Utilities.retrieveSource(url);

    item.publicationTitle = "Japan Times Online";
    item.url = url;
    val = getTagContent(mytxt, "id", "date");
    if (val) {
        item.date = val;
    }
    val = getTagContent(mytxt, "id", "headline");
    if (val) {
        item.title = val;
    }
    item.attachments.push({title:"Japan Times Online snapshot", mimeType:"text/html", url:url});
    item.complete();
};
121
translators/Mainichi Daily News.js
Normal file
@@ -0,0 +1,121 @@
{
    "translatorID":"b56f856e-934e-4b46-bc58-d61dccc9f32f",
    "translatorType":4,
    "label":"Mainichi Daily News",
    "creator":"Frank Bennett",
    "target":"^http://(?:search\\.)*mdn\\.mainichi\\.jp/(?:$|result\\?|mdnnews/|perspectives/|features/|arts/|travel/)",
    "minVersion":"2.0b7",
    "maxVersion":"",
    "priority":100,
    "inRepository":true,
    "lastUpdated":"2009-01-23 02:17:09"
}

// #################################
// #### Local utility functions ####
// #################################

// Individual article filenames carry a YYYYMMDD date, captured as m[1].
var itemRe = new RegExp('.*/([0-9]{8})[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{2}[a-z]{1}[0-9]{1}[a-z]{2}[0-9]{6}c\\.html');

// Build an XPath namespace resolver for XHTML documents.
var getResolver = function (doc) {
    var namespace, resolver;
    namespace = doc.documentElement.namespaceURI;
    if (namespace) {
        resolver = function(prefix) {
            if (prefix == 'x') {
                return namespace;
            } else {
                return null;
            }
        };
    } else {
        resolver = null;
    }
    return resolver;
};

// Normalize whitespace, drop anything after a "|", strip markup, and unescape HTML entities.
var cleanUp = function (str) {
    var ret;
    ret = str.replace(/\u00a0/g, " ").replace(/\n/g, " ");
    ret = ret.replace(/^\s+/, "").replace(/\s+$/, "").replace(/\s+/g, " ");
    ret = ret.replace(/\|.*/, "").replace(/<[^>]+>/g, "");
    ret = Zotero.Utilities.unescapeHTML(ret);
    return ret;
};

// #########################
// ##### API functions #####
// #########################

var detectWeb = function (doc, url) {
    if (itemRe.test(doc.location.href)) {
        return "newspaperArticle";
    } else {
        return "multiple";
    }
};

var doWeb = function (doc, url) {
    var type, nsResolver, availableItems, xpath, found, nodes, headline, pos, myurl, m, items, title;
    nsResolver = getResolver(doc);
    type = detectWeb(doc, url);
    if (type === "multiple") {
        availableItems = {};
        if (url.match(/^http:\/\/search\.mdn\.mainichi\.jp\/result\?/)) {
            xpath = '//div[@class="ResultTitle"]/a[contains(@href, "mdn.mainichi.jp")]';
        } else {
            xpath = '//h2[@class="NewsTitle"]/a[@href]|//ul[@class="Mark"]/li/a[@href]';
        }
        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
        found = nodes.iterateNext();
        while (found) {
            if (!itemRe.test(found.href)) {
                found = nodes.iterateNext();
                continue;
            }
            headline = found.textContent;
            headline = cleanUp(headline);
            availableItems[found.href] = headline;
            found = nodes.iterateNext();
        }
        // __count__ is a Mozilla-only shortcut for the number of entries.
        if (availableItems.__count__) {
            items = Zotero.selectItems(availableItems);
            for (myurl in items) {
                if (items.hasOwnProperty(myurl)) {
                    scrapeAndParse(myurl, availableItems[myurl]);
                }
            }
        }
    } else if (type === "newspaperArticle") {
        xpath = '//h2[@class="NewsTitle"]';
        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
        title = nodes.iterateNext();
        if (title) {
            title = cleanUp(title.textContent);
            scrapeAndParse(url, title);
        }
    }
};

// ############################
// ##### Scraper function #####
// ############################

var scrapeAndParse = function (url, title) {
    var item, mytxt, m, val;
    item = new Zotero.Item("newspaperArticle");
    item.title = title;
    item.publicationTitle = "Mainichi Daily News";
    item.edition = "online edition";
    item.url = url;
    m = itemRe.exec(url);
    if (m) {
        var year = m[1].slice(0,4);
        var month = m[1].slice(4,6);
        var day = m[1].slice(6,8);
        item.date = [year, month, day].join("-");
    }
    item.attachments.push({title:"Mainichi Daily News snapshot", mimeType:"text/html", url:url});
    item.complete();
};