From e636bd9c2e8e713ebf594a4859be8bb0b6fae00b Mon Sep 17 00:00:00 2001 From: Avram Lyon Date: Thu, 7 Oct 2010 20:36:57 +0000 Subject: [PATCH] Trans: Typo fix to BnF, per Ziche; new version of CNKI by Ace Strong --- .../Bibliothèque nationale de France.js | 2 +- translators/CNKI.js | 873 ++++++++++++++---- 2 files changed, 706 insertions(+), 169 deletions(-) diff --git a/translators/Bibliothèque nationale de France.js b/translators/Bibliothèque nationale de France.js index 3c9d9e25ea..f4be131f19 100644 --- a/translators/Bibliothèque nationale de France.js +++ b/translators/Bibliothèque nationale de France.js @@ -129,7 +129,7 @@ var BnfClass = function() { //Default case "205": default: - return "contibutor"; + return "contributor"; } }; diff --git a/translators/CNKI.js b/translators/CNKI.js index e6bdf101ec..e4570927bd 100644 --- a/translators/CNKI.js +++ b/translators/CNKI.js @@ -2,13 +2,13 @@ "translatorID":"5c95b67b-41c5-4f55-b71a-48d5d7183063", "label":"CNKI", "creator":"Ace Strong and Heromyth", - "target":"^https?://(?:(?:(dlib|epub|ckrd)(?:.edu)?.cnki.net)|(?:[0-9.]+))/(?:kns50|grid2008|grid20)", + "target":"^https?://(?:(?:(dlib|epub|acad|apj1|law1)\\.cnki\\.net)|(?:[0-9\\.]+))/(?:grid2008|kns50|Kns55|kcms)", "minVersion":"2.0.b4", "maxVersion":"", "priority":100, - "inRepository":true, + "inRepository":"1", "translatorType":4, - "lastUpdated":"2010-09-26 15:08:45" + "lastUpdated":"2010-10-07 15:58:33" } /* @@ -29,189 +29,726 @@ along with this program. If not, see . */ +// ####################### +// ##### Sample URLs ##### +// ####################### + +/* + * The starting point for an search is the URL below. + * In testing, I tried the following: + * + * - A search listing of journals + * - A search listing of phd thesis + * - A search listing of master thesis + * - A search listing of conference papers + * - A search listing of newspaper articles + * - A journal paper page + * - A phd thesis page + * - A master thesis page + * - A conference paper page + * - A newspaper article page + */ +// http://epub.cnki.net/grid2008/index/ZKCALD.htm + + +// ################################# +// #### Local utility functions #### +// ################################# + +function detectCode(url) { + var pattern = /(?:dbcode|dbname)=([A-Z]{4})/i; + if (pattern.test(url)) { + var code = pattern.exec(url)[1]; + return code; + } +} + +function getResolver(doc) { + var namespace, resolver; + namespace = doc.documentElement.namespaceURI; + if (namespace) { + resolver = function(prefix) { + if (prefix == 'x') { + return namespace; + } else { + return null; + } + }; + } else { + resolver = null; + } + return resolver; +} + +function trimTags(text) { + var pattern = /(<.*?>)/g; + text = text.replace(pattern, ""); + return text; +} + +function trimMultiline(text) { + var pattern = /(\s{2,})/g; + text = text.replace(pattern, "\n"); + return text; +} + +// ############################# +// ##### Scraper functions ##### +// ############################# + +// work for journalArticle +function scrapeAndParse1(url) { +// Zotero.debug("journalArticle"); + var page = Zotero.Utilities.retrieveSource(url); + var pattern; + + // 类型 & URL + var itemType = "journalArticle"; + var newItem = new Zotero.Item(itemType); +// Zotero.debug(url); + newItem.url = url; + + // 标题 + pattern = /(.*?)<\/span>/; + if (pattern.test(page)) { + var title = trimTags(pattern.exec(page)[1]); + newItem.title = title; +// Zotero.debug("title: "+title); + } + + // 作者 + var authorNames; + pattern = /【作者】(?:[\s\S]*?)GetLinkListEx\('(.*?);','/; + if (pattern.test(page)) { + authorNames = pattern.exec(page)[1].split(";"); + } else { + pattern = /【作者】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + authorNames = trimTags(pattern.exec(page)[1]).split(";"); + } + } + if (authorNames) { + for (var i=0; i 0) { + newItem.creators.push( + Zotero.Utilities.cleanAuthor(authorNames[i], + "author", true)); + } + } +// Zotero.debug("authorNames:\n"+authorNames); + } + + // 摘要 + var abst; + pattern = /【摘要】\s*<[^>]*>(.*?)<\/span>/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } else { + pattern = /【摘要】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } + } + if (abst) { +// Zotero.debug("abstract:\n"+abst); + newItem.abstractNote = Zotero.Utilities.trim(abst); + } + pattern = /【Abstract】\s*<[^>]*>(.*?)<\/span>/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } else { + pattern = /【英文摘要】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } + } + if (abst) { +// Zotero.debug("abstract:\n"+abst); + if (newItem.abstractNote===undefined) { + newItem.abstractNote = Zotero.Utilities.trim(abst); + } else { + newItem.abstractNote = newItem.abstractNote + "\n" + + Zotero.Utilities.trim(abst); + } + } +// Zotero.debug(newItem.abstractNote); + + // 关键词 + var tags; + pattern = /【关键词】(?:[\s\S]*?)KeywordFilter\('(.*?)'\),'kw'/; + if (pattern.test(page)) { + tags = pattern.exec(page)[1].split(";"); + } else { + pattern = /【中文关键词】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } + } + if (tags) { + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } + pattern = /【Key words】(?:[\s\S]*?)GetLinkList\('(.*?)','kw'/; + if (pattern.test(page)) { + tags = pattern.exec(page)[1].split(";"); + } else { + pattern = /【英文关键词】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } + } + if (tags) { + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } + + // 文献出处 & DOI & 出版时间 + pattern = /【文献出处】([\s\S]*?)<\/a>/; + if (pattern.test(page)) { + var publicationTitle = trimTags(pattern.exec(page)[1]); + newItem.publicationTitle = Zotero.Utilities.trim(publicationTitle); +// Zotero.debug("publicationTitle: "+publicationTitle); + } + var doi; + pattern = /【DOI】(.*?)<\/li>/; + if (pattern.test(page)) { + doi= pattern.exec(page)[1]; + } else { + pattern = /【DOI】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + doi= trimTags(pattern.exec(page)[1]); + } + } + if (doi) { + newItem.DOI = Zotero.Utilities.trim(doi); +// Zotero.debug("doi: "+doi); + } + pattern = /【文献出处】(?:[\s\S]*?)(\d{4})年\s*(\d{2})(卷|期)/; + if (pattern.test(page)) { + var date = pattern.exec(page)[1]; + newItem.date = date; + var val = pattern.exec(page)[2]; + var attr = pattern.exec(page)[3]; + if (attr == "卷") { + newItem.volume = val; + } else { + newItem.issue = val; + } +// Zotero.debug("date: "+date); +// Zotero.debug("val: "+val); +// Zotero.debug("attr: "+attr); + } + + newItem.complete(); +} + +// work for thesis +function scrapeAndParse2(url) { +// Zotero.debug("thesis"); + var page = Zotero.Utilities.retrieveSource(url); + var pattern; + + // 类型 & URL + var itemType = "thesis"; + var newItem = new Zotero.Item(itemType); +// Zotero.debug(url); + newItem.url = url; + var code = detectCode(url); + if (code == "CDFD") { + newItem.thesisType = "博士论文" + } else { + newItem.thesisType = "硕士论文" + } +// Zotero.debug(newItem.thesisType); + + + // 标题 + pattern = /(.*?)<\/span>/; + if (pattern.test(page)) { + var title = pattern.exec(page)[1]; + pattern = /(<.*?>)/g; + title = title.replace(pattern, ""); + newItem.title = title; +// Zotero.debug("title: "+title); + } + + // 作者 + pattern = /【作者】([\s\S]*?)<\/a>/; + if (pattern.test(page)) { + var authorNames = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i/; + if (pattern.test(page)) { + var directors = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } + } + if (abst) { +// Zotero.debug("abstract:\n"+abst); + newItem.abstractNote = trimMultiline(abst); + } + pattern = /ReplaceFont\('EnDivSummary','(.*?)(?='\);if)/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } else { + pattern = /【英文摘要】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + abst = trimTags(pattern.exec(page)[1]); + } + } + if (abst) { +// Zotero.debug("abstract:\n"+abst); + if (newItem.abstractNote===undefined) { + newItem.abstractNote = Zotero.Utilities.trim(abst); + } else { + newItem.abstractNote = newItem.abstractNote + "\n" + + trimMultiline(abst); + } + } +// Zotero.debug(newItem.abstractNote); + + // 关键词 + var tags; + pattern = /【关键词】\s*]*>(.*?)<\/a>*<\/span>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } else { + pattern = /【关键词】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } + } + if (tags) { + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } + pattern = /【Key words】\s*]*>(.*?)<\/a>*<\/span>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } else { + pattern = /【英文关键词】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + tags = trimTags(pattern.exec(page)[1]).split(";"); + } + } + if (tags) { + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } +// Zotero.debug(newItem.tags); + + // 出版学校 & DOI & 出版时间 + var publisher; + pattern = /【网络出版投稿人】\s*]*>(.*?)<\/a>/; + if (pattern.test(page)) { + publisher = pattern.exec(page)[1]; + } else { + pattern = /【网络出版投稿人】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + publisher = Zotero.Utilities.trim( + trimTags(pattern.exec(page)[1])); + } + } + if (publisher) { + pattern = /(.*?)((.*?))/; + if (pattern.test(publisher)) { + newItem.publisher = pattern.exec(publisher)[1]; + newItem.place = pattern.exec(publisher)[2]; + } else { + newItem.publisher = publisher; + } +// Zotero.debug("publisher: "+publisher); + } + var doi; + pattern = /【DOI】(.*?)<\/li>/; + if (pattern.test(page)) { + doi= pattern.exec(page)[1]; + } else { + pattern = /【DOI】([\s\S]*?)<\/tr>/; + if (pattern.test(page)) { + var doi= trimTags(pattern.exec(page)[1]); + } + } + if (doi) { + newItem.DOI = Zotero.Utilities.trim(doi); +// Zotero.debug("doi: "+doi); + } + var date; + pattern = /【网络出版投稿时间】(.*?)\s*<\/li>/; + if (pattern.test(page)) { + date = pattern.exec(page)[1]; + } else { + pattern = /【网络出版投稿时间】([\s\S]*?)\s*<\/tr>/; + if (pattern.test(page)) { + date = trimTags(pattern.exec(page)[1]); + } + } + if (date) { + newItem.date = Zotero.Utilities.trim(date); +// Zotero.debug("date: "+date); + } + + newItem.complete(); +} + +// work for conferencePaper +function scrapeAndParse3(url) { +// Zotero.debug("conferencePaper"); + var page = Zotero.Utilities.retrieveSource(url); + var pattern; + + // 类型 & URL + var itemType = "conferencePaper"; + var newItem = new Zotero.Item(itemType); +// Zotero.debug(url); + newItem.url = url; + + // 标题 + pattern = /(.*?)<\/span>/; + if (pattern.test(page)) { + var title = trimTags(pattern.exec(page)[1]); + newItem.title = title; +// Zotero.debug("title: "+title); + } + + // 作者 + pattern = /【作者】(.*?)<\/p>/; + if (pattern.test(page)) { + var authorNames = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i)/g; + abst = abst.replace(pattern, ""); +// Zotero.debug("after:\n"+abst); + newItem.abstractNote = Zotero.Utilities.trim(abst); + } + + pattern = /ReplaceFont\('EnDivSummary','(.*?)(?='\);if)/; + if (pattern.test(page)) { + abst = pattern.exec(page)[1]; +// Zotero.debug("raw:\n"+abst); + if (abst != undefined && abst != null) { + pattern = /(<.*?>)/g; + abst = abst.replace(pattern, ""); +// Zotero.debug("after:\n"+abst); + + if (newItem.abstractNote===undefined) { + newItem.abstractNote = Zotero.Utilities.trim(abst); + } else { + newItem.abstractNote = newItem.abstractNote + "\n" + + Zotero.Utilities.trim(abst); + } + } + } +// Zotero.debug("abst:\n"+newItem.abstractNote); + + // 关键词 + pattern = /【关键词】\s*]*>(.*?)<\/a>*<\/span>/; + if (pattern.test(page)) { + var tags = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } + pattern = /【Key words】\s*]*>(.*?)<\/a>*<\/span>/; + if (pattern.test(page)) { + var tags = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i0 && newItem.tags.indexOf(tag)<0) { + newItem.tags.push(tag); + } + } +// Zotero.debug("tags:\n"+tags); + } +// Zotero.debug(newItem.tags); + + // 会议名称 & 会议录名称 & 会议地点 & 会议时间 + pattern = /【会议名称】(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var conferenceName = trimTags(pattern.exec(page)[1]); + newItem.conferenceName = conferenceName; +// Zotero.debug("conferenceName: "+conferenceName); + } + pattern = /【会议录名称】(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var proceedingsTitle = trimTags(pattern.exec(page)[1]); + newItem.proceedingsTitle = proceedingsTitle; +// Zotero.debug("proceedingsTitle: "+proceedingsTitle); + } + pattern = /【会议地点】(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var place = trimTags(pattern.exec(page)[1]); + newItem.place = place; +// Zotero.debug("place: "+place); + } + pattern = /【会议时间】(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var date = trimTags(pattern.exec(page)[1]); + newItem.date = date; +// Zotero.debug("date: "+date); + } + + newItem.complete(); +} + +// work for newspaperArticle +function scrapeAndParse4(url) { +// Zotero.debug("newspaperArticle"); + var page = Zotero.Utilities.retrieveSource(url); + var pattern; + + // 类型 & URL + var itemType = "newspaperArticle"; + var newItem = new Zotero.Item(itemType); +// Zotero.debug(url); + newItem.url = url; + + // 标题 + pattern = /(.*?)<\/span>/; + if (pattern.test(page)) { + var title = trimTags(pattern.exec(page)[1]); + newItem.title = title; +// Zotero.debug("title: "+title); + } + + // 副标题/引题 + var shortTitle; + pattern = /

【(?:副标题|引题)】(.*?)(?=<\/p>)/; + if (pattern.test(page)) { + shortTitle = pattern.exec(page)[1]; +// Zotero.debug("shortTitle: "+shortTitle); + newItem.shortTitle = Zotero.Utilities.trimInternal(shortTitle); + } +// Zotero.debug(newItem.shortTitle); + + // 作者 + pattern = /【作\s*者】(.*?)<\/p>/; + if (pattern.test(page)) { + var authorNames = trimTags(pattern.exec(page)[1]).split(";"); + for (var i=0; i【正文快照】(.*?)(?=<\/p>)/; + if (pattern.test(page)) { + abst = pattern.exec(page)[1]; +// Zotero.debug("abst:\n"+abst); + newItem.abstractNote = Zotero.Utilities.trimInternal(abst); + } +// Zotero.debug(newItem.abstractNote); + + // 报纸名称 & DOI & 出版时间 & 版名 & 版号 + pattern = /【报纸名称】\s*<[^>]*>(.*?)<\/a>/; + if (pattern.test(page)) { + var publicationTitle = trimTags(pattern.exec(page)[1]); + newItem.publicationTitle = publicationTitle; +// Zotero.debug("publicationTitle: "+publicationTitle); + } + pattern = /【DOI】\s*(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var doi = pattern.exec(page)[1]; + newItem.DOI = doi; +// Zotero.debug("doi: "+doi); + } + pattern = /【报纸日期】\s*(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var date = pattern.exec(page)[1]; + newItem.date = date; +// Zotero.debug("date: "+date); + } + pattern = /【版名】\s*(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var section = pattern.exec(page)[1]; + newItem.section = section; +// Zotero.debug("section: "+section); + } + pattern = /【版号】\s*(.*?)\s*<\/li>/; + if (pattern.test(page)) { + var edition = pattern.exec(page)[1]; + newItem.edition = edition; +// Zotero.debug("edition: "+edition); + } + + newItem.complete(); +} + +// ######################### +// ##### API functions ##### +// ######################### function detectWeb(doc, url) { - var articleRe = /detail.aspx/; - var s = articleRe.exec(url); - - if(s) { - return "journalArticle"; - } else { - articleRe = /Brief.aspx/; - s = articleRe.exec(url); - if(s) - return "multiple"; + var pattern = /detail.aspx/; + if (pattern.test(url)) { + var code = detectCode(url); +// Zotero.debug(code); + if (code == "CJFQ" || code == "CJFD") { + return "journalArticle"; + } else if (code == "CDFD") { + return "thesis"; + } else if (code == "CMFD" || code == "CLKM") { + return "thesis"; + } else if (code == "CPFD") { + return "conferencePaper"; + } else if (code == "CCND") { + return "newspaperArticle"; + } + } + + pattern = /brief/; + if (pattern.test(url)) { + return "multiple" } return false; } -function scrape(doc, url) { - //var namespace = doc.documentElement.namespaceURI; - //var nsResolver = namespace ? function(prefix) { - // if (prefix == "x") return namespace; else return null; - //} : null; - var nsResolver = null; - - var itemType = "journalArticle"; - // TODO: 因为中英文信息都不想丢失,所以存为两个Item,也算是中国特色吧~ - // 但是目前只解析出中文的信息,下个版本中添加英文信息。 - var newItem = new Zotero.Item(itemType); - //Zotero.debug(itemType); - newItem.url = url; - - // 标题 - var titles = doc.title.split('-').slice(0,-1); - //Zotero.debug(titles); - var title = titles.join("-"); - Zotero.debug("Title:"+title); - newItem.title = title; - - // 附件,网页快照 - var snapName = title + " (CNKI)"; - //Zotero.debug(snapName); - //newItem.attachments.push({document:doc, title:snapName}); - newItem.attachments.push({url:newItem.url, snapshot:true, title:snapName, mimeType:"text/html"}); - //Zotero.debug(doc); - - // 其他信息,/html/body/table[4]/tbody/tr/td[2]/table/tbody/tr/td[2]/table[2]/tbody - var dataRows = doc.evaluate('//body/table[4]/tbody/tr/td[2]/table/tbody/tr/td[2]/table[2]/tbody/tr', doc, nsResolver, - XPathResult.ANY_TYPE, null); - var dataRow; - while(dataRow = dataRows.iterateNext()) { - var tds = dataRow.getElementsByTagName("td"); - var heading = Zotero.Utilities.trimInternal(tds[0].textContent); - var content = tds[1]; - if(heading == "【作者中文名】" || heading == "【作者】") { - //Zotero.debug("Authors:"); - var as = content.getElementsByTagName("a"); - //Zotero.debug(as.length); - var i=0; - for(i=0;i