Updated version from dev list
This commit is contained in:
parent
edfc196e2e
commit
3c00572990
1 changed files with 160 additions and 59 deletions
|
@ -8,7 +8,7 @@
|
|||
"maxVersion":"",
|
||||
"priority":100,
|
||||
"inRepository":true,
|
||||
"lastUpdated":"2008-07-10 06:15:00"
|
||||
"lastUpdated":"2009-01-11 02:17:07"
|
||||
}
|
||||
|
||||
function detectWeb(doc, url) {
|
||||
|
@ -20,31 +20,96 @@ function detectWeb(doc, url) {
|
|||
}
|
||||
}
|
||||
|
||||
// initially posted to zotero-dev as an attachment -- sorry for the extra list traffic that caused
|
||||
|
||||
parseRomanAuthors = function (item,data) {
|
||||
var result = false;
|
||||
var datastring = data['title'][0].replace(/.*\//, "")
|
||||
if ( datastring.match(/.*[^- 0-9()\[\];:.a-zA-Z].*/) ) {
|
||||
return result;
|
||||
}
|
||||
var authors = datastring.split(";");
|
||||
for (i in authors) {
|
||||
authortype = authors[i].replace(/^([ a-z]*).*/, "$1");
|
||||
if ( authortype.match(/.*edit.*/) ) {
|
||||
authortype = "editor";
|
||||
} else if ( authortype.match(/.*trans.*/) ) {
|
||||
authortype = "translator";
|
||||
} else {
|
||||
authortype = "author";
|
||||
}
|
||||
author = authors[i].replace(/^[ a-z]*/, "").replace( /\.\.\..*/, "" );
|
||||
item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
|
||||
result = true;
|
||||
}
|
||||
return result;
|
||||
/*
|
||||
* Set the texts used to find raw citation elements
|
||||
*/
|
||||
function setSpec() {
|
||||
var spec = new Array();
|
||||
spec['title'] = ['題および','title and statement'];
|
||||
spec['year'] = ['出版・頒布','publication,distribution'];
|
||||
spec['isbn'] = ['国際標準図書','international standard book'];
|
||||
spec['authors'] = ['著者標目','author link'];
|
||||
spec['series'] = ['書誌構造','parent bibliography'];
|
||||
return spec;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract raw string sets from the page. This is the only function that uses
|
||||
* xpath. The string sets retrieved for each label registered by setSpec is
|
||||
* stored as a list, to cope with the possibility of multiple instances of the
|
||||
* same label with different data.
|
||||
*/
|
||||
function getData(doc, spec) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == 'x') return namespace; else return null;
|
||||
} : null;
|
||||
var data = new Object();
|
||||
for (key in spec) {
|
||||
var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var c = check.iterateNext();
|
||||
while (c) {
|
||||
if (!data[key] ) {
|
||||
data[key] = new Array();
|
||||
}
|
||||
data[key].push(Zotero.Utilities.cleanString(c.textContent));
|
||||
c = check.iterateNext();
|
||||
}
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
/*
|
||||
* Chop a semicolon-delimited string of authors out of a raw title string,
|
||||
* check it for Japanese characters, and save the raw string for each author
|
||||
* to an array. If no Japanese authors were found, save directly to the item
|
||||
* object.
|
||||
*/
|
||||
parseRomanAuthors = function (item,data) {
|
||||
var datastring = data['title'][0];
|
||||
// don't bother if there is no author info
|
||||
if ( ! datastring.match(/.*\/.*/) ) {
|
||||
return true;
|
||||
}
|
||||
// cut off the title
|
||||
datastring = datastring.replace(/.*\//, "");
|
||||
// raise flag if there are japanese characters
|
||||
var japanese_check = datastring.match(/.*[^- &0-9()\[\];:,.a-zA-Z].*/);
|
||||
// replace comma with semicolon in certain cases, to prepare for split
|
||||
datastring = datastring.replace(/,(\s+[a-zA-Z]{3,})/, ";$1");
|
||||
datastring = datastring.replace(/,(\s+[a-zA-Z]{1}[^a-zA-Z])/, ";$1");
|
||||
datastring = datastring.replace(/(\s+and\s+)/, "; ");
|
||||
datastring = datastring.replace(/(\s+&\s+)/, "; ");
|
||||
// split the authors
|
||||
var authors = datastring.replace(/\|.*/, "").split(";");
|
||||
// this is parsing the authors for a single work. if there is a special byline, we
|
||||
// assume that it applies to all subsequent entries until overridden.
|
||||
var authortype = 'author';
|
||||
for (i in authors) {
|
||||
item.authorstrings.push(authors[i]);
|
||||
var authortypehint = authors[i].replace(/^([ ,.:a-z]*).*/, "$1");
|
||||
if ( authortypehint.match(/.*(edit|organiz).*/) ) {
|
||||
authortype = "editor";
|
||||
} else if ( authortypehint.match(/.*trans.*/) ) {
|
||||
authortype = "translator";
|
||||
}
|
||||
author = authors[i].replace(/^[ a-z]*/, "").replace( /\.\.\..*/, "" );
|
||||
// need to test for length because the replacement of commas with semicolons
|
||||
// can cause a short split at the end of a byline that originally ended in a comma
|
||||
if ( ! japanese_check && author.length ) {
|
||||
item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
|
||||
}
|
||||
}
|
||||
return japanese_check;
|
||||
}
|
||||
|
||||
/*
|
||||
* For each author link, attempt to find a hint that the person
|
||||
* is an editor or translator, first in the link text itself, then in
|
||||
* the list of raw author strings captured by parseRomanAuthors.
|
||||
* Clean out cruft, reverse the order of each name, and save
|
||||
* directly to the item object.
|
||||
*/
|
||||
parseJapaneseAuthors = function ( item, data ) {
|
||||
var authortype = author;
|
||||
var authors = data['authors'];
|
||||
|
@ -56,58 +121,94 @@ parseJapaneseAuthors = function ( item, data ) {
|
|||
} else {
|
||||
authortype = 'author';
|
||||
}
|
||||
var author = authors[i].replace(/[*]/g,"").replace(/[0-9<(|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
|
||||
var author = authors[i].replace(/[*]/g,"").replace(/[0-9<()|].*/, "").replace(/(.*?),(.*)/, "$2 $1");
|
||||
// If we claim to be an author, double-check in the English entries for a translator hint.
|
||||
// This is an enormous pain, but the original records are a mess, with different conventions
|
||||
// for Japanese and foreign records, sometimes mixed up in the same entry. What are you
|
||||
// going to do.
|
||||
for ( x in item.authorstrings ) {
|
||||
var authorstring = item.authorstrings[x];
|
||||
Zotero.debug(authorstring);
|
||||
var name = author.split(" ");
|
||||
name.reverse();
|
||||
if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(訳|譯|譯註)$/) ) {
|
||||
authortype = 'translator';
|
||||
break;
|
||||
} else if ( authorstring.indexOf( name[0] ) > -1 && authorstring.match(/.*(編|編著)$/) ) {
|
||||
authortype = 'editor';
|
||||
break;
|
||||
}
|
||||
}
|
||||
delete item.authorstrings;
|
||||
item.creators.push(Zotero.Utilities.cleanAuthor(author, authortype));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Split extracted title field. This always starts as a single list item,
|
||||
* but can contain entries for several works, as in an omnibus volume of
|
||||
* translated works, for example. Such records separate the elements of
|
||||
* the omnibus with periods that have no trailing space, so we use that as
|
||||
* the split point. We discard the phonetic information appended to the end
|
||||
* of the string in Japanese records.
|
||||
*/
|
||||
function splitTitle(data) {
|
||||
// split in data array
|
||||
var titlestring = data['title'][0].replace(/\|.*/, "");
|
||||
data['title'] = titlestring.split(" . ");
|
||||
}
|
||||
|
||||
/*
|
||||
* The scrape function brings the various parsing functions together
|
||||
*/
|
||||
function scrape(doc,url) {
|
||||
var item = new Zotero.Item("book");
|
||||
var spec = new Array();
|
||||
spec['title'] = ['題および','title and statement'];
|
||||
spec['year'] = ['出版・頒布','publication,distribution'];
|
||||
spec['isbn'] = ['国際標準図書','international standard book'];
|
||||
spec['authors'] = ['著者標目','author link'];
|
||||
spec['series'] = ['書誌構造','parent bibliography'];
|
||||
var data = {};
|
||||
for (key in spec) {
|
||||
var check = doc.evaluate("//td[contains(text(),'"+spec[key][0]+"') or contains(text(),'"+spec[key][1]+"')]/following-sibling::td", doc, null, XPathResult.ANY_TYPE, null);
|
||||
var c = check.iterateNext();
|
||||
if (!data[key] && c) {
|
||||
data[key] = [];
|
||||
}
|
||||
while (c) {
|
||||
data[key].push(Zotero.Utilities.cleanString(c.textContent));
|
||||
c = check.iterateNext();
|
||||
}
|
||||
}
|
||||
item.authorstrings = new Array();
|
||||
var spec = setSpec();
|
||||
var data = getData(doc, spec);
|
||||
splitTitle(data);
|
||||
|
||||
if (data['title']) {
|
||||
item.title = data['title'][0].replace(/\/.*/, "");
|
||||
// if authors are in roman letters, use them
|
||||
has_author = parseRomanAuthors( item, data );
|
||||
// otherwise, use author links
|
||||
if (!has_author) {
|
||||
var titles = new Array();
|
||||
for (i in data['title']) {
|
||||
titles.push( data['title'][i].replace(/\s*\/.*/, "") );
|
||||
}
|
||||
item.title = titles.join(", ");
|
||||
jse_authors = parseRomanAuthors( item, data );
|
||||
if ( jse_authors ) {
|
||||
parseJapaneseAuthors( item, data );
|
||||
}
|
||||
}
|
||||
|
||||
if (data['year']) {
|
||||
item.date = data['year'][0].replace(/.*?([0-9][.0-9][0-9]+).*/, "$1");
|
||||
item.place = data['year'][0].replace(/:.*/, "");
|
||||
item.publisher = data['year'][0].replace(/.*:(.*),.*/, "$1");
|
||||
// sometimes there are multiple "date" fields, some of which are filled
|
||||
// with other random information
|
||||
for (i in data['year']) {
|
||||
var year = data['year'][i];
|
||||
if ( year.match(/.*[0-9]{3}.*/) ) {
|
||||
item.date = year.replace(/.*?([0-9][.0-9][0-9]+).*/, "$1");
|
||||
item.place = year.replace(/:.*/, "").replace(/[\[\]]/g, "");
|
||||
item.publisher = year.replace(/.*:(.*),.*/, "$1");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// apparently the series field does not exist in this capture type
|
||||
//if (data['series']) {
|
||||
// Zotero.debug('series: '+data['series'][0]);
|
||||
// item.series = data['series'][0].replace(/<.*/, "");
|
||||
//}
|
||||
if (data['series']) {
|
||||
item.series = data['series'][0].replace(/<.*/, "");
|
||||
}
|
||||
|
||||
if (data['isbn']) {
|
||||
item.ISBN = data['isbn'][0].replace(/[^0-9]*([0-9]+).*/, "$1");
|
||||
}
|
||||
|
||||
item.complete();
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var articles = [url];
|
||||
Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
|
||||
articles = [url];
|
||||
Zotero.Utilities.processDocuments(articles, scrape, function() {
|
||||
Zotero.done();
|
||||
});
|
||||
Zotero.wait();
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue