Updating Sudoc.js with Sylvain's changes

This commit is contained in:
Matt Burton 2009-04-07 15:53:36 +00:00
parent 19ff243b92
commit d4a7e5bd37

View file

@ -2,7 +2,7 @@
"translatorID":"1b9ed730-69c7-40b0-8a06-517a89a3a278",
"translatorType":4,
"label":"Sudoc",
"creator":"Sean Takats and Michael Berkowitz",
"creator":"Sean Takats and Michael Berkowitz, updated by Sylvain Machefert",
"target":"^http://www\\.sudoc\\.abes\\.fr",
"minVersion":"1.0.0b3.r1",
"maxVersion":"",
@ -11,42 +11,59 @@
"lastUpdated":"2008-05-19 17:30:00"
}
// detectWeb: inspects a Sudoc page and reports which kind of item(s) it holds.
//
// Parameters:
//   doc - document that every XPath expression below is evaluated against.
//   url - not referenced anywhere in the lines shown.
// Returns: "multiple", or one of the item type strings "book", "thesis",
//   "journalArticle", "manuscript", "audioRecording", "map". The other paths
//   have no explicit return statement.
//
// NOTE(review): this span appears to come from a commit diff whose removed and
// added lines are interleaved without +/- markers (multxpath is declared
// twice, and both the "icon_" image checks and the newer image checks are
// present). It is not runnable as shown; the comments below describe each
// fragment on its own. Confirm against the real file before relying on them.
function detectWeb(doc, url) {
// Resolver mapping the 'x' prefix to the document's namespace URI; null when
// the document element has no namespace.
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
// <strong> inside a first-column cell carrying class "preslabel".
var xpath = '//table/tbody/tr/td[1][@class="preslabel"]/strong';
// First of two multxpath declarations: anchor with id "InitialFocusPoint".
var multxpath = '//a[@id="InitialFocusPoint"]';
var elt;
// Second declaration overrides the first: a span at a fixed position in body.
var multxpath = '/html/body/div[2]/div/span';
// The assignment inside the condition is intentional: elt keeps the first
// matching node (or a falsy value when there is none).
if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
return "multiple";
}
else if (elt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
{
var contenu = elt.textContent;
// The '.' wildcards presumably tolerate the accent and spacing in
// "Numéro de notice" / "Record number" -- TODO confirm.
var numRegexp = /(Num.ro.de.notice|Record.number)/;
var m = numRegexp.exec(contenu);
if (m) {
// This really is a record for a work, so look up the image
// to choose the document type
var imgXpath = '/html/body/table/tbody/tr/td[1]/p/img/@src';
// The iterateNext() result is dereferenced without a null check, so a page
// without this image attribute would throw here.
var imgsrc = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
if (imgsrc){
// indexOf(...) > 0 means a file name found at index 0 would not count.
if (imgsrc.indexOf("icon_per.gif") > 0){
return "book";
} else if (imgsrc.indexOf("icon_books.gif") > 0){
return "book";
} else if (imgsrc.indexOf("icon_thesis.gif") > 0){
return "thesis";
} else if (imgsrc.indexOf("icon_art.gif") > 0){
return "journalArticle";
} else {
// Unrecognised icon: fall back to "book".
return "book";
}
}
// Text of the matched element decides between a result list and a record.
var content = elt.textContent;
if ( (content == "Résultats") || (content == "Results") )
{
return "multiple";
}
else if ( (content == "Notice complète") || (content == "title data") )
{
// Image whose src file name selects the item type.
var xpathimage = '/html/body/div[2]/div[4]/span/img';
if (elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
{
var type = elt.getAttribute('src');
if (type.indexOf('article.gif') > 0)
{
return "journalArticle";
}
else if (type.indexOf('book.gif') > 0)
{
return "book";
}
else if (type.indexOf('handwriting.gif') > 0)
{
return "manuscript";
}
// Both 'sons.gif' and 'sound.gif' map to "audioRecording".
else if (type.indexOf('sons.gif') > 0)
{
return "audioRecording";
}
else if (type.indexOf('sound.gif') > 0)
{
return "audioRecording";
}
else if (type.indexOf('thesis.gif') > 0)
{
return "thesis";
}
else if (type.indexOf('map.gif') > 0)
{
return "map";
}
else
{
// Unrecognised icon: fall back to "book".
return "book";
}
}
}
}
}
@ -56,105 +73,154 @@ function scrape(doc) {
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
var rowXpath = '//tr[td[@class="preslabel"]]';
var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var tableRow;
var newItem = new Zotero.Item();
// TODO add other item types using detectWeb's icon checking code
newItem.itemType = "book";
var imgXpath = '/html/body/table/tbody/tr/td[1]/p/img/@src';
var imgsrc = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
if (imgsrc){
if (imgsrc.indexOf("icon_per.gif") > 0){
newItem.itemType = "book";
} else if (imgsrc.indexOf("icon_books.gif") > 0){
newItem.itemType = "book";
} else if (imgsrc.indexOf("icon_thesis.gif") > 0){
newItem.itemType = "thesis";
} else if (imgsrc.indexOf("icon_art.gif") > 0){
newItem.itemType = "journalArticle";
} else {
newItem.itemType = "book";
}
} else {
newItem.itemType = "book";
}
while (tableRow = tableRows.iterateNext())
var zXpath = '/html/body/span[@class="Z3988"]';
var eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
if (eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
{
var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
field = Zotero.Utilities.superCleanString(field);
field = field.replace(/(\(s\))?\s*:\s*$/, "");
if (field == "Titre" || field == "Title"){
Zotero.debug("title = " + value);
value = value.replace(/(\[[^\]]+\])/g,"");
newItem.title = value.split(" / ")[0];
}
if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author"){
var authors = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
var author;
while (author = authors.iterateNext()){
var coins = eltCoins.getAttribute('title');
var newItem = new Zotero.Item();
newItem.repository = false; // do not save repository
if(Zotero.Utilities.parseContextObject(coins, newItem))
{
if (newItem.title)
{
// We use the same method as in detectWeb to find
// the real type of document
var xpathimage = '/html/body/div[2]/div[4]/span/img';
if (elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
{
var type = elt.getAttribute('src');
var ZoteroType = '';
if (type.indexOf('article.gif') > 0)
{
zoteroType = 'journalArticle';
}
else if (type.indexOf('book.gif') > 0)
{
zoteroType = 'book';
}
else if (type.indexOf('handwriting.gif') > 0)
{
zoteroType = 'manuscript';
}
else if (type.indexOf('sons.gif') > 0)
{
zoteroType = "audioRecording";
}
else if (type.indexOf('sound.gif') > 0)
{
zoteroType = "audioRecording";
}
else if (type.indexOf('thesis.gif') > 0)
{
zoteroType = "thesis";
}
else if (type.indexOf('map.gif') > 0)
{
zoteroType = "map";
}
else
{
zoteroType = "book";
}
newItem.itemType = zoteroType;
}
// We need to correct some information where COinS is wrong
var rowXpath = '//tr[td[@class="rec_lable"]]';
var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var tableRow;
while (tableRow = tableRows.iterateNext())
{
var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
field = Zotero.Utilities.superCleanString(field);
field = field.replace(/(\(s\))?\s*:\s*$/, "");
// With COinS, only one author is taken, so the creators list is rebuilt here.
if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author")
{
var authors = doc.evaluate('./td[2]/div', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
newItem.creators = new Array();
while (author = authors.iterateNext())
{
var authorText = author.textContent;
var authorParts = authorText.split(" (");
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorParts[0], "author", true));
authorFunction = authorText.split(". ")[1];
authorText = authorText.split(". ")[0];
if (authorFunction)
{
authorFunction = Zotero.Utilities.superCleanString(authorFunction);
}
var zoteroFunction = '';
// TODO: Add other authority types
if (authorFunction == 'Traduction')
{
zoteroFunction = 'Translator';
}
else
{
zoteroFunction = 'Author';
}
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorText, zoteroFunction, true));
}
}
}
if (field.substr(0,4) == "Date"){
newItem.date = value;
}
if (field.substr(0,7) == "Editeur" || field.substr(0,9) == "Publisher"){
var pubParts = value.split(" : ");
newItem.place = pubParts[0];
// needs error checking below to avoid error
if (pubParts[1] ) {
pubParts = pubParts[1].split(", ");
newItem.publisher = pubParts[0];
// The series isn't in COinS
else if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection")
{
newItem.series = value;
}
}
if (field.substr(0,4) == "ISBN" || field.substr(0,4) == "ISSN"){
newItem.ISBN = value.split(" (")[0];
}
if (field == "Description") {
var m = value.match(/([0-9]+) (?:[pP])/);
if (m) {
newItem.pages = m[1];
// When there's a subtitle, only the main title is used!
else if (field == "Titre" || field == "Title")
{
var title = '';
var titles = doc.evaluate('./td[2]/div/span', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
while (partTitle = titles.iterateNext())
{
partTitle = partTitle.textContent;
partTitle = partTitle.replace(/(\[[^\]]+\] ?)/g,"");
title = title + partTitle;
}
// Remove the author
title = title.split(" / ")[0];
newItem.title = title;
}
}
if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection"){
newItem.series = value;
}
if (field.substr(0,6) == "Sujets" || field.substr(0,8) == "Subjects"){
var subjectElmts = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
var subject;
var subjects;
while (subject = subjectElmts.iterateNext()){
subjects = subject.textContent.split(" -- ");
newItem.tags = newItem.tags.concat(subjects);
// Language not defined in COinS
else if ( (field == "Langue") || (field == "Language") )
{
newItem.language = value;
}
}
if (field == "In" || field == "Dans"){
var jtitle = value.replace(/(\[[^\]]+\])/g,"");
jtitle = jtitle.split(" / ")[0];
jtitle = jtitle.split(" - ")[0];
newItem.publicationTitle = jtitle;
//get page numbers
var m = value.match(/(?:[Pp]\. )([0-9\-]+)/);
if (m) {
newItem.pages = m[1];
else if ( (field == "Résumé") || (field == "Abstract") )
{
if (newItem.abstractNote)
{
newItem.abstractNote = newItem.abstractNote + " " + value;
}
else
{
newItem.abstractNote = value;
}
}
//get ISBN or ISSN
m = value.match(/(?:ISSN|ISBN) ([0-9Xx\-]+)/);
if (m) {
newItem.ISBN = m[1];
newItem.ISSN = m[1];
else if (field == "Notes")
{
if (newItem.abstractNote)
{
newItem.abstractNote = newItem.abstractNote + " " + value;
}
else
{
newItem.abstractNote = value;
}
}
// publicationTitle, issue/volume
}
newItem.complete();
}
// TODO Pages, Notes, Description, Language, Annexes
}
}
newItem.complete();
}
function doWeb(doc, url) {
@ -162,39 +228,44 @@ function doWeb(doc, url) {
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
var multxpath = '//a[@id="InitialFocusPoint"]';
var elt;
var multxpath = '/html/body/div[2]/div/span';
if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var xpath = '//tr/td[3]/a';
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var elmt = elmts.iterateNext();
var links = new Array();
var availableItems = new Array();
var i = 0;
do {
var content = elt.textContent;
if ( (content == "Résultats") || (content == "Results") )
{
var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var xpath = '/html/body/div[2]/table/tbody/tr/td[3]/div/a';
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var elmt = elmts.iterateNext();
var links = new Array();
var availableItems = new Array();
var i = 0;
do {
var link = doc.evaluate('./@href', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var searchTitle = elmt.textContent;
availableItems[i] = searchTitle;
links[i] = link;
i++;
} while (elmt = elmts.iterateNext());
var items = Zotero.selectItems(availableItems);
} while (elmt = elmts.iterateNext());
var items = Zotero.selectItems(availableItems);
if(!items) {
if(!items) {
return true;
}
var uris = new Array();
for(var i in items) {
uris.push(newUrl + links[i]);
Zotero.debug(newUrl + links[i]);
}
Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Zotero.done(); }, null);
Zotero.wait();
}
var uris = new Array();
for(var i in items) {
uris.push(newUrl + links[i]);
else if ( (content == "Notice complète") || (content == 'title data') )
{
scrape(doc);
}
Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Zotero.done(); }, null);
Zotero.wait();
}
else {
scrape(doc);
}
}
}