Adding Sylvain's 15mar10 changes.

This commit is contained in:
Matt Burton 2010-03-29 15:01:48 +00:00
parent 1caa7ac359
commit 466cf794e1

View file

@ -8,53 +8,53 @@
"maxVersion":"", "maxVersion":"",
"priority":100, "priority":100,
"inRepository":true, "inRepository":true,
"lastUpdated":"2009-10-08 17:40:00" "lastUpdated":"2010-02-20 10:40:00"
} }
function detectWeb(doc, url) { function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI; var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null; if (prefix == 'x') return namespace; else return null;
} : null; } : null;
var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search'); var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search');
var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:'); var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:');
var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE'); var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE');
if (indexSearch == 0) if (indexSearch == 0)
{ {
var errorXpath = '//div[@class="errorMessage"]'; var errorXpath = '//div[@class="errorMessage"]';
if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
// We are on a search page result but it can be an empty result page. // We are on a search page result but it can be an empty result page.
// Nothing to return; // Nothing to return;
} }
else else
{ {
return "multiple"; return "multiple";
} }
} }
else if (indexArk == 0) else if (indexArk == 0)
{ {
var iconxpath = '//div[@id="Infos"]/img'; var iconxpath = '//div[@class="contenu1"]/img';
if (elt = doc.evaluate(iconxpath, doc, nsResolver, if (elt = doc.evaluate(iconxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
XPathResult.ANY_TYPE, null).iterateNext()) { {
var icon = elt.getAttribute('src'); var icon = elt.getAttribute('src');
return getDoctypeGallica(icon); return getDoctypeGallica(icon);
} }
// For some biblio, the icon picture is located in another div ... // For some biblio, the icon picture is located in another div ...
var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img'; var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img';
if (elt = doc.evaluate(iconxpath, doc, nsResolver, if (elt = doc.evaluate(iconxpath, doc, nsResolver,
XPathResult.ANY_TYPE, null).iterateNext()) { XPathResult.ANY_TYPE, null).iterateNext())
var icon = elt.getAttribute('src'); {
var icon = elt.getAttribute('src');
return getDoctypeGallica(icon); return getDoctypeGallica(icon);
} }
} }
else if (indexSNE == 0) else if (indexSNE == 0)
{ {
return "book"; return "book";
} }
} }
// This function takes the name of the icon, and returns the Zotero item name // This function takes the name of the icon, and returns the Zotero item name
@ -62,19 +62,29 @@ function getDoctypeGallica(img)
{ {
var iconname = img.substring(img.lastIndexOf('/') + 1); var iconname = img.substring(img.lastIndexOf('/') + 1);
if ( (iconname =='doc_livre_ocr.png') || (iconname == 'doc_livre.png') ) if (iconname =='livre_a.png')
{ {
return "book"; return "book";
} }
else if (iconname == 'doc_carte.png') else if (iconname == 'carte.png')
{ {
return "map"; return "map";
} }
else if (iconname == 'doc_image.png') else if (iconname == 'images.png')
{ {
return "artwork"; return "artwork";
} }
else if ( (iconname == 'doc_periodique.png') || (iconname == 'doc_perio_vol_ocr.png') ) else if (iconname == 'docsonore.png')
{
return "audioRecording";
}
else if (iconname == 'musiquenotee.png')
{
// This icon is for Sheet music type. But no Zotero type matches
// as of today (2010-02)
return "book";
}
else if ( (iconname == 'picto_type_document1.png') || (iconname == 'perio_vol_ocr.png') )
{ {
return "book"; return "book";
} }
@ -95,52 +105,55 @@ function doWeb(doc, url) {
if (detectWeb(doc, url) == "multiple") if (detectWeb(doc, url) == "multiple")
{ {
var availableItems = new Array(); var availableItems = new Array();
var xpath = '//td[@class="ResultatsRechercheInfos"]/a'; var xpath = '//div[@class="resultats_line"]';
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var elmt = elmts.iterateNext(); var elmt = elmts.iterateNext();
var itemsId = new Array(); var itemsId = new Array();
var i = 0; var i = 1;
do { do {
var id = doc.evaluate('../../..//a[@id]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); var id = doc.evaluate('div[@class="resultat_id"]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
// This id looks like idN00000. We need the information after id to get the informations about var this_result = doc.evaluate('div[@class="resultat_desc"]/div[@class="titre"]/a', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
// the title. We need to store it in an array, we leave the starting id. availableItems[i] = Zotero.Utilities.cleanTags(this_result.getAttribute('title'));
var cleanId = id.getAttribute('id').substring(2);
itemsId[i] = cleanId;
var searchTitle = elmt.textContent;
availableItems[i] = searchTitle;
i++; i++;
} while (elmt = elmts.iterateNext()); } while (elmt = elmts.iterateNext());
var items = Zotero.selectItems(availableItems); var items = Zotero.selectItems(availableItems);
for (var i in items) { for (var i in items) {
// All informations are available on search result page. We don't need to query // All informations are available on search result page. We don't need to query
// every subpage with scrape. We'are going to call the special Gallica scrape function // every subpage with scrape. We'are going to call the special Gallica scrape function
// This function (scrapeGallica) is reused in scrape. // This function (scrapeGallica) is reused in scrape.
var fullpath = '//div[@id="noticeComplete' + itemsId[i] + '"]/div'; var fullpath = '//div[@class="resultats_line"][' + i + ']';
var detail = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
Zotero.debug(itemsId[i]); var item_element = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var iconType = doc.evaluate('//a[@id="id' + itemsId[i] + '"]/..//span[@class="typedoc"]/img', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if (item_element != undefined)
var docType = getDoctypeGallica(iconType.getAttribute('src')); {
Zotero.debug( itemsId[i]); var detail = doc.evaluate('.//div[@class="notice"]', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
scrapeGallica(doc, nsResolver, detail, docType);
var iconType = doc.evaluate('.//div[@class="picto"]/img', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
var docType = getDoctypeGallica(iconType.getAttribute('src'));
var docUrl = doc.evaluate('.//div[@class="liens"]/a', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
docUrl = docUrl.getAttribute("href");
scrapeGallica(doc, nsResolver, detail, docType, docUrl);
}
} }
} }
else else
{ {
var docType = detectWeb(doc, url); var docType = detectWeb(doc, url);
var xpath = '//div[@id="Popup1"]/div[@class="data"]'; var xpath = '//div[@class="notice"]';
var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
scrapeGallica(doc, nsResolver, detail, docType); scrapeGallica(doc, nsResolver, detail, docType, "");
} }
} }
function scrapeGallica(doc, nsResolver, div, type) function scrapeGallica(doc, nsResolver, div, type, direct_url)
{ {
var item = new Zotero.Item; var item = new Zotero.Item;
item.itemType = type; item.itemType = type;
@ -237,10 +250,17 @@ function scrapeGallica(doc, nsResolver, div, type)
} }
} while (elmt = elmts.iterateNext()); } while (elmt = elmts.iterateNext());
if ( (item.url == "") || (item.url == undefined) ) if ( (item.url == "") || (item.url == undefined) )
{ {
item.url = doc.location.href; if (direct_url != "")
{
item.url = "http://gallica.bnf.fr" + direct_url;
}
else
{
item.url = doc.location.href;
}
} }
item.complete(); item.complete();
} }