Adding Sylvain's 15mar10 changes.
This commit is contained in:
parent
1caa7ac359
commit
466cf794e1
1 changed file with 92 additions and 72 deletions
|
@ -8,53 +8,53 @@
|
||||||
"maxVersion":"",
|
"maxVersion":"",
|
||||||
"priority":100,
|
"priority":100,
|
||||||
"inRepository":true,
|
"inRepository":true,
|
||||||
"lastUpdated":"2009-10-08 17:40:00"
|
"lastUpdated":"2010-02-20 10:40:00"
|
||||||
}
|
}
|
||||||
|
|
||||||
function detectWeb(doc, url) {
|
function detectWeb(doc, url) {
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == 'x') return namespace; else return null;
|
if (prefix == 'x') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search');
|
var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search');
|
||||||
var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:');
|
var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:');
|
||||||
var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE');
|
var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE');
|
||||||
|
|
||||||
if (indexSearch == 0)
|
if (indexSearch == 0)
|
||||||
{
|
{
|
||||||
var errorXpath = '//div[@class="errorMessage"]';
|
var errorXpath = '//div[@class="errorMessage"]';
|
||||||
if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||||
// We are on a search page result but it can be an empty result page.
|
// We are on a search page result but it can be an empty result page.
|
||||||
// Nothing to return;
|
// Nothing to return;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
return "multiple";
|
return "multiple";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (indexArk == 0)
|
else if (indexArk == 0)
|
||||||
{
|
{
|
||||||
var iconxpath = '//div[@id="Infos"]/img';
|
var iconxpath = '//div[@class="contenu1"]/img';
|
||||||
if (elt = doc.evaluate(iconxpath, doc, nsResolver,
|
if (elt = doc.evaluate(iconxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
|
||||||
XPathResult.ANY_TYPE, null).iterateNext()) {
|
{
|
||||||
var icon = elt.getAttribute('src');
|
var icon = elt.getAttribute('src');
|
||||||
return getDoctypeGallica(icon);
|
return getDoctypeGallica(icon);
|
||||||
}
|
}
|
||||||
|
|
||||||
// For some biblio, the icon picture is located in another div ...
|
// For some biblio, the icon picture is located in another div ...
|
||||||
var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img';
|
var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img';
|
||||||
if (elt = doc.evaluate(iconxpath, doc, nsResolver,
|
if (elt = doc.evaluate(iconxpath, doc, nsResolver,
|
||||||
XPathResult.ANY_TYPE, null).iterateNext()) {
|
XPathResult.ANY_TYPE, null).iterateNext())
|
||||||
var icon = elt.getAttribute('src');
|
{
|
||||||
|
var icon = elt.getAttribute('src');
|
||||||
return getDoctypeGallica(icon);
|
return getDoctypeGallica(icon);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (indexSNE == 0)
|
else if (indexSNE == 0)
|
||||||
{
|
{
|
||||||
return "book";
|
return "book";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function takes the name of the icon, and returns the Zotero item name
|
// This function takes the name of the icon, and returns the Zotero item name
|
||||||
|
@ -62,19 +62,29 @@ function getDoctypeGallica(img)
|
||||||
{
|
{
|
||||||
var iconname = img.substring(img.lastIndexOf('/') + 1);
|
var iconname = img.substring(img.lastIndexOf('/') + 1);
|
||||||
|
|
||||||
if ( (iconname =='doc_livre_ocr.png') || (iconname == 'doc_livre.png') )
|
if (iconname =='livre_a.png')
|
||||||
{
|
{
|
||||||
return "book";
|
return "book";
|
||||||
}
|
}
|
||||||
else if (iconname == 'doc_carte.png')
|
else if (iconname == 'carte.png')
|
||||||
{
|
{
|
||||||
return "map";
|
return "map";
|
||||||
}
|
}
|
||||||
else if (iconname == 'doc_image.png')
|
else if (iconname == 'images.png')
|
||||||
{
|
{
|
||||||
return "artwork";
|
return "artwork";
|
||||||
}
|
}
|
||||||
else if ( (iconname == 'doc_periodique.png') || (iconname == 'doc_perio_vol_ocr.png') )
|
else if (iconname == 'docsonore.png')
|
||||||
|
{
|
||||||
|
return "audioRecording";
|
||||||
|
}
|
||||||
|
else if (iconname == 'musiquenotee.png')
|
||||||
|
{
|
||||||
|
// This icon is for Sheet music type. But no Zotero type matches
|
||||||
|
// as of today (2010-02)
|
||||||
|
return "book";
|
||||||
|
}
|
||||||
|
else if ( (iconname == 'picto_type_document1.png') || (iconname == 'perio_vol_ocr.png') )
|
||||||
{
|
{
|
||||||
return "book";
|
return "book";
|
||||||
}
|
}
|
||||||
|
@ -95,52 +105,55 @@ function doWeb(doc, url) {
|
||||||
if (detectWeb(doc, url) == "multiple")
|
if (detectWeb(doc, url) == "multiple")
|
||||||
{
|
{
|
||||||
var availableItems = new Array();
|
var availableItems = new Array();
|
||||||
var xpath = '//td[@class="ResultatsRechercheInfos"]/a';
|
var xpath = '//div[@class="resultats_line"]';
|
||||||
|
|
||||||
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
var elmt = elmts.iterateNext();
|
var elmt = elmts.iterateNext();
|
||||||
|
|
||||||
var itemsId = new Array();
|
var itemsId = new Array();
|
||||||
|
|
||||||
var i = 0;
|
var i = 1;
|
||||||
do {
|
do {
|
||||||
var id = doc.evaluate('../../..//a[@id]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
var id = doc.evaluate('div[@class="resultat_id"]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||||
// This id looks like idN00000. We need the information after id to get the informations about
|
var this_result = doc.evaluate('div[@class="resultat_desc"]/div[@class="titre"]/a', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
// the title. We need to store it in an array, we leave the starting id.
|
availableItems[i] = Zotero.Utilities.cleanTags(this_result.getAttribute('title'));
|
||||||
var cleanId = id.getAttribute('id').substring(2);
|
|
||||||
itemsId[i] = cleanId;
|
|
||||||
|
|
||||||
var searchTitle = elmt.textContent;
|
|
||||||
availableItems[i] = searchTitle;
|
|
||||||
|
|
||||||
i++;
|
i++;
|
||||||
} while (elmt = elmts.iterateNext());
|
} while (elmt = elmts.iterateNext());
|
||||||
|
|
||||||
var items = Zotero.selectItems(availableItems);
|
var items = Zotero.selectItems(availableItems);
|
||||||
|
|
||||||
for (var i in items) {
|
for (var i in items) {
|
||||||
// All the information is available on the search result page. We don't need to query
|
// All the information is available on the search result page. We don't need to query
|
||||||
// every subpage with scrape. We are going to call the special Gallica scrape function
|
// every subpage with scrape. We are going to call the special Gallica scrape function
|
||||||
// This function (scrapeGallica) is reused in scrape.
|
// This function (scrapeGallica) is reused in scrape.
|
||||||
var fullpath = '//div[@id="noticeComplete' + itemsId[i] + '"]/div';
|
var fullpath = '//div[@class="resultats_line"][' + i + ']';
|
||||||
var detail = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
|
||||||
Zotero.debug(itemsId[i]);
|
var item_element = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
var iconType = doc.evaluate('//a[@id="id' + itemsId[i] + '"]/..//span[@class="typedoc"]/img', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
if (item_element != undefined)
|
||||||
var docType = getDoctypeGallica(iconType.getAttribute('src'));
|
{
|
||||||
Zotero.debug( itemsId[i]);
|
var detail = doc.evaluate('.//div[@class="notice"]', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
scrapeGallica(doc, nsResolver, detail, docType);
|
|
||||||
|
var iconType = doc.evaluate('.//div[@class="picto"]/img', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
var docType = getDoctypeGallica(iconType.getAttribute('src'));
|
||||||
|
|
||||||
|
var docUrl = doc.evaluate('.//div[@class="liens"]/a', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
|
docUrl = docUrl.getAttribute("href");
|
||||||
|
|
||||||
|
scrapeGallica(doc, nsResolver, detail, docType, docUrl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
var docType = detectWeb(doc, url);
|
var docType = detectWeb(doc, url);
|
||||||
var xpath = '//div[@id="Popup1"]/div[@class="data"]';
|
var xpath = '//div[@class="notice"]';
|
||||||
var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||||
scrapeGallica(doc, nsResolver, detail, docType);
|
scrapeGallica(doc, nsResolver, detail, docType, "");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeGallica(doc, nsResolver, div, type)
|
function scrapeGallica(doc, nsResolver, div, type, direct_url)
|
||||||
{
|
{
|
||||||
var item = new Zotero.Item;
|
var item = new Zotero.Item;
|
||||||
item.itemType = type;
|
item.itemType = type;
|
||||||
|
@ -237,10 +250,17 @@ function scrapeGallica(doc, nsResolver, div, type)
|
||||||
}
|
}
|
||||||
|
|
||||||
} while (elmt = elmts.iterateNext());
|
} while (elmt = elmts.iterateNext());
|
||||||
|
|
||||||
if ( (item.url == "") || (item.url == undefined) )
|
if ( (item.url == "") || (item.url == undefined) )
|
||||||
{
|
{
|
||||||
item.url = doc.location.href;
|
if (direct_url != "")
|
||||||
|
{
|
||||||
|
item.url = "http://gallica.bnf.fr" + direct_url;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
item.url = doc.location.href;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
item.complete();
|
item.complete();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue