From 466cf794e1dfdbe758b94101bebc7558fc606cf2 Mon Sep 17 00:00:00 2001 From: Matt Burton Date: Mon, 29 Mar 2010 15:01:48 +0000 Subject: [PATCH] Adding Sylvain's 15mar10 changes. --- translators/Gallica.js | 164 +++++++++++++++++++++++------------------ 1 file changed, 92 insertions(+), 72 deletions(-) diff --git a/translators/Gallica.js b/translators/Gallica.js index 757f38f52e..89b936d4f2 100644 --- a/translators/Gallica.js +++ b/translators/Gallica.js @@ -8,53 +8,53 @@ "maxVersion":"", "priority":100, "inRepository":true, - "lastUpdated":"2009-10-08 17:40:00" + "lastUpdated":"2010-02-20 10:40:00" } function detectWeb(doc, url) { - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == 'x') return namespace; else return null; - } : null; - - var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search'); - var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:'); - var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE'); - - if (indexSearch == 0) - { - var errorXpath = '//div[@class="errorMessage"]'; - if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { - // We are on a search page result but it can be an empty result page. - // Nothing to return; - } - else - { - return "multiple"; - } - } - else if (indexArk == 0) - { - var iconxpath = '//div[@id="Infos"]/img'; - if (elt = doc.evaluate(iconxpath, doc, nsResolver, -XPathResult.ANY_TYPE, null).iterateNext()) { - var icon = elt.getAttribute('src'); - return getDoctypeGallica(icon); - } - - // For some biblio, the icon picture is located in another div ... - var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img'; - if (elt = doc.evaluate(iconxpath, doc, nsResolver, -XPathResult.ANY_TYPE, null).iterateNext()) { - var icon = elt.getAttribute('src'); - - return getDoctypeGallica(icon); - } - } - else if (indexSNE == 0) - { - return "book"; - } + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search'); + var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:'); + var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE'); + + if (indexSearch == 0) + { + var errorXpath = '//div[@class="errorMessage"]'; + if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + // We are on a search page result but it can be an empty result page. + // Nothing to return; + } + else + { + return "multiple"; + } + } + else if (indexArk == 0) + { + var iconxpath = '//div[@class="contenu1"]/img'; + if (elt = doc.evaluate(iconxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) + { + var icon = elt.getAttribute('src'); + return getDoctypeGallica(icon); + } + + // For some biblio, the icon picture is located in another div ... + var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img'; + if (elt = doc.evaluate(iconxpath, doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext()) + { + var icon = elt.getAttribute('src'); + return getDoctypeGallica(icon); + } + } + else if (indexSNE == 0) + { + return "book"; + } } // This function takes the name of the icon, and returns the Zotero item name @@ -62,19 +62,29 @@ function getDoctypeGallica(img) { var iconname = img.substring(img.lastIndexOf('/') + 1); - if ( (iconname =='doc_livre_ocr.png') || (iconname == 'doc_livre.png') ) + if (iconname =='livre_a.png') { return "book"; } - else if (iconname == 'doc_carte.png') + else if (iconname == 'carte.png') { return "map"; } - else if (iconname == 'doc_image.png') + else if (iconname == 'images.png') { return "artwork"; } - else if ( (iconname == 'doc_periodique.png') || (iconname == 'doc_perio_vol_ocr.png') ) + else if (iconname == 'docsonore.png') + { + return "audioRecording"; + } + else if (iconname == 'musiquenotee.png') + { + // This icon is for Sheet music type. But no Zotero type matches + // as of today (2010-02) + return "book"; + } + else if ( (iconname == 'picto_type_document1.png') || (iconname == 'perio_vol_ocr.png') ) { return "book"; } @@ -95,52 +105,55 @@ function doWeb(doc, url) { if (detectWeb(doc, url) == "multiple") { var availableItems = new Array(); - var xpath = '//td[@class="ResultatsRechercheInfos"]/a'; + var xpath = '//div[@class="resultats_line"]'; var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); var elmt = elmts.iterateNext(); var itemsId = new Array(); - var i = 0; + var i = 1; do { - var id = doc.evaluate('../../..//a[@id]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - // This id looks like idN00000. We need the information after id to get the informations about - // the title. We need to store it in an array, we leave the starting id. - var cleanId = id.getAttribute('id').substring(2); - itemsId[i] = cleanId; - - var searchTitle = elmt.textContent; - availableItems[i] = searchTitle; + var id = doc.evaluate('div[@class="resultat_id"]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; + var this_result = doc.evaluate('div[@class="resultat_desc"]/div[@class="titre"]/a', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + availableItems[i] = Zotero.Utilities.cleanTags(this_result.getAttribute('title')); i++; } while (elmt = elmts.iterateNext()); - + var items = Zotero.selectItems(availableItems); for (var i in items) { // All informations are available on search result page. We don't need to query // every subpage with scrape. We'are going to call the special Gallica scrape function - // This function (scrapeGallica) is reused in scrape. - var fullpath = '//div[@id="noticeComplete' + itemsId[i] + '"]/div'; - var detail = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - Zotero.debug(itemsId[i]); - var iconType = doc.evaluate('//a[@id="id' + itemsId[i] + '"]/..//span[@class="typedoc"]/img', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - var docType = getDoctypeGallica(iconType.getAttribute('src')); - Zotero.debug( itemsId[i]); - scrapeGallica(doc, nsResolver, detail, docType); + // This function (scrapeGallica) is reused in scrape. + var fullpath = '//div[@class="resultats_line"][' + i + ']'; + + var item_element = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (item_element != undefined) + { + var detail = doc.evaluate('.//div[@class="notice"]', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + + var iconType = doc.evaluate('.//div[@class="picto"]/img', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var docType = getDoctypeGallica(iconType.getAttribute('src')); + + var docUrl = doc.evaluate('.//div[@class="liens"]/a', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + docUrl = docUrl.getAttribute("href"); + + scrapeGallica(doc, nsResolver, detail, docType, docUrl); + } } } else { var docType = detectWeb(doc, url); - var xpath = '//div[@id="Popup1"]/div[@class="data"]'; + var xpath = '//div[@class="notice"]'; var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - scrapeGallica(doc, nsResolver, detail, docType); + scrapeGallica(doc, nsResolver, detail, docType, ""); } } -function scrapeGallica(doc, nsResolver, div, type) +function scrapeGallica(doc, nsResolver, div, type, direct_url) { var item = new Zotero.Item; item.itemType = type; @@ -237,10 +250,17 @@ function scrapeGallica(doc, nsResolver, div, type) } } while (elmt = elmts.iterateNext()); - + if ( (item.url == "") || (item.url == undefined) ) { - item.url = doc.location.href; + if (direct_url != "") + { + item.url = "http://gallica.bnf.fr" + direct_url; + } + else + { + item.url = doc.location.href; + } } item.complete(); }