Adding Gallica and modifications to Sudoc from Sylvain

2009-10-07 18:28:56 +00:00 · 2009-10-07 18:28:56 +00:00 · e517ba845b
commit e517ba845b
parent 6c28ddcb1c
2 changed files with 285 additions and 5 deletions
--- a/translators/Gallica.js
+++ b/translators/Gallica.js
@ -0,0 +1,246 @@
+{
+	"translatorID":"58ab2618-4a25-4b9b-83a7-80cd0259f896",
+	"translatorType":4,
+	"label":"Gallica",
+	"creator":"Sylvain Machefert",
+	"target":"^http://gallica\\.bnf\\.fr",
+	"minVersion":"1.0.0b3.r1",
+	"maxVersion":"",
+	"priority":100,
+	"inRepository":true,
+	"lastUpdated":"2009-10-06 08:55:00"
+}
+
+/**
+ * Decides whether the current Gallica page can be imported and as what.
+ *
+ * @param {Document} doc  Page being inspected.
+ * @param {String}   url  Address of that page.
+ * @return {String|undefined} "multiple" for a non-empty search result list,
+ *         a Zotero item type for a document (ark:) or VisuSNE page, and
+ *         undefined when nothing can be imported.
+ */
+function detectWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == 'x') return namespace; else return null;
+	} : null;
+
+	var address = url.toString();
+
+	if (address.indexOf('http://gallica.bnf.fr/Search') === 0) {
+		// A search page showing an error message is an empty result list:
+		// there is nothing to offer, so fall through and return undefined.
+		var errorBox = doc.evaluate('//div[@class="errorMessage"]', doc, nsResolver,
+			XPathResult.ANY_TYPE, null).iterateNext();
+		if (!errorBox) {
+			return "multiple";
+		}
+	}
+	else if (address.indexOf('http://gallica.bnf.fr/ark:') === 0) {
+		// The document type icon lives in one of two containers depending on
+		// the kind of record, so try them in turn.
+		var iconXpaths = [
+			'//div[@id="Infos"]/img',
+			'//div[@class="titrePeriodiqueGauche"]/img'
+		];
+		for (var i = 0; i < iconXpaths.length; i++) {
+			var icon = doc.evaluate(iconXpaths[i], doc, nsResolver,
+				XPathResult.ANY_TYPE, null).iterateNext();
+			if (icon) {
+				return getDoctypeGallica(icon.getAttribute('src'));
+			}
+		}
+	}
+	else if (address.indexOf('http://gallica.bnf.fr/VisuSNE') === 0) {
+		return "book";
+	}
+}
+
+/**
+ * Maps a Gallica document-type icon to the matching Zotero item type.
+ *
+ * @param {String} img  Path or URL of the icon; only the file name is used.
+ * @return {String} "book", "map" or "artwork". Periodical icons and icons
+ *         that are not recognised are reported as "book".
+ */
+function getDoctypeGallica(img) {
+	var fileName = img.substring(img.lastIndexOf('/') + 1);
+	switch (fileName) {
+		case 'doc_livre_ocr.png':
+		case 'doc_livre.png':
+		case 'doc_periodique.png':
+		case 'doc_perio_vol_ocr.png':
+			return "book";
+
+		case 'doc_carte.png':
+			return "map";
+
+		case 'doc_image.png':
+			return "artwork";
+
+		default:
+			// Unknown icon: leave a trace in the log and treat it as a book.
+			Zotero.debug("Undefined icon : " + fileName);
+			return "book";
+	}
+}
+
+/**
+ * Imports the search results picked by the user, or the document being shown.
+ *
+ * @param {Document} doc  Page being imported.
+ * @param {String}   url  Address of that page.
+ */
+function doWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == 'x') return namespace; else return null;
+	} : null;
+
+	var pageType = detectWeb(doc, url);
+	if (pageType !== "multiple") {
+		var notice = doc.evaluate('//div[@id="Popup1"]/div[@class="data"]', doc, nsResolver,
+			XPathResult.ANY_TYPE, null).iterateNext();
+		scrapeGallica(doc, nsResolver, notice, pageType);
+		return;
+	}
+
+	// titles[n] and ids[n] describe the same search result.
+	var titles = [];
+	var ids = [];
+	var links = doc.evaluate('//td[@class="ResultatsRechercheInfos"]/a', doc, nsResolver,
+		XPathResult.ANY_TYPE, null);
+	var link;
+	// while rather than do/while: an empty result list yields no first node.
+	while (link = links.iterateNext()) {
+		var anchor = doc.evaluate('../../..//a[@id]', link, nsResolver,
+			XPathResult.ANY_TYPE, null).iterateNext();
+		if (!anchor) continue;
+		// The anchor id looks like "idN00000"; what follows the leading "id"
+		// is used below to find the notice and the type icon of this result.
+		ids.push(anchor.getAttribute('id').substring(2));
+		titles.push(link.textContent);
+	}
+	if (!titles.length) return;
+	var picked = Zotero.selectItems(titles);
+	for (var n in picked) {
+		// Everything needed is already in the result page, so the notices are
+		// read from it directly instead of fetching each document page.
+		var id = ids[n];
+		var detail = doc.evaluate('//div[@id="noticeComplete' + id + '"]/div', doc, nsResolver,
+			XPathResult.ANY_TYPE, null).iterateNext();
+		var typeIcon = doc.evaluate('//a[@id="id' + id + '"]/..//span[@class="typedoc"]/img',
+			doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+		if (!detail || !typeIcon) {
+			Zotero.debug("Gallica : incomplete search result " + id);
+			continue;
+		}
+		scrapeGallica(doc, nsResolver, detail, getDoctypeGallica(typeIcon.getAttribute('src')));
+	}
+}
+
+/**
+ * Builds one Zotero item from a Gallica notice and hands it to Zotero.
+ *
+ * The notice is read as the <p> children of div, each holding text of the
+ * form "Label : value". Labels are matched in the interface languages listed
+ * in the patterns below. Recognised labels fill the title, creators,
+ * publisher, date, language, rights, ISSN, url, collection and tags of the
+ * item; any other paragraph is ignored.
+ *
+ * When div is missing nothing is saved. When the notice gives no URL, the
+ * address of the page is used instead.
+ *
+ * @param {Document} doc         Page holding the notice.
+ * @param {Function} nsResolver  Namespace resolver passed to doc.evaluate, or null.
+ * @param {Element}  div         Element whose <p> children make up the notice.
+ * @param {String}   type        Zotero item type given to the new item.
+ */
+function scrapeGallica(doc, nsResolver, div, type) {
+	if (!div) {
+		// No notice was located by the caller, so there is nothing to read.
+		Zotero.debug("Gallica : no notice to scrape");
+		return;
+	}
+
+	var item = new Zotero.Item();
+	item.itemType = type;
+
+	var paragraphs = doc.evaluate('p', div, nsResolver, XPathResult.ANY_TYPE, null);
+	var paragraph;
+	var contenu;
+	var temp;
+
+	// while rather than do/while: a notice without any <p> yields no first node.
+	while (paragraph = paragraphs.iterateNext()) {
+		var text = Zotero.Utilities.trimInternal(paragraph.textContent);
+		// Splitting on the anchored "Label : " prefix gives ["", label, value],
+		// so index 2 is the value, or undefined when the label does not match.
+		if (contenu = text.split(/^(Titre|Title|Título) : /)[2]) {
+			item.title = Zotero.Utilities.trimInternal(contenu);
+		}
+		else if (contenu = text.split(/^(Auteur|Author|Autor) : /)[2]) {
+			// Strip the "see only this author's results" caption mixed into the text.
+			contenu = contenu.replace(/(See only the results matching this author|Ne voir que les résultats de cet auteur)/, '');
+			var role = (type == 'artwork') ? "artist" : "author";
+			item.creators.push(Zotero.Utilities.cleanAuthor(contenu, role, true));
+		}
+		else if (contenu = text.split(/^(Publisher|Éditeur|Editor) : /)[2]) {
+			item.publisher = Zotero.Utilities.trimInternal(contenu);
+		}
+		else if (contenu = text.split(/^(Date of publication|Date d'édition|Data de publicação|Fecha de publicación) : /)[2]) {
+			item.date = Zotero.Utilities.trimInternal(contenu);
+		}
+		else if (contenu = text.split(/^(Contributeur|Contributor|Contribuidor) : /)[2]) {
+			item.creators.push(Zotero.Utilities.cleanAuthor(contenu, "contributor", true));
+		}
+		else if (contenu = text.split(/^(Language|Langue|Língua|Idioma) : /)[2]) {
+			item.language = Zotero.Utilities.trimInternal(contenu);
+		}
+		else if (contenu = text.split(/^(Format|Formato) : /)[2]) {
+			// Holds a MIME type such as application/pdf; deliberately not stored.
+		}
+		else if (contenu = text.split(/^(Copyright|Droits|Direitos) : /)[2]) {
+			item.rights = Zotero.Utilities.trimInternal(contenu);
+		}
+		else if (contenu = text.split(/^(Identifier|Identifiant|Senha) : /)[2]) {
+			if (temp = contenu.split(/^ISSN /)[1]) {
+				item.ISSN = temp;
+			}
+			else if (contenu.match(/^http:\/\//)) {
+				// An identifier starting with http is the address of the document.
+				item.url = contenu;
+			}
+			else if (contenu.match(/^ark:/)) {
+				item.url = "http://gallica.bnf.fr/" + contenu;
+			}
+		}
+		else if (contenu = text.split(/^(Description|Descrição) : /)[2]) {
+			if (temp = contenu.split(/^Variante\(s\) de titre : /)[1]) {
+				// Alternative title: not stored, no obvious Zotero field for it.
+			}
+			else if (temp = contenu.split(/^Collection : /)[1]) {
+				// NOTE(review): "collection" may not be a field Zotero keeps;
+				// confirm whether "series" was meant.
+				item.collection = temp;
+			}
+		}
+		else if (contenu = text.split(/^(Sujet|Assunto|Tema|Subject) : /)[2]) {
+			var tagList = contenu.split(/; ?/);
+			// Indexed loop: for...in would also walk anything added to Array.prototype.
+			for (var t = 0; t < tagList.length; t++) {
+				var tag = Zotero.Utilities.trimInternal(tagList[t]);
+				if (tag) {
+					item.tags.push(tag);
+				}
+			}
+		}
+	}
+
+	// Fall back on the address of the page when the notice gave no URL.
+	if (!item.url) {
+		item.url = doc.location.href;
+	}
+	item.complete();
+}
--- a/translators/Sudoc.js
+++ b/translators/Sudoc.js
@ -8,10 +8,9 @@
 	"maxVersion":"",
 	"priority":100,
 	"inRepository":true,
-	"lastUpdated":"2009-09-03 17:00:00"
+	"lastUpdated":"2009-10-06 17:00:00"
 }

-
 function detectWeb(doc, url) {
 		var namespace = doc.documentElement.namespaceURI;
 		var nsResolver = namespace ? function(prefix) {
@ -156,18 +155,32 @@ function scrape(doc) {
 									authorFunction = Zotero.Utilities.superCleanString(authorFunction);
 								}
 								var zoteroFunction = '';
+								
 								// TODO : Add other authotiry types
 								if (authorFunction == 'Traduction')
 								{
-									zoteroFunction = 'Translator';
+									zoteroFunction = 'translator';
+								}
+								else if ( (zoteroType == "thesis") && (authorFunction != 'Auteur') )
+								{
+									zoteroFunction = "contributor";
 								}
 								else
 								{
-									zoteroFunction = 'Author';
+									zoteroFunction = 'author';
 								}
+								
+								if (authorFunction == "Université de soutenance")
+								{
+									// If the author function is "université de soutenance"	it means that this author has to be in "university" field
+									newItem.university = authorText;
+								}
+								else
+								{
 									newItem.creators.push(Zotero.Utilities.cleanAuthor(authorText, zoteroFunction, true));
 								}
 							}
+						}
 						// The serie isn't in COinS
 						else 	if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection")
 						{
@ -216,6 +229,27 @@ function scrape(doc) {
 								newItem.abstractNote = value;
 							}
 						}
+						else if ( (field == "Sujets"  ) || (field == "Subjects") )
+						{
+							var subjects = doc.evaluate('./td[2]/div', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
+							var subject_out = "";
+							
+							while (subject = subjects.iterateNext())
+							{
+								var subject_content = subject.textContent;
+								subject_content = subject_content.replace(/^\s*/, "");
+								subject_content = subject_content.replace(/\s*$/, "");
+								if (subject_content != "")
+								{
+									newItem.tags.push(Zotero.Utilities.trimInternal(subject_content));
+								}
+							}
+						}
+						else if ( (field == "Thèse") || (field == "Dissertation") )
+						{
+							var thesisType = value.split(/ ?:/)[0];
+							newItem.type = thesisType;
+						}
 					}
 					newItem.complete();
 				}