- added Washington Post translator

- translation works properly even when a user has switched to a different page
2006-09-08 05:47:47 +00:00 · 2006-09-08 05:47:47 +00:00 · 7b7d3d85e3
commit 7b7d3d85e3
parent b8ddba3a67
4 changed files with 178 additions and 69 deletions
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@ -167,8 +167,6 @@ Scholar_Ingester_Interface.tabClose = function(event) {
 Scholar_Ingester_Interface.tabSelect = function(event) {
 	var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
 	Scholar_Ingester_Interface._updateStatus(data);
-	// Make sure scrape progress is gone
-	Scholar_Ingester_Interface.Progress.kill();
 }

 Scholar_Ingester_Interface.hidePopup = function(collectionID) {
--- a/chrome/chromeFiles/content/scholar/xpcom/translate.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@ -558,14 +558,13 @@ Scholar.Translate.prototype._generateSandbox = function() {
 		this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() };
 		this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }
 	} else {
-		// add routines to add new items
-		this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem;
-		// attach the function to be run when an item is done
+		// copy routines to add new items
+		this._sandbox.Scholar.Item = Scholar.Translate.GenerateScholarItemClass();
 		this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)};
 		
 		if(this.type == "import") {
 			// add routines to add new collections
-			this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection;
+			this._sandbox.Scholar.Collection = Scholar.Translate.GenerateScholarItemClass();
 			// attach the function to be run when a collection is done
 			this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)};
 		}
@ -882,7 +881,7 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
 					Scholar.Notifier.trigger("add", "item", this.newItems);
 				}
 				// notify collectionTreeView about updates
-				if(this.newCollections.length) {
+				if(this.newCollections && this.newCollections.length) {
 					Scholar.Notifier.trigger("add", "collection", this.newCollections);
 				}
 			}
@ -1007,7 +1006,7 @@ Scholar.Translate.prototype._itemTagsAndSeeAlso = function(item, newItem) {
 /*
 * executed when an item is done and ready to be loaded into the database
 */
-Scholar.Translate.prototype._itemDone = function(item) {
+Scholar.Translate.prototype._itemDone = function(item) {	
 	if(!this.saveItem) {	// if we're not supposed to save the item, just
 							// return the item array
 		
@ -1056,7 +1055,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
 			item.itemType = item.complete = undefined;
 			
 			// automatically set access date if URL is set
-			if(item.url && !item.accessDate) {
+			if(item.url && !item.accessDate && this.type == "web") {
 				item.accessDate = (new Date()).toLocaleString();
 			}
 			
@ -1778,26 +1777,34 @@ Scholar.Translate.prototype._storageFunctions =  function(read, write) {
 * inside scraper code
 */
 
-Scholar.Translate.ScholarItem = function(itemType) {
-	// assign item type
-	this.itemType = itemType;
-	// generate creators array
-	this.creators = new Array();
-	// generate notes array
-	this.notes = new Array();
-	// generate tags array
-	this.tags = new Array();
-	// generate see also array
-	this.seeAlso = new Array();
-	// generate file array
-	this.attachments = new Array();
+Scholar.Translate.GenerateScholarItemClass = function() {
+	// A separate constructor is created on every call, so a method attached to
+	// the prototype of one returned class (such as complete) is not shared with
+	// the classes returned by other calls
+	return function(itemType) {
+		// type of the item being built
+		this.itemType = itemType;
+		
+		// each instance starts out with its own empty lists
+		this.creators = [];
+		this.notes = [];
+		this.tags = [];
+		// "see also" entries
+		this.seeAlso = [];
+		// file attachments
+		this.attachments = [];
+	};
+}

 /* Scholar.Translate.Collection: a class for generating a new top-level
 * collection from inside scraper code
 */
- 
-Scholar.Translate.ScholarCollection = function() {}
+
+Scholar.Translate.GenerateScholarCollectionClass = function() {
+	// NOTE(review): unlike GenerateScholarItemClass, every call also stores the new
+	// class in Scholar.Translate.ScholarCollection, replacing the previous one; confirm
+	return (Scholar.Translate.ScholarCollection = function() {});
+}

 /* Scholar.Translate.RDF: a class for handling RDF IO
 *
--- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@ -62,7 +62,7 @@ Scholar.Utilities.prototype.cleanString = function(s) {
 		throw "cleanString: argument must be a string";
 	}
 	
-	s = s.replace(/[ \xA0\r\n]+/g, " ");
+	s = s.replace(/[\xA0\r\n\s]+/g, " ");
 	s = s.replace(/^\s+/, "");
 	return s.replace(/\s+$/, "");
 }
@ -236,13 +236,21 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
 	
 	// Require link to match this
 	if(urlRe) {
-		var urlRegexp = new RegExp();
-		urlRegexp.compile(urlRe, "i");
+		if(urlRe.exec) {
+			var urlRegexp = urlRe;
+		} else {
+			var urlRegexp = new RegExp();
+			urlRegexp.compile(urlRe, "i");
+		}
 	}
 	// Do not allow text to match this
 	if(rejectRe) {
-		var rejectRegexp = new RegExp();
-		rejectRegexp.compile(rejectRe, "i");
+		if(rejectRe.exec) {
+			var rejectRegexp = rejectRe;
+		} else {
+			var rejectRegexp = new RegExp();
+			rejectRegexp.compile(rejectRe, "i");
+		}
 	}
 	
 	if(!inHere.length) {
@ -253,7 +261,7 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
 		var links = inHere[j].getElementsByTagName("a");
 		for(var i=0; i<links.length; i++) {
 			if(!urlRe || urlRegexp.test(links[i].href)) {
-				var text = this.getNodeString(doc, links[i], './/text()', null);
+				var text = links[i].textContent;
 				if(text) {
 					text = this.cleanString(text);
 					if(!rejectRe || !rejectRegexp.test(text)) {
--- a/scrapers.sql
+++ b/scrapers.sql
@ -1,4 +1,4 @@
-- 84
+-- 85

 -- Set the following timestamp to the most recent scraper update date
 REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
@ -186,7 +186,15 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
 						title = title.substring(0, title.length-2);
 					}
 					newItem.title = Scholar.Utilities.capitalizeTitle(title);
-				} else if(match[1] == ''Author(s)'') {
+				} else if(match[1] == "Series") {
+					newItem.series = match[2];
+				} else if(match[1] == "Description") {
+					var pageMatch = /([0-9]+) p\.?/
+					var m = pageMatch.exec(match[2]);
+					if(m) {
+						newItem.pages = m[1];
+					}
+				} else if(match[1] == ''Author(s)'' || match[1] == "Corp Author(s)") {
 					var yearRegexp = /[0-9]{4}-([0-9]{4})?/;
 					
 					var authors = match[2].split('';'');
@ -195,44 +203,33 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
 						for(var j=1; j<authors.length; j+=2) {
 							if(authors[j-1].substring(0, 1) != ''('' && !yearRegexp.test(authors[j])) {
 								// ignore places where there are parentheses		
-								newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true));
+								newItem.creators.push({lastName:authors[j], creatorType:"author", isInstitution:true});
 							}
 						}
 					} else {
 						newItem.creators.push(Scholar.Utilities.cleanString(match[2]));
 					}
 				} else if(match[1] == ''Publication'') {
-					// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
 					match[2] = Scholar.Utilities.cleanString(match[2]);
 					if(match[2].substring(match[2].length-1) == '','') {
-							match[2] = match[2].substring(0, match[2].length-1);
+						match[2] = match[2].substring(0, match[2].length-1);
+					}
+					
+					// most, but not all, WorldCat publisher/places are
+					// colon delimited
+					var parts = match[2].split(/ ?: ?/);
+					if(parts.length == 2) {
+						newItem.place = parts[0];
+						newItem.publisher = parts[1];
+					} else {
+						newItem.publisher = match[2];
 					}
-					newItem.publisher = match[2];
 				} else if(match[1] == ''Institution'') {
 					newItem.publisher = match[2];
 				} else if(match[1] == ''Standard No'') {
-					var identifiers = match[2].split(/ +/);
-					var j=0;
-					while(j<(identifiers.length-1)) {
-							var type = identifiers[j].substring(0, identifiers[j].length-1);
-							var lastChar;
-							var value;
-	
-							j++;
-							while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
-								if(identifiers[j].substring(0, 1) != ''('') {
-									if(lastChar == '';'') {
-										value = identifiers[j].substring(0, identifiers[j].length-1);
-									} else {
-										value = identifiers[j];
-									}
-									if(type == "ISBN" || type == "ISSN") {
-										newItem[type] = value;
-									}
-								}
-								j++;
-							}
-					}
+					var ISBNRe = /ISBN:\s*([0-9X]+)/
+					var m = ISBNRe.exec(match[2]);
+					if(m) newItem.ISBN = m[1];
 				} else if(match[1] == ''Year'') {
 					newItem.date = match[2];
 				} else if(match[1] == "Descriptor") {
@ -255,7 +252,9 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
 					if(match[2].substr(0, 8) != "WorldCat") {
 						newItem.itemType = "journalArticle";
 					}
-				} else {
+				} else if(match[1] != "Availability" &&
+				          match[1] != "Find Items About" &&
+				          match[1] != "Document Type") {
 					newItem.extra += match[1]+": "+match[2]+"\n";
 				}
 			} else {
@ -3635,11 +3634,6 @@ function doWeb(doc, url) {
 	if(articleRegexp.test(url)) {
 		scrape(doc);
 	} else {
-		var namespace = doc.documentElement.namespaceURI;
-		var nsResolver = namespace ? function(prefix) {
-			if (prefix == ''x'') return namespace; else return null;
-		} : null;
-		
 		var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/'');
 		items = Scholar.selectItems(items);
 			
@ -3735,11 +3729,6 @@ function doWeb(doc, url) {
 	if(articleRegexp.test(url)) {
 		scrape(doc);
 	} else {
-		var namespace = doc.documentElement.namespaceURI;
-		var nsResolver = namespace ? function(prefix) {
-			if (prefix == ''x'') return namespace; else return null;
-		} : null;
-		
 		var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/");
 		items = Scholar.selectItems(items);
 			
@ -3757,6 +3746,113 @@ function doWeb(doc, url) {
 	}
 }');

+REPLACE INTO "translators" VALUES ('d1bf1c29-4432-4ada-8893-2e29fc88fd9e', '2006-09-06 23:27:00', 4, 'Washington Post', 'Simon Kornblith', '^http://www\.washingtonpost\.com/', 
+'function detectWeb(doc, url) {
+	// resolver mapping the "x" prefix onto the namespace of the document, if any
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = !namespace ? null : function(prefix) {
+		return (prefix == ''x'') ? namespace : null;
+	};
+	
+	// only claim the page when a sign-out link shows that the user is logged in
+	var signOutLinks = doc.evaluate(''//a[text() = "Sign out" or text() = "Sign Out"]'',
+							   doc, nsResolver, XPathResult.ANY_TYPE, null);
+	if(!signOutLinks.iterateNext()) return;
+	
+	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/;
+	if(articleRegexp.test(url)) {
+		return "newspaperArticle";
+	}
+	
+	// otherwise report "multiple" as soon as any link targets an article page
+	var anchors = doc.getElementsByTagName("a");
+	for(var idx=0; idx<anchors.length; idx++) {
+		if(articleRegexp.test(anchors[idx].href)) {
+			return "multiple";
+		}
+	}
+}',
+'function scrape(doc) {
+	// Builds a newspaperArticle item from an article page and completes it
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	var newItem = new Scholar.Item("newspaperArticle");
+	newItem.publicationTitle = "The Washington Post";
+	newItem.ISSN = "0740-5421";
+	
+	newItem.url = doc.location.href;
+	
+	newItem.attachments.push({document:doc, title:"Article (HTML)",
+							  downloadable:true});
+	
+	// grab title from doc title
+	newItem.title = doc.title;
+	
+	var byline = doc.evaluate(''//div[@id="byline"]'', doc, nsResolver,
+	                        XPathResult.ANY_TYPE, null).iterateNext();
+	// grab authors from byline
+	if(byline) {
+		// substr(3) skips the first three characters (presumably "By "; confirm)
+		var authors = byline.textContent.substr(3).split(" and ");
+		for each(var author in authors) {
+			newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
+		}
+	}
+	
+	// group 1 is the text before the first semicolon (stored as the date); group 2 is
+	// the optional page label plus value, and group 3 is the page value alone
+	var pageRe = /([^;]+);([\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/;
+	var fonts = doc.evaluate(''//div[@id="article"]/p/font/text()'', doc, nsResolver,
+	                        XPathResult.ANY_TYPE, null);
+	var font;
+	while(font = fonts.iterateNext()) {
+		Scholar.Utilities.debug(Scholar.Utilities.cleanString(font.nodeValue));
+		var m = pageRe.exec(font.nodeValue);
+		if(m) {
+			// grab pages and date; stop at the first text node that matches
+			newItem.date = m[1];
+			if(m[3]) newItem.pages = m[3];
+			break;
+		}
+	}
+	
+	// grab tags from the content of the "keywords" meta tag
+	var keywords = doc.getElementsByTagName("meta").namedItem("keywords");
+	if(keywords) {
+		keywords = keywords.getAttribute("content");
+		if(keywords) {
+			newItem.tags = keywords.split(/, ?/);
+		}
+	}
+	
+	newItem.complete();
+}
+
+function doWeb(doc, url) {
+	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/;
+	if(articleRegexp.test(url)) {
+		scrape(doc);
+		return;
+	}
+	
+	// not an article page: gather matching links and run them through selectItems
+	var candidates = Scholar.Utilities.getItemArray(doc, doc, articleRegexp);
+	var chosen = Scholar.selectItems(candidates);
+	if(!chosen) return true;
+	
+	// the keys of the selection are used as the URLs to process
+	var urls = [];
+	for(var link in chosen) {
+		urls.push(link);
+	}
+	
+	Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); });
+	Scholar.wait();
+}');
+
 REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$', 
 'function detectWeb(doc, url) {
 	var namespace = doc.documentElement.namespaceURI;