- added Washington Post translator
- translation works properly even when a user has switched to a different page
This commit is contained in:
parent
b8ddba3a67
commit
7b7d3d85e3
4 changed files with 178 additions and 69 deletions
|
@ -167,8 +167,6 @@ Scholar_Ingester_Interface.tabClose = function(event) {
|
|||
// Tab-selection handler: looks up the ingester data for the browser that is
// now selected and refreshes the status display from it.
// NOTE(review): this text is a rendered diff hunk whose header reports 8 old
// and 6 new lines, so two of the lines shown below - presumably the
// "scrape progress" comment and the Progress.kill() call - belong to the
// removed side of the change; confirm against the actual file.
Scholar_Ingester_Interface.tabSelect = function(event) {
|
||||
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||
Scholar_Ingester_Interface._updateStatus(data);
|
||||
// Make sure scrape progress is gone
|
||||
Scholar_Ingester_Interface.Progress.kill();
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.hidePopup = function(collectionID) {
|
||||
|
|
|
@ -558,14 +558,13 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
|||
this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() };
|
||||
this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }
|
||||
} else {
|
||||
// add routines to add new items
|
||||
this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem;
|
||||
// attach the function to be run when an item is done
|
||||
// copy routines to add new items
|
||||
this._sandbox.Scholar.Item = Scholar.Translate.GenerateScholarItemClass();
|
||||
this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)};
|
||||
|
||||
if(this.type == "import") {
|
||||
// add routines to add new collections
|
||||
this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection;
|
||||
this._sandbox.Scholar.Collection = Scholar.Translate.GenerateScholarItemClass();
|
||||
// attach the function to be run when a collection is done
|
||||
this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)};
|
||||
}
|
||||
|
@ -882,7 +881,7 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
|
|||
Scholar.Notifier.trigger("add", "item", this.newItems);
|
||||
}
|
||||
// notify collectionTreeView about updates
|
||||
if(this.newCollections.length) {
|
||||
if(this.newCollections && this.newCollections.length) {
|
||||
Scholar.Notifier.trigger("add", "collection", this.newCollections);
|
||||
}
|
||||
}
|
||||
|
@ -1007,7 +1006,7 @@ Scholar.Translate.prototype._itemTagsAndSeeAlso = function(item, newItem) {
|
|||
/*
|
||||
* executed when an item is done and ready to be loaded into the database
|
||||
*/
|
||||
Scholar.Translate.prototype._itemDone = function(item) {
|
||||
Scholar.Translate.prototype._itemDone = function(item) {
|
||||
if(!this.saveItem) { // if we're not supposed to save the item, just
|
||||
// return the item array
|
||||
|
||||
|
@ -1056,7 +1055,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
|
|||
item.itemType = item.complete = undefined;
|
||||
|
||||
// automatically set access date if URL is set
|
||||
if(item.url && !item.accessDate) {
|
||||
if(item.url && !item.accessDate && this.type == "web") {
|
||||
item.accessDate = (new Date()).toLocaleString();
|
||||
}
|
||||
|
||||
|
@ -1778,26 +1777,34 @@ Scholar.Translate.prototype._storageFunctions = function(read, write) {
|
|||
* inside scraper code
|
||||
*/
|
||||
|
||||
Scholar.Translate.ScholarItem = function(itemType) {
|
||||
// assign item type
|
||||
this.itemType = itemType;
|
||||
// generate creators array
|
||||
this.creators = new Array();
|
||||
// generate notes array
|
||||
this.notes = new Array();
|
||||
// generate tags array
|
||||
this.tags = new Array();
|
||||
// generate see also array
|
||||
this.seeAlso = new Array();
|
||||
// generate file array
|
||||
this.attachments = new Array();
|
||||
Scholar.Translate.GenerateScholarItemClass = function() {
	// A new constructor function is created on every call, so whatever a
	// caller attaches to the returned constructor's prototype is not shared
	// with constructors returned by other calls.
	var ItemClass = function(itemType) {
		// remember the requested item type
		this.itemType = itemType;
		
		// every item starts with empty lists for creators, notes, tags,
		// see-also links and attachments
		this.creators = [];
		this.notes = [];
		this.tags = [];
		this.seeAlso = [];
		this.attachments = [];
	};
	
	return ItemClass;
}
|
||||
|
||||
/* Scholar.Translate.Collection: a class for generating a new top-level
|
||||
* collection from inside scraper code
|
||||
*/
|
||||
|
||||
Scholar.Translate.ScholarCollection = function() {}
|
||||
|
||||
Scholar.Translate.GenerateScholarCollectionClass = function() {
	// Create a new, empty collection constructor on every call.
	var CollectionClass = function() {};
	
	// The constructor is also stored on Scholar.Translate.ScholarCollection,
	// so each call overwrites that property with the newest constructor.
	// NOTE(review): unclear whether this side effect is intended - confirm.
	Scholar.Translate.ScholarCollection = CollectionClass;
	
	return CollectionClass;
}
|
||||
|
||||
/* Scholar.Translate.RDF: a class for handling RDF IO
|
||||
*
|
||||
|
|
|
@ -62,7 +62,7 @@ Scholar.Utilities.prototype.cleanString = function(s) {
|
|||
throw "cleanString: argument must be a string";
|
||||
}
|
||||
|
||||
s = s.replace(/[ \xA0\r\n]+/g, " ");
|
||||
s = s.replace(/[\xA0\r\n\s]+/g, " ");
|
||||
s = s.replace(/^\s+/, "");
|
||||
return s.replace(/\s+$/, "");
|
||||
}
|
||||
|
@ -236,13 +236,21 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
|
||||
// Require link to match this
|
||||
if(urlRe) {
|
||||
var urlRegexp = new RegExp();
|
||||
urlRegexp.compile(urlRe, "i");
|
||||
if(urlRe.exec) {
|
||||
var urlRegexp = urlRe;
|
||||
} else {
|
||||
var urlRegexp = new RegExp();
|
||||
urlRegexp.compile(urlRe, "i");
|
||||
}
|
||||
}
|
||||
// Do not allow text to match this
|
||||
if(rejectRe) {
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(rejectRe, "i");
|
||||
if(rejectRe.exec) {
|
||||
var rejectRegexp = rejectRe;
|
||||
} else {
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(rejectRe, "i");
|
||||
}
|
||||
}
|
||||
|
||||
if(!inHere.length) {
|
||||
|
@ -253,7 +261,7 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
var links = inHere[j].getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(!urlRe || urlRegexp.test(links[i].href)) {
|
||||
var text = this.getNodeString(doc, links[i], './/text()', null);
|
||||
var text = links[i].textContent;
|
||||
if(text) {
|
||||
text = this.cleanString(text);
|
||||
if(!rejectRe || !rejectRegexp.test(text)) {
|
||||
|
|
174
scrapers.sql
174
scrapers.sql
|
@ -1,4 +1,4 @@
|
|||
-- 84
|
||||
-- 85
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
|
||||
|
@ -186,7 +186,15 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
|
|||
title = title.substring(0, title.length-2);
|
||||
}
|
||||
newItem.title = Scholar.Utilities.capitalizeTitle(title);
|
||||
} else if(match[1] == ''Author(s)'') {
|
||||
} else if(match[1] == "Series") {
|
||||
newItem.series = match[2];
|
||||
} else if(match[1] == "Description") {
|
||||
var pageMatch = /([0-9]+) p\.?/
|
||||
var m = pageMatch.exec(match[2]);
|
||||
if(m) {
|
||||
newItem.pages = m[1];
|
||||
}
|
||||
} else if(match[1] == ''Author(s)'' || match[1] == "Corp Author(s)") {
|
||||
var yearRegexp = /[0-9]{4}-([0-9]{4})?/;
|
||||
|
||||
var authors = match[2].split('';'');
|
||||
|
@ -195,44 +203,33 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
|
|||
for(var j=1; j<authors.length; j+=2) {
|
||||
if(authors[j-1].substring(0, 1) != ''('' && !yearRegexp.test(authors[j])) {
|
||||
// ignore places where there are parentheses
|
||||
newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true));
|
||||
newItem.creators.push({lastName:authors[j], creatorType:"author", isInstitution:true});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
newItem.creators.push(Scholar.Utilities.cleanString(match[2]));
|
||||
}
|
||||
} else if(match[1] == ''Publication'') {
|
||||
// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
|
||||
match[2] = Scholar.Utilities.cleanString(match[2]);
|
||||
if(match[2].substring(match[2].length-1) == '','') {
|
||||
match[2] = match[2].substring(0, match[2].length-1);
|
||||
match[2] = match[2].substring(0, match[2].length-1);
|
||||
}
|
||||
|
||||
// most, but not all, WorldCat publisher/places are
|
||||
// colon delimited
|
||||
var parts = match[2].split(/ ?: ?/);
|
||||
if(parts.length == 2) {
|
||||
newItem.place = parts[0];
|
||||
newItem.publisher = parts[1];
|
||||
} else {
|
||||
newItem.publisher = match[2];
|
||||
}
|
||||
newItem.publisher = match[2];
|
||||
} else if(match[1] == ''Institution'') {
|
||||
newItem.publisher = match[2];
|
||||
} else if(match[1] == ''Standard No'') {
|
||||
var identifiers = match[2].split(/ +/);
|
||||
var j=0;
|
||||
while(j<(identifiers.length-1)) {
|
||||
var type = identifiers[j].substring(0, identifiers[j].length-1);
|
||||
var lastChar;
|
||||
var value;
|
||||
|
||||
j++;
|
||||
while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
|
||||
if(identifiers[j].substring(0, 1) != ''('') {
|
||||
if(lastChar == '';'') {
|
||||
value = identifiers[j].substring(0, identifiers[j].length-1);
|
||||
} else {
|
||||
value = identifiers[j];
|
||||
}
|
||||
if(type == "ISBN" || type == "ISSN") {
|
||||
newItem[type] = value;
|
||||
}
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
var ISBNRe = /ISBN:\s*([0-9X]+)/
|
||||
var m = ISBNRe.exec(match[2]);
|
||||
if(m) newItem.ISBN = m[1];
|
||||
} else if(match[1] == ''Year'') {
|
||||
newItem.date = match[2];
|
||||
} else if(match[1] == "Descriptor") {
|
||||
|
@ -255,7 +252,9 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
|
|||
if(match[2].substr(0, 8) != "WorldCat") {
|
||||
newItem.itemType = "journalArticle";
|
||||
}
|
||||
} else {
|
||||
} else if(match[1] != "Availability" &&
|
||||
match[1] != "Find Items About" &&
|
||||
match[1] != "Document Type") {
|
||||
newItem.extra += match[1]+": "+match[2]+"\n";
|
||||
}
|
||||
} else {
|
||||
|
@ -3635,11 +3634,6 @@ function doWeb(doc, url) {
|
|||
if(articleRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/'');
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
|
@ -3735,11 +3729,6 @@ function doWeb(doc, url) {
|
|||
if(articleRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/");
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
|
@ -3757,6 +3746,113 @@ function doWeb(doc, url) {
|
|||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('d1bf1c29-4432-4ada-8893-2e29fc88fd9e', '2006-09-06 23:27:00', 4, 'Washington Post', 'Simon Kornblith', '^http://www\.washingtonpost\.com/',
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// don''t say we can scrape when we can''t; make sure user is logged in
|
||||
var signedIn = doc.evaluate(''//a[text() = "Sign out" or text() = "Sign Out"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
if(!signedIn) {
|
||||
return;
|
||||
}
|
||||
|
||||
var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
|
||||
if(articleRegexp.test(url)) {
|
||||
return "newspaperArticle";
|
||||
} else {
|
||||
var aTags = doc.getElementsByTagName("a");
|
||||
for(var i=0; i<aTags.length; i++) {
|
||||
if(articleRegexp.test(aTags[i].href)) {
|
||||
return "multiple";
|
||||
}
|
||||
}
|
||||
}
|
||||
}',
|
||||
'function scrape(doc) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var newItem = new Scholar.Item("newspaperArticle");
|
||||
newItem.publicationTitle = "The Washington Post";
|
||||
newItem.ISSN = "0740-5421";
|
||||
|
||||
newItem.url = doc.location.href;
|
||||
var metaTags = doc.getElementsByTagName("meta");
|
||||
|
||||
newItem.attachments.push({document:doc, title:"Article (HTML)",
|
||||
downloadable:true});
|
||||
|
||||
// grab title from doc title
|
||||
newItem.title = doc.title;
|
||||
|
||||
var byline = doc.evaluate(''//div[@id="byline"]'', doc, nsResolver,
|
||||
XPathResult.ANY_TYPE, null).iterateNext();
|
||||
// grab authors from byline
|
||||
if(byline) {
|
||||
var authors = byline.textContent.substr(3).split(" and ");
|
||||
for each(var author in authors) {
|
||||
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
|
||||
}
|
||||
}
|
||||
|
||||
var fonts = doc.evaluate(''//div[@id="article"]/p/font/text()'', doc, nsResolver,
|
||||
XPathResult.ANY_TYPE, null);
|
||||
var font;
|
||||
while(font = fonts.iterateNext()) {
|
||||
var pageRe = /([^;]+);([\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/
|
||||
// grab pages and date
|
||||
Scholar.Utilities.debug(Scholar.Utilities.cleanString(font.nodeValue));
|
||||
var m = pageRe.exec(font.nodeValue);
|
||||
if(m) {
|
||||
newItem.date = m[1];
|
||||
newItem.pages = m[2];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// grab tags from meta tag
|
||||
var keywords = doc.getElementsByTagName("meta");
|
||||
if(keywords) {
|
||||
keywords = keywords.namedItem("keywords");
|
||||
if(keywords) {
|
||||
keywords = keywords.getAttribute("content");
|
||||
if(keywords) {
|
||||
newItem.tags = keywords.split(/, ?/);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
newItem.complete();
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
|
||||
if(articleRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, articleRegexp);
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var urls = new Array();
|
||||
for(var i in items) {
|
||||
urls.push(i);
|
||||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); });
|
||||
Scholar.wait();
|
||||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$',
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
|
|
Loading…
Reference in a new issue