- new translators: New York Review of Books, Chronicle of Higher Education
- more useful errors in utilities - fixes minor bugs in citation styling
This commit is contained in:
parent
451be4b3a3
commit
cf8dc232b1
3 changed files with 242 additions and 10 deletions
|
@ -104,7 +104,6 @@ CSL.prototype.preprocessItems = function(items) {
|
|||
if(!item._csl || item._csl.dateModified != dateModified) {
|
||||
// namespace everything in item._csl so there's no chance of overlap
|
||||
item._csl = new Object();
|
||||
item._csl.ignore = new Array();
|
||||
item._csl.dateModified = dateModified;
|
||||
|
||||
// separate item into authors, editors, translators
|
||||
|
@ -115,11 +114,10 @@ CSL.prototype.preprocessItems = function(items) {
|
|||
|
||||
// parse date
|
||||
item._csl.date = CSL.prototype._processDate(item.getField("date"));
|
||||
} else {
|
||||
// clear disambiguation and subsequent author substitute
|
||||
if(item._csl.disambiguation) item._csl.date.disambiguation = undefined;
|
||||
if(item._csl.subsequentAuthorSubstitute) item._csl.subsequentAuthorSubstitute = undefined;
|
||||
}
|
||||
// clear disambiguation and subsequent author substitute
|
||||
if(item._csl.disambiguation) item._csl.date.disambiguation = undefined;
|
||||
if(item._csl.subsequentAuthorSubstitute) item._csl.subsequentAuthorSubstitute = undefined;
|
||||
}
|
||||
|
||||
// sort by sort order
|
||||
|
@ -179,7 +177,7 @@ CSL.prototype.preprocessItems = function(items) {
|
|||
item._csl.number = i;
|
||||
|
||||
// handle subsequent author substitutes
|
||||
if(this._bib.subsequentAuthorSubstitute && lastAuthor == author) {
|
||||
if(lastAuthor == author) {
|
||||
item._csl.subsequentAuthorSubstitute = true;
|
||||
}
|
||||
lastAuthor = author;
|
||||
|
|
|
@ -26,6 +26,10 @@ Scholar.Utilities.prototype.strToDate = function(date) {
|
|||
* Cleans extraneous punctuation off an author name
|
||||
*/
|
||||
Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
|
||||
if(typeof(author) != "string") {
|
||||
throw "cleanAuthor: author must be a string";
|
||||
}
|
||||
|
||||
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
||||
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
|
||||
author = author.replace(/ +/, ' ');
|
||||
|
@ -54,6 +58,10 @@ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
|
|||
* Cleans whitespace off a string and replaces multiple spaces with one
|
||||
*/
|
||||
Scholar.Utilities.prototype.cleanString = function(s) {
|
||||
if(typeof(s) != "string") {
|
||||
throw "cleanString: argument must be a string";
|
||||
}
|
||||
|
||||
s = s.replace(/[ \xA0\r\n]+/g, " ");
|
||||
s = s.replace(/^\s+/, "");
|
||||
return s.replace(/\s+$/, "");
|
||||
|
@ -63,6 +71,10 @@ Scholar.Utilities.prototype.cleanString = function(s) {
|
|||
* Cleans any non-word non-parenthesis characters off the ends of a string
|
||||
*/
|
||||
Scholar.Utilities.prototype.superCleanString = function(x) {
	// Validate the actual parameter. The guard must name x: typeof on an
	// undeclared identifier yields "undefined", which would make every call throw.
	if(typeof(x) != "string") {
		throw "superCleanString: argument must be a string";
	}
	
	// strip leading characters that are neither word characters nor "("
	x = x.replace(/^[^\w(]+/, "");
	// strip trailing characters that are neither word characters nor ")"
	return x.replace(/[^\w)]+$/, "");
};
|
||||
|
@ -71,6 +83,10 @@ Scholar.Utilities.prototype.superCleanString = function(x) {
|
|||
* Eliminates HTML tags, replacing <br>s with \ns
|
||||
*/
|
||||
Scholar.Utilities.prototype.cleanTags = function(x) {
	// Validate the actual parameter. The guard must name x: typeof on an
	// undeclared identifier yields "undefined", which would make every call throw.
	if(typeof(x) != "string") {
		throw "cleanTags: argument must be a string";
	}
	
	// turn every <br> variant (any case, any attributes) into a newline first,
	// so line breaks survive the generic tag removal below
	x = x.replace(/<br[^>]*>/gi, "\n");
	// drop all remaining tags
	return x.replace(/<[^>]+>/g, "");
};
|
||||
|
@ -118,6 +134,10 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray;
|
|||
* pads a number or other string with a given string on the left
|
||||
*/
|
||||
Scholar.Utilities.prototype.lpad = function(string, pad, length) {
|
||||
if(typeof(s) != "string") {
|
||||
throw "lpad: argument must be a string";
|
||||
}
|
||||
|
||||
while(string.length < length) {
|
||||
string = pad + string;
|
||||
}
|
||||
|
|
222
scrapers.sql
222
scrapers.sql
|
@ -1,4 +1,4 @@
|
|||
-- 81
|
||||
-- 82
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
|
||||
|
@ -3458,7 +3458,7 @@ function scrape(doc, url) {
|
|||
return;
|
||||
}
|
||||
|
||||
newItem.attachments.push({url:url, title:"New York Times Article",
|
||||
newItem.attachments.push({url:url, title:"Article (HTML)",
|
||||
mimeType:"text/html", downloadable:true});
|
||||
} else {
|
||||
newItem.url = doc.location.href;
|
||||
|
@ -3471,7 +3471,7 @@ function scrape(doc, url) {
|
|||
}
|
||||
}
|
||||
|
||||
newItem.attachments.push({document:doc, title:"New York Times Article",
|
||||
newItem.attachments.push({document:doc, title:"Article (HTML)",
|
||||
downloadable:true});
|
||||
}
|
||||
|
||||
|
@ -3543,6 +3543,220 @@ function doWeb(doc, url) {
|
|||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('1e6d1529-246f-4429-84e2-1f1b180b250d', '2006-09-06 17:54:00', 4, 'Chronicle of Higher Education', 'Simon Kornblith', '^http://chronicle\.com/',
|
||||
'function detectWeb(doc, url) {
|
||||
var articleRegexp = /^http:\/\/chronicle\.com\/(?:daily|weekly)\/[^/]+\//
|
||||
if(articleRegexp.test(url)) {
|
||||
if(doc.location.href.indexOf("weekly") != -1) {
|
||||
return "magazineArticle";
|
||||
} else {
|
||||
return "website";
|
||||
}
|
||||
} else {
|
||||
var aTags = doc.getElementsByTagName("a");
|
||||
for(var i=0; i<aTags.length; i++) {
|
||||
if(articleRegexp.test(aTags[i].href)) {
|
||||
return "multiple";
|
||||
}
|
||||
}
|
||||
}
|
||||
}',
|
||||
'function associateMeta(newItem, metaTags, field, scholarField) {
|
||||
if(metaTags.namedItem(field)) {
|
||||
newItem[scholarField] = Scholar.Utilities.cleanString(metaTags.namedItem(field).getAttribute("content"));
|
||||
}
|
||||
}
|
||||
|
||||
function scrape(doc) {
|
||||
if(doc.location.href.indexOf("weekly") != -1) {
|
||||
var newItem = new Scholar.Item("magazineArticle");
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// go in search of pages
|
||||
var content = doc.evaluate(''/html/body/table[@class="layout"]/tbody/tr[1]/td[@class="content"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
if(content) {
|
||||
var pagesRegexp = /http:\/\/chronicle.com\nSection: [^\n]+\nVolume [0-9]+, Issue [0-9]+, Pages? ([A-Z0-9\-]+)/;
|
||||
var m = pagesRegexp.exec(content.textContent);
|
||||
if(m) {
|
||||
newItem.pages = m[1];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
var newItem = new Scholar.Item("website");
|
||||
}
|
||||
newItem.publicationTitle = "The Chronicle of Higher Education";
|
||||
newItem.ISSN = "0009-5982";
|
||||
|
||||
newItem.url = doc.location.href;
|
||||
var metaTags = doc.getElementsByTagName("meta");
|
||||
|
||||
newItem.attachments.push({document:doc, title:"Article (HTML)",
|
||||
downloadable:true});
|
||||
|
||||
associateMeta(newItem, metaTags, "published_date", "date");
|
||||
associateMeta(newItem, metaTags, "headline", "title");
|
||||
associateMeta(newItem, metaTags, "section", "section");
|
||||
associateMeta(newItem, metaTags, "volume", "volume");
|
||||
associateMeta(newItem, metaTags, "issue", "issue");
|
||||
|
||||
if(metaTags.namedItem("byline")) {
|
||||
var author = Scholar.Utilities.cleanString(metaTags.namedItem("byline").getAttribute("content"));
|
||||
if(author.substr(0, 3).toLowerCase() == "by ") {
|
||||
author = author.substr(3);
|
||||
}
|
||||
|
||||
var authors = author.split(" and ");
|
||||
for each(var author in authors) {
|
||||
// fix capitalization
|
||||
var words = author.split(" ");
|
||||
for(var i in words) {
|
||||
words[i] = words[i][0].toUpperCase()+words[i].substr(1).toLowerCase();
|
||||
}
|
||||
author = words.join(" ");
|
||||
|
||||
if(words[0] == "The") {
|
||||
newItem.creators.push({lastName:author, creatorType:"author"});
|
||||
} else {
|
||||
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
newItem.complete();
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var articleRegexp = /^http:\/\/chronicle\.com\/(?:daily|weekly)\/[^/]+\//;
|
||||
if(articleRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/'');
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var urls = new Array();
|
||||
for(var i in items) {
|
||||
urls.push(i);
|
||||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); });
|
||||
Scholar.wait();
|
||||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('4c164cc8-be7b-4d02-bfbf-37a5622dfd56', '2006-09-06 18:54:00', 4, 'New York Review of Books', 'Simon Kornblith', '^http://www\.nybooks\.com/',
|
||||
'function detectWeb(doc, url) {
|
||||
var articleRegexp = /^http:\/\/www\.nybooks\.com\/articles\/[0-9]+/
|
||||
if(articleRegexp.test(url)) {
|
||||
return "journalArticle";
|
||||
} else {
|
||||
var aTags = doc.getElementsByTagName("a");
|
||||
for(var i=0; i<aTags.length; i++) {
|
||||
if(articleRegexp.test(aTags[i].href)) {
|
||||
return "multiple";
|
||||
}
|
||||
}
|
||||
}
|
||||
}',
|
||||
'function associateMeta(newItem, metaTags, field, scholarField) {
|
||||
if(metaTags.namedItem(field)) {
|
||||
newItem[scholarField] = Scholar.Utilities.cleanString(metaTags.namedItem(field).getAttribute("content"));
|
||||
}
|
||||
}
|
||||
|
||||
function scrape(doc) {
|
||||
var newItem = new Scholar.Item("journalArticle");
|
||||
newItem.publicationTitle = "The New York Review of Books";
|
||||
newItem.ISSN = "0028-7504";
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
newItem.url = doc.location.href;
|
||||
var metaTags = doc.getElementsByTagName("meta");
|
||||
|
||||
newItem.attachments.push({document:doc, title:"Review (HTML)",
|
||||
downloadable:true});
|
||||
|
||||
associateMeta(newItem, metaTags, "dc.title", "title");
|
||||
|
||||
var info = doc.evaluate(''//div[@id="center-content"]/h4[@class="date"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
|
||||
if(info) {
|
||||
// get date (which is in an a tag)
|
||||
newItem.date = doc.evaluate("./a", info, nsResolver, XPathResult.ANY_TYPE,
|
||||
null).iterateNext();
|
||||
if(newItem.date) {
|
||||
newItem.date = newItem.date.textContent;
|
||||
}
|
||||
|
||||
info = Scholar.Utilities.cleanString(info.textContent);
|
||||
|
||||
// get volume and issue
|
||||
var infoRe = /Volume ([0-9]+), Number ([0-9]+)/;
|
||||
var m = infoRe.exec(info);
|
||||
if(m) {
|
||||
newItem.volume = m[1];
|
||||
newItem.issue = m[2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
var authors = doc.evaluate(''//div[@id="center-content"]/h4/a[substring(@href, 1, 9) = "/authors/"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
|
||||
|
||||
var author;
|
||||
while(author = authors.iterateNext()) {
|
||||
newItem.creators.push(Scholar.Utilities.cleanAuthor(author.textContent, "author", false));
|
||||
}
|
||||
|
||||
newItem.complete();
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var articleRegexp = /^http:\/\/www\.nybooks\.com\/articles\/[0-9]+/
|
||||
if(articleRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/");
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var urls = new Array();
|
||||
for(var i in items) {
|
||||
urls.push(i);
|
||||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); });
|
||||
Scholar.wait();
|
||||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$',
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
|
@ -5401,7 +5615,7 @@ var inputTypeMap = {
|
|||
INPR:"manuscript",
|
||||
JFULL:"journalArticle",
|
||||
MAP:"artwork",
|
||||
PAMP:"book",
|
||||
PAMP:"manuscript",
|
||||
RPRT:"book",
|
||||
SER:"book",
|
||||
SLIDE:"artwork",
|
||||
|
|
Loading…
Reference in a new issue