zotero/translators/washingtonpost.com.js

{
	"translatorID":"d1bf1c29-4432-4ada-8893-2e29fc88fd9e",
	"translatorType":4,
	"label":"washingtonpost.com",
	"creator":"Simon Kornblith",
	"target":"^http://www\\.washingtonpost\\.com/",
	"minVersion":"1.0.0b3.r1",
	"maxVersion":"",
	"priority":100,
	"inRepository":true,
	"lastUpdated":"2007-06-21 20:10:00"
}

function detectWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// don't say we can scrape when we can't; make sure user is logged in
	var signedIn = doc.evaluate('//a[text() = "Sign out" or text() = "Sign Out"]',
							   doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if(!signedIn) {
		return;
	}

	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
	if(articleRegexp.test(url)) {
		return "newspaperArticle";
	} else {
		var aTags = doc.getElementsByTagName("a");
		for(var i=0; i<aTags.length; i++) {
			if(articleRegexp.test(aTags[i].href)) {
				return "multiple";
			}
		}
	}
}

function scrape(doc) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.publicationTitle = "The Washington Post";
	newItem.ISSN = "0740-5421";

	newItem.url = doc.location.href;
	var metaTags = doc.getElementsByTagName("meta");

	// Elena's code to grab print version (all pages)
	snapshotURL=doc.location.href.replace(".html", "_pf.html");
	newItem.attachments.push({title:"Washington Post Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});

	// grab title from doc title
	newItem.title = doc.title.replace(" - washingtonpost.com", "");

	var byline = doc.evaluate('//div[@id="byline"]', doc, nsResolver,
	                        XPathResult.ANY_TYPE, null).iterateNext();
	// grab authors from byline
	if(byline) {
		var authors = byline.textContent.substr(3).split(" and ");
		for each(var author in authors) {
			newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
		}
	}

	var fonts = doc.evaluate('//div[@id="article"]/p/font/text()', doc, nsResolver,
	                        XPathResult.ANY_TYPE, null);
	var font;
	while(font = fonts.iterateNext()) {
		var pageRe = /([^;]+);(?:[\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/
		// grab pages and date
		Zotero.debug(Zotero.Utilities.trimInternal(font.nodeValue));
		var m = pageRe.exec(font.nodeValue);
		if(m) {
			newItem.date = m[1];
			newItem.pages = m[2];
			break;
		}
	}

	// grab tags from meta tag
	var keywords = doc.getElementsByTagName("meta");
	if(keywords) {
		keywords = keywords.namedItem("keywords");
		if(keywords) {
			keywords = keywords.getAttribute("content");
			if(keywords) {
				newItem.tags = keywords.split(/, ?/);
			}
		}
	}

	newItem.complete();
}

function doWeb(doc, url) {
	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
	if(articleRegexp.test(url)) {
		scrape(doc);
	} else {
		var items = Zotero.Utilities.getItemArray(doc, doc, articleRegexp);
		items = Zotero.selectItems(items);

		if(!items) {
			return true;
		}

		var urls = new Array();
		for(var i in items) {
			urls.push(i);
		}

		Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
		Zotero.wait();
	}
}