120 lines
No EOL
3.4 KiB
JavaScript
120 lines
No EOL
3.4 KiB
JavaScript
{
|
|
"translatorID":"d1bf1c29-4432-4ada-8893-2e29fc88fd9e",
|
|
"translatorType":4,
|
|
"label":"washingtonpost.com",
|
|
"creator":"Simon Kornblith",
|
|
"target":"^http://www\\.washingtonpost\\.com/",
|
|
"minVersion":"1.0.0b3.r1",
|
|
"maxVersion":"",
|
|
"priority":100,
|
|
"inRepository":true,
|
|
"lastUpdated":"2007-06-21 20:10:00"
|
|
}
|
|
|
|
function detectWeb(doc, url) {
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == 'x') return namespace; else return null;
|
|
} : null;
|
|
|
|
// don't say we can scrape when we can't; make sure user is logged in
|
|
var signedIn = doc.evaluate('//a[text() = "Sign out" or text() = "Sign Out"]',
|
|
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
|
if(!signedIn) {
|
|
return;
|
|
}
|
|
|
|
var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
|
|
if(articleRegexp.test(url)) {
|
|
return "newspaperArticle";
|
|
} else {
|
|
var aTags = doc.getElementsByTagName("a");
|
|
for(var i=0; i<aTags.length; i++) {
|
|
if(articleRegexp.test(aTags[i].href)) {
|
|
return "multiple";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
function scrape(doc) {
|
|
var namespace = doc.documentElement.namespaceURI;
|
|
var nsResolver = namespace ? function(prefix) {
|
|
if (prefix == 'x') return namespace; else return null;
|
|
} : null;
|
|
|
|
var newItem = new Zotero.Item("newspaperArticle");
|
|
newItem.publicationTitle = "The Washington Post";
|
|
newItem.ISSN = "0740-5421";
|
|
|
|
newItem.url = doc.location.href;
|
|
var metaTags = doc.getElementsByTagName("meta");
|
|
|
|
// Elena's code to grab print version (all pages)
|
|
snapshotURL=doc.location.href.replace(".html", "_pf.html");
|
|
newItem.attachments.push({title:"Washington Post Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
|
|
|
|
// grab title from doc title
|
|
newItem.title = doc.title.replace(" - washingtonpost.com", "");
|
|
|
|
var byline = doc.evaluate('//div[@id="byline"]', doc, nsResolver,
|
|
XPathResult.ANY_TYPE, null).iterateNext();
|
|
// grab authors from byline
|
|
if(byline) {
|
|
var authors = byline.textContent.substr(3).split(" and ");
|
|
for each(var author in authors) {
|
|
newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
|
|
}
|
|
}
|
|
|
|
var fonts = doc.evaluate('//div[@id="article"]/p/font/text()', doc, nsResolver,
|
|
XPathResult.ANY_TYPE, null);
|
|
var font;
|
|
while(font = fonts.iterateNext()) {
|
|
var pageRe = /([^;]+);(?:[\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/
|
|
// grab pages and date
|
|
Zotero.debug(Zotero.Utilities.trimInternal(font.nodeValue));
|
|
var m = pageRe.exec(font.nodeValue);
|
|
if(m) {
|
|
newItem.date = m[1];
|
|
newItem.pages = m[2];
|
|
break;
|
|
}
|
|
}
|
|
|
|
// grab tags from meta tag
|
|
var keywords = doc.getElementsByTagName("meta");
|
|
if(keywords) {
|
|
keywords = keywords.namedItem("keywords");
|
|
if(keywords) {
|
|
keywords = keywords.getAttribute("content");
|
|
if(keywords) {
|
|
newItem.tags = keywords.split(/, ?/);
|
|
}
|
|
}
|
|
}
|
|
|
|
newItem.complete();
|
|
}
|
|
|
|
function doWeb(doc, url) {
|
|
var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/
|
|
if(articleRegexp.test(url)) {
|
|
scrape(doc);
|
|
} else {
|
|
var items = Zotero.Utilities.getItemArray(doc, doc, articleRegexp);
|
|
items = Zotero.selectItems(items);
|
|
|
|
if(!items) {
|
|
return true;
|
|
}
|
|
|
|
var urls = new Array();
|
|
for(var i in items) {
|
|
urls.push(i);
|
|
}
|
|
|
|
Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
|
|
Zotero.wait();
|
|
}
|
|
} |