e20b02c124
And warn instead of throwing in the future. Fixes #4162
1658 lines
45 KiB
JavaScript
1658 lines
45 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
/* eslint-disable quote-props */
|
|
/* globals SAXXMLReader */
|
|
|
|
"use strict";
|
|
|
|
/**
 * Send a message to the Zotero debug log, tagged as coming from the feed processor.
 *
 * @param {*} str - Message to log; it is concatenated onto the "Feed Processor: " prefix
 */
function LOG(str) {
	const message = "Feed Processor: " + str;
	Zotero.debug(message);
}
|
|
|
|
// Namespace URI bound to the reserved "xml" prefix (the namespace of xml:base and similar attributes).
const XMLNS = "http://www.w3.org/XML/1998/namespace";
|
|
// Namespace URI of the Netscape RDF-based RSS 0.9 format.
const RSS090NS = "http://my.netscape.com/rdf/simple/0.9/";
|
|
|
|
/** *** Some general utils *****/
|
|
/**
 * Parse a link into a URL object, optionally resolving it against a base.
 *
 * @param {string} link - Absolute or relative URL string
 * @param {string|URL} [base] - Base to resolve against; any falsy value means "no base"
 * @returns {URL|null} The parsed URL, or null when the URL constructor rejects the input
 */
function strToURI(link, base) {
	// The URL constructor only skips base resolution when the second argument is
	// undefined, so collapse null, "" and other falsy values to undefined.
	const effectiveBase = base ? base : undefined;
	let parsed = null;
	try {
		parsed = new URL(link, effectiveBase);
	}
	catch (e) {
		// Unparseable input is reported as null instead of an exception
	}
	return parsed;
}
|
|
|
|
/**
 * Test whether a value is an array.
 *
 * Uses Array.isArray rather than comparing `constructor` to the local Array:
 * the constructor check rejects arrays created in another global and Array
 * subclasses, and accepts any object that merely has a `constructor: Array`
 * property.
 *
 * @param {*} a - Value to test
 * @returns {boolean} true only for genuine arrays
 */
function isArray(a) {
	return Array.isArray(a);
}
|
|
|
|
/**
 * Test whether a value is object-like: a non-null object or a function.
 *
 * @param {*} a - Value to test
 * @returns {boolean}
 */
function isObject(a) {
	if (typeof a == "function") {
		return true;
	}
	// typeof null is "object", so null has to be excluded through truthiness
	return Boolean(a) && typeof a == "object";
}
|
|
|
|
/**
 * Test whether a value is callable.
 *
 * @param {*} a - Value to test
 * @returns {boolean}
 */
function isFunction(a) {
	const kind = typeof a;
	return kind === "function";
}
|
|
|
|
/**
 * Remove everything that looks like a markup tag from a string.
 *
 * @param {string} someHTML - Markup to strip
 * @returns {string} The input with every "<...>" run (at least one character between the brackets) removed
 */
function stripTags(someHTML) {
	const anyTag = /<[^>]+>/g;
	return someHTML.replace(anyTag, "");
}
|
|
|
|
// Prefix under which a link relation can be written as a full IANA URI
const IANA_URI = "http://www.iana.org/assignments/relation/";

/**
 * Select the Atom links that carry a given relation.
 *
 * A link matches when its `rel` equals the requested relation, either as a bare
 * name or qualified with the IANA relation URI. A link without `rel` counts as
 * "alternate". Links without an `href` are never returned.
 *
 * @param {string} rel - Relation to look for (e.g. "alternate", "enclosure")
 * @param {Object[]} links - Link property bags
 * @returns {Object[]} The matching property bags, in their original order
 */
function findAtomLinks(rel, links) {
	const matches = [];
	for (let idx = 0; idx < links.length; idx++) {
		const candidate = links[idx];
		// atom:link MUST have @href
		if (!candidate.href) {
			continue;
		}
		const declaredRel = candidate.rel ? candidate.rel : null;
		const isImplicitAlternate = !declaredRel && rel == "alternate";
		if (isImplicitAlternate
				|| declaredRel == rel
				|| declaredRel == IANA_URI + rel) {
			matches.push(candidate);
		}
	}
	return matches;
}
|
|
|
|
/**
 * Escape the five XML special characters so a string can be embedded in
 * element content or in a quoted attribute value.
 *
 * @param {string} s - Raw text
 * @returns {string} Text with & > < " ' replaced by their entity references
 */
function xmlEscape(s) {
	// The ampersand must be handled first, otherwise the ampersands introduced
	// by the later replacements would be escaped a second time.
	return s
		.replace(/&/g, "&amp;")
		.replace(/>/g, "&gt;")
		.replace(/</g, "&lt;")
		.replace(/"/g, "&quot;")
		.replace(/'/g, "&apos;");
}
|
|
|
|
/**
 * Build an accessor that reads one fixed property from whatever bag it is given.
 *
 * @param {string} key - Property name to read
 * @returns {function(Object): *} Function returning `bag[key]`
 */
function makePropGetter(key) {
	return bag => bag[key];
}
|
|
|
|
const RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

// Namespace map: namespace URI -> the fixed prefix this file uses for it.
// Several URIs can share a prefix (e.g. the different PRISM versions), and the
// two RSS 2.0 URIs map to the empty prefix.
var gNamespaces = {
	"http://webns.net/mvcb/": "admin",

	// RSS 2.0 under its two historical namespace URIs
	"http://backend.userland.com/rss": "",
	"http://blogs.law.harvard.edu/tech/rss": "",

	// Atom 1.0 and Atom 0.3
	"http://www.w3.org/2005/Atom": "atom",
	"http://purl.org/atom/ns#": "atom03",

	"http://purl.org/rss/1.0/modules/content/": "content",
	"http://purl.org/dc/elements/1.1/": "dc",
	"http://purl.org/dc/terms/": "dcterms",
	[RDF_NS]: "rdf",

	// RSS 1.0 and RSS 0.9 share one prefix
	"http://purl.org/rss/1.0/": "rss1",
	"http://my.netscape.com/rdf/simple/0.9/": "rss1",

	"http://wellformedweb.org/CommentAPI/": "wfw",
	"http://purl.org/rss/1.0/modules/wiki/": "wiki",
	"http://www.w3.org/XML/1998/namespace": "xml",

	// Media RSS, with and without the trailing slash
	"http://search.yahoo.com/mrss/": "media",
	"http://search.yahoo.com/mrss": "media",

	// PRISM variants
	"http://prismstandard.org/namespaces/1.2/basic/": "prism",
	"http://prismstandard.org/namespaces/basic/2.0/": "prism",
	"http://prismstandard.org/namespaces/basic/3.0/": "prism",
	"https://prismdb.takanakahiko.me/prism-schema.ttl#": "prism",
};
|
|
|
|
// Allow-list of namespaces that may appear inside XHTML content. It applies to
// attributes only, and maps each namespace URI to the prefix written out for it.
var gAllowedXHTMLNamespaces = {
	"http://www.w3.org/XML/1998/namespace": "xml",

	// If someone namespace-qualifies an XHTML attribute, it has to be written
	// with a prefix to avoid colliding with the unqualified attribute.
	"http://www.w3.org/1999/xhtml": "xhtml",
};
|
|
|
|
/**
 * Outcome of one parse, handed to the result listener.
 * Implements nsIFeedResult.
 *
 * All members default through the prototype; the processor overwrites the ones
 * it knows about on the instance.
 */
function FeedResult() {}

FeedResult.prototype = {
	// Set to true by the processor's fatalError() when the document could not
	// be parsed all the way through
	bozo: false,

	// The Feed built from the document; stays null when a non-feed was parsed
	doc: null,

	// Feed version string, assigned together with `doc`
	version: null,

	// NOTE(review): not assigned anywhere in the visible part of this file
	headers: null,

	// URI the feed was loaded from, when one was given to the processor
	uri: null,

	// NOTE(review): not assigned anywhere in the visible part of this file
	stylesheet: null,
};
|
|
|
|
/**
 * Container for feed-level metadata and the list of entries.
 * Implements nsIFeed, nsIFeedContainer.
 */
function Feed() {
	Object.assign(this, {
		subtitle: null,
		title: null,
		items: [],
		link: null,
		id: null,
		generator: null,
		authors: [],
		contributors: [],
		baseURI: null,
		enclosureCount: 0,
		type: Feed.TYPE_FEED,
	});
}

// Feed kinds stored in Feed#type. TYPE_FEED is the default; the media kinds are
// power-of-two values.
Feed.TYPE_FEED = 0;
Feed.TYPE_AUDIO = 1;
Feed.TYPE_IMAGE = 2;
Feed.TYPE_VIDEO = 4;
|
|
|
|
Feed.prototype = {
	// For each normalized Feed property, the raw field names that may supply it.
	// A [name, fn] pair means the raw value is passed through fn before being
	// stored. fieldsToObj() does the actual mapping.
	searchLists: {
		title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
		subtitle: [
			"description",
			"dc:description",
			"rss1:description",
			"atom03:tagline",
			"atom:subtitle",
		],
		pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
		items: ["items", "atom03_entries", "entries"],
		id: ["atom:id", "rdf:about"],
		generator: ["generator"],
		authors: ["authors"],
		contributors: ["contributors"],
		link: [["link", strToURI], ["rss1:link", strToURI]],
		categories: ["categories", "dc:subject"],
		rights: [
			"dc:rights",
			"atom03:rights",
			"atom:rights",
			"copyright",
			"prism:copyright",
		],
		cloud: ["cloud"],
		image: ["image", "rss1:image", "atom:logo"],
		textInput: ["textInput", "rss1:textinput"],
		skipDays: ["skipDays"],
		skipHours: ["skipHours"],
		ttl: ["ttl"],
		updated: [
			"pubDate",
			"lastBuildDate",
			"atom03:modified",
			"dc:date",
			"dcterms:modified",
			"atom:updated",
		],
		issn: ["prism:issn"],
		isbn: ["isbn", "prism:isbn"],
		language: ["language", "dc:language"],
		publisher: ["dc:publisher"],
	},

	/**
	 * Promote the raw parsed fields to top-level properties and tidy them up:
	 * unwrap skipDays/skipHours, pick the Atom alternate link, classify the
	 * feed by its enclosures, absolutize the image URL and reduce the title and
	 * subtitle fields to their raw text.
	 */
	normalize() {
		fieldsToObj(this, this.searchLists);

		// skipDays/skipHours arrive as wrapper bags; keep only the inner lists
		if (this.skipDays) {
			this.skipDays = this.skipDays.days;
		}
		if (this.skipHours) {
			this.skipHours = this.skipHours.hours;
		}

		// Atom feeds carry their link as atom:link elements
		if (this.fields.links) {
			this._atomLinksToURI();
		}

		this._calcEnclosureCountAndFeedType();

		// Image URLs may be relative to the feed or to an xml:base
		if (this.image && this.image.url) {
			this._resolveImageLink();
		}

		const { subtitle, title } = this.searchLists;
		this._resetBagMembersToRawText([subtitle, title]);
	},

	/**
	 * Count the enclosures of all items and derive the feed type from them.
	 * Sets `this.type` and `this.enclosureCount`.
	 */
	_calcEnclosureCountAndFeedType() {
		const counts = { audio: 0, image: 0, video: 0, other: 0 };
		const mediaKind = /^(audio|image|video)/;
		let entriesWithEnclosures = 0;

		for (let i = 0; i < this.items.length; i++) {
			const enclosures = this.items[i].enclosures;
			if (!enclosures || !(enclosures.length > 0)) {
				continue;
			}
			entriesWithEnclosures++;

			for (let j = 0; j < enclosures.length; j++) {
				const type = enclosures[j].type;
				// Untyped enclosures and unknown media types both count as "other"
				const match = type ? mediaKind.exec(type) : null;
				counts[match ? match[1] : "other"]++;
			}
		}

		// For a feed to be marked as TYPE_VIDEO, TYPE_AUDIO or TYPE_IMAGE,
		// two things are enforced:
		//
		// 1. all entries must have at least one enclosure
		// 2. all enclosures must be of that single media kind
		//
		// Otherwise it's a TYPE_FEED.
		let feedtype = Feed.TYPE_FEED;
		if (entriesWithEnclosures == this.items.length && counts.other == 0) {
			const kindsPresent = ["audio", "image", "video"].filter(kind => counts[kind] > 0);
			if (kindsPresent.length == 1) {
				const typeByKind = {
					audio: Feed.TYPE_AUDIO,
					image: Feed.TYPE_IMAGE,
					video: Feed.TYPE_VIDEO,
				};
				feedtype = typeByKind[kindsPresent[0]];
			}
		}

		this.type = feedtype;
		this.enclosureCount = counts.other + counts.video + counts.audio + counts.image;
	},

	/**
	 * Set `this.link` from the first rel="alternate" Atom link, resolving it
	 * against that link's xml:base when it has one.
	 */
	_atomLinksToURI() {
		const [firstAlternate] = findAtomLinks("alternate", this.fields.links);
		if (!firstAlternate) {
			return;
		}
		const base = firstAlternate["xml:base"] || undefined;
		this.link = this._resolveURI(firstAlternate.href, base);
	},

	/**
	 * Replace `this.image.url` with its absolute form when it can be resolved;
	 * an unresolvable URL is left untouched.
	 */
	_resolveImageLink() {
		const base = this.image["xml:base"] || undefined;
		const resolved = this._resolveURI(this.image.url, base);
		if (resolved) {
			this.image.url = resolved.href;
		}
	},

	/**
	 * Resolve a link against an optional xml:base, which is itself resolved
	 * against this container's baseURI.
	 *
	 * @param {string} linkSpec - Link to resolve
	 * @param {string} [baseSpec] - xml:base value in effect for the link
	 * @returns {URL|null} The resolved URL, or null when it cannot be parsed
	 */
	_resolveURI(linkSpec, baseSpec) {
		try {
			let base = this.baseURI;
			if (baseSpec) {
				base = strToURI(baseSpec, this.baseURI);
			}
			return strToURI(linkSpec, base);
		}
		catch (e) {
			LOG(e);
			return null;
		}
	},

	/**
	 * Reset bag members to raw contents, not text constructs: every listed
	 * field that is present is replaced by its `.text`.
	 *
	 * @param {string[][]} fieldLists - Lists of field names to reset
	 */
	_resetBagMembersToRawText(fieldLists) {
		for (const fieldNames of fieldLists) {
			for (const fieldName of fieldNames) {
				const textConstruct = this.fields[fieldName];
				if (textConstruct) {
					this.fields[fieldName] = textConstruct.text;
				}
			}
		}
	},
};
|
|
|
|
/**
 * Container for a single feed item.
 * Implements nsIFeedEntry, nsIFeedContainer.
 */
function Entry() {
	Object.assign(this, {
		summary: null,
		content: null,
		title: null,
		// Raw parsed fields, keyed by prefixed element name
		fields: {},
		link: null,
		id: null,
		baseURI: null,
		updated: null,
		published: null,
		authors: [],
		contributors: [],
	});
}
|
|
|
|
Entry.prototype = {
	fields: null,
	enclosures: null,
	mediaContent: null,

	// For each normalized Entry property, the raw field names that may supply
	// it. A [name, fn] pair means the raw value is passed through fn before
	// being stored. fieldsToObj() does the actual mapping.
	searchLists: {
		title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
		link: [["link", strToURI], ["rss1:link", strToURI]],
		id: [
			["guid", makePropGetter("guid")],
			"rdf:about",
			"atom03:id",
			"atom:id",
		],
		authors: ["authors"],
		contributors: ["contributors"],
		summary: [
			"description",
			"rss1:description",
			"dc:description",
			"atom03:summary",
			"atom:summary",
		],
		content: ["content:encoded", "atom03:content", "atom:content"],
		rights: [
			"dc:rights",
			"atom03:rights",
			"atom:rights",
			"copyright",
			"prism:copyright",
		],
		published: [
			"dc:date",
			"pubDate",
			"atom03:issued",
			"dcterms:issued",
			"atom:published",
			"prism:publicationDate",
		],
		updated: [
			"pubDate",
			"atom03:modified",
			"dc:date",
			"dcterms:modified",
			"atom:updated",
			"prism:modificationDate",
		],
		pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
		pubType: ["pubType"],
		startPage: ["startPage", "prism:startingPage"],
		endPage: ["endPage", "prism:endingPage"],
		pageRange: ["prism:pageRange"],
		issn: ["prism:issn"],
		isbn: ["isbn", "prism:isbn"],
		identifier: [
			"dc:identifier",
			"prism:doi",
		],
		publisher: ["dc:publisher"],
		language: ["language", "dc:language"],
		volume: ["prism:volume"],
		issue: ["prism:number"],
		section: ["prism:section"],
		url: ["prism:url"],
	},

	/**
	 * Promote the raw parsed fields to top-level properties, pick the entry
	 * link (Atom alternate link, or a permalink guid as a fallback), collect
	 * the enclosures and reduce the content, summary and title fields to their
	 * raw text.
	 */
	normalize: function () {
		fieldsToObj(this, this.searchLists);

		// Assign Atom link if needed
		if (this.fields.links) {
			this._atomLinksToURI();
		}

		// Populate enclosures array
		this._populateEnclosures();

		// The link might be a guid w/ permalink=true
		if (!this.link && this.fields.guid) {
			const guid = this.fields.guid;
			// A guid is a permalink unless isPermaLink is explicitly "false"
			let isPermaLink = true;
			if (guid.isPermaLink) {
				isPermaLink = guid.isPermaLink.toLowerCase() != "false";
			}
			if (isPermaLink) {
				this.link = strToURI(guid.guid);
			}
		}

		this._resetBagMembersToRawText([
			this.searchLists.content,
			this.searchLists.summary,
			this.searchLists.title,
		]);
	},

	/**
	 * Collect enclosures from every source the entry may carry: Atom
	 * enclosure links, the RSS2 enclosure element, media:content,
	 * media:thumbnail and media:content nested in media:group.
	 */
	_populateEnclosures: function () {
		if (this.fields.links) {
			this._atomLinksToEnclosures();
		}

		// Add RSS2 enclosure to enclosures
		if (this.fields.enclosure) {
			this._enclosureToEnclosures();
		}

		// Add media:content to enclosures
		if (this.fields.mediacontent) {
			this._mediaToEnclosures("mediacontent");
		}

		// Add media:thumbnail to enclosures
		if (this.fields.mediathumbnail) {
			this._mediaToEnclosures("mediathumbnail");
		}

		// Add media:content in media:group to enclosures
		if (this.fields.mediagroup) {
			this._mediaToEnclosures("mediagroup", "mediacontent");
		}
	},

	// URL -> enclosure already stored for that URL; created on first use
	__enclosureMap: null,

	/**
	 * Add an enclosure, de-duplicating by URL. When the URL is already known,
	 * the stored enclosure only gains the type/length it was missing.
	 *
	 * @param {Object} newEnc - Enclosure with at least a `url`
	 */
	_addToEnclosures: function (newEnc) {
		// items we add to the enclosures array get displayed in the FeedWriter and
		// they must have non-empty urls.
		if (!newEnc.url) {
			return;
		}

		if (this.__enclosureMap === null) {
			// No prototype: URLs come from the feed, and a value such as
			// "constructor" must not resolve to an inherited Object member.
			this.__enclosureMap = Object.create(null);
		}

		const previousEnc = this.__enclosureMap[newEnc.url];
		if (previousEnc != undefined) {
			if (!previousEnc.type && newEnc.type) {
				previousEnc.type = newEnc.type;
			}
			if (!previousEnc.length && newEnc.length) {
				previousEnc.length = newEnc.length;
			}
			return;
		}

		if (this.enclosures === null) {
			this.enclosures = [];
		}

		this.enclosures.push(newEnc);
		this.__enclosureMap[newEnc.url] = newEnc;
	},

	/**
	 * Turn every Atom rel="enclosure" link into an enclosure.
	 */
	_atomLinksToEnclosures: function () {
		const encLinks = findAtomLinks("enclosure", this.fields.links);

		for (const link of encLinks) {
			// An enclosure must have an href. Skip only this link: returning
			// here would drop every enclosure listed after it.
			if (!link.href) {
				continue;
			}

			// copy Atom bits over to equivalent enclosure bits
			const enc = { url: link.href };
			if (link.type) {
				enc.type = link.type;
			}
			if (link.length) {
				enc.length = link.length;
			}

			this._addToEnclosures(enc);
		}
	},

	/**
	 * Add the RSS2 enclosure element, provided it has a URL.
	 */
	_enclosureToEnclosures: function () {
		const enc = this.fields.enclosure;
		if (!enc.url) {
			return;
		}
		this._addToEnclosures(enc);
	},

	/**
	 * Turn Media RSS elements into enclosures.
	 *
	 * @param {string} mediaType - Field holding the media elements, or the
	 *     group that contains them when contentType is given
	 * @param {string} [contentType] - Name of the array inside the group
	 */
	_mediaToEnclosures: function (mediaType, contentType) {
		let content;

		// If a contentType is specified, the mediaType is a simple propertybag,
		// and the contentType is an array inside it.
		if (contentType) {
			const group = this.fields[mediaType];
			content = group[contentType];
		}
		else {
			content = this.fields[mediaType];
		}

		// A media:group can hold nothing but thumbnails, in which case there is
		// no content array to walk.
		if (!content) {
			return;
		}

		for (let i = 0; i < content.length; ++i) {
			const contentElement = content[i];

			// media:content don't require url, but if it's not there, we should
			// skip it.
			if (!contentElement.url) {
				continue;
			}

			// copy media:content bits over to equivalent enclosure bits
			const enc = { url: contentElement.url };
			if (contentElement.type) {
				enc.type = contentElement.type;
			}
			else if (mediaType == "mediathumbnail") {
				// thumbnails won't have a type, but default to image types
				enc.type = "image/*";
				enc.thumbnail = true;
			}

			if (contentElement.fileSize) {
				enc.length = contentElement.fileSize;
			}

			this._addToEnclosures(enc);
		}
	},
};
|
|
|
|
// Entry borrows Feed's link-resolution and bag-reset helpers unchanged.
for (const sharedMethod of ["_atomLinksToURI", "_resolveURI", "_resetBagMembersToRawText"]) {
	Entry.prototype[sharedMethod] = Feed.prototype[sharedMethod];
}
|
|
|
|
/**
 * TextConstruct represents an element that could contain (X)HTML.
 * Implements nsIFeedTextConstruct.
 *
 * `type` is "text" by default; createDocumentFragment() also understands
 * "html" and "xhtml".
 */
function TextConstruct() {
	Object.assign(this, {
		lang: null,
		base: null,
		type: "text",
		text: null,
	});
}

TextConstruct.prototype = {
	/**
	 * @returns {string} The content as plain text: tags are stripped unless the
	 *     construct is already of type "text"
	 */
	plainText() {
		return this.type != "text" ? stripTags(this.text) : this.text;
	},

	/**
	 * Build a DocumentFragment holding the content.
	 *
	 * @returns {DocumentFragment|null} A fragment with the text appended as-is
	 *     for "text", or with the parsed document element for "html"/"xhtml";
	 *     null for any other type
	 */
	createDocumentFragment() {
		if (this.type == "text") {
			// Plain text needs no parsing; an empty HTML document only serves
			// as the factory for the fragment.
			const emptyDoc = new DOMParser().parseFromString('<!doctype html>', 'text/html');
			const textFragment = emptyDoc.createDocumentFragment();
			textFragment.append(this.text);
			return textFragment;
		}

		let parserType = null;
		if (this.type == "xhtml") {
			parserType = "application/xhtml+xml";
		}
		else if (this.type == "html") {
			parserType = "text/html";
		}
		if (parserType === null) {
			return null;
		}

		const parsedDoc = new DOMParser().parseFromString(this.text, parserType);
		const markupFragment = parsedDoc.createDocumentFragment();
		markupFragment.append(parsedDoc.documentElement);
		return markupFragment;
	},
};
|
|
|
|
/**
 * Generator represents the software that produced the feed.
 * Implements nsIFeedGenerator, nsIFeedElementBase.
 */
function Generator() {
	this.lang = null;
	this.agent = null;
	this.version = null;
	this.uri = null;

	// nsIFeedElementBase
	this._attributes = null;
	this.baseURI = null;
}

Generator.prototype = {
	get attributes() {
		return this._attributes;
	},

	/**
	 * Store the element's attributes and derive `version`, `uri` and, for the
	 * RSS1 rdf:resource form, `agent` from them.
	 *
	 * @param {NamedNodeMap} value - Attribute map exposing getNamedItemNS()
	 */
	set attributes(value) {
		this._attributes = value;

		// Value of an attribute, or undefined when the attribute is absent
		const readAttribute = (namespace, name) => (value.getNamedItemNS(namespace, name) || {}).value;

		this.version = readAttribute("", "version");

		// Only resolve an address that is actually there: passing undefined
		// together with a base would resolve the literal text "undefined"
		// against that base.
		const uriAttribute = readAttribute("", "uri") || readAttribute("", "url");
		this.uri = uriAttribute ? strToURI(uriAttribute, this.baseURI) : null;

		// RSS1
		const resource = readAttribute(RDF_NS, "resource");
		if (resource) {
			this.agent = resource;
			this.uri = strToURI(resource, this.baseURI);
		}
	},
};
|
|
|
|
/**
 * An author or contributor.
 * Implements nsIFeedPerson, nsIFeedElementBase.
 */
function Person() {
	Object.assign(this, {
		name: null,
		uri: null,
		email: null,

		// nsIFeedElementBase
		attributes: null,
		baseURI: null,
	});
}
|
|
|
|
/**
 * Map a list of fields into properties on a container.
 *
 * Every search list is walked to its end, so when several of the listed fields
 * are present on the container, the one listed last is the one that sticks.
 * Fields whose value is falsy are treated as absent.
 *
 * @param container  An nsIFeedContainer; its raw values are read from
 *                   `container.fields` and results are written onto the
 *                   container itself
 * @param fields     Object mapping a property name to the list of field names
 *                   to search for. List members can be a [name, fn] list, in
 *                   which case fn is a transformation function (like parseInt)
 *                   applied to the raw value.
 */
function fieldsToObj(container, fields) {
	for (const key in fields) {
		const searchList = fields[key];
		for (let i = 0; i < searchList.length; i++) {
			const candidate = searchList[i];
			const hasTransform = isArray(candidate);
			const fieldName = hasTransform ? candidate[0] : candidate;
			const rawValue = container.fields[fieldName];
			if (!rawValue) {
				continue;
			}
			container[key] = hasTransform ? candidate[1](rawValue) : rawValue;
		}
	}
}
|
|
|
|
/**
 * Post-process a generator element: store the element text as its agent.
 *
 * @param {string} s - Text content of the element
 * @param {Object} generator - The generator being closed
 * @returns {Object} The same generator
 */
function atomGenerator(s, generator) {
	return Object.assign(generator, { agent: s.trim() });
}
|
|
|
|
/**
 * Post-process atom:logo to create an RSS2-like structure: the element text
 * becomes the logo's `url`.
 *
 * @param {string} s - Text content of the element
 * @param {Object} logo - The logo bag being closed
 * @returns {Object} The same logo bag, as the other post-processors return
 *     their element
 */
function atomLogo(s, logo) {
	logo.url = s.trim();
	return logo;
}
|
|
|
|
/**
 * Post-process an RSS category, mapping its text to the Atom `term` field.
 *
 * @param {string} s - Text content of the category element
 * @param {Object} cat - The category bag being closed
 * @returns {Object} The same category bag
 */
function rssCatTerm(s, cat) {
	// add slash handling?
	return Object.assign(cat, { term: s.trim() });
}
|
|
|
|
/**
 * Post-process a GUID: store the trimmed element text under `guid`.
 *
 * @param {string} s - Text content of the guid element
 * @param {Object} guid - The guid bag being closed
 * @returns {Object} The same guid bag
 */
function rssGuid(s, guid) {
	return Object.assign(guid, { guid: s.trim() });
}
|
|
|
|
/**
 * Post-process an RSS author element.
 *
 * It can contain a field like this:
 *
 *    <author>lawyer@boyer.net (Lawyer Boyer)</author>
 *
 * or, delightfully, a field like this:
 *
 *    <dc:creator>Simon St.Laurent (mailto:simonstl@simonstl.com)</dc:creator>
 *
 * The text is split up and assigned to the corresponding Atom fields.
 *
 * @param {string} s - Text content of the author element
 * @param {Object} author - The person being closed; gains `name` and, when an
 *     address can be identified, `email`
 * @returns {Object} The same person
 */
function rssAuthor(s, author) {
	const chars = s.trim();
	// "something (something else)"
	const matches = chars.match(/(.*)\((.*)\)/);
	const emailCheck
		= /^([a-zA-Z0-9_.-])+@(([a-zA-Z0-9-])+\.)+([a-zA-Z0-9]{2,4})+$/;

	if (!matches) {
		author.name = chars;
		// Only text with an "@" after its first character is kept as an
		// address. A bare truthiness test on indexOf() would also accept
		// every string without any "@", because -1 is truthy.
		if (chars.indexOf("@") > 0) {
			author.email = chars;
		}
		return author;
	}

	const outside = matches[1].trim();
	let inside = matches[2].trim();
	if (inside.indexOf("mailto:") == 0) {
		inside = inside.substring(7);
	}

	// Either half may be the address
	if (emailCheck.test(outside)) {
		author.email = outside;
		author.name = inside;
	}
	else if (emailCheck.test(inside)) {
		author.email = inside;
		author.name = outside;
	}
	else {
		// put it back together
		author.name = outside + " (" + inside + ")";
	}
	return author;
}
|
|
|
|
const XHTML_NS = "http://www.w3.org/1999/xhtml";

/**
 * The XHTMLHandler handles inline XHTML found in things like atom:summary.
 * It receives SAX-style events, serializes the XHTML elements it sees into a
 * string and hands that string back to the processor once the element that
 * contained the XHTML is closed.
 *
 * @param {Object} processor - Object exposing returnFromXHTMLHandler()
 * @param {boolean} isAtom - Whether the content comes from Atom, where the
 *     outermost div is a wrapper and not part of the content
 */
function XHTMLHandler(processor, isAtom) {
	this._buf = "";
	this._processor = processor;
	this._depth = 0;
	this._isAtom = isAtom;
	// a stack of lists tracking in-scope namespaces
	this._inScopeNS = [];
}

// The fidelity can be improved here, to allow handling of stuff like
// SVG and MathML. XXX
XHTMLHandler.prototype = {
	/**
	 * Look back up at the declared namespaces. The same prefixes are always
	 * used for the allowed namespaces, so knowing the URI is enough.
	 *
	 * @param {string} ns - Namespace URI
	 * @returns {boolean} Whether an xmlns declaration for it was already
	 *     written on an element that is still open
	 */
	_isInScope: function (ns) {
		return this._inScopeNS.some(declared => declared.includes(ns));
	},

	startDocument: function () {
	},

	endDocument: function () {
	},

	/**
	 * Serialize the start tag of an XHTML element. Elements from any other
	 * namespace are ignored, but still counted for depth tracking.
	 */
	startElement: function (namespace, localName, qName, attributes) {
		++this._depth;
		this._inScopeNS.push([]);

		// RFC4287 requires XHTML to be wrapped in a div that is *not* part of
		// the content. This prevents people from screwing up namespaces, but
		// we need to skip it here.
		if (this._isAtom && this._depth == 1 && localName == "div") {
			return;
		}

		// If it's an XHTML element, record it. Otherwise, it's ignored.
		if (namespace != XHTML_NS) {
			return;
		}

		this._buf += "<" + localName;
		for (let i = 0; i < attributes.length; ++i) {
			const attribute = attributes.item(i);
			const uri = attribute.namespaceURI;

			// XHTML attributes aren't in a namespace. DOM attribute maps report
			// that as null rather than "", so accept any empty value.
			if (!uri) {
				this._buf += (" " + attribute.localName + "='"
					+ xmlEscape(attribute.value) + "'");
				continue;
			}

			// Write a small set of allowed attribute namespaces. The URI comes
			// from the feed, so only the table's own entries may match.
			if (!Object.prototype.hasOwnProperty.call(gAllowedXHTMLNamespaces, uri)) {
				continue;
			}
			const prefix = gAllowedXHTMLNamespaces[uri];
			if (!prefix) {
				continue;
			}

			this._buf += (" " + prefix + ":"
				+ attribute.localName
				+ "='" + xmlEscape(attribute.value) + "'");

			// write an xmlns declaration if necessary
			if (prefix != "xml" && !this._isInScope(uri)) {
				this._inScopeNS[this._inScopeNS.length - 1].push(uri);
				this._buf += " xmlns:" + prefix + "='" + uri + "'";
			}
		}
		this._buf += ">";
	},

	/**
	 * Serialize the end tag of an XHTML element, or hand the collected markup
	 * back to the processor when the element closed is the one that contained
	 * the XHTML.
	 */
	endElement: function (uri, localName, qName) {
		--this._depth;
		this._inScopeNS.pop();

		// We need to skip outer divs in Atom. See comment in startElement.
		if (this._isAtom && this._depth == 0 && localName == "div") {
			return;
		}

		// When we peek too far, go back to the main processor
		if (this._depth < 0) {
			this._processor.returnFromXHTMLHandler(this._buf.trim(), uri, localName, qName);
			return;
		}

		// If it's an XHTML element, record it. Otherwise, it's ignored.
		if (uri == XHTML_NS) {
			this._buf += "</" + localName + ">";
		}
	},

	characters: function (data) {
		this._buf += xmlEscape(data);
	},

	processingInstruction: function () {
	},
};
|
|
|
|
/**
 * The ExtensionHandler deals with elements we haven't explicitly
 * added to our transition table in the FeedProcessor.
 *
 * It remembers the outermost element it is started on, gathers that element's
 * text, and reports back to the processor when the element is closed.
 *
 * @param {Object} processor - The FeedProcessor; must expose returnFromExtHandler()
 */
function ExtensionHandler(processor) {
	Object.assign(this, {
		_buf: "",
		_depth: 0,
		_hasChildElements: false,

		// The FeedProcessor
		_processor: processor,

		// Fields of the outermost extension element.
		_localName: null,
		_uri: null,
		_qName: null,
		_attrs: null,
	});
}

ExtensionHandler.prototype = {
	startDocument() {
	},

	endDocument() {
	},

	/**
	 * Track nesting; the first element seen is recorded as the extension element.
	 */
	startElement(uri, localName, qName, attrs) {
		this._depth += 1;

		if (this._depth == 1) {
			Object.assign(this, {
				_uri: uri,
				_localName: localName,
				_qName: qName,
				_attrs: attrs,
			});
		}

		// if we descend into another element, we won't send text
		this._hasChildElements = this._depth > 1;
	},

	/**
	 * When the outermost element closes, hand its namespace, name, text and
	 * attributes to the processor. The text is null if child elements were seen.
	 */
	endElement(_uri, _localName, _qName) {
		this._depth -= 1;
		if (this._depth != 0) {
			return;
		}

		const text = this._hasChildElements ? null : this._buf.trim();
		this._processor.returnFromExtHandler(this._uri, this._localName, text, this._attrs);
	},

	/**
	 * Collect character data until a child element has been seen.
	 */
	characters(data) {
		if (this._hasChildElements) {
			return;
		}
		this._buf += data;
	},

	processingInstruction() {
	},
};
|
|
|
|
|
|
/**
 * ElementInfo is a simple container object that describes
 * some characteristics of a feed element. For example, it
 * says whether an element can be expected to appear more
 * than once inside a given entry or feed.
 *
 * @param {string} fieldName - Field under which the element is stored
 * @param {Function|null} containerClass - Constructor for the element's
 *     container object, if it has one
 * @param {Function|null} closeFunc - Post-processing function for the element
 * @param {boolean} isArray - Whether the element may occur more than once
 */
function ElementInfo(fieldName, containerClass, closeFunc, isArray) {
	Object.assign(this, {
		fieldName,
		containerClass,
		closeFunc,
		isArray,
		isWrapper: false,
	});
}
|
|
|
|
/**
 * FeedElementInfo represents a feed element, usually the root.
 *
 * @param {string} fieldName - Name identifying the feed element
 * @param {string} feedVersion - Feed version that the element implies
 */
function FeedElementInfo(fieldName, feedVersion) {
	Object.assign(this, {
		isWrapper: false,
		fieldName,
		feedVersion,
	});
}
|
|
|
|
/**
 * Some feed formats include vestigial wrapper elements that we don't
 * want to include in our object model, but we do need to keep track
 * of during parsing.
 *
 * @param {string} fieldName - Name identifying the wrapper element
 */
function WrapperElementInfo(fieldName) {
	Object.assign(this, {
		isWrapper: true,
		fieldName,
	});
}
|
|
|
|
/** *** The Processor *****/

/**
 * Streaming feed parser state: the SAX reader, the parse stacks and the two
 * lookup tables that drive element handling.
 *
 * Implements nsIFeedProcessor, nsISAXContentHandler, nsISAXErrorHandler,
 * nsIStreamListener, nsIRequestObserver.
 */
function FeedProcessor() {
	// Factories for the ElementInfo shapes that recur in the transition table.
	// Every call builds a fresh ElementInfo, so no two table slots share one.
	const rssPeople = fieldName => new ElementInfo(fieldName, Person, rssAuthor, true);
	const atomPeople = fieldName => new ElementInfo(fieldName, Person, null, true);
	const repeated = fieldName => new ElementInfo(fieldName, null, null, true);
	const single = fieldName => new ElementInfo(fieldName, null, null, false);
	const entryList = fieldName => new ElementInfo(fieldName, Entry, null, true);
	const rssCategory = () => new ElementInfo("categories", null, rssCatTerm, true);
	const generatorWith = closeFunc => new ElementInfo("generator", Generator, closeFunc, false);

	this._reader = new SAXXMLReader();
	this._buf = "";
	this._feed = {};
	this._handlerStack = [];
	this._xmlBaseStack = []; // sparse array keyed to nesting depth
	this._depth = 0;
	this._state = "START";
	this._result = null;
	this._extensionHandler = null;
	this._xhtmlHandler = null;
	this._haveSentResult = false;

	// The nsIFeedResultListener waiting for the parse results
	this.listener = null;

	// These elements can contain (X)HTML or plain text.
	// This table holds their default treatment.
	this._textConstructs = {
		"atom:title": "text",
		"atom:summary": "text",
		"atom:rights": "text",
		"atom:content": "text",
		"atom:subtitle": "text",
		"description": "html",
		"rss1:description": "html",
		"dc:description": "html",
		"content:encoded": "html",
		"title": "text",
		"rss1:title": "text",
		"atom03:title": "text",
		"atom03:tagline": "text",
		"atom03:summary": "text",
		"atom03:content": "text",
		"dc:title": "text",
		"dc:rights": "text",
		"atom03:rights": "text",
		"copyright": "text",
		"prism:copyright": "text",
	};
	this._stack = [];

	// Transition table: parser state -> prefixed element name -> description
	// of the element
	this._trans = {
		"START": {
			// If we hit a root RSS element, treat as RSS2.
			"rss": new FeedElementInfo("RSS2", "rss2"),

			// If we hit an RDF element, it could be RSS1, but we can't
			// verify that until we hit a rss1:channel element.
			"rdf:RDF": new WrapperElementInfo("RDF"),

			// If we hit a Atom 1.0 element, treat as Atom 1.0.
			"atom:feed": new FeedElementInfo("Atom", "atom"),

			// Treat as Atom 0.3
			"atom03:feed": new FeedElementInfo("Atom03", "atom03"),
		},

		/** ******* RSS2 **********/
		"IN_RSS2": {
			"channel": new WrapperElementInfo("channel"),
		},

		"IN_CHANNEL": {
			"item": entryList("items"),
			"managingEditor": rssPeople("authors"),
			"dc:creator": rssPeople("authors"),
			"dc:author": rssPeople("authors"),
			"dc:contributor": rssPeople("contributors"),
			"category": rssCategory(),
			"cloud": single("cloud"),
			"image": single("image"),
			"textInput": single("textInput"),
			"skipDays": single("skipDays"),
			"skipHours": single("skipHours"),
			"generator": generatorWith(atomGenerator),
		},

		"IN_ITEMS": {
			"author": rssPeople("authors"),
			"dc:creator": rssPeople("authors"),
			"dc:author": rssPeople("authors"),
			"dc:contributor": rssPeople("contributors"),
			"category": rssCategory(),
			"enclosure": single("enclosure"),
			"media:content": repeated("mediacontent"),
			"media:group": single("mediagroup"),
			"media:thumbnail": repeated("mediathumbnail"),
			"guid": new ElementInfo("guid", null, rssGuid, false),
		},

		"IN_SKIPDAYS": {
			"day": repeated("days"),
		},

		"IN_SKIPHOURS": {
			"hour": repeated("hours"),
		},

		"IN_MEDIAGROUP": {
			"media:content": repeated("mediacontent"),
			"media:thumbnail": repeated("mediathumbnail"),
		},

		/** ******* RSS1 **********/
		"IN_RDF": {
			// If we hit a rss1:channel, we can verify that we have RSS1
			"rss1:channel": new FeedElementInfo("rdf_channel", "rss1"),
			"rss1:image": single("image"),
			"rss1:textinput": single("textInput"),
			"rss1:item": entryList("items"),
		},

		"IN_RDF_CHANNEL": {
			"admin:generatorAgent": generatorWith(null),
			"dc:creator": rssPeople("authors"),
			"dc:author": rssPeople("authors"),
			"dc:contributor": rssPeople("contributors"),
		},

		/** ******* ATOM 1.0 **********/
		"IN_ATOM": {
			"atom:author": atomPeople("authors"),
			"atom:generator": generatorWith(atomGenerator),
			"atom:contributor": atomPeople("contributors"),
			"atom:link": repeated("links"),
			"atom:logo": new ElementInfo("atom:logo", null, atomLogo, false),
			"atom:entry": entryList("entries"),
		},

		"IN_ENTRIES": {
			"atom:author": atomPeople("authors"),
			"atom:contributor": atomPeople("contributors"),
			"atom:link": repeated("links"),
		},

		/** ******* ATOM 0.3 **********/
		"IN_ATOM03": {
			"atom03:author": atomPeople("authors"),
			"atom03:contributor": atomPeople("contributors"),
			"atom03:link": repeated("links"),
			"atom03:entry": entryList("atom03_entries"),
			"atom03:generator": generatorWith(atomGenerator),
		},

		"IN_ATOM03_ENTRIES": {
			"atom03:author": atomPeople("authors"),
			"atom03:contributor": atomPeople("contributors"),
			"atom03:link": repeated("links"),
			"atom03:entry": entryList("atom03_entries"),
		},
	};
}
|
|
|
|
// See startElement for a long description of how feeds are processed.
|
|
FeedProcessor.prototype = {
	/**
	 * Set ourselves as the SAX content and error handler, create a fresh
	 * result object and, when one is given, record the base URI.
	 *
	 * @param {*} [uri] - Base URI of the feed. When truthy it is stored on the
	 *     result, on the reader, and as the first entry of the xml:base stack.
	 */
	_init: function (uri) {
		this._reader.contentHandler = this;
		this._reader.errorHandler = this;
		this._result = new FeedResult();
		if (uri) {
			this._result.uri = uri;
			this._reader.baseURI = uri;
			this._xmlBaseStack[0] = uri;
		}
	},

	/**
	 * Called once we figure out what type of feed we're dealing with. Some
	 * feed types require digging a bit further than the root.
	 *
	 * @param {String} version - Feed version identifier to store on the result
	 */
	_docVerified: function (version) {
		const doc = new Feed();
		this._result.doc = doc;
		doc.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
		doc.fields = this._feed;
		this._result.version = version;
	},

	/**
	 * When we're done with the feed, let the listener know what happened.
	 *
	 * The result is cleared afterwards even if the listener throws, so it must
	 * not be touched again once this has run.
	 */
	_sendResult: function () {
		this._haveSentResult = true;
		try {
			// Can be null when a non-feed is fed to us
			if (this._result.doc) {
				this._result.doc.normalize();
			}
		}
		catch (e) {
			LOG("FIXME: " + e);
		}

		try {
			if (this.listener !== null) {
				this.listener.handleResult(this._result);
			}
		}
		finally {
			this._result = null;
		}
	},

	// Parsing functions

	/**
	 * Initialize for the given URI and start the reader's asynchronous parse.
	 *
	 * @param {*} requestObserver - Passed through to the reader unchanged
	 * @param {*} [uri] - Base URI of the feed (see _init)
	 */
	parseAsync: function (requestObserver, uri) {
		this._init(uri);
		this._reader.parseAsync(requestObserver);
	},

	// Fetch API

	/**
	 * Forward a response to the underlying reader.
	 *
	 * @param {*} response
	 * @return {*} Whatever the reader's onResponseAvailable() returns
	 */
	onResponseAvailable(response) {
		return this._reader.onResponseAvailable(response);
	},

	// nsISAXErrorHandler

	/**
	 * We only care about fatal errors. When this happens, we may have parsed
	 * through the feed metadata and some number of entries. The listener can
	 * still show some of that data if it wants, and we'll set the bozo bit to
	 * indicate we were unable to parse all the way through.
	 */
	fatalError: function () {
		// _sendResult() clears the result once it has been handed over, so a
		// late or repeated error has nothing left to flag. Bail out instead of
		// throwing on a null result.
		if (!this._result) {
			return;
		}
		this._result.bozo = true;
		// XXX need to QI to FeedProgressListener
		if (!this._haveSentResult) {
			this._sendResult();
		}
	},

	// nsISAXContentHandler

	startDocument: function () {
		// Nothing to do when the document starts
	},

	/**
	 * Deliver the result at the end of the document unless it was already
	 * sent (e.g. by fatalError).
	 */
	endDocument: function () {
		if (!this._haveSentResult) {
			this._sendResult();
		}
	},

	// The transitions defined above identify elements that contain more
	// than just text. For example RSS items contain many fields, and so
	// do Atom authors. The only commonly used elements that contain
	// mixed content are Atom Text Constructs of type="xhtml", which we
	// delegate to another handler for cleaning. That leaves a couple
	// different types of elements to deal with: those that should occur
	// only once, such as title elements, and those that can occur
	// multiple times, such as the RSS category element and the Atom
	// link element. Most of the RSS1/DC elements can occur multiple
	// times in theory, but in practice, the only ones that do have
	// analogues in Atom.
	//
	// Some elements are also groups of attributes or sub-elements,
	// while others are simple text fields. For the most part, we don't
	// have to pay explicit attention to the simple text elements,
	// unless we want to post-process the resulting string to transform
	// it into some richer object like a Date or URI.
	//
	// Elements that have more sophisticated content models still end up
	// being dictionaries, whether they are based on attributes like RSS
	// cloud, sub-elements like Atom author, or even items and
	// entries. These elements are treated as "containers". It's
	// theoretically possible for a container to have an attribute with
	// the same universal name as a sub-element, but none of the feed
	// formats allow this by default, and I don't know of any extension
	// that works this way.
	//
	startElement: function (uri, localName, qName, attributes) {
		this._buf = "";
		++this._depth;

		// Check for xml:base
		const base = (attributes.getNamedItemNS(XMLNS, "base") || {}).value;
		if (base) {
			this._xmlBaseStack[this._depth]
				= strToURI(base, this._xmlBaseStack[this._xmlBaseStack.length - 1]);
		}

		// To identify the element we're dealing with, we look up the
		// namespace URI in our gNamespaces dictionary, which will give us
		// a "canonical" prefix for a namespace URI. For example, this
		// allows Dublin Core "creator" elements to be consistently mapped
		// to "dc:creator", for easy field access by consumer code. This
		// strategy also happens to shorten up our state table.
		const key = this._prefixForNS(uri) + localName;

		// Check to see if we need to hand this off to our XHTML handler.
		// The elements we're dealing with will look like this:
		//
		// <title type="xhtml">
		//   <div xmlns="http://www.w3.org/1999/xhtml">
		//     A title with <b>bold</b> and <i>italics</i>.
		//   </div>
		// </title>
		//
		// When it returns in returnFromXHTMLHandler, the handler should
		// give us back a string like this:
		//
		// "A title with <b>bold</b> and <i>italics</i>."
		//
		// The Atom spec explicitly says the div is not part of the content,
		// and explicitly allows whitespace collapsing.
		//
		const version = this._result.version;
		if ((version === "atom" || version === "atom03")
				&& this._ownValue(this._textConstructs, key)) {
			const type = (attributes.getNamedItemNS("", "type") || {}).value;
			if (type && type.includes("xhtml")) {
				this._xhtmlHandler = new XHTMLHandler(this, version === "atom");
				this._reader.contentHandler = this._xhtmlHandler;
				return;
			}
		}

		// Check our current state, and see if that state has a defined
		// transition. For example, this._trans["atom:entry"]["atom:author"]
		// will have one, and it tells us to add an item to our authors array.
		// Only the table's own entries count: the key comes from the feed, and
		// a name like "constructor" must not match an inherited member.
		const elementInfo = this._ownValue(this._trans[this._state], key);
		if (!elementInfo) {
			// If we don't have a transition, hand off to extension handler
			this._extensionHandler = new ExtensionHandler(this);
			this._reader.contentHandler = this._extensionHandler;
			this._extensionHandler.startElement(uri, localName, qName, attributes);
			return;
		}

		// This distinguishes wrappers like 'channel' from elements
		// we'd actually like to do something with (which will test true).
		this._handlerStack[this._depth] = elementInfo;
		if (elementInfo.isWrapper) {
			this._state = "IN_" + elementInfo.fieldName.toUpperCase();
			this._stack.push([this._feed, this._state]);
		}
		else if (elementInfo.feedVersion) {
			this._state = "IN_" + elementInfo.fieldName.toUpperCase();

			// Check for the older RSS2 variants
			if (elementInfo.feedVersion === "rss2") {
				elementInfo.feedVersion = this._findRSSVersion(attributes);
			}
			else if (uri === RSS090NS) {
				elementInfo.feedVersion = "rss090";
			}

			this._docVerified(elementInfo.feedVersion);
			this._stack.push([this._feed, this._state]);
			this._mapAttributes(this._feed, attributes);
		}
		else {
			this._state = this._processComplexElement(elementInfo, attributes);
		}
	},

	// In the endElement handler, we decrement the stack and look
	// for cleanup/transition functions to execute. The second part
	// of the state transition works as above in startElement, but
	// the state we're looking for is prefixed with an underscore
	// to distinguish endElement events from startElement events.
	endElement: function (_uri, _localName, _qName) {
		const elementInfo = this._handlerStack[this._depth];
		if (elementInfo && !elementInfo.isWrapper) {
			this._closeComplexElement(elementInfo);
		}

		// cut down xml:base context
		if (this._xmlBaseStack.length === this._depth + 1) {
			this._xmlBaseStack = this._xmlBaseStack.slice(0, this._depth);
		}

		// our new state is whatever is at the top of the stack now
		if (this._stack.length > 0) {
			this._state = this._stack[this._stack.length - 1][1];
		}
		this._handlerStack = this._handlerStack.slice(0, this._depth);
		--this._depth;
	},

	// Buffer up character data. The buffer is cleared with every
	// opening element.
	characters: function (data) {
		this._buf += data;
	},

	/**
	 * Record the href of an xml-stylesheet processing instruction on the
	 * result, resolved against the feed URI. Other targets are ignored.
	 *
	 * @param {String} target
	 * @param {String} data
	 */
	processingInstruction: function (target, data) {
		if (target !== "xml-stylesheet") {
			return;
		}
		const hrefAttribute = data.match(/href=["'](.*?)["']/);
		if (hrefAttribute && hrefAttribute.length === 2) {
			this._result.stylesheet = strToURI(hrefAttribute[1], this._result.uri);
		}
	},

	// end of nsISAXContentHandler

	/**
	 * Handle our more complicated elements--those that contain attributes
	 * and child elements.
	 *
	 * @param {Object} elementInfo - Transition entry for the element
	 * @param {Object} attributes - SAX attributes of the element
	 * @return {String} Name of the new state ("IN_" + upper-cased field name)
	 */
	_processComplexElement: function (elementInfo, attributes) {
		// If the container is an entry/item, it'll need to have its
		// more esoteric properties put in the 'fields' property bag.
		const Class = elementInfo.containerClass;
		let obj;
		if (Class === Entry) {
			obj = new Class();
			obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
			this._mapAttributes(obj.fields, attributes);
		}
		else if (Class) {
			obj = new Class();
			obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
			obj.attributes = attributes; // just set the SAX attributes
		}
		else {
			obj = {};
			this._mapAttributes(obj, attributes);
		}

		// We should have a container/propertyBag that's had its
		// attributes processed. Now we need to attach it to its
		// container, which is whatever is on top of the stack.
		const fieldName = elementInfo.fieldName;
		const container = this._stack[this._stack.length - 1][0];
		let newProp;
		if (elementInfo.isArray) {
			if (!container[fieldName]) {
				container[fieldName] = [];
			}
			newProp = container[fieldName];
			newProp.push(obj);

			// If new object is an nsIFeedContainer, we want to deal with
			// its member nsIPropertyBag instead.
			if (obj.fields) {
				newProp = obj.fields;
			}
		}
		else {
			// Only set the property if the container doesn't have it yet
			if (!container[fieldName]) {
				container[fieldName] = obj;
			}
			newProp = container[fieldName];
		}

		// make our new state name, and push the property onto the stack
		const newState = "IN_" + fieldName.toUpperCase();
		this._stack.push([newProp, newState, obj]);
		return newState;
	},

	// Sometimes we need reconcile the element content with the object
	// model for a given feed. We use helper functions to do the
	// munging, but we need to identify array types here, so the munging
	// happens only to the last element of an array.
	_closeComplexElement: function (elementInfo) {
		const stateTuple = this._stack.pop();
		const container = stateTuple[0];
		const containerParent = stateTuple[2];
		const isList = isArray(container);

		// If it's an array and we have to post-process,
		// grab the last element
		let element = isList ? container[container.length - 1] : container;

		// Run the post-processing function if there is one.
		if (elementInfo.closeFunc) {
			element = elementInfo.closeFunc(this._buf, element);
		}

		// If an nsIFeedContainer was on top of the stack,
		// we need to normalize it
		if (elementInfo.containerClass === Entry) {
			containerParent.normalize();
		}

		// If it's an array, re-set the last element
		if (isList) {
			container[container.length - 1] = element;
		}
	},

	/**
	 * Return table[key] only if `key` is one of the table's own properties.
	 *
	 * Keys are built from element names and namespace URIs found in the feed,
	 * so a plain `table[key]` would also resolve members inherited from
	 * Object.prototype (e.g. "constructor", "toString").
	 *
	 * @param {Object} [table]
	 * @param {String} key
	 * @return {*} The own value, or undefined
	 */
	_ownValue: function (table, key) {
		if (table === null || table === undefined) {
			return undefined;
		}
		return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
	},

	/**
	 * Map a namespace URI to the prefix used in our field names.
	 *
	 * @param {String} [uri]
	 * @return {String|null} "" for no namespace and for backend.userland.com
	 *     namespaces, "<prefix>:" for namespaces listed in gNamespaces, and
	 *     null for any other namespace
	 */
	_prefixForNS: function (uri) {
		if (!uri) {
			return "";
		}
		// Own-property check so that a URI such as "toString" is not mistaken
		// for a known namespace
		const prefix = Object.prototype.hasOwnProperty.call(gNamespaces, uri)
			? gNamespaces[uri]
			: undefined;
		if (prefix) {
			return prefix + ":";
		}
		if (uri.toLowerCase().startsWith("http://backend.userland.com")) {
			return "";
		}
		return null;
	},

	/**
	 * Cycle through the attributes, and set our properties using the
	 * prefix:localNames we find in our namespace dictionary.
	 *
	 * @param {Object} bag - Object receiving one property per attribute
	 * @param {Object} attributes - SAX attributes (length / item(i))
	 */
	_mapAttributes: function (bag, attributes) {
		for (let i = 0; i < attributes.length; ++i) {
			const attribute = attributes.item(i);
			const key = this._prefixForNS(attribute.namespaceURI) + attribute.localName;
			bag[key] = attribute.value;
		}
	},

	/**
	 * Only for RSS2esque formats: derive the feed version from the root
	 * element's "version" attribute.
	 *
	 * @param {Object} attributes - SAX attributes of the root element
	 * @return {String} "rss091".."rss094" for those versions, "rss2" for any
	 *     "2.x" version, and "rssUnknown" otherwise, including when the
	 *     attribute is missing
	 */
	_findRSSVersion: function (attributes) {
		// A feed without a version attribute must not make us throw; treat it
		// like an empty version, which ends up as "rssUnknown"
		const versionItem = attributes.getNamedItemNS("", "version");
		const versionAttr = String((versionItem && versionItem.value) || "").trim();
		const versions = {
			"0.91": "rss091",
			"0.92": "rss092",
			"0.93": "rss093",
			"0.94": "rss094"
		};
		if (Object.prototype.hasOwnProperty.call(versions, versionAttr)) {
			return versions[versionAttr];
		}
		return versionAttr.startsWith("2.") ? "rss2" : "rssUnknown";
	},

	// unknown element values are returned here. See startElement above
	// for how this works.
	returnFromExtHandler: function (uri, localName, chars, attributes) {
		--this._depth;

		// take control of the SAX events
		this._reader.contentHandler = this;
		if (localName === null && chars === null) {
			return;
		}

		// we don't take random elements inside rdf:RDF
		if (this._state === "IN_RDF") {
			return;
		}

		// Grab the top of the stack
		const top = this._stack[this._stack.length - 1];
		if (!top) {
			return;
		}

		let container = top[0];
		// Grab the last element if it's an array
		if (isArray(container)) {
			const contract = this._handlerStack[this._depth].containerClass;
			// check if it's something specific, but not an entry
			if (contract && contract !== Entry) {
				if (contract !== Person) {
					return; // don't know about this interface
				}
				const el = container[container.length - 1];

				let personProp = localName;
				const prefix = this._ownValue(gNamespaces, uri);

				// synonyms
				const isFeedNamespace = uri === ""
					|| (prefix && (prefix.includes("atom") || prefix.includes("rss")));
				if (isFeedNamespace && (personProp === "url" || personProp === "href")) {
					personProp = "uri";
				}

				try {
					// NOTE(review): this compares the value with the *string*
					// "undefined", so it is true for practically every
					// property. It looks like a `typeof` check was intended.
					// Left as is because narrowing it depends on how Person
					// declares its properties -- confirm before changing.
					if (el[personProp] !== "undefined") {
						let propValue = chars;
						// convert URI-bearing values to an nsIURI
						if (personProp === "uri") {
							const base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
							propValue = strToURI(chars, base);
						}
						el[personProp] = propValue;
					}
				}
				catch (e) {
					// ignore XPConnect errors
				}
				// the rest of the function deals with entry- and feed-level stuff
				return;
			}
			container = container[container.length - 1];
		}

		// Make the buffer our new property
		const propName = this._prefixForNS(uri) + localName;

		// But, it could be something containing HTML. If so,
		// we need to know about that. The table gives the default type.
		const defaultType = this._ownValue(this._textConstructs, propName);
		if (defaultType && this._handlerStack[this._depth].containerClass !== null) {
			const newProp = new TextConstruct();
			newProp.text = chars;

			let type = defaultType;
			const typeAttribute = (attributes.getNamedItemNS("", "type") || {}).value;
			if (this._result.version === "atom" && typeAttribute) {
				type = typeAttribute;
			}
			else if (this._result.version === "atom03" && typeAttribute) {
				const lowered = typeAttribute.toLowerCase();
				if (lowered.includes("xhtml")) {
					type = "xhtml";
				}
				else if (lowered.includes("html")) {
					type = "html";
				}
				else if (lowered.includes("text")) {
					type = "text";
				}
			}

			// If it's rss feed-level description, it's not supposed to have html
			if (this._result.version.includes("rss")
					&& this._handlerStack[this._depth].containerClass !== Entry) {
				type = "text";
			}
			newProp.type = type;
			newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
			container[propName] = newProp;
		}
		else {
			container[propName] = chars;
		}
	},

	// Sometimes, we'll hand off SAX handling duties to an XHTMLHandler
	// (see above) that will scrape out non-XHTML stuff, normalize
	// namespaces, and remove the wrapper div from Atom 1.0. When the
	// XHTMLHandler is done, it'll callback here.
	returnFromXHTMLHandler: function (chars, uri, localName, qName) {
		// retake control of the SAX content events
		this._reader.contentHandler = this;

		// Grab the top of the stack
		const top = this._stack[this._stack.length - 1];
		if (!top) {
			return;
		}
		const container = top[0];

		// Assign the property
		const newProp = new TextConstruct();
		newProp.text = chars;
		newProp.type = "xhtml";
		newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
		container[this._prefixForNS(uri) + localName] = newProp;

		// XHTML will cause us to peek too far. The XHTML handler will
		// send us an end element to call. RFC4287-valid feeds allow a
		// more graceful way to handle this. Unfortunately, we can't count
		// on compliance at this point.
		this.endElement(uri, localName, qName);
	},
};
|
|
|
|
// Expose the constructor to CommonJS-style loaders when a `module` object exists
if (typeof module === "object") {
	module.exports = FeedProcessor;
}
|