92bb96ac49
Seems to be more used for copyright owner names than license information, which is probably what we want in that field when it's available.
1658 lines
45 KiB
JavaScript
1658 lines
45 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
/* eslint-disable quote-props */
|
|
/* globals SAXXMLReader */
|
|
|
|
"use strict";
|
|
|
|
function LOG(str) {
|
|
Zotero.debug("Feed Processor: " + str);
|
|
}
|
|
|
|
const XMLNS = "http://www.w3.org/XML/1998/namespace";
|
|
const RSS090NS = "http://my.netscape.com/rdf/simple/0.9/";
|
|
|
|
/** *** Some general utils *****/
|
|
function strToURI(link, base) {
|
|
base = base || undefined;
|
|
try {
|
|
return new URL(link, base);
|
|
}
|
|
catch (e) {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
function isArray(a) {
|
|
return isObject(a) && a.constructor == Array;
|
|
}
|
|
|
|
function isObject(a) {
|
|
return (a && typeof a == "object") || isFunction(a);
|
|
}
|
|
|
|
function isFunction(a) {
|
|
return typeof a == "function";
|
|
}
|
|
|
|
function stripTags(someHTML) {
|
|
return someHTML.replace(/<[^>]+>/g, "");
|
|
}
|
|
|
|
/**
|
|
* Searches through an array of links and returns a JS array
|
|
* of matching property bags.
|
|
*/
|
|
const IANA_URI = "http://www.iana.org/assignments/relation/";
|
|
function findAtomLinks(rel, links) {
|
|
var rvLinks = [];
|
|
for (var i = 0; i < links.length; ++i) {
|
|
var linkElement = links[i];
|
|
// atom:link MUST have @href
|
|
if (linkElement.href) {
|
|
var relAttribute = null;
|
|
if (linkElement.rel) {
|
|
relAttribute = linkElement.rel;
|
|
}
|
|
if ((!relAttribute && rel == "alternate") || relAttribute == rel) {
|
|
rvLinks.push(linkElement);
|
|
continue;
|
|
}
|
|
// catch relations specified by IANA URI
|
|
if (relAttribute == IANA_URI + rel) {
|
|
rvLinks.push(linkElement);
|
|
}
|
|
}
|
|
}
|
|
return rvLinks;
|
|
}
|
|
|
|
function xmlEscape(s) {
|
|
s = s.replace(/&/g, "&");
|
|
s = s.replace(/>/g, ">");
|
|
s = s.replace(/</g, "<");
|
|
s = s.replace(/"/g, """);
|
|
s = s.replace(/'/g, "'");
|
|
return s;
|
|
}
|
|
|
|
function makePropGetter(key) {
|
|
return function(bag) {
|
|
return bag[key];
|
|
};
|
|
}
|
|
|
|
const RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
|
|
// namespace map
|
|
var gNamespaces = {
|
|
"http://webns.net/mvcb/": "admin",
|
|
"http://backend.userland.com/rss": "",
|
|
"http://blogs.law.harvard.edu/tech/rss": "",
|
|
"http://www.w3.org/2005/Atom": "atom",
|
|
"http://purl.org/atom/ns#": "atom03",
|
|
"http://purl.org/rss/1.0/modules/content/": "content",
|
|
"http://purl.org/dc/elements/1.1/": "dc",
|
|
"http://purl.org/dc/terms/": "dcterms",
|
|
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
|
|
"http://purl.org/rss/1.0/": "rss1",
|
|
"http://my.netscape.com/rdf/simple/0.9/": "rss1",
|
|
"http://wellformedweb.org/CommentAPI/": "wfw",
|
|
"http://purl.org/rss/1.0/modules/wiki/": "wiki",
|
|
"http://www.w3.org/XML/1998/namespace": "xml",
|
|
"http://search.yahoo.com/mrss/": "media",
|
|
"http://search.yahoo.com/mrss": "media",
|
|
"http://prismstandard.org/namespaces/1.2/basic/": "prism",
|
|
"http://prismstandard.org/namespaces/basic/2.0/": "prism",
|
|
"http://prismstandard.org/namespaces/basic/3.0/": "prism",
|
|
"https://prismdb.takanakahiko.me/prism-schema.ttl#": "prism",
|
|
};
|
|
|
|
// We allow a very small set of namespaces in XHTML content,
|
|
// for attributes only
|
|
var gAllowedXHTMLNamespaces = {
|
|
"http://www.w3.org/XML/1998/namespace": "xml",
|
|
// if someone ns qualifies XHTML, we have to prefix it to avoid an
|
|
// attribute collision.
|
|
"http://www.w3.org/1999/xhtml": "xhtml",
|
|
};
|
|
|
|
// Implements nsIFeedResult
|
|
function FeedResult() {}
|
|
FeedResult.prototype = {
|
|
bozo: false,
|
|
doc: null,
|
|
version: null,
|
|
headers: null,
|
|
uri: null,
|
|
stylesheet: null,
|
|
};
|
|
|
|
// Implements nsIFeed, nsIFeedContainer
|
|
function Feed() {
|
|
this.subtitle = null;
|
|
this.title = null;
|
|
this.items = [];
|
|
this.link = null;
|
|
this.id = null;
|
|
this.generator = null;
|
|
this.authors = [];
|
|
this.contributors = [];
|
|
this.baseURI = null;
|
|
this.enclosureCount = 0;
|
|
this.type = Feed.TYPE_FEED;
|
|
}
|
|
|
|
Feed.TYPE_FEED = 0;
|
|
Feed.TYPE_AUDIO = 1;
|
|
Feed.TYPE_IMAGE = 2;
|
|
Feed.TYPE_VIDEO = 4;
|
|
|
|
Feed.prototype = {
|
|
searchLists: {
|
|
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
|
|
subtitle: [
|
|
"description",
|
|
"dc:description",
|
|
"rss1:description",
|
|
"atom03:tagline",
|
|
"atom:subtitle",
|
|
],
|
|
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
|
|
items: ["items", "atom03_entries", "entries"],
|
|
id: ["atom:id", "rdf:about"],
|
|
generator: ["generator"],
|
|
authors: ["authors"],
|
|
contributors: ["contributors"],
|
|
link: [["link", strToURI], ["rss1:link", strToURI]],
|
|
categories: ["categories", "dc:subject"],
|
|
rights: [
|
|
"prism:copyright",
|
|
"dc:rights",
|
|
"atom03:rights",
|
|
"atom:rights",
|
|
"copyright",
|
|
],
|
|
cloud: ["cloud"],
|
|
image: ["image", "rss1:image", "atom:logo"],
|
|
textInput: ["textInput", "rss1:textinput"],
|
|
skipDays: ["skipDays"],
|
|
skipHours: ["skipHours"],
|
|
ttl: ["ttl"],
|
|
updated: [
|
|
"pubDate",
|
|
"lastBuildDate",
|
|
"atom03:modified",
|
|
"dc:date",
|
|
"dcterms:modified",
|
|
"atom:updated",
|
|
],
|
|
issn: ["prism:issn"],
|
|
isbn: ["isbn", "prism:isbn"],
|
|
language: ["language", "dc:language"],
|
|
publisher: ["dc:publisher"],
|
|
},
|
|
|
|
normalize: function () {
|
|
fieldsToObj(this, this.searchLists);
|
|
if (this.skipDays) {
|
|
this.skipDays = this.skipDays.days;
|
|
}
|
|
if (this.skipHours) {
|
|
this.skipHours = this.skipHours.hours;
|
|
}
|
|
|
|
// Assign Atom link if needed
|
|
if (this.fields.links) {
|
|
this._atomLinksToURI();
|
|
}
|
|
|
|
this._calcEnclosureCountAndFeedType();
|
|
|
|
// Resolve relative image links
|
|
if (this.image && this.image.url) {
|
|
this._resolveImageLink();
|
|
}
|
|
|
|
this._resetBagMembersToRawText([this.searchLists.subtitle, this.searchLists.title]);
|
|
},
|
|
|
|
_calcEnclosureCountAndFeedType: function () {
|
|
var entriesWithEnclosures = 0;
|
|
var audioCount = 0;
|
|
var imageCount = 0;
|
|
var videoCount = 0;
|
|
var otherCount = 0;
|
|
|
|
for (var i = 0; i < this.items.length; ++i) {
|
|
var entry = this.items[i];
|
|
|
|
if (entry.enclosures && entry.enclosures.length > 0) {
|
|
++entriesWithEnclosures;
|
|
|
|
for (var e = 0; e < entry.enclosures.length; ++e) {
|
|
var enc = entry.enclosures[e];
|
|
if (enc.type) {
|
|
var enctype = enc.type;
|
|
|
|
if (/^audio/.test(enctype)) {
|
|
++audioCount;
|
|
}
|
|
else if (/^image/.test(enctype)) {
|
|
++imageCount;
|
|
}
|
|
else if (/^video/.test(enctype)) {
|
|
++videoCount;
|
|
}
|
|
else {
|
|
++otherCount;
|
|
}
|
|
}
|
|
else {
|
|
++otherCount;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var feedtype = Feed.TYPE_FEED;
|
|
|
|
// For a feed to be marked as TYPE_VIDEO, TYPE_AUDIO and TYPE_IMAGE,
|
|
// we enforce two things:
|
|
//
|
|
// 1. all entries must have at least one enclosure
|
|
// 2. all enclosures must be video for TYPE_VIDEO, audio for TYPE_AUDIO or image
|
|
// for TYPE_IMAGE
|
|
//
|
|
// Otherwise it's a TYPE_FEED.
|
|
if (entriesWithEnclosures == this.items.length && otherCount == 0) {
|
|
if (audioCount > 0 && !videoCount && !imageCount) {
|
|
feedtype = Feed.TYPE_AUDIO;
|
|
}
|
|
else if (imageCount > 0 && !audioCount && !videoCount) {
|
|
feedtype = Feed.TYPE_IMAGE;
|
|
}
|
|
else if (videoCount > 0 && !audioCount && !imageCount) {
|
|
feedtype = Feed.TYPE_VIDEO;
|
|
}
|
|
}
|
|
|
|
this.type = feedtype;
|
|
this.enclosureCount = otherCount + videoCount + audioCount + imageCount;
|
|
},
|
|
|
|
_atomLinksToURI: function () {
|
|
var links = this.fields.links;
|
|
var alternates = findAtomLinks("alternate", links);
|
|
if (alternates.length > 0) {
|
|
var href = alternates[0].href;
|
|
var base;
|
|
if (alternates[0]["xml:base"]) {
|
|
base = alternates[0]["xml:base"];
|
|
}
|
|
this.link = this._resolveURI(href, base);
|
|
}
|
|
},
|
|
|
|
_resolveImageLink: function () {
|
|
var base;
|
|
if (this.image["xml:base"]) {
|
|
base = this.image["xml:base"];
|
|
}
|
|
var url = this._resolveURI(this.image.url, base);
|
|
if (url) {
|
|
this.image.url = url.href;
|
|
}
|
|
},
|
|
|
|
_resolveURI: function (linkSpec, baseSpec) {
|
|
var uri = null;
|
|
try {
|
|
var base = baseSpec ? strToURI(baseSpec, this.baseURI) : this.baseURI;
|
|
uri = strToURI(linkSpec, base);
|
|
}
|
|
catch (e) {
|
|
LOG(e);
|
|
}
|
|
|
|
return uri;
|
|
},
|
|
|
|
// reset the bag to raw contents, not text constructs
|
|
_resetBagMembersToRawText: function (fieldLists) {
|
|
for (var i = 0; i < fieldLists.length; i++) {
|
|
for (var j = 0; j < fieldLists[i].length; j++) {
|
|
if (this.fields[fieldLists[i][j]]) {
|
|
var textConstruct = this.fields[fieldLists[i][j]];
|
|
this.fields[fieldLists[i][j]] = textConstruct.text;
|
|
}
|
|
}
|
|
}
|
|
},
|
|
};
|
|
|
|
// Implements nsIFeedEntry, nsIFeedContainer
|
|
function Entry() {
|
|
this.summary = null;
|
|
this.content = null;
|
|
this.title = null;
|
|
this.fields = {};
|
|
this.link = null;
|
|
this.id = null;
|
|
this.baseURI = null;
|
|
this.updated = null;
|
|
this.published = null;
|
|
this.authors = [];
|
|
this.contributors = [];
|
|
}
|
|
|
|
Entry.prototype = {
|
|
fields: null,
|
|
enclosures: null,
|
|
mediaContent: null,
|
|
|
|
searchLists: {
|
|
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
|
|
link: [["link", strToURI], ["rss1:link", strToURI]],
|
|
id: [
|
|
["guid", makePropGetter("guid")],
|
|
"rdf:about",
|
|
"atom03:id",
|
|
"atom:id",
|
|
],
|
|
authors: ["authors"],
|
|
contributors: ["contributors"],
|
|
summary: [
|
|
"description",
|
|
"rss1:description",
|
|
"dc:description",
|
|
"atom03:summary",
|
|
"atom:summary",
|
|
],
|
|
content: ["content:encoded", "atom03:content", "atom:content"],
|
|
rights: [
|
|
"prism:copyright",
|
|
"dc:rights",
|
|
"atom03:rights",
|
|
"atom:rights",
|
|
"copyright",
|
|
],
|
|
published: [
|
|
"dc:date",
|
|
"pubDate",
|
|
"atom03:issued",
|
|
"dcterms:issued",
|
|
"atom:published",
|
|
"prism:publicationDate",
|
|
],
|
|
updated: [
|
|
"pubDate",
|
|
"atom03:modified",
|
|
"dc:date",
|
|
"dcterms:modified",
|
|
"atom:updated",
|
|
"prism:modificationDate",
|
|
],
|
|
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
|
|
pubType: ["pubType"],
|
|
startPage: ["startPage", "prism:startingPage"],
|
|
endPage: ["endPage", "prism:endingPage"],
|
|
pageRange: ["prism:pageRange"],
|
|
issn: ["prism:issn"],
|
|
isbn: ["isbn", "prism:isbn"],
|
|
identifier: [
|
|
"dc:identifier",
|
|
"prism:doi",
|
|
],
|
|
publisher: ["dc:publisher"],
|
|
language: ["language", "dc:language"],
|
|
volume: ["prism:volume"],
|
|
issue: ["prism:number"],
|
|
section: ["prism:section"],
|
|
url: ["prism:url"],
|
|
},
|
|
|
|
normalize: function () {
|
|
fieldsToObj(this, this.searchLists);
|
|
|
|
// Assign Atom link if needed
|
|
if (this.fields.links) {
|
|
this._atomLinksToURI();
|
|
}
|
|
|
|
// Populate enclosures array
|
|
this._populateEnclosures();
|
|
|
|
// The link might be a guid w/ permalink=true
|
|
if (!this.link && this.fields.guid) {
|
|
var guid = this.fields.guid;
|
|
var isPermaLink = true;
|
|
|
|
if (guid.isPermaLink) {
|
|
isPermaLink = guid.isPermaLink.toLowerCase() != "false";
|
|
}
|
|
|
|
if (guid && isPermaLink) {
|
|
this.link = strToURI(guid.guid);
|
|
}
|
|
}
|
|
|
|
this._resetBagMembersToRawText([
|
|
this.searchLists.content,
|
|
this.searchLists.summary,
|
|
this.searchLists.title,
|
|
]);
|
|
},
|
|
|
|
_populateEnclosures: function () {
|
|
if (this.fields.links) {
|
|
this._atomLinksToEnclosures();
|
|
}
|
|
|
|
// Add RSS2 enclosure to enclosures
|
|
if (this.fields.enclosure) {
|
|
this._enclosureToEnclosures();
|
|
}
|
|
|
|
// Add media:content to enclosures
|
|
if (this.fields.mediacontent) {
|
|
this._mediaToEnclosures("mediacontent");
|
|
}
|
|
|
|
// Add media:thumbnail to enclosures
|
|
if (this.fields.mediathumbnail) {
|
|
this._mediaToEnclosures("mediathumbnail");
|
|
}
|
|
|
|
// Add media:content in media:group to enclosures
|
|
if (this.fields.mediagroup) {
|
|
this._mediaToEnclosures("mediagroup", "mediacontent");
|
|
}
|
|
},
|
|
|
|
__enclosureMap: null,
|
|
|
|
_addToEnclosures: function (newEnc) {
|
|
// items we add to the enclosures array get displayed in the FeedWriter and
|
|
// they must have non-empty urls.
|
|
if (!newEnc.url || newEnc.url == "") {
|
|
return;
|
|
}
|
|
|
|
if (this.__enclosureMap === null) {
|
|
this.__enclosureMap = {};
|
|
}
|
|
|
|
var previousEnc = this.__enclosureMap[newEnc.url];
|
|
|
|
if (previousEnc != undefined) {
|
|
if (!previousEnc.type && newEnc.type) {
|
|
previousEnc.type = newEnc.type;
|
|
}
|
|
|
|
if (!previousEnc.length && newEnc.length) {
|
|
previousEnc.length = newEnc.length;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
if (this.enclosures === null) {
|
|
this.enclosures = [];
|
|
}
|
|
|
|
this.enclosures.push(newEnc);
|
|
this.__enclosureMap[newEnc.url] = newEnc;
|
|
},
|
|
|
|
_atomLinksToEnclosures: function () {
|
|
var links = this.fields.links;
|
|
var encLinks = findAtomLinks("enclosure", links);
|
|
if (encLinks.length == 0) {
|
|
return;
|
|
}
|
|
|
|
for (var i = 0; i < encLinks.length; ++i) {
|
|
var link = encLinks[i];
|
|
|
|
// an enclosure must have an href
|
|
if (!link.href) {
|
|
return;
|
|
}
|
|
|
|
var enc = {};
|
|
|
|
// copy Atom bits over to equivalent enclosure bits
|
|
enc.url = link.href;
|
|
if (link.type) {
|
|
enc.type = link.type;
|
|
}
|
|
if (link.length) {
|
|
enc.length = link.length;
|
|
}
|
|
|
|
this._addToEnclosures(enc);
|
|
}
|
|
},
|
|
|
|
_enclosureToEnclosures: function () {
|
|
var enc = this.fields.enclosure;
|
|
|
|
if (!enc.url) {
|
|
return;
|
|
}
|
|
|
|
this._addToEnclosures(enc);
|
|
},
|
|
|
|
_mediaToEnclosures: function (mediaType, contentType) {
|
|
var content;
|
|
|
|
// If a contentType is specified, the mediaType is a simple propertybag,
|
|
// and the contentType is an array inside it.
|
|
if (contentType) {
|
|
var group = this.fields[mediaType];
|
|
content = group[contentType];
|
|
}
|
|
else {
|
|
content = this.fields[mediaType];
|
|
}
|
|
|
|
for (var i = 0; i < content.length; ++i) {
|
|
var contentElement = content[i];
|
|
|
|
// media:content don't require url, but if it's not there, we should
|
|
// skip it.
|
|
if (!contentElement.url) {
|
|
continue;
|
|
}
|
|
|
|
var enc = {};
|
|
|
|
// copy media:content bits over to equivalent enclosure bits
|
|
enc.url = contentElement.url;
|
|
if (contentElement.type) {
|
|
enc.type = contentElement.type;
|
|
}
|
|
else if (mediaType == "mediathumbnail") {
|
|
// thumbnails won't have a type, but default to image types
|
|
enc.type = "image/*";
|
|
enc.thumbnail = true;
|
|
}
|
|
|
|
if (contentElement.fileSize) {
|
|
enc.length = contentElement.fileSize;
|
|
}
|
|
|
|
this._addToEnclosures(enc);
|
|
}
|
|
},
|
|
};
|
|
|
|
Entry.prototype._atomLinksToURI = Feed.prototype._atomLinksToURI;
|
|
Entry.prototype._resolveURI = Feed.prototype._resolveURI;
|
|
Entry.prototype._resetBagMembersToRawText = Feed.prototype._resetBagMembersToRawText;
|
|
|
|
// TextConstruct represents and element that could contain (X)HTML
|
|
// Implements nsIFeedTextConstruct
|
|
function TextConstruct() {
|
|
this.lang = null;
|
|
this.base = null;
|
|
this.type = "text";
|
|
this.text = null;
|
|
}
|
|
|
|
TextConstruct.prototype = {
|
|
plainText: function () {
|
|
if (this.type != "text") {
|
|
return stripTags(this.text);
|
|
}
|
|
return this.text;
|
|
},
|
|
|
|
createDocumentFragment: function () {
|
|
if (this.type == "text") {
|
|
const docFragment = new DOMParser().parseFromString('<!doctype html>', 'text/html')
|
|
.createDocumentFragment();
|
|
docFragment.append(this.text);
|
|
return docFragment;
|
|
}
|
|
|
|
let parserType;
|
|
if (this.type == "xhtml") {
|
|
parserType = "application/xhtml+xml";
|
|
}
|
|
else if (this.type == "html") {
|
|
parserType = "text/html";
|
|
}
|
|
else {
|
|
return null;
|
|
}
|
|
|
|
const parsedDoc = new DOMParser().parseFromString(this.text, parserType);
|
|
const docFragment = parsedDoc.createDocumentFragment();
|
|
docFragment.append(parsedDoc.documentElement);
|
|
return docFragment;
|
|
},
|
|
};
|
|
|
|
// Generator represents the software that produced the feed
|
|
// Implements nsIFeedGenerator, nsIFeedElementBase
|
|
function Generator() {
|
|
this.lang = null;
|
|
this.agent = null;
|
|
this.version = null;
|
|
this.uri = null;
|
|
|
|
// nsIFeedElementBase
|
|
this._attributes = null;
|
|
this.baseURI = null;
|
|
}
|
|
|
|
Generator.prototype = {
|
|
get attributes() {
|
|
return this._attributes;
|
|
},
|
|
|
|
set attributes(value) {
|
|
this._attributes = value;
|
|
this.version = (this._attributes.getNamedItemNS("", "version") || {}).value;
|
|
var uriAttribute = (this._attributes.getNamedItemNS("", "uri") || {}).value
|
|
|| (this._attributes.getNamedItemNS("", "url") || {}).value;
|
|
this.uri = strToURI(uriAttribute, this.baseURI);
|
|
|
|
// RSS1
|
|
uriAttribute = (this._attributes.getNamedItemNS(RDF_NS, "resource") || {}).value;
|
|
if (uriAttribute) {
|
|
this.agent = uriAttribute;
|
|
this.uri = strToURI(uriAttribute, this.baseURI);
|
|
}
|
|
},
|
|
};
|
|
|
|
// Implements nsIFeedPerson, nsIFeedElementBase
|
|
function Person() {
|
|
this.name = null;
|
|
this.uri = null;
|
|
this.email = null;
|
|
|
|
// nsIFeedElementBase
|
|
this.attributes = null;
|
|
this.baseURI = null;
|
|
}
|
|
|
|
/**
|
|
* Map a list of fields into properties on a container.
|
|
*
|
|
* @param container An nsIFeedContainer
|
|
* @param fields A list of fields to search for. List members can
|
|
* be a list, in which case the second member is
|
|
* transformation function (like parseInt).
|
|
*/
|
|
function fieldsToObj(container, fields) {
|
|
var props, prop, field, searchList;
|
|
for (var key in fields) {
|
|
searchList = fields[key];
|
|
for (var i = 0; i < searchList.length; ++i) {
|
|
props = searchList[i];
|
|
prop = null;
|
|
field = isArray(props) ? props[0] : props;
|
|
prop = container.fields[field];
|
|
if (prop) {
|
|
prop = isArray(props) ? props[1](prop) : prop;
|
|
container[key] = prop;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// create a generator element
|
|
function atomGenerator(s, generator) {
|
|
generator.agent = s.trim();
|
|
return generator;
|
|
}
|
|
|
|
// post-process atom:logo to create an RSS2-like structure
|
|
function atomLogo(s, logo) {
|
|
logo.url = s.trim();
|
|
}
|
|
|
|
// post-process an RSS category, map it to the Atom fields.
|
|
function rssCatTerm(s, cat) {
|
|
// add slash handling?
|
|
cat.term = s.trim();
|
|
return cat;
|
|
}
|
|
|
|
// post-process a GUID
|
|
function rssGuid(s, guid) {
|
|
guid.guid = s.trim();
|
|
return guid;
|
|
}
|
|
|
|
// post-process an RSS author element
|
|
//
|
|
// It can contain a field like this:
|
|
//
|
|
// <author>lawyer@boyer.net (Lawyer Boyer)</author>
|
|
//
|
|
// or, delightfully, a field like this:
|
|
//
|
|
// <dc:creator>Simon St.Laurent (mailto:simonstl@simonstl.com)</dc:creator>
|
|
//
|
|
// We want to split this up and assign it to corresponding Atom
|
|
// fields.
|
|
//
|
|
function rssAuthor(s, author) {
|
|
// check for RSS2 string format
|
|
var chars = s.trim();
|
|
var matches = chars.match(/(.*)\((.*)\)/);
|
|
var emailCheck
|
|
= /^([a-zA-Z0-9_.-])+@(([a-zA-Z0-9-])+\.)+([a-zA-Z0-9]{2,4})+$/;
|
|
if (matches) {
|
|
var match1 = matches[1].trim();
|
|
var match2 = matches[2].trim();
|
|
if (match2.indexOf("mailto:") == 0) {
|
|
match2 = match2.substring(7);
|
|
}
|
|
if (emailCheck.test(match1)) {
|
|
author.email = match1;
|
|
author.name = match2;
|
|
}
|
|
else if (emailCheck.test(match2)) {
|
|
author.email = match2;
|
|
author.name = match1;
|
|
}
|
|
else {
|
|
// put it back together
|
|
author.name = match1 + " (" + match2 + ")";
|
|
}
|
|
}
|
|
else {
|
|
author.name = chars;
|
|
if (chars.indexOf("@")) {
|
|
author.email = chars;
|
|
}
|
|
}
|
|
return author;
|
|
}
|
|
|
|
const XHTML_NS = "http://www.w3.org/1999/xhtml";
|
|
|
|
// The XHTMLHandler handles inline XHTML found in things like atom:summary
|
|
function XHTMLHandler(processor, isAtom) {
|
|
this._buf = "";
|
|
this._processor = processor;
|
|
this._depth = 0;
|
|
this._isAtom = isAtom;
|
|
// a stack of lists tracking in-scope namespaces
|
|
this._inScopeNS = [];
|
|
}
|
|
|
|
// The fidelity can be improved here, to allow handling of stuff like
|
|
// SVG and MathML. XXX
|
|
XHTMLHandler.prototype = {
|
|
|
|
// look back up at the declared namespaces
|
|
// we always use the same prefixes for our safe stuff
|
|
_isInScope: function (ns) {
|
|
for (var i in this._inScopeNS) {
|
|
for (var uri in this._inScopeNS[i]) {
|
|
if (this._inScopeNS[i][uri] == ns) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
},
|
|
|
|
startDocument: function () {
|
|
},
|
|
endDocument: function () {
|
|
},
|
|
startElement: function (namespace, localName, qName, attributes) {
|
|
++this._depth;
|
|
this._inScopeNS.push([]);
|
|
|
|
// RFC4287 requires XHTML to be wrapped in a div that is *not* part of
|
|
// the content. This prevents people from screwing up namespaces, but
|
|
// we need to skip it here.
|
|
if (this._isAtom && this._depth == 1 && localName == "div") {
|
|
return;
|
|
}
|
|
|
|
// If it's an XHTML element, record it. Otherwise, it's ignored.
|
|
if (namespace == XHTML_NS) {
|
|
this._buf += "<" + localName;
|
|
var uri;
|
|
for (var i = 0; i < attributes.length; ++i) {
|
|
uri = attributes.item(i).namespaceURI;
|
|
// XHTML attributes aren't in a namespace
|
|
if (uri == "") {
|
|
this._buf += (" " + attributes.item(i).localName + "='"
|
|
+ xmlEscape(attributes.item(i).value) + "'");
|
|
}
|
|
else {
|
|
// write a small set of allowed attribute namespaces
|
|
var prefix = gAllowedXHTMLNamespaces[uri];
|
|
if (prefix) {
|
|
// The attribute value we'll attempt to write
|
|
var attributeValue = xmlEscape(attributes.item(i).value);
|
|
|
|
// it's an allowed attribute NS.
|
|
// write the attribute
|
|
this._buf += (" " + prefix + ":"
|
|
+ attributes.item(i).localName
|
|
+ "='" + attributeValue + "'");
|
|
|
|
// write an xmlns declaration if necessary
|
|
if (prefix != "xml" && !this._isInScope(uri)) {
|
|
this._inScopeNS[this._inScopeNS.length - 1].push(uri);
|
|
this._buf += " xmlns:" + prefix + "='" + uri + "'";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
this._buf += ">";
|
|
}
|
|
},
|
|
endElement: function (uri, localName, qName) {
|
|
--this._depth;
|
|
this._inScopeNS.pop();
|
|
|
|
// We need to skip outer divs in Atom. See comment in startElement.
|
|
if (this._isAtom && this._depth == 0 && localName == "div") {
|
|
return;
|
|
}
|
|
|
|
// When we peek too far, go back to the main processor
|
|
if (this._depth < 0) {
|
|
this._processor.returnFromXHTMLHandler(this._buf.trim(), uri, localName, qName);
|
|
return;
|
|
}
|
|
// If it's an XHTML element, record it. Otherwise, it's ignored.
|
|
if (uri == XHTML_NS) {
|
|
this._buf += "</" + localName + ">";
|
|
}
|
|
},
|
|
characters: function (data) {
|
|
this._buf += xmlEscape(data);
|
|
},
|
|
processingInstruction: function () {
|
|
},
|
|
};
|
|
|
|
/**
|
|
* The ExtensionHandler deals with elements we haven't explicitly
|
|
* added to our transition table in the FeedProcessor.
|
|
*/
|
|
function ExtensionHandler(processor) {
|
|
this._buf = "";
|
|
this._depth = 0;
|
|
this._hasChildElements = false;
|
|
|
|
// The FeedProcessor
|
|
this._processor = processor;
|
|
|
|
// Fields of the outermost extension element.
|
|
this._localName = null;
|
|
this._uri = null;
|
|
this._qName = null;
|
|
this._attrs = null;
|
|
}
|
|
|
|
ExtensionHandler.prototype = {
|
|
startDocument: function () {
|
|
},
|
|
endDocument: function () {
|
|
},
|
|
startElement: function (uri, localName, qName, attrs) {
|
|
++this._depth;
|
|
|
|
if (this._depth == 1) {
|
|
this._uri = uri;
|
|
this._localName = localName;
|
|
this._qName = qName;
|
|
this._attrs = attrs;
|
|
}
|
|
|
|
// if we descend into another element, we won't send text
|
|
this._hasChildElements = (this._depth > 1);
|
|
},
|
|
endElement: function (_uri, _localName, _qName) {
|
|
--this._depth;
|
|
if (this._depth == 0) {
|
|
var text = this._hasChildElements ? null : this._buf.trim();
|
|
this._processor.returnFromExtHandler(this._uri, this._localName, text, this._attrs);
|
|
}
|
|
},
|
|
characters: function (data) {
|
|
if (!this._hasChildElements) {
|
|
this._buf += data;
|
|
}
|
|
},
|
|
processingInstruction: function () {
|
|
},
|
|
};
|
|
|
|
|
|
/**
|
|
* ElementInfo is a simple container object that describes
|
|
* some characteristics of a feed element. For example, it
|
|
* says whether an element can be expected to appear more
|
|
* than once inside a given entry or feed.
|
|
*/
|
|
function ElementInfo(fieldName, containerClass, closeFunc, isArray) {
|
|
this.fieldName = fieldName;
|
|
this.containerClass = containerClass;
|
|
this.closeFunc = closeFunc;
|
|
this.isArray = isArray;
|
|
this.isWrapper = false;
|
|
}
|
|
|
|
/**
|
|
* FeedElementInfo represents a feed element, usually the root.
|
|
*/
|
|
function FeedElementInfo(fieldName, feedVersion) {
|
|
this.isWrapper = false;
|
|
this.fieldName = fieldName;
|
|
this.feedVersion = feedVersion;
|
|
}
|
|
|
|
/**
|
|
* Some feed formats include vestigial wrapper elements that we don't
|
|
* want to include in our object model, but we do need to keep track
|
|
* of during parsing.
|
|
*/
|
|
function WrapperElementInfo(fieldName) {
|
|
this.isWrapper = true;
|
|
this.fieldName = fieldName;
|
|
}
|
|
|
|
/** *** The Processor *****/
|
|
// Implements nsIFeedProcessor, nsISAXContentHandler, nsISAXErrorHandler,
|
|
// nsIStreamListener, nsIRequestObserver
|
|
function FeedProcessor() {
|
|
this._reader = new SAXXMLReader();
|
|
this._buf = "";
|
|
this._feed = {};
|
|
this._handlerStack = [];
|
|
this._xmlBaseStack = []; // sparse array keyed to nesting depth
|
|
this._depth = 0;
|
|
this._state = "START";
|
|
this._result = null;
|
|
this._extensionHandler = null;
|
|
this._xhtmlHandler = null;
|
|
this._haveSentResult = false;
|
|
|
|
// The nsIFeedResultListener waiting for the parse results
|
|
this.listener = null;
|
|
|
|
// These elements can contain (X)HTML or plain text.
|
|
// We keep a table here that contains their default treatment
|
|
this._textConstructs = {
|
|
"atom:title": "text",
|
|
"atom:summary": "text",
|
|
"atom:rights": "text",
|
|
"atom:content": "text",
|
|
"atom:subtitle": "text",
|
|
"description": "html",
|
|
"rss1:description": "html",
|
|
"dc:description": "html",
|
|
"content:encoded": "html",
|
|
"title": "text",
|
|
"rss1:title": "text",
|
|
"atom03:title": "text",
|
|
"atom03:tagline": "text",
|
|
"atom03:summary": "text",
|
|
"atom03:content": "text",
|
|
"dc:title": "text",
|
|
"dc:rights": "text",
|
|
"atom03:rights": "text",
|
|
"copyright": "text",
|
|
"prism:copyright": "text",
|
|
};
|
|
this._stack = [];
|
|
|
|
this._trans = {
|
|
"START": {
|
|
// If we hit a root RSS element, treat as RSS2.
|
|
"rss": new FeedElementInfo("RSS2", "rss2"),
|
|
|
|
// If we hit an RDF element, if could be RSS1, but we can't
|
|
// verify that until we hit a rss1:channel element.
|
|
"rdf:RDF": new WrapperElementInfo("RDF"),
|
|
|
|
// If we hit a Atom 1.0 element, treat as Atom 1.0.
|
|
"atom:feed": new FeedElementInfo("Atom", "atom"),
|
|
|
|
// Treat as Atom 0.3
|
|
"atom03:feed": new FeedElementInfo("Atom03", "atom03"),
|
|
},
|
|
|
|
/** ******* RSS2 **********/
|
|
"IN_RSS2": {
|
|
"channel": new WrapperElementInfo("channel"),
|
|
},
|
|
|
|
"IN_CHANNEL": {
|
|
"item": new ElementInfo("items", Entry, null, true),
|
|
"managingEditor": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:creator": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:author": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true),
|
|
"category": new ElementInfo("categories", null, rssCatTerm, true),
|
|
"cloud": new ElementInfo("cloud", null, null, false),
|
|
"image": new ElementInfo("image", null, null, false),
|
|
"textInput": new ElementInfo("textInput", null, null, false),
|
|
"skipDays": new ElementInfo("skipDays", null, null, false),
|
|
"skipHours": new ElementInfo("skipHours", null, null, false),
|
|
"generator": new ElementInfo("generator", Generator, atomGenerator, false),
|
|
},
|
|
|
|
"IN_ITEMS": {
|
|
"author": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:creator": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:author": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true),
|
|
"category": new ElementInfo("categories", null, rssCatTerm, true),
|
|
"enclosure": new ElementInfo("enclosure", null, null, false),
|
|
"media:content": new ElementInfo("mediacontent", null, null, true),
|
|
"media:group": new ElementInfo("mediagroup", null, null, false),
|
|
"media:thumbnail": new ElementInfo("mediathumbnail", null, null, true),
|
|
"guid": new ElementInfo("guid", null, rssGuid, false),
|
|
},
|
|
|
|
"IN_SKIPDAYS": {
|
|
"day": new ElementInfo("days", null, null, true),
|
|
},
|
|
|
|
"IN_SKIPHOURS": {
|
|
"hour": new ElementInfo("hours", null, null, true),
|
|
},
|
|
|
|
"IN_MEDIAGROUP": {
|
|
"media:content": new ElementInfo("mediacontent", null, null, true),
|
|
"media:thumbnail": new ElementInfo("mediathumbnail", null, null, true),
|
|
},
|
|
|
|
/** ******* RSS1 **********/
|
|
"IN_RDF": {
|
|
// If we hit a rss1:channel, we can verify that we have RSS1
|
|
"rss1:channel": new FeedElementInfo("rdf_channel", "rss1"),
|
|
"rss1:image": new ElementInfo("image", null, null, false),
|
|
"rss1:textinput": new ElementInfo("textInput", null, null, false),
|
|
"rss1:item": new ElementInfo("items", Entry, null, true),
|
|
},
|
|
|
|
"IN_RDF_CHANNEL": {
|
|
"admin:generatorAgent": new ElementInfo("generator", Generator, null, false),
|
|
"dc:creator": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:author": new ElementInfo("authors", Person, rssAuthor, true),
|
|
"dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true),
|
|
},
|
|
|
|
/** ******* ATOM 1.0 **********/
|
|
"IN_ATOM": {
|
|
"atom:author": new ElementInfo("authors", Person, null, true),
|
|
"atom:generator": new ElementInfo("generator", Generator, atomGenerator, false),
|
|
"atom:contributor": new ElementInfo("contributors", Person, null, true),
|
|
"atom:link": new ElementInfo("links", null, null, true),
|
|
"atom:logo": new ElementInfo("atom:logo", null, atomLogo, false),
|
|
"atom:entry": new ElementInfo("entries", Entry, null, true),
|
|
},
|
|
|
|
"IN_ENTRIES": {
|
|
"atom:author": new ElementInfo("authors", Person, null, true),
|
|
"atom:contributor": new ElementInfo("contributors", Person, null, true),
|
|
"atom:link": new ElementInfo("links", null, null, true),
|
|
},
|
|
|
|
/** ******* ATOM 0.3 **********/
|
|
"IN_ATOM03": {
|
|
"atom03:author": new ElementInfo("authors", Person, null, true),
|
|
"atom03:contributor": new ElementInfo("contributors", Person, null, true),
|
|
"atom03:link": new ElementInfo("links", null, null, true),
|
|
"atom03:entry": new ElementInfo("atom03_entries", Entry, null, true),
|
|
"atom03:generator": new ElementInfo("generator", Generator, atomGenerator, false),
|
|
},
|
|
|
|
"IN_ATOM03_ENTRIES": {
|
|
"atom03:author": new ElementInfo("authors", Person, null, true),
|
|
"atom03:contributor": new ElementInfo("contributors", Person, null, true),
|
|
"atom03:link": new ElementInfo("links", null, null, true),
|
|
"atom03:entry": new ElementInfo("atom03_entries", Entry, null, true),
|
|
},
|
|
};
|
|
}
|
|
|
|
// See startElement for a long description of how feeds are processed.
|
|
FeedProcessor.prototype = {
|
|
// Set ourselves as the SAX handler, and set the base URI
|
|
_init: function (uri) {
|
|
this._reader.contentHandler = this;
|
|
this._reader.errorHandler = this;
|
|
this._result = new FeedResult();
|
|
if (uri) {
|
|
this._result.uri = uri;
|
|
this._reader.baseURI = uri;
|
|
this._xmlBaseStack[0] = uri;
|
|
}
|
|
},
|
|
|
|
// This function is called once we figure out what type of feed
|
|
// we're dealing with. Some feed types require digging a bit further
|
|
// than the root.
|
|
_docVerified: function (version) {
|
|
this._result.doc = new Feed();
|
|
this._result.doc.baseURI
|
|
= this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
this._result.doc.fields = this._feed;
|
|
this._result.version = version;
|
|
},
|
|
|
|
// When we're done with the feed, let the listener know what
|
|
// happened.
|
|
_sendResult: function () {
|
|
this._haveSentResult = true;
|
|
try {
|
|
// Can be null when a non-feed is fed to us
|
|
if (this._result.doc) {
|
|
this._result.doc.normalize();
|
|
}
|
|
}
|
|
catch (e) {
|
|
LOG("FIXME: " + e);
|
|
}
|
|
|
|
try {
|
|
if (this.listener !== null) {
|
|
this.listener.handleResult(this._result);
|
|
}
|
|
}
|
|
finally {
|
|
this._result = null;
|
|
}
|
|
},
|
|
|
|
// Parsing functions
|
|
parseAsync: function (requestObserver, uri) {
|
|
this._init(uri);
|
|
this._reader.parseAsync(requestObserver);
|
|
},
|
|
|
|
// Fetch API
|
|
|
|
onResponseAvailable(response) {
|
|
return this._reader.onResponseAvailable(response);
|
|
},
|
|
|
|
// nsISAXErrorHandler
|
|
|
|
// We only care about fatal errors. When this happens, we may have
|
|
// parsed through the feed metadata and some number of entries. The
|
|
// listener can still show some of that data if it wants, and we'll
|
|
// set the bozo bit to indicate we were unable to parse all the way
|
|
// through.
|
|
fatalError: function () {
|
|
this._result.bozo = true;
|
|
// XXX need to QI to FeedProgressListener
|
|
if (!this._haveSentResult) {
|
|
this._sendResult();
|
|
}
|
|
},
|
|
|
|
// nsISAXContentHandler
|
|
|
|
startDocument: function () {
|
|
// LOG("----------");
|
|
},
|
|
|
|
endDocument: function () {
|
|
if (!this._haveSentResult) {
|
|
this._sendResult();
|
|
}
|
|
},
|
|
|
|
// The transitions defined above identify elements that contain more
|
|
// than just text. For example RSS items contain many fields, and so
|
|
// do Atom authors. The only commonly used elements that contain
|
|
// mixed content are Atom Text Constructs of type="xhtml", which we
|
|
// delegate to another handler for cleaning. That leaves a couple
|
|
// different types of elements to deal with: those that should occur
|
|
// only once, such as title elements, and those that can occur
|
|
// multiple times, such as the RSS category element and the Atom
|
|
// link element. Most of the RSS1/DC elements can occur multiple
|
|
// times in theory, but in practice, the only ones that do have
|
|
// analogues in Atom.
|
|
//
|
|
// Some elements are also groups of attributes or sub-elements,
|
|
// while others are simple text fields. For the most part, we don't
|
|
// have to pay explicit attention to the simple text elements,
|
|
// unless we want to post-process the resulting string to transform
|
|
// it into some richer object like a Date or URI.
|
|
//
|
|
// Elements that have more sophisticated content models still end up
|
|
// being dictionaries, whether they are based on attributes like RSS
|
|
// cloud, sub-elements like Atom author, or even items and
|
|
// entries. These elements are treated as "containers". It's
|
|
// theoretically possible for a container to have an attribute with
|
|
// the same universal name as a sub-element, but none of the feed
|
|
// formats allow this by default, and I don't of any extension that
|
|
// works this way.
|
|
//
|
|
startElement: function (uri, localName, qName, attributes) {
|
|
this._buf = "";
|
|
++this._depth;
|
|
var elementInfo;
|
|
|
|
// LOG("<" + localName + ">");
|
|
|
|
// Check for xml:base
|
|
var base = (attributes.getNamedItemNS(XMLNS, "base") || {}).value;
|
|
if (base) {
|
|
this._xmlBaseStack[this._depth]
|
|
= strToURI(base, this._xmlBaseStack[this._xmlBaseStack.length - 1]);
|
|
}
|
|
|
|
// To identify the element we're dealing with, we look up the
|
|
// namespace URI in our gNamespaces dictionary, which will give us
|
|
// a "canonical" prefix for a namespace URI. For example, this
|
|
// allows Dublin Core "creator" elements to be consistently mapped
|
|
// to "dc:creator", for easy field access by consumer code. This
|
|
// strategy also happens to shorten up our state table.
|
|
var key = this._prefixForNS(uri) + localName;
|
|
|
|
// Check to see if we need to hand this off to our XHTML handler.
|
|
// The elements we're dealing with will look like this:
|
|
//
|
|
// <title type="xhtml">
|
|
// <div xmlns="http://www.w3.org/1999/xhtml">
|
|
// A title with <b>bold</b> and <i>italics</i>.
|
|
// </div>
|
|
// </title>
|
|
//
|
|
// When it returns in returnFromXHTMLHandler, the handler should
|
|
// give us back a string like this:
|
|
//
|
|
// "A title with <b>bold</b> and <i>italics</i>."
|
|
//
|
|
// The Atom spec explicitly says the div is not part of the content,
|
|
// and explicitly allows whitespace collapsing.
|
|
//
|
|
if ((this._result.version == "atom" || this._result.version == "atom03")
|
|
&& this._textConstructs[key]) {
|
|
var type = (attributes.getNamedItemNS("", "type") || {}).value;
|
|
if (type && type.includes("xhtml")) {
|
|
this._xhtmlHandler
|
|
= new XHTMLHandler(this, (this._result.version == "atom"));
|
|
this._reader.contentHandler = this._xhtmlHandler;
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Check our current state, and see if that state has a defined
|
|
// transition. For example, this._trans["atom:entry"]["atom:author"]
|
|
// will have one, and it tells us to add an item to our authors array.
|
|
if (this._trans[this._state] && this._trans[this._state][key]) {
|
|
elementInfo = this._trans[this._state][key];
|
|
}
|
|
else {
|
|
// If we don't have a transition, hand off to extension handler
|
|
this._extensionHandler = new ExtensionHandler(this);
|
|
this._reader.contentHandler = this._extensionHandler;
|
|
this._extensionHandler.startElement(uri, localName, qName, attributes);
|
|
return;
|
|
}
|
|
|
|
// This distinguishes wrappers like 'channel' from elements
|
|
// we'd actually like to do something with (which will test true).
|
|
this._handlerStack[this._depth] = elementInfo;
|
|
if (elementInfo.isWrapper) {
|
|
this._state = "IN_" + elementInfo.fieldName.toUpperCase();
|
|
this._stack.push([this._feed, this._state]);
|
|
}
|
|
else if (elementInfo.feedVersion) {
|
|
this._state = "IN_" + elementInfo.fieldName.toUpperCase();
|
|
|
|
// Check for the older RSS2 variants
|
|
if (elementInfo.feedVersion == "rss2") {
|
|
elementInfo.feedVersion = this._findRSSVersion(attributes);
|
|
}
|
|
else if (uri == RSS090NS) {
|
|
elementInfo.feedVersion = "rss090";
|
|
}
|
|
|
|
this._docVerified(elementInfo.feedVersion);
|
|
this._stack.push([this._feed, this._state]);
|
|
this._mapAttributes(this._feed, attributes);
|
|
}
|
|
else {
|
|
this._state = this._processComplexElement(elementInfo, attributes);
|
|
}
|
|
},
|
|
|
|
// In the endElement handler, we decrement the stack and look
|
|
// for cleanup/transition functions to execute. The second part
|
|
// of the state transition works as above in startElement, but
|
|
// the state we're looking for is prefixed with an underscore
|
|
// to distinguish endElement events from startElement events.
|
|
endElement: function (_uri, _localName, _qName) {
|
|
var elementInfo = this._handlerStack[this._depth];
|
|
// LOG("</" + localName + ">");
|
|
if (elementInfo && !elementInfo.isWrapper) {
|
|
this._closeComplexElement(elementInfo);
|
|
}
|
|
|
|
// cut down xml:base context
|
|
if (this._xmlBaseStack.length == this._depth + 1) {
|
|
this._xmlBaseStack = this._xmlBaseStack.slice(0, this._depth);
|
|
}
|
|
|
|
// our new state is whatever is at the top of the stack now
|
|
if (this._stack.length > 0) {
|
|
this._state = this._stack[this._stack.length - 1][1];
|
|
}
|
|
this._handlerStack = this._handlerStack.slice(0, this._depth);
|
|
--this._depth;
|
|
},
|
|
|
|
// Buffer up character data. The buffer is cleared with every
|
|
// opening element.
|
|
characters: function (data) {
|
|
this._buf += data;
|
|
},
|
|
|
|
processingInstruction: function (target, data) {
|
|
if (target == "xml-stylesheet") {
|
|
var hrefAttribute = data.match(/href=["'](.*?)["']/);
|
|
if (hrefAttribute && hrefAttribute.length == 2) {
|
|
this._result.stylesheet = strToURI(hrefAttribute[1], this._result.uri);
|
|
}
|
|
}
|
|
},
|
|
|
|
// end of nsISAXContentHandler
|
|
|
|
// Handle our more complicated elements--those that contain
|
|
// attributes and child elements.
|
|
_processComplexElement: function (elementInfo, attributes) {
|
|
var obj;
|
|
|
|
// If the container is an entry/item, it'll need to have its
|
|
// more esoteric properties put in the 'fields' property bag.
|
|
const Class = elementInfo.containerClass;
|
|
if (Class == Entry) {
|
|
obj = new Class();
|
|
obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
this._mapAttributes(obj.fields, attributes);
|
|
}
|
|
else if (elementInfo.containerClass) {
|
|
obj = new Class();
|
|
obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
obj.attributes = attributes; // just set the SAX attributes
|
|
}
|
|
else {
|
|
obj = {};
|
|
this._mapAttributes(obj, attributes);
|
|
}
|
|
|
|
// We should have a container/propertyBag that's had its
|
|
// attributes processed. Now we need to attach it to its
|
|
// container.
|
|
var newProp;
|
|
|
|
// First we'll see what's on top of the stack.
|
|
var container = this._stack[this._stack.length - 1][0];
|
|
|
|
// Check to see if it has the property
|
|
var prop = container[elementInfo.fieldName];
|
|
|
|
if (elementInfo.isArray) {
|
|
if (!prop) {
|
|
container[elementInfo.fieldName] = [];
|
|
}
|
|
|
|
newProp = container[elementInfo.fieldName];
|
|
newProp.push(obj);
|
|
|
|
// If new object is an nsIFeedContainer, we want to deal with
|
|
// its member nsIPropertyBag instead.
|
|
if (obj.fields) {
|
|
newProp = obj.fields;
|
|
}
|
|
}
|
|
else {
|
|
// If it doesn't, set it.
|
|
if (!prop) {
|
|
container[elementInfo.fieldName] = obj;
|
|
}
|
|
newProp = container[elementInfo.fieldName];
|
|
}
|
|
|
|
// make our new state name, and push the property onto the stack
|
|
var newState = "IN_" + elementInfo.fieldName.toUpperCase();
|
|
this._stack.push([newProp, newState, obj]);
|
|
return newState;
|
|
},
|
|
|
|
// Sometimes we need reconcile the element content with the object
|
|
// model for a given feed. We use helper functions to do the
|
|
// munging, but we need to identify array types here, so the munging
|
|
// happens only to the last element of an array.
|
|
_closeComplexElement: function (elementInfo) {
|
|
var stateTuple = this._stack.pop();
|
|
var container = stateTuple[0];
|
|
var containerParent = stateTuple[2];
|
|
var element = null;
|
|
|
|
// If it's an array and we have to post-process,
|
|
// grab the last element
|
|
if (isArray(container)) {
|
|
element = container[container.length - 1];
|
|
}
|
|
else {
|
|
element = container;
|
|
}
|
|
|
|
// Run the post-processing function if there is one.
|
|
if (elementInfo.closeFunc) {
|
|
element = elementInfo.closeFunc(this._buf, element);
|
|
}
|
|
|
|
// If an nsIFeedContainer was on top of the stack,
|
|
// we need to normalize it
|
|
if (elementInfo.containerClass == Entry) {
|
|
containerParent.normalize();
|
|
}
|
|
|
|
// If it's an array, re-set the last element
|
|
if (isArray(container)) {
|
|
container[container.length - 1] = element;
|
|
}
|
|
},
|
|
|
|
_prefixForNS: function (uri) {
|
|
if (!uri) {
|
|
return "";
|
|
}
|
|
var prefix = gNamespaces[uri];
|
|
if (prefix) {
|
|
return prefix + ":";
|
|
}
|
|
if (uri.toLowerCase().indexOf("http://backend.userland.com") == 0) {
|
|
return "";
|
|
}
|
|
return null;
|
|
},
|
|
|
|
_mapAttributes: function (bag, attributes) {
|
|
// Cycle through the attributes, and set our properties using the
|
|
// prefix:localNames we find in our namespace dictionary.
|
|
for (var i = 0; i < attributes.length; ++i) {
|
|
var key = this._prefixForNS(attributes.item(i).namespaceURI) + attributes.item(i).localName;
|
|
var val = attributes.item(i).value;
|
|
bag[key] = val;
|
|
}
|
|
},
|
|
|
|
// Only for RSS2esque formats
|
|
_findRSSVersion: function (attributes) {
|
|
var versionAttr = (attributes.getNamedItemNS("", "version") || {}).value.trim();
|
|
var versions = {
|
|
"0.91": "rss091",
|
|
"0.92": "rss092",
|
|
"0.93": "rss093",
|
|
"0.94": "rss094"
|
|
};
|
|
if (versions[versionAttr]) {
|
|
return versions[versionAttr];
|
|
}
|
|
if (versionAttr.substr(0, 2) != "2.") {
|
|
return "rssUnknown";
|
|
}
|
|
return "rss2";
|
|
},
|
|
|
|
// unknown element values are returned here. See startElement above
|
|
// for how this works.
|
|
returnFromExtHandler: function (uri, localName, chars, attributes) {
|
|
--this._depth;
|
|
|
|
// take control of the SAX events
|
|
this._reader.contentHandler = this;
|
|
if (localName === null && chars === null) {
|
|
return;
|
|
}
|
|
|
|
// we don't take random elements inside rdf:RDF
|
|
if (this._state == "IN_RDF") {
|
|
return;
|
|
}
|
|
|
|
// Grab the top of the stack
|
|
var top = this._stack[this._stack.length - 1];
|
|
if (!top) {
|
|
return;
|
|
}
|
|
|
|
var container = top[0];
|
|
// Grab the last element if it's an array
|
|
if (isArray(container)) {
|
|
var contract = this._handlerStack[this._depth].containerClass;
|
|
// check if it's something specific, but not an entry
|
|
if (contract && contract != Entry) {
|
|
var el = container[container.length - 1];
|
|
if (contract != Person) {
|
|
return; // don't know about this interface
|
|
}
|
|
|
|
let propName = localName;
|
|
var prefix = gNamespaces[uri];
|
|
|
|
// synonyms
|
|
if (
|
|
(uri == ""
|
|
|| prefix
|
|
&& ((prefix.indexOf("atom") > -1)
|
|
|| (prefix.indexOf("rss") > -1)))
|
|
&& (propName == "url" || propName == "href")
|
|
) {
|
|
propName = "uri";
|
|
}
|
|
|
|
try {
|
|
if (el[propName] !== "undefined") {
|
|
var propValue = chars;
|
|
// convert URI-bearing values to an nsIURI
|
|
if (propName == "uri") {
|
|
var base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
propValue = strToURI(chars, base);
|
|
}
|
|
el[propName] = propValue;
|
|
}
|
|
}
|
|
catch (e) {
|
|
// ignore XPConnect errors
|
|
}
|
|
// the rest of the function deals with entry- and feed-level stuff
|
|
return;
|
|
}
|
|
container = container[container.length - 1];
|
|
}
|
|
|
|
// Make the buffer our new property
|
|
var propName = this._prefixForNS(uri) + localName;
|
|
|
|
// But, it could be something containing HTML. If so,
|
|
// we need to know about that.
|
|
if (this._textConstructs[propName]
|
|
&& this._handlerStack[this._depth].containerClass !== null) {
|
|
var newProp = new TextConstruct();
|
|
newProp.text = chars;
|
|
// Look up the default type in our table
|
|
var type = this._textConstructs[propName];
|
|
var typeAttribute = (attributes.getNamedItemNS("", "type") || {}).value;
|
|
if (this._result.version == "atom" && typeAttribute) {
|
|
type = typeAttribute;
|
|
}
|
|
else if (this._result.version == "atom03" && typeAttribute) {
|
|
if (typeAttribute.toLowerCase().includes("xhtml")) {
|
|
type = "xhtml";
|
|
}
|
|
else if (typeAttribute.toLowerCase().includes("html")) {
|
|
type = "html";
|
|
}
|
|
else if (typeAttribute.toLowerCase().includes("text")) {
|
|
type = "text";
|
|
}
|
|
}
|
|
|
|
// If it's rss feed-level description, it's not supposed to have html
|
|
if (this._result.version.includes("rss")
|
|
&& this._handlerStack[this._depth].containerClass != Entry) {
|
|
type = "text";
|
|
}
|
|
newProp.type = type;
|
|
newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
container[propName] = newProp;
|
|
}
|
|
else {
|
|
container[propName] = chars;
|
|
}
|
|
},
|
|
|
|
// Sometimes, we'll hand off SAX handling duties to an XHTMLHandler
|
|
// (see above) that will scrape out non-XHTML stuff, normalize
|
|
// namespaces, and remove the wrapper div from Atom 1.0. When the
|
|
// XHTMLHandler is done, it'll callback here.
|
|
returnFromXHTMLHandler: function (chars, uri, localName, qName) {
|
|
// retake control of the SAX content events
|
|
this._reader.contentHandler = this;
|
|
|
|
// Grab the top of the stack
|
|
var top = this._stack[this._stack.length - 1];
|
|
if (!top) {
|
|
return;
|
|
}
|
|
var container = top[0];
|
|
|
|
// Assign the property
|
|
var newProp = new TextConstruct();
|
|
newProp.text = chars;
|
|
newProp.type = "xhtml";
|
|
newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1];
|
|
container[this._prefixForNS(uri) + localName] = newProp;
|
|
|
|
// XHTML will cause us to peek too far. The XHTML handler will
|
|
// send us an end element to call. RFC4287-valid feeds allow a
|
|
// more graceful way to handle this. Unfortunately, we can't count
|
|
// on compliance at this point.
|
|
this.endElement(uri, localName, qName);
|
|
},
|
|
};
|
|
|
|
if (typeof module == "object") {
|
|
module.exports = FeedProcessor;
|
|
}
|