/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* eslint-disable quote-props */ /* globals SAXXMLReader */ "use strict"; function LOG(str) { Zotero.debug("Feed Processor: " + str); } const XMLNS = "http://www.w3.org/XML/1998/namespace"; const RSS090NS = "http://my.netscape.com/rdf/simple/0.9/"; /** *** Some general utils *****/ function strToURI(link, base) { base = base || undefined; try { return new URL(link, base); } catch (e) { return null; } } function isArray(a) { return isObject(a) && a.constructor == Array; } function isObject(a) { return (a && typeof a == "object") || isFunction(a); } function isFunction(a) { return typeof a == "function"; } function stripTags(someHTML) { return someHTML.replace(/<[^>]+>/g, ""); } /** * Searches through an array of links and returns a JS array * of matching property bags. */ const IANA_URI = "http://www.iana.org/assignments/relation/"; function findAtomLinks(rel, links) { var rvLinks = []; for (var i = 0; i < links.length; ++i) { var linkElement = links[i]; // atom:link MUST have @href if (linkElement.href) { var relAttribute = null; if (linkElement.rel) { relAttribute = linkElement.rel; } if ((!relAttribute && rel == "alternate") || relAttribute == rel) { rvLinks.push(linkElement); continue; } // catch relations specified by IANA URI if (relAttribute == IANA_URI + rel) { rvLinks.push(linkElement); } } } return rvLinks; } function xmlEscape(s) { s = s.replace(/&/g, "&"); s = s.replace(/>/g, ">"); s = s.replace(/ 0) { ++entriesWithEnclosures; for (var e = 0; e < entry.enclosures.length; ++e) { var enc = entry.enclosures[e]; if (enc.type) { var enctype = enc.type; if (/^audio/.test(enctype)) { ++audioCount; } else if (/^image/.test(enctype)) { ++imageCount; } else if (/^video/.test(enctype)) { ++videoCount; } else { ++otherCount; } } else { ++otherCount; } } } } var feedtype = Feed.TYPE_FEED; // For a feed to be marked as TYPE_VIDEO, TYPE_AUDIO and TYPE_IMAGE, // we enforce two things: // // 1. all entries must have at least one enclosure // 2. all enclosures must be video for TYPE_VIDEO, audio for TYPE_AUDIO or image // for TYPE_IMAGE // // Otherwise it's a TYPE_FEED. if (entriesWithEnclosures == this.items.length && otherCount == 0) { if (audioCount > 0 && !videoCount && !imageCount) { feedtype = Feed.TYPE_AUDIO; } else if (imageCount > 0 && !audioCount && !videoCount) { feedtype = Feed.TYPE_IMAGE; } else if (videoCount > 0 && !audioCount && !imageCount) { feedtype = Feed.TYPE_VIDEO; } } this.type = feedtype; this.enclosureCount = otherCount + videoCount + audioCount + imageCount; }, _atomLinksToURI: function () { var links = this.fields.links; var alternates = findAtomLinks("alternate", links); if (alternates.length > 0) { var href = alternates[0].href; var base; if (alternates[0]["xml:base"]) { base = alternates[0]["xml:base"]; } this.link = this._resolveURI(href, base); } }, _resolveImageLink: function () { var base; if (this.image["xml:base"]) { base = this.image["xml:base"]; } var url = this._resolveURI(this.image.url, base); if (url) { this.image.url = url.href; } }, _resolveURI: function (linkSpec, baseSpec) { var uri = null; try { var base = baseSpec ? strToURI(baseSpec, this.baseURI) : this.baseURI; uri = strToURI(linkSpec, base); } catch (e) { LOG(e); } return uri; }, // reset the bag to raw contents, not text constructs _resetBagMembersToRawText: function (fieldLists) { for (var i = 0; i < fieldLists.length; i++) { for (var j = 0; j < fieldLists[i].length; j++) { if (this.fields[fieldLists[i][j]]) { var textConstruct = this.fields[fieldLists[i][j]]; this.fields[fieldLists[i][j]] = textConstruct.text; } } } }, }; // Implements nsIFeedEntry, nsIFeedContainer function Entry() { this.summary = null; this.content = null; this.title = null; this.fields = {}; this.link = null; this.id = null; this.baseURI = null; this.updated = null; this.published = null; this.authors = []; this.contributors = []; } Entry.prototype = { fields: null, enclosures: null, mediaContent: null, searchLists: { title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"], link: [["link", strToURI], ["rss1:link", strToURI]], id: [ ["guid", makePropGetter("guid")], "rdf:about", "atom03:id", "atom:id", ], authors: ["authors"], contributors: ["contributors"], summary: [ "description", "rss1:description", "dc:description", "atom03:summary", "atom:summary", ], content: ["content:encoded", "atom03:content", "atom:content"], rights: [ "prism:copyright", "dc:rights", "atom03:rights", "atom:rights", "copyright", ], published: [ "dc:date", "pubDate", "atom03:issued", "dcterms:issued", "atom:published", "prism:publicationDate", ], updated: [ "pubDate", "atom03:modified", "dc:date", "dcterms:modified", "atom:updated", "prism:modificationDate", ], pubTitle: ["pubTitle", "dc:source", "prism:publicationName"], pubType: ["pubType"], startPage: ["startPage", "prism:startingPage"], endPage: ["endPage", "prism:endingPage"], pageRange: ["prism:pageRange"], issn: ["prism:issn"], isbn: ["isbn", "prism:isbn"], identifier: [ "dc:identifier", "prism:doi", ], publisher: ["dc:publisher"], language: ["language", "dc:language"], volume: ["prism:volume"], issue: ["prism:number"], section: ["prism:section"], url: ["prism:url"], }, normalize: function () { fieldsToObj(this, this.searchLists); // Assign Atom link if needed if (this.fields.links) { this._atomLinksToURI(); } // Populate enclosures array this._populateEnclosures(); // The link might be a guid w/ permalink=true if (!this.link && this.fields.guid) { var guid = this.fields.guid; var isPermaLink = true; if (guid.isPermaLink) { isPermaLink = guid.isPermaLink.toLowerCase() != "false"; } if (guid && isPermaLink) { this.link = strToURI(guid.guid); } } this._resetBagMembersToRawText([ this.searchLists.content, this.searchLists.summary, this.searchLists.title, ]); }, _populateEnclosures: function () { if (this.fields.links) { this._atomLinksToEnclosures(); } // Add RSS2 enclosure to enclosures if (this.fields.enclosure) { this._enclosureToEnclosures(); } // Add media:content to enclosures if (this.fields.mediacontent) { this._mediaToEnclosures("mediacontent"); } // Add media:thumbnail to enclosures if (this.fields.mediathumbnail) { this._mediaToEnclosures("mediathumbnail"); } // Add media:content in media:group to enclosures if (this.fields.mediagroup) { this._mediaToEnclosures("mediagroup", "mediacontent"); } }, __enclosureMap: null, _addToEnclosures: function (newEnc) { // items we add to the enclosures array get displayed in the FeedWriter and // they must have non-empty urls. if (!newEnc.url || newEnc.url == "") { return; } if (this.__enclosureMap === null) { this.__enclosureMap = {}; } var previousEnc = this.__enclosureMap[newEnc.url]; if (previousEnc != undefined) { if (!previousEnc.type && newEnc.type) { previousEnc.type = newEnc.type; } if (!previousEnc.length && newEnc.length) { previousEnc.length = newEnc.length; } return; } if (this.enclosures === null) { this.enclosures = []; } this.enclosures.push(newEnc); this.__enclosureMap[newEnc.url] = newEnc; }, _atomLinksToEnclosures: function () { var links = this.fields.links; var encLinks = findAtomLinks("enclosure", links); if (encLinks.length == 0) { return; } for (var i = 0; i < encLinks.length; ++i) { var link = encLinks[i]; // an enclosure must have an href if (!link.href) { return; } var enc = {}; // copy Atom bits over to equivalent enclosure bits enc.url = link.href; if (link.type) { enc.type = link.type; } if (link.length) { enc.length = link.length; } this._addToEnclosures(enc); } }, _enclosureToEnclosures: function () { var enc = this.fields.enclosure; if (!enc.url) { return; } this._addToEnclosures(enc); }, _mediaToEnclosures: function (mediaType, contentType) { var content; // If a contentType is specified, the mediaType is a simple propertybag, // and the contentType is an array inside it. if (contentType) { var group = this.fields[mediaType]; content = group[contentType]; } else { content = this.fields[mediaType]; } for (var i = 0; i < content.length; ++i) { var contentElement = content[i]; // media:content don't require url, but if it's not there, we should // skip it. if (!contentElement.url) { continue; } var enc = {}; // copy media:content bits over to equivalent enclosure bits enc.url = contentElement.url; if (contentElement.type) { enc.type = contentElement.type; } else if (mediaType == "mediathumbnail") { // thumbnails won't have a type, but default to image types enc.type = "image/*"; enc.thumbnail = true; } if (contentElement.fileSize) { enc.length = contentElement.fileSize; } this._addToEnclosures(enc); } }, }; Entry.prototype._atomLinksToURI = Feed.prototype._atomLinksToURI; Entry.prototype._resolveURI = Feed.prototype._resolveURI; Entry.prototype._resetBagMembersToRawText = Feed.prototype._resetBagMembersToRawText; // TextConstruct represents and element that could contain (X)HTML // Implements nsIFeedTextConstruct function TextConstruct() { this.lang = null; this.base = null; this.type = "text"; this.text = null; } TextConstruct.prototype = { plainText: function () { if (this.type != "text") { return stripTags(this.text); } return this.text; }, createDocumentFragment: function () { if (this.type == "text") { const docFragment = new DOMParser().parseFromString('', 'text/html') .createDocumentFragment(); docFragment.append(this.text); return docFragment; } let parserType; if (this.type == "xhtml") { parserType = "application/xhtml+xml"; } else if (this.type == "html") { parserType = "text/html"; } else { return null; } const parsedDoc = new DOMParser().parseFromString(this.text, parserType); const docFragment = parsedDoc.createDocumentFragment(); docFragment.append(parsedDoc.documentElement); return docFragment; }, }; // Generator represents the software that produced the feed // Implements nsIFeedGenerator, nsIFeedElementBase function Generator() { this.lang = null; this.agent = null; this.version = null; this.uri = null; // nsIFeedElementBase this._attributes = null; this.baseURI = null; } Generator.prototype = { get attributes() { return this._attributes; }, set attributes(value) { this._attributes = value; this.version = (this._attributes.getNamedItemNS("", "version") || {}).value; var uriAttribute = (this._attributes.getNamedItemNS("", "uri") || {}).value || (this._attributes.getNamedItemNS("", "url") || {}).value; this.uri = strToURI(uriAttribute, this.baseURI); // RSS1 uriAttribute = (this._attributes.getNamedItemNS(RDF_NS, "resource") || {}).value; if (uriAttribute) { this.agent = uriAttribute; this.uri = strToURI(uriAttribute, this.baseURI); } }, }; // Implements nsIFeedPerson, nsIFeedElementBase function Person() { this.name = null; this.uri = null; this.email = null; // nsIFeedElementBase this.attributes = null; this.baseURI = null; } /** * Map a list of fields into properties on a container. * * @param container An nsIFeedContainer * @param fields A list of fields to search for. List members can * be a list, in which case the second member is * transformation function (like parseInt). */ function fieldsToObj(container, fields) { var props, prop, field, searchList; for (var key in fields) { searchList = fields[key]; for (var i = 0; i < searchList.length; ++i) { props = searchList[i]; prop = null; field = isArray(props) ? props[0] : props; prop = container.fields[field]; if (prop) { prop = isArray(props) ? props[1](prop) : prop; container[key] = prop; } } } } // create a generator element function atomGenerator(s, generator) { generator.agent = s.trim(); return generator; } // post-process atom:logo to create an RSS2-like structure function atomLogo(s, logo) { logo.url = s.trim(); } // post-process an RSS category, map it to the Atom fields. function rssCatTerm(s, cat) { // add slash handling? cat.term = s.trim(); return cat; } // post-process a GUID function rssGuid(s, guid) { guid.guid = s.trim(); return guid; } // post-process an RSS author element // // It can contain a field like this: // // lawyer@boyer.net (Lawyer Boyer) // // or, delightfully, a field like this: // // Simon St.Laurent (mailto:simonstl@simonstl.com) // // We want to split this up and assign it to corresponding Atom // fields. // function rssAuthor(s, author) { // check for RSS2 string format var chars = s.trim(); var matches = chars.match(/(.*)\((.*)\)/); var emailCheck = /^([a-zA-Z0-9_.-])+@(([a-zA-Z0-9-])+\.)+([a-zA-Z0-9]{2,4})+$/; if (matches) { var match1 = matches[1].trim(); var match2 = matches[2].trim(); if (match2.indexOf("mailto:") == 0) { match2 = match2.substring(7); } if (emailCheck.test(match1)) { author.email = match1; author.name = match2; } else if (emailCheck.test(match2)) { author.email = match2; author.name = match1; } else { // put it back together author.name = match1 + " (" + match2 + ")"; } } else { author.name = chars; if (chars.indexOf("@")) { author.email = chars; } } return author; } const XHTML_NS = "http://www.w3.org/1999/xhtml"; // The XHTMLHandler handles inline XHTML found in things like atom:summary function XHTMLHandler(processor, isAtom) { this._buf = ""; this._processor = processor; this._depth = 0; this._isAtom = isAtom; // a stack of lists tracking in-scope namespaces this._inScopeNS = []; } // The fidelity can be improved here, to allow handling of stuff like // SVG and MathML. XXX XHTMLHandler.prototype = { // look back up at the declared namespaces // we always use the same prefixes for our safe stuff _isInScope: function (ns) { for (var i in this._inScopeNS) { for (var uri in this._inScopeNS[i]) { if (this._inScopeNS[i][uri] == ns) { return true; } } } return false; }, startDocument: function () { }, endDocument: function () { }, startElement: function (namespace, localName, qName, attributes) { ++this._depth; this._inScopeNS.push([]); // RFC4287 requires XHTML to be wrapped in a div that is *not* part of // the content. This prevents people from screwing up namespaces, but // we need to skip it here. if (this._isAtom && this._depth == 1 && localName == "div") { return; } // If it's an XHTML element, record it. Otherwise, it's ignored. if (namespace == XHTML_NS) { this._buf += "<" + localName; var uri; for (var i = 0; i < attributes.length; ++i) { uri = attributes.item(i).namespaceURI; // XHTML attributes aren't in a namespace if (uri == "") { this._buf += (" " + attributes.item(i).localName + "='" + xmlEscape(attributes.item(i).value) + "'"); } else { // write a small set of allowed attribute namespaces var prefix = gAllowedXHTMLNamespaces[uri]; if (prefix) { // The attribute value we'll attempt to write var attributeValue = xmlEscape(attributes.item(i).value); // it's an allowed attribute NS. // write the attribute this._buf += (" " + prefix + ":" + attributes.item(i).localName + "='" + attributeValue + "'"); // write an xmlns declaration if necessary if (prefix != "xml" && !this._isInScope(uri)) { this._inScopeNS[this._inScopeNS.length - 1].push(uri); this._buf += " xmlns:" + prefix + "='" + uri + "'"; } } } } this._buf += ">"; } }, endElement: function (uri, localName, qName) { --this._depth; this._inScopeNS.pop(); // We need to skip outer divs in Atom. See comment in startElement. if (this._isAtom && this._depth == 0 && localName == "div") { return; } // When we peek too far, go back to the main processor if (this._depth < 0) { this._processor.returnFromXHTMLHandler(this._buf.trim(), uri, localName, qName); return; } // If it's an XHTML element, record it. Otherwise, it's ignored. if (uri == XHTML_NS) { this._buf += ""; } }, characters: function (data) { this._buf += xmlEscape(data); }, processingInstruction: function () { }, }; /** * The ExtensionHandler deals with elements we haven't explicitly * added to our transition table in the FeedProcessor. */ function ExtensionHandler(processor) { this._buf = ""; this._depth = 0; this._hasChildElements = false; // The FeedProcessor this._processor = processor; // Fields of the outermost extension element. this._localName = null; this._uri = null; this._qName = null; this._attrs = null; } ExtensionHandler.prototype = { startDocument: function () { }, endDocument: function () { }, startElement: function (uri, localName, qName, attrs) { ++this._depth; if (this._depth == 1) { this._uri = uri; this._localName = localName; this._qName = qName; this._attrs = attrs; } // if we descend into another element, we won't send text this._hasChildElements = (this._depth > 1); }, endElement: function (_uri, _localName, _qName) { --this._depth; if (this._depth == 0) { var text = this._hasChildElements ? null : this._buf.trim(); this._processor.returnFromExtHandler(this._uri, this._localName, text, this._attrs); } }, characters: function (data) { if (!this._hasChildElements) { this._buf += data; } }, processingInstruction: function () { }, }; /** * ElementInfo is a simple container object that describes * some characteristics of a feed element. For example, it * says whether an element can be expected to appear more * than once inside a given entry or feed. */ function ElementInfo(fieldName, containerClass, closeFunc, isArray) { this.fieldName = fieldName; this.containerClass = containerClass; this.closeFunc = closeFunc; this.isArray = isArray; this.isWrapper = false; } /** * FeedElementInfo represents a feed element, usually the root. */ function FeedElementInfo(fieldName, feedVersion) { this.isWrapper = false; this.fieldName = fieldName; this.feedVersion = feedVersion; } /** * Some feed formats include vestigial wrapper elements that we don't * want to include in our object model, but we do need to keep track * of during parsing. */ function WrapperElementInfo(fieldName) { this.isWrapper = true; this.fieldName = fieldName; } /** *** The Processor *****/ // Implements nsIFeedProcessor, nsISAXContentHandler, nsISAXErrorHandler, // nsIStreamListener, nsIRequestObserver function FeedProcessor() { this._reader = new SAXXMLReader(); this._buf = ""; this._feed = {}; this._handlerStack = []; this._xmlBaseStack = []; // sparse array keyed to nesting depth this._depth = 0; this._state = "START"; this._result = null; this._extensionHandler = null; this._xhtmlHandler = null; this._haveSentResult = false; // The nsIFeedResultListener waiting for the parse results this.listener = null; // These elements can contain (X)HTML or plain text. // We keep a table here that contains their default treatment this._textConstructs = { "atom:title": "text", "atom:summary": "text", "atom:rights": "text", "atom:content": "text", "atom:subtitle": "text", "description": "html", "rss1:description": "html", "dc:description": "html", "content:encoded": "html", "title": "text", "rss1:title": "text", "atom03:title": "text", "atom03:tagline": "text", "atom03:summary": "text", "atom03:content": "text", "dc:title": "text", "dc:rights": "text", "atom03:rights": "text", "copyright": "text", "prism:copyright": "text", }; this._stack = []; this._trans = { "START": { // If we hit a root RSS element, treat as RSS2. "rss": new FeedElementInfo("RSS2", "rss2"), // If we hit an RDF element, if could be RSS1, but we can't // verify that until we hit a rss1:channel element. "rdf:RDF": new WrapperElementInfo("RDF"), // If we hit a Atom 1.0 element, treat as Atom 1.0. "atom:feed": new FeedElementInfo("Atom", "atom"), // Treat as Atom 0.3 "atom03:feed": new FeedElementInfo("Atom03", "atom03"), }, /** ******* RSS2 **********/ "IN_RSS2": { "channel": new WrapperElementInfo("channel"), }, "IN_CHANNEL": { "item": new ElementInfo("items", Entry, null, true), "managingEditor": new ElementInfo("authors", Person, rssAuthor, true), "dc:creator": new ElementInfo("authors", Person, rssAuthor, true), "dc:author": new ElementInfo("authors", Person, rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true), "category": new ElementInfo("categories", null, rssCatTerm, true), "cloud": new ElementInfo("cloud", null, null, false), "image": new ElementInfo("image", null, null, false), "textInput": new ElementInfo("textInput", null, null, false), "skipDays": new ElementInfo("skipDays", null, null, false), "skipHours": new ElementInfo("skipHours", null, null, false), "generator": new ElementInfo("generator", Generator, atomGenerator, false), }, "IN_ITEMS": { "author": new ElementInfo("authors", Person, rssAuthor, true), "dc:creator": new ElementInfo("authors", Person, rssAuthor, true), "dc:author": new ElementInfo("authors", Person, rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true), "category": new ElementInfo("categories", null, rssCatTerm, true), "enclosure": new ElementInfo("enclosure", null, null, false), "media:content": new ElementInfo("mediacontent", null, null, true), "media:group": new ElementInfo("mediagroup", null, null, false), "media:thumbnail": new ElementInfo("mediathumbnail", null, null, true), "guid": new ElementInfo("guid", null, rssGuid, false), }, "IN_SKIPDAYS": { "day": new ElementInfo("days", null, null, true), }, "IN_SKIPHOURS": { "hour": new ElementInfo("hours", null, null, true), }, "IN_MEDIAGROUP": { "media:content": new ElementInfo("mediacontent", null, null, true), "media:thumbnail": new ElementInfo("mediathumbnail", null, null, true), }, /** ******* RSS1 **********/ "IN_RDF": { // If we hit a rss1:channel, we can verify that we have RSS1 "rss1:channel": new FeedElementInfo("rdf_channel", "rss1"), "rss1:image": new ElementInfo("image", null, null, false), "rss1:textinput": new ElementInfo("textInput", null, null, false), "rss1:item": new ElementInfo("items", Entry, null, true), }, "IN_RDF_CHANNEL": { "admin:generatorAgent": new ElementInfo("generator", Generator, null, false), "dc:creator": new ElementInfo("authors", Person, rssAuthor, true), "dc:author": new ElementInfo("authors", Person, rssAuthor, true), "dc:contributor": new ElementInfo("contributors", Person, rssAuthor, true), }, /** ******* ATOM 1.0 **********/ "IN_ATOM": { "atom:author": new ElementInfo("authors", Person, null, true), "atom:generator": new ElementInfo("generator", Generator, atomGenerator, false), "atom:contributor": new ElementInfo("contributors", Person, null, true), "atom:link": new ElementInfo("links", null, null, true), "atom:logo": new ElementInfo("atom:logo", null, atomLogo, false), "atom:entry": new ElementInfo("entries", Entry, null, true), }, "IN_ENTRIES": { "atom:author": new ElementInfo("authors", Person, null, true), "atom:contributor": new ElementInfo("contributors", Person, null, true), "atom:link": new ElementInfo("links", null, null, true), }, /** ******* ATOM 0.3 **********/ "IN_ATOM03": { "atom03:author": new ElementInfo("authors", Person, null, true), "atom03:contributor": new ElementInfo("contributors", Person, null, true), "atom03:link": new ElementInfo("links", null, null, true), "atom03:entry": new ElementInfo("atom03_entries", Entry, null, true), "atom03:generator": new ElementInfo("generator", Generator, atomGenerator, false), }, "IN_ATOM03_ENTRIES": { "atom03:author": new ElementInfo("authors", Person, null, true), "atom03:contributor": new ElementInfo("contributors", Person, null, true), "atom03:link": new ElementInfo("links", null, null, true), "atom03:entry": new ElementInfo("atom03_entries", Entry, null, true), }, }; } // See startElement for a long description of how feeds are processed. FeedProcessor.prototype = { // Set ourselves as the SAX handler, and set the base URI _init: function (uri) { this._reader.contentHandler = this; this._reader.errorHandler = this; this._result = new FeedResult(); if (uri) { this._result.uri = uri; this._reader.baseURI = uri; this._xmlBaseStack[0] = uri; } }, // This function is called once we figure out what type of feed // we're dealing with. Some feed types require digging a bit further // than the root. _docVerified: function (version) { this._result.doc = new Feed(); this._result.doc.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1]; this._result.doc.fields = this._feed; this._result.version = version; }, // When we're done with the feed, let the listener know what // happened. _sendResult: function () { this._haveSentResult = true; try { // Can be null when a non-feed is fed to us if (this._result.doc) { this._result.doc.normalize(); } } catch (e) { LOG("FIXME: " + e); } try { if (this.listener !== null) { this.listener.handleResult(this._result); } } finally { this._result = null; } }, // Parsing functions parseAsync: function (requestObserver, uri) { this._init(uri); this._reader.parseAsync(requestObserver); }, // Fetch API onResponseAvailable(response) { return this._reader.onResponseAvailable(response); }, // nsISAXErrorHandler // We only care about fatal errors. When this happens, we may have // parsed through the feed metadata and some number of entries. The // listener can still show some of that data if it wants, and we'll // set the bozo bit to indicate we were unable to parse all the way // through. fatalError: function () { this._result.bozo = true; // XXX need to QI to FeedProgressListener if (!this._haveSentResult) { this._sendResult(); } }, // nsISAXContentHandler startDocument: function () { // LOG("----------"); }, endDocument: function () { if (!this._haveSentResult) { this._sendResult(); } }, // The transitions defined above identify elements that contain more // than just text. For example RSS items contain many fields, and so // do Atom authors. The only commonly used elements that contain // mixed content are Atom Text Constructs of type="xhtml", which we // delegate to another handler for cleaning. That leaves a couple // different types of elements to deal with: those that should occur // only once, such as title elements, and those that can occur // multiple times, such as the RSS category element and the Atom // link element. Most of the RSS1/DC elements can occur multiple // times in theory, but in practice, the only ones that do have // analogues in Atom. // // Some elements are also groups of attributes or sub-elements, // while others are simple text fields. For the most part, we don't // have to pay explicit attention to the simple text elements, // unless we want to post-process the resulting string to transform // it into some richer object like a Date or URI. // // Elements that have more sophisticated content models still end up // being dictionaries, whether they are based on attributes like RSS // cloud, sub-elements like Atom author, or even items and // entries. These elements are treated as "containers". It's // theoretically possible for a container to have an attribute with // the same universal name as a sub-element, but none of the feed // formats allow this by default, and I don't of any extension that // works this way. // startElement: function (uri, localName, qName, attributes) { this._buf = ""; ++this._depth; var elementInfo; // LOG("<" + localName + ">"); // Check for xml:base var base = (attributes.getNamedItemNS(XMLNS, "base") || {}).value; if (base) { this._xmlBaseStack[this._depth] = strToURI(base, this._xmlBaseStack[this._xmlBaseStack.length - 1]); } // To identify the element we're dealing with, we look up the // namespace URI in our gNamespaces dictionary, which will give us // a "canonical" prefix for a namespace URI. For example, this // allows Dublin Core "creator" elements to be consistently mapped // to "dc:creator", for easy field access by consumer code. This // strategy also happens to shorten up our state table. var key = this._prefixForNS(uri) + localName; // Check to see if we need to hand this off to our XHTML handler. // The elements we're dealing with will look like this: // // // <div xmlns="http://www.w3.org/1999/xhtml"> // A title with <b>bold</b> and <i>italics</i>. // </div> // // // When it returns in returnFromXHTMLHandler, the handler should // give us back a string like this: // // "A title with bold and italics." // // The Atom spec explicitly says the div is not part of the content, // and explicitly allows whitespace collapsing. // if ((this._result.version == "atom" || this._result.version == "atom03") && this._textConstructs[key]) { var type = (attributes.getNamedItemNS("", "type") || {}).value; if (type && type.includes("xhtml")) { this._xhtmlHandler = new XHTMLHandler(this, (this._result.version == "atom")); this._reader.contentHandler = this._xhtmlHandler; return; } } // Check our current state, and see if that state has a defined // transition. For example, this._trans["atom:entry"]["atom:author"] // will have one, and it tells us to add an item to our authors array. if (this._trans[this._state] && this._trans[this._state][key]) { elementInfo = this._trans[this._state][key]; } else { // If we don't have a transition, hand off to extension handler this._extensionHandler = new ExtensionHandler(this); this._reader.contentHandler = this._extensionHandler; this._extensionHandler.startElement(uri, localName, qName, attributes); return; } // This distinguishes wrappers like 'channel' from elements // we'd actually like to do something with (which will test true). this._handlerStack[this._depth] = elementInfo; if (elementInfo.isWrapper) { this._state = "IN_" + elementInfo.fieldName.toUpperCase(); this._stack.push([this._feed, this._state]); } else if (elementInfo.feedVersion) { this._state = "IN_" + elementInfo.fieldName.toUpperCase(); // Check for the older RSS2 variants if (elementInfo.feedVersion == "rss2") { elementInfo.feedVersion = this._findRSSVersion(attributes); } else if (uri == RSS090NS) { elementInfo.feedVersion = "rss090"; } this._docVerified(elementInfo.feedVersion); this._stack.push([this._feed, this._state]); this._mapAttributes(this._feed, attributes); } else { this._state = this._processComplexElement(elementInfo, attributes); } }, // In the endElement handler, we decrement the stack and look // for cleanup/transition functions to execute. The second part // of the state transition works as above in startElement, but // the state we're looking for is prefixed with an underscore // to distinguish endElement events from startElement events. endElement: function (_uri, _localName, _qName) { var elementInfo = this._handlerStack[this._depth]; // LOG(""); if (elementInfo && !elementInfo.isWrapper) { this._closeComplexElement(elementInfo); } // cut down xml:base context if (this._xmlBaseStack.length == this._depth + 1) { this._xmlBaseStack = this._xmlBaseStack.slice(0, this._depth); } // our new state is whatever is at the top of the stack now if (this._stack.length > 0) { this._state = this._stack[this._stack.length - 1][1]; } this._handlerStack = this._handlerStack.slice(0, this._depth); --this._depth; }, // Buffer up character data. The buffer is cleared with every // opening element. characters: function (data) { this._buf += data; }, processingInstruction: function (target, data) { if (target == "xml-stylesheet") { var hrefAttribute = data.match(/href=["'](.*?)["']/); if (hrefAttribute && hrefAttribute.length == 2) { this._result.stylesheet = strToURI(hrefAttribute[1], this._result.uri); } } }, // end of nsISAXContentHandler // Handle our more complicated elements--those that contain // attributes and child elements. _processComplexElement: function (elementInfo, attributes) { var obj; // If the container is an entry/item, it'll need to have its // more esoteric properties put in the 'fields' property bag. const Class = elementInfo.containerClass; if (Class == Entry) { obj = new Class(); obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1]; this._mapAttributes(obj.fields, attributes); } else if (elementInfo.containerClass) { obj = new Class(); obj.baseURI = this._xmlBaseStack[this._xmlBaseStack.length - 1]; obj.attributes = attributes; // just set the SAX attributes } else { obj = {}; this._mapAttributes(obj, attributes); } // We should have a container/propertyBag that's had its // attributes processed. Now we need to attach it to its // container. var newProp; // First we'll see what's on top of the stack. var container = this._stack[this._stack.length - 1][0]; // Check to see if it has the property var prop = container[elementInfo.fieldName]; if (elementInfo.isArray) { if (!prop) { container[elementInfo.fieldName] = []; } newProp = container[elementInfo.fieldName]; newProp.push(obj); // If new object is an nsIFeedContainer, we want to deal with // its member nsIPropertyBag instead. if (obj.fields) { newProp = obj.fields; } } else { // If it doesn't, set it. if (!prop) { container[elementInfo.fieldName] = obj; } newProp = container[elementInfo.fieldName]; } // make our new state name, and push the property onto the stack var newState = "IN_" + elementInfo.fieldName.toUpperCase(); this._stack.push([newProp, newState, obj]); return newState; }, // Sometimes we need reconcile the element content with the object // model for a given feed. We use helper functions to do the // munging, but we need to identify array types here, so the munging // happens only to the last element of an array. _closeComplexElement: function (elementInfo) { var stateTuple = this._stack.pop(); var container = stateTuple[0]; var containerParent = stateTuple[2]; var element = null; // If it's an array and we have to post-process, // grab the last element if (isArray(container)) { element = container[container.length - 1]; } else { element = container; } // Run the post-processing function if there is one. if (elementInfo.closeFunc) { element = elementInfo.closeFunc(this._buf, element); } // If an nsIFeedContainer was on top of the stack, // we need to normalize it if (elementInfo.containerClass == Entry) { containerParent.normalize(); } // If it's an array, re-set the last element if (isArray(container)) { container[container.length - 1] = element; } }, _prefixForNS: function (uri) { if (!uri) { return ""; } var prefix = gNamespaces[uri]; if (prefix) { return prefix + ":"; } if (uri.toLowerCase().indexOf("http://backend.userland.com") == 0) { return ""; } return null; }, _mapAttributes: function (bag, attributes) { // Cycle through the attributes, and set our properties using the // prefix:localNames we find in our namespace dictionary. for (var i = 0; i < attributes.length; ++i) { var key = this._prefixForNS(attributes.item(i).namespaceURI) + attributes.item(i).localName; var val = attributes.item(i).value; bag[key] = val; } }, // Only for RSS2esque formats _findRSSVersion: function (attributes) { var versionAttr = (attributes.getNamedItemNS("", "version") || {}).value.trim(); var versions = { "0.91": "rss091", "0.92": "rss092", "0.93": "rss093", "0.94": "rss094" }; if (versions[versionAttr]) { return versions[versionAttr]; } if (versionAttr.substr(0, 2) != "2.") { return "rssUnknown"; } return "rss2"; }, // unknown element values are returned here. See startElement above // for how this works. returnFromExtHandler: function (uri, localName, chars, attributes) { --this._depth; // take control of the SAX events this._reader.contentHandler = this; if (localName === null && chars === null) { return; } // we don't take random elements inside rdf:RDF if (this._state == "IN_RDF") { return; } // Grab the top of the stack var top = this._stack[this._stack.length - 1]; if (!top) { return; } var container = top[0]; // Grab the last element if it's an array if (isArray(container)) { var contract = this._handlerStack[this._depth].containerClass; // check if it's something specific, but not an entry if (contract && contract != Entry) { var el = container[container.length - 1]; if (contract != Person) { return; // don't know about this interface } let propName = localName; var prefix = gNamespaces[uri]; // synonyms if ( (uri == "" || prefix && ((prefix.indexOf("atom") > -1) || (prefix.indexOf("rss") > -1))) && (propName == "url" || propName == "href") ) { propName = "uri"; } try { if (el[propName] !== "undefined") { var propValue = chars; // convert URI-bearing values to an nsIURI if (propName == "uri") { var base = this._xmlBaseStack[this._xmlBaseStack.length - 1]; propValue = strToURI(chars, base); } el[propName] = propValue; } } catch (e) { // ignore XPConnect errors } // the rest of the function deals with entry- and feed-level stuff return; } container = container[container.length - 1]; } // Make the buffer our new property var propName = this._prefixForNS(uri) + localName; // But, it could be something containing HTML. If so, // we need to know about that. if (this._textConstructs[propName] && this._handlerStack[this._depth].containerClass !== null) { var newProp = new TextConstruct(); newProp.text = chars; // Look up the default type in our table var type = this._textConstructs[propName]; var typeAttribute = (attributes.getNamedItemNS("", "type") || {}).value; if (this._result.version == "atom" && typeAttribute) { type = typeAttribute; } else if (this._result.version == "atom03" && typeAttribute) { if (typeAttribute.toLowerCase().includes("xhtml")) { type = "xhtml"; } else if (typeAttribute.toLowerCase().includes("html")) { type = "html"; } else if (typeAttribute.toLowerCase().includes("text")) { type = "text"; } } // If it's rss feed-level description, it's not supposed to have html if (this._result.version.includes("rss") && this._handlerStack[this._depth].containerClass != Entry) { type = "text"; } newProp.type = type; newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1]; container[propName] = newProp; } else { container[propName] = chars; } }, // Sometimes, we'll hand off SAX handling duties to an XHTMLHandler // (see above) that will scrape out non-XHTML stuff, normalize // namespaces, and remove the wrapper div from Atom 1.0. When the // XHTMLHandler is done, it'll callback here. returnFromXHTMLHandler: function (chars, uri, localName, qName) { // retake control of the SAX content events this._reader.contentHandler = this; // Grab the top of the stack var top = this._stack[this._stack.length - 1]; if (!top) { return; } var container = top[0]; // Assign the property var newProp = new TextConstruct(); newProp.text = chars; newProp.type = "xhtml"; newProp.base = this._xmlBaseStack[this._xmlBaseStack.length - 1]; container[this._prefixForNS(uri) + localName] = newProp; // XHTML will cause us to peek too far. The XHTML handler will // send us an end element to call. RFC4287-valid feeds allow a // more graceful way to handle this. Unfortunately, we can't count // on compliance at this point. this.endElement(uri, localName, qName); }, }; if (typeof module == "object") { module.exports = FeedProcessor; }