Feeds: Refactor to move field extraction to FeedProcessor (#4158)

And support more PRISM fields.
This commit is contained in:
Abe Jellinek 2024-05-22 18:29:58 -04:00 committed by GitHub
parent 101e6d55d5
commit 59afef6c2e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 181 additions and 73 deletions

View file

@ -75,31 +75,25 @@ Zotero.FeedReader = function (url) {
// TODO: image as icon // TODO: image as icon
let publicationTitle = Zotero.FeedReader._getFeedField(feed, 'publicationName', 'prism') let publicationTitle = feed.pubTitle;
|| Zotero.FeedReader._getFeedField(feed, 'pubTitle');
if (publicationTitle) info.publicationTitle = publicationTitle; if (publicationTitle) info.publicationTitle = publicationTitle;
let publisher = Zotero.FeedReader._getFeedField(feed, 'publisher', 'dc'); let publisher = feed.publisher;
if (publisher) info.publisher = publisher; if (publisher) info.publisher = publisher;
let rights = (feed.rights && feed.rights.plainText()) let rights = feed.rights;
|| Zotero.FeedReader._getFeedField(feed, 'copyright', 'prism')
|| Zotero.FeedReader._getFeedField(feed, 'rights', 'dc')
|| Zotero.FeedReader._getFeedField(feed, 'copyright');
if (rights) info.rights = rights; if (rights) info.rights = rights;
let issn = Zotero.FeedReader._getFeedField(feed, 'issn', 'prism'); let issn = feed.issn;
if (issn) info.ISSN = issn; if (issn) info.ISSN = issn;
let isbn = Zotero.FeedReader._getFeedField(feed, 'isbn', 'prism') let isbn = feed.isbn;
|| Zotero.FeedReader._getFeedField(feed, 'isbn');
if (isbn) info.ISBN = isbn; if (isbn) info.ISBN = isbn;
let language = Zotero.FeedReader._getFeedField(feed, 'language', 'dc') let language = feed.language;
|| Zotero.FeedReader._getFeedField(feed, 'language');
if (language) info.language = language; if (language) info.language = language;
let ttl = Zotero.FeedReader._getFeedField(feed, 'ttl'); let ttl = feed.ttl;
if (ttl) info.ttl = ttl; if (ttl) info.ttl = ttl;
this._feedProperties = info; this._feedProperties = info;
@ -435,7 +429,8 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
if (splitAt <= title.length) item.title += '...'; if (splitAt <= title.length) item.title += '...';
} }
if (feedEntry.link) item.url = feedEntry.link.href; let url = feedEntry.link?.href || feedEntry.url.plainText();
if (url) item.url = url;
if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights'); if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights');
@ -453,40 +448,34 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
/** Done with basic metadata, now look for better data **/ /** Done with basic metadata, now look for better data **/
let date = Zotero.FeedReader._getFeedField(feedEntry, 'publicationDate', 'prism') let date = feedEntry.updated || feedEntry.published;
|| Zotero.FeedReader._getFeedField(feedEntry, 'date', 'dc')
// DEBUG: Why not get these from the feedEntry?
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubDate') // RSS
|| Zotero.FeedReader._getFeedField(feedEntry, 'updated', 'atom') // Atom
|| Zotero.FeedReader._getFeedField(feedEntry, 'published', 'atom'); // Atom
if (date) item.date = date; if (date) item.date = date;
let publicationTitle = Zotero.FeedReader._getFeedField(feedEntry, 'publicationName', 'prism') let publicationTitle = feedEntry.pubTitle;
|| Zotero.FeedReader._getFeedField(feedEntry, 'source', 'dc')
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubTitle');
if (publicationTitle) item.publicationTitle = publicationTitle; if (publicationTitle) item.publicationTitle = publicationTitle;
let publicationType = Zotero.FeedReader._getFeedField(feedEntry, 'pubType'); let publicationType = feedEntry.pubType;
if (publicationType) item.publicationType = publicationType; if (publicationType) item.publicationType = publicationType;
let startPage = Zotero.FeedReader._getFeedField(feedEntry, 'startPage'); let startPage = feedEntry.startPage;
let endPage = Zotero.FeedReader._getFeedField(feedEntry, 'endPage'); let endPage = feedEntry.endPage;
if (startPage || endPage) { if (startPage || endPage) {
item.pages = (startPage || '') item.pages = (startPage || '')
+ (endPage && startPage ? '' : '') + (endPage && startPage ? '' : '')
+ (endPage || ''); + (endPage || '');
} }
else {
let pageRange = feedEntry.pageRange;
if (pageRange) item.pages = pageRange;
}
let issn = Zotero.FeedReader._getFeedField(feedEntry, 'issn', 'prism'); let issn = feedEntry.issn;
if (issn) item.ISSN = issn; if (issn) item.ISSN = issn;
let isbn = Zotero.FeedReader._getFeedField(feedEntry, 'isbn', 'prism') let isbn = feedEntry.isbn;
|| Zotero.FeedReader._getFeedField(feedEntry, 'isbn');
if (isbn) item.ISBN = isbn; if (isbn) item.ISBN = isbn;
let identifier = Zotero.FeedReader._getFeedField(feedEntry, 'identifier', 'dc'); let identifier = feedEntry.identifier;
if (identifier) { if (identifier) {
for (let type of ['DOI', 'ISBN', 'ISSN']) { for (let type of ['DOI', 'ISBN', 'ISSN']) {
let cleanId = Zotero.Utilities[`clean${type}`](identifier); let cleanId = Zotero.Utilities[`clean${type}`](identifier);
@ -497,18 +486,24 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
} }
} }
let publisher = Zotero.FeedReader._getFeedField(feedEntry, 'publisher', 'dc'); let publisher = feedEntry.publisher;
if (publisher) item.publisher = publisher; if (publisher) item.publisher = publisher;
let rights = Zotero.FeedReader._getFeedField(feedEntry, 'copyright', 'prism') let rights = feedEntry.rights;
|| Zotero.FeedReader._getFeedField(feedEntry, 'rights', 'dc')
|| Zotero.FeedReader._getFeedField(feedEntry, 'copyright');
if (rights) item.rights = rights; if (rights) item.rights = rights;
let language = Zotero.FeedReader._getFeedField(feedEntry, 'language', 'dc') let language = feedEntry.language;
|| Zotero.FeedReader._getFeedField(feedEntry, 'language');
if (language) item.language = language; if (language) item.language = language;
let volume = feedEntry.volume;
if (volume) item.volume = volume;
let issue = feedEntry.issue;
if (issue) item.issue = issue;
let section = feedEntry.section;
if (section) item.section = section;
/** Incorporate missing values from feed metadata **/ /** Incorporate missing values from feed metadata **/
let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language']; let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];

View file

@ -103,6 +103,10 @@ var gNamespaces = {
"http://www.w3.org/XML/1998/namespace": "xml", "http://www.w3.org/XML/1998/namespace": "xml",
"http://search.yahoo.com/mrss/": "media", "http://search.yahoo.com/mrss/": "media",
"http://search.yahoo.com/mrss": "media", "http://search.yahoo.com/mrss": "media",
"http://prismstandard.org/namespaces/1.2/basic/": "prism",
"http://prismstandard.org/namespaces/basic/2.0/": "prism",
"http://prismstandard.org/namespaces/basic/3.0/": "prism",
"https://prismdb.takanakahiko.me/prism-schema.ttl#": "prism",
}; };
// We allow a very small set of namespaces in XHTML content, // We allow a very small set of namespaces in XHTML content,
@ -147,7 +151,7 @@ Feed.TYPE_VIDEO = 4;
Feed.prototype = { Feed.prototype = {
searchLists: { searchLists: {
title: ["title", "rss1:title", "atom03:title", "atom:title"], title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
subtitle: [ subtitle: [
"description", "description",
"dc:description", "dc:description",
@ -155,6 +159,7 @@ Feed.prototype = {
"atom03:tagline", "atom03:tagline",
"atom:subtitle", "atom:subtitle",
], ],
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
items: ["items", "atom03_entries", "entries"], items: ["items", "atom03_entries", "entries"],
id: ["atom:id", "rdf:about"], id: ["atom:id", "rdf:about"],
generator: ["generator"], generator: ["generator"],
@ -162,12 +167,19 @@ Feed.prototype = {
contributors: ["contributors"], contributors: ["contributors"],
link: [["link", strToURI], ["rss1:link", strToURI]], link: [["link", strToURI], ["rss1:link", strToURI]],
categories: ["categories", "dc:subject"], categories: ["categories", "dc:subject"],
rights: ["atom03:rights", "atom:rights"], rights: [
"dc:rights",
"atom03:rights",
"atom:rights",
"copyright",
"prism:copyright",
],
cloud: ["cloud"], cloud: ["cloud"],
image: ["image", "rss1:image", "atom:logo"], image: ["image", "rss1:image", "atom:logo"],
textInput: ["textInput", "rss1:textinput"], textInput: ["textInput", "rss1:textinput"],
skipDays: ["skipDays"], skipDays: ["skipDays"],
skipHours: ["skipHours"], skipHours: ["skipHours"],
ttl: ["ttl"],
updated: [ updated: [
"pubDate", "pubDate",
"lastBuildDate", "lastBuildDate",
@ -176,6 +188,10 @@ Feed.prototype = {
"dcterms:modified", "dcterms:modified",
"atom:updated", "atom:updated",
], ],
issn: ["prism:issn"],
isbn: ["isbn", "prism:isbn"],
language: ["language", "dc:language"],
publisher: ["dc:publisher"],
}, },
normalize: function () { normalize: function () {
@ -187,10 +203,6 @@ Feed.prototype = {
this.skipHours = this.skipHours.hours; this.skipHours = this.skipHours.hours;
} }
if (this.updated) {
this.updated = dateParse(this.updated);
}
// Assign Atom link if needed // Assign Atom link if needed
if (this.fields.links) { if (this.fields.links) {
this._atomLinksToURI(); this._atomLinksToURI();
@ -341,7 +353,7 @@ Entry.prototype = {
mediaContent: null, mediaContent: null,
searchLists: { searchLists: {
title: ["title", "rss1:title", "atom03:title", "atom:title"], title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
link: [["link", strToURI], ["rss1:link", strToURI]], link: [["link", strToURI], ["rss1:link", strToURI]],
id: [ id: [
["guid", makePropGetter("guid")], ["guid", makePropGetter("guid")],
@ -359,15 +371,46 @@ Entry.prototype = {
"atom:summary", "atom:summary",
], ],
content: ["content:encoded", "atom03:content", "atom:content"], content: ["content:encoded", "atom03:content", "atom:content"],
rights: ["atom03:rights", "atom:rights"], rights: [
published: ["pubDate", "atom03:issued", "dcterms:issued", "atom:published"], "dc:rights",
"atom03:rights",
"atom:rights",
"copyright",
"prism:copyright",
],
published: [
"dc:date",
"pubDate",
"atom03:issued",
"dcterms:issued",
"atom:published",
"prism:publicationDate",
],
updated: [ updated: [
"pubDate", "pubDate",
"atom03:modified", "atom03:modified",
"dc:date", "dc:date",
"dcterms:modified", "dcterms:modified",
"atom:updated", "atom:updated",
"prism:modificationDate",
], ],
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
pubType: ["pubType"],
startPage: ["startPage", "prism:startingPage"],
endPage: ["endPage", "prism:endingPage"],
pageRange: ["prism:pageRange"],
issn: ["prism:issn"],
isbn: ["isbn", "prism:isbn"],
identifier: [
"dc:identifier",
"prism:doi",
],
publisher: ["dc:publisher"],
language: ["language", "dc:language"],
volume: ["prism:volume"],
issue: ["prism:number"],
section: ["prism:section"],
url: ["prism:url"],
}, },
normalize: function () { normalize: function () {
@ -395,13 +438,6 @@ Entry.prototype = {
} }
} }
if (this.updated) {
this.updated = dateParse(this.updated);
}
if (this.published) {
this.published = dateParse(this.published);
}
this._resetBagMembersToRawText([ this._resetBagMembersToRawText([
this.searchLists.content, this.searchLists.content,
this.searchLists.summary, this.searchLists.summary,
@ -742,24 +778,6 @@ function rssAuthor(s, author) {
return author; return author;
} }
/**
 * Normalizes a feed date string by round-tripping it through the built-in
 * Date parser.
 *
 * @param aDateString
 *        Text expected to hold an RFC822 or RFC3339 date.
 * @return The Date.toUTCString() rendering of the parsed date, or null when
 *         the Date constructor yields an invalid date for the input.
 */
function dateParse(aDateString) {
	// A trailing "Z" timezone marker is rewritten to "-00:00" before parsing:
	// without bug 682781 fixed, JS won't parse an RFC822 date ending in Z,
	// whereas the numeric offset works for any date format.
	const normalized = aDateString.trim().replace(/z$/i, "-00:00");
	const parsed = new Date(normalized);
	return isNaN(parsed) ? null : parsed.toUTCString();
}
const XHTML_NS = "http://www.w3.org/1999/xhtml"; const XHTML_NS = "http://www.w3.org/1999/xhtml";
// The XHTMLHandler handles inline XHTML found in things like atom:summary // The XHTMLHandler handles inline XHTML found in things like atom:summary
@ -988,7 +1006,8 @@ function FeedProcessor() {
"atom03:title": "text", "atom03:title": "text",
"atom03:tagline": "text", "atom03:tagline": "text",
"atom03:summary": "text", "atom03:summary": "text",
"atom03:content": "text" "atom03:content": "text",
"dc:title": "text",
}; };
this._stack = []; this._stack = [];

View file

@ -0,0 +1,60 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:cc="http://web.resource.org/cc/"
xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel rdf:about="https://www.mdpi.com/rss/journal/fluids">
<title>Fluids</title>
<description>Latest open access articles published in Fluids at https://www.mdpi.com/journal/fluids</description>
<link>https://www.mdpi.com/journal/fluids</link>
<admin:generatorAgent rdf:resource="https://www.mdpi.com/journal/fluids"/>
<admin:errorReportsTo rdf:resource="mailto:support@mdpi.com"/>
<dc:publisher>MDPI</dc:publisher>
<dc:language>en</dc:language>
<dc:rights>Creative Commons Attribution (CC-BY)</dc:rights>
<prism:copyright>MDPI</prism:copyright>
<prism:rightsAgent>support@mdpi.com</prism:rightsAgent>
<image rdf:resource="https://pub.mdpi-res.com/img/design/mdpi-pub-logo.png?13cf3b5bd783e021?1716292923"/>
<items>
<rdf:Seq>
<rdf:li rdf:resource="https://www.mdpi.com/2311-5521/9/6/120" />
</rdf:Seq>
</items>
<cc:license rdf:resource="http://creativecommons.org/licenses/by/3.0/" />
</channel>
<item rdf:about="https://www.mdpi.com/2311-5521/9/6/120">
<title>Fluids, Vol. 9, Pages 120: Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</title>
<link>https://www.mdpi.com/2311-5521/9/6/120</link>
<description>Within river systems, the process of bed-forming is intricate, dynamic and is shaped by different factors [...]</description>
<pubDate>2024-05-22</pubDate>
<content:encoded><![CDATA[
Abstract
]]></content:encoded>
<dc:title>Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</dc:title>
<dc:creator>Jaan H. Pu</dc:creator>
<dc:creator>Manish Pandey</dc:creator>
<dc:creator>Prashanth Reddy Hanmaiahgari</dc:creator>
<dc:identifier>doi: 10.3390/fluids9060120</dc:identifier>
<dc:source>Fluids</dc:source>
<dc:date>2024-05-22</dc:date>
<prism:publicationName>Fluids</prism:publicationName>
<prism:publicationDate>2024-05-22</prism:publicationDate>
<prism:volume>9</prism:volume>
<prism:number>6</prism:number>
<prism:section>Editorial</prism:section>
<prism:startingPage>120</prism:startingPage>
<prism:doi>10.3390/fluids9060120</prism:doi>
<prism:url>https://www.mdpi.com/2311-5521/9/6/120</prism:url>
<cc:license rdf:resource="CC BY 4.0"/>
</item>
</rdf:RDF>

View file

@ -32,6 +32,7 @@ describe("Zotero.FeedReader", function () {
var richTextRSSFeedURL = getTestDataUrl("feedRichText.rss"); var richTextRSSFeedURL = getTestDataUrl("feedRichText.rss");
var cdataRSSFeedURL = getTestDataUrl("feedCDATA.rss"); var cdataRSSFeedURL = getTestDataUrl("feedCDATA.rss");
var articleMetadataRSSFeedURL = getTestDataUrl("feedArticleMetadata.rss");
var atomFeedURL = getTestDataUrl("feed.atom"); var atomFeedURL = getTestDataUrl("feed.atom");
var mediaFeedURL = getTestDataUrl("feedMedia.xml"); var mediaFeedURL = getTestDataUrl("feedMedia.xml");
@ -157,6 +158,7 @@ describe("Zotero.FeedReader", function () {
publicationTitle: 'Publication', publicationTitle: 'Publication',
ISSN: '0000-0000', ISSN: '0000-0000',
publisher: 'Publisher', publisher: 'Publisher',
section: 'Article',
rights: '©2016 Published by Publisher', rights: '©2016 Published by Publisher',
language: 'en', language: 'en',
itemType: 'journalArticle', itemType: 'journalArticle',
@ -170,6 +172,38 @@ describe("Zotero.FeedReader", function () {
assert.deepEqual(item, expected); assert.deepEqual(item, expected);
}); });
// Checks the first item produced from the feedArticleMetadata.rss fixture,
// including the volume / issue / section / pages / DOI fields.
it('should parse items correctly for an RSS feed with journal article metadata', function* () {
	// Process the feed and pull the first item out of its iterator
	let reader = new Zotero.FeedReader(articleMetadataRSSFeedURL);
	yield reader.process();
	let items = new reader.ItemIterator();
	let firstItem = yield items.next().value;

	// Every creator in the fixture is expected to come back as an author
	let authors = [
		['Jaan H.', 'Pu'],
		['Manish', 'Pandey'],
		['Prashanth Reddy', 'Hanmaiahgari']
	].map(([firstName, lastName]) => ({ firstName, lastName, creatorType: 'author' }));

	assert.deepEqual(firstItem, {
		guid: 'https://www.mdpi.com/2311-5521/9/6/120',
		title: 'Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition',
		abstractNote: 'Abstract',
		url: 'https://www.mdpi.com/2311-5521/9/6/120',
		creators: authors,
		date: '2024-05-22',
		publicationTitle: 'Fluids',
		pages: '120',
		DOI: '10.3390/fluids9060120',
		volume: '9',
		issue: '6',
		section: 'Editorial',
		publisher: 'MDPI',
		rights: 'MDPI',
		language: 'en',
		itemType: 'journalArticle',
		enclosedItems: []
	});
});
it("should parse item from an Atom feed", function* () { it("should parse item from an Atom feed", function* () {
let expected = { let expected = {
guid: 'http://www.example.com/item1', guid: 'http://www.example.com/item1',