Feeds: Refactor to move field extraction to FeedProcessor (#4158)

And support more PRISM fields.
This commit is contained in:
Abe Jellinek 2024-05-22 18:29:58 -04:00 committed by GitHub
parent 101e6d55d5
commit 59afef6c2e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 181 additions and 73 deletions

View file

@ -75,31 +75,25 @@ Zotero.FeedReader = function (url) {
// TODO: image as icon
let publicationTitle = Zotero.FeedReader._getFeedField(feed, 'publicationName', 'prism')
|| Zotero.FeedReader._getFeedField(feed, 'pubTitle');
let publicationTitle = feed.pubTitle;
if (publicationTitle) info.publicationTitle = publicationTitle;
let publisher = Zotero.FeedReader._getFeedField(feed, 'publisher', 'dc');
let publisher = feed.publisher;
if (publisher) info.publisher = publisher;
let rights = (feed.rights && feed.rights.plainText())
|| Zotero.FeedReader._getFeedField(feed, 'copyright', 'prism')
|| Zotero.FeedReader._getFeedField(feed, 'rights', 'dc')
|| Zotero.FeedReader._getFeedField(feed, 'copyright');
let rights = feed.rights;
if (rights) info.rights = rights;
let issn = Zotero.FeedReader._getFeedField(feed, 'issn', 'prism');
let issn = feed.issn;
if (issn) info.ISSN = issn;
let isbn = Zotero.FeedReader._getFeedField(feed, 'isbn', 'prism')
|| Zotero.FeedReader._getFeedField(feed, 'isbn');
let isbn = feed.isbn;
if (isbn) info.ISBN = isbn;
let language = Zotero.FeedReader._getFeedField(feed, 'language', 'dc')
|| Zotero.FeedReader._getFeedField(feed, 'language');
let language = feed.language;
if (language) info.language = language;
let ttl = Zotero.FeedReader._getFeedField(feed, 'ttl');
let ttl = feed.ttl;
if (ttl) info.ttl = ttl;
this._feedProperties = info;
@ -435,7 +429,8 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
if (splitAt <= title.length) item.title += '...';
}
if (feedEntry.link) item.url = feedEntry.link.href;
let url = feedEntry.link?.href || feedEntry.url.plainText();
if (url) item.url = url;
if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights');
@ -453,40 +448,34 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
/** Done with basic metadata, now look for better data **/
let date = Zotero.FeedReader._getFeedField(feedEntry, 'publicationDate', 'prism')
|| Zotero.FeedReader._getFeedField(feedEntry, 'date', 'dc')
// DEBUG: Why not get these from the feedEntry?
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubDate') // RSS
|| Zotero.FeedReader._getFeedField(feedEntry, 'updated', 'atom') // Atom
|| Zotero.FeedReader._getFeedField(feedEntry, 'published', 'atom'); // Atom
let date = feedEntry.updated || feedEntry.published;
if (date) item.date = date;
let publicationTitle = Zotero.FeedReader._getFeedField(feedEntry, 'publicationName', 'prism')
|| Zotero.FeedReader._getFeedField(feedEntry, 'source', 'dc')
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubTitle');
let publicationTitle = feedEntry.pubTitle;
if (publicationTitle) item.publicationTitle = publicationTitle;
let publicationType = Zotero.FeedReader._getFeedField(feedEntry, 'pubType');
let publicationType = feedEntry.pubType;
if (publicationType) item.publicationType = publicationType;
let startPage = Zotero.FeedReader._getFeedField(feedEntry, 'startPage');
let endPage = Zotero.FeedReader._getFeedField(feedEntry, 'endPage');
let startPage = feedEntry.startPage;
let endPage = feedEntry.endPage;
if (startPage || endPage) {
item.pages = (startPage || '')
+ (endPage && startPage ? '' : '')
+ (endPage || '');
}
else {
let pageRange = feedEntry.pageRange;
if (pageRange) item.pages = pageRange;
}
let issn = Zotero.FeedReader._getFeedField(feedEntry, 'issn', 'prism');
let issn = feedEntry.issn;
if (issn) item.ISSN = issn;
let isbn = Zotero.FeedReader._getFeedField(feedEntry, 'isbn', 'prism')
|| Zotero.FeedReader._getFeedField(feedEntry, 'isbn');
let isbn = feedEntry.isbn;
if (isbn) item.ISBN = isbn;
let identifier = Zotero.FeedReader._getFeedField(feedEntry, 'identifier', 'dc');
let identifier = feedEntry.identifier;
if (identifier) {
for (let type of ['DOI', 'ISBN', 'ISSN']) {
let cleanId = Zotero.Utilities[`clean${type}`](identifier);
@ -497,18 +486,24 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
}
}
let publisher = Zotero.FeedReader._getFeedField(feedEntry, 'publisher', 'dc');
let publisher = feedEntry.publisher;
if (publisher) item.publisher = publisher;
let rights = Zotero.FeedReader._getFeedField(feedEntry, 'copyright', 'prism')
|| Zotero.FeedReader._getFeedField(feedEntry, 'rights', 'dc')
|| Zotero.FeedReader._getFeedField(feedEntry, 'copyright');
let rights = feedEntry.rights;
if (rights) item.rights = rights;
let language = Zotero.FeedReader._getFeedField(feedEntry, 'language', 'dc')
|| Zotero.FeedReader._getFeedField(feedEntry, 'language');
let language = feedEntry.language;
if (language) item.language = language;
let volume = feedEntry.volume;
if (volume) item.volume = volume;
let issue = feedEntry.issue;
if (issue) item.issue = issue;
let section = feedEntry.section;
if (section) item.section = section;
/** Incorporate missing values from feed metadata **/
let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];

View file

@ -103,6 +103,10 @@ var gNamespaces = {
"http://www.w3.org/XML/1998/namespace": "xml",
"http://search.yahoo.com/mrss/": "media",
"http://search.yahoo.com/mrss": "media",
"http://prismstandard.org/namespaces/1.2/basic/": "prism",
"http://prismstandard.org/namespaces/basic/2.0/": "prism",
"http://prismstandard.org/namespaces/basic/3.0/": "prism",
"https://prismdb.takanakahiko.me/prism-schema.ttl#": "prism",
};
// We allow a very small set of namespaces in XHTML content,
@ -147,7 +151,7 @@ Feed.TYPE_VIDEO = 4;
Feed.prototype = {
searchLists: {
title: ["title", "rss1:title", "atom03:title", "atom:title"],
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
subtitle: [
"description",
"dc:description",
@ -155,6 +159,7 @@ Feed.prototype = {
"atom03:tagline",
"atom:subtitle",
],
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
items: ["items", "atom03_entries", "entries"],
id: ["atom:id", "rdf:about"],
generator: ["generator"],
@ -162,12 +167,19 @@ Feed.prototype = {
contributors: ["contributors"],
link: [["link", strToURI], ["rss1:link", strToURI]],
categories: ["categories", "dc:subject"],
rights: ["atom03:rights", "atom:rights"],
rights: [
"dc:rights",
"atom03:rights",
"atom:rights",
"copyright",
"prism:copyright",
],
cloud: ["cloud"],
image: ["image", "rss1:image", "atom:logo"],
textInput: ["textInput", "rss1:textinput"],
skipDays: ["skipDays"],
skipHours: ["skipHours"],
ttl: ["ttl"],
updated: [
"pubDate",
"lastBuildDate",
@ -176,6 +188,10 @@ Feed.prototype = {
"dcterms:modified",
"atom:updated",
],
issn: ["prism:issn"],
isbn: ["isbn", "prism:isbn"],
language: ["language", "dc:language"],
publisher: ["dc:publisher"],
},
normalize: function () {
@ -187,10 +203,6 @@ Feed.prototype = {
this.skipHours = this.skipHours.hours;
}
if (this.updated) {
this.updated = dateParse(this.updated);
}
// Assign Atom link if needed
if (this.fields.links) {
this._atomLinksToURI();
@ -341,7 +353,7 @@ Entry.prototype = {
mediaContent: null,
searchLists: {
title: ["title", "rss1:title", "atom03:title", "atom:title"],
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
link: [["link", strToURI], ["rss1:link", strToURI]],
id: [
["guid", makePropGetter("guid")],
@ -359,15 +371,46 @@ Entry.prototype = {
"atom:summary",
],
content: ["content:encoded", "atom03:content", "atom:content"],
rights: ["atom03:rights", "atom:rights"],
published: ["pubDate", "atom03:issued", "dcterms:issued", "atom:published"],
rights: [
"dc:rights",
"atom03:rights",
"atom:rights",
"copyright",
"prism:copyright",
],
published: [
"dc:date",
"pubDate",
"atom03:issued",
"dcterms:issued",
"atom:published",
"prism:publicationDate",
],
updated: [
"pubDate",
"atom03:modified",
"dc:date",
"dcterms:modified",
"atom:updated",
"prism:modificationDate",
],
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
pubType: ["pubType"],
startPage: ["startPage", "prism:startingPage"],
endPage: ["endPage", "prism:endingPage"],
pageRange: ["prism:pageRange"],
issn: ["prism:issn"],
isbn: ["isbn", "prism:isbn"],
identifier: [
"dc:identifier",
"prism:doi",
],
publisher: ["dc:publisher"],
language: ["language", "dc:language"],
volume: ["prism:volume"],
issue: ["prism:number"],
section: ["prism:section"],
url: ["prism:url"],
},
normalize: function () {
@ -395,13 +438,6 @@ Entry.prototype = {
}
}
if (this.updated) {
this.updated = dateParse(this.updated);
}
if (this.published) {
this.published = dateParse(this.published);
}
this._resetBagMembersToRawText([
this.searchLists.content,
this.searchLists.summary,
@ -742,24 +778,6 @@ function rssAuthor(s, author) {
return author;
}
/**
 * Attempts to interpret a date string via the built-in Date constructor.
 *
 * @param aDateString
 *        Text expected to hold an RFC822 or RFC3339 date.
 * @return The date rendered with Date.prototype.toUTCString(), or null when
 *         the Date constructor cannot make sense of the string.
 */
function dateParse(aDateString) {
  // A trailing "Z" timezone is rewritten as "-00:00" before parsing: per the
  // bug 682781 workaround, JS would not otherwise parse an RFC822 date that
  // ends in Z, while the numeric offset works for any date format.
  const normalized = aDateString.trim().replace(/z$/i, "-00:00");
  const parsed = new Date(normalized);
  // An unparseable string yields an Invalid Date, which coerces to NaN.
  return isNaN(parsed) ? null : parsed.toUTCString();
}
const XHTML_NS = "http://www.w3.org/1999/xhtml";
// The XHTMLHandler handles inline XHTML found in things like atom:summary
@ -988,7 +1006,8 @@ function FeedProcessor() {
"atom03:title": "text",
"atom03:tagline": "text",
"atom03:summary": "text",
"atom03:content": "text"
"atom03:content": "text",
"dc:title": "text",
};
this._stack = [];

View file

@ -0,0 +1,60 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Test fixture: an RSS 1.0 (RDF) feed with one channel and one item. Both carry
Dublin Core (dc:) and PRISM 2.0 (prism:) metadata; the item additionally has
creators, a DOI, volume, number, section, starting page and URL.
NOTE(review): the values here match the expectations of the FeedReader test
"should parse items correctly for an RSS feed with journal article metadata",
so this is presumably feedArticleMetadata.rss; confirm the filename.
-->
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:cc="http://web.resource.org/cc/"
xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel rdf:about="https://www.mdpi.com/rss/journal/fluids">
<title>Fluids</title>
<description>Latest open access articles published in Fluids at https://www.mdpi.com/journal/fluids</description>
<link>https://www.mdpi.com/journal/fluids</link>
<admin:generatorAgent rdf:resource="https://www.mdpi.com/journal/fluids"/>
<admin:errorReportsTo rdf:resource="mailto:support@mdpi.com"/>
<dc:publisher>MDPI</dc:publisher>
<dc:language>en</dc:language>
<dc:rights>Creative Commons Attribution (CC-BY)</dc:rights>
<prism:copyright>MDPI</prism:copyright>
<prism:rightsAgent>support@mdpi.com</prism:rightsAgent>
<image rdf:resource="https://pub.mdpi-res.com/img/design/mdpi-pub-logo.png?13cf3b5bd783e021?1716292923"/>
<items>
<rdf:Seq>
<rdf:li rdf:resource="https://www.mdpi.com/2311-5521/9/6/120" />
</rdf:Seq>
</items>
<cc:license rdf:resource="http://creativecommons.org/licenses/by/3.0/" />
</channel>
<item rdf:about="https://www.mdpi.com/2311-5521/9/6/120">
<title>Fluids, Vol. 9, Pages 120: Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</title>
<link>https://www.mdpi.com/2311-5521/9/6/120</link>
<description>Within river systems, the process of bed-forming is intricate, dynamic and is shaped by different factors [...]</description>
<pubDate>2024-05-22</pubDate>
<content:encoded><![CDATA[
Abstract
]]></content:encoded>
<dc:title>Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</dc:title>
<dc:creator>Jaan H. Pu</dc:creator>
<dc:creator>Manish Pandey</dc:creator>
<dc:creator>Prashanth Reddy Hanmaiahgari</dc:creator>
<dc:identifier>doi: 10.3390/fluids9060120</dc:identifier>
<dc:source>Fluids</dc:source>
<dc:date>2024-05-22</dc:date>
<prism:publicationName>Fluids</prism:publicationName>
<prism:publicationDate>2024-05-22</prism:publicationDate>
<prism:volume>9</prism:volume>
<prism:number>6</prism:number>
<prism:section>Editorial</prism:section>
<prism:startingPage>120</prism:startingPage>
<prism:doi>10.3390/fluids9060120</prism:doi>
<prism:url>https://www.mdpi.com/2311-5521/9/6/120</prism:url>
<cc:license rdf:resource="CC BY 4.0"/>
</item>
</rdf:RDF>

View file

@ -32,6 +32,7 @@ describe("Zotero.FeedReader", function () {
var richTextRSSFeedURL = getTestDataUrl("feedRichText.rss");
var cdataRSSFeedURL = getTestDataUrl("feedCDATA.rss");
var articleMetadataRSSFeedURL = getTestDataUrl("feedArticleMetadata.rss");
var atomFeedURL = getTestDataUrl("feed.atom");
var mediaFeedURL = getTestDataUrl("feedMedia.xml");
@ -157,6 +158,7 @@ describe("Zotero.FeedReader", function () {
publicationTitle: 'Publication',
ISSN: '0000-0000',
publisher: 'Publisher',
section: 'Article',
rights: '©2016 Published by Publisher',
language: 'en',
itemType: 'journalArticle',
@ -170,6 +172,38 @@ describe("Zotero.FeedReader", function () {
assert.deepEqual(item, expected);
});
it('should parse items correctly for an RSS feed with journal article metadata', function* () {
	// Load the article-metadata fixture and take the first item from the iterator.
	let reader = new Zotero.FeedReader(articleMetadataRSSFeedURL);
	yield reader.process();
	let items = new reader.ItemIterator();
	let parsedItem = yield items.next().value;

	// The parsed item must match this object exactly (deep equality).
	let authors = [
		{ firstName: 'Jaan H.', lastName: 'Pu', creatorType: 'author' },
		{ firstName: 'Manish', lastName: 'Pandey', creatorType: 'author' },
		{ firstName: 'Prashanth Reddy', lastName: 'Hanmaiahgari', creatorType: 'author' }
	];
	assert.deepEqual(parsedItem, {
		guid: 'https://www.mdpi.com/2311-5521/9/6/120',
		title: 'Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition',
		abstractNote: 'Abstract',
		url: 'https://www.mdpi.com/2311-5521/9/6/120',
		creators: authors,
		date: '2024-05-22',
		publicationTitle: 'Fluids',
		pages: '120',
		DOI: '10.3390/fluids9060120',
		volume: '9',
		issue: '6',
		section: 'Editorial',
		publisher: 'MDPI',
		rights: 'MDPI',
		language: 'en',
		itemType: 'journalArticle',
		enclosedItems: []
	});
});
it("should parse item from an Atom feed", function* () {
let expected = {
guid: 'http://www.example.com/item1',