Feeds: Refactor to move field extraction to FeedProcessor (#4158)
And support more PRISM fields.
This commit is contained in:
parent
101e6d55d5
commit
59afef6c2e
4 changed files with 181 additions and 73 deletions
|
@ -75,31 +75,25 @@ Zotero.FeedReader = function (url) {
|
|||
|
||||
// TODO: image as icon
|
||||
|
||||
let publicationTitle = Zotero.FeedReader._getFeedField(feed, 'publicationName', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'pubTitle');
|
||||
let publicationTitle = feed.pubTitle;
|
||||
if (publicationTitle) info.publicationTitle = publicationTitle;
|
||||
|
||||
let publisher = Zotero.FeedReader._getFeedField(feed, 'publisher', 'dc');
|
||||
let publisher = feed.publisher;
|
||||
if (publisher) info.publisher = publisher;
|
||||
|
||||
let rights = (feed.rights && feed.rights.plainText())
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'copyright', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'rights', 'dc')
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'copyright');
|
||||
let rights = feed.rights;
|
||||
if (rights) info.rights = rights;
|
||||
|
||||
let issn = Zotero.FeedReader._getFeedField(feed, 'issn', 'prism');
|
||||
let issn = feed.issn;
|
||||
if (issn) info.ISSN = issn;
|
||||
|
||||
let isbn = Zotero.FeedReader._getFeedField(feed, 'isbn', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'isbn');
|
||||
let isbn = feed.isbn;
|
||||
if (isbn) info.ISBN = isbn;
|
||||
|
||||
let language = Zotero.FeedReader._getFeedField(feed, 'language', 'dc')
|
||||
|| Zotero.FeedReader._getFeedField(feed, 'language');
|
||||
let language = feed.language;
|
||||
if (language) info.language = language;
|
||||
|
||||
let ttl = Zotero.FeedReader._getFeedField(feed, 'ttl');
|
||||
let ttl = feed.ttl;
|
||||
if (ttl) info.ttl = ttl;
|
||||
|
||||
this._feedProperties = info;
|
||||
|
@ -435,7 +429,8 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
|
|||
if (splitAt <= title.length) item.title += '...';
|
||||
}
|
||||
|
||||
if (feedEntry.link) item.url = feedEntry.link.href;
|
||||
// Prefer the entry's <link>; otherwise fall back to the optional prism:url field.
// The fallback may be absent entirely, and it is not guaranteed to be a text
// construct exposing plainText(), so only call plainText() when it exists
// instead of throwing a TypeError for entries that have no link and no prism:url.
// NOTE(review): the stored shape of `feedEntry.url` is not visible here -- confirm
// whether FeedProcessor yields a plain string or a text construct for prism:url.
let fallbackURL = feedEntry.url;
let url = feedEntry.link?.href
	|| (typeof fallbackURL?.plainText === 'function' ? fallbackURL.plainText() : fallbackURL);
|
||||
if (url) item.url = url;
|
||||
|
||||
if (feedEntry.rights) item.rights = Zotero.FeedReader._getRichText(feedEntry.rights, 'rights');
|
||||
|
||||
|
@ -453,40 +448,34 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
|
|||
|
||||
/** Done with basic metadata, now look for better data **/
|
||||
|
||||
let date = Zotero.FeedReader._getFeedField(feedEntry, 'publicationDate', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'date', 'dc')
|
||||
// DEBUG: Why not get these from the feedEntry?
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubDate') // RSS
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'updated', 'atom') // Atom
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'published', 'atom'); // Atom
|
||||
|
||||
|
||||
let date = feedEntry.updated || feedEntry.published;
|
||||
if (date) item.date = date;
|
||||
|
||||
let publicationTitle = Zotero.FeedReader._getFeedField(feedEntry, 'publicationName', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'source', 'dc')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'pubTitle');
|
||||
let publicationTitle = feedEntry.pubTitle;
|
||||
if (publicationTitle) item.publicationTitle = publicationTitle;
|
||||
|
||||
let publicationType = Zotero.FeedReader._getFeedField(feedEntry, 'pubType');
|
||||
let publicationType = feedEntry.pubType;
|
||||
if (publicationType) item.publicationType = publicationType;
|
||||
|
||||
let startPage = Zotero.FeedReader._getFeedField(feedEntry, 'startPage');
|
||||
let endPage = Zotero.FeedReader._getFeedField(feedEntry, 'endPage');
|
||||
let startPage = feedEntry.startPage;
|
||||
let endPage = feedEntry.endPage;
|
||||
if (startPage || endPage) {
|
||||
item.pages = (startPage || '')
|
||||
+ (endPage && startPage ? '–' : '')
|
||||
+ (endPage || '');
|
||||
}
|
||||
else {
|
||||
let pageRange = feedEntry.pageRange;
|
||||
if (pageRange) item.pages = pageRange;
|
||||
}
|
||||
|
||||
let issn = Zotero.FeedReader._getFeedField(feedEntry, 'issn', 'prism');
|
||||
let issn = feedEntry.issn;
|
||||
if (issn) item.ISSN = issn;
|
||||
|
||||
let isbn = Zotero.FeedReader._getFeedField(feedEntry, 'isbn', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'isbn');
|
||||
let isbn = feedEntry.isbn;
|
||||
if (isbn) item.ISBN = isbn;
|
||||
|
||||
let identifier = Zotero.FeedReader._getFeedField(feedEntry, 'identifier', 'dc');
|
||||
let identifier = feedEntry.identifier;
|
||||
if (identifier) {
|
||||
for (let type of ['DOI', 'ISBN', 'ISSN']) {
|
||||
let cleanId = Zotero.Utilities[`clean${type}`](identifier);
|
||||
|
@ -497,18 +486,24 @@ Zotero.FeedReader._getFeedItem = function (feedEntry, feedInfo) {
|
|||
}
|
||||
}
|
||||
|
||||
let publisher = Zotero.FeedReader._getFeedField(feedEntry, 'publisher', 'dc');
|
||||
let publisher = feedEntry.publisher;
|
||||
if (publisher) item.publisher = publisher;
|
||||
|
||||
let rights = Zotero.FeedReader._getFeedField(feedEntry, 'copyright', 'prism')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'rights', 'dc')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'copyright');
|
||||
let rights = feedEntry.rights;
|
||||
if (rights) item.rights = rights;
|
||||
|
||||
let language = Zotero.FeedReader._getFeedField(feedEntry, 'language', 'dc')
|
||||
|| Zotero.FeedReader._getFeedField(feedEntry, 'language');
|
||||
let language = feedEntry.language;
|
||||
if (language) item.language = language;
|
||||
|
||||
let volume = feedEntry.volume;
|
||||
if (volume) item.volume = volume;
|
||||
|
||||
let issue = feedEntry.issue;
|
||||
if (issue) item.issue = issue;
|
||||
|
||||
let section = feedEntry.section;
|
||||
if (section) item.section = section;
|
||||
|
||||
/** Incorporate missing values from feed metadata **/
|
||||
|
||||
let supplementFields = ['publicationTitle', 'ISSN', 'publisher', 'rights', 'language'];
|
||||
|
|
|
@ -103,6 +103,10 @@ var gNamespaces = {
|
|||
"http://www.w3.org/XML/1998/namespace": "xml",
|
||||
"http://search.yahoo.com/mrss/": "media",
|
||||
"http://search.yahoo.com/mrss": "media",
|
||||
"http://prismstandard.org/namespaces/1.2/basic/": "prism",
|
||||
"http://prismstandard.org/namespaces/basic/2.0/": "prism",
|
||||
"http://prismstandard.org/namespaces/basic/3.0/": "prism",
|
||||
"https://prismdb.takanakahiko.me/prism-schema.ttl#": "prism",
|
||||
};
|
||||
|
||||
// We allow a very small set of namespaces in XHTML content,
|
||||
|
@ -147,7 +151,7 @@ Feed.TYPE_VIDEO = 4;
|
|||
|
||||
Feed.prototype = {
|
||||
searchLists: {
|
||||
title: ["title", "rss1:title", "atom03:title", "atom:title"],
|
||||
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
|
||||
subtitle: [
|
||||
"description",
|
||||
"dc:description",
|
||||
|
@ -155,6 +159,7 @@ Feed.prototype = {
|
|||
"atom03:tagline",
|
||||
"atom:subtitle",
|
||||
],
|
||||
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
|
||||
items: ["items", "atom03_entries", "entries"],
|
||||
id: ["atom:id", "rdf:about"],
|
||||
generator: ["generator"],
|
||||
|
@ -162,12 +167,19 @@ Feed.prototype = {
|
|||
contributors: ["contributors"],
|
||||
link: [["link", strToURI], ["rss1:link", strToURI]],
|
||||
categories: ["categories", "dc:subject"],
|
||||
rights: ["atom03:rights", "atom:rights"],
|
||||
rights: [
|
||||
"dc:rights",
|
||||
"atom03:rights",
|
||||
"atom:rights",
|
||||
"copyright",
|
||||
"prism:copyright",
|
||||
],
|
||||
cloud: ["cloud"],
|
||||
image: ["image", "rss1:image", "atom:logo"],
|
||||
textInput: ["textInput", "rss1:textinput"],
|
||||
skipDays: ["skipDays"],
|
||||
skipHours: ["skipHours"],
|
||||
ttl: ["ttl"],
|
||||
updated: [
|
||||
"pubDate",
|
||||
"lastBuildDate",
|
||||
|
@ -176,6 +188,10 @@ Feed.prototype = {
|
|||
"dcterms:modified",
|
||||
"atom:updated",
|
||||
],
|
||||
issn: ["prism:issn"],
|
||||
isbn: ["isbn", "prism:isbn"],
|
||||
language: ["language", "dc:language"],
|
||||
publisher: ["dc:publisher"],
|
||||
},
|
||||
|
||||
normalize: function () {
|
||||
|
@ -187,10 +203,6 @@ Feed.prototype = {
|
|||
this.skipHours = this.skipHours.hours;
|
||||
}
|
||||
|
||||
if (this.updated) {
|
||||
this.updated = dateParse(this.updated);
|
||||
}
|
||||
|
||||
// Assign Atom link if needed
|
||||
if (this.fields.links) {
|
||||
this._atomLinksToURI();
|
||||
|
@ -341,7 +353,7 @@ Entry.prototype = {
|
|||
mediaContent: null,
|
||||
|
||||
searchLists: {
|
||||
title: ["title", "rss1:title", "atom03:title", "atom:title"],
|
||||
title: ["title", "rss1:title", "atom03:title", "atom:title", "dc:title"],
|
||||
link: [["link", strToURI], ["rss1:link", strToURI]],
|
||||
id: [
|
||||
["guid", makePropGetter("guid")],
|
||||
|
@ -359,15 +371,46 @@ Entry.prototype = {
|
|||
"atom:summary",
|
||||
],
|
||||
content: ["content:encoded", "atom03:content", "atom:content"],
|
||||
rights: ["atom03:rights", "atom:rights"],
|
||||
published: ["pubDate", "atom03:issued", "dcterms:issued", "atom:published"],
|
||||
rights: [
|
||||
"dc:rights",
|
||||
"atom03:rights",
|
||||
"atom:rights",
|
||||
"copyright",
|
||||
"prism:copyright",
|
||||
],
|
||||
published: [
|
||||
"dc:date",
|
||||
"pubDate",
|
||||
"atom03:issued",
|
||||
"dcterms:issued",
|
||||
"atom:published",
|
||||
"prism:publicationDate",
|
||||
],
|
||||
updated: [
|
||||
"pubDate",
|
||||
"atom03:modified",
|
||||
"dc:date",
|
||||
"dcterms:modified",
|
||||
"atom:updated",
|
||||
"prism:modificationDate",
|
||||
],
|
||||
pubTitle: ["pubTitle", "dc:source", "prism:publicationName"],
|
||||
pubType: ["pubType"],
|
||||
startPage: ["startPage", "prism:startingPage"],
|
||||
endPage: ["endPage", "prism:endingPage"],
|
||||
pageRange: ["prism:pageRange"],
|
||||
issn: ["prism:issn"],
|
||||
isbn: ["isbn", "prism:isbn"],
|
||||
identifier: [
|
||||
"dc:identifier",
|
||||
"prism:doi",
|
||||
],
|
||||
publisher: ["dc:publisher"],
|
||||
language: ["language", "dc:language"],
|
||||
volume: ["prism:volume"],
|
||||
issue: ["prism:number"],
|
||||
section: ["prism:section"],
|
||||
url: ["prism:url"],
|
||||
},
|
||||
|
||||
normalize: function () {
|
||||
|
@ -395,13 +438,6 @@ Entry.prototype = {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.updated) {
|
||||
this.updated = dateParse(this.updated);
|
||||
}
|
||||
if (this.published) {
|
||||
this.published = dateParse(this.published);
|
||||
}
|
||||
|
||||
this._resetBagMembersToRawText([
|
||||
this.searchLists.content,
|
||||
this.searchLists.summary,
|
||||
|
@ -742,24 +778,6 @@ function rssAuthor(s, author) {
|
|||
return author;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries parsing a string through the JavaScript Date object.
|
||||
* @param aDateString
|
||||
* A string that is supposedly an RFC822 or RFC3339 date.
|
||||
* @return A Date.toUTCString, or null if the string can't be parsed.
|
||||
*/
|
||||
function dateParse(aDateString) {
|
||||
let dateString = aDateString.trim();
|
||||
// Without bug 682781 fixed, JS won't parse an RFC822 date with a Z for the
|
||||
// timezone, so convert to -00:00 which works for any date format.
|
||||
dateString = dateString.replace(/z$/i, "-00:00");
|
||||
let date = new Date(dateString);
|
||||
if (!isNaN(date)) {
|
||||
return date.toUTCString();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
const XHTML_NS = "http://www.w3.org/1999/xhtml";
|
||||
|
||||
// The XHTMLHandler handles inline XHTML found in things like atom:summary
|
||||
|
@ -988,7 +1006,8 @@ function FeedProcessor() {
|
|||
"atom03:title": "text",
|
||||
"atom03:tagline": "text",
|
||||
"atom03:summary": "text",
|
||||
"atom03:content": "text"
|
||||
"atom03:content": "text",
|
||||
"dc:title": "text",
|
||||
};
|
||||
this._stack = [];
|
||||
|
||||
|
|
60
test/tests/data/feedArticleMetadata.rss
Normal file
60
test/tests/data/feedArticleMetadata.rss
Normal file
|
@ -0,0 +1,60 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dcterms="http://purl.org/dc/terms/"
|
||||
xmlns:cc="http://web.resource.org/cc/"
|
||||
xmlns:prism="http://prismstandard.org/namespaces/basic/2.0/"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:admin="http://webns.net/mvcb/"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/">
|
||||
<channel rdf:about="https://www.mdpi.com/rss/journal/fluids">
|
||||
<title>Fluids</title>
|
||||
<description>Latest open access articles published in Fluids at https://www.mdpi.com/journal/fluids</description>
|
||||
<link>https://www.mdpi.com/journal/fluids</link>
|
||||
<admin:generatorAgent rdf:resource="https://www.mdpi.com/journal/fluids"/>
|
||||
<admin:errorReportsTo rdf:resource="mailto:support@mdpi.com"/>
|
||||
<dc:publisher>MDPI</dc:publisher>
|
||||
<dc:language>en</dc:language>
|
||||
<dc:rights>Creative Commons Attribution (CC-BY)</dc:rights>
|
||||
<prism:copyright>MDPI</prism:copyright>
|
||||
<prism:rightsAgent>support@mdpi.com</prism:rightsAgent>
|
||||
<image rdf:resource="https://pub.mdpi-res.com/img/design/mdpi-pub-logo.png?13cf3b5bd783e021?1716292923"/>
|
||||
<items>
|
||||
<rdf:Seq>
|
||||
<rdf:li rdf:resource="https://www.mdpi.com/2311-5521/9/6/120" />
|
||||
</rdf:Seq>
|
||||
</items>
|
||||
<cc:license rdf:resource="http://creativecommons.org/licenses/by/3.0/" />
|
||||
</channel>
|
||||
|
||||
<item rdf:about="https://www.mdpi.com/2311-5521/9/6/120">
|
||||
|
||||
<title>Fluids, Vol. 9, Pages 120: Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</title>
|
||||
<link>https://www.mdpi.com/2311-5521/9/6/120</link>
|
||||
<description>Within river systems, the process of bed-forming is intricate, dynamic and is shaped by different factors [...]</description>
|
||||
<pubDate>2024-05-22</pubDate>
|
||||
|
||||
<content:encoded><![CDATA[
|
||||
Abstract
|
||||
]]></content:encoded>
|
||||
|
||||
<dc:title>Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition</dc:title>
|
||||
<dc:creator>Jaan H. Pu</dc:creator>
|
||||
<dc:creator>Manish Pandey</dc:creator>
|
||||
<dc:creator>Prashanth Reddy Hanmaiahgari</dc:creator>
|
||||
<dc:identifier>doi: 10.3390/fluids9060120</dc:identifier>
|
||||
<dc:source>Fluids</dc:source>
|
||||
<dc:date>2024-05-22</dc:date>
|
||||
|
||||
<prism:publicationName>Fluids</prism:publicationName>
|
||||
<prism:publicationDate>2024-05-22</prism:publicationDate>
|
||||
<prism:volume>9</prism:volume>
|
||||
<prism:number>6</prism:number>
|
||||
<prism:section>Editorial</prism:section>
|
||||
<prism:startingPage>120</prism:startingPage>
|
||||
<prism:doi>10.3390/fluids9060120</prism:doi>
|
||||
<prism:url>https://www.mdpi.com/2311-5521/9/6/120</prism:url>
|
||||
|
||||
<cc:license rdf:resource="CC BY 4.0"/>
|
||||
</item>
|
||||
</rdf:RDF>
|
|
@ -32,6 +32,7 @@ describe("Zotero.FeedReader", function () {
|
|||
|
||||
var richTextRSSFeedURL = getTestDataUrl("feedRichText.rss");
|
||||
var cdataRSSFeedURL = getTestDataUrl("feedCDATA.rss");
|
||||
var articleMetadataRSSFeedURL = getTestDataUrl("feedArticleMetadata.rss");
|
||||
var atomFeedURL = getTestDataUrl("feed.atom");
|
||||
var mediaFeedURL = getTestDataUrl("feedMedia.xml");
|
||||
|
||||
|
@ -157,6 +158,7 @@ describe("Zotero.FeedReader", function () {
|
|||
publicationTitle: 'Publication',
|
||||
ISSN: '0000-0000',
|
||||
publisher: 'Publisher',
|
||||
section: 'Article',
|
||||
rights: '©2016 Published by Publisher',
|
||||
language: 'en',
|
||||
itemType: 'journalArticle',
|
||||
|
@ -170,6 +172,38 @@ describe("Zotero.FeedReader", function () {
|
|||
assert.deepEqual(item, expected);
|
||||
});
|
||||
|
||||
it('should parse items correctly for an RSS feed with journal article metadata', function* () {
|
||||
let expected = {
|
||||
guid: 'https://www.mdpi.com/2311-5521/9/6/120',
|
||||
title: 'Environmental Hydraulics, Turbulence, and Sediment Transport, Second Edition',
|
||||
abstractNote: 'Abstract',
|
||||
url: 'https://www.mdpi.com/2311-5521/9/6/120',
|
||||
creators: [
|
||||
{ firstName: 'Jaan H.', lastName: 'Pu', creatorType: 'author' },
|
||||
{ firstName: 'Manish', lastName: 'Pandey', creatorType: 'author' },
|
||||
{ firstName: 'Prashanth Reddy', lastName: 'Hanmaiahgari', creatorType: 'author' }
|
||||
],
|
||||
date: '2024-05-22',
|
||||
publicationTitle: 'Fluids',
|
||||
pages: '120',
|
||||
DOI: '10.3390/fluids9060120',
|
||||
volume: '9',
|
||||
issue: '6',
|
||||
section: 'Editorial',
|
||||
publisher: 'MDPI',
|
||||
rights: 'MDPI',
|
||||
language: 'en',
|
||||
itemType: 'journalArticle',
|
||||
enclosedItems: []
|
||||
};
|
||||
|
||||
let fr = new Zotero.FeedReader(articleMetadataRSSFeedURL);
|
||||
yield fr.process();
|
||||
let itemIterator = new fr.ItemIterator();
|
||||
let item = yield itemIterator.next().value;
|
||||
assert.deepEqual(item, expected);
|
||||
});
|
||||
|
||||
it("should parse item from an Atom feed", function* () {
|
||||
let expected = {
|
||||
guid: 'http://www.example.com/item1',
|
||||
|
|
Loading…
Reference in a new issue