From 8b13f717b4e01dc87c4f7e7a6a139c3525ae7eec Mon Sep 17 00:00:00 2001 From: Abe Jellinek Date: Wed, 13 Mar 2024 12:20:59 -0400 Subject: [PATCH] SAXXMLReader: Handle non-UTF-8 encodings (#3846) --- resource/feeds/SAXXMLReader.js | 10 ++++++++- test/tests/data/feedWindows1252.rss | 15 +++++++++++++ test/tests/feedReaderTest.js | 35 +++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 test/tests/data/feedWindows1252.rss diff --git a/resource/feeds/SAXXMLReader.js b/resource/feeds/SAXXMLReader.js index b1cf4413eb..eaba91509a 100644 --- a/resource/feeds/SAXXMLReader.js +++ b/resource/feeds/SAXXMLReader.js @@ -62,7 +62,15 @@ class SAXXMLReader { if (!response.ok) { throw new Error("Unable to fetch data"); } - this._data = await response.text(); + let buf = await response.arrayBuffer(); + // We should use NetUtil.parseResponseContentType, but we don't have access to it here + let charset = response.headers.get("Content-Type") + ?.match(/charset=([^;]+)/) + ?.[1]; + if (!charset) { + charset = 'utf-8'; + } + this._data = new TextDecoder(charset).decode(buf); this._parseAndNotify(); } diff --git a/test/tests/data/feedWindows1252.rss b/test/tests/data/feedWindows1252.rss new file mode 100644 index 0000000000..f4281f19c3 --- /dev/null +++ b/test/tests/data/feedWindows1252.rss @@ -0,0 +1,15 @@ + + + + Stortinget: Helse- og omsorgsministeren + https://www.stortinget.no + https://www.stortinget.no + no-NO + + Skriftlig spørsmål fra Tage Pettersen (H) til helse- og omsorgsministeren. Til behandling + https://www.stortinget.no/no/Saker-og-publikasjoner/Sporsmal/Skriftlige-sporsmal-og-svar/Skriftlig-sporsmal/?qid=98117&utm_medium=rss&utm_source=www.stortinget.no&utm_campaign=Helse- og omsorgsministeren + Hva vil helseministeren foreta seg på kort sikt for å sikre rekruttering av helsepersonell og at stortingets vedtak om å tilby en tverrfaglig helsekartlegging av barn som flyttes ut av hjemmet etterleves? 
+ 2024-03-13 + + + \ No newline at end of file diff --git a/test/tests/feedReaderTest.js b/test/tests/feedReaderTest.js index 84cddf6d26..0256e17248 100644 --- a/test/tests/feedReaderTest.js +++ b/test/tests/feedReaderTest.js @@ -251,4 +251,39 @@ describe("Zotero.FeedReader", function () { assert.equal(item.enclosedItems[0].url, "https://static01.nyt.com/images/2021/06/16/world/16biden-photos1/16biden-photos1-moth.jpg"); }); }); + + describe("Legacy text encodings", function () { + var httpd; + var port = 16213; + var baseURL = `http://127.0.0.1:${port}/`; + + before(function () { + Cu.import("resource://zotero-unit/httpd.js"); + httpd = new HttpServer(); + httpd.start(port); + + httpd._handler._mimeMappings.rss = "text/xml; charset=ISO-8859-1"; + + httpd.registerPathHandler("/feedWindows1252.rss", { + handle(request, response) { + response.setStatusLine(null, 200, 'OK'); + let file = getTestDataDirectory(); + file.append("feedWindows1252.rss"); + httpd._handler._writeFileResponse(request, file, response, 0, file.fileSize); + } + }); + }); + + after(async function () { + await new Promise(resolve => httpd.stop(resolve)); + }); + + it("should handle an ISO-8859-1 (windows-1252) feed", async function () { + let fr = new Zotero.FeedReader(baseURL + "feedWindows1252.rss"); + await fr.process(); + let itemIterator = new fr.ItemIterator(); + let item = await itemIterator.next().value; + assert.equal(item.title, "Skriftlig spørsmål fra Tage Pettersen (H) til helse- og omsorgsministeren. Til behandling"); + }); + }); })