Fix indexing files with text content types that Firefox won't display (#3708)

This commit is contained in:
Abe Jellinek 2024-02-19 05:11:16 -05:00 committed by GitHub
parent 24cb38cfc8
commit 1f599283df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 45 additions and 6 deletions

View file

@ -130,7 +130,7 @@ class HiddenBrowser {
async load(source, options) {
await this._createdPromise;
let url;
if (/^(file|https?|chrome|resource):/.test(source)) {
if (/^(file|https?|chrome|resource|blob):/.test(source)) {
url = source;
}
// Convert string path to file: URL

View file

@ -521,7 +521,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
}
var contentType = item.attachmentContentType;
var charset = item.attachmentCharacterSet;
var charset = item.attachmentCharset;
if (!contentType) {
Zotero.debug("No content type in indexItem()", 2);
@ -557,7 +557,13 @@ Zotero.Fulltext = Zotero.FullText = new function(){
}
// Otherwise load it in a hidden browser
else {
let pageData = await getPageData(path);
// If the file's content type can't be displayed in a browser, treat it as text/plain
if (!Cc["@mozilla.org/webnavigation-info;1"].getService(Ci.nsIWebNavigationInfo)
.isTypeSupported(contentType)) {
contentType = 'text/plain';
}
let pageData = await getPageData(path, contentType);
text = pageData.bodyText;
if (!charset) {
charset = pageData.characterSet;
@ -1601,17 +1607,24 @@ Zotero.Fulltext = Zotero.FullText = new function(){
});
async function getPageData(path) {
async function getPageData(path, contentType) {
const { HiddenBrowser } = ChromeUtils.import("chrome://zotero/content/HiddenBrowser.jsm");
var blobURL;
var browser;
var pageData;
try {
let url = Zotero.File.pathToFileURI(path);
// Wrap the file in a blob to set its content type
let arrayBuffer = await (await fetch(Zotero.File.pathToFileURI(path))).arrayBuffer();
let blob = new Blob([arrayBuffer], { type: contentType });
blobURL = URL.createObjectURL(blob);
browser = new HiddenBrowser({ blockRemoteResources: true });
await browser.load(url);
await browser.load(blobURL);
pageData = await browser.getPageData(['characterSet', 'bodyText']);
}
finally {
if (blobURL) {
URL.revokeObjectURL(blobURL);
}
if (browser) {
browser.destroy();
}

3
test/tests/data/test.sh Normal file
View file

@ -0,0 +1,3 @@
#!/bin/sh
echo "Nothing"

View file

@ -65,6 +65,29 @@ describe("Zotero.FullText", function () {
Zotero.Fulltext.INDEX_STATE_UNINDEXED
);
})
describe("Indexing with HiddenBrowser", () => {
	// Each case imports a fixture file as an attachment, confirms the content
	// type stored on the item, and then checks the state that
	// Zotero.Fulltext.getIndexedState() reports for it.

	it("should index attachment as its attachmentContentType when supported", async function () {
		// With no explicit content type, test.sh is detected as text/plain
		// (Firefox on its own would load it as text/x-shellscript)
		const attachment = await importFileAttachment('test.sh');
		assert.equal(attachment.attachmentContentType, 'text/plain');

		const indexedState = await Zotero.Fulltext.getIndexedState(attachment);
		assert.equal(indexedState, Zotero.Fulltext.INDEX_STATE_INDEXED);
	});

	it("should index attachment as text/plain when its text/* attachmentContentType is unsupported", async function () {
		// Force text/x-shellscript, a type the HiddenBrowser would normally refuse to load.
		// Indexing is still expected to succeed, because an unsupported text/* content type
		// falls back to text/plain.
		const attachment = await importFileAttachment('test.sh', { contentType: 'text/x-shellscript' });
		assert.equal(attachment.attachmentContentType, 'text/x-shellscript');

		const indexedState = await Zotero.Fulltext.getIndexedState(attachment);
		assert.equal(indexedState, Zotero.Fulltext.INDEX_STATE_INDEXED);
	});

	it("should not index attachment with non-text attachmentContentType", async function () {
		// A text file deliberately labelled as an image must be left unindexed
		const attachment = await importFileAttachment('test.txt', { contentType: 'image/png' });
		assert.equal(attachment.attachmentContentType, 'image/png');

		const indexedState = await Zotero.Fulltext.getIndexedState(attachment);
		assert.equal(indexedState, Zotero.Fulltext.INDEX_STATE_UNINDEXED);
	});
});
});
describe("#indexPDF()", function () {