Add a function to download PDFs via a browser (#2248)

Fixes zotero/translators#2739
This commit is contained in:
Adomas Ven 2021-12-02 11:27:33 +02:00 committed by GitHub
parent e54f59ae28
commit 4405b59044
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 189 additions and 52 deletions

View file

@ -1087,7 +1087,9 @@ Zotero.Attachments = new function(){
*/
this.downloadFile = async function (url, path, options = {}) {
Zotero.debug(`Downloading file from ${url}`);
let unproxiedUrls = Object.keys(Zotero.Proxies.getPotentialProxies(url));
let enforcingPDF = false;
try {
await new Zotero.Promise(function (resolve) {
var wbp = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
@ -1105,6 +1107,7 @@ Zotero.Attachments = new function(){
});
if (options.isPDF) {
enforcingPDF = true;
await _enforcePDF(path);
}
}
@ -1113,10 +1116,103 @@ Zotero.Attachments = new function(){
await OS.File.remove(path, { ignoreAbsent: true });
}
catch (e) {
Zotero.debug(e, 1);
Zotero.logError(e);
}
const downloadViaBrowserList = [
'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html',
'://www.sciencedirect.com',
];
// Custom handling for PDFs that are bot-guarded
// via a JS-redirect
if (enforcingPDF && e instanceof this.InvalidPDFException) {
for (let unproxiedUrl of unproxiedUrls) {
if (downloadViaBrowserList.some(checkUrl => unproxiedUrl.includes(checkUrl))) {
return this.downloadPDFViaBrowser(url, path, options);
}
}
}
else {
throw e;
}
}
};
/**
 * Download a PDF by loading the URL in a hidden browser and capturing the
 * "application/pdf" response that belongs to that browser. Used by downloadFile()
 * as a fallback for PDFs that are bot-guarded via a JS redirect.
 *
 * @param {String} url - URL to load in the hidden browser
 * @param {String} path - Path the captured PDF is written to
 * @param {Object} [options]
 * @param {Object} [options.cookieSandbox] - Passed through to Zotero.HTTP.loadDocuments()
 * @return {Promise} Resolves once the PDF has been written to `path`. Rejects if the
 *     write fails or if no matching PDF arrives within the timeout; in both cases
 *     any file at `path` is removed first.
 */
this.downloadPDFViaBrowser = async function (url, path, options = {}) {
	Zotero.debug(`downloadPDFViaBrowser: Downloading file via browser from ${url}`);
	const timeout = 60e3;
	let hiddenBrowser;
	let hiddenBrowserPDFFoundDeferred = Zotero.Promise.defer();
	
	// Returns the browser element reachable from a set of notification callbacks,
	// or null if it cannot be determined. The callbacks are passed as a thunk so that
	// a missing channel/loadGroup property is swallowed by the same try/catch.
	var getBrowser = (getCallbacks) => {
		try {
			return getCallbacks().getInterface(Ci.nsIWebNavigation)
				.QueryInterface(Ci.nsIDocShell).chromeEventHandler || null;
		}
		catch (e) {
			// Best-effort lookup: not every channel exposes these interfaces
			return null;
		}
	};
	
	var pdfMIMETypeHandler = async (blob, name, _, channel) => {
		Zotero.debug(`downloadPDFViaBrowser: Sniffing a PDF loaded at ${name}`);
		
		// Resolved fresh on every invocation so that a browser found for an earlier,
		// unrelated PDF can never be reused when the lookup fails for this channel.
		// Try the channel itself first, then the document for the load group.
		let channelBrowser = getBrowser(() => channel.notificationCallbacks)
			|| getBrowser(() => channel.loadGroup.notificationCallbacks);
		
		if (!channelBrowser || hiddenBrowser !== channelBrowser) {
			Zotero.debug(`downloadPDFViaBrowser: Not our PDF at ${name}`);
			return false;
		}
		
		Zotero.debug(`downloadPDFViaBrowser: Found our PDF at ${name}`);
		try {
			await Zotero.File.putContentsAsync(path, blob);
		}
		catch (e) {
			// Fail the download right away instead of leaving the caller to wait
			// for the timeout, then let the MIME type handler see the error too
			hiddenBrowserPDFFoundDeferred.reject(e);
			throw e;
		}
		hiddenBrowserPDFFoundDeferred.resolve();
		return true;
	};
	
	try {
		Zotero.MIMETypeHandler.addHandler("application/pdf", pdfMIMETypeHandler, true);
		let noop = () => 0;
		hiddenBrowser = Zotero.HTTP.loadDocuments([url], noop, noop, noop, true, options.cookieSandbox);
		await Zotero.Promise.race([
			Zotero.Promise.delay(timeout).then(() => {
				if (!hiddenBrowserPDFFoundDeferred.promise.isResolved()) {
					throw new Error(`Loading PDF via browser timed out after ${timeout}ms`);
				}
			}),
			hiddenBrowserPDFFoundDeferred.promise
		]);
	}
	catch (e) {
		// Don't leave a partial file behind; a cleanup failure is logged but must not
		// mask the original error
		try {
			await OS.File.remove(path, { ignoreAbsent: true });
		}
		catch (removeError) {
			Zotero.logError(removeError, 1);
		}
		throw e;
	}
	finally {
		// Always unregister the handler and dispose of the browser, success or not
		Zotero.MIMETypeHandler.removeHandler('application/pdf', pdfMIMETypeHandler);
		if (hiddenBrowser) {
			Zotero.Browser.deleteHiddenBrowser(hiddenBrowser);
		}
	}
};

View file

@ -49,28 +49,29 @@ Zotero.MIMETypeHandler = new function () {
/**
* Initializes handlers for MIME types
*/
this.initializeHandlers = function() {
this.initializeHandlers = function () {
_typeHandlers = {};
_ignoreContentDispositionTypes = [];
_ignoreContentDispositionTypes = new Set();
_observers = [];
// Install styles from the Cite preferences
this.addHandler("application/vnd.citationstyles.style+xml", Zotero.Promise.coroutine(function* (a1, a2) {
this.addHandler("application/vnd.citationstyles.style+xml", async function (blob, origin) {
let win = Services.wm.getMostRecentWindow("zotero:basicViewer");
var data = await Zotero.Utilities.Internal.blobToText(blob);
try {
yield Zotero.Styles.install(a1, a2, true);
await Zotero.Styles.install(data, origin, true);
}
catch (e) {
Zotero.logError(e);
(new Zotero.Exception.Alert("styles.install.unexpectedError",
a2, "styles.install.title", e)).present();
origin, "styles.install.title", e)).present();
}
// Close styles page in basic viewer after installing a style
if (win) {
win.close();
}
}));
}
}, true);
};
/**
* Adds a handler to handle a specific MIME type
@ -80,22 +81,37 @@ Zotero.MIMETypeHandler = new function () {
* which is often used to force a file to download rather than let it be handled by the web
* browser
*/
this.addHandler = function(type, fn, ignoreContentDisposition) {
_typeHandlers[type] = fn;
_ignoreContentDispositionTypes.push(type);
}
this.addHandler = function (type, fn, ignoreContentDisposition) {
	// Each MIME type maps to a list of handlers, kept in registration order
	var handlersForType = _typeHandlers[type];
	if (!handlersForType) {
		handlersForType = [];
		_typeHandlers[type] = handlersForType;
	}
	handlersForType.push(fn);
	
	// Only record the type for Content-Disposition handling when explicitly requested
	if (ignoreContentDisposition) {
		_ignoreContentDispositionTypes.add(type);
	}
};
/**
* Removes a handler for a specific MIME type
* @param {String} type MIME type to handle
* @param {Function} handler Function handler to remove
*/
this.removeHandler = function(type) {
delete _typeHandlers[type];
var i = _ignoreContentDispositionTypes.indexOf(type);
if (i != -1) {
_ignoreContentDispositionTypes.splice(i, 1);
this.removeHandler = function (type, handler) {
	var handlersForType = _typeHandlers[type];
	
	// If no handler specified or this is the last handler for the type
	// stop monitoring the content type completely.
	if (!handler || (handlersForType && handlersForType.length <= 1)) {
		delete _typeHandlers[type];
		_ignoreContentDispositionTypes.delete(type);
		return;
	}
	
	// Otherwise remove just this handler from the type's list. The splice has to
	// target the per-type array; _typeHandlers itself is a plain object keyed by
	// MIME type and has no splice().
	if (handlersForType) {
		var i = handlersForType.indexOf(handler);
		if (i != -1) {
			handlersForType.splice(i, 1);
		}
	}
};
/**
* Adds an observer to inspect and possibly modify page headers
@ -119,13 +135,9 @@ Zotero.MIMETypeHandler = new function () {
// remove content-disposition headers for EndNote, etc.
var contentType = channel.getResponseHeader("Content-Type").toLowerCase();
for (let handledType of _ignoreContentDispositionTypes) {
if(contentType.length < handledType.length) {
if (contentType.startsWith(handledType)) {
channel.setResponseHeader("Content-Disposition", "inline", false);
break;
} else {
if(contentType.substr(0, handledType.length) == handledType) {
channel.setResponseHeader("Content-Disposition", "", false);
break;
}
}
}
} catch(e) {}
@ -232,31 +244,34 @@ Zotero.MIMETypeHandler = new function () {
/**
* Called when the request is done
*/
_StreamListener.prototype.onStopRequest = Zotero.Promise.coroutine(function* (channel, context, status) {
_StreamListener.prototype.onStopRequest = async function (channel, context, status) {
Zotero.debug("charset is " + channel.contentCharset);
var inputStream = this._storageStream.newInputStream(0);
var charset = channel.contentCharset ? channel.contentCharset : "UTF-8";
const replacementChar = Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER;
var convStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
.createInstance(Components.interfaces.nsIConverterInputStream);
convStream.init(inputStream, charset, 16384, replacementChar);
var readString = "";
var str = {};
while (convStream.readString(16384, str) != 0) {
readString += str.value;
}
convStream.close();
var stream = Components.classes["@mozilla.org/binaryinputstream;1"]
.createInstance(Components.interfaces.nsIBinaryInputStream);
stream.setInputStream(inputStream);
let buffer = new ArrayBuffer(this._storageStream.length);
stream.readArrayBuffer(buffer.byteLength, buffer);
stream.close();
inputStream.close();
let blob = new (Zotero.getMainWindow()).Blob([buffer], { type: this._contentType });
var handled = false;
try {
handled = _typeHandlers[this._contentType](
readString,
this._request.name ? this._request.name : null,
this._contentType,
channel
);
for (let handler of _typeHandlers[this._contentType]) {
let maybePromise = handler(
blob,
this._request.name ? this._request.name : null,
this._contentType,
channel
);
if (maybePromise.then) {
maybePromise = await maybePromise;
}
handled = handled || maybePromise;
if (handled) break;
}
}
catch (e) {
Zotero.logError(e);
@ -283,5 +298,5 @@ Zotero.MIMETypeHandler = new function () {
}
this._storageStream.close();
});
};
}

View file

@ -942,22 +942,27 @@ Zotero.Utilities.Internal = {
* Parse a Blob (e.g., as received from Zotero.HTTP.request()) into an HTML Document
*/
blobToHTMLDocument: async function (blob, url) {
var charset = null;
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
if (matches) {
charset = matches[1];
var responseText = await Zotero.Utilities.Internal.blobToText(blob);
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
.createInstance(Components.interfaces.nsIDOMParser);
var doc = parser.parseFromString(responseText, 'text/html');
return Zotero.HTTP.wrapDocument(doc, url);
},
blobToText: async function (blob, charset=null) {
if (!charset) {
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
if (matches) {
charset = matches[1];
}
}
var responseText = await new Promise(function (resolve) {
return new Promise(function (resolve) {
let fr = new FileReader();
fr.addEventListener("loadend", function() {
resolve(fr.result);
});
fr.readAsText(blob, charset);
});
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
.createInstance(Components.interfaces.nsIDOMParser);
var doc = parser.parseFromString(responseText, 'text/html');
return Zotero.HTTP.wrapDocument(doc, url);
},

View file

@ -279,6 +279,27 @@ describe("Zotero.Attachments", function() {
});
});
describe("#importFromURL()", function () {
	it("should download a PDF from a JS redirect page", async function () {
		// Allow a little more than the 60-second limit used by downloadPDFViaBrowser()
		this.timeout(65e3);
		
		const importOptions = {
			libraryID: Zotero.Libraries.userLibraryID,
			url: 'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html',
			contentType: 'application/pdf'
		};
		const attachment = await Zotero.Attachments.importFromURL(importOptions);
		assert.isTrue(attachment.isPDFAttachment());
		
		// Sniff the beginning of the saved file to make sure it really is a PDF
		const attachmentPath = attachment.getFilePath();
		const fileStart = await Zotero.File.getContentsAsync(attachmentPath, null, 1000);
		assert.equal(Zotero.MIME.sniffForMIMEType(fileStart), 'application/pdf');
		
		// Clean up
		await Zotero.Items.erase(attachment.id);
	});
});
describe("#linkFromDocument", function () {
it("should add a link attachment for the current webpage", function* () {
var item = yield createDataObject('item');