Add a function to download PDFs via a browser (#2248)
Fixes zotero/translators#2739
This commit is contained in:
parent
e54f59ae28
commit
4405b59044
4 changed files with 189 additions and 52 deletions
|
@ -1087,7 +1087,9 @@ Zotero.Attachments = new function(){
|
|||
*/
|
||||
this.downloadFile = async function (url, path, options = {}) {
|
||||
Zotero.debug(`Downloading file from ${url}`);
|
||||
let unproxiedUrls = Object.keys(Zotero.Proxies.getPotentialProxies(url));
|
||||
|
||||
let enforcingPDF = false;
|
||||
try {
|
||||
await new Zotero.Promise(function (resolve) {
|
||||
var wbp = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
|
||||
|
@ -1105,6 +1107,7 @@ Zotero.Attachments = new function(){
|
|||
});
|
||||
|
||||
if (options.isPDF) {
|
||||
enforcingPDF = true;
|
||||
await _enforcePDF(path);
|
||||
}
|
||||
}
|
||||
|
@ -1113,10 +1116,103 @@ Zotero.Attachments = new function(){
|
|||
await OS.File.remove(path, { ignoreAbsent: true });
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug(e, 1);
|
||||
Zotero.logError(e);
|
||||
}
|
||||
const downloadViaBrowserList = [
|
||||
'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html',
|
||||
'://www.sciencedirect.com',
|
||||
];
|
||||
// Custom handling for PDFs that are bot-guarded
|
||||
// via a JS-redirect
|
||||
if (enforcingPDF && e instanceof this.InvalidPDFException) {
|
||||
for (let unproxiedUrl of unproxiedUrls) {
|
||||
if (downloadViaBrowserList.some(checkUrl => unproxiedUrl.includes(checkUrl))) {
|
||||
return this.downloadPDFViaBrowser(url, path, options);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Download a PDF by loading the URL in a hidden browser and saving the first
 * application/pdf response that originates from that browser.
 *
 * A temporary "application/pdf" MIME type handler is registered for the duration of the
 * call and removed again afterwards, along with the hidden browser.
 *
 * @param {String} url
 * @param {String} path - Destination file path. The file is removed if the download fails.
 * @param {Object} [options]
 * @param {Object} [options.cookieSandbox]
 * @return {Promise} Resolves once the PDF has been written to `path`. Rejects if no PDF
 *     from the hidden browser was seen within 60 seconds or if writing the file failed.
 */
this.downloadPDFViaBrowser = async function (url, path, options = {}) {
	Zotero.debug(`downloadPDFViaBrowser: Downloading file via browser from ${url}`);
	const timeout = 60e3;
	let hiddenBrowser;
	let hiddenBrowserPDFFoundDeferred = Zotero.Promise.defer();
	
	// Best-effort lookup of the browser element behind an object that exposes
	// notificationCallbacks. Any failure just means no browser could be determined.
	var getBrowser = (getOwner) => {
		try {
			return getOwner().notificationCallbacks.getInterface(Ci.nsIWebNavigation)
				.QueryInterface(Ci.nsIDocShell).chromeEventHandler;
		}
		catch (e) {
			return null;
		}
	};
	
	var pdfMIMETypeHandler = async (blob, name, _, channel) => {
		Zotero.debug(`downloadPDFViaBrowser: Sniffing a PDF loaded at ${name}`);
		
		// Kept local to each invocation so that a browser found for an earlier, unrelated
		// channel can't be mistaken for the browser of this channel.
		// Try the channel itself first, then the document for the load group.
		let channelBrowser = getBrowser(() => channel) || getBrowser(() => channel.loadGroup);
		let isOurPDF = !!channelBrowser && hiddenBrowser === channelBrowser;
		
		if (!isOurPDF) {
			Zotero.debug(`downloadPDFViaBrowser: Not our PDF at ${name}`);
			return false;
		}
		
		Zotero.debug(`downloadPDFViaBrowser: Found our PDF at ${name}`);
		try {
			await Zotero.File.putContentsAsync(path, blob);
		}
		catch (e) {
			// Fail the download right away instead of leaving the caller to wait for the timeout
			hiddenBrowserPDFFoundDeferred.reject(e);
			throw e;
		}
		hiddenBrowserPDFFoundDeferred.resolve();
		return true;
	};
	
	try {
		Zotero.MIMETypeHandler.addHandler("application/pdf", pdfMIMETypeHandler, true);
		let noop = () => 0;
		hiddenBrowser = Zotero.HTTP.loadDocuments([url], noop, noop, noop, true, options.cookieSandbox);
		await Zotero.Promise.race([
			Zotero.Promise.delay(timeout).then(() => {
				if (!hiddenBrowserPDFFoundDeferred.promise.isResolved()) {
					throw new Error(`Loading PDF via browser timed out after ${timeout}ms`);
				}
			}),
			hiddenBrowserPDFFoundDeferred.promise
		]);
	}
	catch (e) {
		// Don't leave a partial file behind; a cleanup failure must not mask the original error
		try {
			await OS.File.remove(path, { ignoreAbsent: true });
		}
		catch (removeError) {
			Zotero.logError(removeError, 1);
		}
		throw e;
	}
	finally {
		Zotero.MIMETypeHandler.removeHandler('application/pdf', pdfMIMETypeHandler);
		if (hiddenBrowser) {
			Zotero.Browser.deleteHiddenBrowser(hiddenBrowser);
		}
	}
};
|
||||
|
||||
|
||||
|
|
|
@ -49,28 +49,29 @@ Zotero.MIMETypeHandler = new function () {
|
|||
/**
 * Initializes handlers for MIME types
 *
 * Resets the handler registry, the set of types whose Content-Disposition header is
 * ignored, and the observer list, then registers the built-in citation style handler.
 */
this.initializeHandlers = function () {
	_typeHandlers = {};
	_ignoreContentDispositionTypes = new Set();
	_observers = [];
	
	// Install styles from the Cite preferences
	this.addHandler("application/vnd.citationstyles.style+xml", async function (blob, origin) {
		// Look up the viewer window before installing so it can be closed afterwards
		let win = Services.wm.getMostRecentWindow("zotero:basicViewer");
		// Handlers receive the response as a Blob; the style installer is given its text
		var data = await Zotero.Utilities.Internal.blobToText(blob);
		try {
			await Zotero.Styles.install(data, origin, true);
		}
		catch (e) {
			Zotero.logError(e);
			(new Zotero.Exception.Alert("styles.install.unexpectedError",
				origin, "styles.install.title", e)).present();
		}
		// Close styles page in basic viewer after installing a style
		if (win) {
			win.close();
		}
	}, true);
};
|
||||
|
||||
/**
|
||||
* Adds a handler to handle a specific MIME type
|
||||
|
@ -80,22 +81,37 @@ Zotero.MIMETypeHandler = new function () {
|
|||
* which is often used to force a file to download rather than let it be handled by the web
|
||||
* browser
|
||||
*/
|
||||
this.addHandler = function (type, fn, ignoreContentDisposition) {
	// More than one handler can be registered per type; they are kept in registration order
	if (_typeHandlers[type]) {
		_typeHandlers[type].push(fn);
	}
	else {
		_typeHandlers[type] = [fn];
	}
	// Only types that explicitly ask for it are added to the ignore-Content-Disposition set
	if (ignoreContentDisposition) {
		_ignoreContentDispositionTypes.add(type);
	}
};
|
||||
|
||||
/**
 * Removes a handler for a specific MIME type
 *
 * If no handler is specified, or the specified handler was the last one registered for
 * the type, the type is no longer monitored at all.
 *
 * @param {String} type MIME type to handle
 * @param {Function} [handler] Function handler to remove. If omitted, every handler
 *     registered for the type is removed.
 */
this.removeHandler = function (type, handler) {
	var handlers = _typeHandlers[type];
	if (handler && handlers) {
		// Remove from this type's handler list, not from the registry object itself
		let i = handlers.indexOf(handler);
		if (i != -1) {
			handlers.splice(i, 1);
		}
		// Other handlers are still registered for this type, so keep monitoring it
		if (handlers.length) {
			return;
		}
	}
	// No handler specified or no handlers left: stop monitoring the content type completely
	delete _typeHandlers[type];
	_ignoreContentDispositionTypes.delete(type);
};
|
||||
|
||||
/**
|
||||
* Adds an observer to inspect and possibly modify page headers
|
||||
|
@ -119,13 +135,9 @@ Zotero.MIMETypeHandler = new function () {
|
|||
// remove content-disposition headers for EndNote, etc.
|
||||
var contentType = channel.getResponseHeader("Content-Type").toLowerCase();
|
||||
for (let handledType of _ignoreContentDispositionTypes) {
|
||||
if(contentType.length < handledType.length) {
|
||||
if (contentType.startsWith(handledType)) {
|
||||
channel.setResponseHeader("Content-Disposition", "inline", false);
|
||||
break;
|
||||
} else {
|
||||
if(contentType.substr(0, handledType.length) == handledType) {
|
||||
channel.setResponseHeader("Content-Disposition", "", false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch(e) {}
|
||||
|
@ -232,31 +244,34 @@ Zotero.MIMETypeHandler = new function () {
|
|||
/**
|
||||
* Called when the request is done
|
||||
*/
|
||||
_StreamListener.prototype.onStopRequest = Zotero.Promise.coroutine(function* (channel, context, status) {
|
||||
_StreamListener.prototype.onStopRequest = async function (channel, context, status) {
|
||||
Zotero.debug("charset is " + channel.contentCharset);
|
||||
|
||||
var inputStream = this._storageStream.newInputStream(0);
|
||||
var charset = channel.contentCharset ? channel.contentCharset : "UTF-8";
|
||||
const replacementChar = Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER;
|
||||
var convStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
|
||||
.createInstance(Components.interfaces.nsIConverterInputStream);
|
||||
convStream.init(inputStream, charset, 16384, replacementChar);
|
||||
var readString = "";
|
||||
var str = {};
|
||||
while (convStream.readString(16384, str) != 0) {
|
||||
readString += str.value;
|
||||
}
|
||||
convStream.close();
|
||||
var stream = Components.classes["@mozilla.org/binaryinputstream;1"]
|
||||
.createInstance(Components.interfaces.nsIBinaryInputStream);
|
||||
stream.setInputStream(inputStream);
|
||||
let buffer = new ArrayBuffer(this._storageStream.length);
|
||||
stream.readArrayBuffer(buffer.byteLength, buffer);
|
||||
stream.close();
|
||||
inputStream.close();
|
||||
let blob = new (Zotero.getMainWindow()).Blob([buffer], { type: this._contentType });
|
||||
|
||||
var handled = false;
|
||||
try {
|
||||
handled = _typeHandlers[this._contentType](
|
||||
readString,
|
||||
this._request.name ? this._request.name : null,
|
||||
this._contentType,
|
||||
channel
|
||||
);
|
||||
for (let handler of _typeHandlers[this._contentType]) {
|
||||
let maybePromise = handler(
|
||||
blob,
|
||||
this._request.name ? this._request.name : null,
|
||||
this._contentType,
|
||||
channel
|
||||
);
|
||||
if (maybePromise.then) {
|
||||
maybePromise = await maybePromise;
|
||||
}
|
||||
handled = handled || maybePromise;
|
||||
if (handled) break;
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
|
@ -283,5 +298,5 @@ Zotero.MIMETypeHandler = new function () {
|
|||
}
|
||||
|
||||
this._storageStream.close();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
|
|
@ -942,22 +942,27 @@ Zotero.Utilities.Internal = {
|
|||
* Parse a Blob (e.g., as received from Zotero.HTTP.request()) into an HTML Document
|
||||
*/
|
||||
blobToHTMLDocument: async function (blob, url) {
|
||||
var charset = null;
|
||||
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
|
||||
if (matches) {
|
||||
charset = matches[1];
|
||||
var responseText = await Zotero.Utilities.Internal.blobToText(blob);
|
||||
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
||||
.createInstance(Components.interfaces.nsIDOMParser);
|
||||
var doc = parser.parseFromString(responseText, 'text/html');
|
||||
return Zotero.HTTP.wrapDocument(doc, url);
|
||||
},
|
||||
|
||||
blobToText: async function (blob, charset=null) {
|
||||
if (!charset) {
|
||||
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
|
||||
if (matches) {
|
||||
charset = matches[1];
|
||||
}
|
||||
}
|
||||
var responseText = await new Promise(function (resolve) {
|
||||
return new Promise(function (resolve) {
|
||||
let fr = new FileReader();
|
||||
fr.addEventListener("loadend", function() {
|
||||
resolve(fr.result);
|
||||
});
|
||||
fr.readAsText(blob, charset);
|
||||
});
|
||||
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
||||
.createInstance(Components.interfaces.nsIDOMParser);
|
||||
var doc = parser.parseFromString(responseText, 'text/html');
|
||||
return Zotero.HTTP.wrapDocument(doc, url);
|
||||
},
|
||||
|
||||
|
||||
|
|
|
@ -279,6 +279,27 @@ describe("Zotero.Attachments", function() {
|
|||
});
|
||||
});
|
||||
|
||||
|
||||
describe("#importFromURL()", function () {
	it("should download a PDF from a JS redirect page", async function () {
		// Allow more time than the default, since the PDF is fetched over the network
		this.timeout(65e3);
		
		const attachment = await Zotero.Attachments.importFromURL({
			libraryID: Zotero.Libraries.userLibraryID,
			url: 'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html',
			contentType: 'application/pdf'
		});
		
		// The stored file must be flagged as a PDF and must actually sniff as one
		assert.isTrue(attachment.isPDFAttachment());
		const savedPath = attachment.getFilePath();
		const fileHead = await Zotero.File.getContentsAsync(savedPath, null, 1000);
		const sniffedType = Zotero.MIME.sniffForMIMEType(fileHead);
		assert.equal(sniffedType, 'application/pdf');
		
		// Clean up
		await Zotero.Items.erase(attachment.id);
	});
});
|
||||
|
||||
|
||||
describe("#linkFromDocument", function () {
|
||||
it("should add a link attachment for the current webpage", function* () {
|
||||
var item = yield createDataObject('item');
|
||||
|
|
Loading…
Reference in a new issue