Add a function to download PDFs via a browser (#2248)
Fixes zotero/translators#2739
This commit is contained in:
parent
e54f59ae28
commit
4405b59044
4 changed files with 189 additions and 52 deletions
|
@ -1087,7 +1087,9 @@ Zotero.Attachments = new function(){
|
||||||
*/
|
*/
|
||||||
this.downloadFile = async function (url, path, options = {}) {
|
this.downloadFile = async function (url, path, options = {}) {
|
||||||
Zotero.debug(`Downloading file from ${url}`);
|
Zotero.debug(`Downloading file from ${url}`);
|
||||||
|
let unproxiedUrls = Object.keys(Zotero.Proxies.getPotentialProxies(url));
|
||||||
|
|
||||||
|
let enforcingPDF = false;
|
||||||
try {
|
try {
|
||||||
await new Zotero.Promise(function (resolve) {
|
await new Zotero.Promise(function (resolve) {
|
||||||
var wbp = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
|
var wbp = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
|
||||||
|
@ -1105,6 +1107,7 @@ Zotero.Attachments = new function(){
|
||||||
});
|
});
|
||||||
|
|
||||||
if (options.isPDF) {
|
if (options.isPDF) {
|
||||||
|
enforcingPDF = true;
|
||||||
await _enforcePDF(path);
|
await _enforcePDF(path);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1113,10 +1116,103 @@ Zotero.Attachments = new function(){
|
||||||
await OS.File.remove(path, { ignoreAbsent: true });
|
await OS.File.remove(path, { ignoreAbsent: true });
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.debug(e, 1);
|
Zotero.logError(e);
|
||||||
|
}
|
||||||
|
const downloadViaBrowserList = [
|
||||||
|
'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html',
|
||||||
|
'://www.sciencedirect.com',
|
||||||
|
];
|
||||||
|
// Custom handling for PDFs that are bot-guarded
|
||||||
|
// via a JS-redirect
|
||||||
|
if (enforcingPDF && e instanceof this.InvalidPDFException) {
|
||||||
|
for (let unproxiedUrl of unproxiedUrls) {
|
||||||
|
if (downloadViaBrowserList.some(checkUrl => unproxiedUrl.includes(checkUrl))) {
|
||||||
|
return this.downloadPDFViaBrowser(url, path, options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Download a PDF by loading a URL in a hidden browser and saving the PDF response
 * that the browser ends up receiving (e.g., after a JS redirect).
 *
 * A temporary "application/pdf" MIME type handler is registered for the duration of
 * the load. Only a PDF whose channel belongs to our hidden browser is written to `path`.
 *
 * @param {String} url - URL to load in the hidden browser
 * @param {String} path - File path to write the PDF to
 * @param {Object} [options]
 * @param {Object} [options.cookieSandbox] - Passed through to Zotero.HTTP.loadDocuments()
 * @return {Promise} - Resolves once the PDF has been written to `path`. Rejects if no PDF
 *     is found within 60 seconds, after removing any file at `path`.
 */
this.downloadPDFViaBrowser = async function (url, path, options = {}) {
	Zotero.debug(`downloadPDFViaBrowser: Downloading file via browser from ${url}`);
	const timeout = 60e3;
	let hiddenBrowser;
	let hiddenBrowserPDFFoundDeferred = Zotero.Promise.defer();
	
	// Get the browser element that owns a set of notification callbacks, or null if it
	// can't be determined. The callbacks are fetched lazily so that a missing
	// notificationCallbacks/loadGroup is handled the same way as a failed interface lookup.
	const getBrowserFromCallbacks = (getCallbacks) => {
		try {
			return getCallbacks().getInterface(Ci.nsIWebNavigation)
				.QueryInterface(Ci.nsIDocShell).chromeEventHandler || null;
		}
		catch (e) {
			return null;
		}
	};
	
	var pdfMIMETypeHandler = async (blob, name, _, channel) => {
		Zotero.debug(`downloadPDFViaBrowser: Sniffing a PDF loaded at ${name}`);
		
		// Look the browser up afresh on every call so that a failed lookup can never
		// reuse the browser found for a previous, unrelated PDF.
		// Try the channel itself first, then the document for the load group.
		let channelBrowser = getBrowserFromCallbacks(() => channel.notificationCallbacks)
			|| getBrowserFromCallbacks(() => channel.loadGroup.notificationCallbacks);
		let isOurPDF = !!channelBrowser && hiddenBrowser === channelBrowser;
		
		if (!isOurPDF) {
			Zotero.debug(`downloadPDFViaBrowser: Not our PDF at ${name}`);
			return false;
		}
		
		Zotero.debug(`downloadPDFViaBrowser: Found our PDF at ${name}`);
		await Zotero.File.putContentsAsync(path, blob);
		hiddenBrowserPDFFoundDeferred.resolve();
		return true;
	};
	
	try {
		Zotero.MIMETypeHandler.addHandler("application/pdf", pdfMIMETypeHandler, true);
		let noop = () => 0;
		hiddenBrowser = Zotero.HTTP.loadDocuments([url], noop, noop, noop, true, options.cookieSandbox);
		await Zotero.Promise.race([
			Zotero.Promise.delay(timeout).then(() => {
				if (!hiddenBrowserPDFFoundDeferred.promise.isResolved()) {
					throw new Error(`Loading PDF via browser timed out after ${timeout}ms`);
				}
			}),
			hiddenBrowserPDFFoundDeferred.promise
		]);
	}
	catch (e) {
		// Don't leave a partial or invalid file behind
		try {
			await OS.File.remove(path, { ignoreAbsent: true });
		}
		catch (removeError) {
			Zotero.logError(removeError);
		}
		throw e;
	}
	finally {
		Zotero.MIMETypeHandler.removeHandler('application/pdf', pdfMIMETypeHandler);
		if (hiddenBrowser) {
			Zotero.Browser.deleteHiddenBrowser(hiddenBrowser);
		}
	}
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -49,28 +49,29 @@ Zotero.MIMETypeHandler = new function () {
|
||||||
/**
 * Initializes handlers for MIME types
 *
 * Resets the handler, Content-Disposition and observer registries, then registers
 * the built-in handler that installs CSL styles.
 */
this.initializeHandlers = function () {
	_typeHandlers = {};
	_ignoreContentDispositionTypes = new Set();
	_observers = [];
	
	// Install styles from the Cite preferences
	const installStyle = async function (blob, origin) {
		// Grab the viewer window before installing so it can be closed afterwards
		const viewerWindow = Services.wm.getMostRecentWindow("zotero:basicViewer");
		const styleText = await Zotero.Utilities.Internal.blobToText(blob);
		try {
			await Zotero.Styles.install(styleText, origin, true);
		}
		catch (e) {
			Zotero.logError(e);
			const alert = new Zotero.Exception.Alert(
				"styles.install.unexpectedError", origin, "styles.install.title", e
			);
			alert.present();
		}
		// Close styles page in basic viewer after installing a style
		if (viewerWindow) {
			viewerWindow.close();
		}
	};
	this.addHandler("application/vnd.citationstyles.style+xml", installStyle, true);
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds a handler to handle a specific MIME type
|
* Adds a handler to handle a specific MIME type
|
||||||
|
@ -80,22 +81,37 @@ Zotero.MIMETypeHandler = new function () {
|
||||||
* which is often used to force a file to download rather than let it be handled by the web
|
* which is often used to force a file to download rather than let it be handled by the web
|
||||||
* browser
|
* browser
|
||||||
*/
|
*/
|
||||||
this.addHandler = function (type, fn, ignoreContentDisposition) {
	// Several handlers can share a MIME type; they are kept in registration order
	const registered = _typeHandlers[type];
	if (!registered) {
		_typeHandlers[type] = [fn];
	}
	else {
		registered.push(fn);
	}
	
	if (ignoreContentDisposition) {
		_ignoreContentDispositionTypes.add(type);
	}
};
|
||||||
|
|
||||||
/**
 * Removes a handler for a specific MIME type
 *
 * @param {String} type MIME type to handle
 * @param {Function} [handler] Function handler to remove. If omitted, all handlers
 *     for the type are removed.
 */
this.removeHandler = function (type, handler) {
	var handlers = _typeHandlers[type];
	if (handler) {
		// Nothing registered for this type
		if (!handlers) return;
		
		var i = handlers.indexOf(handler);
		if (i != -1) {
			handlers.splice(i, 1);
		}
		// Other handlers are still registered, so keep monitoring the content type
		if (handlers.length) return;
	}
	
	// If no handler specified or this was the last handler for the type,
	// stop monitoring the content type completely.
	delete _typeHandlers[type];
	_ignoreContentDispositionTypes.delete(type);
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adds an observer to inspect and possibly modify page headers
|
* Adds an observer to inspect and possibly modify page headers
|
||||||
|
@ -119,13 +135,9 @@ Zotero.MIMETypeHandler = new function () {
|
||||||
// remove content-disposition headers for EndNote, etc.
|
// remove content-disposition headers for EndNote, etc.
|
||||||
var contentType = channel.getResponseHeader("Content-Type").toLowerCase();
|
var contentType = channel.getResponseHeader("Content-Type").toLowerCase();
|
||||||
for (let handledType of _ignoreContentDispositionTypes) {
|
for (let handledType of _ignoreContentDispositionTypes) {
|
||||||
if(contentType.length < handledType.length) {
|
if (contentType.startsWith(handledType)) {
|
||||||
|
channel.setResponseHeader("Content-Disposition", "inline", false);
|
||||||
break;
|
break;
|
||||||
} else {
|
|
||||||
if(contentType.substr(0, handledType.length) == handledType) {
|
|
||||||
channel.setResponseHeader("Content-Disposition", "", false);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch(e) {}
|
} catch(e) {}
|
||||||
|
@ -232,31 +244,34 @@ Zotero.MIMETypeHandler = new function () {
|
||||||
/**
|
/**
|
||||||
* Called when the request is done
|
* Called when the request is done
|
||||||
*/
|
*/
|
||||||
_StreamListener.prototype.onStopRequest = Zotero.Promise.coroutine(function* (channel, context, status) {
|
_StreamListener.prototype.onStopRequest = async function (channel, context, status) {
|
||||||
Zotero.debug("charset is " + channel.contentCharset);
|
Zotero.debug("charset is " + channel.contentCharset);
|
||||||
|
|
||||||
var inputStream = this._storageStream.newInputStream(0);
|
var inputStream = this._storageStream.newInputStream(0);
|
||||||
var charset = channel.contentCharset ? channel.contentCharset : "UTF-8";
|
var stream = Components.classes["@mozilla.org/binaryinputstream;1"]
|
||||||
const replacementChar = Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER;
|
.createInstance(Components.interfaces.nsIBinaryInputStream);
|
||||||
var convStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
|
stream.setInputStream(inputStream);
|
||||||
.createInstance(Components.interfaces.nsIConverterInputStream);
|
let buffer = new ArrayBuffer(this._storageStream.length);
|
||||||
convStream.init(inputStream, charset, 16384, replacementChar);
|
stream.readArrayBuffer(buffer.byteLength, buffer);
|
||||||
var readString = "";
|
stream.close();
|
||||||
var str = {};
|
|
||||||
while (convStream.readString(16384, str) != 0) {
|
|
||||||
readString += str.value;
|
|
||||||
}
|
|
||||||
convStream.close();
|
|
||||||
inputStream.close();
|
inputStream.close();
|
||||||
|
let blob = new (Zotero.getMainWindow()).Blob([buffer], { type: this._contentType });
|
||||||
|
|
||||||
var handled = false;
|
var handled = false;
|
||||||
try {
|
try {
|
||||||
handled = _typeHandlers[this._contentType](
|
for (let handler of _typeHandlers[this._contentType]) {
|
||||||
readString,
|
let maybePromise = handler(
|
||||||
this._request.name ? this._request.name : null,
|
blob,
|
||||||
this._contentType,
|
this._request.name ? this._request.name : null,
|
||||||
channel
|
this._contentType,
|
||||||
);
|
channel
|
||||||
|
);
|
||||||
|
if (maybePromise.then) {
|
||||||
|
maybePromise = await maybePromise;
|
||||||
|
}
|
||||||
|
handled = handled || maybePromise;
|
||||||
|
if (handled) break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
|
@ -283,5 +298,5 @@ Zotero.MIMETypeHandler = new function () {
|
||||||
}
|
}
|
||||||
|
|
||||||
this._storageStream.close();
|
this._storageStream.close();
|
||||||
});
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -942,22 +942,27 @@ Zotero.Utilities.Internal = {
|
||||||
* Parse a Blob (e.g., as received from Zotero.HTTP.request()) into an HTML Document
|
* Parse a Blob (e.g., as received from Zotero.HTTP.request()) into an HTML Document
|
||||||
*/
|
*/
|
||||||
blobToHTMLDocument: async function (blob, url) {
|
blobToHTMLDocument: async function (blob, url) {
|
||||||
var charset = null;
|
var responseText = await Zotero.Utilities.Internal.blobToText(blob);
|
||||||
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
|
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
||||||
if (matches) {
|
.createInstance(Components.interfaces.nsIDOMParser);
|
||||||
charset = matches[1];
|
var doc = parser.parseFromString(responseText, 'text/html');
|
||||||
|
return Zotero.HTTP.wrapDocument(doc, url);
|
||||||
|
},
|
||||||
|
|
||||||
|
blobToText: async function (blob, charset=null) {
|
||||||
|
if (!charset) {
|
||||||
|
var matches = blob.type && blob.type.match(/charset=([a-z0-9\-_+])/i);
|
||||||
|
if (matches) {
|
||||||
|
charset = matches[1];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
var responseText = await new Promise(function (resolve) {
|
return new Promise(function (resolve) {
|
||||||
let fr = new FileReader();
|
let fr = new FileReader();
|
||||||
fr.addEventListener("loadend", function() {
|
fr.addEventListener("loadend", function() {
|
||||||
resolve(fr.result);
|
resolve(fr.result);
|
||||||
});
|
});
|
||||||
fr.readAsText(blob, charset);
|
fr.readAsText(blob, charset);
|
||||||
});
|
});
|
||||||
var parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
|
||||||
.createInstance(Components.interfaces.nsIDOMParser);
|
|
||||||
var doc = parser.parseFromString(responseText, 'text/html');
|
|
||||||
return Zotero.HTTP.wrapDocument(doc, url);
|
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -279,6 +279,27 @@ describe("Zotero.Attachments", function() {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
describe("#importFromURL()", function () {
	it("should download a PDF from a JS redirect page", async function () {
		// Allow extra time for the real network download this test performs
		this.timeout(65e3);
		
		const redirectPageURL = 'https://zotero-static.s3.amazonaws.com/test-pdf-redirect.html';
		const attachment = await Zotero.Attachments.importFromURL({
			libraryID: Zotero.Libraries.userLibraryID,
			url: redirectPageURL,
			contentType: 'application/pdf'
		});
		
		assert.isTrue(attachment.isPDFAttachment());
		// Sniff the start of the saved file to make sure it's really a PDF
		const firstBytes = await Zotero.File.getContentsAsync(attachment.getFilePath(), null, 1000);
		assert.equal(Zotero.MIME.sniffForMIMEType(firstBytes), 'application/pdf');
		
		// Clean up
		await Zotero.Items.erase(attachment.id);
	});
});
|
||||||
|
|
||||||
|
|
||||||
describe("#linkFromDocument", function () {
|
describe("#linkFromDocument", function () {
|
||||||
it("should add a link attachment for the current webpage", function* () {
|
it("should add a link attachment for the current webpage", function* () {
|
||||||
var item = yield createDataObject('item');
|
var item = yield createDataObject('item');
|
||||||
|
|
Loading…
Add table
Reference in a new issue