Follow HTML meta refreshes when resolving a PDF from a landing page, and move the
meta-refresh parsing into a shared Zotero.HTTP.getHTMLMetaRefreshURL() helper.

NOTE(review): this patch was re-wrapped from a copy whose line breaks and indentation had
been collapsed. Indentation depth and blank context lines are a best-effort reconstruction,
and the "index" hashes are those of the original change, so apply it with
`git apply --recount --ignore-whitespace` and review the result.

Changes relative to that copy (added lines only):
 * attachments.js: don't call startsWith() on a missing Content-Type header
 * http.js: call the helper as Zotero.HTTP.getHTMLMetaRefreshURL() instead of through `this`
 * http.js: pass a radix to parseInt(), match "url=" case-insensitively, strip double quotes
   as well as single quotes from the target, and document the helper
 * attachmentsTest.js: give the pageURL10 fixture a meta refresh pointing at pageURL1
   (assumed from the test's expectations -- confirm), await the stat() result before
   reading .size, and add missing semicolons

diff --git a/chrome/content/zotero/xpcom/attachments.js b/chrome/content/zotero/xpcom/attachments.js
index 2e21ac3cbf..a9d8d7fba8 100644
--- a/chrome/content/zotero/xpcom/attachments.js
+++ b/chrome/content/zotero/xpcom/attachments.js
@@ -1681,6 +1681,9 @@ Zotero.Attachments = new function(){
 				let redirects = 0;
 				let nextURL = pageURL;
 				let req;
+				let blob;
+				let doc;
+				let contentType;
 				let skip = false;
 				let domains = new Set();
 				while (true) {
@@ -1725,21 +1728,39 @@ Zotero.Attachments = new function(){
 							skip = true;
 							break;
 						}
+						addTriedURL(nextURL);
 						continue;
 					}
+
+					blob = req.response;
+					responseURL = req.responseURL;
+					if (pageURL != responseURL) {
+						Zotero.debug("Redirected to " + responseURL);
+					}
+
+					// If HTML, check for a meta redirect
+					contentType = req.getResponseHeader('Content-Type');
+					if (contentType && contentType.startsWith('text/html')) {
+						doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
+						let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL);
+						if (refreshURL) {
+							if (isTriedURL(refreshURL)) {
+								Zotero.debug("Meta refresh URL has already been tried -- skipping");
+								skip = true;
+								break;
+							}
+							doc = null;
+							nextURL = refreshURL;
+							addTriedURL(nextURL);
+							continue;
+						}
+					}
 					break;
 				}
 				if (skip) {
 					continue;
 				}
 
-				let blob = req.response;
-				responseURL = req.responseURL;
-				if (pageURL != responseURL) {
-					Zotero.debug("Redirected to " + responseURL);
-				}
-				addTriedURL(responseURL);
-				let contentType = req.getResponseHeader('Content-Type');
 				// If DOI resolves directly to a PDF, save it to disk
 				if (contentType == 'application/pdf') {
 					Zotero.debug("URL resolves directly to PDF");
@@ -1747,9 +1768,8 @@ Zotero.Attachments = new function(){
 					await _enforcePDF(path);
 					return { url: responseURL, props: urlResolver };
 				}
-				// Otherwise parse the Blob into a Document and translate that
-				else if (contentType.startsWith('text/html')) {
-					let doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
+				// Otherwise translate the Document we parsed above
+				else if (doc) {
 					url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
 				}
 			}
diff --git a/chrome/content/zotero/xpcom/http.js b/chrome/content/zotero/xpcom/http.js
index 808507f092..f30c3c5bdc 100644
--- a/chrome/content/zotero/xpcom/http.js
+++ b/chrome/content/zotero/xpcom/http.js
@@ -361,38 +361,28 @@ Zotero.HTTP = new function() {
 					(!options.numRedirects || options.numRedirects < 3)) {
 				let contentType = xmlhttp.getResponseHeader('Content-Type');
 				if (contentType && contentType.startsWith('text/html')) {
-					let meta = xmlhttp.response.querySelector('meta[http-equiv="refresh" i]');
-					if (meta) {
-						let content = meta.getAttribute('content');
-						if (content) {
-							let parts = content.split(/;\s*url=/);
-							// If there's a redirect to another URL in less than 15 seconds,
-							// follow it
-							if (parts.length === 2 && parseInt(parts[0]) <= 15) {
-								let url = parts[1].trim().replace(/^'(.+)'/, '$1');
-
-								// Resolve URL. P.S.: For unknown reason this only works
-								// if server returns 'Content-Type: text/html' header
-								let a = xmlhttp.response.createElement('a');
-								a.href = url;
-								let resolvedUrl = a.href;
-
-								// Make sure the absolute URL is actually resolved
-								if (/^https?:\/\//.test(resolvedUrl)) {
-									if (options.numRedirects) {
-										options.numRedirects++;
-									}
-									else {
-										options.numRedirects = 1;
-									}
-
-									// Meta redirect is always GET
-									return Zotero.HTTP.request("GET", resolvedUrl, options)
-										.then(xmlhttp => deferred.resolve(xmlhttp))
-										.catch(e => deferred.reject(e));
-								}
-							}
+					let doc = xmlhttp.response;
+					let url = xmlhttp.responseURL;
+					let resolvedURL;
+					try {
+						resolvedURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, url);
+					}
+					catch (e) {
+						deferred.reject(e);
+						return;
+					}
+					if (resolvedURL) {
+						if (options.numRedirects) {
+							options.numRedirects++;
 						}
+						else {
+							options.numRedirects = 1;
+						}
+
+						// Meta redirect is always GET
+						return Zotero.HTTP.request("GET", resolvedURL, options)
+							.then(xmlhttp => deferred.resolve(xmlhttp))
+							.catch(e => deferred.reject(e));
 					}
 				}
 			}
@@ -682,6 +672,45 @@ Zotero.HTTP = new function() {
 	}
 
+
+	/**
+	 * Get the URL that an HTML document's <meta http-equiv="refresh"> tag redirects to
+	 *
+	 * @param {Document} doc - Parsed HTML document to inspect
+	 * @param {String} url - URL of the document, used as the base when resolving the target
+	 * @return {String|false} - Absolute http(s) URL if the document redirects to another URL
+	 *     within 15 seconds, or false otherwise (including when the target can't be resolved,
+	 *     in which case the error is logged)
+	 */
+	this.getHTMLMetaRefreshURL = function (doc, url) {
+		var meta = doc.querySelector('meta[http-equiv="refresh" i]');
+		if (!meta) {
+			return false;
+		}
+		var content = meta.getAttribute('content');
+		if (!content) {
+			return false;
+		}
+		var parts = content.split(/;\s*url\s*=/i);
+		// If there's a redirect to another URL in 15 seconds or less,
+		// follow it
+		if (parts.length === 2 && parseInt(parts[0], 10) <= 15) {
+			let refreshURL = parts[1].trim().replace(/^(['"])(.+)\1/, '$2');
+			let resolvedURL;
+			try {
+				resolvedURL = Services.io.newURI(url, null, null).resolve(refreshURL);
+			}
+			catch (e) {
+				Zotero.logError(e);
+			}
+			// Make sure the URL is actually resolved
+			if (resolvedURL && /^https?:\/\//.test(resolvedURL)) {
+				return resolvedURL;
+			}
+		}
+		return false;
+	};
+
 
 	/**
 	 * Make a foreground HTTP request in order to trigger a proxy authentication dialog
 	 *
diff --git a/test/tests/attachmentsTest.js b/test/tests/attachmentsTest.js
index 0d64c9775d..2acac904b6 100644
--- a/test/tests/attachmentsTest.js
+++ b/test/tests/attachmentsTest.js
@@ -354,6 +354,7 @@ describe("Zotero.Attachments", function() {
 		var pageURL7 = doiPrefix + doi5;
 		var pageURL8 = 'http://website2/article8';
 		var pageURL9 = 'http://website/article9';
+		var pageURL10 = 'http://website/refresh';
 
 		Components.utils.import("resource://zotero-unit/httpd.js");
 		var httpd;
@@ -534,6 +535,11 @@ describe("Zotero.Attachments", function() {
 					}
 				}
 
+				if (url == pageURL10) {
+					let html = `<html><head><meta http-equiv="refresh" content="2;url=${pageURL1}"/></head><body></body></html>`;
+					return makeHTMLResponseFromType(html, options.responseType, pageURL10);
+				}
+
 				// OA PDF lookup
 				if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) {
 					let json = JSON.parse(options.body);
@@ -830,6 +836,25 @@ describe("Zotero.Attachments", function() {
 			assert.equal(item2.numAttachments(), 1);
 		});
 
+		it("should follow a meta redirect", async function () {
+			var url = pageURL10;
+			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
+			item.setField('title', 'Test');
+			item.setField('url', url);
+			await item.saveTx();
+			var attachment = await Zotero.Attachments.addAvailablePDF(item);
+
+			assert.isTrue(requestStub.calledTwice);
+			assert.equal(requestStub.getCall(0).args[1], pageURL10);
+			assert.equal(requestStub.getCall(1).args[1], pageURL1);
+			assert.ok(attachment);
+			var json = attachment.toJSON();
+			assert.equal(json.url, pdfURL);
+			assert.equal(json.contentType, 'application/pdf');
+			assert.equal(json.filename, 'Test.pdf');
+			assert.equal((await OS.File.stat(attachment.getFilePath())).size, pdfSize);
+		});
+
 		it("should handle a custom resolver in HTML mode", async function () {
 			var doi = doi4;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });