Follow meta redirects for Find Available PDF

This fixes direct and VPN-based retrieval of PDFs for Elsevier (e.g.,
ScienceDirect) items that have a DOI but no URL, since Elsevier resolves
DOIs through an intermediate page.
This commit is contained in:
Dan Stillman 2018-11-26 00:35:51 -07:00
parent 7d9b94c79e
commit 6137aeddb8
3 changed files with 106 additions and 41 deletions

View file

@ -1681,6 +1681,9 @@ Zotero.Attachments = new function(){
let redirects = 0; let redirects = 0;
let nextURL = pageURL; let nextURL = pageURL;
let req; let req;
let blob;
let doc;
let contentType;
let skip = false; let skip = false;
let domains = new Set(); let domains = new Set();
while (true) { while (true) {
@ -1725,21 +1728,39 @@ Zotero.Attachments = new function(){
skip = true; skip = true;
break; break;
} }
addTriedURL(nextURL);
continue; continue;
} }
blob = req.response;
responseURL = req.responseURL;
if (pageURL != responseURL) {
Zotero.debug("Redirected to " + responseURL);
}
// If HTML, check for a meta redirect
contentType = req.getResponseHeader('Content-Type');
if (contentType.startsWith('text/html')) {
doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL);
if (refreshURL) {
if (isTriedURL(refreshURL)) {
Zotero.debug("Meta refresh URL has already been tried -- skipping");
skip = true;
break;
}
doc = null;
nextURL = refreshURL;
addTriedURL(nextURL);
continue;
}
}
break; break;
} }
if (skip) { if (skip) {
continue; continue;
} }
let blob = req.response;
responseURL = req.responseURL;
if (pageURL != responseURL) {
Zotero.debug("Redirected to " + responseURL);
}
addTriedURL(responseURL);
let contentType = req.getResponseHeader('Content-Type');
// If DOI resolves directly to a PDF, save it to disk // If DOI resolves directly to a PDF, save it to disk
if (contentType == 'application/pdf') { if (contentType == 'application/pdf') {
Zotero.debug("URL resolves directly to PDF"); Zotero.debug("URL resolves directly to PDF");
@ -1747,9 +1768,8 @@ Zotero.Attachments = new function(){
await _enforcePDF(path); await _enforcePDF(path);
return { url: responseURL, props: urlResolver }; return { url: responseURL, props: urlResolver };
} }
// Otherwise parse the Blob into a Document and translate that // Otherwise translate the Document we parsed above
else if (contentType.startsWith('text/html')) { else if (doc) {
let doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc); url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
} }
} }

View file

@ -361,38 +361,28 @@ Zotero.HTTP = new function() {
(!options.numRedirects || options.numRedirects < 3)) { (!options.numRedirects || options.numRedirects < 3)) {
let contentType = xmlhttp.getResponseHeader('Content-Type'); let contentType = xmlhttp.getResponseHeader('Content-Type');
if (contentType && contentType.startsWith('text/html')) { if (contentType && contentType.startsWith('text/html')) {
let meta = xmlhttp.response.querySelector('meta[http-equiv="refresh" i]'); let doc = xmlhttp.response;
if (meta) { let url = xmlhttp.responseURL;
let content = meta.getAttribute('content'); let resolvedURL;
if (content) { try {
let parts = content.split(/;\s*url=/); resolvedURL = this.getHTMLMetaRefreshURL(doc, url);
// If there's a redirect to another URL in less than 15 seconds, }
// follow it catch (e) {
if (parts.length === 2 && parseInt(parts[0]) <= 15) { deferred.reject(e);
let url = parts[1].trim().replace(/^'(.+)'/, '$1'); return;
}
// Resolve URL. P.S.: For unknown reason this only works if (resolvedURL) {
// if server returns 'Content-Type: text/html' header if (options.numRedirects) {
let a = xmlhttp.response.createElement('a'); options.numRedirects++;
a.href = url;
let resolvedUrl = a.href;
// Make sure the absolute URL is actually resolved
if (/^https?:\/\//.test(resolvedUrl)) {
if (options.numRedirects) {
options.numRedirects++;
}
else {
options.numRedirects = 1;
}
// Meta redirect is always GET
return Zotero.HTTP.request("GET", resolvedUrl, options)
.then(xmlhttp => deferred.resolve(xmlhttp))
.catch(e => deferred.reject(e));
}
}
} }
else {
options.numRedirects = 1;
}
// Meta redirect is always GET
return Zotero.HTTP.request("GET", resolvedURL, options)
.then(xmlhttp => deferred.resolve(xmlhttp))
.catch(e => deferred.reject(e));
} }
} }
} }
@ -682,6 +672,36 @@ Zotero.HTTP = new function() {
} }
this.getHTMLMetaRefreshURL = function (doc, url) {
var meta = doc.querySelector('meta[http-equiv="refresh" i]');
if (!meta) {
return false;
}
var content = meta.getAttribute('content');
if (!content) {
return false;
}
var parts = content.split(/;\s*url=/);
// If there's a redirect to another URL in less than 15 seconds,
// follow it
if (parts.length === 2 && parseInt(parts[0]) <= 15) {
let refreshURL = parts[1].trim().replace(/^'(.+)'/, '$1');
let resolvedURL;
try {
resolvedURL = Services.io.newURI(url, null, null).resolve(refreshURL);
}
catch (e) {
Zotero.logError(e);
}
// Make sure the URL is actually resolved
if (resolvedURL && /^https?:\/\//.test(resolvedURL)) {
return resolvedURL;
}
}
return false;
};
/** /**
* Make a foreground HTTP request in order to trigger a proxy authentication dialog * Make a foreground HTTP request in order to trigger a proxy authentication dialog
* *

View file

@ -354,6 +354,7 @@ describe("Zotero.Attachments", function() {
var pageURL7 = doiPrefix + doi5; var pageURL7 = doiPrefix + doi5;
var pageURL8 = 'http://website2/article8'; var pageURL8 = 'http://website2/article8';
var pageURL9 = 'http://website/article9'; var pageURL9 = 'http://website/article9';
var pageURL10 = 'http://website/refresh';
Components.utils.import("resource://zotero-unit/httpd.js"); Components.utils.import("resource://zotero-unit/httpd.js");
var httpd; var httpd;
@ -534,6 +535,11 @@ describe("Zotero.Attachments", function() {
} }
} }
if (url == pageURL10) {
let html = `<html><head><meta http-equiv=\"refresh\" content=\"2;url=${pageURL1}\"/></head><body></body></html>`;
return makeHTMLResponseFromType(html, options.responseType, pageURL10);
}
// OA PDF lookup // OA PDF lookup
if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) { if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) {
let json = JSON.parse(options.body); let json = JSON.parse(options.body);
@ -830,6 +836,25 @@ describe("Zotero.Attachments", function() {
assert.equal(item2.numAttachments(), 1); assert.equal(item2.numAttachments(), 1);
}); });
it("should follow a meta redirect", async function () {
var url = pageURL10;
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
item.setField('title', 'Test');
item.setField('url', url);
await item.saveTx();
var attachment = await Zotero.Attachments.addAvailablePDF(item);
assert.isTrue(requestStub.calledTwice);
assert.equal(requestStub.getCall(0).args[1], pageURL10)
assert.equal(requestStub.getCall(1).args[1], pageURL1)
assert.ok(attachment);
var json = attachment.toJSON();
assert.equal(json.url, pdfURL);
assert.equal(json.contentType, 'application/pdf');
assert.equal(json.filename, 'Test.pdf');
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
});
it("should handle a custom resolver in HTML mode", async function () { it("should handle a custom resolver in HTML mode", async function () {
var doi = doi4; var doi = doi4;
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' }); var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });