Follow meta redirects for Find Available PDF
This fixes direct and VPN-based retrieval of PDFs for Elsevier (e.g., ScienceDirect) items that have a DOI but no URL, since Elsevier resolves DOIs through an intermediate page.
This commit is contained in:
parent
7d9b94c79e
commit
6137aeddb8
3 changed files with 106 additions and 41 deletions
|
@ -1681,6 +1681,9 @@ Zotero.Attachments = new function(){
|
|||
let redirects = 0;
|
||||
let nextURL = pageURL;
|
||||
let req;
|
||||
let blob;
|
||||
let doc;
|
||||
let contentType;
|
||||
let skip = false;
|
||||
let domains = new Set();
|
||||
while (true) {
|
||||
|
@ -1725,21 +1728,39 @@ Zotero.Attachments = new function(){
|
|||
skip = true;
|
||||
break;
|
||||
}
|
||||
addTriedURL(nextURL);
|
||||
continue;
|
||||
}
|
||||
|
||||
blob = req.response;
|
||||
responseURL = req.responseURL;
|
||||
if (pageURL != responseURL) {
|
||||
Zotero.debug("Redirected to " + responseURL);
|
||||
}
|
||||
|
||||
// If HTML, check for a meta redirect
|
||||
contentType = req.getResponseHeader('Content-Type');
|
||||
if (contentType.startsWith('text/html')) {
|
||||
doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
|
||||
let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL);
|
||||
if (refreshURL) {
|
||||
if (isTriedURL(refreshURL)) {
|
||||
Zotero.debug("Meta refresh URL has already been tried -- skipping");
|
||||
skip = true;
|
||||
break;
|
||||
}
|
||||
doc = null;
|
||||
nextURL = refreshURL;
|
||||
addTriedURL(nextURL);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (skip) {
|
||||
continue;
|
||||
}
|
||||
let blob = req.response;
|
||||
responseURL = req.responseURL;
|
||||
if (pageURL != responseURL) {
|
||||
Zotero.debug("Redirected to " + responseURL);
|
||||
}
|
||||
addTriedURL(responseURL);
|
||||
|
||||
let contentType = req.getResponseHeader('Content-Type');
|
||||
// If DOI resolves directly to a PDF, save it to disk
|
||||
if (contentType == 'application/pdf') {
|
||||
Zotero.debug("URL resolves directly to PDF");
|
||||
|
@ -1747,9 +1768,8 @@ Zotero.Attachments = new function(){
|
|||
await _enforcePDF(path);
|
||||
return { url: responseURL, props: urlResolver };
|
||||
}
|
||||
// Otherwise parse the Blob into a Document and translate that
|
||||
else if (contentType.startsWith('text/html')) {
|
||||
let doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
|
||||
// Otherwise translate the Document we parsed above
|
||||
else if (doc) {
|
||||
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -361,38 +361,28 @@ Zotero.HTTP = new function() {
|
|||
(!options.numRedirects || options.numRedirects < 3)) {
|
||||
let contentType = xmlhttp.getResponseHeader('Content-Type');
|
||||
if (contentType && contentType.startsWith('text/html')) {
|
||||
let meta = xmlhttp.response.querySelector('meta[http-equiv="refresh" i]');
|
||||
if (meta) {
|
||||
let content = meta.getAttribute('content');
|
||||
if (content) {
|
||||
let parts = content.split(/;\s*url=/);
|
||||
// If there's a redirect to another URL in less than 15 seconds,
|
||||
// follow it
|
||||
if (parts.length === 2 && parseInt(parts[0]) <= 15) {
|
||||
let url = parts[1].trim().replace(/^'(.+)'/, '$1');
|
||||
|
||||
// Resolve URL. P.S.: For unknown reason this only works
|
||||
// if server returns 'Content-Type: text/html' header
|
||||
let a = xmlhttp.response.createElement('a');
|
||||
a.href = url;
|
||||
let resolvedUrl = a.href;
|
||||
|
||||
// Make sure the absolute URL is actually resolved
|
||||
if (/^https?:\/\//.test(resolvedUrl)) {
|
||||
if (options.numRedirects) {
|
||||
options.numRedirects++;
|
||||
}
|
||||
else {
|
||||
options.numRedirects = 1;
|
||||
}
|
||||
|
||||
// Meta redirect is always GET
|
||||
return Zotero.HTTP.request("GET", resolvedUrl, options)
|
||||
.then(xmlhttp => deferred.resolve(xmlhttp))
|
||||
.catch(e => deferred.reject(e));
|
||||
}
|
||||
}
|
||||
let doc = xmlhttp.response;
|
||||
let url = xmlhttp.responseURL;
|
||||
let resolvedURL;
|
||||
try {
|
||||
resolvedURL = this.getHTMLMetaRefreshURL(doc, url);
|
||||
}
|
||||
catch (e) {
|
||||
deferred.reject(e);
|
||||
return;
|
||||
}
|
||||
if (resolvedURL) {
|
||||
if (options.numRedirects) {
|
||||
options.numRedirects++;
|
||||
}
|
||||
else {
|
||||
options.numRedirects = 1;
|
||||
}
|
||||
|
||||
// Meta redirect is always GET
|
||||
return Zotero.HTTP.request("GET", resolvedURL, options)
|
||||
.then(xmlhttp => deferred.resolve(xmlhttp))
|
||||
.catch(e => deferred.reject(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -682,6 +672,36 @@ Zotero.HTTP = new function() {
|
|||
}
|
||||
|
||||
|
||||
/**
 * Get the absolute target URL of an HTML meta refresh (<meta http-equiv="refresh">), if the
 * document has one that should be followed
 *
 * @param {Document} doc - Parsed HTML document to inspect
 * @param {String} url - URL of the document, used as the base for resolving the refresh target
 * @return {String|false} - Resolved http(s) URL to follow, or false if there's no meta refresh,
 *     it has no usable 'content' value, its delay is more than 15 seconds, or the target
 *     can't be resolved to an http(s) URL
 */
this.getHTMLMetaRefreshURL = function (doc, url) {
	var meta = doc.querySelector('meta[http-equiv="refresh" i]');
	if (!meta) {
		return false;
	}
	var content = meta.getAttribute('content');
	if (!content) {
		return false;
	}
	// Match the "url=" separator case-insensitively, since pages commonly write "URL="
	var parts = content.split(/;\s*url=/i);
	// If there's a redirect to another URL within 15 seconds, follow it.
	// A non-numeric delay parses to NaN, which fails the comparison.
	if (parts.length === 2 && parseInt(parts[0], 10) <= 15) {
		// Strip single or double quotes around the target URL
		let refreshURL = parts[1].trim().replace(/^(['"])(.+)\1/, '$2');
		let resolvedURL;
		try {
			// Resolve a possibly relative target against the document URL
			resolvedURL = Services.io.newURI(url, null, null).resolve(refreshURL);
		}
		catch (e) {
			// Log and fall through to returning false rather than failing the caller
			Zotero.logError(e);
		}
		// Make sure the URL is actually resolved
		if (resolvedURL && /^https?:\/\//.test(resolvedURL)) {
			return resolvedURL;
		}
	}
	return false;
};
|
||||
|
||||
|
||||
/**
|
||||
* Make a foreground HTTP request in order to trigger a proxy authentication dialog
|
||||
*
|
||||
|
|
|
@ -354,6 +354,7 @@ describe("Zotero.Attachments", function() {
|
|||
var pageURL7 = doiPrefix + doi5;
|
||||
var pageURL8 = 'http://website2/article8';
|
||||
var pageURL9 = 'http://website/article9';
|
||||
var pageURL10 = 'http://website/refresh';
|
||||
|
||||
Components.utils.import("resource://zotero-unit/httpd.js");
|
||||
var httpd;
|
||||
|
@ -534,6 +535,11 @@ describe("Zotero.Attachments", function() {
|
|||
}
|
||||
}
|
||||
|
||||
if (url == pageURL10) {
|
||||
let html = `<html><head><meta http-equiv=\"refresh\" content=\"2;url=${pageURL1}\"/></head><body></body></html>`;
|
||||
return makeHTMLResponseFromType(html, options.responseType, pageURL10);
|
||||
}
|
||||
|
||||
// OA PDF lookup
|
||||
if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) {
|
||||
let json = JSON.parse(options.body);
|
||||
|
@ -830,6 +836,25 @@ describe("Zotero.Attachments", function() {
|
|||
assert.equal(item2.numAttachments(), 1);
|
||||
});
|
||||
|
||||
it("should follow a meta redirect", async function () {
	// pageURL10 is stubbed to return an HTML page whose only content is a meta refresh
	// pointing to pageURL1
	var url = pageURL10;
	var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
	item.setField('title', 'Test');
	item.setField('url', url);
	await item.saveTx();
	var attachment = await Zotero.Attachments.addAvailablePDF(item);

	// One request for the refresh page, then one for the page it redirects to
	assert.isTrue(requestStub.calledTwice);
	assert.equal(requestStub.getCall(0).args[1], pageURL10);
	assert.equal(requestStub.getCall(1).args[1], pageURL1);
	assert.ok(attachment);
	var json = attachment.toJSON();
	assert.equal(json.url, pdfURL);
	assert.equal(json.contentType, 'application/pdf');
	assert.equal(json.filename, 'Test.pdf');
	// Await the stat() promise before reading .size -- member access binds tighter than
	// `await`, so `await OS.File.stat(...).size` would read .size from the promise itself
	var fileInfo = await OS.File.stat(attachment.getFilePath());
	assert.equal(fileInfo.size, pdfSize);
});
|
||||
|
||||
it("should handle a custom resolver in HTML mode", async function () {
|
||||
var doi = doi4;
|
||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue