Follow meta redirects for Find Available PDF
This fixes direct and VPN-based retrieval of PDFs for Elsevier (e.g., ScienceDirect) items that have a DOI but no URL, since Elsevier resolves DOIs through an intermediate page.
This commit is contained in:
parent
7d9b94c79e
commit
6137aeddb8
3 changed files with 106 additions and 41 deletions
|
@ -1681,6 +1681,9 @@ Zotero.Attachments = new function(){
|
||||||
let redirects = 0;
|
let redirects = 0;
|
||||||
let nextURL = pageURL;
|
let nextURL = pageURL;
|
||||||
let req;
|
let req;
|
||||||
|
let blob;
|
||||||
|
let doc;
|
||||||
|
let contentType;
|
||||||
let skip = false;
|
let skip = false;
|
||||||
let domains = new Set();
|
let domains = new Set();
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -1725,21 +1728,39 @@ Zotero.Attachments = new function(){
|
||||||
skip = true;
|
skip = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
addTriedURL(nextURL);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
blob = req.response;
|
||||||
|
responseURL = req.responseURL;
|
||||||
|
if (pageURL != responseURL) {
|
||||||
|
Zotero.debug("Redirected to " + responseURL);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If HTML, check for a meta redirect
|
||||||
|
contentType = req.getResponseHeader('Content-Type');
|
||||||
|
if (contentType.startsWith('text/html')) {
|
||||||
|
doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
|
||||||
|
let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL);
|
||||||
|
if (refreshURL) {
|
||||||
|
if (isTriedURL(refreshURL)) {
|
||||||
|
Zotero.debug("Meta refresh URL has already been tried -- skipping");
|
||||||
|
skip = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
doc = null;
|
||||||
|
nextURL = refreshURL;
|
||||||
|
addTriedURL(nextURL);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (skip) {
|
if (skip) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let blob = req.response;
|
|
||||||
responseURL = req.responseURL;
|
|
||||||
if (pageURL != responseURL) {
|
|
||||||
Zotero.debug("Redirected to " + responseURL);
|
|
||||||
}
|
|
||||||
addTriedURL(responseURL);
|
|
||||||
|
|
||||||
let contentType = req.getResponseHeader('Content-Type');
|
|
||||||
// If DOI resolves directly to a PDF, save it to disk
|
// If DOI resolves directly to a PDF, save it to disk
|
||||||
if (contentType == 'application/pdf') {
|
if (contentType == 'application/pdf') {
|
||||||
Zotero.debug("URL resolves directly to PDF");
|
Zotero.debug("URL resolves directly to PDF");
|
||||||
|
@ -1747,9 +1768,8 @@ Zotero.Attachments = new function(){
|
||||||
await _enforcePDF(path);
|
await _enforcePDF(path);
|
||||||
return { url: responseURL, props: urlResolver };
|
return { url: responseURL, props: urlResolver };
|
||||||
}
|
}
|
||||||
// Otherwise parse the Blob into a Document and translate that
|
// Otherwise translate the Document we parsed above
|
||||||
else if (contentType.startsWith('text/html')) {
|
else if (doc) {
|
||||||
let doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
|
|
||||||
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -361,38 +361,28 @@ Zotero.HTTP = new function() {
|
||||||
(!options.numRedirects || options.numRedirects < 3)) {
|
(!options.numRedirects || options.numRedirects < 3)) {
|
||||||
let contentType = xmlhttp.getResponseHeader('Content-Type');
|
let contentType = xmlhttp.getResponseHeader('Content-Type');
|
||||||
if (contentType && contentType.startsWith('text/html')) {
|
if (contentType && contentType.startsWith('text/html')) {
|
||||||
let meta = xmlhttp.response.querySelector('meta[http-equiv="refresh" i]');
|
let doc = xmlhttp.response;
|
||||||
if (meta) {
|
let url = xmlhttp.responseURL;
|
||||||
let content = meta.getAttribute('content');
|
let resolvedURL;
|
||||||
if (content) {
|
try {
|
||||||
let parts = content.split(/;\s*url=/);
|
resolvedURL = this.getHTMLMetaRefreshURL(doc, url);
|
||||||
// If there's a redirect to another URL in less than 15 seconds,
|
}
|
||||||
// follow it
|
catch (e) {
|
||||||
if (parts.length === 2 && parseInt(parts[0]) <= 15) {
|
deferred.reject(e);
|
||||||
let url = parts[1].trim().replace(/^'(.+)'/, '$1');
|
return;
|
||||||
|
}
|
||||||
// Resolve URL. P.S.: For unknown reason this only works
|
if (resolvedURL) {
|
||||||
// if server returns 'Content-Type: text/html' header
|
if (options.numRedirects) {
|
||||||
let a = xmlhttp.response.createElement('a');
|
options.numRedirects++;
|
||||||
a.href = url;
|
|
||||||
let resolvedUrl = a.href;
|
|
||||||
|
|
||||||
// Make sure the absolute URL is actually resolved
|
|
||||||
if (/^https?:\/\//.test(resolvedUrl)) {
|
|
||||||
if (options.numRedirects) {
|
|
||||||
options.numRedirects++;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
options.numRedirects = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Meta redirect is always GET
|
|
||||||
return Zotero.HTTP.request("GET", resolvedUrl, options)
|
|
||||||
.then(xmlhttp => deferred.resolve(xmlhttp))
|
|
||||||
.catch(e => deferred.reject(e));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
options.numRedirects = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Meta redirect is always GET
|
||||||
|
return Zotero.HTTP.request("GET", resolvedURL, options)
|
||||||
|
.then(xmlhttp => deferred.resolve(xmlhttp))
|
||||||
|
.catch(e => deferred.reject(e));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -682,6 +672,36 @@ Zotero.HTTP = new function() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Get the target URL of an HTML meta refresh redirect, if the document has one
 *
 * @param {Document} doc - Parsed HTML document to search for a <meta http-equiv="refresh"> tag
 * @param {String} url - URL of the document, used as the base for resolving the redirect target
 * @return {String|false} - Absolute http(s) URL to redirect to, or false if there is no meta
 *     refresh tag, it has no target URL, its delay is more than 15 seconds, or the target
 *     can't be resolved to an http(s) URL
 */
this.getHTMLMetaRefreshURL = function (doc, url) {
	var meta = doc.querySelector('meta[http-equiv="refresh" i]');
	if (!meta) {
		return false;
	}
	var content = meta.getAttribute('content');
	if (!content) {
		return false;
	}
	
	// The content attribute looks like "<seconds>; url=<target>". Match the "url" keyword
	// case-insensitively and allow whitespace around "=", since pages commonly write "URL=".
	// Only the first separator is used, so a target that itself contains ";url=" still works.
	var separator = /;\s*url\s*=\s*/i.exec(content);
	if (!separator) {
		return false;
	}
	
	// If there's a redirect to another URL in less than 15 seconds, follow it. A non-numeric
	// delay parses to NaN, which fails the comparison and is treated as no redirect.
	var delay = parseInt(content.substring(0, separator.index), 10);
	if (!(delay <= 15)) {
		return false;
	}
	
	// Strip optional single or double quotes around the target
	var refreshURL = content.substring(separator.index + separator[0].length)
		.trim()
		.replace(/^(['"])(.+)\1/, '$2');
	
	var resolvedURL;
	try {
		resolvedURL = Services.io.newURI(url, null, null).resolve(refreshURL);
	}
	catch (e) {
		// An unparseable base URL isn't fatal; log it and report no redirect
		Zotero.logError(e);
		return false;
	}
	
	// Make sure the URL is actually resolved
	if (resolvedURL && /^https?:\/\//.test(resolvedURL)) {
		return resolvedURL;
	}
	return false;
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make a foreground HTTP request in order to trigger a proxy authentication dialog
|
* Make a foreground HTTP request in order to trigger a proxy authentication dialog
|
||||||
*
|
*
|
||||||
|
|
|
@ -354,6 +354,7 @@ describe("Zotero.Attachments", function() {
|
||||||
var pageURL7 = doiPrefix + doi5;
|
var pageURL7 = doiPrefix + doi5;
|
||||||
var pageURL8 = 'http://website2/article8';
|
var pageURL8 = 'http://website2/article8';
|
||||||
var pageURL9 = 'http://website/article9';
|
var pageURL9 = 'http://website/article9';
|
||||||
|
var pageURL10 = 'http://website/refresh';
|
||||||
|
|
||||||
Components.utils.import("resource://zotero-unit/httpd.js");
|
Components.utils.import("resource://zotero-unit/httpd.js");
|
||||||
var httpd;
|
var httpd;
|
||||||
|
@ -534,6 +535,11 @@ describe("Zotero.Attachments", function() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (url == pageURL10) {
|
||||||
|
let html = `<html><head><meta http-equiv=\"refresh\" content=\"2;url=${pageURL1}\"/></head><body></body></html>`;
|
||||||
|
return makeHTMLResponseFromType(html, options.responseType, pageURL10);
|
||||||
|
}
|
||||||
|
|
||||||
// OA PDF lookup
|
// OA PDF lookup
|
||||||
if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) {
|
if (url.startsWith(ZOTERO_CONFIG.SERVICES_URL)) {
|
||||||
let json = JSON.parse(options.body);
|
let json = JSON.parse(options.body);
|
||||||
|
@ -830,6 +836,25 @@ describe("Zotero.Attachments", function() {
|
||||||
assert.equal(item2.numAttachments(), 1);
|
assert.equal(item2.numAttachments(), 1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Find Available PDF should follow an HTML meta refresh from the item's URL to the page
// that actually links to the PDF
it("should follow a meta redirect", async function () {
	var url = pageURL10;
	var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
	item.setField('title', 'Test');
	item.setField('url', url);
	await item.saveTx();
	var attachment = await Zotero.Attachments.addAvailablePDF(item);
	
	// One request for the meta refresh page and one for the page it redirects to
	assert.isTrue(requestStub.calledTwice);
	assert.equal(requestStub.getCall(0).args[1], pageURL10);
	assert.equal(requestStub.getCall(1).args[1], pageURL1);
	
	assert.ok(attachment);
	var json = attachment.toJSON();
	assert.equal(json.url, pdfURL);
	assert.equal(json.contentType, 'application/pdf');
	assert.equal(json.filename, 'Test.pdf');
	// Await stat() before reading .size -- `await promise.size` reads .size off the promise
	// itself and always yields undefined, which made this check meaningless.
	// NOTE(review): assumes pdfSize holds the numeric byte size of the test PDF -- confirm
	// where it is assigned
	var stat = await OS.File.stat(attachment.getFilePath());
	assert.equal(stat.size, pdfSize);
});
|
||||||
|
|
||||||
it("should handle a custom resolver in HTML mode", async function () {
|
it("should handle a custom resolver in HTML mode", async function () {
|
||||||
var doi = doi4;
|
var doi = doi4;
|
||||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue