Find Available PDF: Don't mark URLs that redirect as tried

https://forums.zotero.org/discussion/81182
This commit is contained in:
Dan Stillman 2020-02-02 23:34:57 -05:00
parent b4b33c07de
commit 86a5c46b1e
2 changed files with 17 additions and 5 deletions

View file

@ -1666,7 +1666,6 @@ Zotero.Attachments = new function(){
// If URL wasn't available or failed, try to get a URL from a page // If URL wasn't available or failed, try to get a URL from a page
if (pageURL) { if (pageURL) {
addTriedURL(pageURL);
url = null; url = null;
let responseURL; let responseURL;
try { try {
@ -1722,7 +1721,6 @@ Zotero.Attachments = new function(){
skip = true; skip = true;
break; break;
} }
addTriedURL(nextURL);
continue; continue;
} }
@ -1732,10 +1730,11 @@ Zotero.Attachments = new function(){
Zotero.debug("Redirected to " + responseURL); Zotero.debug("Redirected to " + responseURL);
} }
// If HTML, check for a meta redirect
contentType = req.getResponseHeader('Content-Type'); contentType = req.getResponseHeader('Content-Type');
if (contentType.startsWith('text/html')) { if (contentType.startsWith('text/html')) {
doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL); doc = await Zotero.Utilities.Internal.blobToHTMLDocument(blob, responseURL);
// Check for a meta redirect on HTML pages
let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL); let refreshURL = Zotero.HTTP.getHTMLMetaRefreshURL(doc, responseURL);
if (refreshURL) { if (refreshURL) {
if (isTriedURL(refreshURL)) { if (isTriedURL(refreshURL)) {
@ -1745,10 +1744,22 @@ Zotero.Attachments = new function(){
} }
doc = null; doc = null;
nextURL = refreshURL; nextURL = refreshURL;
addTriedURL(nextURL);
continue; continue;
} }
} }
// Don't try this page URL again
//
// We only do this for URLs that don't redirect, since some sites seem to
// use redirects plus cookies for IP-based authentication [1]. The downside
// is that we might follow the same set of redirects more than once, but we
// won't process the final page multiple times, and if a publisher URL does
// redirect that's hopefully a decent indication that a PDF will be found
// the first time around.
//
// [1] https://forums.zotero.org/discussion/81182
addTriedURL(responseURL);
break; break;
} }
if (skip) { if (skip) {
@ -1779,6 +1790,7 @@ Zotero.Attachments = new function(){
Zotero.debug(`PDF at ${url} was already tried -- skipping`); Zotero.debug(`PDF at ${url} was already tried -- skipping`);
continue; continue;
} }
// Don't try this PDF URL again
addTriedURL(url); addTriedURL(url);
// Use the page we loaded as the referrer // Use the page we loaded as the referrer

View file

@ -726,7 +726,7 @@ describe("Zotero.Attachments", function() {
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize); assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
}); });
it("shouldn't try the redirected DOI page again if also in the URL field", async function () { it("shouldn't try the URL-field URL again if it was already checked as the redirected DOI URL", async function () {
var doi = doi4; var doi = doi4;
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' }); var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
item.setField('title', 'Test'); item.setField('title', 'Test');