From 7cf466a0b65d571a795c6ecb58c37afc8fdf2b5b Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Thu, 6 Sep 2018 16:38:28 -0400 Subject: [PATCH] Save OA PDFs when the DOI resolves directly to the file --- chrome/content/zotero/xpcom/attachments.js | 34 +++++- test/tests/attachmentsTest.js | 135 +++++++++++++++------ 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/chrome/content/zotero/xpcom/attachments.js b/chrome/content/zotero/xpcom/attachments.js index de6413f59d..da9780ceb8 100644 --- a/chrome/content/zotero/xpcom/attachments.js +++ b/chrome/content/zotero/xpcom/attachments.js @@ -1287,15 +1287,41 @@ Zotero.Attachments = new function(){ // TODO: Handle redirects manually so we can avoid loading a page we've already // tried - let xmlhttp = await Zotero.HTTP.request("GET", pageURL, { responseType: 'document' }); - responseURL = xmlhttp.responseURL; + let req = await Zotero.HTTP.request("GET", pageURL, { responseType: 'blob' }); + let blob = req.response; + responseURL = req.responseURL; if (pageURL != responseURL) { Zotero.debug("Redirected to " + responseURL); } triedPages.add(responseURL); - let doc = Zotero.HTTP.wrapDocument(xmlhttp.response, responseURL); - url = await Zotero.Utilities.Internal.getPDFFromDocument(doc); + let contentType = req.getResponseHeader('Content-Type'); + // If DOI resolves directly to a PDF, save it to disk + if (contentType == 'application/pdf') { + Zotero.debug("DOI resolves directly to PDF"); + await Zotero.File.putContentsAsync(path, blob); + return { url: responseURL, props: urlResolver }; + } + // Otherwise parse the Blob into a Document (decoded using the full charset name from Content-Type, defaulting to utf-8) and translate that + else if (contentType.startsWith('text/html')) { + let charset = 'utf-8'; + let matches = contentType.match(/charset=([a-z0-9\-_+]+)/i); + if (matches) { + charset = matches[1]; + } + let responseText = await new Promise(function (resolve) { + let fr = new FileReader(); + fr.addEventListener("loadend", function() { + resolve(fr.result); + }); + 
fr.readAsText(blob, charset); + }); + let parser = Components.classes["@mozilla.org/xmlextras/domparser;1"] + .createInstance(Components.interfaces.nsIDOMParser); + let doc = parser.parseFromString(responseText, 'text/html'); + doc = Zotero.HTTP.wrapDocument(doc, responseURL); + url = await Zotero.Utilities.Internal.getPDFFromDocument(doc); + } } catch (e) { Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`); diff --git a/test/tests/attachmentsTest.js b/test/tests/attachmentsTest.js index 16b2ad550c..6622a12188 100644 --- a/test/tests/attachmentsTest.js +++ b/test/tests/attachmentsTest.js @@ -343,25 +343,67 @@ describe("Zotero.Attachments", function() { var doi2 = '10.2222/bcde'; var doi3 = '10.3333/cdef'; var doi4 = '10.4444/defg'; + var doi5 = '10.5555/efgh'; var pageURL1 = 'http://website/article1'; var pageURL2 = 'http://website/article2'; var pageURL3 = 'http://website/article3'; var pageURL4 = 'http://website/article4'; var pageURL5 = `http://website/${doi4}`; var pageURL6 = `http://website/${doi4}/json`; + var pageURL7 = doiPrefix + doi5; Components.utils.import("resource://zotero-unit/httpd.js"); var httpd; var port = 16213; var baseURL = `http://localhost:${port}/`; + var pdfPath = OS.Path.join(getTestDataDirectory().path, 'test.pdf'); var pdfURL = `${baseURL}article1/pdf`; var pdfSize; var requestStub; + function makeGetResponseHeader(headers) { + return function (header) { + if (headers[header] !== undefined) { + return headers[header]; + } + throw new Error("Unimplemented"); + }; + } + + function makeHTMLResponseFromType(html, responseType, responseURL) { + var response; + if (responseType == 'document') { + let parser = new DOMParser(); + let doc = parser.parseFromString(html, 'text/html'); + doc = Zotero.HTTP.wrapDocument(doc, responseURL); + response = doc; + } + else if (responseType == 'blob') { + let blob = new Blob([html], {type: 'text/html'}); + response = blob; + } + else { + throw new Error("Request not mocked"); + } + + return { + 
status: 200, + response, + responseURL, + getResponseHeader: makeGetResponseHeader({ + 'Content-Type': 'text/html' + }) + }; + } + before(async function () { + var pdfBlob = await File.createFromFileName(pdfPath); + var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP); requestStub = sinon.stub(Zotero.HTTP, 'request'); requestStub.callsFake(function (method, url, options) { + Zotero.debug("Intercepting " + method + " " + url); + // Page responses var routes = [ // Page 1 contains a PDF @@ -376,14 +418,14 @@ describe("Zotero.Attachments", function() { [doiPrefix + doi3, pageURL2, false], [pageURL3, pageURL3, true], // DOI 4 redirects to page 4, which doesn't contain a PDF - [doiPrefix + doi4, pageURL4, false] + [doiPrefix + doi4, pageURL4, false], ]; for (let route of routes) { let [expectedURL, responseURL, includePDF] = route; if (url != expectedURL) continue; - var html = ` + let html = ` Page Title @@ -392,19 +434,13 @@ describe("Zotero.Attachments", function() { Body `; - let parser = new DOMParser(); - let doc = parser.parseFromString(html, 'text/html'); - doc = Zotero.HTTP.wrapDocument(doc, responseURL); - return { - status: 200, - response: doc, - responseURL - }; + + return makeHTMLResponseFromType(html, options.responseType, responseURL); } // HTML page with PDF download link if (url == pageURL5) { - var html = ` + let html = ` Page Title @@ -412,31 +448,41 @@ describe("Zotero.Attachments", function() { Download PDF `; - let parser = new DOMParser(); - let doc = parser.parseFromString(html, 'text/html'); - doc = Zotero.HTTP.wrapDocument(doc, pageURL5); - return { - status: 200, - response: doc, - responseURL: pageURL5 - }; + + return makeHTMLResponseFromType(html, options.responseType, pageURL5); } // JSON response with PDF download links if (url == pageURL6) { + let response = { + oa_locations: [ + { + url_for_landing_page: pageURL1 + }, + { + url_for_pdf: pdfURL + } + ] + }; return { status: 200, - response: { - oa_locations: [ - { - url_for_landing_page: 
pageURL1 - }, - { - url_for_pdf: pdfURL - } - ] - }, - responseURL: pageURL6 + response, + responseURL: pageURL6, + getResponseHeader: makeGetResponseHeader({ + 'Content-Type': 'application/json' + }) + }; + } + + // DOI that redirects directly to a PDF + if (url == pageURL7) { + return { + status: 200, + response: pdfBlob, + responseURL: pdfURL, + getResponseHeader: makeGetResponseHeader({ + 'Content-Type': 'application/pdf' + }) }; } @@ -458,15 +504,16 @@ describe("Zotero.Attachments", function() { } return { status: 200, - response + response, + getResponseHeader: makeGetResponseHeader({ + 'Content-Type': 'application/pdf' + }) }; } return origFunc(...arguments); }); - pdfSize = await OS.File.stat( - OS.Path.join(getTestDataDirectory().path, 'test.pdf') - ).size; + pdfSize = await OS.File.stat(pdfPath).size; Zotero.Prefs.clear('findPDFs.resolvers'); }); @@ -492,7 +539,7 @@ describe("Zotero.Attachments", function() { Zotero.HTTP.request.restore(); }); - it("should add a PDF from a resolved DOI", async function () { + it("should add a PDF from a resolved DOI webpage", async function () { var doi = doi1; var item = createUnsavedDataObject('item', { itemType: 'journalArticle' }); item.setField('title', 'Test'); @@ -510,6 +557,24 @@ describe("Zotero.Attachments", function() { assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize); }); + it("should add a PDF from a DOI that resolves directly to the file", async function () { + var doi = doi5; + var item = createUnsavedDataObject('item', { itemType: 'journalArticle' }); + item.setField('title', 'Test'); + item.setField('DOI', doi); + await item.saveTx(); + var attachment = await Zotero.Attachments.addAvailablePDF(item); + + assert.isTrue(requestStub.calledOnce); + assert.isTrue(requestStub.calledWith('GET', 'https://doi.org/' + doi)); + assert.ok(attachment); + var json = attachment.toJSON(); + assert.equal(json.url, pdfURL); + assert.equal(json.contentType, 'application/pdf'); + 
assert.equal(json.filename, 'Test.pdf'); + assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize); + }); + it("should add a PDF from a resolved DOI from the Extra field", async function () { var doi = doi1; var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });