Save OA PDFs when the DOI resolves directly to the file

2018-09-06 16:38:28 -04:00 · 2018-09-06 16:38:28 -04:00 · 7cf466a0b6
commit 7cf466a0b6
parent 18821984e0
2 changed files with 130 additions and 39 deletions
--- a/chrome/content/zotero/xpcom/attachments.js
+++ b/chrome/content/zotero/xpcom/attachments.js
@ -1287,15 +1287,41 @@ Zotero.Attachments = new function(){
 					// TODO: Handle redirects manually so we can avoid loading a page we've already
 					// tried
-					let xmlhttp = await Zotero.HTTP.request("GET", pageURL, { responseType: 'document' });
+					let req = await Zotero.HTTP.request("GET", pageURL, { responseType: 'blob' });
-					responseURL = xmlhttp.responseURL;
+					let blob = req.response;
 					responseURL = req.responseURL;
 					if (pageURL != responseURL) {
 						Zotero.debug("Redirected to " + responseURL);
 					}
 					triedPages.add(responseURL);
 					let doc = Zotero.HTTP.wrapDocument(xmlhttp.response, responseURL);
-					url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
+					let contentType = req.getResponseHeader('Content-Type');
 					// If DOI resolves directly to a PDF, save it to disk
 					if (contentType == 'application/pdf') {
 						Zotero.debug("DOI resolves directly to PDF");
 						await Zotero.File.putContentsAsync(path, blob);
 						return { url: responseURL, props: urlResolver };
 					}
 					// Otherwise parse the Blob into a Document and translate that
 					else if (contentType.startsWith('text/html')) {
 						let charset = 'utf-8';
 						let matches = contentType.match(/charset=([a-z0-9\-_+])/i);
 						if (matches) {
 							charset = matches[1];
 						}
 						let responseText = await new Promise(function (resolve) {
 							let fr = new FileReader();
 							fr.addEventListener("loadend", function() {
 								resolve(fr.result);
 							});
 							fr.readAsText(blob, charset);
 						});
 						let parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
 						 .createInstance(Components.interfaces.nsIDOMParser);
 						let doc = parser.parseFromString(responseText, 'text/html');
 						doc = Zotero.HTTP.wrapDocument(doc, responseURL);
 						url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
 					}
 				}
 				catch (e) {
 					Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`);
--- a/test/tests/attachmentsTest.js
+++ b/test/tests/attachmentsTest.js
@ -343,25 +343,67 @@ describe("Zotero.Attachments", function() {
 		var doi2 = '10.2222/bcde';
 		var doi3 = '10.3333/cdef';
 		var doi4 = '10.4444/defg';
 		var doi5 = '10.5555/efgh';
 		var pageURL1 = 'http://website/article1';
 		var pageURL2 = 'http://website/article2';
 		var pageURL3 = 'http://website/article3';
 		var pageURL4 = 'http://website/article4';
 		var pageURL5 = `http://website/${doi4}`;
 		var pageURL6 = `http://website/${doi4}/json`;
 		var pageURL7 = doiPrefix + doi5;
 		Components.utils.import("resource://zotero-unit/httpd.js");
 		var httpd;
 		var port = 16213;
 		var baseURL = `http://localhost:${port}/`;
 		var pdfPath = OS.Path.join(getTestDataDirectory().path, 'test.pdf');
 		var pdfURL = `${baseURL}article1/pdf`;
 		var pdfSize;
 		var requestStub;
 		/**
 		 * Create a stand-in for XMLHttpRequest#getResponseHeader() backed by a fixed header map
 		 *
 		 * A header name is matched exactly first and then case-insensitively, since response
 		 * header lookups on a real request ignore case. Only the map's own keys are consulted,
 		 * and a key whose value is undefined is treated as absent. Anything else throws rather
 		 * than returning null.
 		 *
 		 * @param {Object} headers - Header names mapped to the values to return
 		 * @return {Function} - function (header) that returns the mocked value or throws
 		 *     Error("Unimplemented") for a header that isn't in the map
 		 */
 		function makeGetResponseHeader(headers) {
 			var hasOwn = Object.prototype.hasOwnProperty;
 			return function (header) {
 				// An exact match wins, so existing callers get the same value as before
 				if (hasOwn.call(headers, header) && headers[header] !== undefined) {
 					return headers[header];
 				}
 				// Fall back to a case-insensitive match (e.g., 'content-type' vs. 'Content-Type')
 				let wanted = String(header).toLowerCase();
 				for (let name of Object.keys(headers)) {
 					if (name.toLowerCase() == wanted && headers[name] !== undefined) {
 						return headers[name];
 					}
 				}
 				throw new Error("Unimplemented");
 			};
 		}
 		/**
 		 * Build a mocked request result for an HTML page, with the body in the form the
 		 * caller asked for via `responseType`
 		 *
 		 * @param {String} html - Page markup
 		 * @param {String} responseType - 'document' (parsed and wrapped document) or 'blob'
 		 * @param {String} responseURL - URL to report as the final URL of the response
 		 * @return {Object} - { status, response, responseURL, getResponseHeader }, with a
 		 *     Content-Type of 'text/html'
 		 * @throws {Error} "Request not mocked" for any other response type
 		 */
 		function makeHTMLResponseFromType(html, responseType, responseURL) {
 			var wantsDocument = responseType == 'document';
 			if (!wantsDocument && responseType != 'blob') {
 				throw new Error("Request not mocked");
 			}
 			// Either a wrapped DOM document or the raw markup as a Blob
 			var body = wantsDocument
 				? Zotero.HTTP.wrapDocument(
 					new DOMParser().parseFromString(html, 'text/html'),
 					responseURL
 				)
 				: new Blob([html], {type: 'text/html'});
 			var mockedHeaders = {
 				'Content-Type': 'text/html'
 			};
 			return {
 				status: 200,
 				response: body,
 				responseURL: responseURL,
 				getResponseHeader: makeGetResponseHeader(mockedHeaders)
 			};
 		}
 		before(async function () {
 			var pdfBlob = await File.createFromFileName(pdfPath);
 			var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP);
 			requestStub = sinon.stub(Zotero.HTTP, 'request');
 			requestStub.callsFake(function (method, url, options) {
 				Zotero.debug("Intercepting " + method + " " + url);
 				// Page responses
 				var routes = [
 					// Page 1 contains a PDF
@ -376,14 +418,14 @@ describe("Zotero.Attachments", function() {
 					[doiPrefix + doi3, pageURL2, false],
 					[pageURL3, pageURL3, true],
 					// DOI 4 redirects to page 4, which doesn't contain a PDF
-					[doiPrefix + doi4, pageURL4, false]
+					[doiPrefix + doi4, pageURL4, false],
 				];
 				for (let route of routes) {
 					let [expectedURL, responseURL, includePDF] = route;
 					if (url != expectedURL) continue;
-					var html = `<html>
+					let html = `<html>
 						<head>
 							<title>Page Title</title>
 							<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
@ -392,19 +434,13 @@ describe("Zotero.Attachments", function() {
 						</head>
 						<body>Body</body>
 					</html>`;
-					let parser = new DOMParser();
+					
-					let doc = parser.parseFromString(html, 'text/html');
+					return makeHTMLResponseFromType(html, options.responseType, responseURL);
 					doc = Zotero.HTTP.wrapDocument(doc, responseURL);
 					return {
 						status: 200,
 						response: doc,
 						responseURL
 					};
 				}
 				// HTML page with PDF download link
 				if (url == pageURL5) {
-					var html = `<html>
+					let html = `<html>
 						<head>
 							<title>Page Title</title>
 						</head>
@ -412,31 +448,41 @@ describe("Zotero.Attachments", function() {
 							<a id="pdf-link" href="${pdfURL}">Download PDF</a>
 						</body>
 					</html>`;
-					let parser = new DOMParser();
+					
-					let doc = parser.parseFromString(html, 'text/html');
+					return makeHTMLResponseFromType(html, options.responseType, pageURL5);
 					doc = Zotero.HTTP.wrapDocument(doc, pageURL5);
 					return {
 						status: 200,
 						response: doc,
 						responseURL: pageURL5
 					};
 				}
 				// JSON response with PDF download links
 				if (url == pageURL6) {
 					let response = {
 						oa_locations: [
 							{
 								url_for_landing_page: pageURL1
 							},
 							{
 								url_for_pdf: pdfURL
 							}
 						]
 					};
 					return {
 						status: 200,
-						response: {
+						response,
-							oa_locations: [
+						responseURL: pageURL6,
-								{
+						getResponseHeader: makeGetResponseHeader({
-									url_for_landing_page: pageURL1
+							'Content-Type': 'application/json'
-								},
+						})
-								{
+					};
-									url_for_pdf: pdfURL
+				}
-								}
+				
-							]
+				// DOI that redirects directly to a PDF
-						},
+				if (url == pageURL7) {
-						responseURL: pageURL6
+					return {
 						status: 200,
 						response: pdfBlob,
 						responseURL: pdfURL,
 						getResponseHeader: makeGetResponseHeader({
 							'Content-Type': 'application/pdf'
 						})
 					};
 				}
@ -458,15 +504,16 @@ describe("Zotero.Attachments", function() {
 					}
 					return {
 						status: 200,
-						response
+						response,
 						getResponseHeader: makeGetResponseHeader({
 							'Content-Type': 'application/pdf'
 						})
 					};
 				}
 				return origFunc(...arguments);
 			});
-			pdfSize = await OS.File.stat(
+			pdfSize = await OS.File.stat(pdfPath).size;
 				OS.Path.join(getTestDataDirectory().path, 'test.pdf')
 			).size;
 			Zotero.Prefs.clear('findPDFs.resolvers');
 		});
@ -492,7 +539,7 @@ describe("Zotero.Attachments", function() {
 			Zotero.HTTP.request.restore();
 		});
-		it("should add a PDF from a resolved DOI", async function () {
+		it("should add a PDF from a resolved DOI webpage", async function () {
 			var doi = doi1;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
 			item.setField('title', 'Test');
@ -510,6 +557,24 @@ describe("Zotero.Attachments", function() {
 			assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
 		});
 		// Covers the case where the DOI URL itself serves the PDF rather than a landing page
 		it("should add a PDF from a DOI that resolves directly to the file", async function () {
 			var doi = doi5;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
 			item.setField('title', 'Test');
 			item.setField('DOI', doi);
 			await item.saveTx();
 			var attachment = await Zotero.Attachments.addAvailablePDF(item);
 			// The response to the DOI request is the PDF, so a single request is expected
 			assert.isTrue(requestStub.calledOnce);
 			assert.isTrue(requestStub.calledWith('GET', 'https://doi.org/' + doi));
 			assert.ok(attachment);
 			var json = attachment.toJSON();
 			assert.equal(json.url, pdfURL);
 			assert.equal(json.contentType, 'application/pdf');
 			assert.equal(json.filename, 'Test.pdf');
 			// `await promise.size` awaits the promise's nonexistent `size` property and yields
 			// undefined, which makes a size comparison pass vacuously. Await the stat result
 			// first, and read the expected size straight from the fixture file.
 			var savedSize = (await OS.File.stat(attachment.getFilePath())).size;
 			var expectedSize = (await OS.File.stat(pdfPath)).size;
 			assert.isAbove(expectedSize, 0);
 			assert.equal(savedSize, expectedSize);
 		});
 		it("should add a PDF from a resolved DOI from the Extra field", async function () {
 			var doi = doi1;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });