Save OA PDFs when the DOI resolves directly to the file

2018-09-06 16:38:28 -04:00 · 2018-09-06 16:38:28 -04:00 · 7cf466a0b6
commit 7cf466a0b6
parent 18821984e0
2 changed files with 130 additions and 39 deletions
--- a/chrome/content/zotero/xpcom/attachments.js
+++ b/chrome/content/zotero/xpcom/attachments.js
@ -1287,16 +1287,42 @@ Zotero.Attachments = new function(){
 					
 					// TODO: Handle redirects manually so we can avoid loading a page we've already
 					// tried
-					let xmlhttp = await Zotero.HTTP.request("GET", pageURL, { responseType: 'document' });
-					responseURL = xmlhttp.responseURL;
+					let req = await Zotero.HTTP.request("GET", pageURL, { responseType: 'blob' });
+					let blob = req.response;
+					responseURL = req.responseURL;
 					if (pageURL != responseURL) {
 						Zotero.debug("Redirected to " + responseURL);
 					}
 					triedPages.add(responseURL);
-					let doc = Zotero.HTTP.wrapDocument(xmlhttp.response, responseURL);
 					
+					let contentType = req.getResponseHeader('Content-Type');
+					// If DOI resolves directly to a PDF, save it to disk
+					if (contentType == 'application/pdf') {
+						Zotero.debug("DOI resolves directly to PDF");
+						await Zotero.File.putContentsAsync(path, blob);
+						return { url: responseURL, props: urlResolver };
+					}
+					// Otherwise parse the Blob into a Document and translate that
+					else if (contentType.startsWith('text/html')) {
+						let charset = 'utf-8';
+						let matches = contentType.match(/charset=([a-z0-9\-_+])/i);
+						if (matches) {
+							charset = matches[1];
+						}
+						let responseText = await new Promise(function (resolve) {
+							let fr = new FileReader();
+							fr.addEventListener("loadend", function() {
+								resolve(fr.result);
+							});
+							fr.readAsText(blob, charset);
+						});
+						let parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
+						 .createInstance(Components.interfaces.nsIDOMParser);
+						let doc = parser.parseFromString(responseText, 'text/html');
+						doc = Zotero.HTTP.wrapDocument(doc, responseURL);
 						url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
 					}
+				}
 				catch (e) {
 					Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`);
 					continue;
--- a/test/tests/attachmentsTest.js
+++ b/test/tests/attachmentsTest.js
@ -343,25 +343,67 @@ describe("Zotero.Attachments", function() {
 		var doi2 = '10.2222/bcde';
 		var doi3 = '10.3333/cdef';
 		var doi4 = '10.4444/defg';
+		var doi5 = '10.5555/efgh';
 		var pageURL1 = 'http://website/article1';
 		var pageURL2 = 'http://website/article2';
 		var pageURL3 = 'http://website/article3';
 		var pageURL4 = 'http://website/article4';
 		var pageURL5 = `http://website/${doi4}`;
 		var pageURL6 = `http://website/${doi4}/json`;
+		var pageURL7 = doiPrefix + doi5;
 		
 		Components.utils.import("resource://zotero-unit/httpd.js");
 		var httpd;
 		var port = 16213;
 		var baseURL = `http://localhost:${port}/`;
+		var pdfPath = OS.Path.join(getTestDataDirectory().path, 'test.pdf');
 		var pdfURL = `${baseURL}article1/pdf`;
 		var pdfSize;
 		var requestStub;
 		
+		function makeGetResponseHeader(headers) {
+			return function (header) {
+				if (headers[header] !== undefined) {
+					return headers[header];
+				}
+				throw new Error("Unimplemented");
+			};
+		}
+		
+		function makeHTMLResponseFromType(html, responseType, responseURL) {
+			var response;
+			if (responseType == 'document') {
+				let parser = new DOMParser();
+				let doc = parser.parseFromString(html, 'text/html');
+				doc = Zotero.HTTP.wrapDocument(doc, responseURL);
+				response = doc;
+			}
+			else if (responseType == 'blob') {
+				let blob = new Blob([html], {type: 'text/html'});
+				response = blob;
+			}
+			else {
+				throw new Error("Request not mocked");
+			}
+			
+			return {
+				status: 200,
+				response,
+				responseURL,
+				getResponseHeader: makeGetResponseHeader({
+					'Content-Type': 'text/html'
+				})
+			};
+		}
+		
 		before(async function () {
+			var pdfBlob = await File.createFromFileName(pdfPath);
+			
 			var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP);
 			requestStub = sinon.stub(Zotero.HTTP, 'request');
 			requestStub.callsFake(function (method, url, options) {
+				Zotero.debug("Intercepting " + method + " " + url);
+				
 				// Page responses
 				var routes = [
 					// Page 1 contains a PDF
@ -376,14 +418,14 @@ describe("Zotero.Attachments", function() {
 					[doiPrefix + doi3, pageURL2, false],
 					[pageURL3, pageURL3, true],
 					// DOI 4 redirects to page 4, which doesn't contain a PDF
-					[doiPrefix + doi4, pageURL4, false]
+					[doiPrefix + doi4, pageURL4, false],
 				];
 				for (let route of routes) {
 					let [expectedURL, responseURL, includePDF] = route;
 					
 					if (url != expectedURL) continue;
 					
-					var html = `<html>
+					let html = `<html>
 						<head>
 							<title>Page Title</title>
 							<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
@ -392,19 +434,13 @@ describe("Zotero.Attachments", function() {
 						</head>
 						<body>Body</body>
 					</html>`;
-					let parser = new DOMParser();
-					let doc = parser.parseFromString(html, 'text/html');
-					doc = Zotero.HTTP.wrapDocument(doc, responseURL);
-					return {
-						status: 200,
-						response: doc,
-						responseURL
-					};
+					
+					return makeHTMLResponseFromType(html, options.responseType, responseURL);
 				}
 				
 				// HTML page with PDF download link
 				if (url == pageURL5) {
-					var html = `<html>
+					let html = `<html>
 						<head>
 							<title>Page Title</title>
 						</head>
@ -412,21 +448,13 @@ describe("Zotero.Attachments", function() {
 							<a id="pdf-link" href="${pdfURL}">Download PDF</a>
 						</body>
 					</html>`;
-					let parser = new DOMParser();
-					let doc = parser.parseFromString(html, 'text/html');
-					doc = Zotero.HTTP.wrapDocument(doc, pageURL5);
-					return {
-						status: 200,
-						response: doc,
-						responseURL: pageURL5
-					};
+					
+					return makeHTMLResponseFromType(html, options.responseType, pageURL5);
 				}
 				
 				// JSON response with PDF download links
 				if (url == pageURL6) {
-					return {
-						status: 200,
-						response: {
+					let response = {
 						oa_locations: [
 							{
 								url_for_landing_page: pageURL1
@ -435,8 +463,26 @@ describe("Zotero.Attachments", function() {
 								url_for_pdf: pdfURL
 							}
 						]
-						},
-						responseURL: pageURL6
+					};
+					return {
+						status: 200,
+						response,
+						responseURL: pageURL6,
+						getResponseHeader: makeGetResponseHeader({
+							'Content-Type': 'application/json'
+						})
+					};
+				}
+				
+				// DOI that redirects directly to a PDF
+				if (url == pageURL7) {
+					return {
+						status: 200,
+						response: pdfBlob,
+						responseURL: pdfURL,
+						getResponseHeader: makeGetResponseHeader({
+							'Content-Type': 'application/pdf'
+						})
 					};
 				}
 				
@ -458,15 +504,16 @@ describe("Zotero.Attachments", function() {
 					}
 					return {
 						status: 200,
-						response
+						response,
+						getResponseHeader: makeGetResponseHeader({
+							'Content-Type': 'application/pdf'
+						})
 					};
 				}
 				return origFunc(...arguments);
 			});
 			
-			pdfSize = await OS.File.stat(
-				OS.Path.join(getTestDataDirectory().path, 'test.pdf')
-			).size;
+			pdfSize = await OS.File.stat(pdfPath).size;
 			
 			Zotero.Prefs.clear('findPDFs.resolvers');
 		});
@ -492,7 +539,7 @@ describe("Zotero.Attachments", function() {
 			Zotero.HTTP.request.restore();
 		});
 		
-		it("should add a PDF from a resolved DOI", async function () {
+		it("should add a PDF from a resolved DOI webpage", async function () {
 			var doi = doi1;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
 			item.setField('title', 'Test');
@ -510,6 +557,24 @@ describe("Zotero.Attachments", function() {
 			assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
 		});
 		
+		it("should add a PDF from a DOI that resolves directly to the file", async function () {
+			var doi = doi5;
+			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
+			item.setField('title', 'Test');
+			item.setField('DOI', doi);
+			await item.saveTx();
+			var attachment = await Zotero.Attachments.addAvailablePDF(item);
+			
+			assert.isTrue(requestStub.calledOnce);
+			assert.isTrue(requestStub.calledWith('GET', 'https://doi.org/' + doi));
+			assert.ok(attachment);
+			var json = attachment.toJSON();
+			assert.equal(json.url, pdfURL);
+			assert.equal(json.contentType, 'application/pdf');
+			assert.equal(json.filename, 'Test.pdf');
+			assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
+		});
+		
 		it("should add a PDF from a resolved DOI from the Extra field", async function () {
 			var doi = doi1;
 			var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });