Save OA PDFs when the DOI resolves directly to the file
This commit is contained in:
parent
18821984e0
commit
7cf466a0b6
2 changed files with 130 additions and 39 deletions
|
@@ -1287,15 +1287,41 @@ Zotero.Attachments = new function(){
|
||||||
|
|
||||||
// TODO: Handle redirects manually so we can avoid loading a page we've already
|
// TODO: Handle redirects manually so we can avoid loading a page we've already
|
||||||
// tried
|
// tried
|
||||||
let xmlhttp = await Zotero.HTTP.request("GET", pageURL, { responseType: 'document' });
|
let req = await Zotero.HTTP.request("GET", pageURL, { responseType: 'blob' });
|
||||||
responseURL = xmlhttp.responseURL;
|
let blob = req.response;
|
||||||
|
responseURL = req.responseURL;
|
||||||
if (pageURL != responseURL) {
|
if (pageURL != responseURL) {
|
||||||
Zotero.debug("Redirected to " + responseURL);
|
Zotero.debug("Redirected to " + responseURL);
|
||||||
}
|
}
|
||||||
triedPages.add(responseURL);
|
triedPages.add(responseURL);
|
||||||
let doc = Zotero.HTTP.wrapDocument(xmlhttp.response, responseURL);
|
|
||||||
|
|
||||||
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
let contentType = req.getResponseHeader('Content-Type');
|
||||||
|
// If DOI resolves directly to a PDF, save it to disk
|
||||||
|
if (contentType == 'application/pdf') {
|
||||||
|
Zotero.debug("DOI resolves directly to PDF");
|
||||||
|
await Zotero.File.putContentsAsync(path, blob);
|
||||||
|
return { url: responseURL, props: urlResolver };
|
||||||
|
}
|
||||||
|
// Otherwise parse the Blob into a Document and translate that
|
||||||
|
else if (contentType.startsWith('text/html')) {
|
||||||
|
let charset = 'utf-8';
|
||||||
|
let matches = contentType.match(/charset=([a-z0-9\-_+])/i);
|
||||||
|
if (matches) {
|
||||||
|
charset = matches[1];
|
||||||
|
}
|
||||||
|
let responseText = await new Promise(function (resolve) {
|
||||||
|
let fr = new FileReader();
|
||||||
|
fr.addEventListener("loadend", function() {
|
||||||
|
resolve(fr.result);
|
||||||
|
});
|
||||||
|
fr.readAsText(blob, charset);
|
||||||
|
});
|
||||||
|
let parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
||||||
|
.createInstance(Components.interfaces.nsIDOMParser);
|
||||||
|
let doc = parser.parseFromString(responseText, 'text/html');
|
||||||
|
doc = Zotero.HTTP.wrapDocument(doc, responseURL);
|
||||||
|
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`);
|
Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`);
|
||||||
|
|
|
@@ -343,25 +343,67 @@ describe("Zotero.Attachments", function() {
|
||||||
var doi2 = '10.2222/bcde';
|
var doi2 = '10.2222/bcde';
|
||||||
var doi3 = '10.3333/cdef';
|
var doi3 = '10.3333/cdef';
|
||||||
var doi4 = '10.4444/defg';
|
var doi4 = '10.4444/defg';
|
||||||
|
var doi5 = '10.5555/efgh';
|
||||||
var pageURL1 = 'http://website/article1';
|
var pageURL1 = 'http://website/article1';
|
||||||
var pageURL2 = 'http://website/article2';
|
var pageURL2 = 'http://website/article2';
|
||||||
var pageURL3 = 'http://website/article3';
|
var pageURL3 = 'http://website/article3';
|
||||||
var pageURL4 = 'http://website/article4';
|
var pageURL4 = 'http://website/article4';
|
||||||
var pageURL5 = `http://website/${doi4}`;
|
var pageURL5 = `http://website/${doi4}`;
|
||||||
var pageURL6 = `http://website/${doi4}/json`;
|
var pageURL6 = `http://website/${doi4}/json`;
|
||||||
|
var pageURL7 = doiPrefix + doi5;
|
||||||
|
|
||||||
Components.utils.import("resource://zotero-unit/httpd.js");
|
Components.utils.import("resource://zotero-unit/httpd.js");
|
||||||
var httpd;
|
var httpd;
|
||||||
var port = 16213;
|
var port = 16213;
|
||||||
var baseURL = `http://localhost:${port}/`;
|
var baseURL = `http://localhost:${port}/`;
|
||||||
|
var pdfPath = OS.Path.join(getTestDataDirectory().path, 'test.pdf');
|
||||||
var pdfURL = `${baseURL}article1/pdf`;
|
var pdfURL = `${baseURL}article1/pdf`;
|
||||||
var pdfSize;
|
var pdfSize;
|
||||||
var requestStub;
|
var requestStub;
|
||||||
|
|
||||||
|
function makeGetResponseHeader(headers) {
|
||||||
|
return function (header) {
|
||||||
|
if (headers[header] !== undefined) {
|
||||||
|
return headers[header];
|
||||||
|
}
|
||||||
|
throw new Error("Unimplemented");
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function makeHTMLResponseFromType(html, responseType, responseURL) {
|
||||||
|
var response;
|
||||||
|
if (responseType == 'document') {
|
||||||
|
let parser = new DOMParser();
|
||||||
|
let doc = parser.parseFromString(html, 'text/html');
|
||||||
|
doc = Zotero.HTTP.wrapDocument(doc, responseURL);
|
||||||
|
response = doc;
|
||||||
|
}
|
||||||
|
else if (responseType == 'blob') {
|
||||||
|
let blob = new Blob([html], {type: 'text/html'});
|
||||||
|
response = blob;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new Error("Request not mocked");
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
status: 200,
|
||||||
|
response,
|
||||||
|
responseURL,
|
||||||
|
getResponseHeader: makeGetResponseHeader({
|
||||||
|
'Content-Type': 'text/html'
|
||||||
|
})
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
before(async function () {
|
before(async function () {
|
||||||
|
var pdfBlob = await File.createFromFileName(pdfPath);
|
||||||
|
|
||||||
var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP);
|
var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP);
|
||||||
requestStub = sinon.stub(Zotero.HTTP, 'request');
|
requestStub = sinon.stub(Zotero.HTTP, 'request');
|
||||||
requestStub.callsFake(function (method, url, options) {
|
requestStub.callsFake(function (method, url, options) {
|
||||||
|
Zotero.debug("Intercepting " + method + " " + url);
|
||||||
|
|
||||||
// Page responses
|
// Page responses
|
||||||
var routes = [
|
var routes = [
|
||||||
// Page 1 contains a PDF
|
// Page 1 contains a PDF
|
||||||
|
@@ -376,14 +418,14 @@ describe("Zotero.Attachments", function() {
|
||||||
[doiPrefix + doi3, pageURL2, false],
|
[doiPrefix + doi3, pageURL2, false],
|
||||||
[pageURL3, pageURL3, true],
|
[pageURL3, pageURL3, true],
|
||||||
// DOI 4 redirects to page 4, which doesn't contain a PDF
|
// DOI 4 redirects to page 4, which doesn't contain a PDF
|
||||||
[doiPrefix + doi4, pageURL4, false]
|
[doiPrefix + doi4, pageURL4, false],
|
||||||
];
|
];
|
||||||
for (let route of routes) {
|
for (let route of routes) {
|
||||||
let [expectedURL, responseURL, includePDF] = route;
|
let [expectedURL, responseURL, includePDF] = route;
|
||||||
|
|
||||||
if (url != expectedURL) continue;
|
if (url != expectedURL) continue;
|
||||||
|
|
||||||
var html = `<html>
|
let html = `<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Page Title</title>
|
<title>Page Title</title>
|
||||||
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
|
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
|
||||||
|
@@ -392,19 +434,13 @@ describe("Zotero.Attachments", function() {
|
||||||
</head>
|
</head>
|
||||||
<body>Body</body>
|
<body>Body</body>
|
||||||
</html>`;
|
</html>`;
|
||||||
let parser = new DOMParser();
|
|
||||||
let doc = parser.parseFromString(html, 'text/html');
|
return makeHTMLResponseFromType(html, options.responseType, responseURL);
|
||||||
doc = Zotero.HTTP.wrapDocument(doc, responseURL);
|
|
||||||
return {
|
|
||||||
status: 200,
|
|
||||||
response: doc,
|
|
||||||
responseURL
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTML page with PDF download link
|
// HTML page with PDF download link
|
||||||
if (url == pageURL5) {
|
if (url == pageURL5) {
|
||||||
var html = `<html>
|
let html = `<html>
|
||||||
<head>
|
<head>
|
||||||
<title>Page Title</title>
|
<title>Page Title</title>
|
||||||
</head>
|
</head>
|
||||||
|
@@ -412,31 +448,41 @@ describe("Zotero.Attachments", function() {
|
||||||
<a id="pdf-link" href="${pdfURL}">Download PDF</a>
|
<a id="pdf-link" href="${pdfURL}">Download PDF</a>
|
||||||
</body>
|
</body>
|
||||||
</html>`;
|
</html>`;
|
||||||
let parser = new DOMParser();
|
|
||||||
let doc = parser.parseFromString(html, 'text/html');
|
return makeHTMLResponseFromType(html, options.responseType, pageURL5);
|
||||||
doc = Zotero.HTTP.wrapDocument(doc, pageURL5);
|
|
||||||
return {
|
|
||||||
status: 200,
|
|
||||||
response: doc,
|
|
||||||
responseURL: pageURL5
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// JSON response with PDF download links
|
// JSON response with PDF download links
|
||||||
if (url == pageURL6) {
|
if (url == pageURL6) {
|
||||||
|
let response = {
|
||||||
|
oa_locations: [
|
||||||
|
{
|
||||||
|
url_for_landing_page: pageURL1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
url_for_pdf: pdfURL
|
||||||
|
}
|
||||||
|
]
|
||||||
|
};
|
||||||
return {
|
return {
|
||||||
status: 200,
|
status: 200,
|
||||||
response: {
|
response,
|
||||||
oa_locations: [
|
responseURL: pageURL6,
|
||||||
{
|
getResponseHeader: makeGetResponseHeader({
|
||||||
url_for_landing_page: pageURL1
|
'Content-Type': 'application/json'
|
||||||
},
|
})
|
||||||
{
|
};
|
||||||
url_for_pdf: pdfURL
|
}
|
||||||
}
|
|
||||||
]
|
// DOI that redirects directly to a PDF
|
||||||
},
|
if (url == pageURL7) {
|
||||||
responseURL: pageURL6
|
return {
|
||||||
|
status: 200,
|
||||||
|
response: pdfBlob,
|
||||||
|
responseURL: pdfURL,
|
||||||
|
getResponseHeader: makeGetResponseHeader({
|
||||||
|
'Content-Type': 'application/pdf'
|
||||||
|
})
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -458,15 +504,16 @@ describe("Zotero.Attachments", function() {
|
||||||
}
|
}
|
||||||
return {
|
return {
|
||||||
status: 200,
|
status: 200,
|
||||||
response
|
response,
|
||||||
|
getResponseHeader: makeGetResponseHeader({
|
||||||
|
'Content-Type': 'application/pdf'
|
||||||
|
})
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
return origFunc(...arguments);
|
return origFunc(...arguments);
|
||||||
});
|
});
|
||||||
|
|
||||||
pdfSize = await OS.File.stat(
|
pdfSize = await OS.File.stat(pdfPath).size;
|
||||||
OS.Path.join(getTestDataDirectory().path, 'test.pdf')
|
|
||||||
).size;
|
|
||||||
|
|
||||||
Zotero.Prefs.clear('findPDFs.resolvers');
|
Zotero.Prefs.clear('findPDFs.resolvers');
|
||||||
});
|
});
|
||||||
|
@@ -492,7 +539,7 @@ describe("Zotero.Attachments", function() {
|
||||||
Zotero.HTTP.request.restore();
|
Zotero.HTTP.request.restore();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("should add a PDF from a resolved DOI", async function () {
|
it("should add a PDF from a resolved DOI webpage", async function () {
|
||||||
var doi = doi1;
|
var doi = doi1;
|
||||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||||
item.setField('title', 'Test');
|
item.setField('title', 'Test');
|
||||||
|
@@ -510,6 +557,24 @@ describe("Zotero.Attachments", function() {
|
||||||
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
|
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should add a PDF from a DOI that resolves directly to the file", async function () {
|
||||||
|
var doi = doi5;
|
||||||
|
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||||
|
item.setField('title', 'Test');
|
||||||
|
item.setField('DOI', doi);
|
||||||
|
await item.saveTx();
|
||||||
|
var attachment = await Zotero.Attachments.addAvailablePDF(item);
|
||||||
|
|
||||||
|
assert.isTrue(requestStub.calledOnce);
|
||||||
|
assert.isTrue(requestStub.calledWith('GET', 'https://doi.org/' + doi));
|
||||||
|
assert.ok(attachment);
|
||||||
|
var json = attachment.toJSON();
|
||||||
|
assert.equal(json.url, pdfURL);
|
||||||
|
assert.equal(json.contentType, 'application/pdf');
|
||||||
|
assert.equal(json.filename, 'Test.pdf');
|
||||||
|
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
|
||||||
|
});
|
||||||
|
|
||||||
it("should add a PDF from a resolved DOI from the Extra field", async function () {
|
it("should add a PDF from a resolved DOI from the Extra field", async function () {
|
||||||
var doi = doi1;
|
var doi = doi1;
|
||||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue