Save OA PDFs when the DOI resolves directly to the file
This commit is contained in:
parent
18821984e0
commit
7cf466a0b6
2 changed files with 130 additions and 39 deletions
|
@ -1287,16 +1287,42 @@ Zotero.Attachments = new function(){
|
|||
|
||||
// TODO: Handle redirects manually so we can avoid loading a page we've already
|
||||
// tried
|
||||
let xmlhttp = await Zotero.HTTP.request("GET", pageURL, { responseType: 'document' });
|
||||
responseURL = xmlhttp.responseURL;
|
||||
let req = await Zotero.HTTP.request("GET", pageURL, { responseType: 'blob' });
|
||||
let blob = req.response;
|
||||
responseURL = req.responseURL;
|
||||
if (pageURL != responseURL) {
|
||||
Zotero.debug("Redirected to " + responseURL);
|
||||
}
|
||||
triedPages.add(responseURL);
|
||||
let doc = Zotero.HTTP.wrapDocument(xmlhttp.response, responseURL);
|
||||
|
||||
let contentType = req.getResponseHeader('Content-Type');
|
||||
// If DOI resolves directly to a PDF, save it to disk
|
||||
if (contentType == 'application/pdf') {
|
||||
Zotero.debug("DOI resolves directly to PDF");
|
||||
await Zotero.File.putContentsAsync(path, blob);
|
||||
return { url: responseURL, props: urlResolver };
|
||||
}
|
||||
// Otherwise parse the Blob into a Document and translate that
|
||||
else if (contentType.startsWith('text/html')) {
|
||||
// Pick the charset for decoding the HTML blob: use the one declared in
// the Content-Type header if present, otherwise fall back to UTF-8.
let charset = 'utf-8';
// The capture group needs a `+` quantifier; without it only the first
// character of the charset (e.g. "u" from "utf-8") was captured and
// handed to FileReader. An optional opening quote is tolerated because
// parameter values may be sent as quoted strings (charset="utf-8").
let matches = contentType.match(/charset="?([a-z0-9\-_+]+)/i);
if (matches) {
	charset = matches[1];
}
|
||||
let responseText = await new Promise(function (resolve) {
|
||||
let fr = new FileReader();
|
||||
fr.addEventListener("loadend", function() {
|
||||
resolve(fr.result);
|
||||
});
|
||||
fr.readAsText(blob, charset);
|
||||
});
|
||||
let parser = Components.classes["@mozilla.org/xmlextras/domparser;1"]
|
||||
.createInstance(Components.interfaces.nsIDOMParser);
|
||||
let doc = parser.parseFromString(responseText, 'text/html');
|
||||
doc = Zotero.HTTP.wrapDocument(doc, responseURL);
|
||||
url = await Zotero.Utilities.Internal.getPDFFromDocument(doc);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.debug(`Error getting PDF from ${pageURL}: ${e}`);
|
||||
continue;
|
||||
|
|
|
@ -343,25 +343,67 @@ describe("Zotero.Attachments", function() {
|
|||
var doi2 = '10.2222/bcde';
|
||||
var doi3 = '10.3333/cdef';
|
||||
var doi4 = '10.4444/defg';
|
||||
var doi5 = '10.5555/efgh';
|
||||
var pageURL1 = 'http://website/article1';
|
||||
var pageURL2 = 'http://website/article2';
|
||||
var pageURL3 = 'http://website/article3';
|
||||
var pageURL4 = 'http://website/article4';
|
||||
var pageURL5 = `http://website/${doi4}`;
|
||||
var pageURL6 = `http://website/${doi4}/json`;
|
||||
var pageURL7 = doiPrefix + doi5;
|
||||
|
||||
Components.utils.import("resource://zotero-unit/httpd.js");
|
||||
var httpd;
|
||||
var port = 16213;
|
||||
var baseURL = `http://localhost:${port}/`;
|
||||
var pdfPath = OS.Path.join(getTestDataDirectory().path, 'test.pdf');
|
||||
var pdfURL = `${baseURL}article1/pdf`;
|
||||
var pdfSize;
|
||||
var requestStub;
|
||||
|
||||
/**
 * Build a stand-in for XMLHttpRequest#getResponseHeader() backed by a fixed
 * set of headers.
 *
 * An exact-name match is preferred; otherwise header names are compared
 * case-insensitively, the way a real XHR object resolves them. Only the
 * object's own properties are consulted, so names such as "toString" are
 * not accidentally served from Object.prototype.
 *
 * @param {Object} headers - Map of header name -> header value
 * @return {Function} function (header) returning the mocked value
 * @throws {Error} "Unimplemented" from the returned function when the
 *     requested header isn't mocked, so that unmocked lookups are noticed
 */
function makeGetResponseHeader(headers) {
	return function (header) {
		// Fast path: exact own-property match
		if (Object.prototype.hasOwnProperty.call(headers, header)
				&& headers[header] !== undefined) {
			return headers[header];
		}
		// Fall back to a case-insensitive comparison of the header name
		var wanted = String(header).toLowerCase();
		for (let name of Object.keys(headers)) {
			if (name.toLowerCase() == wanted && headers[name] !== undefined) {
				return headers[name];
			}
		}
		throw new Error("Unimplemented");
	};
}
|
||||
|
||||
/**
 * Create a mocked successful (200) Zotero.HTTP.request() result for an HTML
 * page, shaped according to the response type the caller asked for.
 *
 * @param {String} html - Markup of the page
 * @param {String} responseType - 'document' (parsed and wrapped Document)
 *     or 'blob' (Blob of type text/html)
 * @param {String} responseURL - URL to report as the final response URL
 * @return {Object} { status, response, responseURL, getResponseHeader }
 * @throws {Error} "Request not mocked" for any other response type
 */
function makeHTMLResponseFromType(html, responseType, responseURL) {
	var payload;
	if (responseType == 'document') {
		// Parse the markup and attach the response URL to the document
		payload = Zotero.HTTP.wrapDocument(
			new DOMParser().parseFromString(html, 'text/html'),
			responseURL
		);
	}
	else if (responseType == 'blob') {
		payload = new Blob([html], {type: 'text/html'});
	}
	else {
		throw new Error("Request not mocked");
	}
	
	var getResponseHeader = makeGetResponseHeader({
		'Content-Type': 'text/html'
	});
	return {
		status: 200,
		response: payload,
		responseURL: responseURL,
		getResponseHeader: getResponseHeader
	};
}
|
||||
|
||||
before(async function () {
|
||||
var pdfBlob = await File.createFromFileName(pdfPath);
|
||||
|
||||
var origFunc = Zotero.HTTP.request.bind(Zotero.HTTP);
|
||||
requestStub = sinon.stub(Zotero.HTTP, 'request');
|
||||
requestStub.callsFake(function (method, url, options) {
|
||||
Zotero.debug("Intercepting " + method + " " + url);
|
||||
|
||||
// Page responses
|
||||
var routes = [
|
||||
// Page 1 contains a PDF
|
||||
|
@ -376,14 +418,14 @@ describe("Zotero.Attachments", function() {
|
|||
[doiPrefix + doi3, pageURL2, false],
|
||||
[pageURL3, pageURL3, true],
|
||||
// DOI 4 redirects to page 4, which doesn't contain a PDF
|
||||
[doiPrefix + doi4, pageURL4, false]
|
||||
[doiPrefix + doi4, pageURL4, false],
|
||||
];
|
||||
for (let route of routes) {
|
||||
let [expectedURL, responseURL, includePDF] = route;
|
||||
|
||||
if (url != expectedURL) continue;
|
||||
|
||||
var html = `<html>
|
||||
let html = `<html>
|
||||
<head>
|
||||
<title>Page Title</title>
|
||||
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
|
||||
|
@ -392,19 +434,13 @@ describe("Zotero.Attachments", function() {
|
|||
</head>
|
||||
<body>Body</body>
|
||||
</html>`;
|
||||
let parser = new DOMParser();
|
||||
let doc = parser.parseFromString(html, 'text/html');
|
||||
doc = Zotero.HTTP.wrapDocument(doc, responseURL);
|
||||
return {
|
||||
status: 200,
|
||||
response: doc,
|
||||
responseURL
|
||||
};
|
||||
|
||||
return makeHTMLResponseFromType(html, options.responseType, responseURL);
|
||||
}
|
||||
|
||||
// HTML page with PDF download link
|
||||
if (url == pageURL5) {
|
||||
var html = `<html>
|
||||
let html = `<html>
|
||||
<head>
|
||||
<title>Page Title</title>
|
||||
</head>
|
||||
|
@ -412,21 +448,13 @@ describe("Zotero.Attachments", function() {
|
|||
<a id="pdf-link" href="${pdfURL}">Download PDF</a>
|
||||
</body>
|
||||
</html>`;
|
||||
let parser = new DOMParser();
|
||||
let doc = parser.parseFromString(html, 'text/html');
|
||||
doc = Zotero.HTTP.wrapDocument(doc, pageURL5);
|
||||
return {
|
||||
status: 200,
|
||||
response: doc,
|
||||
responseURL: pageURL5
|
||||
};
|
||||
|
||||
return makeHTMLResponseFromType(html, options.responseType, pageURL5);
|
||||
}
|
||||
|
||||
// JSON response with PDF download links
|
||||
if (url == pageURL6) {
|
||||
return {
|
||||
status: 200,
|
||||
response: {
|
||||
let response = {
|
||||
oa_locations: [
|
||||
{
|
||||
url_for_landing_page: pageURL1
|
||||
|
@ -435,8 +463,26 @@ describe("Zotero.Attachments", function() {
|
|||
url_for_pdf: pdfURL
|
||||
}
|
||||
]
|
||||
},
|
||||
responseURL: pageURL6
|
||||
};
|
||||
return {
|
||||
status: 200,
|
||||
response,
|
||||
responseURL: pageURL6,
|
||||
getResponseHeader: makeGetResponseHeader({
|
||||
'Content-Type': 'application/json'
|
||||
})
|
||||
};
|
||||
}
|
||||
|
||||
// DOI that redirects directly to a PDF
|
||||
if (url == pageURL7) {
|
||||
return {
|
||||
status: 200,
|
||||
response: pdfBlob,
|
||||
responseURL: pdfURL,
|
||||
getResponseHeader: makeGetResponseHeader({
|
||||
'Content-Type': 'application/pdf'
|
||||
})
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -458,15 +504,16 @@ describe("Zotero.Attachments", function() {
|
|||
}
|
||||
return {
|
||||
status: 200,
|
||||
response
|
||||
response,
|
||||
getResponseHeader: makeGetResponseHeader({
|
||||
'Content-Type': 'application/pdf'
|
||||
})
|
||||
};
|
||||
}
|
||||
return origFunc(...arguments);
|
||||
});
|
||||
|
||||
pdfSize = await OS.File.stat(
|
||||
OS.Path.join(getTestDataDirectory().path, 'test.pdf')
|
||||
).size;
|
||||
pdfSize = await OS.File.stat(pdfPath).size;
|
||||
|
||||
Zotero.Prefs.clear('findPDFs.resolvers');
|
||||
});
|
||||
|
@ -492,7 +539,7 @@ describe("Zotero.Attachments", function() {
|
|||
Zotero.HTTP.request.restore();
|
||||
});
|
||||
|
||||
it("should add a PDF from a resolved DOI", async function () {
|
||||
it("should add a PDF from a resolved DOI webpage", async function () {
|
||||
var doi = doi1;
|
||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||
item.setField('title', 'Test');
|
||||
|
@ -510,6 +557,24 @@ describe("Zotero.Attachments", function() {
|
|||
assert.equal(await OS.File.stat(attachment.getFilePath()).size, pdfSize);
|
||||
});
|
||||
|
||||
it("should add a PDF from a DOI that resolves directly to the file", async function () {
	// The request stub answers the doi5 resolver URL with the PDF blob
	// itself (Content-Type: application/pdf) instead of a landing page
	var doi = doi5;
	var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
	item.setField('title', 'Test');
	item.setField('DOI', doi);
	await item.saveTx();
	var attachment = await Zotero.Attachments.addAvailablePDF(item);
	
	// A single request to the DOI resolver should be all that was needed
	assert.isTrue(requestStub.calledOnce);
	assert.isTrue(requestStub.calledWith('GET', 'https://doi.org/' + doi));
	assert.ok(attachment);
	var json = attachment.toJSON();
	assert.equal(json.url, pdfURL);
	assert.equal(json.contentType, 'application/pdf');
	assert.equal(json.filename, 'Test.pdf');
	
	// `await OS.File.stat(...).size` awaits the `size` property of the
	// Promise (undefined), so the old size check compared undefined with
	// undefined. Await the stat result first, and take the expected size
	// straight from the fixture so the comparison is meaningful.
	var expectedSize = (await OS.File.stat(pdfPath)).size;
	var savedSize = (await OS.File.stat(attachment.getFilePath())).size;
	assert.isAbove(expectedSize, 0);
	assert.equal(savedSize, expectedSize);
});
|
||||
|
||||
it("should add a PDF from a resolved DOI from the Extra field", async function () {
|
||||
var doi = doi1;
|
||||
var item = createUnsavedDataObject('item', { itemType: 'journalArticle' });
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue