Automatically download open-access PDFs via Add Item by Identifier

When the associated-files pref is enabled, Add Item by Identifier uses a
Zotero Unpaywall mirror to find available open-access PDFs. No details
about the contents of searches are logged.
This commit is contained in:
Dan Stillman 2018-06-16 14:34:29 -04:00
parent a9dcea7b13
commit f3a62f5a63
5 changed files with 289 additions and 80 deletions

View file

@ -64,10 +64,19 @@ var Zotero_Lookup = new function () {
translate.setTranslator(translators);
try {
yield translate.translate({
let newItems = yield translate.translate({
libraryID,
collections: collection ? [collection.id] : false
})
});
// If there's a DOI and we don't yet have a file, check for open-access PDFs
if (identifier.DOI && !newItems.find(x => x.isImportedAttachment())) {
try {
yield Zotero.Attachments.addOpenAccessPDF(newItems[0]);
}
catch (e) {
Zotero.logError(e);
}
}
successful++;
}
// Continue with other ids on failure

View file

@ -343,7 +343,7 @@ Zotero.Attachments = new function(){
};
// Save using remote web browser persist
var externalHandlerImport = Zotero.Promise.coroutine(function* (contentType) {
var externalHandlerImport = async function (contentType) {
// Rename attachment
if (renameIfAllowedType && !fileBaseName && this.getRenamedFileTypes().includes(contentType)) {
let parentItem = Zotero.Items.get(parentItemID);
@ -351,91 +351,47 @@ Zotero.Attachments = new function(){
}
if (fileBaseName) {
let ext = _getExtensionFromURL(url, contentType);
var fileName = fileBaseName + (ext != '' ? '.' + ext : '');
var filename = fileBaseName + (ext != '' ? '.' + ext : '');
}
else {
var fileName = _getFileNameFromURL(url, contentType);
var filename = _getFileNameFromURL(url, contentType);
}
const nsIWBP = Components.interfaces.nsIWebBrowserPersist;
var wbp = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
.createInstance(nsIWBP);
if(cookieSandbox) cookieSandbox.attachToInterfaceRequestor(wbp);
var encodingFlags = false;
// Create a temporary directory to save to within the storage directory.
// We don't use the normal temp directory because people might have 'storage'
// symlinked to another volume, which makes moving complicated.
var tmpDir = (yield this.createTemporaryStorageDirectory()).path;
var tmpFile = OS.Path.join(tmpDir, fileName);
// symlinked to another volume, which would make the save slower.
var tmpDir = (await this.createTemporaryStorageDirectory()).path;
var tmpFile = OS.Path.join(tmpDir, filename);
var attachmentItem;
// Save to temp dir
var deferred = Zotero.Promise.defer();
wbp.progressListener = new Zotero.WebProgressFinishListener(function() {
deferred.resolve();
});
var nsIURL = Components.classes["@mozilla.org/network/standard-url;1"]
.createInstance(Components.interfaces.nsIURL);
nsIURL.spec = url;
var headers = {};
if (referrer) {
headers.Referer = referrer;
}
Zotero.Utilities.Internal.saveURI(wbp, nsIURL, tmpFile, headers);
yield deferred.promise;
let sample = yield Zotero.File.getContentsAsync(tmpFile, null, 1000);
try {
if (contentType == 'application/pdf' &&
Zotero.MIME.sniffForMIMEType(sample) != 'application/pdf') {
let errString = "Downloaded PDF did not have MIME type "
+ "'application/pdf' in Attachments.importFromURL()";
Zotero.debug(errString, 2);
Zotero.debug(sample, 3);
throw(new Error(errString));
}
// Create DB item
var attachmentItem;
var destDir;
yield Zotero.DB.executeTransaction(function*() {
// Create a new attachment
attachmentItem = new Zotero.Item('attachment');
if (libraryID) {
attachmentItem.libraryID = libraryID;
await this.downloadFile(
url,
tmpFile,
{
cookieSandbox,
referrer,
isPDF: contentType == 'application/pdf'
}
else if (parentItemID) {
let {libraryID: parentLibraryID, key: parentKey} =
Zotero.Items.getLibraryAndKeyFromID(parentItemID);
attachmentItem.libraryID = parentLibraryID;
}
attachmentItem.setField('title', title ? title : fileName);
attachmentItem.setField('url', url);
attachmentItem.setField('accessDate', "CURRENT_TIMESTAMP");
attachmentItem.parentID = parentItemID;
attachmentItem.attachmentLinkMode = Zotero.Attachments.LINK_MODE_IMPORTED_URL;
attachmentItem.attachmentContentType = contentType;
if (collections) {
attachmentItem.setCollections(collections);
}
attachmentItem.attachmentPath = 'storage:' + fileName;
var itemID = yield attachmentItem.save(saveOptions);
Zotero.Fulltext.queueItem(attachmentItem);
// DEBUG: Does this fail if 'storage' is symlinked to another drive?
destDir = this.getStorageDirectory(attachmentItem).path;
yield OS.File.move(tmpDir, destDir);
}.bind(this));
} catch (e) {
);
attachmentItem = await this.createURLAttachmentFromTemporaryStorageDirectory({
directory: tmpDir,
libraryID,
parentItemID,
title,
filename,
url,
contentType,
collections,
saveOptions
});
}
catch (e) {
try {
if (tmpDir) {
yield OS.File.removeDir(tmpDir, { ignoreAbsent: true });
}
if (destDir) {
yield OS.File.removeDir(destDir, { ignoreAbsent: true });
await OS.File.removeDir(tmpDir, { ignoreAbsent: true });
}
}
catch (e) {
@ -445,7 +401,7 @@ Zotero.Attachments = new function(){
}
return attachmentItem;
}.bind(this));
}.bind(this);
var process = function (contentType, hasNativeHandler) {
// If we can load this natively, use a hidden browser
@ -466,6 +422,83 @@ Zotero.Attachments = new function(){
});
/**
 * Create an imported-URL attachment using a file downloaded to a temporary directory
 * in 'storage', moving the directory into place
 *
 * We download files to temporary 'storage' directories rather than the normal temporary
 * directory because people might have their storage directory on another device, which
 * would make the move a copy.
 *
 * If the move fails, the newly saved attachment item is erased again and the move error
 * is rethrown. The notifier queue is committed whether or not the operation succeeds.
 *
 * @param {Object} options
 * @param {String} options.directory - Temporary storage directory containing the file
 * @param {Number} [options.libraryID] - Required unless parentItemID is provided, in which
 *     case the parent item's library is used
 * @param {String} options.filename
 * @param {String} options.url
 * @param {Number} [options.parentItemID]
 * @param {String} [options.title] - Defaults to the filename
 * @param {String} options.contentType
 * @param {String[]} [options.collections]
 * @param {Object} [options.saveOptions] - Passed to saveTx(); the object is not modified
 * @return {Zotero.Item}
 */
this.createURLAttachmentFromTemporaryStorageDirectory = async function (options) {
	if (!options.directory) throw new Error("'directory' not provided");
	// The library can be derived from the parent item, so either one is sufficient
	if (!options.libraryID && !options.parentItemID) throw new Error("'libraryID' not provided");
	if (!options.filename) throw new Error("'filename' not provided");
	if (!options.url) throw new Error("'url' not provided");
	if (!options.contentType) throw new Error("'contentType' not provided");
	
	// Use the caller's queue if one was passed in saveOptions; otherwise create one.
	// Both the save and any rollback erase go through it, and it's committed below.
	var notifierQueue = (options.saveOptions && options.saveOptions.notifierQueue)
		|| new Zotero.Notifier.Queue;
	var attachmentItem = new Zotero.Item('attachment');
	try {
		// Create DB item
		if (options.libraryID) {
			attachmentItem.libraryID = options.libraryID;
		}
		else {
			let { libraryID: parentLibraryID } =
				Zotero.Items.getLibraryAndKeyFromID(options.parentItemID);
			attachmentItem.libraryID = parentLibraryID;
		}
		attachmentItem.setField('title', options.title != undefined ? options.title : options.filename);
		attachmentItem.setField('url', options.url);
		attachmentItem.setField('accessDate', "CURRENT_TIMESTAMP");
		attachmentItem.parentID = options.parentItemID;
		attachmentItem.attachmentLinkMode = Zotero.Attachments.LINK_MODE_IMPORTED_URL;
		attachmentItem.attachmentContentType = options.contentType;
		if (options.collections) {
			attachmentItem.setCollections(options.collections);
		}
		attachmentItem.attachmentPath = 'storage:' + options.filename;
		// Copy into a fresh object so the caller's saveOptions isn't mutated
		await attachmentItem.saveTx(
			Object.assign({}, options.saveOptions, { notifierQueue })
		);
		
		// Move file to final location
		let destDir = this.getStorageDirectory(attachmentItem).path;
		try {
			await OS.File.move(options.directory, destDir);
		}
		catch (e) {
			// Roll back the DB item, but don't let a failed rollback hide the move error
			try {
				await attachmentItem.eraseTx({ notifierQueue });
			}
			catch (eraseError) {
				Zotero.logError(eraseError);
			}
			throw e;
		}
	}
	finally {
		await Zotero.Notifier.commit(notifierQueue);
	}
	
	Zotero.Fulltext.queueItem(attachmentItem);
	
	return attachmentItem;
};
/**
* Create a link attachment from a URL
*
@ -709,8 +742,140 @@ Zotero.Attachments = new function(){
return attachmentItem;
});
/**
 * Download a URL to a local path, optionally checking that the result is a PDF
 *
 * On any failure the file at `path` is removed (best-effort) and the error is rethrown.
 *
 * @param {String} url
 * @param {String} path
 * @param {Object} [options]
 * @param {Object} [options.cookieSandbox]
 * @param {String} [options.referrer]
 * @param {Boolean} [options.isPDF] - Delete file if not PDF
 */
this.downloadFile = async function (url, path, options = {}) {
	Zotero.debug(`Downloading ${url}`);
	
	// Best-effort removal of whatever was written; a failure here is only logged
	const removeDownloadedFile = async () => {
		try {
			await OS.File.remove(path, { ignoreAbsent: true });
		}
		catch (removeError) {
			Zotero.debug(removeError, 1);
		}
	};
	
	try {
		// Resolves once the finish listener fires
		await new Zotero.Promise((resolve) => {
			const persist = Components.classes["@mozilla.org/embedding/browser/nsWebBrowserPersist;1"]
				.createInstance(Components.interfaces.nsIWebBrowserPersist);
			if (options.cookieSandbox) {
				options.cookieSandbox.attachToInterfaceRequestor(persist);
			}
			persist.progressListener = new Zotero.WebProgressFinishListener(function () {
				resolve();
			});
			
			const uri = Components.classes["@mozilla.org/network/standard-url;1"]
				.createInstance(Components.interfaces.nsIURL);
			uri.spec = url;
			
			const requestHeaders = options.referrer ? { Referer: options.referrer } : {};
			Zotero.Utilities.Internal.saveURI(persist, uri, path, requestHeaders);
		});
		
		// If the file is supposed to be a PDF, fail if it's not
		const head = await Zotero.File.getContentsAsync(path, null, 1000);
		if (options.isPDF) {
			if (Zotero.MIME.sniffForMIMEType(head) != 'application/pdf') {
				const message = "Downloaded PDF was not a PDF";
				Zotero.debug(message, 2);
				Zotero.debug(head, 3);
				throw new Error(message);
			}
		}
	}
	catch (downloadError) {
		await removeDownloadedFile();
		throw downloadError;
	}
};
/**
 * Try to download a file from a list of URLs, keeping the first one that succeeds
 *
 * URLs are tried in order. A failed download is logged and the next URL is tried.
 * The passed array is not modified.
 *
 * @param {String[]} urls
 * @param {String} path
 * @param {Object} [options] - Options to pass to this.downloadFile()
 * @return {String|false} - URL that succeeded, or false if none
 */
this.downloadFirstAvailableFile = async function (urls, path, options) {
	// Iterate rather than shift() so the caller's array is left intact
	for (let url of urls) {
		// Skip empty entries instead of letting one end the search early
		if (!url) {
			continue;
		}
		try {
			await this.downloadFile(url, path, options);
			return url;
		}
		catch (e) {
			Zotero.debug(`Error downloading ${url}: ${e}`);
		}
	}
	return false;
};
/**
 * Look for an open-access PDF for an item and add it as an attachment
 *
 * Returns false without doing anything if the 'downloadAssociatedFiles' pref is off,
 * the item has no DOI, or no candidate URLs are found. Otherwise the first URL that
 * downloads as a PDF is saved as a child attachment of the item.
 *
 * @param {Zotero.Item} item
 * @return {Zotero.Item|false} - New attachment item, or false if unsuccessful
 */
this.addOpenAccessPDF = async function (item) {
	if (!Zotero.Prefs.get('downloadAssociatedFiles')) {
		return false;
	}
	
	var doi = item.getField('DOI');
	if (!doi) {
		return false;
	}
	
	var urls = await Zotero.Utilities.Internal.getOpenAccessPDFURLs(doi);
	if (!urls.length) {
		return false;
	}
	
	var fileBaseName = this.getFileBaseNameFromItem(item);
	var tmpDir;
	var attachmentItem = false;
	try {
		tmpDir = (await this.createTemporaryStorageDirectory()).path;
		let tmpFile = OS.Path.join(tmpDir, fileBaseName + '.pdf');
		let url = await this.downloadFirstAvailableFile(
			urls, tmpFile, { isPDF: true }
		);
		if (url) {
			attachmentItem = await this.createURLAttachmentFromTemporaryStorageDirectory({
				directory: tmpDir,
				libraryID: item.libraryID,
				filename: OS.Path.basename(tmpFile),
				url,
				contentType: 'application/pdf',
				parentItemID: item.id
			});
		}
		else {
			// Nothing could be downloaded, so the temporary directory is no longer needed
			await OS.File.removeDir(tmpDir);
		}
	}
	catch (e) {
		if (tmpDir) {
			// Best-effort cleanup; a failure here must not replace the original error
			try {
				await OS.File.removeDir(tmpDir, { ignoreAbsent: true });
			}
			catch (cleanupError) {
				Zotero.debug(cleanupError, 1);
			}
		}
		throw e;
	}
	return attachmentItem;
};
/**
* @deprecated Use Zotero.Utilities.cleanURL instead
*/

View file

@ -930,6 +930,35 @@ Zotero.Utilities.Internal = {
},
/**
* Look for open-access PDFs for a given DOI using Zotero's Unpaywall mirror
*
* Note: This uses a private API. Please use Unpaywall directly for non-Zotero projects.
*
* @param {String} doi
* @return {String[]} - An array of PDF URLs
*/
getOpenAccessPDFURLs: async function (doi) {
doi = Zotero.Utilities.cleanDOI(doi);
if (!doi) {
throw new Error(`Invalid DOI '${doi}'`);
}
Zotero.debug(`Looking for open-access PDFs for ${doi}`);
var url = ZOTERO_CONFIG.SERVICES_URL + 'oa/search';
var req = await Zotero.HTTP.request('POST', url, {
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ doi }),
responseType: 'json'
});
var urls = req.response;
Zotero.debug(`Found ${urls.length} ${Zotero.Utilities.pluralize(urls.length, ['URL', 'URLs'])}`);
return urls;
},
/**
* Hyphenate an ISBN based on the registrant table available from
* https://www.isbn-international.org/range_file_generation

View file

@ -12,6 +12,7 @@ var ZOTERO_CONFIG = {
API_URL: 'https://api.zotero.org/',
STREAMING_URL: 'wss://stream.zotero.org/',
RECOGNIZE_URL: 'https://recognize.zotero.org/',
SERVICES_URL: 'https://services.zotero.org/',
API_VERSION: 3,
CONNECTOR_MIN_VERSION: '5.0.39', // show upgrade prompt for requests from below this version
PREF_BRANCH: 'extensions.zotero.',

View file

@ -49,6 +49,11 @@ describe("Add Item by Identifier", function() {
});
});
it.skip("should add a DOI with an open-access PDF");
// e.g., arXiv
it.skip("should not add a PDF if a DOI already retrieves one");
it("should add a PMID", function() {
this.timeout(10000);
return lookupIdentifier(win, "24297125").then(function(ids) {