Add bulk folder import #2252

This commit is contained in:
Tom Najdek 2022-10-09 14:01:50 +02:00
parent 6117221cbc
commit 2820add3d1
No known key found for this signature in database
GPG key ID: EEC61A7B4C667D77
9 changed files with 435 additions and 17 deletions

View file

@ -27,7 +27,7 @@ import PropTypes from 'prop-types';
import { getDOMElement } from 'components/icons'; import { getDOMElement } from 'components/icons';
import VirtualizedTable, { renderCell } from 'components/virtualized-table'; import VirtualizedTable, { renderCell } from 'components/virtualized-table';
import { noop } from './utils'; import { nextHTMLID, noop } from './utils';
function getImageByStatus(status) { function getImageByStatus(status) {
@ -45,8 +45,9 @@ function getImageByStatus(status) {
const ProgressQueueTable = ({ onActivate = noop, progressQueue }) => { const ProgressQueueTable = ({ onActivate = noop, progressQueue }) => {
const treeRef = useRef(null); const treeRef = useRef(null);
const htmlID = useRef(nextHTMLID());
const getRowCount = useCallback(() => progressQueue.getRows().length, [progressQueue]); const getRowCount = useCallback(() => progressQueue.getTotal(), [progressQueue]);
const rowToTreeItem = useCallback((index, selection, oldDiv = null, columns) => { const rowToTreeItem = useCallback((index, selection, oldDiv = null, columns) => {
let rows = progressQueue.getRows(); let rows = progressQueue.getRows();
@ -92,6 +93,7 @@ const ProgressQueueTable = ({ onActivate = noop, progressQueue }) => {
progressQueue.addListener('rowadded', refreshTree); progressQueue.addListener('rowadded', refreshTree);
progressQueue.addListener('rowupdated', refreshTree); progressQueue.addListener('rowupdated', refreshTree);
progressQueue.addListener('rowdeleted', refreshTree); progressQueue.addListener('rowdeleted', refreshTree);
return () => { return () => {
progressQueue.removeListener('rowadded', refreshTree); progressQueue.removeListener('rowadded', refreshTree);
progressQueue.removeListener('rowupdated', refreshTree); progressQueue.removeListener('rowupdated', refreshTree);
@ -103,7 +105,7 @@ const ProgressQueueTable = ({ onActivate = noop, progressQueue }) => {
<VirtualizedTable <VirtualizedTable
getRowCount={ getRowCount } getRowCount={ getRowCount }
ref={ treeRef } ref={ treeRef }
id="progress-queue-table" id={ htmlID.current + '-progress-queue-table' }
renderItem={ rowToTreeItem } renderItem={ rowToTreeItem }
showHeader={ true } showHeader={ true }
columns={ tableColumns } columns={ tableColumns }

View file

@ -1,9 +1,9 @@
/* /*
***** BEGIN LICENSE BLOCK ***** ***** BEGIN LICENSE BLOCK *****
Copyright © 2019 Center for History and New Media Copyright © 2020 Corporation for Digital Scholarship
George Mason University, Fairfax, Virginia, USA Vienna, Virginia, USA
http://zotero.org https://digitalscholar.org
This file is part of Zotero. This file is part of Zotero.
@ -16,17 +16,16 @@
but WITHOUT ANY WARRANTY; without even the implied warranty of but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details. GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>. along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK ***** ***** END LICENSE BLOCK *****
*/ */
'use strict';
const noop = () => {}; const noop = () => {};
function getDragTargetOrient(event, target) { function getDragTargetOrient(event, target) {
const elem = target || event.target; const elem = target || event.target;
const {y, height} = elem.getBoundingClientRect(); const {y, height} = elem.getBoundingClientRect();
@ -72,9 +71,30 @@ function createDragHandler({ handleDrag, handleDragStop }) {
return { return {
start: onDragStart, start: onDragStart,
stop: onDragStop stop: onDragStop
} };
} }
export { var _htmlID = 1;
noop, getDragTargetOrient, createDragHandler
/**
 * Produce the next identifier from the module-level `_htmlID` counter.
 *
 * @param {String} [prefix='id-'] - Text placed in front of the counter value
 * @returns {String} The prefix followed by the counter value before it was incremented
 */
const nextHTMLID = (prefix = 'id-') => {
	// Post-increment: use the current counter value, then advance it for the next caller
	const current = _htmlID++;
	return prefix + current;
};
/**
 * Scroll `element` into view only when it is not fully inside the visible
 * part of `container`, as judged by comparing the element's `offsetTop` and
 * `clientHeight` with the container's `scrollTop` and `clientHeight`.
 *
 * @param {Object} element - Element to reveal; must provide `offsetTop`, `clientHeight` and `scrollIntoView()`
 * @param {Object} container - Scrolling container; must provide `scrollTop` and `clientHeight`
 * @param {Object} [opts={}] - Passed through unchanged to `element.scrollIntoView()`
 * @returns {Number} Change in `container.scrollTop` caused by the scroll, or 0 if no scroll was needed
 */
const scrollIntoViewIfNeeded = (element, container, opts = {}) => {
	const viewTop = container.scrollTop;
	const viewBottom = viewTop + container.clientHeight;
	const itemTop = element.offsetTop;
	const itemBottom = itemTop + element.clientHeight;

	const needsScroll = itemTop < viewTop || itemBottom > viewBottom;
	if (!needsScroll) {
		return 0;
	}

	// Measure the scroll position on both sides of the call to report how far we moved
	const scrollTopBefore = container.scrollTop;
	element.scrollIntoView(opts);
	return container.scrollTop - scrollTopBefore;
};
/**
 * Event handler that calls `stopPropagation()` on the event it receives.
 *
 * @param {Object} ev - Event-like object exposing `stopPropagation()`
 * @returns {*} Whatever `ev.stopPropagation()` returns
 */
const stopPropagation = (ev) => {
	return ev.stopPropagation();
};
export {
nextHTMLID, noop, getDragTargetOrient, createDragHandler, scrollIntoViewIfNeeded, stopPropagation
}; };

View file

@ -448,6 +448,15 @@ var Zotero_File_Interface = new function() {
translation.createNewCollection = createNewCollection; translation.createNewCollection = createNewCollection;
translation.mendeleyCode = options.mendeleyCode; translation.mendeleyCode = options.mendeleyCode;
} }
else if (options.folder) {
Components.utils.import("chrome://zotero/content/import/folderImport.js");
translation = new Zotero_Import_Folder({
folder: options.folder,
recreateStructure: options.recreateStructure,
fileTypes: options.fileTypes,
mimeTypes: options.mimeTypes,
});
}
else { else {
// Check if the file is an SQLite database // Check if the file is an SQLite database
var sample = yield Zotero.File.getSample(file.path); var sample = yield Zotero.File.getSample(file.path);

View file

@ -0,0 +1,241 @@
var EXPORTED_SYMBOLS = ["Zotero_Import_Folder"]; // eslint-disable-line no-unused-vars
Components.utils.import("resource://gre/modules/Services.jsm");
Services.scriptloader.loadSubScript("chrome://zotero/content/include.js");
const multimatch = require('multimatch');
/**
 * Walk a directory tree and gather every non-dot file found in it.
 *
 * Directories are always descended into (including ones whose name starts
 * with a dot); only files whose name starts with '.' are left out.
 *
 * @param {String} dirPath - Directory to walk
 * @param {String[]} [parents=[]] - Names of the directories between the walk's root and `dirPath`
 * @param {Object[]} [files=[]] - Accumulator shared across the recursion; appended to in place
 * @returns {Promise<Object[]>} The accumulator, holding `{ parents, path, name }` for each file
 */
const collectFilesRecursive = async (dirPath, parents = [], files = []) => {
	const visitEntry = async (entry) => {
		const { isDir, name, path } = entry;
		if (isDir) {
			// Extend a copy of the ancestor chain so sibling directories don't see each other's names
			await collectFilesRecursive(path, [...parents, name], files);
			return;
		}
		// TODO: Also check for hidden file attribute on Windows?
		if (name.startsWith('.')) {
			return;
		}
		files.push({ parents, path, name });
	};

	await Zotero.File.iterateDirectory(dirPath, visitEntry);
	return files;
};
/**
 * Look up a collection by exact name among the children of a parent
 * collection, or among what `Zotero.Collections.getByLibrary(libraryID)`
 * returns when no parent is given.
 *
 * @param {Number} libraryID - Library searched when `parentCollectionID` is falsy
 * @param {Number|null} parentCollectionID - Parent whose children are searched, if any
 * @param {String} collectionName - Name to match with strict equality
 * @returns {Object|undefined} The first matching collection, or undefined if none matches
 */
const findCollection = (libraryID, parentCollectionID, collectionName) => {
	let candidates;
	if (parentCollectionID) {
		candidates = Zotero.Collections.getByParent(parentCollectionID);
	}
	else {
		candidates = Zotero.Collections.getByLibrary(libraryID);
	}
	return candidates.find(candidate => candidate.name === collectionName);
};
/**
 * Find an existing item that carries the given hash in one of the two
 * relation predicates this module writes.
 *
 * 'zotero:attachmentHash' is queried first; 'zotero:fileHash' is queried only
 * when the first query yields no usable item. An item is usable when it is in
 * `libraryID`, is not deleted, and is a top-level item.
 *
 * @param {Number} libraryID - Library the item must belong to (compared loosely)
 * @param {String} hash - Hash stored as the relation object
 * @returns {Promise<Object|null>} The first usable item, or null if there is none
 */
const findItemByHash = async (libraryID, hash) => {
	const isUsable = item => item.libraryID == libraryID && !item.deleted && item.isTopLevelItem();
	const predicates = ['zotero:attachmentHash', 'zotero:fileHash'];

	for (const predicate of predicates) {
		// Sequential on purpose: the second lookup must only happen when the first finds nothing
		const related = await Zotero.Relations.getByPredicateAndObject('item', predicate, hash); // eslint-disable-line no-await-in-loop
		const usable = related.filter(isUsable);
		if (usable.length) {
			return usable[0];
		}
	}
	return null;
};
/**
 * Importer that collects the files below a folder, imports or links the ones
 * matching a MIME-type whitelist or a set of file-name patterns as
 * attachments, optionally mirrors the folder hierarchy as collections, and
 * hands the new attachments to Zotero.RecognizePDF.
 *
 * The method set (setLocation, setHandler, setTranslator, getTranslators,
 * translate, getProgress) is what the import code in this file's caller
 * invokes on a translation object.
 */
class Zotero_Import_Folder { // eslint-disable-line camelcase,no-unused-vars
	/**
	 * @param {Object} options
	 * @param {String[]} [options.mimeTypes=['application/pdf']] - MIME types that are imported
	 * @param {String|String[]} [options.fileTypes] - File-name patterns that are imported in addition
	 *     to `mimeTypes`; either a comma-separated string or an array of patterns
	 * @param {String} options.folder - Path of the folder to import
	 * @param {Number} [options.libraryID] - Target library; the user library is used when omitted
	 * @param {Boolean} [options.recreateStructure] - Whether to mirror subfolders as collections
	 */
	constructor({ mimeTypes = ['application/pdf'], fileTypes, folder, libraryID, recreateStructure }) {
		this.folder = folder;
		this.libraryID = libraryID;
		this.newItems = [];
		this.recreateStructure = recreateStructure;

		// Accept a comma-separated string or an array, and drop the empty patterns that
		// stray commas would otherwise produce
		let patterns = [];
		if (Array.isArray(fileTypes)) {
			patterns = fileTypes;
		}
		else if (typeof fileTypes === 'string') {
			patterns = fileTypes.split(',');
		}
		this.fileTypes = patterns.map(ft => String(ft).trim()).filter(ft => ft.length > 0);

		this._progress = 0;
		this._progressMax = 0;
		this._itemDone = () => {};
		this.types = mimeTypes; // whitelist of mime types to process
	}

	/**
	 * Replace the folder to import.
	 *
	 * @param {String} folder
	 */
	setLocation(folder) {
		this.folder = folder;
	}

	/**
	 * Register a callback. Only 'itemDone' is supported; other names are ignored.
	 *
	 * @param {String} name
	 * @param {Function} handler - Called without arguments whenever progress changes
	 */
	setHandler(name, handler) {
		switch (name) {
			case 'itemDone':
				this._itemDone = handler;
				break;
		}
	}

	/** No-op; present so the object can be used where a translator would be set. */
	setTranslator() {}

	/**
	 * @returns {Number} Progress as a percentage. Returns 0 while the total is still
	 *     unknown (or the folder contained no files) instead of the NaN that 0/0 yields.
	 */
	getProgress() {
		if (!this._progressMax) {
			return 0;
		}
		return this._progress / this._progressMax * 100;
	}

	/**
	 * @returns {Promise<Object[]>} A single placeholder entry labelled 'Folder import'
	 */
	async getTranslators() {
		return [{ label: 'Folder import' }];
	}

	/**
	 * Decide whether a file takes part in the import.
	 *
	 * @param {String} name - File name, matched against `this.fileTypes` (case-insensitive)
	 * @param {String} mimeType - Sniffed MIME type, matched against `this.types`
	 * @returns {Boolean}
	 */
	_isImportable(name, mimeType) {
		return this.types.includes(mimeType)
			|| multimatch(name, this.fileTypes, { nocase: true }).length > 0;
	}

	/**
	 * Run the import.
	 *
	 * @param {Object} [options]
	 * @param {Number[]} [options.collections=[]] - Collections every imported item is added to; when
	 *     the folder structure is recreated, the first one is used as the root for new collections
	 * @param {Boolean} [options.linkFiles=false] - Link to the files instead of importing copies
	 * @returns {Promise<void>}
	 */
	async translate({ collections = [], linkFiles = false } = {}) {
		const libraryID = this.libraryID || Zotero.Libraries.userLibraryID;
		const files = await collectFilesRecursive(this.folder);

		// import is done in four phases: sniff for mime type, calculate md5, import as attachment, recognize.
		// hence number of files is multiplied by 4 to determine max progress
		this._progressMax = files.length * 4;

		const mimeTypes = await Promise.all(files.map(
			async ({ path }) => {
				const mimeType = Zotero.MIME.sniffForMIMEType(await Zotero.File.getSample(path));
				this._progress++;
				this._itemDone();
				return mimeType;
			}
		));

		const fileHashes = await Promise.all(files.map(
			async ({ name, path }, index) => {
				const contentType = mimeTypes[index];
				this._progress++;
				if (!this._isImportable(name, contentType)) {
					// don't bother calculating a hash for file that will be ignored
					return null;
				}
				const md5Hash = await Zotero.Utilities.Internal.md5Async(path);
				this._itemDone();
				return md5Hash;
			}
		));

		files.forEach((fileData, index) => {
			fileData.parentCollectionIDs = (collections && collections.length) ? [...collections] : [];
			fileData.mimeType = mimeTypes[index];
		});

		if (this.recreateStructure) {
			for (const fileData of files) {
				const { parents } = fileData;
				let prevParentCollectionID = null;
				if (parents.length) {
					prevParentCollectionID = (collections && collections.length) ? collections[0] : null;
					// Walk the folder chain top-down, reusing a same-named collection at each level
					// and creating one where none exists
					for (const parentName of parents) {
						const parentCollection = findCollection(libraryID, prevParentCollectionID, parentName) || new Zotero.Collection();
						parentCollection.libraryID = libraryID;
						parentCollection.name = parentName;
						if (prevParentCollectionID) {
							parentCollection.parentID = prevParentCollectionID;
						}
						await parentCollection.saveTx({ skipSelect: true }); //eslint-disable-line no-await-in-loop
						prevParentCollectionID = parentCollection.id;
					}
				}
				if (prevParentCollectionID) {
					fileData.parentCollectionIDs = [prevParentCollectionID];
				}
			}
		}

		// index files by hash to avoid importing duplicate files. Keep track of where duplicates were found so that
		// duplicate item is still added to one collection per folder
		// (files skipped during hashing all share the `null` hash and therefore collapse into a single entry
		// that is rejected again by the importability check below)
		const fileDataByHash = {};
		files.forEach((fileData, index) => {
			const hash = fileHashes[index];
			if (hash in fileDataByHash) {
				fileDataByHash[hash].parentCollectionIDs.push(...fileData.parentCollectionIDs);
			}
			else {
				fileDataByHash[hash] = fileData;
			}
		});

		// advance progress to account for duplicates found within file structure
		// these files won't be imported nor recognized so advance 2 ticks per file
		this._progress += 2 * (files.length - Object.keys(fileDataByHash).length);
		this._itemDone();

		const attachmentItemHashLookup = {};
		const attachmentItems = await Promise.all(Object.entries(fileDataByHash).map(
			async ([hash, { name, path, parentCollectionIDs, mimeType }]) => {
				const options = {
					collections: parentCollectionIDs,
					contentType: mimeType,
					file: path,
					libraryID,
				};

				let attachmentItem = null;

				if (this._isImportable(name, mimeType)) {
					const existingItem = await findItemByHash(libraryID, hash);

					if (existingItem) {
						existingItem.setCollections([...existingItem.getCollections(), ...parentCollectionIDs]);
						// Await the save so that a failure rejects translate() instead of being lost,
						// and so that the import does not finish before the item is stored
						await existingItem.saveTx({ skipSelect: true });
					}
					else {
						if (linkFiles) {
							attachmentItem = await Zotero.Attachments.linkFromFile(options);
						}
						else {
							attachmentItem = await Zotero.Attachments.importFromFile(options);
						}
						this.newItems.push(attachmentItem);
						attachmentItemHashLookup[attachmentItem.id] = hash;
					}
				}

				if (attachmentItem && !Zotero.RecognizePDF.canRecognize(attachmentItem)) {
					// Not going through recognition: tag the attachment itself so a re-import finds it
					attachmentItem.setRelations({ 'zotero:fileHash': hash });
					await attachmentItem.saveTx({ skipSelect: true });
					attachmentItem = null;
				}

				this._progress++;
				this._itemDone();
				return attachmentItem;
			}
		));

		// discard unrecognizable items, increase progress for discarded items
		const recognizableItems = attachmentItems.filter(item => item !== null);
		this._progress += attachmentItems.length - recognizableItems.length;
		this._itemDone();

		const recognizeQueue = Zotero.ProgressQueues.get('recognize');
		const itemsToSavePostRecognize = [];

		const processRecognizedItem = ({ status, id }) => {
			const updatedItem = recognizableItems.find(i => i.id === id);
			if (!updatedItem) {
				// The queue is looked up by name and may report rows that were not submitted by this
				// import; those must neither be dereferenced nor counted towards our progress
				return;
			}

			if (status === Zotero.ProgressQueue.ROW_SUCCEEDED) {
				const recognizedItem = updatedItem.parentItem;
				if (recognizedItem && id in attachmentItemHashLookup) {
					recognizedItem.setRelations({ 'zotero:attachmentHash': attachmentItemHashLookup[id] });
					itemsToSavePostRecognize.push(recognizedItem);
				}
			}
			if (status === Zotero.ProgressQueue.ROW_FAILED) {
				if (id in attachmentItemHashLookup) {
					updatedItem.setRelations({ 'zotero:fileHash': attachmentItemHashLookup[id] });
					itemsToSavePostRecognize.push(updatedItem);
				}
			}
			if ([Zotero.ProgressQueue.ROW_FAILED, Zotero.ProgressQueue.ROW_SUCCEEDED].includes(status)) {
				this._progress++;
				this._itemDone();
			}
		};

		recognizeQueue.addListener('rowupdated', processRecognizedItem);
		try {
			await Zotero.RecognizePDF.recognizeItems(recognizableItems);
		}
		finally {
			// Always detach, even when recognition throws, so the listener does not outlive the import
			recognizeQueue.removeListener('rowupdated', processRecognizedItem);
		}

		await Zotero.Promise.all(
			itemsToSavePostRecognize.map(async item => item.saveTx({ skipSelect: true }))
		);
	}
}

View file

@ -33,7 +33,8 @@ Zotero.Relations = new function () {
this._namespaces = { this._namespaces = {
dc: 'http://purl.org/dc/elements/1.1/', dc: 'http://purl.org/dc/elements/1.1/',
owl: 'http://www.w3.org/2002/07/owl#', owl: 'http://www.w3.org/2002/07/owl#',
mendeleyDB: 'http://zotero.org/namespaces/mendeleyDB#' mendeleyDB: 'http://zotero.org/namespaces/mendeleyDB#',
zotero: 'http://zotero.org/namespaces/zotero'
}; };
var _types = ['collection', 'item']; var _types = ['collection', 'item'];

View file

@ -44,6 +44,7 @@
"colors": "^1.4.0", "colors": "^1.4.0",
"eslint": "^8.5.0", "eslint": "^8.5.0",
"eslint-plugin-react": "^7.28.0", "eslint-plugin-react": "^7.28.0",
"eslint-plugin-react-hooks": "^4.0.4",
"fs-extra": "^3.0.1", "fs-extra": "^3.0.1",
"globby": "^6.1.0", "globby": "^6.1.0",
"jspath": "^0.4.0", "jspath": "^0.4.0",

View file

@ -33,8 +33,9 @@ var ZOTERO_CONFIG = {
PLUGINS_URL: 'https://www.zotero.org/support/plugins', PLUGINS_URL: 'https://www.zotero.org/support/plugins',
}; };
if (typeof process === 'object' && process + '' === '[object process]'){ if (typeof exports === 'object' && typeof module !== 'undefined') {
module.exports = ZOTERO_CONFIG; module.exports = ZOTERO_CONFIG;
} else { }
else {
var EXPORTED_SYMBOLS = ["ZOTERO_CONFIG"]; var EXPORTED_SYMBOLS = ["ZOTERO_CONFIG"];
} }

View file

@ -97,7 +97,14 @@ const browserifyConfigs = [
config: { config: {
standalone: 'chaiAsPromised' standalone: 'chaiAsPromised'
} }
} },
{
src: 'node_modules/multimatch/index.js',
dest: 'resource/multimatch.js',
config: {
standalone: 'multimatch'
}
},
]; ];
// exclude mask used for js, copy, symlink and sass tasks // exclude mask used for js, copy, symlink and sass tasks

View file

@ -0,0 +1,136 @@
/* global Zotero_Import_Folder: false */
// Integration tests for the folder importer: builds a small directory tree of
// fixture files in a temp directory, then imports it with different options.
describe('Zotero_Import_Folder', function () {
	var tmpDir;
	// Prefix collection/directory names so they can be told apart from unrelated collections
	const uc = (name) => 'Zotero_Import_Folder_' + name;

	// Names of all collections in the user library (fetched with the second argument set to true)
	// whose name carries this suite's prefix
	const importedCollectionNames = () => Zotero.Collections
		.getByLibrary(Zotero.Libraries.userLibraryID, true)
		.map(c => c.name)
		.filter(c => c.startsWith('Zotero_Import_Folder'));

	const findByTitle = (items, title) => items.find(ni => ni.getField('title') === title);

	// Construct an importer with the given options and run it with linked files
	const runImport = async (importerOptions) => {
		const importer = new Zotero_Import_Folder(importerOptions);
		await importer.translate({
			libraryID: Zotero.Libraries.userLibraryID,
			linkFiles: true,
		});
		return importer;
	};

	before(async () => {
		tmpDir = await getTempDirectory();

		// Directories are created parent-first
		const directories = [
			[uc('dir1')],
			[uc('dir1'), uc('subdir1')],
			[uc('dir2')],
		];
		for (const segments of directories) {
			await OS.File.makeDir(OS.Path.join(tmpDir, ...segments)); // eslint-disable-line no-await-in-loop
		}

		// [fixture file name, destination directory segments below tmpDir]
		const fixtures = [
			['recognizePDF_test_title.pdf', []],
			['recognizePDF_test_title.pdf', [uc('dir1')]],
			['recognizePDF_test_arXiv.pdf', [uc('dir1'), uc('subdir1')]],
			['recognizePDF_test_title.pdf', [uc('dir2')]],
			['test.png', [uc('dir2')]],
			['test.html', [uc('dir2')]],
			['test.txt', [uc('dir2')]],
		];
		for (const [fileName, segments] of fixtures) {
			await OS.File.copy( // eslint-disable-line no-await-in-loop
				OS.Path.join(getTestDataDirectory().path, fileName),
				OS.Path.join(tmpDir, ...segments, fileName)
			);
		}

		Components.utils.import('chrome://zotero/content/import/folderImport.js');
	});

	describe('#import', () => {
		it('should import PDFs from a folder and recreate structure without creating duplicates', async function () {
			this.timeout(30000);
			if (Zotero.automatedTest) {
				this.skip();
			}

			const expectedCollections = [uc('dir1'), uc('dir2'), uc('subdir1')];

			const importer = await runImport({
				folder: tmpDir,
				recreateStructure: true,
			});

			// Three copies of the same PDF plus one distinct PDF yield two new attachments
			assert.equal(importer.newItems.length, 2);

			const arXivAttachment = findByTitle(importer.newItems, 'recognizePDF_test_arXiv.pdf');
			const arXivItem = await Zotero.Items.getAsync(arXivAttachment.parentID);
			const arXivCollections = await Zotero.Collections.getAsync(arXivItem.getCollections());
			assert.equal(arXivItem.getField('title'), 'Scaling study of an improved fermion action on quenched lattices');
			assert.equal(arXivCollections.length, 1);
			assert.equal(arXivCollections[0].name, uc('subdir1'));
			assert.equal((await Zotero.Collections.getAsync(arXivCollections[0].parentID)).name, uc('dir1'));

			const titleAttachment = findByTitle(importer.newItems, 'recognizePDF_test_title.pdf');
			const titleItem = await Zotero.Items.getAsync(titleAttachment.parentID);
			const titleCollections = await Zotero.Collections.getAsync(titleItem.getCollections());
			assert.equal(titleItem.getField('title'), 'Bitcoin: A Peer-to-Peer Electronic Cash System');
			assert.equal(titleCollections.length, 2);
			assert.sameMembers(titleCollections.map(c => c.name), [uc('dir1'), uc('dir2')]);

			assert.sameMembers(importedCollectionNames(), expectedCollections);

			// Importing the same folder again must add neither items nor collections
			const importer2 = await runImport({
				folder: tmpDir,
				recreateStructure: true,
			});

			assert.lengthOf(importer2.newItems, 0);
			assert.sameMembers(importedCollectionNames(), expectedCollections);
		});

		it('should only import specified file types from a folder', async function () {
			this.timeout(30000);
			if (Zotero.automatedTest) {
				this.skip();
			}

			const importer = await runImport({
				folder: tmpDir,
				recreateStructure: false,
				fileTypes: '*.png,*.txt',
				mimeTypes: []
			});

			assert.equal(importer.newItems.length, 2);

			const pngItem = findByTitle(importer.newItems, 'test.png');
			assert.isDefined(pngItem);
			assert.isFalse(pngItem.parentID);

			const txtItem = findByTitle(importer.newItems, 'test.txt');
			assert.isDefined(txtItem);
			assert.isFalse(txtItem.parentID);

			const htmlItem = findByTitle(importer.newItems, 'test.html');
			assert.isUndefined(htmlItem);
		});
	});
});