Merge attachments and update notes (#2336)
We follow a different merge procedure for each attachment type: - For PDF attachments, compare by MD5. If no match, get the top 50 words in the attachment's text and hash those, then check again for a match. Update references to item keys in notes and annotations. - For web (snapshot / link) attachments, compare by title and URL. Prefer a title + URL match but accept a title-only match. - For other attachment types, keep all attachments from all items being merged. Also: - Move most merge tests from Duplicates to Items#merge(). It just doesn't make sense to worry about the UI in these.
This commit is contained in:
parent
8e8b03e5ff
commit
ef82becf00
13 changed files with 753 additions and 95 deletions
|
@ -413,6 +413,371 @@ describe("Zotero.Items", function () {
|
|||
var rels = item3.getRelationsByPredicate(predicate);
|
||||
assert.deepEqual(rels, [item2URI]);
|
||||
})
|
||||
|
||||
it("should merge identical attachments based on file hash", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importPDFAttachment(item2);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("should merge one attachment per item into the master attachment", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importPDFAttachment(item2);
|
||||
|
||||
let item3 = item1.clone();
|
||||
await item3.saveTx();
|
||||
let attachment3 = await importPDFAttachment(item3);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2, item3]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
assert.isTrue(item3.deleted);
|
||||
assert.isTrue(attachment3.deleted);
|
||||
});
|
||||
|
||||
it("should merge identical attachments based on content hash", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importFileAttachment('duplicatesMerge_test_new_md5.pdf', { parentItemID: item2.id });
|
||||
|
||||
assert.equal(await attachment1.attachmentText, await attachment2.attachmentText);
|
||||
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("shouldn't merge based on content hash when files are empty", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importFileAttachment('empty.pdf', { parentItemID: item1.id });
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importFileAttachment('duplicatesMerge_empty_new_md5.pdf', { parentItemID: item2.id });
|
||||
|
||||
assert.equal(await attachment1.attachmentText, await attachment2.attachmentText);
|
||||
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
|
||||
assert.isEmpty(await attachment1.attachmentText);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 2);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isFalse(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("should allow small differences when hashing content", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importFileAttachment('duplicatesMerge_JSTOR_1.pdf', { parentItemID: item1.id });
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importFileAttachment('duplicatesMerge_JSTOR_2.pdf', { parentItemID: item2.id });
|
||||
|
||||
assert.notEqual(await attachment1.attachmentText, await attachment2.attachmentText);
|
||||
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
|
||||
assert.equal(
|
||||
(await Zotero.Items._hashAttachmentText(attachment1)).fromText,
|
||||
(await Zotero.Items._hashAttachmentText(attachment2)).fromText
|
||||
);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("should keep similar but not identical attachments separate", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importFileAttachment('wonderland_short.pdf', { parentItemID: item1.id });
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importFileAttachment('wonderland_long.pdf', { parentItemID: item2.id });
|
||||
|
||||
assert.notEqual(await attachment1.attachmentText, await attachment2.attachmentText);
|
||||
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 2);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isFalse(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("should only match attachments one-to-one", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importFileAttachment('wonderland_short_watermarked_1.pdf', { parentItemID: item1.id });
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importFileAttachment('wonderland_short_watermarked_2.pdf', { parentItemID: item2.id });
|
||||
let attachment3 = await importFileAttachment('wonderland_short_watermarked_2.pdf', { parentItemID: item2.id });
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 2);
|
||||
assert.isTrue(item2.deleted);
|
||||
// Doesn't matter which got merged
|
||||
assert.isTrue((attachment2.deleted || attachment3.deleted) && !(attachment2.deleted && attachment3.deleted));
|
||||
});
|
||||
|
||||
it("should copy annotations when merging", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
let annotation1 = await createAnnotation('note', attachment1);
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importPDFAttachment(item2);
|
||||
let annotation2 = await createAnnotation('highlight', attachment2);
|
||||
let annotation2Note = await Zotero.EditorInstance.createNoteFromAnnotations([annotation2], item2.id);
|
||||
|
||||
assert.include(annotation2Note.getNote(), attachment2.key);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.isFalse(annotation1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
assert.isFalse(annotation2.deleted);
|
||||
assert.equal(annotation1.parentItemID, attachment1.id);
|
||||
assert.equal(annotation2.parentItemID, attachment1.id);
|
||||
assert.notInclude(annotation2Note.getNote(), item2.key);
|
||||
assert.include(annotation2Note.getNote(), item1.key);
|
||||
assert.notInclude(annotation2Note.getNote(), attachment2.key);
|
||||
assert.include(annotation2Note.getNote(), attachment1.key);
|
||||
});
|
||||
|
||||
it("should update all item keys when moving notes", async function () {
|
||||
let attachmentFilenames = [
|
||||
'recognizePDF_test_arXiv.pdf',
|
||||
'recognizePDF_test_DOI.pdf',
|
||||
'recognizePDF_test_title.pdf'
|
||||
];
|
||||
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachments1 = [];
|
||||
for (let filename of attachmentFilenames) {
|
||||
let attachment = await importFileAttachment(filename, { parentID: item1.id });
|
||||
attachments1.push(attachment);
|
||||
}
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachments2 = [];
|
||||
let annotations2 = [];
|
||||
let notes2 = [];
|
||||
for (let filename of attachmentFilenames) {
|
||||
let attachment = await importFileAttachment(filename, { parentID: item2.id });
|
||||
let annotation = await createAnnotation('highlight', attachment);
|
||||
let note = await Zotero.EditorInstance.createNoteFromAnnotations([annotation], item2.id);
|
||||
attachments2.push(attachment);
|
||||
annotations2.push(annotation);
|
||||
notes2.push(note);
|
||||
|
||||
assert.include(note.getNote(), item2.key);
|
||||
assert.include(note.getNote(), attachment.key);
|
||||
}
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 3);
|
||||
assert.isTrue(item2.deleted);
|
||||
|
||||
for (let i = 0; i < 3; i++) {
|
||||
let attachment1 = attachments1[i];
|
||||
let attachment2 = attachments2[i];
|
||||
let note = notes2[i];
|
||||
|
||||
assert.equal(note.parentItemID, item1.id);
|
||||
assert.include(note.getNote(), item1.key);
|
||||
assert.notInclude(note.getNote(), item2.key);
|
||||
assert.include(note.getNote(), attachment1.key);
|
||||
assert.notInclude(note.getNote(), attachment2.key);
|
||||
}
|
||||
});
|
||||
|
||||
it("should merge snapshots with the same title, even if URL differs", async function () {
|
||||
let content = getTestDataDirectory();
|
||||
content.append('snapshot');
|
||||
content.append('index.html');
|
||||
|
||||
let snapshotContent = await Zotero.File.getContentsAsync(content);
|
||||
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await Zotero.Attachments.importFromSnapshotContent({
|
||||
parentItemID: item1.id,
|
||||
url: 'https://example.com/test.html',
|
||||
title: 'Snapshot',
|
||||
snapshotContent
|
||||
});
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await Zotero.Attachments.importFromSnapshotContent({
|
||||
parentItemID: item2.id,
|
||||
url: 'https://otherdomain.example.com/test.html',
|
||||
title: 'Snapshot',
|
||||
snapshotContent
|
||||
});
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
});
|
||||
|
||||
it("should merge linked URLs", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Catalog Entry',
|
||||
parentItemID: item1.id
|
||||
});
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Catalog Entry',
|
||||
parentItemID: item2.id
|
||||
});
|
||||
let attachment3 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Catalog Entry',
|
||||
parentItemID: item2.id
|
||||
});
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(attachment1.getField('url'), 'https://example.com/');
|
||||
assert.equal(item1.numAttachments(true), 2);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
assert.equal(attachment3.parentItemID, item1.id);
|
||||
assert.isFalse(attachment3.deleted);
|
||||
});
|
||||
|
||||
it("should keep web attachment with same URL but different title", async function () {
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Catalog Entry',
|
||||
parentItemID: item1.id
|
||||
});
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Official Website',
|
||||
parentItemID: item2.id
|
||||
});
|
||||
let attachment3 = await Zotero.Attachments.linkFromURL({
|
||||
url: 'https://example.com/',
|
||||
title: 'Catalog Entry',
|
||||
parentItemID: item2.id
|
||||
});
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(attachment1.getField('url'), 'https://example.com/');
|
||||
assert.equal(item1.numAttachments(true), 2);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.equal(attachment2.parentItemID, item1.id);
|
||||
assert.isFalse(attachment2.deleted);
|
||||
assert.isTrue(attachment3.deleted);
|
||||
});
|
||||
|
||||
it("should move related items of merged attachments", async function () {
|
||||
let relatedItem = await createDataObject('item');
|
||||
|
||||
let item1 = await createDataObject('item', { setTitle: true });
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
let item2 = item1.clone();
|
||||
await item2.saveTx();
|
||||
let attachment2 = await importPDFAttachment(item2);
|
||||
attachment2.addRelatedItem(relatedItem);
|
||||
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
assert.isFalse(item1.deleted);
|
||||
assert.isFalse(attachment1.deleted);
|
||||
assert.equal(item1.numAttachments(true), 1);
|
||||
assert.isTrue(item2.deleted);
|
||||
assert.isTrue(attachment2.deleted);
|
||||
assert.lengthOf(attachment1.relatedItems, 1);
|
||||
assert.equal(attachment1.relatedItems[0], relatedItem.key);
|
||||
});
|
||||
|
||||
it("should move merge-tracking relation from replaced attachment to master attachment", async function () {
|
||||
let item1 = await createDataObject('item');
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
let item2 = await createDataObject('item');
|
||||
let attachment2 = await importPDFAttachment(item2);
|
||||
let attachment2URI = Zotero.URI.getItemURI(attachment2);
|
||||
|
||||
let item3 = await createDataObject('item');
|
||||
let attachment3 = await importPDFAttachment(item3);
|
||||
let attachment3URI = Zotero.URI.getItemURI(attachment3);
|
||||
|
||||
await Zotero.Items.merge(item2, [item3]);
|
||||
await Zotero.Items.merge(item1, [item2]);
|
||||
|
||||
var rels = attachment1.getRelationsByPredicate(Zotero.Relations.replacedItemPredicate);
|
||||
assert.lengthOf(rels, 2);
|
||||
assert.sameMembers(rels, [attachment2URI, attachment3URI]);
|
||||
});
|
||||
})
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue