Merge attachments and update notes (#2336)

We follow a different merge procedure for each attachment type:

- For PDF attachments, compare by MD5. If no match, get the top 50 words
  in the attachment's text and hash those, then check again for a match.
  Update references to item keys in notes and annotations.
- For web (snapshot / link) attachments, compare by title and URL.
  Prefer a title + URL match but accept a title-only match.
- For other attachment types, keep all attachments from all items being
  merged.

Also:

- Move most merge tests from Duplicates to Items#merge(). It just doesn't
  make sense to worry about the UI in these.
This commit is contained in:
Abe Jellinek 2022-03-09 22:26:26 +00:00 committed by GitHub
parent 8e8b03e5ff
commit ef82becf00
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 753 additions and 95 deletions

View file

@ -413,6 +413,371 @@ describe("Zotero.Items", function () {
var rels = item3.getRelationsByPredicate(predicate);
assert.deepEqual(rels, [item2URI]);
})
it("should merge identical attachments based on file hash", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importPDFAttachment(item1);
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importPDFAttachment(item2);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
});
it("should merge one attachment per item into the master attachment", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importPDFAttachment(item1);
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importPDFAttachment(item2);
let item3 = item1.clone();
await item3.saveTx();
let attachment3 = await importPDFAttachment(item3);
await Zotero.Items.merge(item1, [item2, item3]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
assert.isTrue(item3.deleted);
assert.isTrue(attachment3.deleted);
});
it("should merge identical attachments based on content hash", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importPDFAttachment(item1);
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importFileAttachment('duplicatesMerge_test_new_md5.pdf', { parentItemID: item2.id });
assert.equal(await attachment1.attachmentText, await attachment2.attachmentText);
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
});
it("shouldn't merge based on content hash when files are empty", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importFileAttachment('empty.pdf', { parentItemID: item1.id });
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importFileAttachment('duplicatesMerge_empty_new_md5.pdf', { parentItemID: item2.id });
assert.equal(await attachment1.attachmentText, await attachment2.attachmentText);
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
assert.isEmpty(await attachment1.attachmentText);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 2);
assert.isTrue(item2.deleted);
assert.isFalse(attachment2.deleted);
});
it("should allow small differences when hashing content", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importFileAttachment('duplicatesMerge_JSTOR_1.pdf', { parentItemID: item1.id });
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importFileAttachment('duplicatesMerge_JSTOR_2.pdf', { parentItemID: item2.id });
assert.notEqual(await attachment1.attachmentText, await attachment2.attachmentText);
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
assert.equal(
(await Zotero.Items._hashAttachmentText(attachment1)).fromText,
(await Zotero.Items._hashAttachmentText(attachment2)).fromText
);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
});
it("should keep similar but not identical attachments separate", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importFileAttachment('wonderland_short.pdf', { parentItemID: item1.id });
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importFileAttachment('wonderland_long.pdf', { parentItemID: item2.id });
assert.notEqual(await attachment1.attachmentText, await attachment2.attachmentText);
assert.notEqual(await attachment1.attachmentHash, await attachment2.attachmentHash);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 2);
assert.isTrue(item2.deleted);
assert.isFalse(attachment2.deleted);
});
it("should only match attachments one-to-one", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importFileAttachment('wonderland_short_watermarked_1.pdf', { parentItemID: item1.id });
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importFileAttachment('wonderland_short_watermarked_2.pdf', { parentItemID: item2.id });
let attachment3 = await importFileAttachment('wonderland_short_watermarked_2.pdf', { parentItemID: item2.id });
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 2);
assert.isTrue(item2.deleted);
// Doesn't matter which got merged
assert.isTrue((attachment2.deleted || attachment3.deleted) && !(attachment2.deleted && attachment3.deleted));
});
it("should copy annotations when merging", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importPDFAttachment(item1);
let annotation1 = await createAnnotation('note', attachment1);
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importPDFAttachment(item2);
let annotation2 = await createAnnotation('highlight', attachment2);
let annotation2Note = await Zotero.EditorInstance.createNoteFromAnnotations([annotation2], item2.id);
assert.include(annotation2Note.getNote(), attachment2.key);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.isFalse(annotation1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
assert.isFalse(annotation2.deleted);
assert.equal(annotation1.parentItemID, attachment1.id);
assert.equal(annotation2.parentItemID, attachment1.id);
assert.notInclude(annotation2Note.getNote(), item2.key);
assert.include(annotation2Note.getNote(), item1.key);
assert.notInclude(annotation2Note.getNote(), attachment2.key);
assert.include(annotation2Note.getNote(), attachment1.key);
});
it("should update all item keys when moving notes", async function () {
let attachmentFilenames = [
'recognizePDF_test_arXiv.pdf',
'recognizePDF_test_DOI.pdf',
'recognizePDF_test_title.pdf'
];
let item1 = await createDataObject('item', { setTitle: true });
let attachments1 = [];
for (let filename of attachmentFilenames) {
let attachment = await importFileAttachment(filename, { parentID: item1.id });
attachments1.push(attachment);
}
let item2 = item1.clone();
await item2.saveTx();
let attachments2 = [];
let annotations2 = [];
let notes2 = [];
for (let filename of attachmentFilenames) {
let attachment = await importFileAttachment(filename, { parentID: item2.id });
let annotation = await createAnnotation('highlight', attachment);
let note = await Zotero.EditorInstance.createNoteFromAnnotations([annotation], item2.id);
attachments2.push(attachment);
annotations2.push(annotation);
notes2.push(note);
assert.include(note.getNote(), item2.key);
assert.include(note.getNote(), attachment.key);
}
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.equal(item1.numAttachments(true), 3);
assert.isTrue(item2.deleted);
for (let i = 0; i < 3; i++) {
let attachment1 = attachments1[i];
let attachment2 = attachments2[i];
let note = notes2[i];
assert.equal(note.parentItemID, item1.id);
assert.include(note.getNote(), item1.key);
assert.notInclude(note.getNote(), item2.key);
assert.include(note.getNote(), attachment1.key);
assert.notInclude(note.getNote(), attachment2.key);
}
});
it("should merge snapshots with the same title, even if URL differs", async function () {
let content = getTestDataDirectory();
content.append('snapshot');
content.append('index.html');
let snapshotContent = await Zotero.File.getContentsAsync(content);
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await Zotero.Attachments.importFromSnapshotContent({
parentItemID: item1.id,
url: 'https://example.com/test.html',
title: 'Snapshot',
snapshotContent
});
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await Zotero.Attachments.importFromSnapshotContent({
parentItemID: item2.id,
url: 'https://otherdomain.example.com/test.html',
title: 'Snapshot',
snapshotContent
});
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
});
it("should merge linked URLs", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Catalog Entry',
parentItemID: item1.id
});
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Catalog Entry',
parentItemID: item2.id
});
let attachment3 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Catalog Entry',
parentItemID: item2.id
});
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(attachment1.getField('url'), 'https://example.com/');
assert.equal(item1.numAttachments(true), 2);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
assert.equal(attachment3.parentItemID, item1.id);
assert.isFalse(attachment3.deleted);
});
it("should keep web attachment with same URL but different title", async function () {
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Catalog Entry',
parentItemID: item1.id
});
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Official Website',
parentItemID: item2.id
});
let attachment3 = await Zotero.Attachments.linkFromURL({
url: 'https://example.com/',
title: 'Catalog Entry',
parentItemID: item2.id
});
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(attachment1.getField('url'), 'https://example.com/');
assert.equal(item1.numAttachments(true), 2);
assert.isTrue(item2.deleted);
assert.equal(attachment2.parentItemID, item1.id);
assert.isFalse(attachment2.deleted);
assert.isTrue(attachment3.deleted);
});
it("should move related items of merged attachments", async function () {
let relatedItem = await createDataObject('item');
let item1 = await createDataObject('item', { setTitle: true });
let attachment1 = await importPDFAttachment(item1);
let item2 = item1.clone();
await item2.saveTx();
let attachment2 = await importPDFAttachment(item2);
attachment2.addRelatedItem(relatedItem);
await Zotero.Items.merge(item1, [item2]);
assert.isFalse(item1.deleted);
assert.isFalse(attachment1.deleted);
assert.equal(item1.numAttachments(true), 1);
assert.isTrue(item2.deleted);
assert.isTrue(attachment2.deleted);
assert.lengthOf(attachment1.relatedItems, 1);
assert.equal(attachment1.relatedItems[0], relatedItem.key);
});
it("should move merge-tracking relation from replaced attachment to master attachment", async function () {
let item1 = await createDataObject('item');
let attachment1 = await importPDFAttachment(item1);
let item2 = await createDataObject('item');
let attachment2 = await importPDFAttachment(item2);
let attachment2URI = Zotero.URI.getItemURI(attachment2);
let item3 = await createDataObject('item');
let attachment3 = await importPDFAttachment(item3);
let attachment3URI = Zotero.URI.getItemURI(attachment3);
await Zotero.Items.merge(item2, [item3]);
await Zotero.Items.merge(item1, [item2]);
var rels = attachment1.getRelationsByPredicate(Zotero.Relations.replacedItemPredicate);
assert.lengthOf(rels, 2);
assert.sameMembers(rels, [attachment2URI, attachment3URI]);
});
})