Get attachment text on-demand if not cached in Item::attachmentText

Follow-up to 58f515058 with a better approach: if there is no full-text
cache file, just get the text directly without indexing. In the one existing use
of `attachmentText`, attachment merging, this is better anyway, because
we might be deleting the file, so there's no point wasting time
inserting words into the database.
This commit is contained in:
Dan Stillman 2022-03-12 20:17:13 -05:00
parent 46ff3cf4fb
commit 3ec883a7f6
4 changed files with 54 additions and 35 deletions

View file

@ -3502,7 +3502,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentHash', {
* @return {Promise<String>} - A promise for attachment text or empty string if unavailable * @return {Promise<String>} - A promise for attachment text or empty string if unavailable
*/ */
Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', { Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
get: Zotero.Promise.coroutine(function* () { get: async function () {
if (!this.isAttachment()) { if (!this.isAttachment()) {
return undefined; return undefined;
} }
@ -3511,53 +3511,65 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
return null; return null;
} }
var file = this.getFile(); var path = await this.getFilePathAsync();
if (!(yield OS.File.exists(file.path))) {
file = false;
}
var cacheFile = Zotero.Fulltext.getItemCacheFile(this);
if (!file) {
if (cacheFile.exists()) {
var str = yield Zotero.File.getContentsAsync(cacheFile);
return str.trim();
}
return '';
}
var contentType = this.attachmentContentType; var contentType = this.attachmentContentType;
if (!contentType) { if (!contentType) {
contentType = yield Zotero.MIME.getMIMETypeFromFile(file); if (!path) {
if (contentType) { Zotero.debug(`Can't get attachment text for item ${this.libraryKey}`);
this.attachmentContentType = contentType; return '';
yield this.save();
} }
contentType = await Zotero.MIME.getMIMETypeFromFile(path);
} }
var str; var str;
if (Zotero.Fulltext.isCachedMIMEType(contentType)) { if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
if (!cacheFile.exists()) { // If no cache file or not fully indexed, get text on-demand
Zotero.debug(`Cache file doesn't exist for item ${this.libraryKey}-- returning empty .attachmentText`); let cacheFile = Zotero.Fulltext.getItemCacheFile(this);
return ''; if (!cacheFile.exists() || !await Zotero.FullText.isFullyIndexed(this)) {
// Use processor cache file if it exists
let processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(this).path;
if (await OS.File.exists(processorCacheFile)) {
let json = await Zotero.File.getContentsAsync(processorCacheFile);
let data = JSON.parse(json);
str = data.text;
}
// Otherwise extract text to temporary file and read that
else if (contentType == 'application/pdf') {
let tmpCacheFile = OS.Path.join(
Zotero.getTempDirectory().path, Zotero.Utilities.randomString()
);
let { exec, args } = Zotero.FullText.getPDFConverterExecAndArgs();
args.push(
'-nopgbrk',
path,
tmpCacheFile
);
await Zotero.Utilities.Internal.exec(exec, args);
if (!await OS.File.exists(tmpCacheFile)) {
Zotero.logError("Cache file not found after running PDF converter");
return '';
}
str = await Zotero.File.getContentsAsync(tmpCacheFile);
await OS.File.remove(tmpCacheFile);
}
else {
Zotero.logError("Unsupported cached file type in .attachmentText");
return '';
}
} }
// Return empty string if not fully indexed else {
if (!(yield Zotero.Fulltext.isFullyIndexed(this))) { str = await Zotero.File.getContentsAsync(cacheFile);
Zotero.debug("Item " + this.libraryKey + " is not fully indexed -- returning empty .attachmentText");
return '';
} }
str = yield Zotero.File.getContentsAsync(cacheFile);
} }
else if (contentType == 'text/html') { else if (contentType == 'text/html') {
str = yield Zotero.File.getContentsAsync(file); str = await Zotero.File.getContentsAsync(path);
str = Zotero.Utilities.unescapeHTML(str); str = Zotero.Utilities.unescapeHTML(str);
} }
else if (contentType == 'text/plain') { else if (contentType == 'text/plain') {
str = yield Zotero.File.getContentsAsync(file); str = await Zotero.File.getContentsAsync(path);
} }
else { else {
@ -3565,7 +3577,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
} }
return str.trim(); return str.trim();
}) }
}); });

View file

@ -1209,8 +1209,14 @@ Zotero.Items = function() {
Zotero.debug('_hashAttachmentText: Attachment too large'); Zotero.debug('_hashAttachmentText: Attachment too large');
return null; return null;
} }
let text = await attachment.attachmentText; let text;
try {
text = await attachment.attachmentText;
}
catch (e) {
Zotero.logError(e);
}
if (!text) { if (!text) {
Zotero.debug('_hashAttachmentText: Attachment has no text'); Zotero.debug('_hashAttachmentText: Attachment has no text');
return null; return null;

View file

@ -460,6 +460,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
var {exec, args} = this.getPDFConverterExecAndArgs(); var {exec, args} = this.getPDFConverterExecAndArgs();
// Keep in sync with Item::attachmentText
args.push('-nopgbrk'); args.push('-nopgbrk');
if (allPages) { if (allPages) {

View file

@ -474,7 +474,7 @@ describe("Zotero.Items", function () {
assert.isTrue(attachment2.deleted); assert.isTrue(attachment2.deleted);
}); });
it.skip("should merge identical attachments based on content hash when unindexed", async function () { it("should merge identical attachments based on content hash when unindexed", async function () {
let item1 = await createDataObject('item'); let item1 = await createDataObject('item');
let attachment1 = await importPDFAttachment(item1); let attachment1 = await importPDFAttachment(item1);