Get attachment text on-demand if not cached in Item::attachmentText

Follow-up to 58f515058 with a better approach: if no full-text cache
file, just get text directly without indexing. In the one existing use
of `attachmentText`, attachment merging, this is better anyway, because
we might be deleting the file, so there's no point wasting time
inserting words into the database.
This commit is contained in:
Dan Stillman 2022-03-12 20:17:13 -05:00
parent 46ff3cf4fb
commit 3ec883a7f6
4 changed files with 54 additions and 35 deletions

View file

@ -3502,7 +3502,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentHash', {
* @return {Promise<String>} - A promise for attachment text or empty string if unavailable
*/
Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
get: Zotero.Promise.coroutine(function* () {
get: async function () {
if (!this.isAttachment()) {
return undefined;
}
@ -3511,53 +3511,65 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
return null;
}
var file = this.getFile();
if (!(yield OS.File.exists(file.path))) {
file = false;
}
var cacheFile = Zotero.Fulltext.getItemCacheFile(this);
if (!file) {
if (cacheFile.exists()) {
var str = yield Zotero.File.getContentsAsync(cacheFile);
return str.trim();
}
return '';
}
var path = await this.getFilePathAsync();
var contentType = this.attachmentContentType;
if (!contentType) {
contentType = yield Zotero.MIME.getMIMETypeFromFile(file);
if (contentType) {
this.attachmentContentType = contentType;
yield this.save();
if (!path) {
Zotero.debug(`Can't get attachment text for item ${this.libraryKey}`);
return '';
}
contentType = await Zotero.MIME.getMIMETypeFromFile(path);
}
var str;
if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
if (!cacheFile.exists()) {
Zotero.debug(`Cache file doesn't exist for item ${this.libraryKey}-- returning empty .attachmentText`);
// If no cache file or not fully indexed, get text on-demand
let cacheFile = Zotero.Fulltext.getItemCacheFile(this);
if (!cacheFile.exists() || !await Zotero.FullText.isFullyIndexed(this)) {
// Use processor cache file if it exists
let processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(this).path;
if (await OS.File.exists(processorCacheFile)) {
let json = await Zotero.File.getContentsAsync(processorCacheFile);
let data = JSON.parse(json);
str = data.text;
}
// Otherwise extract text to temporary file and read that
else if (contentType == 'application/pdf') {
let tmpCacheFile = OS.Path.join(
Zotero.getTempDirectory().path, Zotero.Utilities.randomString()
);
let { exec, args } = Zotero.FullText.getPDFConverterExecAndArgs();
args.push(
'-nopgbrk',
path,
tmpCacheFile
);
await Zotero.Utilities.Internal.exec(exec, args);
if (!await OS.File.exists(tmpCacheFile)) {
Zotero.logError("Cache file not found after running PDF converter");
return '';
}
// Return empty string if not fully indexed
if (!(yield Zotero.Fulltext.isFullyIndexed(this))) {
Zotero.debug("Item " + this.libraryKey + " is not fully indexed -- returning empty .attachmentText");
str = await Zotero.File.getContentsAsync(tmpCacheFile);
await OS.File.remove(tmpCacheFile);
}
else {
Zotero.logError("Unsupported cached file type in .attachmentText");
return '';
}
str = yield Zotero.File.getContentsAsync(cacheFile);
}
else {
str = await Zotero.File.getContentsAsync(cacheFile);
}
}
else if (contentType == 'text/html') {
str = yield Zotero.File.getContentsAsync(file);
str = await Zotero.File.getContentsAsync(path);
str = Zotero.Utilities.unescapeHTML(str);
}
else if (contentType == 'text/plain') {
str = yield Zotero.File.getContentsAsync(file);
str = await Zotero.File.getContentsAsync(path);
}
else {
@ -3565,7 +3577,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
}
return str.trim();
})
}
});

View file

@ -1210,7 +1210,13 @@ Zotero.Items = function() {
return null;
}
let text = await attachment.attachmentText;
let text;
try {
text = await attachment.attachmentText;
}
catch (e) {
Zotero.logError(e);
}
if (!text) {
Zotero.debug('_hashAttachmentText: Attachment has no text');
return null;

View file

@ -460,6 +460,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
var {exec, args} = this.getPDFConverterExecAndArgs();
// Keep in sync with Item::attachmentText
args.push('-nopgbrk');
if (allPages) {

View file

@ -474,7 +474,7 @@ describe("Zotero.Items", function () {
assert.isTrue(attachment2.deleted);
});
it.skip("should merge identical attachments based on content hash when unindexed", async function () {
it("should merge identical attachments based on content hash when unindexed", async function () {
let item1 = await createDataObject('item');
let attachment1 = await importPDFAttachment(item1);