Get attachment text on-demand if not cached in Item::attachmentText
Follow-up to 58f515058 with a better approach: if there is no full-text
cache file, just get the text directly without indexing. For the one
existing use of `attachmentText`, attachment merging, this is better
anyway: we might be about to delete the file, so there's no point
wasting time inserting words into the database.
This commit is contained in:
parent
46ff3cf4fb
commit
3ec883a7f6
4 changed files with 54 additions and 35 deletions
|
@ -3502,7 +3502,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentHash', {
|
|||
* @return {Promise<String>} - A promise for attachment text or empty string if unavailable
|
||||
*/
|
||||
Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
||||
get: Zotero.Promise.coroutine(function* () {
|
||||
get: async function () {
|
||||
if (!this.isAttachment()) {
|
||||
return undefined;
|
||||
}
|
||||
|
@ -3511,53 +3511,65 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
|||
return null;
|
||||
}
|
||||
|
||||
var file = this.getFile();
|
||||
|
||||
if (!(yield OS.File.exists(file.path))) {
|
||||
file = false;
|
||||
}
|
||||
|
||||
var cacheFile = Zotero.Fulltext.getItemCacheFile(this);
|
||||
if (!file) {
|
||||
if (cacheFile.exists()) {
|
||||
var str = yield Zotero.File.getContentsAsync(cacheFile);
|
||||
|
||||
return str.trim();
|
||||
}
|
||||
return '';
|
||||
}
|
||||
var path = await this.getFilePathAsync();
|
||||
|
||||
var contentType = this.attachmentContentType;
|
||||
if (!contentType) {
|
||||
contentType = yield Zotero.MIME.getMIMETypeFromFile(file);
|
||||
if (contentType) {
|
||||
this.attachmentContentType = contentType;
|
||||
yield this.save();
|
||||
if (!path) {
|
||||
Zotero.debug(`Can't get attachment text for item ${this.libraryKey}`);
|
||||
return '';
|
||||
}
|
||||
contentType = await Zotero.MIME.getMIMETypeFromFile(path);
|
||||
}
|
||||
|
||||
var str;
|
||||
if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
|
||||
if (!cacheFile.exists()) {
|
||||
Zotero.debug(`Cache file doesn't exist for item ${this.libraryKey}-- returning empty .attachmentText`);
|
||||
// If no cache file or not fully indexed, get text on-demand
|
||||
let cacheFile = Zotero.Fulltext.getItemCacheFile(this);
|
||||
if (!cacheFile.exists() || !await Zotero.FullText.isFullyIndexed(this)) {
|
||||
// Use processor cache file if it exists
|
||||
let processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(this).path;
|
||||
if (await OS.File.exists(processorCacheFile)) {
|
||||
let json = await Zotero.File.getContentsAsync(processorCacheFile);
|
||||
let data = JSON.parse(json);
|
||||
str = data.text;
|
||||
}
|
||||
// Otherwise extract text to temporary file and read that
|
||||
else if (contentType == 'application/pdf') {
|
||||
let tmpCacheFile = OS.Path.join(
|
||||
Zotero.getTempDirectory().path, Zotero.Utilities.randomString()
|
||||
);
|
||||
let { exec, args } = Zotero.FullText.getPDFConverterExecAndArgs();
|
||||
args.push(
|
||||
'-nopgbrk',
|
||||
path,
|
||||
tmpCacheFile
|
||||
);
|
||||
await Zotero.Utilities.Internal.exec(exec, args);
|
||||
if (!await OS.File.exists(tmpCacheFile)) {
|
||||
Zotero.logError("Cache file not found after running PDF converter");
|
||||
return '';
|
||||
}
|
||||
// Return empty string if not fully indexed
|
||||
if (!(yield Zotero.Fulltext.isFullyIndexed(this))) {
|
||||
Zotero.debug("Item " + this.libraryKey + " is not fully indexed -- returning empty .attachmentText");
|
||||
str = await Zotero.File.getContentsAsync(tmpCacheFile);
|
||||
await OS.File.remove(tmpCacheFile);
|
||||
}
|
||||
else {
|
||||
Zotero.logError("Unsupported cached file type in .attachmentText");
|
||||
return '';
|
||||
}
|
||||
|
||||
str = yield Zotero.File.getContentsAsync(cacheFile);
|
||||
}
|
||||
else {
|
||||
str = await Zotero.File.getContentsAsync(cacheFile);
|
||||
}
|
||||
}
|
||||
|
||||
else if (contentType == 'text/html') {
|
||||
str = yield Zotero.File.getContentsAsync(file);
|
||||
str = await Zotero.File.getContentsAsync(path);
|
||||
str = Zotero.Utilities.unescapeHTML(str);
|
||||
}
|
||||
|
||||
else if (contentType == 'text/plain') {
|
||||
str = yield Zotero.File.getContentsAsync(file);
|
||||
str = await Zotero.File.getContentsAsync(path);
|
||||
}
|
||||
|
||||
else {
|
||||
|
@ -3565,7 +3577,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
|||
}
|
||||
|
||||
return str.trim();
|
||||
})
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
|
|
|
@ -1210,7 +1210,13 @@ Zotero.Items = function() {
|
|||
return null;
|
||||
}
|
||||
|
||||
let text = await attachment.attachmentText;
|
||||
let text;
|
||||
try {
|
||||
text = await attachment.attachmentText;
|
||||
}
|
||||
catch (e) {
|
||||
Zotero.logError(e);
|
||||
}
|
||||
if (!text) {
|
||||
Zotero.debug('_hashAttachmentText: Attachment has no text');
|
||||
return null;
|
||||
|
|
|
@ -460,6 +460,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
|||
|
||||
|
||||
var {exec, args} = this.getPDFConverterExecAndArgs();
|
||||
// Keep in sync with Item::attachmentText
|
||||
args.push('-nopgbrk');
|
||||
|
||||
if (allPages) {
|
||||
|
|
|
@ -474,7 +474,7 @@ describe("Zotero.Items", function () {
|
|||
assert.isTrue(attachment2.deleted);
|
||||
});
|
||||
|
||||
it.skip("should merge identical attachments based on content hash when unindexed", async function () {
|
||||
it("should merge identical attachments based on content hash when unindexed", async function () {
|
||||
let item1 = await createDataObject('item');
|
||||
let attachment1 = await importPDFAttachment(item1);
|
||||
|
||||
|
|
Loading…
Reference in a new issue