Get attachment text on-demand if not cached in Item::attachmentText
Follow-up to 58f515058
with a better approach: if there is no full-text cache
file, just get the text directly without indexing. For the one existing use
of `attachmentText`, attachment merging, this is better anyway: we
might be deleting the file, so there's no point wasting time
inserting words into the database.
This commit is contained in:
parent
46ff3cf4fb
commit
3ec883a7f6
4 changed files with 54 additions and 35 deletions
|
@ -3502,7 +3502,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentHash', {
|
||||||
* @return {Promise<String>} - A promise for attachment text or empty string if unavailable
|
* @return {Promise<String>} - A promise for attachment text or empty string if unavailable
|
||||||
*/
|
*/
|
||||||
Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
||||||
get: Zotero.Promise.coroutine(function* () {
|
get: async function () {
|
||||||
if (!this.isAttachment()) {
|
if (!this.isAttachment()) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
@ -3511,53 +3511,65 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
var file = this.getFile();
|
var path = await this.getFilePathAsync();
|
||||||
|
|
||||||
if (!(yield OS.File.exists(file.path))) {
|
|
||||||
file = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
var cacheFile = Zotero.Fulltext.getItemCacheFile(this);
|
|
||||||
if (!file) {
|
|
||||||
if (cacheFile.exists()) {
|
|
||||||
var str = yield Zotero.File.getContentsAsync(cacheFile);
|
|
||||||
|
|
||||||
return str.trim();
|
|
||||||
}
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
var contentType = this.attachmentContentType;
|
var contentType = this.attachmentContentType;
|
||||||
if (!contentType) {
|
if (!contentType) {
|
||||||
contentType = yield Zotero.MIME.getMIMETypeFromFile(file);
|
if (!path) {
|
||||||
if (contentType) {
|
Zotero.debug(`Can't get attachment text for item ${this.libraryKey}`);
|
||||||
this.attachmentContentType = contentType;
|
return '';
|
||||||
yield this.save();
|
|
||||||
}
|
}
|
||||||
|
contentType = await Zotero.MIME.getMIMETypeFromFile(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
var str;
|
var str;
|
||||||
if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
|
if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
|
||||||
if (!cacheFile.exists()) {
|
// If no cache file or not fully indexed, get text on-demand
|
||||||
Zotero.debug(`Cache file doesn't exist for item ${this.libraryKey}-- returning empty .attachmentText`);
|
let cacheFile = Zotero.Fulltext.getItemCacheFile(this);
|
||||||
|
if (!cacheFile.exists() || !await Zotero.FullText.isFullyIndexed(this)) {
|
||||||
|
// Use processor cache file if it exists
|
||||||
|
let processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(this).path;
|
||||||
|
if (await OS.File.exists(processorCacheFile)) {
|
||||||
|
let json = await Zotero.File.getContentsAsync(processorCacheFile);
|
||||||
|
let data = JSON.parse(json);
|
||||||
|
str = data.text;
|
||||||
|
}
|
||||||
|
// Otherwise extract text to temporary file and read that
|
||||||
|
else if (contentType == 'application/pdf') {
|
||||||
|
let tmpCacheFile = OS.Path.join(
|
||||||
|
Zotero.getTempDirectory().path, Zotero.Utilities.randomString()
|
||||||
|
);
|
||||||
|
let { exec, args } = Zotero.FullText.getPDFConverterExecAndArgs();
|
||||||
|
args.push(
|
||||||
|
'-nopgbrk',
|
||||||
|
path,
|
||||||
|
tmpCacheFile
|
||||||
|
);
|
||||||
|
await Zotero.Utilities.Internal.exec(exec, args);
|
||||||
|
if (!await OS.File.exists(tmpCacheFile)) {
|
||||||
|
Zotero.logError("Cache file not found after running PDF converter");
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
// Return empty string if not fully indexed
|
str = await Zotero.File.getContentsAsync(tmpCacheFile);
|
||||||
if (!(yield Zotero.Fulltext.isFullyIndexed(this))) {
|
await OS.File.remove(tmpCacheFile);
|
||||||
Zotero.debug("Item " + this.libraryKey + " is not fully indexed -- returning empty .attachmentText");
|
}
|
||||||
|
else {
|
||||||
|
Zotero.logError("Unsupported cached file type in .attachmentText");
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
}
|
||||||
str = yield Zotero.File.getContentsAsync(cacheFile);
|
else {
|
||||||
|
str = await Zotero.File.getContentsAsync(cacheFile);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (contentType == 'text/html') {
|
else if (contentType == 'text/html') {
|
||||||
str = yield Zotero.File.getContentsAsync(file);
|
str = await Zotero.File.getContentsAsync(path);
|
||||||
str = Zotero.Utilities.unescapeHTML(str);
|
str = Zotero.Utilities.unescapeHTML(str);
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (contentType == 'text/plain') {
|
else if (contentType == 'text/plain') {
|
||||||
str = yield Zotero.File.getContentsAsync(file);
|
str = await Zotero.File.getContentsAsync(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
else {
|
else {
|
||||||
|
@ -3565,7 +3577,7 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentText', {
|
||||||
}
|
}
|
||||||
|
|
||||||
return str.trim();
|
return str.trim();
|
||||||
})
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1210,7 +1210,13 @@ Zotero.Items = function() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
let text = await attachment.attachmentText;
|
let text;
|
||||||
|
try {
|
||||||
|
text = await attachment.attachmentText;
|
||||||
|
}
|
||||||
|
catch (e) {
|
||||||
|
Zotero.logError(e);
|
||||||
|
}
|
||||||
if (!text) {
|
if (!text) {
|
||||||
Zotero.debug('_hashAttachmentText: Attachment has no text');
|
Zotero.debug('_hashAttachmentText: Attachment has no text');
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -460,6 +460,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
|
|
||||||
|
|
||||||
var {exec, args} = this.getPDFConverterExecAndArgs();
|
var {exec, args} = this.getPDFConverterExecAndArgs();
|
||||||
|
// Keep in sync with Item::attachmentText
|
||||||
args.push('-nopgbrk');
|
args.push('-nopgbrk');
|
||||||
|
|
||||||
if (allPages) {
|
if (allPages) {
|
||||||
|
|
|
@ -474,7 +474,7 @@ describe("Zotero.Items", function () {
|
||||||
assert.isTrue(attachment2.deleted);
|
assert.isTrue(attachment2.deleted);
|
||||||
});
|
});
|
||||||
|
|
||||||
it.skip("should merge identical attachments based on content hash when unindexed", async function () {
|
it("should merge identical attachments based on content hash when unindexed", async function () {
|
||||||
let item1 = await createDataObject('item');
|
let item1 = await createDataObject('item');
|
||||||
let attachment1 = await importPDFAttachment(item1);
|
let attachment1 = await importPDFAttachment(item1);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue