Full-text indexing improvements

- Use full-text cache file from syncing if available when reindexing via
  info pane or Rebuild Index → Index Unindexed Items. Only discard it for
  full index rebuild. This allows Index Unindexed Items to be used to
  force immediate processing of queued content from syncing and avoids
  unnecessary syncing back of identical content. Previously, the cache
  file was used for a manual index only when the local file didn't exist.
- When rebuilding the index, don't clear indexed items whose local file is
  missing and whose stats are missing due to a pre-411180ef bug.
- indexItems() now takes an 'options' object as its second parameter
- Minor code cleanup
This commit is contained in:
Dan Stillman 2020-03-09 01:07:06 -04:00
parent 411180ef83
commit 76a1535a60
3 changed files with 183 additions and 42 deletions

View file

@ -257,7 +257,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
version ? parseInt(version) : 0,
synced ? parseInt(synced) : Zotero.FullText.SYNC_STATE_UNSYNCED
];
if (stats) {
for (let stat in stats) {
cols.push(stat);
@ -508,16 +507,28 @@ Zotero.Fulltext = Zotero.FullText = new function(){
/**
* @param {Integer[]|Integer} itemIDs - One or more itemIDs
* @param {Object} [options]
* @param {Boolean} [options.complete=false] - Ignore page/character limits
* @param {Boolean} [options.ignoreErrors=false] - Continue on error instead of throwing
*/
this.indexItems = Zotero.Promise.coroutine(function* (items, complete, ignoreErrors) {
if (!Array.isArray(items)) {
items = [items];
this.indexItems = async function (itemIDs, options = {}) {
var complete;
var ignoreErrors;
if (typeof options == 'boolean') {
Zotero.logError("indexItems() now takes an 'options' object -- please update your code");
complete = options;
ignoreErrors = arguments[2];
}
else {
complete = options.complete;
ignoreErrors = options.ignoreErrors;
}
var items = yield Zotero.Items.getAsync(items);
var found = [];
for (let i=0; i<items.length; i++) {
let item = items[i];
if (!Array.isArray(itemIDs)) {
itemIDs = [itemIDs];
}
var items = await Zotero.Items.getAsync(itemIDs);
for (let item of items) {
if (!item.isAttachment()) {
continue;
}
@ -525,32 +536,34 @@ Zotero.Fulltext = Zotero.FullText = new function(){
Zotero.debug("Indexing item " + item.libraryKey);
let itemID = item.id;
var path = yield item.getFilePathAsync();
// If there's a processor cache file from syncing, use it
let processorCacheFile = this.getItemProcessorCacheFile(item).path;
if (await OS.File.exists(processorCacheFile)) {
let indexed = await Zotero.Fulltext.indexFromProcessorCache(itemID);
if (indexed) {
continue;
}
}
var path = await item.getFilePathAsync();
if (!path) {
if (yield OS.File.exists(this.getItemProcessorCacheFile(item).path)) {
yield Zotero.Fulltext.indexFromProcessorCache(itemID);
}
else {
Zotero.debug("No file to index for item " + item.libraryKey
+ " in Zotero.FullText.indexItems()");
}
Zotero.debug("No file to index for item " + item.libraryKey);
continue;
}
try {
yield indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete);
await indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete);
}
catch (e) {
if (ignoreErrors) {
Components.utils.reportError("Error indexing " + path);
Zotero.logError("Error indexing " + path);
Zotero.logError(e);
continue;
}
else {
throw e;
}
throw e;
}
}
});
};
// TEMP: Temporary mechanism to serialize indexing of new attachments
@ -581,7 +594,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
_indexing = true;
var itemID = _queue.shift();
try {
await Zotero.Fulltext.indexItems([itemID], false, true);
await Zotero.FullText.indexItems([itemID], { ignoreErrors: true });
}
finally {
_indexing = false;
@ -982,8 +995,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
return true;
}
catch (e) {
Components.utils.reportError(e);
Zotero.debug(e, 1);
Zotero.logError(e);
return false;
};
});
@ -1458,7 +1470,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
/**
* @return {Promise}
*/
this.rebuildIndex = Zotero.Promise.coroutine(function* (unindexedOnly) {
this.rebuildIndex = async function (unindexedOnly) {
// Get all attachments other than web links
var sql = "SELECT itemID FROM itemAttachments WHERE linkMode!="
+ Zotero.Attachments.LINK_MODE_LINKED_URL;
@ -1468,20 +1480,30 @@ Zotero.Fulltext = Zotero.FullText = new function(){
+ "WHERE synced != ? AND (indexedChars IS NOT NULL OR indexedPages IS NOT NULL))";
params.push(this.SYNC_STATE_MISSING);
}
var items = yield Zotero.DB.columnQueryAsync(sql, params);
if (items) {
yield Zotero.DB.executeTransaction(function* () {
yield Zotero.DB.queryAsync(
"DELETE FROM fulltextItemWords WHERE itemID IN (" + sql + ")", params
);
yield Zotero.DB.queryAsync(
"DELETE FROM fulltextItems WHERE itemID IN (" + sql + ")", params
);
});
yield this.indexItems(items, false, true);
var itemIDs = await Zotero.DB.columnQueryAsync(sql, params);
if (!itemIDs.length) {
Zotero.debug("No items to index");
return;
}
});
// If rebuilding from scratch, delete any processor cache files so they're not used.
// Otherwise, indexing unindexed items will force indexing of processor cache files
// without waiting for idle processing.
if (!unindexedOnly) {
for (let itemID of itemIDs) {
let item = await Zotero.Items.getAsync(itemID);
let cacheFile = this.getItemProcessorCacheFile(item).path;
try {
await OS.File.remove(cacheFile, { ignoreAbsent: true });
}
catch (e) {
Zotero.logError(e);
}
}
}
await this.indexItems(itemIDs, { ignoreErrors: true });
};
/**

View file

@ -1650,7 +1650,7 @@ var ZoteroPane = new function()
itemIDs.push(items[i].id);
}
yield Zotero.Fulltext.indexItems(itemIDs, true);
yield Zotero.FullText.indexItems(itemIDs, { complete: true });
yield document.getElementById('zotero-attachment-box').updateItemIndexedState();
});

View file

@ -189,8 +189,6 @@ describe("Zotero.Fulltext", function () {
var item = await importFileAttachment('test.pdf');
var processorCacheFile = Zotero.Fulltext.getItemProcessorCacheFile(item).path;
var itemCacheFile = Zotero.Fulltext.getItemCacheFile(item).path;
await Zotero.File.putContentsAsync(itemCacheFile, "Test");
var version = 5;
await Zotero.Fulltext.setItemContent(
@ -249,4 +247,125 @@ describe("Zotero.Fulltext", function () {
assert.isFalse(await OS.File.exists(processorCacheFile));
});
});
/**
 * Tests for Zotero.FullText.rebuildIndex(unindexedOnly).
 *
 * Covers three cases:
 *  - unindexedOnly mode consumes queued full-text content (the processor cache file
 *    written by setItemContent()) instead of reindexing the local file
 *  - a full rebuild discards that queued content and indexes the local file
 *  - a full rebuild leaves alone an already-indexed item whose page stats are NULL
 */
describe("#rebuildIndex()", function () {
	afterEach(() => {
		// Re-enable PDF indexing
		Zotero.Prefs.clear('fulltext.pdfMaxPages');
	});
	
	it("should process queued full-text content in unindexedOnly mode", async function () {
		// Import with pdfMaxPages=0 so the attachment presumably isn't indexed on import,
		// then restore the default before rebuilding
		Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
		var item = await importFileAttachment('test.pdf');
		Zotero.Prefs.clear('fulltext.pdfMaxPages');
		
		// Queue content for the item; the stats here (4 pages) differ from what the
		// second test gets from indexing the local file (1 page), so the assertions
		// below can tell which source was used
		var version = 5;
		await Zotero.FullText.setItemContent(
			item.libraryID,
			item.key,
			{
				content: "Test",
				indexedPages: 4,
				totalPages: 4
			},
			version
		);
		
		var processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
		var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
		
		// Precondition: the queued content is on disk
		assert.isTrue(await OS.File.exists(processorCacheFile));
		
		await Zotero.FullText.rebuildIndex(true);
		
		// .zotero-ft-unprocessed should have been deleted
		assert.isFalse(await OS.File.exists(processorCacheFile));
		// .zotero-ft-cache should now exist
		assert.isTrue(await OS.File.exists(itemCacheFile));
		
		// Version, sync state and page stats should all come from the queued content
		assert.equal(await Zotero.FullText.getItemVersion(item.id), version);
		assert.equal(
			await Zotero.DB.valueQueryAsync("SELECT synced FROM fulltextItems WHERE itemID=?", item.id),
			Zotero.FullText.SYNC_STATE_IN_SYNC
		);
		var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
		assert.equal(indexedPages, 4);
		assert.equal(total, 4);
	});
	
	it("should ignore queued full-text content in non-unindexedOnly mode", async function () {
		// Same setup as the unindexedOnly test: unindexed attachment plus queued content
		Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
		var item = await importFileAttachment('test.pdf');
		Zotero.Prefs.clear('fulltext.pdfMaxPages');
		
		var version = 5;
		await Zotero.FullText.setItemContent(
			item.libraryID,
			item.key,
			{
				content: "Test",
				indexedPages: 4,
				totalPages: 4
			},
			version
		);
		
		var processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
		var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
		
		// Precondition: the queued content is on disk
		assert.isTrue(await OS.File.exists(processorCacheFile));
		
		// No argument: full rebuild
		await Zotero.FullText.rebuildIndex();
		
		// .zotero-ft-unprocessed should have been deleted
		assert.isFalse(await OS.File.exists(processorCacheFile));
		// .zotero-ft-cache should now exist
		assert.isTrue(await OS.File.exists(itemCacheFile));
		
		// Processor cache file shouldn't have been used, and full text should be marked for
		// syncing
		assert.equal(await Zotero.FullText.getItemVersion(item.id), 0);
		assert.equal(
			await Zotero.DB.valueQueryAsync(
				"SELECT synced FROM fulltextItems WHERE itemID=?",
				item.id
			),
			Zotero.FullText.SYNC_STATE_UNSYNCED
		);
		// Stats should reflect the local file rather than the queued 4-page content
		var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
		assert.equal(indexedPages, 1);
		assert.equal(total, 1);
	});
	
	// This shouldn't happen, but before 5.0.85 items reindexed elsewhere could clear local stats
	it("shouldn't clear indexed items with missing file and no stats", async function () {
		// Import with a one-page limit so the attachment is indexed normally
		Zotero.Prefs.set('fulltext.pdfMaxPages', 1);
		var item = await importFileAttachment('test.pdf');
		Zotero.Prefs.clear('fulltext.pdfMaxPages');
		
		var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
		assert.isTrue(await OS.File.exists(itemCacheFile));
		
		var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
		assert.equal(indexedPages, 1);
		assert.equal(total, 1);
		
		// Simulate the buggy state: fulltextItems row present but page stats wiped
		await Zotero.DB.queryAsync(
			"UPDATE fulltextItems SET indexedPages=NULL, totalPages=NULL WHERE itemID=?",
			item.id
		);
		
		await Zotero.FullText.rebuildIndex();
		
		// .zotero-ft-cache should still exist
		assert.isTrue(await OS.File.exists(itemCacheFile));
		
		// The fulltextItems row should still be there after the rebuild
		assert.equal(
			await Zotero.DB.valueQueryAsync(
				"SELECT COUNT(*) FROM fulltextItems WHERE itemID=?",
				item.id
			),
			1
		);
	});
});
})