zotero/test/tests/fulltextTest.js
Dan Stillman 76a1535a60 Full-text indexing improvements
- Use full-text cache file from syncing if available when reindexing via
  info pane or Rebuild Index → Index Unindexed Items. Only discard it for
  full index rebuild. This allows Index Unindexed Items to be used to
  force immediate processing of queued content from syncing and avoids
  unnecessary syncing back of identical content. Previously, the cache
  file was used for a manual index only when the local file didn't exist.
- When rebuilding index, don't clear indexed items with missing local
  file that are missing stats due to a pre-411180ef bug.
- indexItems() now takes an 'options' object as its second parameter
- Minor code cleanup
2020-03-09 01:19:52 -04:00

371 lines
12 KiB
JavaScript

describe("Zotero.Fulltext", function () {
var win;
before(function* () {
// Hidden browser, which requires a browser window, needed for charset detection
// (until we figure out a better way)
win = yield loadBrowserWindow();
});
after(function () {
if (win) {
win.close();
}
});
describe("Indexing", function () {
beforeEach(function () {
Zotero.Prefs.clear('fulltext.textMaxLength');
Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
after(function () {
Zotero.Prefs.clear('fulltext.textMaxLength');
Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
describe("#indexItems()", function () {
it("should index a text file by default", function* () {
var item = yield importFileAttachment('test.txt');
assert.equal(
(yield Zotero.Fulltext.getIndexedState(item)),
Zotero.Fulltext.INDEX_STATE_INDEXED
);
})
it("should skip indexing of a text file if fulltext.textMaxLength is 0", function* () {
Zotero.Prefs.set('fulltext.textMaxLength', 0);
var item = yield importFileAttachment('test.txt');
assert.equal(
(yield Zotero.Fulltext.getIndexedState(item)),
Zotero.Fulltext.INDEX_STATE_UNINDEXED
);
})
it("should index a PDF by default", function* () {
var item = yield importFileAttachment('test.pdf');
assert.equal(
(yield Zotero.Fulltext.getIndexedState(item)),
Zotero.Fulltext.INDEX_STATE_INDEXED
);
})
it("should skip indexing of a PDF if fulltext.textMaxLength is 0", function* () {
Zotero.Prefs.set('fulltext.textMaxLength', 0);
var item = yield importFileAttachment('test.pdf');
assert.equal(
(yield Zotero.Fulltext.getIndexedState(item)),
Zotero.Fulltext.INDEX_STATE_UNINDEXED
);
})
it("should skip indexing of a PDF if fulltext.pdfMaxPages is 0", function* () {
Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
var item = yield importFileAttachment('test.pdf');
assert.equal(
(yield Zotero.Fulltext.getIndexedState(item)),
Zotero.Fulltext.INDEX_STATE_UNINDEXED
);
})
});
describe("#indexPDF()", function () {
it("should create cache files for linked attachments in storage directory", function* () {
var filename = 'test.pdf';
var file = OS.Path.join(getTestDataDirectory().path, filename);
var tempDir = yield getTempDirectory();
var linkedFile = OS.Path.join(tempDir, filename);
yield OS.File.copy(file, linkedFile);
var item = yield Zotero.Attachments.linkFromFile({ file: linkedFile });
var storageDir = Zotero.Attachments.getStorageDirectory(item).path;
assert.isTrue(yield OS.File.exists(storageDir));
assert.isTrue(yield OS.File.exists(OS.Path.join(storageDir, '.zotero-ft-info')));
assert.isTrue(yield OS.File.exists(OS.Path.join(storageDir, '.zotero-ft-cache')));
assert.isFalse(yield OS.File.exists(OS.Path.join(storageDir, filename)));
});
});
});
describe("#getUnsyncedContent()", function () {
it("should get content that hasn't been uploaded", function* () {
var toSync = [];
var group = yield getGroup();
var add = Zotero.Promise.coroutine(function* (options = {}) {
let item = yield createDataObject('item', { libraryID: options.libraryID });
let attachment = new Zotero.Item('attachment');
if (options.libraryID) {
attachment.libraryID = options.libraryID;
}
attachment.parentItemID = item.id;
attachment.attachmentLinkMode = 'imported_file';
attachment.attachmentContentType = 'text/plain';
attachment.attachmentCharset = 'utf-8';
attachment.attachmentFilename = 'test.txt';
if (options.synced) {
attachment.synced = true;
}
yield attachment.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment);
let path = attachment.getFilePath();
let content = new Array(10).fill("").map(x => Zotero.Utilities.randomString()).join(" ");
yield Zotero.File.putContentsAsync(path, content);
if (!options.skip) {
toSync.push({
item: attachment,
content,
indexedChars: content.length,
indexedPages: 0
});
}
});
yield add({ synced: true });
yield add({ synced: true });
// Unsynced attachment shouldn't uploaded
yield add({ skip: true });
// Attachment in another library shouldn't be uploaded
yield add({ libraryID: group.libraryID, synced: true, skip: true });
// PDF attachment
var pdfAttachment = yield importFileAttachment('test.pdf');
pdfAttachment.synced = true;
yield pdfAttachment.saveTx();
toSync.push({
item: pdfAttachment,
content: "Zotero [zoh-TAIR-oh] is a free, easy-to-use tool to help you collect, "
+ "organize, cite, and share your research sources.\n\n",
indexedChars: 0,
indexedPages: 1
});
yield Zotero.Fulltext.indexItems(toSync.map(x => x.item.id));
var data = yield Zotero.FullText.getUnsyncedContent(Zotero.Libraries.userLibraryID);
assert.lengthOf(data, 3);
let contents = toSync.map(x => x.content);
for (let d of data) {
let pos = contents.indexOf(d.content);
assert.isAbove(pos, -1);
assert.equal(d.indexedChars, toSync[pos].indexedChars);
assert.equal(d.indexedPages, toSync[pos].indexedPages);
}
});
it("should mark PDF attachment content as missing if cache file doesn't exist", function* () {
var item = yield importFileAttachment('test.pdf');
item.synced = true;
yield item.saveTx();
yield Zotero.Fulltext.indexItems([item.id]);
yield OS.File.remove(Zotero.Fulltext.getItemCacheFile(item).path);
var sql = "SELECT synced FROM fulltextItems WHERE itemID=?";
var synced = yield Zotero.DB.valueQueryAsync(sql, item.id);
assert.equal(synced, Zotero.Fulltext.SYNC_STATE_UNSYNCED);
var indexed = yield Zotero.Fulltext.getIndexedState(item);
assert.equal(indexed, Zotero.Fulltext.INDEX_STATE_INDEXED);
yield Zotero.Fulltext.getUnsyncedContent(item.libraryID);
synced = yield Zotero.DB.valueQueryAsync(sql, item.id);
assert.equal(synced, Zotero.Fulltext.SYNC_STATE_MISSING);
indexed = yield Zotero.Fulltext.getIndexedState(item);
assert.equal(indexed, Zotero.Fulltext.INDEX_STATE_UNINDEXED);
});
})
describe("#setItemContent()", function () {
before(() => {
// Disable PDF indexing
Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
});
after(() => {
// Re-enable PDF indexing
Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
it("should store data in .zotero-ft-unprocessed file", async function () {
var item = await importFileAttachment('test.pdf');
var processorCacheFile = Zotero.Fulltext.getItemProcessorCacheFile(item).path;
var version = 5;
await Zotero.Fulltext.setItemContent(
item.libraryID,
item.key,
{
content: "Test",
indexedPages: 4,
totalPages: 4
},
version
);
assert.equal(await Zotero.Fulltext.getItemVersion(item.id), 0);
assert.equal(
await Zotero.DB.valueQueryAsync("SELECT synced FROM fulltextItems WHERE itemID=?", item.id),
Zotero.FullText.SYNC_STATE_TO_PROCESS
);
assert.isTrue(await OS.File.exists(processorCacheFile));
});
it("should update the version if the local version is 0 but the text matches", async function () {
var item = await importFileAttachment('test.pdf');
await Zotero.DB.queryAsync(
"REPLACE INTO fulltextItems (itemID, version, indexedPages, totalPages, synced) "
+ "VALUES (?, 0, 4, 4, ?)",
[item.id, Zotero.FullText.SYNC_STATE_UNSYNCED]
);
var processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
await Zotero.File.putContentsAsync(itemCacheFile, "Test");
var version = 5;
await Zotero.FullText.setItemContent(
item.libraryID,
item.key,
{
content: "Test",
indexedPages: 4,
totalPages: 4
},
version
);
assert.equal(await Zotero.FullText.getItemVersion(item.id), version);
assert.equal(
await Zotero.DB.valueQueryAsync("SELECT synced FROM fulltextItems WHERE itemID=?", item.id),
Zotero.FullText.SYNC_STATE_IN_SYNC
);
var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
assert.equal(indexedPages, 4);
assert.equal(total, 4);
assert.isFalse(await OS.File.exists(processorCacheFile));
});
});
describe("#rebuildIndex()", function () {
afterEach(() => {
// Re-enable PDF indexing
Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
it("should process queued full-text content in indexedOnly mode", async function () {
Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
var item = await importFileAttachment('test.pdf');
Zotero.Prefs.clear('fulltext.pdfMaxPages');
var version = 5;
await Zotero.FullText.setItemContent(
item.libraryID,
item.key,
{
content: "Test",
indexedPages: 4,
totalPages: 4
},
version
);
var processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
assert.isTrue(await OS.File.exists(processorCacheFile));
await Zotero.FullText.rebuildIndex(true);
// .zotero-ft-unprocessed should have been deleted
assert.isFalse(await OS.File.exists(processorCacheFile));
// .zotero-ft-cache should now exist
assert.isTrue(await OS.File.exists(itemCacheFile));
assert.equal(await Zotero.FullText.getItemVersion(item.id), version);
assert.equal(
await Zotero.DB.valueQueryAsync("SELECT synced FROM fulltextItems WHERE itemID=?", item.id),
Zotero.FullText.SYNC_STATE_IN_SYNC
);
var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
assert.equal(indexedPages, 4);
assert.equal(total, 4);
});
it("should ignore queued full-text content in non-indexedOnly mode", async function () {
Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
var item = await importFileAttachment('test.pdf');
Zotero.Prefs.clear('fulltext.pdfMaxPages');
var version = 5;
await Zotero.FullText.setItemContent(
item.libraryID,
item.key,
{
content: "Test",
indexedPages: 4,
totalPages: 4
},
version
);
var processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
assert.isTrue(await OS.File.exists(processorCacheFile));
await Zotero.FullText.rebuildIndex();
// .zotero-ft-unprocessed should have been deleted
assert.isFalse(await OS.File.exists(processorCacheFile));
// .zotero-ft-cache should now exist
assert.isTrue(await OS.File.exists(itemCacheFile));
// Processor cache file shouldn't have been used, and full text should be marked for
// syncing
assert.equal(await Zotero.FullText.getItemVersion(item.id), 0);
assert.equal(
await Zotero.DB.valueQueryAsync(
"SELECT synced FROM fulltextItems WHERE itemID=?",
item.id
),
Zotero.FullText.SYNC_STATE_UNSYNCED
);
var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
assert.equal(indexedPages, 1);
assert.equal(total, 1);
});
// This shouldn't happen, but before 5.0.85 items reindexed elsewhere could clear local stats
it("shouldn't clear indexed items with missing file and no stats", async function () {
Zotero.Prefs.set('fulltext.pdfMaxPages', 1);
var item = await importFileAttachment('test.pdf');
Zotero.Prefs.clear('fulltext.pdfMaxPages');
var itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
assert.isTrue(await OS.File.exists(itemCacheFile));
var { indexedPages, total } = await Zotero.FullText.getPages(item.id);
assert.equal(indexedPages, 1);
assert.equal(total, 1);
await Zotero.DB.queryAsync(
"UPDATE fulltextItems SET indexedPages=NULL, totalPages=NULL WHERE itemID=?",
item.id
);
await Zotero.FullText.rebuildIndex();
// .zotero-ft-cache should still exist
assert.isTrue(await OS.File.exists(itemCacheFile));
assert.equal(
await Zotero.DB.valueQueryAsync(
"SELECT COUNT(*) FROM fulltextItems WHERE itemID=?",
item.id
),
1
);
});
});
})