a497f501e9
Strip '\f' at end of extracted text and ignore '\n' in the middle of text from the test PDF
373 lines
12 KiB
JavaScript
373 lines
12 KiB
JavaScript
describe("Zotero.FullText", function () {
|
|
// Browser window opened before the tests in this suite run and closed afterwards
let win;

// A hidden browser, which requires a browser window, is needed for charset detection
// (until we figure out a better way)
before(async function () {
	win = await loadBrowserWindow();
});

after(function () {
	// Nothing to close if the window was never opened
	if (!win) {
		return;
	}
	win.close();
});
|
|
|
|
describe("Indexing", function () {
|
|
// Prefs that individual indexing tests override
const indexingLimitPrefs = ['fulltext.textMaxLength', 'fulltext.pdfMaxPages'];

// Clear both limit prefs, in the order listed above
const clearIndexingLimitPrefs = () => {
	indexingLimitPrefs.forEach(pref => Zotero.Prefs.clear(pref));
};

// Clear the prefs before every test...
beforeEach(function () {
	clearIndexingLimitPrefs();
});

// ...and once more when the whole group has finished
after(function () {
	clearIndexingLimitPrefs();
});
|
|
|
|
describe("#indexItems()", function () {
	// Import the named file with importFileAttachment() and resolve to the index state
	// that Zotero.Fulltext.getIndexedState() reports for the resulting attachment
	const importAndGetIndexedState = async function (filename) {
		const attachment = await importFileAttachment(filename);
		return Zotero.Fulltext.getIndexedState(attachment);
	};

	it("should index a text file by default", async function () {
		const state = await importAndGetIndexedState('test.txt');
		assert.equal(state, Zotero.Fulltext.INDEX_STATE_INDEXED);
	});

	it("should skip indexing of a text file if fulltext.textMaxLength is 0", async function () {
		Zotero.Prefs.set('fulltext.textMaxLength', 0);
		const state = await importAndGetIndexedState('test.txt');
		assert.equal(state, Zotero.Fulltext.INDEX_STATE_UNINDEXED);
	});

	it("should index a PDF by default", async function () {
		const state = await importAndGetIndexedState('test.pdf');
		assert.equal(state, Zotero.Fulltext.INDEX_STATE_INDEXED);
	});

	it("should skip indexing of a PDF if fulltext.textMaxLength is 0", async function () {
		Zotero.Prefs.set('fulltext.textMaxLength', 0);
		const state = await importAndGetIndexedState('test.pdf');
		assert.equal(state, Zotero.Fulltext.INDEX_STATE_UNINDEXED);
	});

	it("should skip indexing of a PDF if fulltext.pdfMaxPages is 0", async function () {
		Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
		const state = await importAndGetIndexedState('test.pdf');
		assert.equal(state, Zotero.Fulltext.INDEX_STATE_UNINDEXED);
	});
});
|
|
|
|
describe("#indexPDF()", function () {
	it("should create cache files for linked attachments in storage directory", async function () {
		const filename = 'test.pdf';

		// Copy the test PDF into a temp directory and attach it from there as a linked file
		const sourcePath = OS.Path.join(getTestDataDirectory().path, filename);
		const tempDir = await getTempDirectory();
		const linkedPath = OS.Path.join(tempDir, filename);
		await OS.File.copy(sourcePath, linkedPath);

		const item = await Zotero.Attachments.linkFromFile({ file: linkedPath });
		const storageDir = Zotero.Attachments.getStorageDirectory(item).path;
		const existsInStorageDir = name => OS.File.exists(OS.Path.join(storageDir, name));

		assert.isTrue(await OS.File.exists(storageDir));
		// The storage directory should hold the full-text cache file but not the PDF itself
		assert.isTrue(await existsInStorageDir('.zotero-ft-cache'));
		assert.isFalse(await existsInStorageDir(filename));
	});
});
|
|
});
|
|
|
|
describe("#getUnsyncedContent()", function () {
|
|
it("should get content that hasn't been uploaded", async function () {
	// Entries we expect getUnsyncedContent() to return, with their expected stats
	const expected = [];
	const group = await getGroup();

	// Create a plain-text attachment under a new parent item and write ten random strings
	// to its file. Unless `skip` is set, record it as expected unsynced content.
	const add = async function ({ libraryID, synced, skip } = {}) {
		const parent = await createDataObject('item', { libraryID });
		const attachment = new Zotero.Item('attachment');
		if (libraryID) {
			attachment.libraryID = libraryID;
		}
		attachment.parentItemID = parent.id;
		attachment.attachmentLinkMode = 'imported_file';
		attachment.attachmentContentType = 'text/plain';
		attachment.attachmentCharset = 'utf-8';
		attachment.attachmentFilename = 'test.txt';
		if (synced) {
			attachment.synced = true;
		}
		await attachment.saveTx();
		await Zotero.Attachments.createDirectoryForItem(attachment);

		const path = attachment.getFilePath();
		const content = Array.from({ length: 10 }, () => Zotero.Utilities.randomString()).join(" ");
		await Zotero.File.putContentsAsync(path, content);

		if (skip) {
			return;
		}
		expected.push({
			item: attachment,
			content,
			indexedChars: content.length,
			indexedPages: 0
		});
	};

	await add({ synced: true });
	await add({ synced: true });
	// An unsynced attachment shouldn't be uploaded
	await add({ skip: true });
	// An attachment in another library shouldn't be uploaded
	await add({ libraryID: group.libraryID, synced: true, skip: true });

	// PDF attachment
	const pdfAttachment = await importFileAttachment('test.pdf');
	pdfAttachment.synced = true;
	await pdfAttachment.saveTx();
	expected.push({
		item: pdfAttachment,
		// pdf-worker handles whitespace differently than pdftotext, hence the "\n"
		// after "share" in the expected text
		content: "Zotero [zoh-TAIR-oh] is a free, easy-to-use tool to help you collect, "
			+ "organize, cite, and share\nyour research sources.\n\n",
		indexedChars: 0,
		indexedPages: 1
	});

	await Zotero.Fulltext.indexItems(expected.map(({ item }) => item.id));

	const data = await Zotero.FullText.getUnsyncedContent(Zotero.Libraries.userLibraryID);
	assert.lengthOf(data, 3);

	// Results can come back in any order, so match each one to its expectation by content
	const contents = expected.map(entry => entry.content);
	for (const result of data) {
		assert.include(contents, result.content);
		const match = expected[contents.indexOf(result.content)];
		assert.equal(result.indexedChars, match.indexedChars);
		assert.equal(result.indexedPages, match.indexedPages);
	}
});
|
|
|
|
it("should mark PDF attachment content as missing if cache file doesn't exist", async function () {
	const item = await importFileAttachment('test.pdf');
	item.synced = true;
	await item.saveTx();

	// Index the PDF, then delete the cache file that indexing produced
	await Zotero.Fulltext.indexItems([item.id]);
	await OS.File.remove(Zotero.Fulltext.getItemCacheFile(item).path);

	// Read the item's current sync state straight from the database
	const getSyncState = () => Zotero.DB.valueQueryAsync(
		"SELECT synced FROM fulltextItems WHERE itemID=?",
		item.id
	);

	// Before unsynced content is requested, the row is still unsynced and indexed
	assert.equal(await getSyncState(), Zotero.Fulltext.SYNC_STATE_UNSYNCED);
	assert.equal(
		await Zotero.Fulltext.getIndexedState(item),
		Zotero.Fulltext.INDEX_STATE_INDEXED
	);

	await Zotero.Fulltext.getUnsyncedContent(item.libraryID);

	// Afterwards it should be flagged as missing and reported as unindexed
	assert.equal(await getSyncState(), Zotero.Fulltext.SYNC_STATE_MISSING);
	assert.equal(
		await Zotero.Fulltext.getIndexedState(item),
		Zotero.Fulltext.INDEX_STATE_UNINDEXED
	);
});
|
|
})
|
|
|
|
describe("#setItemContent()", function () {
|
|
// PDF indexing is disabled while this group runs...
before(function () {
	Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
});

// ...and re-enabled once the group is done
after(function () {
	Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
|
|
|
|
it("should store data in .zotero-ft-unprocessed file", async function () {
	const item = await importFileAttachment('test.pdf');
	const processorCacheFile = Zotero.Fulltext.getItemProcessorCacheFile(item).path;

	const version = 5;
	const data = {
		content: "Test",
		indexedPages: 4,
		totalPages: 4
	};
	await Zotero.Fulltext.setItemContent(item.libraryID, item.key, data, version);

	// The local version should still be 0, the row should be flagged for processing,
	// and the processor cache file should exist
	const localVersion = await Zotero.Fulltext.getItemVersion(item.id);
	assert.equal(localVersion, 0);
	const syncState = await Zotero.DB.valueQueryAsync(
		"SELECT synced FROM fulltextItems WHERE itemID=?",
		item.id
	);
	assert.equal(syncState, Zotero.FullText.SYNC_STATE_TO_PROCESS);
	assert.isTrue(await OS.File.exists(processorCacheFile));
});
|
|
|
|
|
|
it("should update the version if the local version is 0 but the text matches", async function () {
	const item = await importFileAttachment('test.pdf');

	// Give the item a local row at version 0, unsynced, with 4 of 4 pages indexed
	await Zotero.DB.queryAsync(
		"REPLACE INTO fulltextItems (itemID, version, indexedPages, totalPages, synced) "
			+ "VALUES (?, 0, 4, 4, ?)",
		[item.id, Zotero.FullText.SYNC_STATE_UNSYNCED]
	);

	const processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
	const itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
	// Write the same text to the local cache file that is passed to setItemContent() below
	await Zotero.File.putContentsAsync(itemCacheFile, "Test");

	const version = 5;
	const data = {
		content: "Test",
		indexedPages: 4,
		totalPages: 4
	};
	await Zotero.FullText.setItemContent(item.libraryID, item.key, data, version);

	const localVersion = await Zotero.FullText.getItemVersion(item.id);
	assert.equal(localVersion, version);
	const syncState = await Zotero.DB.valueQueryAsync(
		"SELECT synced FROM fulltextItems WHERE itemID=?",
		item.id
	);
	assert.equal(syncState, Zotero.FullText.SYNC_STATE_IN_SYNC);
	const { indexedPages, total } = await Zotero.FullText.getPages(item.id);
	assert.equal(indexedPages, 4);
	assert.equal(total, 4);
	// No processor cache file should be left behind
	assert.isFalse(await OS.File.exists(processorCacheFile));
});
|
|
});
|
|
|
|
describe("#rebuildIndex()", function () {
|
|
// Clear the PDF page limit after every test so that PDF indexing is re-enabled
afterEach(function () {
	Zotero.Prefs.clear('fulltext.pdfMaxPages');
});
|
|
|
|
it("should process queued full-text content in indexedOnly mode", async function () {
	// Import the PDF with the page limit at 0, then lift the limit again
	Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
	const item = await importFileAttachment('test.pdf');
	Zotero.Prefs.clear('fulltext.pdfMaxPages');

	const version = 5;
	const data = {
		content: "Test",
		indexedPages: 4,
		totalPages: 4
	};
	await Zotero.FullText.setItemContent(item.libraryID, item.key, data, version);

	const processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
	const itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;

	assert.isTrue(await OS.File.exists(processorCacheFile));

	await Zotero.FullText.rebuildIndex(true);

	// .zotero-ft-unprocessed should have been deleted
	assert.isFalse(await OS.File.exists(processorCacheFile));
	// .zotero-ft-cache should now exist
	assert.isTrue(await OS.File.exists(itemCacheFile));

	// The version and page counts passed to setItemContent() should now be in effect
	const localVersion = await Zotero.FullText.getItemVersion(item.id);
	assert.equal(localVersion, version);
	const syncState = await Zotero.DB.valueQueryAsync(
		"SELECT synced FROM fulltextItems WHERE itemID=?",
		item.id
	);
	assert.equal(syncState, Zotero.FullText.SYNC_STATE_IN_SYNC);
	const { indexedPages, total } = await Zotero.FullText.getPages(item.id);
	assert.equal(indexedPages, 4);
	assert.equal(total, 4);
});
|
|
|
|
it("should ignore queued full-text content in non-indexedOnly mode", async function () {
	// Import the PDF with the page limit at 0, then lift the limit again
	Zotero.Prefs.set('fulltext.pdfMaxPages', 0);
	const item = await importFileAttachment('test.pdf');
	Zotero.Prefs.clear('fulltext.pdfMaxPages');

	const version = 5;
	const data = {
		content: "Test",
		indexedPages: 4,
		totalPages: 4
	};
	await Zotero.FullText.setItemContent(item.libraryID, item.key, data, version);

	const processorCacheFile = Zotero.FullText.getItemProcessorCacheFile(item).path;
	const itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;

	assert.isTrue(await OS.File.exists(processorCacheFile));

	await Zotero.FullText.rebuildIndex();

	// .zotero-ft-unprocessed should have been deleted
	assert.isFalse(await OS.File.exists(processorCacheFile));
	// .zotero-ft-cache should now exist
	assert.isTrue(await OS.File.exists(itemCacheFile));

	// Processor cache file shouldn't have been used, and full text should be marked for
	// syncing
	const localVersion = await Zotero.FullText.getItemVersion(item.id);
	assert.equal(localVersion, 0);
	const syncState = await Zotero.DB.valueQueryAsync(
		"SELECT synced FROM fulltextItems WHERE itemID=?",
		item.id
	);
	assert.equal(syncState, Zotero.FullText.SYNC_STATE_UNSYNCED);
	const { indexedPages, total } = await Zotero.FullText.getPages(item.id);
	assert.equal(indexedPages, 1);
	assert.equal(total, 1);
});
|
|
|
|
// This shouldn't happen, but before 5.0.85 items reindexed elsewhere could clear local stats
it("shouldn't clear indexed items with missing file and no stats", async function () {
	// Import the PDF with the page limit set to 1, then remove the limit again
	Zotero.Prefs.set('fulltext.pdfMaxPages', 1);
	const item = await importFileAttachment('test.pdf');
	Zotero.Prefs.clear('fulltext.pdfMaxPages');

	const itemCacheFile = Zotero.FullText.getItemCacheFile(item).path;
	assert.isTrue(await OS.File.exists(itemCacheFile));

	const { indexedPages, total } = await Zotero.FullText.getPages(item.id);
	assert.equal(indexedPages, 1);
	assert.equal(total, 1);

	// Null out the page stats in the item's full-text row
	await Zotero.DB.queryAsync(
		"UPDATE fulltextItems SET indexedPages=NULL, totalPages=NULL WHERE itemID=?",
		item.id
	);

	await Zotero.FullText.rebuildIndex();

	// .zotero-ft-cache should still exist
	assert.isTrue(await OS.File.exists(itemCacheFile));

	// The item's row should still be present in fulltextItems
	const rowCount = await Zotero.DB.valueQueryAsync(
		"SELECT COUNT(*) FROM fulltextItems WHERE itemID=?",
		item.id
	);
	assert.equal(rowCount, 1);
});
|
|
});
|
|
})
|