Use multi-item requests for full-text writes

This is necessary to get a library version after the write instead of an
item version. Otherwise, after a full-text write, the stored main library
version lags behind the server's, so the next sync checks all object types
for that library instead of getting a 304.

Full text is batched up to 500K characters or 10 items per request,
whichever limit is reached first.

This also switches to using ?format=versions for /fulltext requests,
which isn't currently necessary but reflects what the request is actually
doing.
This commit is contained in:
Dan Stillman 2016-05-02 13:13:19 -04:00
parent a0c7cf9bee
commit e0e744f9b1
6 changed files with 246 additions and 122 deletions

View file

@ -105,6 +105,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
this.getLibraryVersion = function (libraryID) {
if (!libraryID) throw new Error("libraryID not provided");
return Zotero.DB.valueQueryAsync(
"SELECT version FROM version WHERE schema=?", "fulltext_" + libraryID
)
@ -112,6 +113,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
this.setLibraryVersion = Zotero.Promise.coroutine(function* (libraryID, version) {
if (!libraryID) throw new Error("libraryID not provided");
yield Zotero.DB.queryAsync(
"REPLACE INTO version VALUES (?, ?)", ["fulltext_" + libraryID, version]
);
@ -130,12 +132,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
});
this.setItemSynced = Zotero.Promise.coroutine(function* (itemID, version) {
this.setItemSynced = function (itemID, version) {
return Zotero.DB.queryAsync(
"UPDATE fulltextItems SET synced=?, version=? WHERE itemID=?",
[SYNC_STATE_IN_SYNC, version, itemID]
);
});
};
// this is a port from http://mxr.mozilla.org/mozilla-central/source/intl/lwbrk/src/nsSampleWordBreaker.cpp to
@ -787,22 +789,25 @@ Zotero.Fulltext = Zotero.FullText = new function(){
* Get content and stats that haven't yet been synced
*
* @param {Integer} libraryID
* @param {Integer} numItems
* @param {Integer} [options]
* @param {Integer} [options.maxSize]
* @param {Integer} [options.maxItems]
* @param {Integer} [options.lastItemID] - Only return content for items above this id
* @return {Promise<Array<Object>>}
*/
this.getUnsyncedContent = Zotero.Promise.coroutine(function* (libraryID, numItems) {
var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
this.getUnsyncedContent = Zotero.Promise.coroutine(function* (libraryID, options = {}) {
var contentItems = [];
var sql = "SELECT itemID, indexedChars, totalChars, indexedPages, totalPages "
+ "FROM fulltextItems FI JOIN items I USING (itemID) WHERE libraryID=? AND "
+ "FI.synced=? AND I.synced=1 ORDER BY clientDateModified DESC";
+ "FI.synced=? AND I.synced=1 ";
var params = [libraryID, SYNC_STATE_UNSYNCED];
if (numItems) {
sql += " LIMIT ?";
params.push(numItems);
if (options.lastItemID) {
sql += "AND itemID>?";
params.push(options.lastItemID);
}
sql += "ORDER BY itemID DESC";
var rows = yield Zotero.DB.queryAsync(sql, params);
var contentSize = 0;
for (let i = 0; i < rows.length; i++) {
let row = rows[i];
let content;
@ -868,8 +873,13 @@ Zotero.Fulltext = Zotero.FullText = new function(){
continue;
}
// If this isn't the first item and it would put us over the size limit, stop
if (contentItems.length && options.maxSize && contentSize + content.length > options.maxSize) {
break;
}
contentItems.push({
libraryID: item.libraryID,
itemID: item.id,
key: item.key,
content,
indexedChars: row.indexedChars ? row.indexedChars : 0,
@ -877,6 +887,11 @@ Zotero.Fulltext = Zotero.FullText = new function(){
indexedPages: row.indexedPages ? row.indexedPages : 0,
totalPages: row.totalPages ? row.totalPages : 0
});
if (options.maxItems && contentItems.length >= options.maxItems) {
break;
}
contentSize += content.length;
}
return contentItems;
});

View file

@ -376,7 +376,8 @@ Zotero.Sync.APIClient.prototype = {
var params = {
libraryType: libraryType,
libraryTypeID: libraryTypeID,
target: "fulltext"
target: "fulltext",
format: "versions"
};
if (since) {
params.since = since;
@ -415,26 +416,31 @@ Zotero.Sync.APIClient.prototype = {
}),
setFullTextForItem: Zotero.Promise.coroutine(function* (libraryType, libraryTypeID, itemKey, data) {
setFullTextForItems: Zotero.Promise.coroutine(function* (libraryType, libraryTypeID, libraryVersion, data) {
var params = {
libraryType: libraryType,
libraryTypeID: libraryTypeID,
target: `items/${itemKey}/fulltext`
target: "fulltext"
};
var uri = this.buildRequestURI(params);
var xmlhttp = yield this.makeRequest(
"PUT",
"POST",
uri,
{
headers: {
"Content-Type": "application/json"
"Content-Type": "application/json",
"If-Unmodified-Since-Version": libraryVersion
},
body: JSON.stringify(data),
successCodes: [204],
successCodes: [200, 412],
debug: true
}
);
return this._getLastModifiedVersion(xmlhttp);
this._check412(xmlhttp);
return {
libraryVersion: this._getLastModifiedVersion(xmlhttp),
results: this._parseJSON(xmlhttp.responseText)
};
}),
@ -727,5 +733,6 @@ Zotero.Sync.APIClient.prototype = {
if (!libraryVersion) {
throw new Error("Last-Modified-Version not provided");
}
return libraryVersion;
}
}

View file

@ -35,6 +35,9 @@ Zotero.Sync.Data.FullTextEngine = function (options) {
throw new Error("options.libraryID not set");
}
this.MAX_BATCH_SIZE = 500000;
this.MAX_BATCH_ITEMS = 10;
this.apiClient = options.apiClient;
this.libraryID = options.libraryID;
this.library = Zotero.Libraries.get(options.libraryID);
@ -111,33 +114,71 @@ Zotero.Sync.Data.FullTextEngine.prototype._upload = Zotero.Promise.coroutine(fun
Zotero.debug("Uploading full-text content for " + this.library.name);
var props = ['content', 'indexedChars', 'totalChars', 'indexedPages', 'totalPages'];
var libraryVersion = this.library.libraryVersion;
var props = ['key', 'content', 'indexedChars', 'totalChars', 'indexedPages', 'totalPages'];
let lastItemID = 0;
while (true) {
let numSuccessful = 0;
let objs = yield Zotero.FullText.getUnsyncedContent(this.libraryID, 10);
let objs = yield Zotero.FullText.getUnsyncedContent(this.libraryID, {
maxSize: this.MAX_BATCH_SIZE,
maxItems: this.MAX_BATCH_ITEMS,
lastItemID
});
if (!objs.length) {
break;
}
let promises = [];
let jsonArray = [];
let results;
for (let obj of objs) {
let json = {};
for (let prop of props) {
json[prop] = obj[prop];
}
promises.push(this.apiClient.setFullTextForItem(
this.library.libraryType, this.library.libraryTypeID, obj.key, json
));
jsonArray.push(json);
lastItemID = obj.itemID;
}
var results = yield Zotero.Promise.all(promises);
({ libraryVersion, results } = yield this.apiClient.setFullTextForItems(
this.library.libraryType,
this.library.libraryTypeID,
libraryVersion,
jsonArray
));
yield Zotero.DB.executeTransaction(function* () {
for (let i = 0; i < results.length; i++) {
let itemID = yield Zotero.Items.getIDFromLibraryAndKey(
this.libraryID, objs[i].key
);
yield Zotero.FullText.setItemSynced(itemID, results[i]);
for (let state of ['successful', 'unchanged']) {
for (let index in results[state]) {
let key = results[state][index].key;
let itemID = Zotero.Items.getIDFromLibraryAndKey(this.libraryID, key);
yield Zotero.FullText.setItemSynced(itemID, libraryVersion);
}
}
// Set both the library version and the full-text library version. The latter is necessary
// because full-text sync can be turned off at any time, so we have to keep track of the
// last version we've seen for full-text in case the main library version has advanced since.
yield Zotero.FullText.setLibraryVersion(this.libraryID, libraryVersion);
this.library.libraryVersion = libraryVersion;
yield this.library.save();
}.bind(this));
for (let index in results.failed) {
let { code, message, data } = results.failed[index];
let e = new Error(message);
e.name = "ZoteroObjectUploadError";
e.code = code;
if (data) {
e.data = data;
}
Zotero.logError("Error uploading full text for item " + jsonArray[index].key + " in "
+ this.library.name + ":\n\n" + e);
if (this.onError) {
this.onError(e);
}
if (this.stopOnError) {
throw new Error(e);
}
}
}
});

View file

@ -184,27 +184,32 @@ Zotero.Sync.Runner_Module = function (options = {}) {
// Sync data and files, and then repeat if necessary
let attempt = 1;
let nextLibraries = librariesToSync.concat();
let resyncLibraries = [];
while (nextLibraries.length) {
let successfulLibraries = new Set(librariesToSync);
while (librariesToSync.length) {
if (attempt > 3) {
// TODO: Back off and/or nicer error
throw new Error("Too many sync attempts -- stopping");
}
nextLibraries = yield _doDataSync(
resyncLibraries.length ? resyncLibraries : nextLibraries,
engineOptions
);
resyncLibraries = yield _doFileSync(nextLibraries, engineOptions);
if (!resyncLibraries.length) {
break;
}
attempt++;
}
let nextLibraries = yield _doDataSync(librariesToSync, engineOptions);
// Remove failed libraries from the successful set
Zotero.Utilities.arrayDiff(librariesToSync, nextLibraries).forEach(libraryID => {
successfulLibraries.delete(libraryID);
});
// Sync full-text content in libraries with successful data sync. Full-text syncing
// still happens for libraries with failed file syncs.
if (nextLibraries.length) {
yield _doFullTextSync(nextLibraries, engineOptions);
// Run file sync on all libraries that passed the last data sync
librariesToSync = yield _doFileSync(nextLibraries, engineOptions);
if (librariesToSync.length) {
attempt++;
continue;
}
// Run full-text sync on all libraries that haven't failed a data sync
librariesToSync = yield _doFullTextSync([...successfulLibraries], engineOptions);
if (librariesToSync.length) {
attempt++;
continue;
}
break;
}
}
catch (e) {
@ -570,15 +575,22 @@ Zotero.Sync.Runner_Module = function (options = {}) {
}
}
Zotero.debug("Done with file syncing");
if (resyncLibraries.length) {
Zotero.debug("Libraries to resync: " + resyncLibraries.join(", "));
}
return resyncLibraries;
}.bind(this));
/**
* @return {Integer[]} - Array of libraries that need data syncing again
*/
var _doFullTextSync = Zotero.Promise.coroutine(function* (libraries, options) {
if (!Zotero.Prefs.get("sync.fulltext.enabled")) return;
Zotero.debug("Starting full-text syncing");
this.setSyncStatus(Zotero.getString('sync.status.syncingFullText'));
var resyncLibraries = [];
for (let libraryID of libraries) {
try {
let opts = {};
@ -589,6 +601,10 @@ Zotero.Sync.Runner_Module = function (options = {}) {
yield engine.start();
}
catch (e) {
if (e instanceof Zotero.HTTP.UnexpectedStatusException && e.status == 412) {
resyncLibraries.push(libraryID);
continue;
}
Zotero.debug("Full-text sync failed for library " + libraryID);
Zotero.logError(e);
this.checkError(e);
@ -600,6 +616,10 @@ Zotero.Sync.Runner_Module = function (options = {}) {
}
}
Zotero.debug("Done with full-text syncing");
if (resyncLibraries.length) {
Zotero.debug("Libraries to resync: " + resyncLibraries.join(", "));
}
return resyncLibraries;
}.bind(this));

View file

@ -74,13 +74,13 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
var spy = sinon.spy(Zotero.Fulltext, "startContentProcessor")
var itemFullTextVersion = 10;
var libraryFullTextVersion = 15;
var libraryVersion = 15;
setResponse({
method: "GET",
url: "users/1/fulltext",
url: "users/1/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": libraryFullTextVersion
"Last-Modified-Version": libraryVersion
},
json: {
[attachment.key]: itemFullTextVersion
@ -111,7 +111,7 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
assert.propertyVal(data, 'version', itemFullTextVersion);
yield assert.eventually.equal(
Zotero.FullText.getLibraryVersion(item.libraryID),
libraryFullTextVersion
libraryVersion
);
sinon.assert.calledOnce(spy);
@ -134,14 +134,14 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
spy = sinon.spy(Zotero.Fulltext, "startContentProcessor")
itemFullTextVersion = 17;
var lastLibraryFullTextVersion = libraryFullTextVersion;
libraryFullTextVersion = 20;
var lastLibraryVersion = libraryVersion;
libraryVersion = 20;
setResponse({
method: "GET",
url: "users/1/fulltext?since=" + lastLibraryFullTextVersion,
url: "users/1/fulltext?format=versions&since=" + lastLibraryVersion,
status: 200,
headers: {
"Last-Modified-Version": libraryFullTextVersion
"Last-Modified-Version": libraryVersion
},
json: {
[attachment.key]: itemFullTextVersion
@ -172,7 +172,7 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
assert.propertyVal(data, 'version', itemFullTextVersion);
yield assert.eventually.equal(
Zotero.FullText.getLibraryVersion(item.libraryID),
libraryFullTextVersion
libraryVersion
);
sinon.assert.calledOnce(spy);
@ -191,13 +191,13 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
yield attachment.saveTx();
var itemFullTextVersion = 10;
var libraryFullTextVersion = 15;
var libraryVersion = 15;
setResponse({
method: "GET",
url: "users/1/fulltext",
url: "users/1/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": libraryFullTextVersion
"Last-Modified-Version": libraryVersion
},
json: {
[attachment.key]: itemFullTextVersion
@ -225,29 +225,41 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
({ engine, client, caller } = yield setup());
var item = yield createDataObject('item');
var attachment = new Zotero.Item('attachment');
attachment.parentItemID = item.id;
attachment.attachmentLinkMode = 'imported_file';
attachment.attachmentContentType = 'text/html';
attachment.attachmentFilename = 'test.html';
attachment.attachmentCharset = 'utf-8';
attachment.synced = true;
yield attachment.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment);
var path = attachment.getFilePath();
var content = generateContent()
var htmlContent = "<html><body>" + content + "</body></html>";
yield Zotero.File.putContentsAsync(path, content);
yield Zotero.Fulltext.indexItems([attachment.id]);
var attachment1 = new Zotero.Item('attachment');
attachment1.parentItemID = item.id;
attachment1.attachmentLinkMode = 'imported_file';
attachment1.attachmentContentType = 'text/html';
attachment1.attachmentFilename = 'test.html';
attachment1.attachmentCharset = 'utf-8';
attachment1.synced = true;
yield attachment1.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment1);
var path = attachment1.getFilePath();
var content1 = "A" + generateContent()
yield Zotero.File.putContentsAsync(path, content1);
var attachment2 = new Zotero.Item('attachment');
attachment2.parentItemID = item.id;
attachment2.attachmentLinkMode = 'imported_file';
attachment2.attachmentContentType = 'text/html';
attachment2.attachmentFilename = 'test.html';
attachment2.attachmentCharset = 'utf-8';
attachment2.synced = true;
yield attachment2.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment2);
path = attachment2.getFilePath();
var content2 = "B" + generateContent()
yield Zotero.File.putContentsAsync(path, content2);
yield Zotero.Fulltext.indexItems([attachment1.id, attachment2.id]);
var libraryVersion = 15;
var previousLibraryVersion = libraryVersion;
var count = 1;
setResponse({
method: "GET",
url: "users/1/fulltext",
url: "users/1/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": libraryVersion
@ -255,8 +267,8 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
json: {}
});
server.respond(function (req) {
if (req.method == "PUT") {
if (req.url == `${baseURL}users/1/items/${attachment.key}/fulltext`) {
if (req.method == "POST") {
if (req.url == `${baseURL}users/1/fulltext`) {
assert.propertyVal(
req.requestHeaders,
'Content-Type',
@ -264,19 +276,40 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
);
let json = JSON.parse(req.requestBody);
assert.propertyVal(json, 'content', content);
assert.propertyVal(json, 'indexedChars', content.length);
assert.propertyVal(json, 'totalChars', content.length);
assert.propertyVal(json, 'indexedPages', 0);
assert.propertyVal(json, 'totalPages', 0);
assert.lengthOf(json, 2);
json.sort((a, b) => a.content < b.content ? -1 : 1);
assert.propertyVal(json[0], 'key', attachment1.key);
assert.propertyVal(json[0], 'content', content1);
assert.propertyVal(json[0], 'indexedChars', content1.length);
assert.propertyVal(json[0], 'totalChars', content1.length);
assert.propertyVal(json[0], 'indexedPages', 0);
assert.propertyVal(json[0], 'totalPages', 0);
assert.propertyVal(json[1], 'key', attachment2.key);
assert.propertyVal(json[1], 'content', content2);
assert.propertyVal(json[1], 'indexedChars', content2.length);
assert.propertyVal(json[1], 'totalChars', content2.length);
assert.propertyVal(json[1], 'indexedPages', 0);
assert.propertyVal(json[1], 'totalPages', 0);
req.respond(
204,
200,
{
"Content-Type": "application/json",
"Last-Modified-Version": ++libraryVersion
},
""
JSON.stringify({
"successful": {
"0": {
key: attachment1.key
},
"1": {
key: attachment2.key
}
},
"unchanged": {},
"failed": {}
})
);
count--;
}
@ -285,10 +318,10 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
yield engine.start();
assert.equal(count, 0);
yield assert.eventually.equal(
Zotero.FullText.getItemVersion(attachment.id),
libraryVersion
);
yield assert.eventually.equal(Zotero.FullText.getItemVersion(attachment1.id), libraryVersion);
yield assert.eventually.equal(Zotero.FullText.getItemVersion(attachment2.id), libraryVersion);
yield assert.eventually.equal(Zotero.Fulltext.getLibraryVersion(libraryID), libraryVersion);
assert.equal(Zotero.Libraries.userLibrary.libraryVersion, libraryVersion);
//
// Upload new content
@ -296,27 +329,25 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
({ engine, client, caller } = yield setup());
yield Zotero.Libraries.setVersion(libraryID, libraryVersion);
item = yield createDataObject('item');
attachment = new Zotero.Item('attachment');
attachment.parentItemID = item.id;
attachment.attachmentLinkMode = 'imported_file';
attachment.attachmentContentType = 'text/html';
attachment.attachmentFilename = 'test.html';
attachment.attachmentCharset = 'utf-8';
attachment.synced = true;
yield attachment.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment);
var attachment3 = new Zotero.Item('attachment');
attachment3.parentItemID = item.id;
attachment3.attachmentLinkMode = 'imported_file';
attachment3.attachmentContentType = 'text/html';
attachment3.attachmentFilename = 'test.html';
attachment3.attachmentCharset = 'utf-8';
attachment3.synced = true;
yield attachment3.saveTx();
yield Zotero.Attachments.createDirectoryForItem(attachment3);
path = attachment.getFilePath();
content = generateContent()
htmlContent = "<html><body>" + content + "</body></html>";
yield Zotero.File.putContentsAsync(path, content);
yield Zotero.Fulltext.indexItems([attachment.id]);
path = attachment3.getFilePath();
var content3 = generateContent()
yield Zotero.File.putContentsAsync(path, content3);
yield Zotero.Fulltext.indexItems([attachment3.id]);
count = 1;
setResponse({
method: "GET",
url: "users/1/fulltext?since=" + previousLibraryVersion,
url: "users/1/fulltext?format=versions&since=" + libraryVersion,
status: 200,
headers: {
"Last-Modified-Version": libraryVersion
@ -324,8 +355,8 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
json: {}
});
server.respond(function (req) {
if (req.method == "PUT") {
if (req.url == `${baseURL}users/1/items/${attachment.key}/fulltext`) {
if (req.method == "POST") {
if (req.url == `${baseURL}users/1/fulltext`) {
assert.propertyVal(req.requestHeaders, 'Zotero-API-Key', apiKey);
assert.propertyVal(
req.requestHeaders,
@ -334,19 +365,30 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
);
let json = JSON.parse(req.requestBody);
assert.propertyVal(json, 'content', content);
assert.propertyVal(json, 'indexedChars', content.length);
assert.propertyVal(json, 'totalChars', content.length);
assert.lengthOf(json, 1);
json = json[0];
assert.propertyVal(json, 'key', attachment3.key);
assert.propertyVal(json, 'content', content3);
assert.propertyVal(json, 'indexedChars', content3.length);
assert.propertyVal(json, 'totalChars', content3.length);
assert.propertyVal(json, 'indexedPages', 0);
assert.propertyVal(json, 'totalPages', 0);
req.respond(
204,
200,
{
"Content-Type": "application/json",
"Last-Modified-Version": ++libraryVersion
},
""
JSON.stringify({
"successful": {
"0": {
key: attachment3.key
}
},
"unchanged": {},
"failed": {}
})
);
count--;
}
@ -355,10 +397,9 @@ describe("Zotero.Sync.Data.FullTextEngine", function () {
yield engine.start();
assert.equal(count, 0);
yield assert.eventually.equal(
Zotero.FullText.getItemVersion(attachment.id),
libraryVersion
);
yield assert.eventually.equal(Zotero.FullText.getItemVersion(attachment3.id), libraryVersion);
yield assert.eventually.equal(Zotero.Fulltext.getLibraryVersion(libraryID), libraryVersion);
assert.equal(Zotero.Libraries.userLibrary.libraryVersion, libraryVersion);
})
})
});
})

View file

@ -608,7 +608,7 @@ describe("Zotero.Sync.Runner", function () {
// Full-text syncing
setResponse({
method: "GET",
url: "users/1/fulltext",
url: "users/1/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": 5
@ -617,7 +617,7 @@ describe("Zotero.Sync.Runner", function () {
});
setResponse({
method: "GET",
url: "users/1/publications/fulltext",
url: "users/1/publications/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": 10
@ -626,7 +626,7 @@ describe("Zotero.Sync.Runner", function () {
});
setResponse({
method: "GET",
url: "groups/1623562/fulltext",
url: "groups/1623562/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": 15
@ -635,7 +635,7 @@ describe("Zotero.Sync.Runner", function () {
});
setResponse({
method: "GET",
url: "groups/2694172/fulltext",
url: "groups/2694172/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": 20
@ -780,7 +780,7 @@ describe("Zotero.Sync.Runner", function () {
});
setResponse({
method: "GET",
url: "users/1/publications/fulltext",
url: "users/1/publications/fulltext?format=versions",
status: 200,
headers: {
"Last-Modified-Version": 5