Merge pull request #440 from friflaj/indexing

Faster indexed word insert, optional faster (but simpler) word splitter
2014-01-23 12:21:58 -08:00 · 2014-01-23 12:21:58 -08:00 · f40aeeb9e6
commit f40aeeb9e6
parent fb5d0b8d1d 7395dc8275
1 changed files with 85 additions and 126 deletions
--- a/chrome/content/zotero/xpcom/fulltext.js
+++ b/chrome/content/zotero/xpcom/fulltext.js
@ -71,6 +71,16 @@ Zotero.Fulltext = new function(){
 	const _processorCacheFile = '.zotero-ft-unprocessed';
 	const kWbClassSpace =            0;
 	const kWbClassAlphaLetter =      1;
 	const kWbClassPunct =            2;
 	const kWbClassHanLetter =        3;
 	const kWbClassKatakanaLetter =   4;
 	const kWbClassHiraganaLetter =   5;
 	const kWbClassHWKatakanaLetter = 6;
 	const kWbClassThaiLetter =       7;
 	var _pdfConverterVersion = null;
 	var _pdfConverterFileName = null;
 	var _pdfConverter = null; // nsIFile to executable
@ -92,6 +102,12 @@ Zotero.Fulltext = new function(){
 	var self = this;
 	function init() {
 		Zotero.DB.query("ATTACH ':memory:' AS 'indexing'");
 		Zotero.DB.query('CREATE TABLE indexing.fulltextWords (word NOT NULL)');
 		this.decoder = Components.classes["@mozilla.org/intl/utf8converterservice;1"].
 			getService(Components.interfaces.nsIUTF8ConverterService);
 		var platform = Zotero.platform.replace(' ', '-');
 		_pdfConverterFileName = this.pdfConverterName + '-' + platform;
 		_pdfInfoFileName = this.pdfInfoName + '-' + platform;
@ -122,6 +138,31 @@ Zotero.Fulltext = new function(){
 	}
 	// this is a port from http://mxr.mozilla.org/mozilla-central/source/intl/lwbrk/src/nsSampleWordBreaker.cpp to
 	// Javascript to avoid the overhead of xpcom calls. The port keeps to the mozilla naming of interfaces/constants as
 	// closely as possible.
 	function getClass(c, cc) {
 		if (cc < 0x2E80) { //alphabetical script
 			if ((cc & 0xFF80) == 0) { // ascii
 				if (c == ' '  || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; }
 				// deviation from Mozilla algorithm: count "'" as an alphaletter
 				if (c == "'" || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
 				return kWbClassPunct;
 			}
 			if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; }
 			if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; }
 			return kWbClassAlphaLetter;
 		}
 		if ((cc >= 0x3400 && cc <= 0x9fff) || (cc>= 0xf900 && cc <= 0xfaff)) /*han*/ { return kWbClassHanLetter; }
 		if (cc >= 0x30A0 && cc <= 0x30FF) { return kWbClassKatakanaLetter; }
 		if (cc >= 0x3040 && cc <= 0x309F) { return kWbClassHiraganaLetter; }
 		if (cc>= 0xFF60 && cc <= 0xFF9F) { return kWbClassHWKatakanaLetter; }
 		return kWbClassAlphaLetter;
 	}
 	/*
 	 * Looks for pdftotext-{platform}[.exe] in the Zotero data directory
 	 *
@ -135,12 +176,12 @@ Zotero.Fulltext = new function(){
 		switch (tool) {
 			case 'converter':
 				var toolName = this.pdfConverterName;
-				var fileName = _pdfConverterFileName
+				var fileName = _pdfConverterFileName;
 				break;
 			case 'info':
 				var toolName = this.pdfInfoName;
-				var fileName = _pdfInfoFileName
+				var fileName = _pdfInfoFileName;
 				break;
 			default:
@ -214,77 +255,21 @@ Zotero.Fulltext = new function(){
 	 * Index multiple words at once
 	 */
 	function indexWords(itemID, words) {
-		if (!words || !words.length || !itemID){
+		let chunk;
 			return false;
 		}
 		var existing = [];
 		var done = 0;
 		var maxWords = 999; // compiled limit
 		var numWords = words.length;
 		Zotero.DB.beginTransaction();
-		
+		Zotero.DB.query("DELETE FROM indexing.fulltextWords");
-		var origWords = [];
+		while (words.length > 0) {
-		
+			chunk = words.splice(0, 100);
-		do {
+			Zotero.DB.query('INSERT INTO indexing.fulltextWords (word) ' + ['SELECT ?' for (word of chunk)].join(' UNION '), chunk);
 			var chunk = words.splice(0, maxWords);
 			origWords = origWords.concat(chunk);
 			var sqlQues = [];
 			var sqlParams = [];
 			for each(var word in chunk) {
 				sqlQues.push('?');
 				sqlParams.push( { string: word } );
 			}
 			var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
 			sql += sqlQues.join() + ")";
 			var wordIDs = Zotero.DB.query(sql, sqlParams);
 			for (var i in wordIDs) {
 				// Underscore avoids problems with JS reserved words
 				existing['_' + wordIDs[i].word] = wordIDs[i].wordID;
 			}
 			done += chunk.length;
 		}
-		while (done < numWords);
+		Zotero.DB.query('INSERT OR IGNORE INTO fulltextWords (word) SELECT word FROM indexing.fulltextWords');
-		
+		Zotero.DB.query('DELETE FROM fulltextItemWords WHERE itemID = ?', [itemID]);
-		if (!Zotero.DB.valueQuery("SELECT COUNT(*) FROM fulltextItems WHERE itemID=?", itemID)) {
+		Zotero.DB.query('INSERT INTO fulltextItemWords (wordID, itemID) SELECT wordID, ? FROM fulltextWords JOIN indexing.fulltextWords USING(word)', [itemID]);
-			let sql = "INSERT INTO fulltextItems (itemID, version) VALUES (?,?)";
+		Zotero.DB.query("REPLACE INTO fulltextItems (itemID, version) VALUES (?,?)", [itemID, 0]);
-			Zotero.DB.query(sql, [itemID, 0]);
+		Zotero.DB.query("DELETE FROM indexing.fulltextWords");
-		}
+	
 		// Handle bound parameters manually for optimal speed
 		var statement1 = Zotero.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
 		var statement2 = Zotero.DB.getStatement("INSERT OR IGNORE INTO fulltextItemWords VALUES (?,?)");
 		for each(var word in origWords) {
 			// Skip words containing invalid characters
 			if (word.match(/[\u0000-\u0008\u000b\u000c\u000e-\u001f\ud800-\udfff\ufffe\uffff]/)) {
 				Zotero.debug("Skipping word '" + word + "' due to invalid characters");
 				continue;
 			}
 			if (existing['_' + word]){
 				var wordID = existing['_' + word];
 			}
 			else {
 				statement1.bindUTF8StringParameter(0, word);
 				statement1.execute()
 				var wordID = Zotero.DB.getLastInsertID();
 			}
 			statement2.bindInt32Parameter(0, wordID);
 			statement2.bindInt32Parameter(1, itemID);
 			statement2.execute();
 		}
 		statement1.reset();
 		statement2.reset();
 		Zotero.DB.commitTransaction();
 		return true;
 	}
@ -937,16 +922,6 @@ Zotero.Fulltext = new function(){
 			.then(function (json) {
 				data = JSON.parse(json);
 				// TEMP: until we replace nsISemanticUnitScanner
 				if (data.text.length > 250000) {
 					let item = Zotero.Items.get(itemID);
 					Zotero.debug("Skipping processing of full-text content for item "
 						+ item.libraryKey + " with length " + data.text.length
 						+ " -- will be processed in future version", 2);
 					_processorBlacklist[itemID] = true;
 					return false;
 				}
 				// Write the text content to the regular cache file
 				cacheFile = self.getItemCacheFile(itemID);
@ -1553,43 +1528,39 @@ Zotero.Fulltext = new function(){
 			Zotero.debug('No text to index');
 			return;
 		}
-		
+
-		text = _markTroubleChars(text);
+		try {
-		
+			if (charset && charset != 'utf-8') {
-		var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
+				text = this.decoder.convertStringToUTF8(text, charset, true);
 				.createInstance(Components.interfaces.nsISemanticUnitScanner);
 		var words = [], unique = {}, begin = {}, end = {}, nextPos = 0;
 		serv.start(charset ? charset : null);
 		do {
 			var next = serv.next(text, text.length, nextPos, true, begin, end);
 			var str = text.substring(begin.value, end.value);
 			// Skip non-breaking spaces
 			if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
 				nextPos = end.value;
 				begin = {}, end = {};
 				continue;
 			}
-			
+		} catch (err) {
-			// Create alphanum hash keys out of the character codes
+			Zotero.debug(err, 1);
 			var lc = str.toLowerCase();
 			// And store the unique ones
 			if (!unique[lc]){
 				unique[lc] = true;
 			}
 			nextPos = end.value;
 			begin = {}, end = {};
 		}
 		while (next);
 		for (var i in unique){
 			words.push(_restoreTroubleChars(i));
 		}
-		return words;
+		var words = {};
 		var word = '';
 		var cclass = null;
 		var strlen = text.length;
 		for (var i = 0; i < strlen; i++) {
 			var c = text.charAt(i);
 			var cc = getClass(c, text.charCodeAt(i));
 			if (cc == kWbClassSpace || cc == kWbClassPunct) {
 				if (word != '') { words[word] = true; word = ''; }
 			} else if (cc == kWbClassHanLetter) {
 				if (word != '') { words[word] = true; word = ''; }
 				words[c] = true;
 			} else if (cc == cclass) {
 				word += c.toLowerCase();
 			} else {
 				if (word != '') { words[word] = true; }
 				word = c.toLowerCase();
 			}
 			cclass = cc;
 		}
 		if (word != '') { words[word] = true; }
 		return Object.keys(words);
 	}
@ -1610,16 +1581,4 @@ Zotero.Fulltext = new function(){
 		}
 		while (next = next.nextSibling);
 	}
 	function _markTroubleChars(text){
 		text = text.replace(/'/g, "zoteroapostrophe");
 		return text;
 	}
 	function _restoreTroubleChars(text){
 		text = text.replace(/zoteroapostrophe/g, "'");
 		return text;
 	}
 }