// zotero/chrome/chromeFiles/content/scholar/xpcom/fulltext.js

Scholar.Fulltext = new function(){
	// Public API: expose the functions declared below as methods of
	// Scholar.Fulltext
	this.indexWord = indexWord;
	this.indexWords = indexWords;
	this.indexDocument = indexDocument;
	this.indexString = indexString;
	this.indexFile = indexFile;
	this.indexItems = indexItems;
	this.findTextInFile = findTextInFile;
	this.findTextInItems = findTextInItems;
	this.cacheIsOutdated = cacheIsOutdated;
	this.rebuildCache = rebuildCache;
	this.clearItemWords = clearItemWords;
	this.clearItemContent = clearItemContent;
	this.purgeUnusedWords = purgeUnusedWords;
	this.HTMLToText = HTMLToText;
	this.semanticSplitter = semanticSplitter;

	// Current version of the full-text index. cacheIsOutdated() compares the
	// 'fulltext' row of the version table against it, and indexItems() writes
	// it to that row.
	const FULLTEXT_VERSION = 1;


	/*
	 * Check whether the stored full-text index version is older than
	 * FULLTEXT_VERSION
	 *
	 * Returns true if the version recorded for the 'fulltext' schema is
	 * lower than the current one
	 */
	function cacheIsOutdated(){
		var storedVersion = Scholar.DB.valueQuery(
			"SELECT version FROM version WHERE schema='fulltext'"
		);
		return storedVersion < FULLTEXT_VERSION;
	}


	/*
	 * Empty the word index and re-index every attachment item
	 *
	 * Everything runs inside a single transaction. The fulltextContent
	 * table is not cleared.
	 */
	function rebuildCache(){
		var db = Scholar.DB;

		db.beginTransaction();

		db.query("DELETE FROM fulltextWords");
		db.query("DELETE FROM fulltextItems");

		// Re-index everything listed in itemAttachments
		var attachmentIDs = db.columnQuery("SELECT itemID FROM itemAttachments");
		this.indexItems(attachmentIDs);

		db.commitTransaction();
	}


	/*
	 * Index a single word
	 *
	 * Looks the word up in fulltextWords, adds it if it isn't there yet, and
	 * then links it to the item in fulltextItems
	 */
	function indexWord(itemID, word){
		Scholar.DB.beginTransaction();

		// Reuse the existing word row if there is one
		var wordID = Scholar.DB.valueQuery(
			"SELECT wordID FROM fulltextWords WHERE word=?", {string:word}
		);
		if (!wordID){
			// The result of the INSERT is used as the new word's ID
			wordID = Scholar.DB.query(
				"INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
			);
		}

		Scholar.DB.query(
			"INSERT OR IGNORE INTO fulltextItems VALUES (?,?)", [wordID, itemID]
		);

		Scholar.DB.commitTransaction();
	}


	/*
	 * Index multiple words at once
	 *
	 * _itemID_ -- ID of the item the words belong to
	 * _words_ -- array of word strings (not modified)
	 *
	 * Returns false without touching the DB if there are no words;
	 * otherwise returns nothing
	 */
	function indexWords(itemID, words){
		// semanticSplitter() returns undefined when there is no text
		if (!words || !words.length){
			return false;
		}

		// SQLite limits the number of bound parameters per statement
		// (999 by default), so existing words are looked up in batches
		var maxParams = 999;

		Scholar.DB.beginTransaction();

		// Hash of '_' + word => wordID
		// (underscore avoids problems with JS reserved words)
		var existing = {};

		for (var start = 0; start < words.length; start += maxParams){
			var chunk = words.slice(start, start + maxParams);
			var sqlQues = [];
			var sqlParams = [];

			for (var i = 0; i < chunk.length; i++){
				sqlQues.push('?');
				sqlParams.push({string:chunk[i]});
			}

			var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
				+ sqlQues.join() + ")";
			var rows = Scholar.DB.query(sql, sqlParams);

			// Tolerate a falsy result, as the original for...in loop did
			for (var j = 0; rows && j < rows.length; j++){
				existing['_' + rows[j]['word']] = rows[j]['wordID'];
			}
		}

		// TODO: use repeated bound statements once db.js supports it
		for (var k = 0; k < words.length; k++){
			var word = words[k];
			var wordID = existing['_' + word];

			if (!wordID){
				// The result of the INSERT is used as the new word's ID
				wordID = Scholar.DB.query(
					"INSERT INTO fulltextWords (word) VALUES (?)", {string:word});
				// Remember it so that a repeated word isn't inserted twice
				existing['_' + word] = wordID;
			}

			Scholar.DB.query("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)",
				[{int:wordID}, {int:itemID}]);
		}

		Scholar.DB.commitTransaction();
	}


	/*
	 * Index the words of a string for an item, replacing the words
	 * previously indexed for that item
	 *
	 * _text_ -- text to split into words
	 * _charset_ -- passed through to semanticSplitter()
	 * _itemID_ -- ID of the item the text belongs to
	 */
	function indexString(text, charset, itemID){
		// semanticSplitter() returns undefined for empty text; fall back to an
		// empty list so that indexWords() isn't handed undefined after the
		// transaction has been opened (which left it uncommitted)
		var words = this.semanticSplitter(text, charset) || [];

		Scholar.DB.beginTransaction();

		this.clearItemWords(itemID);
		this.indexWords(itemID, words);

		// Storing the text itself in fulltextContent is currently disabled

		Scholar.DB.commitTransaction();
	}


	/*
	 * Index the contents of a loaded DOM document for an item
	 *
	 * Space text nodes are inserted into the document itself (via
	 * _separateElements()) before its body HTML is converted to text.
	 *
	 * Throws a string if no item ID is given
	 */
	function indexDocument(document, itemID){
		if (!itemID){
			throw ('Item ID not provided to indexDocument()');
		}

		Scholar.debug("Indexing document '" + document.title + "'");

		var body = document.body;
		_separateElements(body);

		var plainText = this.HTMLToText(body.innerHTML);
		this.indexString(plainText, document.characterSet, itemID);
	}


	/*
	 * Index the contents of a text file for an item
	 *
	 * Returns false (after a debug message) if the file doesn't exist or the
	 * MIME type isn't text/*. Throws a string if the item ID, MIME type or
	 * charset is missing.
	 */
	function indexFile(file, mimeType, charset, itemID){
		if (!file.exists()){
			Scholar.debug('File not found in indexFile()', 2);
			return false;
		}

		if (!itemID){
			throw ('Item ID not provided to indexFile()');
		}
		if (!mimeType){
			throw ('MIME type not provided to indexFile()');
		}

		var isText = mimeType.substr(0, 5)=='text/';
		if (!isText){
			Scholar.debug('File is not text in indexFile()', 2);
			return false;
		}

		// Only required once we know the file is going to be read
		if (!charset){
			throw ('Charset not provided to indexFile()');
		}

		// Put a space after every '>' so that the text of adjacent elements
		// isn't concatenated
		var spaced = Scholar.File.getContents(file, charset).replace(/(>)/g, '$1 ');
		var plainText = this.HTMLToText(spaced);
		this.indexString(plainText, charset, itemID);
	}


	/*
	 * Index the files of the given attachment items and record the current
	 * full-text index version
	 *
	 * _items_ -- passed to Scholar.Items.get() to load the items
	 *
	 * Items that aren't attachments or that have no file are skipped
	 */
	function indexItems(items){
		var itemObjs = Scholar.Items.get(items);

		Scholar.DB.beginTransaction();

		// for...in plus a lookup visits the same values as the non-standard
		// |for each...in| statement, which most engines no longer parse
		for (var key in itemObjs){
			var item = itemObjs[key];

			if (!item.isAttachment()){
				continue;
			}

			var file = item.getFile();
			if (!file){
				continue;
			}

			this.indexFile(file, item.getAttachmentMimeType(),
				item.getAttachmentCharset(), item.getID());
		}

		var sql = "REPLACE INTO version (schema,version) VALUES (?,?)";
		Scholar.DB.query(sql, ['fulltext', FULLTEXT_VERSION]);

		Scholar.DB.commitTransaction();
	}


	/*
	 * Scan a file for a text string
	 *
	 * _file_ -- file to search (read with Scholar.File.getContents())
	 * _charset_ -- character set used to read the file
	 * _searchText_ -- text pattern to search for
	 * _mode_:
	 *    'regexp' -- regular expression (case-insensitive)
	 *    'regexpCS' -- regular expression (case-sensitive)
	 *    any other value -- case-insensitive substring search
	 *
	 * Notes:
	 *  - Slashes in regex are optional
	 *  - Add 'Binary' to the mode to search the raw file contents instead
	 *    of the contents converted from HTML to text
	 *
	 * Returns up to 50 characters of the searched text starting at the match
	 * (lowercased for substring searches), or -1 if nothing was found
	 */
	function findTextInFile(file, charset, searchText, mode){
		Scholar.debug("Searching for text '" + searchText + "' in " + file.path);

		var str = Scholar.File.getContents(file, charset);

		// If not binary mode, convert HTML to text
		if (!mode || mode.indexOf('Binary')==-1){
			// Split elements to avoid word concatenation
			str = str.replace(/(>)/g, '$1 ');

			// Parse to avoid searching on HTML
			str = this.HTMLToText(str);
		}

		switch (mode){
			case 'regexp':
			case 'regexpCS':
			case 'regexpBinary':
			case 'regexpCSBinary':
				// Do a multiline search by default
				var flags = 'm';
				var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
				if (parts){
					// Keep only the pattern; user-supplied flags (parts[2])
					// are ignored
					searchText = parts[1];
				}

				if (mode.indexOf('regexpCS')==-1){
					flags += 'i';
				}

				var re = new RegExp(searchText, flags);
				// Calling a RegExp object as a function is non-standard;
				// exec() returns the same match array
				var matches = re.exec(str);
				if (matches){
					Scholar.debug("Text found");
					return str.substr(matches.index, 50);
				}

				break;

			default:
				// Case-insensitive
				searchText = searchText.toLowerCase();
				str = str.toLowerCase();

				var pos = str.indexOf(searchText);
				if (pos!=-1){
					Scholar.debug('Text found');
					return str.substr(pos, 50);
				}
		}

		return -1;
	}

	/*
	 * Scan item files for a text string
	 *
	 * _items_ -- one or more attachment items to search
	 *            (passed to Scholar.Items.get())
	 * _searchText_ -- text pattern to search for
	 * _mode_:
	 *    'phrase'
	 *    'regexp'
	 *    'regexpCS' -- case-sensitive regular expression
	 *
	 * Note:
	 *  - Slashes in regex are optional
	 *  - Add 'Binary' to the mode to search all files, not just text files
	 *
	 * Returns an array of {id, match} objects, one per item whose file
	 * matched; empty if there is no search text
	 */
	function findTextInItems(items, searchText, mode){
		if (!searchText){
			return [];
		}

		var itemObjs = Scholar.Items.get(items);
		var found = [];

		// If not binary mode, only scan plaintext files
		var textOnly = !mode || mode.indexOf('Binary')==-1;

		// for...in plus a lookup visits the same values as the non-standard
		// |for each...in| statement, which most engines no longer parse
		for (var key in itemObjs){
			var item = itemObjs[key];

			if (!item.isAttachment()){
				continue;
			}

			var file = item.getFile();
			if (!file){
				continue;
			}

			if (textOnly){
				var mimeType = item.getAttachmentMimeType();
				// An attachment without a MIME type can't be assumed to be
				// text, and must not abort the whole search
				if (!mimeType || mimeType.substr(0,5)!='text/'){
					continue;
				}
			}

			var charset = item.getAttachmentCharset();
			var match = this.findTextInFile(file, charset, searchText, mode);

			// Strict comparison: a snippet such as "-1" is a hit, not the
			// numeric not-found value
			if (match !== -1){
				found.push({id:item.getID(), match:match});
			}
		}

		return found;
	}


	/*
	 * Delete every fulltextItems row belonging to the given item
	 */
	function clearItemWords(itemID){
		var sql = "DELETE FROM fulltextItems WHERE itemID=" + itemID;
		Scholar.DB.query(sql);
	}


	/*
	 * Delete every fulltextContent row belonging to the given item
	 */
	function clearItemContent(itemID){
		var sql = "DELETE FROM fulltextContent WHERE itemID=" + itemID;
		Scholar.DB.query(sql);
	}


	/*
	 * Delete the words in fulltextWords that no fulltextItems row refers to
	 */
	function purgeUnusedWords(){
		Scholar.DB.query(
			"DELETE FROM fulltextWords WHERE wordID NOT IN "
				+ "(SELECT wordID FROM fulltextItems)"
		);
	}


	/*
	 * Convert HTML to plain text with the
	 * '@mozilla.org/widget/htmlformatconverter;1' component
	 *
	 * Returns the converted text, or the input unchanged if the conversion
	 * throws (the exception is passed to Scholar.debug())
	 */
	function HTMLToText(text){
		var classes = Components.classes;

		var converter = classes['@mozilla.org/widget/htmlformatconverter;1']
			.createInstance(Components.interfaces.nsIFormatConverter);

		// Wrap the input in an nsISupportsString for the converter
		var source = classes['@mozilla.org/supports-string;1']
			.createInstance(Components.interfaces.nsISupportsString);
		source.data = text;

		// The converter stores its output in result.value
		var result = {value:null};
		try {
			converter.convert('text/html', source, source.toString().length,
				'text/unicode', result, {});
			var plain = result.value.QueryInterface(
				Components.interfaces.nsISupportsString);
			return plain.toString();
		}
		catch(e){
			Scholar.debug(e, 1);
			return text;
		}
	}


	/*
	 * Split text into a list of unique, lowercased words using the
	 * '@mozilla.org/intl/semanticunitscanner;1' component
	 *
	 * _text_ -- text to split
	 * _charset_ -- passed to the scanner's start() (null if not given)
	 *
	 * Returns an array of words in order of first appearance, or undefined
	 * (after a debug message) if there is no text
	 */
	function semanticSplitter(text, charset){
		if (!text){
			Scholar.debug('No text to index');
			return;
		}

		text = _markTroubleChars(text);

		var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
				.createInstance(Components.interfaces.nsISemanticUnitScanner);

		var words = [];
		var seen = {};
		var nextPos = 0;
		var next;

		serv.start(charset ? charset : null);
		do {
			// Fresh out-parameter holders for every unit
			var begin = {};
			var end = {};
			next = serv.next(text, text.length, nextPos, true, begin, end);
			nextPos = end.value;

			var str = text.substring(begin.value, end.value);

			// Skip empty units and units starting with a space or a
			// non-breaking space
			if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
				continue;
			}

			var lc = str.toLowerCase();

			// Prefix the hash key so that words such as "constructor" don't
			// collide with members inherited from Object.prototype, which
			// made them look already seen and kept them out of the index
			var key = 'w:' + lc;
			if (!seen[key]){
				seen[key] = true;
				words.push(_restoreTroubleChars(lc));
			}
		}
		while (next);

		return words;
	}


	/*
	 * Insert a space text node in front of the given node, each of its
	 * following siblings and (recursively) all of their descendants, so
	 * that there are spaces between elements
	 */
	function _separateElements(node){
		var current = node;
		do {
			// Descend first so that the children get their own separators
			if (current.hasChildNodes()){
				_separateElements(current.firstChild);
			}

			var spacer = node.ownerDocument.createTextNode(' ');
			current.parentNode.insertBefore(spacer, current);

			current = current.nextSibling;
		}
		while (current);
	}


	/*
	 * Replace every apostrophe with a placeholder word before the text is
	 * split into words (reversed by _restoreTroubleChars())
	 */
	function _markTroubleChars(text){
		// A string pattern would only replace the first occurrence, so use
		// a global regex
		return text.replace(/'/g, "zoteroapostrophe");
	}


	/*
	 * Turn every apostrophe placeholder back into an apostrophe
	 */
	function _restoreTroubleChars(text){
		return text.replace(/zoteroapostrophe/g, "'");
	}
}