zotero/chrome/chromeFiles/content/scholar/xpcom/fulltext.js
Dan Stillman 27d1d63bfc Sped up fulltext indexing (of loaded documents, at least) by about 75%
- Switched to manually repeated bound parameters in indexWords()
- Switched to the innerHTML regex used elsewhere instead of a more proper but nevertheless misguided DOM traverser to split elements in indexDocument

This may invalidate the fulltext progress indicator ticket
2006-09-23 09:05:01 +00:00

430 lines
10 KiB
JavaScript

Scholar.Fulltext = new function(){
// Public interface: expose the functions defined below as methods of Scholar.Fulltext
// Indexing
this.indexWord = indexWord;
this.indexWords = indexWords;
this.indexDocument = indexDocument;
this.indexString = indexString;
this.indexFile = indexFile;
this.indexItems = indexItems;
// Searching file contents directly
this.findTextInFile = findTextInFile;
this.findTextInItems = findTextInItems;
// Index maintenance
this.cacheIsOutdated = cacheIsOutdated;
this.rebuildCache = rebuildCache;
this.clearItemWords = clearItemWords;
//this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords;
// Text helpers
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter;
// Current version of the fulltext index; cacheIsOutdated() compares it with the
// 'fulltext' row of the version table and indexItems() writes it back
const FULLTEXT_VERSION = 1;
/*
 * Check whether the stored fulltext index is older than the current format
 *
 * Returns true if the version recorded for the 'fulltext' schema in the
 * version table is lower than FULLTEXT_VERSION
 */
function cacheIsOutdated(){
    var storedVersion = Scholar.DB.valueQuery(
        "SELECT version FROM version WHERE schema='fulltext'"
    );
    return storedVersion < FULLTEXT_VERSION;
}
/*
 * Rebuild the fulltext index from scratch
 *
 * Empties the fulltextWords and fulltextItems tables and re-indexes every
 * item listed in itemAttachments, all within a single transaction
 */
function rebuildCache(){
    var db = Scholar.DB;
    var clearStatements = [
        "DELETE FROM fulltextWords",
        "DELETE FROM fulltextItems"
        // "DELETE FROM fulltextContent" is currently disabled
    ];

    db.beginTransaction();
    for (var n = 0; n < clearStatements.length; n++){
        db.query(clearStatements[n]);
    }

    var attachmentIDs = db.columnQuery("SELECT itemID FROM itemAttachments");
    indexItems(attachmentIDs);
    db.commitTransaction();
}
/*
* Index a single word
*/
function indexWord(itemID, word){
    // _itemID_ -- item the word belongs to
    // _word_ -- word to add to the index
    var db = Scholar.DB;
    db.beginTransaction();

    // Reuse the word's row in fulltextWords if there is one; otherwise add it,
    // taking the result of the INSERT query as the new wordID
    var wordID = db.valueQuery(
        "SELECT wordID FROM fulltextWords WHERE word=?", {string:word}
    );
    if (!wordID){
        wordID = db.query(
            "INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
        );
    }

    // Link the word to the item; an existing link is left untouched
    db.query("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)", [wordID, itemID]);
    db.commitTransaction();
}
/*
* Index multiple words at once
*/
function indexWords(itemID, words){
    // _itemID_ -- item the words belong to
    // _words_ -- array of word strings (not modified)
    //
    // Returns false if no words were passed; otherwise returns nothing
    if (!words || !words.length){
        return false;
    }

    // SQLite caps the number of bound parameters in one statement (999 by
    // default), so existing words are looked up in batches of that size
    var MAX_BOUND_PARAMS = 999;

    Scholar.DB.beginTransaction();

    // Map of '_' + word => wordID for words that already have a row
    // Underscore avoids problems with JS reserved words
    var existing = {};
    for (var start = 0; start < words.length; start += MAX_BOUND_PARAMS){
        var chunk = words.slice(start, start + MAX_BOUND_PARAMS);
        var sqlQues = [];
        var sqlParams = [];
        for (var j = 0; j < chunk.length; j++){
            sqlQues.push('?');
            sqlParams.push({string:chunk[j]});
        }

        var sql = "SELECT word, wordID from fulltextWords WHERE word IN (";
        sql += sqlQues.join() + ")";
        var rows = Scholar.DB.query(sql, sqlParams);
        for (var r in rows){
            existing['_' + rows[r]['word']] = rows[r]['wordID'];
        }
    }

    // Handle bound parameters manually for optimal speed
    var statement1 = Scholar.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
    var statement2 = Scholar.DB.getStatement("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)");
    try {
        // The item is the same for every link, so bind it once
        statement2.bindInt32Parameter(1, itemID);

        for (var i = 0; i < words.length; i++){
            var word = words[i];
            var wordID = existing['_' + word];
            if (!wordID){
                statement1.bindUTF8StringParameter(0, word);
                statement1.execute();
                wordID = Scholar.DB.getLastInsertID();
                // Remember the new ID so that a word repeated in the list
                // isn't inserted into fulltextWords a second time
                existing['_' + word] = wordID;
            }

            statement2.bindInt32Parameter(0, wordID);
            statement2.execute();
        }
    }
    finally {
        // Release the statements even if a bind or execute failed
        statement1.reset();
        statement2.reset();
    }

    Scholar.DB.commitTransaction();
}
/*
 * Replace an item's indexed words with the words found in a string
 *
 * _text_ -- text to index
 * _charset_ -- character set, handed on to semanticSplitter()
 * _itemID_ -- item the text belongs to
 */
function indexString(text, charset, itemID){
    var uniqueWords = semanticSplitter(text, charset);
    var db = Scholar.DB;

    db.beginTransaction();
    // Drop the item's previous word links before adding the new set
    clearItemWords(itemID);
    indexWords(itemID, uniqueWords);
    // Storing the full text in fulltextContent is currently disabled:
    //   REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)
    db.commitTransaction();
}
/*
 * Index the body of a loaded document
 *
 * _document_ -- document whose body.innerHTML is indexed
 * _itemID_ -- item the document belongs to (required)
 *
 * Throws a string if no item ID is given
 */
function indexDocument(document, itemID){
    if (!itemID){
        throw ('Item ID not provided to indexDocument()');
    }

    Scholar.debug("Indexing document '" + document.title + "'");

    // Follow every '>' with a space so that the text of adjacent elements
    // isn't run together once the markup is stripped
    var spacedMarkup = document.body.innerHTML.replace(/(>)/g, '$1 ');
    indexString(HTMLToText(spacedMarkup), document.characterSet, itemID);
}
/*
 * Index the contents of a text file
 *
 * _file_ -- file to index
 * _mimeType_ -- MIME type of the file; only 'text/...' files are indexed
 * _charset_ -- character set used to read the file
 * _itemID_ -- item the file belongs to
 *
 * Returns false if the file doesn't exist or isn't text
 * Throws a string if the item ID, MIME type or charset is missing
 */
function indexFile(file, mimeType, charset, itemID){
    if (!file.exists()){
        Scholar.debug('File not found in indexFile()', 2);
        return false;
    }

    if (!itemID){
        throw ('Item ID not provided to indexFile()');
    }

    if (!mimeType){
        throw ('MIME type not provided to indexFile()');
    }

    var isText = (mimeType.substr(0, 5) == 'text/');
    if (!isText){
        Scholar.debug('File is not text in indexFile()', 2);
        return false;
    }

    // The charset is only required once we know the file will be read
    if (!charset){
        throw ('Charset not provided to indexFile()');
    }

    var rawContents = Scholar.File.getContents(file, charset);
    // Split elements to avoid word concatenation
    var spacedContents = rawContents.replace(/(>)/g, '$1 ');
    indexString(HTMLToText(spacedContents), charset, itemID);
}
/*
 * Index the files of one or more attachment items
 *
 * _items_ -- item IDs, passed on to Scholar.Items.get()
 *
 * Items that aren't attachments or have no file are skipped. Afterwards the
 * 'fulltext' row of the version table is set to FULLTEXT_VERSION.
 */
function indexItems(items){
    var itemObjs = Scholar.Items.get(items);

    Scholar.DB.beginTransaction();

    // Plain for...in plus a lookup visits the same values as the
    // non-standard "for each...in" statement
    for (var key in itemObjs){
        var item = itemObjs[key];
        if (!item.isAttachment()){
            continue;
        }

        var file = item.getFile();
        if (!file){
            continue;
        }

        indexFile(file, item.getAttachmentMimeType(),
            item.getAttachmentCharset(), item.getID());
    }

    var sql = "REPLACE INTO version (schema,version) VALUES (?,?)";
    Scholar.DB.query(sql, ['fulltext', FULLTEXT_VERSION]);

    Scholar.DB.commitTransaction();
}
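/*
* Scan a file for a text string
*
* _file_ -- file to search
* _charset_ -- character set used to read the file
* _searchText_ -- text pattern to search for
* _mode_:
* 'regexp' -- regular expression (case-insensitive)
* 'regexpCS' -- regular expression (case-sensitive)
* (any other value, or none, does a case-insensitive substring search)
*
* - Slashes in regex are optional
* - Add 'Binary' to the mode to search the raw contents without converting HTML to text
*
* Returns up to 50 characters of text starting at the match, or -1 if nothing matched
*/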
function findTextInFile(file, charset, searchText, mode){
    // Returns up to 50 characters starting at the match, or -1 if nothing matched
    Scholar.debug("Searching for text '" + searchText + "' in " + file.path);

    var str = Scholar.File.getContents(file, charset);

    // If not binary mode, convert HTML to text
    if (!mode || mode.indexOf('Binary')==-1){
        // Split elements to avoid word concatenation
        str = str.replace(/(>)/g, '$1 ');

        // Parse to avoid searching on HTML
        str = HTMLToText(str);
    }

    switch (mode){
        case 'regexp':
        case 'regexpCS':
        case 'regexpBinary':
        case 'regexpCSBinary':
            // Do a multiline search by default
            var flags = 'm';
            var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
            if (parts){
                searchText = parts[1];
                // Ignore user-supplied flags
                //flags = parts[2];
            }

            if (mode.indexOf('regexpCS')==-1){
                flags += 'i';
            }

            var re = new RegExp(searchText, flags);
            // Use exec() rather than calling the RegExp object as a function,
            // which only works as a non-standard engine extension
            var matches = re.exec(str);
            if (matches){
                Scholar.debug("Text found");
                return str.substr(matches.index, 50);
            }

            break;

        default:
            // Case-insensitive
            // Note: the returned snippet is taken from the lowercased text
            searchText = searchText.toLowerCase();
            str = str.toLowerCase();

            var pos = str.indexOf(searchText);
            if (pos!=-1){
                Scholar.debug('Text found');
                return str.substr(pos, 50);
            }
    }

    return -1;
}
/*
* Scan item files for a text string
*
* _items_ -- one or more attachment items to search
* _searchText_ -- text pattern to search for
* _mode_:
* 'phrase'
* 'regexp'
* 'regexpCS' -- case-sensitive regular expression
*
* Note:
* - Slashes in regex are optional
* - Add 'Binary' to the mode to search all files, not just text files
*/
function findTextInItems(items, searchText, mode){
    // Returns an array of {id, match} objects, one per item whose file
    // matched, where _match_ is the value returned by findTextInFile()
    if (!searchText){
        return [];
    }

    var itemObjs = Scholar.Items.get(items);
    var found = [];

    // Plain for...in plus a lookup visits the same values as the
    // non-standard "for each...in" statement
    for (var key in itemObjs){
        var item = itemObjs[key];
        if (!item.isAttachment()){
            continue;
        }

        var file = item.getFile();
        if (!file){
            continue;
        }

        // If not binary mode, only scan plaintext files
        if (!mode || mode.indexOf('Binary')==-1){
            if (item.getAttachmentMimeType().substr(0,5)!='text/'){
                continue;
            }
        }

        var charset = item.getAttachmentCharset();

        var match = findTextInFile(file, charset, searchText, mode);
        if (match != -1){
            found.push({id:item.getID(), match:match});
        }
    }

    return found;
}
/*
 * Remove all word links for an item from fulltextItems
 *
 * _itemID_ -- item whose links should be deleted
 */
function clearItemWords(itemID){
    // Bind the ID rather than concatenating it into the SQL, like the other
    // queries in this file
    Scholar.DB.query("DELETE FROM fulltextItems WHERE itemID=?", [itemID]);
}
/*
function clearItemContent(itemID){
Scholar.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
}
*/
/*
 * Delete words from fulltextWords that no item in fulltextItems links to
 */
function purgeUnusedWords(){
    var linkedWordIDs = "(SELECT wordID FROM fulltextItems)";
    Scholar.DB.query("DELETE FROM fulltextWords WHERE wordID NOT IN " + linkedWordIDs);
}
/*
 * Convert HTML to plain text using the htmlformatconverter component
 *
 * _text_ -- HTML string
 *
 * Returns the converted text; if the conversion throws, the error is logged
 * and the original string is returned unchanged
 */
function HTMLToText(text){
    var classes = Components.classes;
    var interfaces = Components.interfaces;

    var converter = classes['@mozilla.org/widget/htmlformatconverter;1']
        .createInstance(interfaces.nsIFormatConverter);

    // The converter takes its input wrapped in an nsISupportsString
    var source = classes['@mozilla.org/supports-string;1']
        .createInstance(interfaces.nsISupportsString);
    source.data = text;

    // Out parameter that receives the converted data
    var result = {value:null};
    try {
        converter.convert('text/html', source, source.toString().length,
            'text/unicode', result, {});
        var plain = result.value.QueryInterface(interfaces.nsISupportsString);
        return plain.toString();
    }
    catch(e){
        Scholar.debug(e, 1);
        return text;
    }
}
/*
 * Split text into a list of unique, lowercased words
 *
 * _text_ -- text to split
 * _charset_ -- character set passed to the semantic unit scanner (null if empty)
 *
 * Returns an array of words in order of first appearance, or nothing if no
 * text was passed
 */
function semanticSplitter(text, charset){
    if (!text){
        Scholar.debug('No text to index');
        return;
    }

    // Protect characters that shouldn't take part in splitting; they are
    // restored in each word below
    text = _markTroubleChars(text);

    var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
        .createInstance(Components.interfaces.nsISemanticUnitScanner);

    var words = [], seen = {}, begin = {}, end = {}, nextPos = 0;
    serv.start(charset ? charset : null);
    do {
        // The scanner reports the bounds of the next unit through begin/end
        var next = serv.next(text, text.length, nextPos, true, begin, end);
        var str = text.substring(begin.value, end.value);

        nextPos = end.value;
        begin = {};
        end = {};

        // Skip empty units and units starting with a space or non-breaking space
        if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
            continue;
        }

        var lc = str.toLowerCase();

        // Prefix the hash key so that words matching properties every object
        // inherits (e.g. "constructor") aren't mistaken for already-seen words
        var key = ':' + lc;
        if (!seen[key]){
            seen[key] = true;
            words.push(_restoreTroubleChars(lc));
        }
    }
    while (next);

    return words;
}
/*
* Add spaces between elements, since HTMLToText doesn't
*
* NOTE: SLOW AND NOT USED!
*/
function _separateElements(node){
    // Walks _node_ and its following siblings, descending into children
    // first, and inserts a single-space text node in front of each one
    var current = node;
    while (true){
        if (current.hasChildNodes()){
            _separateElements(current.firstChild);
        }

        var spacer = node.ownerDocument.createTextNode(' ');
        current.parentNode.insertBefore(spacer, current);

        current = current.nextSibling;
        if (!current){
            break;
        }
    }
}
/*
 * Replace every apostrophe in the text with a placeholder word
 *
 * Counterpart of _restoreTroubleChars()
 */
function _markTroubleChars(text){
    // A string pattern would only replace the first occurrence, so use a
    // global regex
    text = text.replace(/'/g, "zoteroapostrophe");
    return text;
}

/*
 * Turn every placeholder inserted by _markTroubleChars() back into an apostrophe
 */
function _restoreTroubleChars(text){
    text = text.replace(/zoteroapostrophe/g, "'");
    return text;
}
}