27d1d63bfc
- Switched to manually repeated bound parameters in indexWords() - Switched to the innerHTML regex used elsewhere instead of a more proper but nevertheless misguided DOM traverser to split elements in indexDocument This may invalidate the fulltext progress indicator ticket
430 lines
10 KiB
JavaScript
430 lines
10 KiB
JavaScript
Scholar.Fulltext = new function(){
|
|
// Public interface of Scholar.Fulltext

// Indexing
this.indexWord = indexWord;
this.indexWords = indexWords;
this.indexDocument = indexDocument;
this.indexString = indexString;
this.indexFile = indexFile;
this.indexItems = indexItems;

// Searching
this.findTextInFile = findTextInFile;
this.findTextInItems = findTextInItems;

// Index maintenance
this.cacheIsOutdated = cacheIsOutdated;
this.rebuildCache = rebuildCache;
this.clearItemWords = clearItemWords;
//this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords;

// Text helpers
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter;

// Version of the full-text index; compared by cacheIsOutdated() against the
// 'fulltext' row of the version table and written there by indexItems()
const FULLTEXT_VERSION = 1;
|
|
|
|
|
|
/*
 * Check whether the full-text index is older than the current code expects
 *
 * Returns true if the version stored for the 'fulltext' schema in the
 * version table is lower than FULLTEXT_VERSION
 */
function cacheIsOutdated(){
	var storedVersion = Scholar.DB.valueQuery(
		"SELECT version FROM version WHERE schema='fulltext'"
	);
	return storedVersion < FULLTEXT_VERSION;
}
|
|
|
|
|
|
/*
 * Empty the full-text index tables and re-index every attachment item
 *
 * All of the work happens inside a single transaction
 */
function rebuildCache(){
	var db = Scholar.DB;

	db.beginTransaction();

	// Wipe the existing index
	db.query("DELETE FROM fulltextWords");
	db.query("DELETE FROM fulltextItems");
	//db.query("DELETE FROM fulltextContent");

	// Re-index everything listed in the attachments table
	var attachmentIDs = db.columnQuery("SELECT itemID FROM itemAttachments");
	indexItems(attachmentIDs);

	db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index a single word
 *
 * Looks the word up in fulltextWords, adds it if it isn't there yet
 * (using the value returned by the INSERT query as its wordID), and then
 * links the word to the item in fulltextItems
 */
function indexWord(itemID, word){
	var db = Scholar.DB;

	db.beginTransaction();

	var id = db.valueQuery(
		"SELECT wordID FROM fulltextWords WHERE word=?", {string:word}
	);

	// Unknown word -- add it first
	if (!id){
		id = db.query(
			"INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
		);
	}

	db.query("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)", [id, itemID]);

	db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index multiple words at once
 *
 * _itemID_ -- ID of the item the words are linked to
 * _words_ -- array of words
 *
 * Words already present in fulltextWords are reused; the others are
 * inserted. A word that appears more than once in _words_ is only
 * inserted once.
 *
 * Returns false if no words were passed, otherwise nothing
 */
function indexWords(itemID, words){
	if (!words || !words.length){
		return false;
	}

	// One placeholder and one bound string per word for the IN () lookup
	var sqlQues = [];
	var sqlParams = [];
	for (var i=0; i<words.length; i++){
		sqlQues.push('?');
		sqlParams.push({string:words[i]});
	}

	Scholar.DB.beginTransaction();

	var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
		+ sqlQues.join() + ")";
	var rows = Scholar.DB.query(sql, sqlParams);

	// Map of known words to their IDs
	// Underscore avoids problems with JS reserved words
	var known = {};
	for (var r in rows){
		known['_' + rows[r]['word']] = rows[r]['wordID'];
	}

	// Handle bound parameters manually for optimal speed
	var insertWord = Scholar.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)");
	var linkWord = Scholar.DB.getStatement("INSERT OR IGNORE INTO fulltextItems VALUES (?,?)");
	// The item is the same for every row, so bind it only once
	linkWord.bindInt32Parameter(1, itemID);

	try {
		for (var j=0; j<words.length; j++){
			var word = words[j];
			var key = '_' + word;
			var wordID;

			if (Object.prototype.hasOwnProperty.call(known, key) && known[key]){
				wordID = known[key];
			}
			else {
				insertWord.bindUTF8StringParameter(0, word);
				insertWord.execute();
				wordID = Scholar.DB.getLastInsertID();
				// Remember the new ID so that a repeated word isn't inserted again
				known[key] = wordID;
			}

			linkWord.bindInt32Parameter(0, wordID);
			linkWord.execute();
		}
	}
	finally {
		// Reset the statements even if one of the executes threw
		insertWord.reset();
		linkWord.reset();
	}

	Scholar.DB.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index a string of text for an item
 *
 * The text is split into words with semanticSplitter(); the words
 * currently linked to the item are then cleared and the new ones
 * indexed, all within one transaction
 */
function indexString(text, charset, itemID){
	var words = semanticSplitter(text, charset);
	var db = Scholar.DB;

	db.beginTransaction();

	clearItemWords(itemID);
	indexWords(itemID, words);

	/*
	var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
	Scholar.DB.query(sql, [itemID, {string:text}]);
	*/

	db.commitTransaction();
}
|
|
|
|
|
|
/*
 * Index the contents of a loaded DOM document for an item
 *
 * Throws a string if no item ID is passed
 */
function indexDocument(document, itemID){
	if (!itemID){
		throw ('Item ID not provided to indexDocument()');
	}

	Scholar.debug("Indexing document '" + document.title + "'");

	// Put a space after every '>' so that words from adjacent elements
	// don't run together once the markup is stripped
	var spacedHTML = document.body.innerHTML.replace(/(>)/g, '$1 ');
	var plainText = HTMLToText(spacedHTML);

	indexString(plainText, document.characterSet, itemID);
}
|
|
|
|
|
|
/*
 * Index the contents of a text file for an item
 *
 * Returns false (after logging) if the file doesn't exist or its MIME
 * type doesn't start with 'text/'
 *
 * Throws a string if the item ID, MIME type or charset is missing
 */
function indexFile(file, mimeType, charset, itemID){
	if (!file.exists()){
		Scholar.debug('File not found in indexFile()', 2);
		return false;
	}

	if (!itemID){
		throw ('Item ID not provided to indexFile()');
	}
	if (!mimeType){
		throw ('MIME type not provided to indexFile()');
	}

	// Only text files are indexed
	var isText = mimeType.substr(0, 5)=='text/';
	if (!isText){
		Scholar.debug('File is not text in indexFile()', 2);
		return false;
	}

	if (!charset){
		throw ('Charset not provided to indexFile()');
	}

	var contents = Scholar.File.getContents(file, charset);
	// Split elements to avoid word concatenation
	var spaced = contents.replace(/(>)/g, '$1 ');
	indexString(HTMLToText(spaced), charset, itemID);
}
|
|
|
|
|
|
/*
 * Index the files of the given attachment items
 *
 * The items are loaded with Scholar.Items.get(); anything that isn't an
 * attachment or has no file is skipped. Afterwards the 'fulltext' row of
 * the version table is set to FULLTEXT_VERSION.
 */
function indexItems(items){
	var loaded = Scholar.Items.get(items);

	Scholar.DB.beginTransaction();

	for (var key in loaded){
		var item = loaded[key];

		if (item.isAttachment()){
			var file = item.getFile();
			if (file){
				indexFile(file, item.getAttachmentMimeType(),
					item.getAttachmentCharset(), item.getID());
			}
		}
	}

	// Record the version of the index that was just written
	Scholar.DB.query(
		"REPLACE INTO version (schema,version) VALUES (?,?)",
		['fulltext', FULLTEXT_VERSION]
	);

	Scholar.DB.commitTransaction();
}
|
|
|
|
|
|
/*
 * Scan a file for a text string
 *
 * _file_ -- file to search; read with Scholar.File.getContents()
 * _charset_ -- character set passed to Scholar.File.getContents()
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'regexp' -- regular expression (case-insensitive)
 *    'regexpCS' -- regular expression (case-sensitive)
 *    anything else -- case-insensitive substring search
 *
 * - Slashes in regex are optional
 * - If the mode contains 'Binary', the contents are searched as is
 *   rather than being converted from HTML to text first
 *
 * Returns up to 50 characters of the searched text starting at the match
 * (lower-cased in substring mode), or -1 if nothing was found
 */
function findTextInFile(file, charset, searchText, mode){
	Scholar.debug("Searching for text '" + searchText + "' in " + file.path);

	var str = Scholar.File.getContents(file, charset);

	// If not binary mode, convert HTML to text
	if (!mode || mode.indexOf('Binary')==-1){
		// Split elements to avoid word concatenation
		str = str.replace(/(>)/g, '$1 ');

		// Parse to avoid searching on HTML
		str = HTMLToText(str);
	}

	switch (mode){
		case 'regexp':
		case 'regexpCS':
		case 'regexpBinary':
		case 'regexpCSBinary':
			// Do a multiline search by default
			var flags = 'm';
			var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
			if (parts){
				searchText = parts[1];
				// Ignore user-supplied flags
				//flags = parts[2];
			}

			if (mode.indexOf('regexpCS')==-1){
				flags += 'i';
			}

			var re = new RegExp(searchText, flags);
			// Use exec() -- calling a RegExp object directly as a function
			// is a non-standard extension
			var matches = re.exec(str);
			if (matches){
				Scholar.debug("Text found");
				return str.substr(matches.index, 50);
			}

			break;

		default:
			// Case-insensitive
			searchText = searchText.toLowerCase();
			str = str.toLowerCase();

			var pos = str.indexOf(searchText);
			if (pos!=-1){
				Scholar.debug('Text found');
				return str.substr(pos, 50);
			}
	}

	return -1;
}
|
|
|
|
/*
 * Scan item files for a text string
 *
 * _items_ -- one or more attachment items to search
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'phrase'
 *    'regexp'
 *    'regexpCS' -- case-sensitive regular expression
 *
 * Note:
 *  - Slashes in regex are optional
 *  - Add 'Binary' to the mode to search all files, not just text files
 *
 * Returns an array of {id, match} objects, one for each item whose file
 * matched (empty if there was no search text)
 */
function findTextInItems(items, searchText, mode){
	if (!searchText){
		return [];
	}

	var loaded = Scholar.Items.get(items);
	var found = [];

	for (var key in loaded){
		var item = loaded[key];

		if (!item.isAttachment()){
			continue;
		}

		var file = item.getFile();
		if (!file){
			continue;
		}

		// If not binary mode, only scan plaintext files
		var textOnly = !mode || mode.indexOf('Binary')==-1;
		if (textOnly && item.getAttachmentMimeType().substr(0,5)!='text/'){
			continue;
		}

		var charset = item.getAttachmentCharset();
		var match = findTextInFile(file, charset, searchText, mode);

		if (match != -1){
			found.push({id:item.getID(), match:match});
		}
	}

	return found;
}
|
|
|
|
|
|
/*
 * Remove all of an item's word links from the index
 *
 * _itemID_ -- ID of the item whose fulltextItems rows are deleted
 */
function clearItemWords(itemID){
	// Bind the ID instead of concatenating it into the SQL, in line with
	// the other parameterized queries in this object
	Scholar.DB.query("DELETE FROM fulltextItems WHERE itemID=?", [itemID]);
}
|
|
|
|
|
|
/*
|
|
function clearItemContent(itemID){
|
|
Scholar.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
|
|
}
|
|
*/
|
|
|
|
|
|
/*
 * Delete the words that are no longer linked to any item
 */
function purgeUnusedWords(){
	Scholar.DB.query(
		"DELETE FROM fulltextWords WHERE wordID NOT IN (SELECT wordID FROM fulltextItems)"
	);
}
|
|
|
|
|
|
/*
 * Convert a string of HTML to plain text using nsIFormatConverter
 *
 * If the conversion throws, the error is logged and the original
 * string is returned unchanged
 */
function HTMLToText(text){
	var classes = Components.classes;
	var interfaces = Components.interfaces;

	var converter = classes['@mozilla.org/widget/htmlformatconverter;1']
		.createInstance(interfaces.nsIFormatConverter);

	// Wrap the input in an nsISupportsString for the converter
	var input = classes['@mozilla.org/supports-string;1']
		.createInstance(interfaces.nsISupportsString);
	input.data = text;

	// Out parameter that receives the converted data
	var output = {value:null};

	try {
		converter.convert('text/html', input, input.toString().length,
			'text/unicode', output, {});
		var converted = output.value.QueryInterface(interfaces.nsISupportsString);
		return converted.toString();
	}
	catch(e){
		Scholar.debug(e, 1);
		return text;
	}
}
|
|
|
|
|
|
/*
 * Split text into a list of unique, lower-cased words using
 * nsISemanticUnitScanner
 *
 * _text_ -- string to split
 * _charset_ -- passed to the scanner's start(); null is passed instead
 *              of an empty value
 *
 * Returns an array of words, or nothing if no text was passed
 */
function semanticSplitter(text, charset){
	if (!text){
		Scholar.debug('No text to index');
		return;
	}

	text = _markTroubleChars(text);

	var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
		.createInstance(Components.interfaces.nsISemanticUnitScanner);

	var words = [], unique = {}, begin = {}, end = {}, nextPos = 0;
	var hasOwn = Object.prototype.hasOwnProperty;

	serv.start(charset ? charset : null);
	do {
		var next = serv.next(text, text.length, nextPos, true, begin, end);
		var str = text.substring(begin.value, end.value);

		// Continue from the end of this unit with fresh out parameters
		nextPos = end.value;
		begin = {};
		end = {};

		// Skip empty units and units starting with a space or
		// non-breaking space
		if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
			continue;
		}

		// Store each word once, in lower case. Checking for an own
		// property keeps words such as 'constructor', which match a
		// property inherited from Object.prototype, from being dropped.
		var lc = str.toLowerCase();
		if (!hasOwn.call(unique, lc)){
			unique[lc] = true;
		}
	}
	while (next);

	for (var word in unique){
		if (hasOwn.call(unique, word)){
			words.push(_restoreTroubleChars(word));
		}
	}

	return words;
}
|
|
|
|
|
|
/*
 * Add spaces between elements, since HTMLToText doesn't
 *
 * Starting at the given node, a text node containing a single space is
 * inserted before the node and before each of its following siblings,
 * after their children have been handled the same way
 *
 * NOTE: SLOW AND NOT USED!
 */
function _separateElements(node){
	var current = node;

	while (true){
		// Handle the children first
		if (current.hasChildNodes()){
			_separateElements(current.firstChild);
		}

		var spacer = node.ownerDocument.createTextNode(' ');
		current.parentNode.insertBefore(spacer, current);

		current = current.nextSibling;
		if (!current){
			break;
		}
	}
}
|
|
|
|
|
|
/*
 * Replace every apostrophe in the text with the placeholder
 * 'zoteroapostrophe'
 *
 * Called by semanticSplitter() before the text is scanned;
 * _restoreTroubleChars() reverses the substitution
 */
function _markTroubleChars(text){
	// A string pattern would only replace the first apostrophe, so use a
	// global regex
	text = text.replace(/'/g, "zoteroapostrophe");
	return text;
}
|
|
|
|
|
|
/*
 * Turn every 'zoteroapostrophe' placeholder in the text back into an
 * apostrophe
 *
 * Called by semanticSplitter() on each word it returns
 */
function _restoreTroubleChars(text){
	// A string pattern would only replace the first placeholder, so use a
	// global regex
	text = text.replace(/zoteroapostrophe/g, "'");
	return text;
}
|
|
}
|