419 lines
9.8 KiB
JavaScript
419 lines
9.8 KiB
JavaScript
![]() |
Scholar.Fulltext = new function(){
|
||
|
// Public interface: expose the function declarations below as methods of
// Scholar.Fulltext. Function declarations are hoisted, so they can be
// referenced here before their definitions appear.
this.indexWord = indexWord;
this.indexWords = indexWords;
this.indexDocument = indexDocument;
this.indexString = indexString;
this.indexFile = indexFile;
this.indexItems = indexItems;
this.findTextInFile = findTextInFile;
this.findTextInItems = findTextInItems;
this.cacheIsOutdated = cacheIsOutdated;
this.rebuildCache = rebuildCache;
this.clearItemWords = clearItemWords;
this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords;
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter;

// Version of the full-text index; cacheIsOutdated() compares it with the
// stored 'fulltext' schema version and indexItems() writes it back
const FULLTEXT_VERSION = 1;
|
||
|
|
||
|
|
||
|
/*
 * Check whether the stored full-text index is older than this code expects
 *
 * Returns true if the version recorded for the 'fulltext' schema in the
 * version table compares as less than FULLTEXT_VERSION
 */
function cacheIsOutdated(){
	var sql = "SELECT version FROM version WHERE schema='fulltext'";
	return Scholar.DB.valueQuery(sql) < FULLTEXT_VERSION;
}
|
||
|
|
||
|
|
||
|
/*
 * Rebuild the full-text index from scratch
 *
 * Between beginTransaction() and commitTransaction(), empties fulltextWords
 * and fulltextItems and then passes every itemID found in itemAttachments
 * to indexItems()
 */
function rebuildCache(){
	Scholar.DB.beginTransaction();
	
	Scholar.DB.query("DELETE FROM fulltextWords");
	Scholar.DB.query("DELETE FROM fulltextItems");
	// Stored text content is currently not cleared here
	//Scholar.DB.query("DELETE FROM fulltextContent");
	
	var sql = "SELECT itemID FROM itemAttachments";
	var items = Scholar.DB.columnQuery(sql);
	this.indexItems(items);
	
	Scholar.DB.commitTransaction();
}
|
||
|
|
||
|
|
||
|
/*
 * Index a single word
 *
 * Looks up _word_ in fulltextWords, inserting it if no wordID is found,
 * and then links that wordID to _itemID_ in fulltextItems
 */
function indexWord(itemID, word){
	Scholar.DB.beginTransaction();
	
	var sql = "SELECT wordID FROM fulltextWords WHERE word=?";
	var wordID = Scholar.DB.valueQuery(sql, {string:word});
	
	if (!wordID){
		// The value returned by the INSERT query is used as the new wordID
		sql = "INSERT INTO fulltextWords (word) VALUES (?)";
		wordID = Scholar.DB.query(sql, {string:word});
	}
	
	// OR IGNORE, so an already-existing word/item pair is left alone
	sql = "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)";
	Scholar.DB.query(sql, [wordID, itemID]);
	
	Scholar.DB.commitTransaction();
}
|
||
|
|
||
|
|
||
|
/*
 * Index multiple words at once
 *
 * _itemID_ -- item to associate the words with
 * _words_ -- array of words
 *
 * Fetches the wordIDs of all words already in fulltextWords with a single
 * query, inserts the missing ones, and links every word to the item in
 * fulltextItems.
 *
 * Returns false (without touching the database) if there are no words
 */
function indexWords(itemID, words){
	// Also covers a missing word list (e.g. nothing could be extracted)
	if (!words || !words.length){
		return false;
	}
	
	var sqlQues = [];
	var sqlParams = [];
	
	for (var i=0; i<words.length; i++){
		sqlQues.push('?');
		sqlParams.push({string:words[i]});
	}
	
	Scholar.DB.beginTransaction();
	
	var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
		+ sqlQues.join() + ")";
	var wordIDs = Scholar.DB.query(sql, sqlParams);
	
	// word => wordID lookup
	var existing = {};
	for (var j in wordIDs){
		// Underscore avoids problems with JS reserved words
		existing['_' + wordIDs[j]['word']] = wordIDs[j]['wordID'];
	}
	
	// TODO: use repeated bound statements once db.js supports it
	for (i=0; i<words.length; i++){
		var word = words[i];
		var key = '_' + word;
		var wordID = existing[key];
		
		if (!wordID){
			sql = "INSERT INTO fulltextWords (word) VALUES (?)";
			wordID = Scholar.DB.query(sql, {string:word});
			// Remember the new ID so that a repeated word in _words_
			// is not inserted into fulltextWords a second time
			existing[key] = wordID;
		}
		
		sql = "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)";
		Scholar.DB.query(sql, [{int:wordID}, {int:itemID}]);
	}
	
	Scholar.DB.commitTransaction();
}
|
||
|
|
||
|
|
||
|
/*
 * Index the words of a string for an item
 *
 * Splits _text_ into words with semanticSplitter(), then replaces the
 * item's existing fulltextItems rows with rows for those words
 */
function indexString(text, charset, itemID){
	// semanticSplitter() may return nothing (e.g. for empty text); fall
	// back to an empty list so that the transaction below still completes
	// and the item's old words are cleared
	var words = this.semanticSplitter(text, charset) || [];
	
	Scholar.DB.beginTransaction();
	
	this.clearItemWords(itemID);
	this.indexWords(itemID, words);
	
	// Storing the text itself is currently disabled
	/*
	var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
	Scholar.DB.query(sql, [itemID, {string:text}]);
	*/
	
	Scholar.DB.commitTransaction();
}
|
||
|
|
||
|
|
||
|
/*
 * Index the contents of a DOM document for an item
 *
 * Note: _separateElements() inserts text nodes into the passed document
 * before its body HTML is converted to text and indexed
 *
 * Throws a string if _itemID_ is not provided
 */
function indexDocument(document, itemID){
	if (!itemID){
		throw ('Item ID not provided to indexDocument()');
	}
	
	Scholar.debug("Indexing document '" + document.title + "'");
	
	// Keep the text of adjacent elements from running together
	_separateElements(document.body);
	var text = this.HTMLToText(document.body.innerHTML);
	this.indexString(text, document.characterSet, itemID);
}
|
||
|
|
||
|
|
||
|
/*
 * Index the contents of a text file for an item
 *
 * _file_ -- file object providing exists()
 * _mimeType_ -- only 'text/*' types are indexed
 * _charset_ -- passed to Scholar.File.getContents() to read the file
 * _itemID_ -- item the words are stored for
 *
 * Returns false if the file doesn't exist or isn't a text file.
 * Throws a string if _itemID_, _mimeType_ or (for text files) _charset_
 * is missing.
 */
function indexFile(file, mimeType, charset, itemID){
	if (!file.exists()){
		Scholar.debug('File not found in indexFile()', 2);
		return false;
	}
	
	if (!itemID){ throw ('Item ID not provided to indexFile()'); }
	if (!mimeType){ throw ('MIME type not provided to indexFile()'); }
	
	if (mimeType.substr(0, 5)!='text/'){
		Scholar.debug('File is not text in indexFile()', 2);
		return false;
	}
	
	if (!charset){ throw ('Charset not provided to indexFile()'); }
	
	var text = Scholar.File.getContents(file, charset);
	// Split elements to avoid word concatenation
	text = text.replace(/(>)/g, '$1 ');
	text = this.HTMLToText(text);
	this.indexString(text, charset, itemID);
}
|
||
|
|
||
|
|
||
|
/*
 * Index the files of one or more attachment items
 *
 * _items_ -- passed to Scholar.Items.get() to retrieve the item objects
 *
 * Items that aren't attachments or have no file are skipped. Afterwards
 * the stored 'fulltext' schema version is set to FULLTEXT_VERSION.
 */
function indexItems(items){
	var itemObjs = Scholar.Items.get(items);
	
	Scholar.DB.beginTransaction();
	
	// Visit every enumerable property value of the result, whatever its
	// keys are (plain for...in replaces the non-standard 'for each')
	for (var key in itemObjs){
		var item = itemObjs[key];
		
		if (!item.isAttachment()){
			continue;
		}
		
		var file = item.getFile();
		if (!file){
			continue;
		}
		
		this.indexFile(file, item.getAttachmentMimeType(),
			item.getAttachmentCharset(), item.getID());
	}
	
	var sql = "REPLACE INTO version (schema,version) VALUES (?,?)";
	Scholar.DB.query(sql, ['fulltext', FULLTEXT_VERSION]);
	
	Scholar.DB.commitTransaction();
}
|
||
|
|
||
|
|
||
|
/*
 * Scan a file for a text string
 *
 * _file_ -- file to search
 * _charset_ -- passed to Scholar.File.getContents() to read the file
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'regexp' -- regular expression (case-insensitive)
 *    'regexpCS' -- regular expression (case-sensitive)
 *    anything else -- case-insensitive substring search
 *
 * - Slashes in regex are optional
 * - If the mode contains 'Binary' ('regexpBinary', 'regexpCSBinary'), the
 *   file contents are searched as-is rather than being converted from
 *   HTML to text first
 *
 * Returns up to 50 characters of the searched text, starting at the first
 * match (taken from the lower-cased text in substring mode), or -1 if
 * there is no match
 */
function findTextInFile(file, charset, searchText, mode){
	Scholar.debug("Searching for text '" + searchText + "' in " + file.path);
	
	var str = Scholar.File.getContents(file, charset);
	
	// If not binary mode, convert HTML to text
	if (!mode || mode.indexOf('Binary')==-1){
		// Split elements to avoid word concatenation
		str = str.replace(/(>)/g, '$1 ');
		
		// Parse to avoid searching on HTML
		str = this.HTMLToText(str);
	}
	
	switch (mode){
		case 'regexp':
		case 'regexpCS':
		case 'regexpBinary':
		case 'regexpCSBinary':
			// Do a multiline search by default
			var flags = 'm';
			var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
			if (parts){
				// Use the pattern between the slashes and ignore
				// user-supplied flags (parts[2])
				searchText = parts[1];
			}
			
			if (mode.indexOf('regexpCS')==-1){
				flags += 'i';
			}
			
			var re = new RegExp(searchText, flags);
			// exec() is the standard call; invoking a RegExp object as a
			// function was a SpiderMonkey-only extension
			var matches = re.exec(str);
			if (matches){
				Scholar.debug("Text found");
				return str.substr(matches.index, 50);
			}
			
			break;
		
		default:
			// Case-insensitive
			searchText = searchText.toLowerCase();
			str = str.toLowerCase();
			
			var pos = str.indexOf(searchText);
			if (pos!=-1){
				Scholar.debug('Text found');
				return str.substr(pos, 50);
			}
	}
	
	return -1;
}
|
||
|
|
||
|
/*
 * Scan item files for a text string
 *
 * _items_ -- one or more attachment items to search
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'phrase'
 *    'regexp'
 *    'regexpCS' -- case-sensitive regular expression
 *
 * Note:
 *  - Slashes in regex are optional
 *  - Add 'Binary' to the mode to search all files, not just text files
 *
 * Returns an array of {id, match} objects, one per item whose file
 * matched (empty if _searchText_ is empty)
 */
function findTextInItems(items, searchText, mode){
	if (!searchText){
		return [];
	}
	
	var itemObjs = Scholar.Items.get(items);
	var found = [];
	
	// Visit every enumerable property value of the result, whatever its
	// keys are (plain for...in replaces the non-standard 'for each')
	for (var key in itemObjs){
		var item = itemObjs[key];
		
		if (!item.isAttachment()){
			continue;
		}
		
		var file = item.getFile();
		if (!file){
			continue;
		}
		
		// If not binary mode, only scan plaintext files
		if (!mode || mode.indexOf('Binary')==-1){
			if (item.getAttachmentMimeType().substr(0,5)!='text/'){
				continue;
			}
		}
		
		var charset = item.getAttachmentCharset();
		
		var match = this.findTextInFile(file, charset, searchText, mode);
		
		// findTextInFile() returns -1 if nothing was found
		if (match != -1){
			found.push({id:item.getID(), match:match});
		}
	}
	
	return found;
}
|
||
|
|
||
|
|
||
|
/*
 * Remove all word associations for an item from fulltextItems
 */
function clearItemWords(itemID){
	// Bind the ID as a parameter rather than concatenating it into the SQL
	var sql = "DELETE FROM fulltextItems WHERE itemID=?";
	Scholar.DB.query(sql, [{int:itemID}]);
}
|
||
|
|
||
|
|
||
|
/*
 * Remove the stored text content of an item from fulltextContent
 */
function clearItemContent(itemID){
	// Bind the ID as a parameter rather than concatenating it into the SQL
	var sql = "DELETE FROM fulltextContent WHERE itemID=?";
	Scholar.DB.query(sql, [{int:itemID}]);
}
|
||
|
|
||
|
|
||
|
/*
 * Delete words from fulltextWords that no row in fulltextItems refers to
 */
function purgeUnusedWords(){
	var sql = "DELETE FROM fulltextWords WHERE wordID NOT IN "
		+ "(SELECT wordID FROM fulltextItems)";
	Scholar.DB.query(sql);
}
|
||
|
|
||
|
|
||
|
/*
 * Convert an HTML string to plain text using the format converter
 * component (nsIFormatConverter, 'text/html' => 'text/unicode')
 *
 * Returns the converted string, or the unchanged input if the conversion
 * throws (the error is passed to Scholar.debug())
 */
function HTMLToText(text){
	var nsIFC =
		Components.classes['@mozilla.org/widget/htmlformatconverter;1'].
			createInstance(Components.interfaces.nsIFormatConverter);
	var from = Components.classes['@mozilla.org/supports-string;1'].
		createInstance(Components.interfaces.nsISupportsString);
	from.data = text;
	// Out parameter receiving the converted data
	var to = {value:null};
	try {
		nsIFC.convert('text/html', from, from.toString().length,
			'text/unicode', to, {});
		var converted =
			to.value.QueryInterface(Components.interfaces.nsISupportsString);
		return converted.toString();
	}
	catch(e){
		// Best effort: fall back to the original text
		Scholar.debug(e, 1);
		return text;
	}
}
|
||
|
|
||
|
|
||
|
/*
 * Split text into a list of unique, lower-cased words using the
 * semantic unit scanner component (nsISemanticUnitScanner)
 *
 * _text_ -- text to split
 * _charset_ -- passed to the scanner's start() (null if not given)
 *
 * Returns an array of words in order of first appearance; the array is
 * empty if there is no text
 */
function semanticSplitter(text, charset){
	if (!text){
		Scholar.debug('No text to index');
		// Always return an array so that callers can rely on .length
		return [];
	}
	
	text = _markTroubleChars(text);
	
	var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
		.createInstance(Components.interfaces.nsISemanticUnitScanner);
	
	var words = [], seen = {}, begin = {}, end = {}, nextPos = 0, next;
	serv.start(charset ? charset : null);
	do {
		// begin and end are out parameters delimiting the next unit
		next = serv.next(text, text.length, nextPos, true, begin, end);
		var str = text.substring(begin.value, end.value);
		
		nextPos = end.value;
		begin = {};
		end = {};
		
		// Skip empty units and ones starting with a space or
		// non-breaking space
		if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
			continue;
		}
		
		var lc = str.toLowerCase();
		
		// Store only the unique ones. The key prefix keeps words such as
		// "constructor" from colliding with properties that every object
		// inherits, which would cause them to be dropped.
		var key = 'w:' + lc;
		if (!seen[key]){
			seen[key] = true;
			words.push(_restoreTroubleChars(lc));
		}
	}
	while (next);
	
	return words;
}
|
||
|
|
||
|
|
||
|
/*
 * Add spaces between elements, so that the text of adjacent elements
 * isn't run together when the markup is later reduced to plain text
 *
 * Starting at _node_, walks it and its following siblings, recursing
 * into children first, and inserts a ' ' text node in front of each
 * node visited. This modifies the document the nodes belong to.
 */
function _separateElements(node){
	var next = node;
	do {
		if (next.hasChildNodes()){
			_separateElements(next.firstChild);
		}
		
		// Inserted in front of the current node, so the new text node is
		// never visited itself
		var space = node.ownerDocument.createTextNode(' ');
		next.parentNode.insertBefore(space, next);
	}
	// Advance to the following sibling; stops when there is none
	while (next = next.nextSibling);
}
|
||
|
|
||
|
|
||
|
/*
 * Replace every apostrophe in the text with a placeholder word before it
 * is passed to the scanner (presumably so that words aren't split at
 * apostrophes -- confirm against the scanner's behavior);
 * _restoreTroubleChars() reverses this
 */
function _markTroubleChars(text){
	// A string pattern would only replace the first occurrence, so use a
	// global regular expression
	return text.replace(/'/g, "zoteroapostrophe");
}
|
||
|
|
||
|
|
||
|
/*
 * Turn every apostrophe placeholder inserted by _markTroubleChars() back
 * into an apostrophe
 */
function _restoreTroubleChars(text){
	// A string pattern would only replace the first occurrence, so use a
	// global regular expression
	return text.replace(/zoteroapostrophe/g, "'");
}
|
||
|
}
|