zotero/chrome/chromeFiles/content/scholar/xpcom/fulltext.js
Dan Stillman ab13c3980a Fulltext search support
There are currently two types of fulltext searching: an SQL-based word index and a file scanner. They each have their advantages and drawbacks.

The word index is very fast to search and is currently used for the find-as-you-type quicksearch. However, indexing files takes some time, so we should probably offer a preference to turn it off ("Index attachment content for quicksearch" or something). There's also an issue with Chinese characters (which are indexed by character rather than word, since there are no spaces to go by, so a search for a word with common characters could produce erroneous results). The quicksearch doesn't use a left-bound index (since that would probably upset German speakers searching for "musik" in "nachtmusik," though I don't know for sure how they think of words) but still seems pretty fast.

* Note: There will be a potentially long delay when you start Firefox with this revision as it builds a fulltext word index of your existing items. We obviously need a notification/option for this. *

The file scanner, used in the Attachment Content condition of the search dialog, offers phrase searching as well as regex support (both case-sensitive and not, and defaulting to multiline). It doesn't require an index, though it should probably be optimized to use the word index, if available, for narrowing the results when not in regex mode. (It does only scan files that pass all the other search conditions, which speeds it up considerably for multi-condition searches, and skips non-text files unless instructed otherwise, but it's still relatively slow.)

Both convert HTML to text before searching (with the exception of the binary file scanning mode).

There are some issues with which files get indexed and which don't; we can't do much about them, and they will probably confuse users immensely. Dan C. suggested some sort of indicator (say, a green dot) to show which files are indexed.

Also added (very ugly) charset detection (anybody want to figure out getCharsetFromString(str)?), a setTimeout() replacement in the XPCOM service, an arrayToHash() method, and a new header to timedtextarea.xml, since it's really not copyright CHNM (it's really just a few lines off from the toolkit timed-textbox binding--I tried to change it to extend timed-textbox and just ignore Return keypress events so that we didn't need to duplicate the Mozilla code, but timed-textbox's reliance on html:input instead of html:textarea made things rather difficult).

To do:

- Pref/buttons to disable/clear/rebuild fulltext index
- Hidden prefs to set maximum file size to index/scan
- Don't index words of fewer than 3 non-Asian characters
- MRU cache for saved searches
- Use word index if available to narrow search scope of fulltext scanner
- Cache attachment info methods
- Show content excerpt in search results (at least in advanced search window, when it exists)
- Notification window (a la scraping) to show when indexing
- Indicator of indexed status
- Context menu option to index
- Indicator that a file scanning search is in progress, if possible
- Find other ways to make it index the NYT front page in under 10 seconds
- Probably fix lots of bugs, which you will likely start telling me about...now.
2006-09-21 00:10:29 +00:00

418 lines
9.8 KiB
JavaScript

Scholar.Fulltext = new function(){
// Public API: expose the function declarations below as methods of
// Scholar.Fulltext
this.indexWord = indexWord;
this.indexWords = indexWords;
this.indexDocument = indexDocument;
this.indexString = indexString;
this.indexFile = indexFile;
this.indexItems = indexItems;
this.findTextInFile = findTextInFile;
this.findTextInItems = findTextInItems;
this.cacheIsOutdated = cacheIsOutdated;
this.rebuildCache = rebuildCache;
this.clearItemWords = clearItemWords;
this.clearItemContent = clearItemContent;
this.purgeUnusedWords = purgeUnusedWords;
this.HTMLToText = HTMLToText;
this.semanticSplitter = semanticSplitter;
// Version of the fulltext index: cacheIsOutdated() compares it with the
// 'fulltext' row of the version table, and indexItems() writes it back
const FULLTEXT_VERSION = 1;
/*
 * Returns true if the version stored for the 'fulltext' schema is lower
 * than FULLTEXT_VERSION
 */
function cacheIsOutdated(){
    var storedVersion = Scholar.DB.valueQuery(
        "SELECT version FROM version WHERE schema='fulltext'"
    );
    return storedVersion < FULLTEXT_VERSION;
}
/*
 * Empty the fulltext word index and re-index every row of itemAttachments,
 * all within a single transaction
 */
function rebuildCache(){
    var db = Scholar.DB;
    db.beginTransaction();
    
    // Wipe the current index (stored content is left alone for now)
    db.query("DELETE FROM fulltextWords");
    db.query("DELETE FROM fulltextItems");
    //db.query("DELETE FROM fulltextContent");
    
    // Hand all attachment IDs to the indexer
    var attachmentIDs = db.columnQuery("SELECT itemID FROM itemAttachments");
    this.indexItems(attachmentIDs);
    
    db.commitTransaction();
}
/*
 * Index a single word
 *
 * _itemID_ -- ID of the item the word is linked to
 * _word_ -- word to link; it is added to fulltextWords first if missing
 */
function indexWord(itemID, word){
    Scholar.DB.beginTransaction();
    
    // Reuse the word's ID if the word is already in the index
    var wordID = Scholar.DB.valueQuery(
        "SELECT wordID FROM fulltextWords WHERE word=?", {string:word}
    );
    
    if (!wordID){
        // The result of the INSERT query is used as the new word's ID
        wordID = Scholar.DB.query(
            "INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
        );
    }
    
    // Bind both IDs explicitly as integers, the same way indexWords() does
    Scholar.DB.query(
        "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)",
        [{int:wordID}, {int:itemID}]
    );
    
    Scholar.DB.commitTransaction();
}
/*
 * Index multiple words at once
 *
 * _itemID_ -- ID of the item the words are linked to
 * _words_ -- array of words to link to the item
 *
 * Returns false if _words_ is empty
 */
function indexWords(itemID, words){
    if (!words.length){
        return false;
    }
    
    // SQLite's default cap on bound parameters in one statement; a large
    // document easily has more unique words than this, so the lookup
    // below is done in chunks
    var MAX_BOUND_PARAMS = 999;
    
    Scholar.DB.beginTransaction();
    
    // Map of already-indexed words to their IDs
    // (the underscore prefix avoids problems with JS reserved words)
    var existing = {};
    
    for (var start = 0; start < words.length; start += MAX_BOUND_PARAMS){
        var chunk = words.slice(start, start + MAX_BOUND_PARAMS);
        var sqlQues = [];
        var sqlParams = [];
        for (var c = 0; c < chunk.length; c++){
            sqlQues.push('?');
            sqlParams.push({string:chunk[c]});
        }
        
        var sql = "SELECT word, wordID from fulltextWords WHERE word IN ("
            + sqlQues.join() + ")";
        var rows = Scholar.DB.query(sql, sqlParams);
        for (var r in rows){
            existing['_' + rows[r]['word']] = rows[r]['wordID'];
        }
    }
    
    // TODO: use repeated bound statements once db.js supports it
    for (var w = 0; w < words.length; w++){
        var word = words[w];
        var key = '_' + word;
        var wordID = existing[key];
        
        if (!wordID){
            wordID = Scholar.DB.query(
                "INSERT INTO fulltextWords (word) VALUES (?)", {string:word}
            );
            // Remember the new ID so that a word occurring twice in
            // _words_ isn't inserted a second time
            existing[key] = wordID;
        }
        
        Scholar.DB.query(
            "INSERT OR IGNORE INTO fulltextItems VALUES (?,?)",
            [{int:wordID}, {int:itemID}]
        );
    }
    
    Scholar.DB.commitTransaction();
}
/*
 * Replace an item's indexed words with the words found in a string
 *
 * _text_ -- plain text to index
 * _charset_ -- charset passed through to semanticSplitter()
 * _itemID_ -- ID of the item the text belongs to
 */
function indexString(text, charset, itemID){
    // semanticSplitter() may not return an array (e.g. for empty text);
    // treat that as "no words" instead of failing inside the transaction
    var words = this.semanticSplitter(text, charset) || [];
    
    Scholar.DB.beginTransaction();
    
    // Always drop the old words, even if there is nothing new to index
    this.clearItemWords(itemID);
    if (words.length){
        this.indexWords(itemID, words);
    }
    
    /*
    var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
    Scholar.DB.query(sql, [itemID, {string:text}]);
    */
    
    Scholar.DB.commitTransaction();
}
/*
 * Index the body of a DOM document for an item
 *
 * _document_ -- document whose body is indexed
 * _itemID_ -- ID of the item the document belongs to (required)
 *
 * Note: _separateElements() adds text nodes to the passed document
 */
function indexDocument(document, itemID){
    if (!itemID){
        throw ('Item ID not provided to indexDocument()');
    }
    
    Scholar.debug("Indexing document '" + document.title + "'");
    
    // Put spaces between elements before serializing the body
    var body = document.body;
    _separateElements(body);
    
    var plainText = this.HTMLToText(body.innerHTML);
    this.indexString(plainText, document.characterSet, itemID);
}
/*
 * Index the contents of a text file for an item
 *
 * _file_ -- file to read (must provide exists())
 * _mimeType_ -- MIME type of the file; only 'text/*' files are indexed
 * _charset_ -- charset used to read the file
 * _itemID_ -- ID of the item the file belongs to
 *
 * Returns false if the file doesn't exist or isn't a text file, and
 * throws if the item ID, MIME type or charset is missing
 */
function indexFile(file, mimeType, charset, itemID){
    if (!file.exists()){
        Scholar.debug('File not found in indexFile()', 2);
        return false;
    }
    
    if (!itemID){
        throw ('Item ID not provided to indexFile()');
    }
    
    if (!mimeType){
        throw ('MIME type not provided to indexFile()');
    }
    
    var majorType = mimeType.substr(0, 5);
    if (majorType!='text/'){
        Scholar.debug('File is not text in indexFile()', 2);
        return false;
    }
    
    if (!charset){
        throw ('Charset not provided to indexFile()');
    }
    
    var contents = Scholar.File.getContents(file, charset);
    
    // Follow every '>' with a space so that words from adjacent
    // elements aren't concatenated once the markup is stripped
    var spaced = contents.replace(/(>)/g, '$1 ');
    
    this.indexString(this.HTMLToText(spaced), charset, itemID);
}
/*
 * Index the files of the given attachment items and record the current
 * fulltext version, all within one transaction
 *
 * _items_ -- passed to Scholar.Items.get() to retrieve the items;
 *            non-attachments and attachments without a file are skipped
 */
function indexItems(items){
    var itemObjs = Scholar.Items.get(items);
    
    Scholar.DB.beginTransaction();
    
    // for...in visits only the entries that exist, so this also works
    // if the returned collection is sparse
    for (var key in itemObjs){
        var item = itemObjs[key];
        if (!item.isAttachment()){
            continue;
        }
        
        var file = item.getFile();
        if (!file){
            continue;
        }
        
        this.indexFile(file, item.getAttachmentMimeType(),
            item.getAttachmentCharset(), item.getID());
    }
    
    // Mark the index as up to date (see cacheIsOutdated())
    var sql = "REPLACE INTO version (schema,version) VALUES (?,?)";
    Scholar.DB.query(sql, ['fulltext', FULLTEXT_VERSION]);
    
    Scholar.DB.commitTransaction();
}
/*
 * Scan a file for a text string
 *
 * _file_ -- file to search
 * _charset_ -- charset used to read the file
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'regexp' -- regular expression (case-insensitive)
 *    'regexpCS' -- regular expression (case-sensitive)
 *    'regexpBinary', 'regexpCSBinary' -- as above, but the file contents
 *        are searched as-is, without HTML-to-text conversion
 *    anything else -- case-insensitive substring search
 *
 * Returns up to 50 characters of the searched text starting at the match
 * (lowercased in substring mode), or -1 if nothing was found
 *
 * - Slashes in regex are optional
 * - Regex searches are multiline; user-supplied flags are ignored
 */
function findTextInFile(file, charset, searchText, mode){
    Scholar.debug("Searching for text '" + searchText + "' in " + file.path);
    
    var str = Scholar.File.getContents(file, charset);
    
    // If not binary mode, convert HTML to text
    if (!mode || mode.indexOf('Binary')==-1){
        // Split elements to avoid word concatenation
        str = str.replace(/(>)/g, '$1 ');
        
        // Parse to avoid searching on HTML
        str = this.HTMLToText(str);
    }
    
    switch (mode){
        case 'regexp':
        case 'regexpCS':
        case 'regexpBinary':
        case 'regexpCSBinary':
            // Do a multiline search by default
            var flags = 'm';
            
            // Accept both /pattern/flags and a bare pattern
            var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
            if (parts){
                searchText = parts[1];
                // Ignore user-supplied flags
                //flags = parts[2];
            }
            
            if (mode.indexOf('regexpCS')==-1){
                flags += 'i';
            }
            
            var re = new RegExp(searchText, flags);
            // Use exec() rather than calling the RegExp object directly,
            // which is a non-standard SpiderMonkey extension
            var matches = re.exec(str);
            if (matches){
                Scholar.debug("Text found");
                return str.substr(matches.index, 50);
            }
            break;
        
        default:
            // Case-insensitive
            searchText = searchText.toLowerCase();
            str = str.toLowerCase();
            
            var pos = str.indexOf(searchText);
            if (pos!=-1){
                Scholar.debug('Text found');
                return str.substr(pos, 50);
            }
    }
    
    return -1;
}
/*
 * Scan item files for a text string
 *
 * _items_ -- one or more attachment items to search
 * _searchText_ -- text pattern to search for
 * _mode_:
 *    'phrase'
 *    'regexp'
 *    'regexpCS' -- case-sensitive regular expression
 *
 * Returns an array of {id, match} objects, one per item whose file
 * matched, where _match_ is the value returned by findTextInFile()
 *
 * Note:
 *  - Slashes in regex are optional
 *  - Add 'Binary' to the mode to search all files, not just text files
 */
function findTextInItems(items, searchText, mode){
    if (!searchText){
        return [];
    }
    
    var itemObjs = Scholar.Items.get(items);
    var textOnly = !mode || mode.indexOf('Binary')==-1;
    var found = [];
    
    for (var key in itemObjs){
        var item = itemObjs[key];
        if (!item.isAttachment()){
            continue;
        }
        
        var file = item.getFile();
        if (!file){
            continue;
        }
        
        // If not binary mode, only scan plaintext files; an attachment
        // without a MIME type is treated as non-text
        if (textOnly){
            var mimeType = item.getAttachmentMimeType();
            if (!mimeType || mimeType.substr(0,5)!='text/'){
                continue;
            }
        }
        
        var charset = item.getAttachmentCharset();
        var match = this.findTextInFile(file, charset, searchText, mode);
        
        // Strict comparison: a matched excerpt such as " -1" would
        // loosely equal the -1 "not found" marker and be dropped
        if (match !== -1){
            found.push({id:item.getID(), match:match});
        }
    }
    
    return found;
}
/*
 * Remove all word links of an item from the fulltext index
 *
 * The ID is passed as a bound parameter rather than concatenated into
 * the SQL string
 */
function clearItemWords(itemID){
    var sql = "DELETE FROM fulltextItems WHERE itemID=?";
    Scholar.DB.query(sql, [{int:itemID}]);
}
/*
 * Remove the stored content of an item from fulltextContent
 *
 * The ID is passed as a bound parameter rather than concatenated into
 * the SQL string
 */
function clearItemContent(itemID){
    var sql = "DELETE FROM fulltextContent WHERE itemID=?";
    Scholar.DB.query(sql, [{int:itemID}]);
}
/*
 * Delete every word that is no longer linked to any item
 */
function purgeUnusedWords(){
    Scholar.DB.query(
        "DELETE FROM fulltextWords WHERE wordID NOT IN "
            + "(SELECT wordID FROM fulltextItems)"
    );
}
/*
 * Convert an HTML string to plain text using the XPCOM HTML format
 * converter
 *
 * If the conversion throws, the error is logged and the original string
 * is returned unchanged
 */
function HTMLToText(text){
    var Cc = Components.classes;
    var Ci = Components.interfaces;
    
    var converter = Cc['@mozilla.org/widget/htmlformatconverter;1']
        .createInstance(Ci.nsIFormatConverter);
    
    // The converter takes its input wrapped in an nsISupportsString
    var input = Cc['@mozilla.org/supports-string;1']
        .createInstance(Ci.nsISupportsString);
    input.data = text;
    
    // Out parameter that receives the converted data
    var output = {value:null};
    
    try {
        converter.convert('text/html', input, input.toString().length,
            'text/unicode', output, {});
        var converted = output.value.QueryInterface(Ci.nsISupportsString);
        return converted.toString();
    }
    catch(e){
        Scholar.debug(e, 1);
        return text;
    }
}
/*
 * Split text into a list of unique, lowercased words using the XPCOM
 * semantic unit scanner
 *
 * _text_ -- text to split
 * _charset_ -- charset passed to the scanner (null if not given)
 *
 * Returns an array of words, which is empty if there is no text
 */
function semanticSplitter(text, charset){
    if (!text){
        Scholar.debug('No text to index');
        return [];
    }
    
    text = _markTroubleChars(text);
    
    var serv = Components.classes["@mozilla.org/intl/semanticunitscanner;1"]
        .createInstance(Components.interfaces.nsISemanticUnitScanner);
    
    var words = [], seen = {}, nextPos = 0, more;
    serv.start(charset ? charset : null);
    
    do {
        // next() reports the unit's boundaries in begin.value/end.value
        var begin = {}, end = {};
        more = serv.next(text, text.length, nextPos, true, begin, end);
        var str = text.substring(begin.value, end.value);
        nextPos = end.value;
        
        // Skip empty units and units starting with a space or
        // non-breaking space
        if (!str || str.charCodeAt(0)==32 || str.charCodeAt(0)==160){
            continue;
        }
        
        // Prefix the hash key so that a word can never collide with a
        // property inherited from Object.prototype (e.g. "constructor"),
        // which would otherwise look already seen and never be indexed
        var lc = str.toLowerCase();
        var key = 'w:' + lc;
        if (!seen[key]){
            seen[key] = true;
            words.push(_restoreTroubleChars(lc));
        }
    }
    while (more);
    
    return words;
}
/*
 * Insert a space text node before the given node and before each of its
 * following siblings, recursing into child nodes first, so that text
 * from neighboring elements doesn't run together
 */
function _separateElements(node){
    var current = node;
    
    do {
        // Handle descendants before the node itself
        if (current.hasChildNodes()){
            _separateElements(current.firstChild);
        }
        
        var spacer = node.ownerDocument.createTextNode(' ');
        current.parentNode.insertBefore(spacer, current);
        
        current = current.nextSibling;
    }
    while (current);
}
/*
 * Replace every apostrophe with a placeholder word before the text is
 * split into words
 *
 * A global regex is needed because replace() with a string pattern
 * changes only the first occurrence
 */
function _markTroubleChars(text){
    text = text.replace(/'/g, "zoteroapostrophe");
    return text;
}

/*
 * Turn every placeholder inserted by _markTroubleChars() back into an
 * apostrophe
 */
function _restoreTroubleChars(text){
    text = text.replace(/zoteroapostrophe/g, "'");
    return text;
}
}