More full-text indexing tweaks
Follow up from #440 - Convert curly single quotes to straight quotes before inserting - Add General and Supplemental Unicode punctuation ranges to getClass() (so that fancy punctuation doesn't end up in words) - Move single-quote test from getClass() to semanticSplitter(), and consider it a letter only if in the middle of a word - Add comments to semanticSplitter(). This might be ever-so-slightly slower, but it's negligible. (War and Peace seems to now take ~1570ms instead of ~1500ms for me.)
This commit is contained in:
parent
7036d07acc
commit
f9f61fca86
1 changed file with 52 additions and 12 deletions
|
@ -145,13 +145,15 @@ Zotero.Fulltext = new function(){
|
||||||
if (cc < 0x2E80) { //alphabetical script
|
if (cc < 0x2E80) { //alphabetical script
|
||||||
if ((cc & 0xFF80) == 0) { // ascii
|
if ((cc & 0xFF80) == 0) { // ascii
|
||||||
if (c == ' ' || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; }
|
if (c == ' ' || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; }
|
||||||
|
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
|
||||||
// deviation from Mozilla algorithm: count "'" as an alphaletter
|
|
||||||
if (c == "'" || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
|
|
||||||
return kWbClassPunct;
|
return kWbClassPunct;
|
||||||
}
|
}
|
||||||
if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; }
|
if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; }
|
||||||
if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; }
|
if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; }
|
||||||
|
|
||||||
|
// General and Supplemental Unicode punctuation
|
||||||
|
if ((cc >= 0x2000 && cc <= 0x206f) || (cc >= 0x2e00 && cc <= 0x2e7f)) { return kWbClassPunct; }
|
||||||
|
|
||||||
return kWbClassAlphaLetter;
|
return kWbClassAlphaLetter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1542,24 +1544,62 @@ Zotero.Fulltext = new function(){
|
||||||
var cclass = null;
|
var cclass = null;
|
||||||
var strlen = text.length;
|
var strlen = text.length;
|
||||||
for (var i = 0; i < strlen; i++) {
|
for (var i = 0; i < strlen; i++) {
|
||||||
var c = text.charAt(i);
|
var charCode = text.charCodeAt(i);
|
||||||
var cc = getClass(c, text.charCodeAt(i));
|
var cc = null;
|
||||||
|
|
||||||
|
// Adjustments
|
||||||
|
if (charCode == 8216 || charCode == 8217) {
|
||||||
|
// Curly quotes to straight
|
||||||
|
var c = "'";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var c = text.charAt(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consider single quote in the middle of a word a letter
|
||||||
|
if (c == "'" && word !== '') {
|
||||||
|
cc = kWbClassAlphaLetter;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!cc) {
|
||||||
|
cc = getClass(c, charCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
// When we reach space or punctuation, store the previous word if there is one
|
||||||
if (cc == kWbClassSpace || cc == kWbClassPunct) {
|
if (cc == kWbClassSpace || cc == kWbClassPunct) {
|
||||||
if (word != '') { words[word] = true; word = ''; }
|
if (word != '') {
|
||||||
|
words[word] = true;
|
||||||
|
word = '';
|
||||||
|
}
|
||||||
|
// When we reach Han character, store previous word and add Han character
|
||||||
} else if (cc == kWbClassHanLetter) {
|
} else if (cc == kWbClassHanLetter) {
|
||||||
if (word != '') { words[word] = true; word = ''; }
|
if (word !== '') {
|
||||||
|
words[word] = true;
|
||||||
|
word = '';
|
||||||
|
}
|
||||||
words[c] = true;
|
words[c] = true;
|
||||||
|
// Otherwise, if character class hasn't changed, keep adding characters to previous word
|
||||||
} else if (cc == cclass) {
|
} else if (cc == cclass) {
|
||||||
word += c.toLowerCase();
|
word += c.toLowerCase();
|
||||||
|
// If character class is different, store previous word and start new word
|
||||||
} else {
|
} else {
|
||||||
if (word != '') { words[word] = true; }
|
if (word !== '') {
|
||||||
|
words[word] = true;
|
||||||
|
}
|
||||||
word = c.toLowerCase();
|
word = c.toLowerCase();
|
||||||
}
|
}
|
||||||
cclass = cc;
|
cclass = cc;
|
||||||
}
|
}
|
||||||
if (word != '') { words[word] = true; }
|
if (word !== '') {
|
||||||
|
words[word] = true;
|
||||||
|
}
|
||||||
|
|
||||||
return Object.keys(words);
|
return Object.keys(words).map(function (w) {
|
||||||
|
// Trim trailing single quotes
|
||||||
|
if (w.slice(-1) == "'") {
|
||||||
|
w = w.substr(0, w.length - 1);
|
||||||
|
}
|
||||||
|
return w;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue