From c6a18b1b5919323413d81ebbdc84b950fa76eec0 Mon Sep 17 00:00:00 2001 From: Cheng Zhao Date: Fri, 19 Dec 2014 20:42:19 -0800 Subject: [PATCH] Split the words before sending it to spellCheck --- atom.gyp | 2 + .../api/atom_api_spell_check_client.cc | 94 +++- .../api/atom_api_spell_check_client.h | 28 +- atom/renderer/api/atom_api_web_frame.cc | 11 +- atom/renderer/api/atom_api_web_frame.h | 7 +- .../spellchecker/spellcheck_worditerator.cc | 422 ++++++++++++++++++ .../spellchecker/spellcheck_worditerator.h | 175 ++++++++ 7 files changed, 720 insertions(+), 19 deletions(-) create mode 100644 chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc create mode 100644 chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h diff --git a/atom.gyp b/atom.gyp index 17c1b60a111e..345474b5b6de 100644 --- a/atom.gyp +++ b/atom.gyp @@ -323,6 +323,8 @@ 'chromium_src/chrome/renderer/printing/print_web_view_helper_mac.mm', 'chromium_src/chrome/renderer/printing/print_web_view_helper_pdf_win.cc', 'chromium_src/chrome/renderer/printing/print_web_view_helper.h', + 'chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc', + 'chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h', 'chromium_src/chrome/renderer/tts_dispatcher.cc', 'chromium_src/chrome/renderer/tts_dispatcher.h', 'chromium_src/library_loaders/libgio_loader.cc', diff --git a/atom/renderer/api/atom_api_spell_check_client.cc b/atom/renderer/api/atom_api_spell_check_client.cc index 8b97af3cc663..198c42a189c3 100644 --- a/atom/renderer/api/atom_api_spell_check_client.cc +++ b/atom/renderer/api/atom_api_spell_check_client.cc @@ -54,22 +54,53 @@ bool HasWordCharacters(const base::string16& text, int index) { } // namespace SpellCheckClient::SpellCheckClient(v8::Isolate* isolate, + const std::string& language, v8::Handle provider) - : isolate_(isolate), provider_(isolate, provider) {} + : isolate_(isolate), provider_(isolate, provider) { + 
character_attributes_.SetDefaultLanguage(language); + + // Keep a persistent handle to the provider's "spellCheck" method. + mate::Dictionary dict(isolate, provider); + dict.Get("spellCheck", &spell_check_); +} SpellCheckClient::~SpellCheckClient() {} void SpellCheckClient::spellCheck( const blink::WebString& text, - int& misspelledOffset, - int& misspelledLength, - blink::WebVector* optionalSuggestions) { - blink::WebTextCheckingResult result; - if (!CallProviderMethod("spellCheck", text, &result)) + int& misspelling_start, + int& misspelling_len, + blink::WebVector* optional_suggestions) { + if (text.length() == 0 || spell_check_.IsEmpty()) return; - misspelledOffset = result.location; - misspelledLength = result.length; + base::string16 word; + int word_start; + int word_length; + if (!text_iterator_.IsInitialized() && + !text_iterator_.Initialize(&character_attributes_, true)) { + // We failed to initialize text_iterator_, return as spelled correctly. + VLOG(1) << "Failed to initialize SpellcheckWordIterator"; + return; + } + + base::string16 in_word(text); + text_iterator_.SetText(in_word.c_str(), in_word.size()); + while (text_iterator_.GetNextWord(&word, &word_start, &word_length)) { + // Found a word (or a contraction) that the spellchecker can check the + // spelling of. + if (CheckSpelling(word)) + continue; + + // If the given word is a concatenated word of two or more valid words + // (e.g. "hello:hello"), we should treat it as a valid word.
+ if (IsValidContraction(word)) + continue; + + misspelling_start = word_start; + misspelling_len = word_length; + return; + } } void SpellCheckClient::checkTextOfParagraph( @@ -90,13 +121,6 @@ void SpellCheckClient::requestCheckingOfText( const blink::WebVector& markersInText, const blink::WebVector& markerOffsets, blink::WebTextCheckingCompletion* completionCallback) { - v8::HandleScope handle_scope(isolate_); - v8::Handle provider = provider_.NewHandle(); - if (!provider->Has(mate::StringToV8(isolate_, "requestCheckingOfText"))) { - completionCallback->didCancelCheckingText(); - return; - } - base::string16 text(textToCheck); if (text.empty() || !HasWordCharacters(text, 0)) { completionCallback->didCancelCheckingText(); @@ -151,6 +175,46 @@ bool SpellCheckClient::CallProviderMethod(const char* method, return mate::ConvertFromV8(isolate_, v8_result, result);; } +bool SpellCheckClient::CheckSpelling(const base::string16& word_to_check) { + if (spell_check_.IsEmpty()) + return true; + + v8::HandleScope handle_scope(isolate_); + v8::Handle<v8::Value> word = mate::ConvertToV8(isolate_, word_to_check); + v8::Handle<v8::Value> result = spell_check_.NewHandle()->Call( + provider_.NewHandle(), 1, &word); + + // Treat an empty handle (provider threw) or non-boolean result as correct. + if (!result.IsEmpty() && result->IsBoolean()) + return result->BooleanValue(); + return true; +} + +// Returns whether or not the given string is a valid contraction. +// This function is a fall-back when the SpellcheckWordIterator class +// returns a concatenated word which is not in the selected dictionary +// (e.g. "in'n'out") but each word is valid. +bool SpellCheckClient::IsValidContraction(const base::string16& contraction) { + if (!contraction_iterator_.IsInitialized() && + !contraction_iterator_.Initialize(&character_attributes_, false)) { + // We failed to initialize the word iterator, return as spelled correctly.
+ VLOG(1) << "Failed to initialize contraction_iterator_"; + return true; + } + + contraction_iterator_.SetText(contraction.c_str(), contraction.length()); + + base::string16 word; + int word_start; + int word_length; + + while (contraction_iterator_.GetNextWord(&word, &word_start, &word_length)) { + if (!CheckSpelling(word)) + return false; + } + return true; +} + } // namespace api } // namespace atom diff --git a/atom/renderer/api/atom_api_spell_check_client.h b/atom/renderer/api/atom_api_spell_check_client.h index d6ec801734f6..e8c921b6a376 100644 --- a/atom/renderer/api/atom_api_spell_check_client.h +++ b/atom/renderer/api/atom_api_spell_check_client.h @@ -5,6 +5,10 @@ #ifndef ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ #define ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ +#include + +#include "base/callback.h" +#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" #include "native_mate/scoped_persistent.h" #include "third_party/WebKit/public/web/WebSpellCheckClient.h" @@ -14,7 +18,9 @@ namespace api { class SpellCheckClient : public blink::WebSpellCheckClient { public: - SpellCheckClient(v8::Isolate* isolate, v8::Handle provider); + SpellCheckClient(v8::Isolate* isolate, + const std::string& language, + v8::Handle provider); virtual ~SpellCheckClient(); private: @@ -44,8 +50,28 @@ class SpellCheckClient : public blink::WebSpellCheckClient { bool CallProviderMethod(const char* method, const blink::WebString& text, T* result); + // Call JavaScript to check spelling. + bool CheckSpelling(const base::string16& word_to_check); + + // Returns whether or not the given word is a contraction of valid words + // (e.g. "word:word"). + bool IsValidContraction(const base::string16& word); + + // Represents character attributes used for filtering out characters which + // are not supported by this SpellCheck object. + SpellcheckCharAttribute character_attributes_; + + // Represents word iterators used in this spellchecker. 
The |text_iterator_| + // splits text provided by WebKit into words, contractions, or concatenated + // words. The |contraction_iterator_| splits a concatenated word extracted by + // |text_iterator_| into word components so we can treat a concatenated word + // consisting only of correct words as a correct word. + SpellcheckWordIterator text_iterator_; + SpellcheckWordIterator contraction_iterator_; + v8::Isolate* isolate_; mate::ScopedPersistent provider_; + mate::ScopedPersistent spell_check_; DISALLOW_COPY_AND_ASSIGN(SpellCheckClient); }; diff --git a/atom/renderer/api/atom_api_web_frame.cc b/atom/renderer/api/atom_api_web_frame.cc index e1b016c5b5b5..0d514e823198 100644 --- a/atom/renderer/api/atom_api_web_frame.cc +++ b/atom/renderer/api/atom_api_web_frame.cc @@ -57,9 +57,16 @@ void WebFrame::AttachGuest(int id) { content::RenderFrame::FromWebFrame(web_frame_)->AttachGuest(id); } -void WebFrame::SetSpellCheckProvider(v8::Isolate* isolate, +void WebFrame::SetSpellCheckProvider(mate::Arguments* args, + const std::string& language, v8::Handle provider) { - spell_check_client_.reset(new SpellCheckClient(isolate, provider)); + v8::Isolate* isolate = args->isolate(); + if (!provider->Has(mate::StringToV8(isolate, "spellCheck"))) { + args->ThrowError("\"spellCheck\" has to be defined"); + return; + } + + spell_check_client_.reset(new SpellCheckClient(isolate, language, provider)); web_frame_->view()->setSpellCheckClient(spell_check_client_.get()); } diff --git a/atom/renderer/api/atom_api_web_frame.h b/atom/renderer/api/atom_api_web_frame.h index 077aa64e3b1f..d6556621ec37 100644 --- a/atom/renderer/api/atom_api_web_frame.h +++ b/atom/renderer/api/atom_api_web_frame.h @@ -15,6 +15,10 @@ namespace blink { class WebLocalFrame; } +namespace mate { +class Arguments; +} + namespace atom { namespace api { @@ -41,7 +45,8 @@ class WebFrame : public mate::Wrappable { void AttachGuest(int element_instance_id); // Set the provider that will be used by SpellCheckClient for 
spell check. - void SetSpellCheckProvider(v8::Isolate* isolate, + void SetSpellCheckProvider(mate::Arguments* args, + const std::string& language, v8::Handle provider); // mate::Wrappable: diff --git a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc new file mode 100644 index 000000000000..815a9c08b345 --- /dev/null +++ b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc @@ -0,0 +1,422 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Implements a custom word iterator used for our spellchecker. + +#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" + +#include +#include + +#include "base/basictypes.h" +#include "base/i18n/break_iterator.h" +#include "base/logging.h" +#include "base/strings/stringprintf.h" +#include "base/strings/utf_string_conversions.h" +#include "third_party/icu/source/common/unicode/normlzr.h" +#include "third_party/icu/source/common/unicode/schriter.h" +#include "third_party/icu/source/common/unicode/uscript.h" +#include "third_party/icu/source/i18n/unicode/ulocdata.h" + +// SpellcheckCharAttribute implementation: + +SpellcheckCharAttribute::SpellcheckCharAttribute() + : script_code_(USCRIPT_LATIN) { +} + +SpellcheckCharAttribute::~SpellcheckCharAttribute() { +} + +void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { + CreateRuleSets(language); +} + +base::string16 SpellcheckCharAttribute::GetRuleSet( + bool allow_contraction) const { + return allow_contraction ? + ruleset_allow_contraction_ : ruleset_disallow_contraction_; +} + +void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { + // The template for our custom rule sets, which is based on the word-break + // rules of ICU 4.0: + // . 
+ // The major differences from the original one are listed below: + // * It discards comments in the original rules. + // * It discards characters not needed by our spellchecker (e.g. numbers, + // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). + // * It allows customization of the $ALetter value (i.e. word characters). + // * It allows customization of the $ALetterPlus value (i.e. whether or not to + // use the dictionary data). + // * It allows choosing whether or not to split a text at contraction + // characters. + // This template only changes the forward-iteration rules. So, calling + // ubrk_prev() returns the same results as the original template. + static const char kRuleTemplate[] = + "!!chain;" + "$CR = [\\p{Word_Break = CR}];" + "$LF = [\\p{Word_Break = LF}];" + "$Newline = [\\p{Word_Break = Newline}];" + "$Extend = [\\p{Word_Break = Extend}];" + "$Format = [\\p{Word_Break = Format}];" + "$Katakana = [\\p{Word_Break = Katakana}];" + // Not all the characters in a given script are ALetter. + // For instance, U+05F4 is MidLetter. So, this may be + // better, but it leads to an empty set error in Thai. + // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" + "$ALetter = [\\p{script=%s}%s];" + // U+0027 (single quote/apostrophe) is not in MidNumLet any more + // in UAX 29 rev 21 or later. For our purpose, U+0027 + // has to be treated as MidNumLet. 
( http://crbug.com/364072 ) + "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" + "$MidLetter = [\\p{Word_Break = MidLetter}%s];" + "$MidNum = [\\p{Word_Break = MidNum}];" + "$Numeric = [\\p{Word_Break = Numeric}];" + "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" + + "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " + "%s" // ALetterPlus + + "$KatakanaEx = $Katakana ($Extend | $Format)*;" + "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" + "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" + "$MidLetterEx = $MidLetter ($Extend | $Format)*;" + "$MidNumEx = $MidNum ($Extend | $Format)*;" + "$NumericEx = $Numeric ($Extend | $Format)*;" + "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" + + "$Hiragana = [\\p{script=Hiragana}];" + "$Ideographic = [\\p{Ideographic}];" + "$HiraganaEx = $Hiragana ($Extend | $Format)*;" + "$IdeographicEx = $Ideographic ($Extend | $Format)*;" + + "!!forward;" + "$CR $LF;" + "[^$CR $LF $Newline]? ($Extend | $Format)+;" + "$ALetterEx {200};" + "$ALetterEx $ALetterEx {200};" + "%s" // (Allow|Disallow) Contraction + + "!!reverse;" + "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" + "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" + "$BackNumericEx = ($Format | $Extend)* $Numeric;" + "$BackMidNumEx = ($Format | $Extend)* $MidNum;" + "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" + "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" + "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" + "$LF $CR;" + "($Format | $Extend)* [^$CR $LF $Newline]?;" + "$BackALetterEx $BackALetterEx;" + "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" + "$BackNumericEx $BackNumericEx;" + "$BackNumericEx $BackALetterEx;" + "$BackALetterEx $BackNumericEx;" + "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" + "$BackKatakanaEx $BackKatakanaEx;" + "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" + " $BackKatakanaEx | $BackExtendNumLetEx);" + "($BackALetterEx | $BackNumericEx | 
$BackKatakanaEx)" + " $BackExtendNumLetEx;" + + "!!safe_reverse;" + "($Extend | $Format)+ .?;" + "($MidLetter | $MidNumLet) $BackALetterEx;" + "($MidNum | $MidNumLet) $BackNumericEx;" + + "!!safe_forward;" + "($Extend | $Format)+ .?;" + "($MidLetterEx | $MidNumLetEx) $ALetterEx;" + "($MidNumEx | $MidNumLetEx) $NumericEx;"; + + // Retrieve the script codes used by the given language from ICU. When the + // given language consists of two or more scripts, we just use the first + // script. The size of returned script codes is always < 8. Therefore, we use + // an array of size 8 so we can include all script codes without insufficient + // buffer errors. + UErrorCode error = U_ZERO_ERROR; + UScriptCode script_code[8]; + int scripts = uscript_getCode(language.c_str(), script_code, + arraysize(script_code), &error); + if (U_SUCCESS(error) && scripts >= 1) + script_code_ = script_code[0]; + + // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary + // only for the languages which need it (i.e. Korean and Thai) to prevent ICU + // from returning dictionary words (i.e. Korean or Thai words) for languages + // which don't need them. + const char* aletter = uscript_getName(script_code_); + if (!aletter) + aletter = "Latin"; + + const char kWithDictionary[] = + "$dictionary = [:LineBreak = Complex_Context:];" + "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; + const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; + const char* aletter_plus = kWithoutDictionary; + if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || + script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) + aletter_plus = kWithDictionary; + + // Treat numbers as word characters except for Arabic and Hebrew. 
+ const char* aletter_extra = " [0123456789]"; + if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) + aletter_extra = ""; + + const char kMidLetterExtra[] = ""; + // For Hebrew, treat single/double quotation marks as MidLetter. + const char kMidLetterExtraHebrew[] = "\"'"; + const char* midletter_extra = kMidLetterExtra; + if (script_code_ == USCRIPT_HEBREW) + midletter_extra = kMidLetterExtraHebrew; + + // Create two custom rule-sets: one allows contraction and the other does not. + // We save these strings in UTF-16 so we can use them without conversions. + // (ICU needs UTF-16 strings.) + const char kAllowContraction[] = + "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; + const char kDisallowContraction[] = ""; + + ruleset_allow_contraction_ = base::ASCIIToUTF16( + base::StringPrintf(kRuleTemplate, + aletter, + aletter_extra, + midletter_extra, + aletter_plus, + kAllowContraction)); + ruleset_disallow_contraction_ = base::ASCIIToUTF16( + base::StringPrintf(kRuleTemplate, + aletter, + aletter_extra, + midletter_extra, + aletter_plus, + kDisallowContraction)); +} + +bool SpellcheckCharAttribute::OutputChar(UChar c, + base::string16* output) const { + // Dispatch on the script of the spellchecker language: Arabic, Hangul and + // Hebrew have dedicated filters; every other script uses the default one. + switch (script_code_) { + case USCRIPT_ARABIC: + return OutputArabic(c, output); + + case USCRIPT_HANGUL: + return OutputHangul(c, output); + + case USCRIPT_HEBREW: + return OutputHebrew(c, output); + + default: + return OutputDefault(c, output); + } +} + +bool SpellcheckCharAttribute::OutputArabic(UChar c, + base::string16* output) const { + // Discard characters not from Arabic alphabets. We also discard vowel marks + // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from + // marking an Arabic word including vowel marks as misspelled. (We need to + // check these vowel marks manually and filter them out since their script + // codes are USCRIPT_ARABIC.)
+ if (0x0621 <= c && c <= 0x064D) + output->push_back(c); + return true; +} + +bool SpellcheckCharAttribute::OutputHangul(UChar c, + base::string16* output) const { + // Decompose a Hangul character to a Hangul vowel and consonants used by our + // spellchecker. A Hangul character of Unicode is a ligature consisting of a + // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G", + // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as + // a point of a cubic linear space consisting of (first consonant, vowel, last + // consonant). Therefore, we can compose a Hangul character from a vowel and + // two consonants with linear composition: + // character = 0xAC00 + + // (first consonant - 0x1100) * 28 * 21 + + // (vowel - 0x1161) * 28 + + // (last consonant - 0x11A7); + // We can also decompose a Hangul character with linear decomposition: + // first consonant = (character - 0xAC00) / 28 / 21; + // vowel = (character - 0xAC00) / 28 % 21; + // last consonant = (character - 0xAC00) % 28; + // This code is copied from Unicode Standard Annex #15 + // and added some comments. + const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters. + const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants. + const int kVBase = 0x1161; // U+1161: the top of Hangul vowels. + const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants. + const int kLCount = 19; // The number of Hangul first consonants. + const int kVCount = 21; // The number of Hangul vowels. + const int kTCount = 28; // The number of Hangul last consonants. + const int kNCount = kVCount * kTCount; + const int kSCount = kLCount * kNCount; + + int index = c - kSBase; + if (index < 0 || index >= kSCount) { + // Not a precomposed Hangul syllable (U+AC00..U+D7A3), so the arithmetic + // below does not apply. Let the default output function handle it. + return OutputDefault(c, output); + } + + // This is a Hangul character.
Decompose this characters into Hangul vowels + // and consonants. + int l = kLBase + index / kNCount; + int v = kVBase + (index % kNCount) / kTCount; + int t = kTBase + index % kTCount; + output->push_back(l); + output->push_back(v); + if (t != kTBase) + output->push_back(t); + return true; +} + +bool SpellcheckCharAttribute::OutputHebrew(UChar c, + base::string16* output) const { + // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds + // to prevent our Hebrew dictionary from marking a Hebrew word including + // niqquds as misspelled. (Same as Arabic vowel marks, we need to check + // niqquds manually and filter them out since their script codes are + // USCRIPT_HEBREW.) + // Pass through ASCII single/double quotation marks and Hebrew Geresh and + // Gershayim. + if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || + c == 0x05F4 || c == 0x05F3) + output->push_back(c); + return true; +} + +bool SpellcheckCharAttribute::OutputDefault(UChar c, + base::string16* output) const { + // Check the script code of this character and output only if it is the one + // used by the spellchecker language. + UErrorCode status = U_ZERO_ERROR; + UScriptCode script_code = uscript_getScript(c, &status); + if (script_code == script_code_ || script_code == USCRIPT_COMMON) + output->push_back(c); + return true; +} + +// SpellcheckWordIterator implementation: + +SpellcheckWordIterator::SpellcheckWordIterator() + : text_(NULL), + attribute_(NULL), + iterator_() { +} + +SpellcheckWordIterator::~SpellcheckWordIterator() { + Reset(); +} + +bool SpellcheckWordIterator::Initialize( + const SpellcheckCharAttribute* attribute, + bool allow_contraction) { + // Create a custom ICU break iterator with empty text used in this object. (We + // allow setting text later so we can re-use this iterator.) + DCHECK(attribute); + const base::string16 rule(attribute->GetRuleSet(allow_contraction)); + + // If there is no rule set, the attributes were invalid. 
+ if (rule.empty()) + return false; + + scoped_ptr<base::i18n::BreakIterator> iterator( + new base::i18n::BreakIterator(base::string16(), rule)); + if (!iterator->Init()) { + // Since we're not passing in any text, the only reason this could fail + // is if we fail to parse the rules. Since the rules are hardcoded, + // that would be a bug in this class. + NOTREACHED() << "failed to open iterator (broken rules)"; + return false; + } + iterator_ = iterator.Pass(); + + // Set the character attributes so we can normalize the words extracted by + // this iterator. + attribute_ = attribute; + return true; +} + +bool SpellcheckWordIterator::IsInitialized() const { + // Return true iff we have an iterator. + return !!iterator_; +} + +bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { + DCHECK(!!iterator_); + + // Set the text to be split by this iterator. + if (!iterator_->SetText(text, length)) { + LOG(ERROR) << "failed to set text"; + return false; + } + + text_ = text; + return true; +} + +bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, + int* word_start, + int* word_length) { + DCHECK(!!text_); + + word_string->clear(); + *word_start = 0; + *word_length = 0; + + if (!text_) { + return false; + } + + // Find a word that can be checked for spelling. Our rule sets filter out + // invalid words (e.g. numbers and characters not supported by the + // spellchecker language) so this ubrk_getRuleStatus() call returns + // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such + // words until we can find a valid word or reach the end of the input string. + while (iterator_->Advance()) { + const size_t start = iterator_->prev(); + const size_t length = iterator_->pos() - start; + if (iterator_->IsWord()) { + if (Normalize(start, length, word_string)) { + *word_start = start; + *word_length = length; + return true; + } + } + } + + // There aren't any more words in the given text.
+ return false; +} + +void SpellcheckWordIterator::Reset() { + iterator_.reset(); +} + +bool SpellcheckWordIterator::Normalize(int input_start, + int input_length, + base::string16* output_string) const { + // We use NFKC (Normalization Form, Compatible decomposition, followed by + // canonical Composition) defined in Unicode Standard Annex #15 to normalize + // this token because it is the most suitable normalization algorithm for our + // spellchecker. Nevertheless, it is not a perfect algorithm for our + // spellchecker and we need manual normalization as well. The normalized + // text does not have to be NUL-terminated since its characters are copied to + // string16, which adds a NUL character when needed. + icu::UnicodeString input(FALSE, &text_[input_start], input_length); + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeString output; + icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); + if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) + return false; + + // Copy the normalized text to the output. + icu::StringCharacterIterator it(output); + for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) + attribute_->OutputChar(c, output_string); + + return !output_string->empty(); +} diff --git a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h new file mode 100644 index 000000000000..2ac28a2e2402 --- /dev/null +++ b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h @@ -0,0 +1,175 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Defines an iterator class that enumerates words supported by our spellchecker +// from multi-language text. This class is used for filtering out characters +// not supported by our spellchecker.
+ +#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ +#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ + +#include <string> + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "base/strings/string16.h" +#include "third_party/icu/source/common/unicode/uscript.h" + +namespace base { +namespace i18n { +class BreakIterator; +} // namespace i18n +} // namespace base + +// A class which encapsulates language-specific operations used by +// SpellcheckWordIterator. When we set the spellchecker language, this class +// creates rule sets that filter out the characters not supported by the +// spellchecker. (Please read the comment in the SpellcheckWordIterator class +// about how to use this class.) +class SpellcheckCharAttribute { + public: + SpellcheckCharAttribute(); + ~SpellcheckCharAttribute(); + + // Sets the language of the spellchecker. When this function is called with an + // ISO language code, this function creates the custom rule-sets used by + // the ICU break iterator so it can extract only words used by the language. + // GetRuleSet() returns the rule-sets created in this function. + void SetDefaultLanguage(const std::string& language); + + // Returns a custom rule-set string used by the ICU break iterator. This class + // has two rule-sets, one splits a contraction and the other does not, so we + // can split a concatenated word (e.g. "seven-year-old") into words (e.g. + // "seven", "year", and "old") and check their spellings. The result string is + // encoded in UTF-16 since ICU needs UTF-16 strings. + base::string16 GetRuleSet(bool allow_contraction) const; + + // Outputs a character only if it is a word character. (Please read the + // comments in CreateRuleSets() why we need this function.) + bool OutputChar(UChar c, base::string16* output) const; + + private: + // Creates the rule-sets that return words possibly used by the given + // language.
Unfortunately, these rule-sets are not perfect and have some + // false-positives. For example, they return combined accent marks even though + // we need English words only. We call OutputChar() to filter out such + // false-positive characters. + void CreateRuleSets(const std::string& language); + + // Outputs a character only if it is one used by the given language. These + // functions are called from OutputChar(). + bool OutputArabic(UChar c, base::string16* output) const; + bool OutputHangul(UChar c, base::string16* output) const; + bool OutputHebrew(UChar c, base::string16* output) const; + bool OutputDefault(UChar c, base::string16* output) const; + + // The custom rule-set strings used by ICU break iterator. Since it is not so + // easy to create custom rule-sets from an ISO language code, this class + // saves these rule-set strings created when we set the language. + base::string16 ruleset_allow_contraction_; + base::string16 ruleset_disallow_contraction_; + + // The script code used by this language. + UScriptCode script_code_; + + DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); +}; + +// A class which extracts words that can be checked for spelling from a +// multi-language string. The ICU word-break iterator does not discard some +// punctuation characters attached to a word. For example, when we set a word +// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does +// it discard characters not used by the language. For example, it returns +// Russian words even though we need English words only. To extract only the +// words whose spellings our spellchecker can check, this class uses custom +// rule-sets created by the SpellcheckCharAttribute class. Also, this class +// normalizes extracted words so our spellchecker can check the spellings of +// words that include ligatures, combined characters, full-width characters, +// etc.
This class uses UTF-16 strings as its input and output strings since +// UTF-16 is the native encoding of ICU and avoids unnecessary conversions +// when changing the encoding of this string for our spellchecker. (Chrome can +// use two or more spellcheckers and we cannot assume their encodings.) +// The following snippet is an example that extracts words with this class. +// +// // Creates the language-specific attributes for US English. +// SpellcheckCharAttribute attribute; +// attribute.SetDefaultLanguage("en-US"); +// +// // Set up a SpellcheckWordIterator object which extracts English words, +// // and retrieve them. +// SpellcheckWordIterator iterator; +// base::string16 text(base::UTF8ToUTF16("this is a test.")); +// iterator.Initialize(&attribute, true); +// iterator.SetText(text.c_str(), text.length()); +// +// base::string16 word; +// int offset; +// int length; +// while (iterator.GetNextWord(&word, &offset, &length)) { +// ... +// } +// +class SpellcheckWordIterator { + public: + SpellcheckWordIterator(); + ~SpellcheckWordIterator(); + + // Initializes a word-iterator object with the language-specific attribute. If + // we need to split contractions and concatenated words, call this function + // with its 'allow_contraction' parameter false. (This function uses lots of + // temporary memory to compile a custom word-break rule into an automaton.) + bool Initialize(const SpellcheckCharAttribute* attribute, + bool allow_contraction); + + // Returns whether this word iterator is initialized. + bool IsInitialized() const; + + // Sets the text to be iterated (need not be NUL-terminated) and resets the + // internal state so this iterator can be reused without Initialize(). The + // buffer is not copied, so it must outlive the GetNextWord() calls. + bool SetText(const base::char16* text, size_t length); + + // Retrieves a word (or a contraction), stores its copy to 'word_string', and + // stores the position and the length for input word to 'word_start'.
Since + // this function normalizes the output word, the length of 'word_string' may + // be different from the 'word_length'. Therefore, when we call functions that + // change the input text, such as string16::replace(), we need to use + // 'word_start' and 'word_length' as listed in the following snippet. + // + // while(iterator.GetNextWord(&word, &offset, &length)) + // text.replace(offset, length, word); + // + bool GetNextWord(base::string16* word_string, + int* word_start, + int* word_length); + + // Releases all the resources attached to this object. + void Reset(); + + private: + // Normalizes a non-terminated string returned from an ICU word-break + // iterator. A word returned from an ICU break iterator may include characters + // not supported by our spellchecker, e.g. ligatures, combining characters, + // full-width letters, etc. This function replaces such characters with + // alternative characters supported by our spellchecker. This function also + // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive + // characters. + bool Normalize(int input_start, + int input_length, + base::string16* output_string) const; + + // The pointer to the input string from which we are extracting words. + const base::char16* text_; + + // The language-specific attributes used for filtering out non-word + // characters. + const SpellcheckCharAttribute* attribute_; + + // The break iterator. + scoped_ptr<base::i18n::BreakIterator> iterator_; + + DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); +}; + +#endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_