Split the text into words before sending them to spellCheck
This commit is contained in:
		
					parent
					
						
							
								b801a93dc5
							
						
					
				
			
			
				commit
				
					
						c6a18b1b59
					
				
			
		
					 7 changed files with 720 additions and 19 deletions
				
			
		
							
								
								
									
										2
									
								
								atom.gyp
									
										
									
									
									
								
							
							
						
						
									
										2
									
								
								atom.gyp
									
										
									
									
									
								
							|  | @ -323,6 +323,8 @@ | ||||||
|       'chromium_src/chrome/renderer/printing/print_web_view_helper_mac.mm', |       'chromium_src/chrome/renderer/printing/print_web_view_helper_mac.mm', | ||||||
|       'chromium_src/chrome/renderer/printing/print_web_view_helper_pdf_win.cc', |       'chromium_src/chrome/renderer/printing/print_web_view_helper_pdf_win.cc', | ||||||
|       'chromium_src/chrome/renderer/printing/print_web_view_helper.h', |       'chromium_src/chrome/renderer/printing/print_web_view_helper.h', | ||||||
|  |       'chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc', | ||||||
|  |       'chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h', | ||||||
|       'chromium_src/chrome/renderer/tts_dispatcher.cc', |       'chromium_src/chrome/renderer/tts_dispatcher.cc', | ||||||
|       'chromium_src/chrome/renderer/tts_dispatcher.h', |       'chromium_src/chrome/renderer/tts_dispatcher.h', | ||||||
|       'chromium_src/library_loaders/libgio_loader.cc', |       'chromium_src/library_loaders/libgio_loader.cc', | ||||||
|  |  | ||||||
|  | @ -54,22 +54,53 @@ bool HasWordCharacters(const base::string16& text, int index) { | ||||||
| }  // namespace
 | }  // namespace
 | ||||||
| 
 | 
 | ||||||
| SpellCheckClient::SpellCheckClient(v8::Isolate* isolate, | SpellCheckClient::SpellCheckClient(v8::Isolate* isolate, | ||||||
|  |                                    const std::string& language, | ||||||
|                                    v8::Handle<v8::Object> provider) |                                    v8::Handle<v8::Object> provider) | ||||||
|     : isolate_(isolate), provider_(isolate, provider) {} |     : isolate_(isolate), provider_(isolate, provider) { | ||||||
|  |   character_attributes_.SetDefaultLanguage(language); | ||||||
|  | 
 | ||||||
|  |   // Keep a persistent handle to the provider's "spellCheck" method.
 | ||||||
|  |   mate::Dictionary dict(isolate, provider); | ||||||
|  |   dict.Get("spellCheck", &spell_check_); | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| SpellCheckClient::~SpellCheckClient() {} | SpellCheckClient::~SpellCheckClient() {} | ||||||
| 
 | 
 | ||||||
| void SpellCheckClient::spellCheck( | void SpellCheckClient::spellCheck( | ||||||
|     const blink::WebString& text, |     const blink::WebString& text, | ||||||
|     int& misspelledOffset, |     int& misspelling_start, | ||||||
|     int& misspelledLength, |     int& misspelling_len, | ||||||
|     blink::WebVector<blink::WebString>* optionalSuggestions) { |     blink::WebVector<blink::WebString>* optional_suggestions) { | ||||||
|   blink::WebTextCheckingResult result; |   if (text.length() == 0 || spell_check_.IsEmpty()) | ||||||
|   if (!CallProviderMethod("spellCheck", text, &result)) |  | ||||||
|     return; |     return; | ||||||
| 
 | 
 | ||||||
|   misspelledOffset = result.location; |   base::string16 word; | ||||||
|   misspelledLength = result.length; |   int word_start; | ||||||
|  |   int word_length; | ||||||
|  |   if (!text_iterator_.IsInitialized() && | ||||||
|  |       !text_iterator_.Initialize(&character_attributes_, true)) { | ||||||
|  |       // We failed to initialize text_iterator_, return as spelled correctly.
 | ||||||
|  |       VLOG(1) << "Failed to initialize SpellcheckWordIterator"; | ||||||
|  |       return; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   base::string16 in_word(text); | ||||||
|  |   text_iterator_.SetText(in_word.c_str(), in_word.size()); | ||||||
|  |   while (text_iterator_.GetNextWord(&word, &word_start, &word_length)) { | ||||||
|  |     // Found a word (or a contraction) that the spellchecker can check the
 | ||||||
|  |     // spelling of.
 | ||||||
|  |     if (CheckSpelling(word)) | ||||||
|  |       continue; | ||||||
|  | 
 | ||||||
|  |     // If the given word is a concatenated word of two or more valid words
 | ||||||
|  |     // (e.g. "hello:hello"), we should treat it as a valid word.
 | ||||||
|  |     if (IsValidContraction(word)) | ||||||
|  |       continue; | ||||||
|  | 
 | ||||||
|  |     misspelling_start = word_start; | ||||||
|  |     misspelling_len = word_length; | ||||||
|  |     return; | ||||||
|  |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void SpellCheckClient::checkTextOfParagraph( | void SpellCheckClient::checkTextOfParagraph( | ||||||
|  | @ -90,13 +121,6 @@ void SpellCheckClient::requestCheckingOfText( | ||||||
|     const blink::WebVector<uint32_t>& markersInText, |     const blink::WebVector<uint32_t>& markersInText, | ||||||
|     const blink::WebVector<unsigned>& markerOffsets, |     const blink::WebVector<unsigned>& markerOffsets, | ||||||
|     blink::WebTextCheckingCompletion* completionCallback) { |     blink::WebTextCheckingCompletion* completionCallback) { | ||||||
|   v8::HandleScope handle_scope(isolate_); |  | ||||||
|   v8::Handle<v8::Object> provider = provider_.NewHandle(); |  | ||||||
|   if (!provider->Has(mate::StringToV8(isolate_, "requestCheckingOfText"))) { |  | ||||||
|     completionCallback->didCancelCheckingText(); |  | ||||||
|     return; |  | ||||||
|   } |  | ||||||
| 
 |  | ||||||
|   base::string16 text(textToCheck); |   base::string16 text(textToCheck); | ||||||
|   if (text.empty() || !HasWordCharacters(text, 0)) { |   if (text.empty() || !HasWordCharacters(text, 0)) { | ||||||
|     completionCallback->didCancelCheckingText(); |     completionCallback->didCancelCheckingText(); | ||||||
|  | @ -151,6 +175,46 @@ bool SpellCheckClient::CallProviderMethod(const char* method, | ||||||
|   return mate::ConvertFromV8(isolate_, v8_result, result);; |   return mate::ConvertFromV8(isolate_, v8_result, result);; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | bool SpellCheckClient::CheckSpelling(const base::string16& word_to_check) { | ||||||
|  |   if (spell_check_.IsEmpty()) | ||||||
|  |     return true; | ||||||
|  | 
 | ||||||
|  |   v8::HandleScope handle_scope(isolate_); | ||||||
|  |   v8::Handle<v8::Value> word = mate::ConvertToV8(isolate_, word_to_check); | ||||||
|  |   v8::Handle<v8::Value> result = spell_check_.NewHandle()->Call( | ||||||
|  |       provider_.NewHandle(), 1, &word); | ||||||
|  | 
 | ||||||
|  |   if (result->IsBoolean()) | ||||||
|  |     return result->BooleanValue(); | ||||||
|  |   else | ||||||
|  |     return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Returns whether or not the given string is a valid contraction.
 | ||||||
|  | // This function is a fall-back when the SpellcheckWordIterator class
 | ||||||
|  | // returns a concatenated word which is not in the selected dictionary
 | ||||||
|  | // (e.g. "in'n'out") but each word is valid.
 | ||||||
|  | bool SpellCheckClient::IsValidContraction(const base::string16& contraction) { | ||||||
|  |   if (!contraction_iterator_.IsInitialized() && | ||||||
|  |       !contraction_iterator_.Initialize(&character_attributes_, false)) { | ||||||
|  |     // We failed to initialize the word iterator, return as spelled correctly.
 | ||||||
|  |     VLOG(1) << "Failed to initialize contraction_iterator_"; | ||||||
|  |     return true; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   contraction_iterator_.SetText(contraction.c_str(), contraction.length()); | ||||||
|  | 
 | ||||||
|  |   base::string16 word; | ||||||
|  |   int word_start; | ||||||
|  |   int word_length; | ||||||
|  | 
 | ||||||
|  |   while (contraction_iterator_.GetNextWord(&word, &word_start, &word_length)) { | ||||||
|  |     if (!CheckSpelling(word)) | ||||||
|  |       return false; | ||||||
|  |   } | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| }  // namespace api
 | }  // namespace api
 | ||||||
| 
 | 
 | ||||||
| }  // namespace atom
 | }  // namespace atom
 | ||||||
|  |  | ||||||
|  | @ -5,6 +5,10 @@ | ||||||
| #ifndef ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ | #ifndef ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ | ||||||
| #define ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ | #define ATOM_RENDERER_API_ATOM_API_SPELL_CHECK_CLIENT_H_ | ||||||
| 
 | 
 | ||||||
|  | #include <string> | ||||||
|  | 
 | ||||||
|  | #include "base/callback.h" | ||||||
|  | #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | ||||||
| #include "native_mate/scoped_persistent.h" | #include "native_mate/scoped_persistent.h" | ||||||
| #include "third_party/WebKit/public/web/WebSpellCheckClient.h" | #include "third_party/WebKit/public/web/WebSpellCheckClient.h" | ||||||
| 
 | 
 | ||||||
|  | @ -14,7 +18,9 @@ namespace api { | ||||||
| 
 | 
 | ||||||
| class SpellCheckClient : public blink::WebSpellCheckClient { | class SpellCheckClient : public blink::WebSpellCheckClient { | ||||||
|  public: |  public: | ||||||
|   SpellCheckClient(v8::Isolate* isolate, v8::Handle<v8::Object> provider); |   SpellCheckClient(v8::Isolate* isolate, | ||||||
|  |                    const std::string& language, | ||||||
|  |                    v8::Handle<v8::Object> provider); | ||||||
|   virtual ~SpellCheckClient(); |   virtual ~SpellCheckClient(); | ||||||
| 
 | 
 | ||||||
|  private: |  private: | ||||||
|  | @ -44,8 +50,28 @@ class SpellCheckClient : public blink::WebSpellCheckClient { | ||||||
|   bool CallProviderMethod(const char* method, const blink::WebString& text, |   bool CallProviderMethod(const char* method, const blink::WebString& text, | ||||||
|                           T* result); |                           T* result); | ||||||
| 
 | 
 | ||||||
|  |   // Call JavaScript to check spelling.
 | ||||||
|  |   bool CheckSpelling(const base::string16& word_to_check); | ||||||
|  | 
 | ||||||
|  |   // Returns whether or not the given word is a contraction of valid words
 | ||||||
|  |   // (e.g. "word:word").
 | ||||||
|  |   bool IsValidContraction(const base::string16& word); | ||||||
|  | 
 | ||||||
|  |   // Represents character attributes used for filtering out characters which
 | ||||||
|  |   // are not supported by this SpellCheck object.
 | ||||||
|  |   SpellcheckCharAttribute character_attributes_; | ||||||
|  | 
 | ||||||
|  |   // Represents word iterators used in this spellchecker. The |text_iterator_|
 | ||||||
|  |   // splits text provided by WebKit into words, contractions, or concatenated
 | ||||||
|  |   // words. The |contraction_iterator_| splits a concatenated word extracted by
 | ||||||
|  |   // |text_iterator_| into word components so we can treat a concatenated word
 | ||||||
|  |   // consisting only of correct words as a correct word.
 | ||||||
|  |   SpellcheckWordIterator text_iterator_; | ||||||
|  |   SpellcheckWordIterator contraction_iterator_; | ||||||
|  | 
 | ||||||
|   v8::Isolate* isolate_; |   v8::Isolate* isolate_; | ||||||
|   mate::ScopedPersistent<v8::Object> provider_; |   mate::ScopedPersistent<v8::Object> provider_; | ||||||
|  |   mate::ScopedPersistent<v8::Function> spell_check_; | ||||||
| 
 | 
 | ||||||
|   DISALLOW_COPY_AND_ASSIGN(SpellCheckClient); |   DISALLOW_COPY_AND_ASSIGN(SpellCheckClient); | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -57,9 +57,16 @@ void WebFrame::AttachGuest(int id) { | ||||||
|   content::RenderFrame::FromWebFrame(web_frame_)->AttachGuest(id); |   content::RenderFrame::FromWebFrame(web_frame_)->AttachGuest(id); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void WebFrame::SetSpellCheckProvider(v8::Isolate* isolate, | void WebFrame::SetSpellCheckProvider(mate::Arguments* args, | ||||||
|  |                                      const std::string& language, | ||||||
|                                      v8::Handle<v8::Object> provider) { |                                      v8::Handle<v8::Object> provider) { | ||||||
|   spell_check_client_.reset(new SpellCheckClient(isolate, provider)); |   v8::Isolate* isolate = args->isolate(); | ||||||
|  |   if (!provider->Has(mate::StringToV8(isolate, "spellCheck"))) { | ||||||
|  |     args->ThrowError("\"spellCheck\" has to be defined"); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   spell_check_client_.reset(new SpellCheckClient(isolate, language, provider)); | ||||||
|   web_frame_->view()->setSpellCheckClient(spell_check_client_.get()); |   web_frame_->view()->setSpellCheckClient(spell_check_client_.get()); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -15,6 +15,10 @@ namespace blink { | ||||||
| class WebLocalFrame; | class WebLocalFrame; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | namespace mate { | ||||||
|  | class Arguments; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| namespace atom { | namespace atom { | ||||||
| 
 | 
 | ||||||
| namespace api { | namespace api { | ||||||
|  | @ -41,7 +45,8 @@ class WebFrame : public mate::Wrappable { | ||||||
|   void AttachGuest(int element_instance_id); |   void AttachGuest(int element_instance_id); | ||||||
| 
 | 
 | ||||||
|   // Set the provider that will be used by SpellCheckClient for spell check.
 |   // Set the provider that will be used by SpellCheckClient for spell check.
 | ||||||
|   void SetSpellCheckProvider(v8::Isolate* isolate, |   void SetSpellCheckProvider(mate::Arguments* args, | ||||||
|  |                              const std::string& language, | ||||||
|                              v8::Handle<v8::Object> provider); |                              v8::Handle<v8::Object> provider); | ||||||
| 
 | 
 | ||||||
|   // mate::Wrappable:
 |   // mate::Wrappable:
 | ||||||
|  |  | ||||||
|  | @ -0,0 +1,422 @@ | ||||||
|  | // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 | ||||||
|  | // Use of this source code is governed by a BSD-style license that can be
 | ||||||
|  | // found in the LICENSE file.
 | ||||||
|  | 
 | ||||||
|  | // Implements a custom word iterator used for our spellchecker.
 | ||||||
|  | 
 | ||||||
|  | #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | ||||||
|  | 
 | ||||||
|  | #include <map> | ||||||
|  | #include <string> | ||||||
|  | 
 | ||||||
|  | #include "base/basictypes.h" | ||||||
|  | #include "base/i18n/break_iterator.h" | ||||||
|  | #include "base/logging.h" | ||||||
|  | #include "base/strings/stringprintf.h" | ||||||
|  | #include "base/strings/utf_string_conversions.h" | ||||||
|  | #include "third_party/icu/source/common/unicode/normlzr.h" | ||||||
|  | #include "third_party/icu/source/common/unicode/schriter.h" | ||||||
|  | #include "third_party/icu/source/common/unicode/uscript.h" | ||||||
|  | #include "third_party/icu/source/i18n/unicode/ulocdata.h" | ||||||
|  | 
 | ||||||
|  | // SpellcheckCharAttribute implementation:
 | ||||||
|  | 
 | ||||||
|  | SpellcheckCharAttribute::SpellcheckCharAttribute() | ||||||
|  |     : script_code_(USCRIPT_LATIN) { | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | SpellcheckCharAttribute::~SpellcheckCharAttribute() { | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { | ||||||
|  |   CreateRuleSets(language); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | base::string16 SpellcheckCharAttribute::GetRuleSet( | ||||||
|  |     bool allow_contraction) const { | ||||||
|  |   return allow_contraction ? | ||||||
|  |       ruleset_allow_contraction_ : ruleset_disallow_contraction_; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { | ||||||
|  |   // The template for our custom rule sets, which is based on the word-break
 | ||||||
|  |   // rules of ICU 4.0:
 | ||||||
|  |   // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/word.txt>.
 | ||||||
|  |   // The major differences from the original one are listed below:
 | ||||||
|  |   // * It discards comments in the original rules.
 | ||||||
|  |   // * It discards characters not needed by our spellchecker (e.g. numbers,
 | ||||||
|  |   //   punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on).
 | ||||||
|  |   // * It allows customization of the $ALetter value (i.e. word characters).
 | ||||||
|  |   // * It allows customization of the $ALetterPlus value (i.e. whether or not to
 | ||||||
|  |   //   use the dictionary data).
 | ||||||
|  |   // * It allows choosing whether or not to split a text at contraction
 | ||||||
|  |   //   characters.
 | ||||||
|  |   // This template only changes the forward-iteration rules. So, calling
 | ||||||
|  |   // ubrk_prev() returns the same results as the original template.
 | ||||||
|  |   static const char kRuleTemplate[] = | ||||||
|  |       "!!chain;" | ||||||
|  |       "$CR           = [\\p{Word_Break = CR}];" | ||||||
|  |       "$LF           = [\\p{Word_Break = LF}];" | ||||||
|  |       "$Newline      = [\\p{Word_Break = Newline}];" | ||||||
|  |       "$Extend       = [\\p{Word_Break = Extend}];" | ||||||
|  |       "$Format       = [\\p{Word_Break = Format}];" | ||||||
|  |       "$Katakana     = [\\p{Word_Break = Katakana}];" | ||||||
|  |       // Not all the characters in a given script are ALetter.
 | ||||||
|  |       // For instance, U+05F4 is MidLetter. So, this may be
 | ||||||
|  |       // better, but it leads to an empty set error in Thai.
 | ||||||
|  |       // "$ALetter   = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
 | ||||||
|  |       "$ALetter      = [\\p{script=%s}%s];" | ||||||
|  |       // U+0027 (single quote/apostrophe) is not in MidNumLet any more
 | ||||||
|  |       // in UAX 29 rev 21 or later. For our purpose, U+0027
 | ||||||
|  |       // has to be treated as MidNumLet. ( http://crbug.com/364072 )
 | ||||||
|  |       "$MidNumLet    = [\\p{Word_Break = MidNumLet} \\u0027];" | ||||||
|  |       "$MidLetter    = [\\p{Word_Break = MidLetter}%s];" | ||||||
|  |       "$MidNum       = [\\p{Word_Break = MidNum}];" | ||||||
|  |       "$Numeric      = [\\p{Word_Break = Numeric}];" | ||||||
|  |       "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | ||||||
|  | 
 | ||||||
|  |       "$Control        = [\\p{Grapheme_Cluster_Break = Control}]; " | ||||||
|  |       "%s"  // ALetterPlus
 | ||||||
|  | 
 | ||||||
|  |       "$KatakanaEx     = $Katakana     ($Extend |  $Format)*;" | ||||||
|  |       "$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;" | ||||||
|  |       "$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;" | ||||||
|  |       "$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;" | ||||||
|  |       "$MidNumEx       = $MidNum       ($Extend |  $Format)*;" | ||||||
|  |       "$NumericEx      = $Numeric      ($Extend |  $Format)*;" | ||||||
|  |       "$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;" | ||||||
|  | 
 | ||||||
|  |       "$Hiragana       = [\\p{script=Hiragana}];" | ||||||
|  |       "$Ideographic    = [\\p{Ideographic}];" | ||||||
|  |       "$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;" | ||||||
|  |       "$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;" | ||||||
|  | 
 | ||||||
|  |       "!!forward;" | ||||||
|  |       "$CR $LF;" | ||||||
|  |       "[^$CR $LF $Newline]? ($Extend |  $Format)+;" | ||||||
|  |       "$ALetterEx {200};" | ||||||
|  |       "$ALetterEx $ALetterEx {200};" | ||||||
|  |       "%s"  // (Allow|Disallow) Contraction
 | ||||||
|  | 
 | ||||||
|  |       "!!reverse;" | ||||||
|  |       "$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;" | ||||||
|  |       "$BackMidNumLetEx   = ($Format | $Extend)* $MidNumLet;" | ||||||
|  |       "$BackNumericEx     = ($Format | $Extend)* $Numeric;" | ||||||
|  |       "$BackMidNumEx      = ($Format | $Extend)* $MidNum;" | ||||||
|  |       "$BackMidLetterEx   = ($Format | $Extend)* $MidLetter;" | ||||||
|  |       "$BackKatakanaEx    = ($Format | $Extend)* $Katakana;" | ||||||
|  |       "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" | ||||||
|  |       "$LF $CR;" | ||||||
|  |       "($Format | $Extend)*  [^$CR $LF $Newline]?;" | ||||||
|  |       "$BackALetterEx $BackALetterEx;" | ||||||
|  |       "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" | ||||||
|  |       "$BackNumericEx $BackNumericEx;" | ||||||
|  |       "$BackNumericEx $BackALetterEx;" | ||||||
|  |       "$BackALetterEx $BackNumericEx;" | ||||||
|  |       "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" | ||||||
|  |       "$BackKatakanaEx $BackKatakanaEx;" | ||||||
|  |       "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" | ||||||
|  |       " $BackKatakanaEx | $BackExtendNumLetEx);" | ||||||
|  |       "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)" | ||||||
|  |       " $BackExtendNumLetEx;" | ||||||
|  | 
 | ||||||
|  |       "!!safe_reverse;" | ||||||
|  |       "($Extend | $Format)+ .?;" | ||||||
|  |       "($MidLetter | $MidNumLet) $BackALetterEx;" | ||||||
|  |       "($MidNum | $MidNumLet) $BackNumericEx;" | ||||||
|  | 
 | ||||||
|  |       "!!safe_forward;" | ||||||
|  |       "($Extend | $Format)+ .?;" | ||||||
|  |       "($MidLetterEx | $MidNumLetEx) $ALetterEx;" | ||||||
|  |       "($MidNumEx | $MidNumLetEx) $NumericEx;"; | ||||||
|  | 
 | ||||||
|  |   // Retrieve the script codes used by the given language from ICU. When the
 | ||||||
|  |   // given language consists of two or more scripts, we just use the first
 | ||||||
|  |   // script. The size of returned script codes is always < 8. Therefore, we use
 | ||||||
|  |   // an array of size 8 so we can include all script codes without insufficient
 | ||||||
|  |   // buffer errors.
 | ||||||
|  |   UErrorCode error = U_ZERO_ERROR; | ||||||
|  |   UScriptCode script_code[8]; | ||||||
|  |   int scripts = uscript_getCode(language.c_str(), script_code, | ||||||
|  |                                 arraysize(script_code), &error); | ||||||
|  |   if (U_SUCCESS(error) && scripts >= 1) | ||||||
|  |     script_code_ = script_code[0]; | ||||||
|  | 
 | ||||||
|  |   // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary
 | ||||||
|  |   // only for the languages which need it (i.e. Korean and Thai) to prevent ICU
 | ||||||
|  |   // from returning dictionary words (i.e. Korean or Thai words) for languages
 | ||||||
|  |   // which don't need them.
 | ||||||
|  |   const char* aletter = uscript_getName(script_code_); | ||||||
|  |   if (!aletter) | ||||||
|  |     aletter = "Latin"; | ||||||
|  | 
 | ||||||
|  |   const char kWithDictionary[] = | ||||||
|  |       "$dictionary   = [:LineBreak = Complex_Context:];" | ||||||
|  |       "$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];"; | ||||||
|  |   const char kWithoutDictionary[] = "$ALetterPlus  = $ALetter;"; | ||||||
|  |   const char* aletter_plus = kWithoutDictionary; | ||||||
|  |   if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || | ||||||
|  |       script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) | ||||||
|  |     aletter_plus = kWithDictionary; | ||||||
|  | 
 | ||||||
|  |   // Treat numbers as word characters except for Arabic and Hebrew.
 | ||||||
|  |   const char* aletter_extra = " [0123456789]"; | ||||||
|  |   if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) | ||||||
|  |     aletter_extra = ""; | ||||||
|  | 
 | ||||||
|  |   const char kMidLetterExtra[] = ""; | ||||||
|  |   // For Hebrew, treat single/double quotation marks as MidLetter.
 | ||||||
|  |   const char kMidLetterExtraHebrew[] = "\"'"; | ||||||
|  |   const char* midletter_extra = kMidLetterExtra; | ||||||
|  |   if (script_code_ == USCRIPT_HEBREW) | ||||||
|  |     midletter_extra = kMidLetterExtraHebrew; | ||||||
|  | 
 | ||||||
|  |   // Create two custom rule-sets: one allows contraction and the other does not.
 | ||||||
|  |   // We save these strings in UTF-16 so we can use them without conversions. (ICU
 | ||||||
|  |   // needs UTF-16 strings.)
 | ||||||
|  |   const char kAllowContraction[] = | ||||||
|  |       "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; | ||||||
|  |   const char kDisallowContraction[] = ""; | ||||||
|  | 
 | ||||||
|  |   ruleset_allow_contraction_ = base::ASCIIToUTF16( | ||||||
|  |       base::StringPrintf(kRuleTemplate, | ||||||
|  |                          aletter, | ||||||
|  |                          aletter_extra, | ||||||
|  |                          midletter_extra, | ||||||
|  |                          aletter_plus, | ||||||
|  |                          kAllowContraction)); | ||||||
|  |   ruleset_disallow_contraction_ = base::ASCIIToUTF16( | ||||||
|  |       base::StringPrintf(kRuleTemplate, | ||||||
|  |                          aletter, | ||||||
|  |                          aletter_extra, | ||||||
|  |                          midletter_extra, | ||||||
|  |                          aletter_plus, | ||||||
|  |                          kDisallowContraction)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckCharAttribute::OutputChar(UChar c, | ||||||
|  |                                          base::string16* output) const { | ||||||
|  |   // Call the language-specific function if necessary.
 | ||||||
|  |   // Otherwise, we call the default one.
 | ||||||
|  |   switch (script_code_) { | ||||||
|  |     case USCRIPT_ARABIC: | ||||||
|  |       return OutputArabic(c, output); | ||||||
|  | 
 | ||||||
|  |     case USCRIPT_HANGUL: | ||||||
|  |       return OutputHangul(c, output); | ||||||
|  | 
 | ||||||
|  |     case USCRIPT_HEBREW: | ||||||
|  |       return OutputHebrew(c, output); | ||||||
|  | 
 | ||||||
|  |     default: | ||||||
|  |       return OutputDefault(c, output); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckCharAttribute::OutputArabic(UChar c, | ||||||
|  |                                            base::string16* output) const { | ||||||
|  |   // Discard characters not from Arabic alphabets. We also discard vowel marks
 | ||||||
|  |   // of Arabic (Damma, Fatha, Kasra, etc.) to prevent our Arabic dictionary from
 | ||||||
|  |   // marking an Arabic word including vowel marks as misspelled. (We need to
 | ||||||
|  |   // check these vowel marks manually and filter them out since their script
 | ||||||
|  |   // codes are USCRIPT_ARABIC.)
 | ||||||
|  |   if (0x0621 <= c && c <= 0x064D) | ||||||
|  |     output->push_back(c); | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckCharAttribute::OutputHangul(UChar c, | ||||||
|  |                                            base::string16* output) const { | ||||||
|  |   // Decompose a Hangul character to a Hangul vowel and consonants used by our
 | ||||||
|  |   // spellchecker. A Hangul character of Unicode is a ligature consisting of a
 | ||||||
|  |   // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G",
 | ||||||
|  |   // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as
 | ||||||
|  |   // a point of a cubic linear space consisting of (first consonant, vowel, last
 | ||||||
|  |   // consonant). Therefore, we can compose a Hangul character from a vowel and
 | ||||||
|  |   // two consonants with linear composition:
 | ||||||
|  |   //   character =  0xAC00 +
 | ||||||
|  |   //                (first consonant - 0x1100) * 28 * 21 +
 | ||||||
|  |   //                (vowel           - 0x1161) * 28 +
 | ||||||
|  |   //                (last consonant  - 0x11A7);
 | ||||||
|  |   // We can also decompose a Hangul character with linear decomposition:
 | ||||||
|  |   //   first consonant = (character - 0xAC00) / 28 / 21;
 | ||||||
|  |   //   vowel           = (character - 0xAC00) / 28 % 21;
 | ||||||
|  |   //   last consonant  = (character - 0xAC00) % 28;
 | ||||||
|  |   // This code is copied from Unicode Standard Annex #15
 | ||||||
|  |   // <http://unicode.org/reports/tr15>, with some comments added.
 | ||||||
|  |   const int kSBase = 0xAC00;  // U+AC00: the top of Hangul characters.
 | ||||||
|  |   const int kLBase = 0x1100;  // U+1100: the top of Hangul first consonants.
 | ||||||
|  |   const int kVBase = 0x1161;  // U+1161: the top of Hangul vowels.
 | ||||||
|  |   const int kTBase = 0x11A7;  // U+11A7: the top of Hangul last consonants.
 | ||||||
|  |   const int kLCount = 19;     // The number of Hangul first consonants.
 | ||||||
|  |   const int kVCount = 21;     // The number of Hangul vowels.
 | ||||||
|  |   const int kTCount = 28;     // The number of Hangul last consonants.
 | ||||||
|  |   const int kNCount = kVCount * kTCount; | ||||||
|  |   const int kSCount = kLCount * kNCount; | ||||||
|  | 
 | ||||||
|  |   int index = c - kSBase; | ||||||
|  |   if (index < 0 || index >= kSBase + kSCount) { | ||||||
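|  |     // This is not a precomposed Hangul syllable, so there is nothing to
 | ||||||
|  |     // decompose. Fall back to the default output function, which keeps the
 | ||||||
|  |     // character only when its script is accepted by this spellchecker.
 | ||||||
|  |     // NOTE(review): |index| is compared against kSBase + kSCount rather than
 | ||||||
|  |     // kSCount, so code units above U+D7A3 are not rejected here -- confirm.
 | ||||||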
|  |     return OutputDefault(c, output); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // This is a Hangul character. Decompose this character into Hangul vowels
 | ||||||
|  |   // and consonants.
 | ||||||
|  |   int l = kLBase + index / kNCount; | ||||||
|  |   int v = kVBase + (index % kNCount) / kTCount; | ||||||
|  |   int t = kTBase + index % kTCount; | ||||||
|  |   output->push_back(l); | ||||||
|  |   output->push_back(v); | ||||||
|  |   if (t != kTBase) | ||||||
|  |     output->push_back(t); | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckCharAttribute::OutputHebrew(UChar c, | ||||||
|  |                                            base::string16* output) const { | ||||||
|  |   // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds
 | ||||||
|  |   // to prevent our Hebrew dictionary from marking a Hebrew word including
 | ||||||
|  |   // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
 | ||||||
|  |   // niqquds manually and filter them out since their script codes are
 | ||||||
|  |   // USCRIPT_HEBREW.)
 | ||||||
|  |   // Pass through ASCII single/double quotation marks and Hebrew Geresh and
 | ||||||
|  |   // Gershayim.
 | ||||||
|  |   if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || | ||||||
|  |       c == 0x05F4 || c == 0x05F3) | ||||||
|  |     output->push_back(c); | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckCharAttribute::OutputDefault(UChar c, | ||||||
|  |                                             base::string16* output) const { | ||||||
|  |   // Check the script code of this character and output only if it is the one
 | ||||||
|  |   // used by the spellchecker language.
 | ||||||
|  |   UErrorCode status = U_ZERO_ERROR; | ||||||
|  |   UScriptCode script_code = uscript_getScript(c, &status); | ||||||
|  |   if (script_code == script_code_ || script_code == USCRIPT_COMMON) | ||||||
|  |     output->push_back(c); | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // SpellcheckWordIterator implementation:
 | ||||||
|  | 
 | ||||||
|  | SpellcheckWordIterator::SpellcheckWordIterator() | ||||||
|  |     : text_(NULL), | ||||||
|  |       attribute_(NULL), | ||||||
|  |       iterator_() { | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | SpellcheckWordIterator::~SpellcheckWordIterator() { | ||||||
|  |   Reset(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckWordIterator::Initialize( | ||||||
|  |     const SpellcheckCharAttribute* attribute, | ||||||
|  |     bool allow_contraction) { | ||||||
|  |   // Create a custom ICU break iterator with empty text used in this object. (We
 | ||||||
|  |   // allow setting text later so we can re-use this iterator.)
 | ||||||
|  |   DCHECK(attribute); | ||||||
|  |   const base::string16 rule(attribute->GetRuleSet(allow_contraction)); | ||||||
|  | 
 | ||||||
|  |   // If there is no rule set, the attributes were invalid.
 | ||||||
|  |   if (rule.empty()) | ||||||
|  |     return false; | ||||||
|  | 
 | ||||||
|  |   scoped_ptr<base::i18n::BreakIterator> iterator( | ||||||
|  |       new base::i18n::BreakIterator(base::string16(), rule)); | ||||||
|  |   if (!iterator->Init()) { | ||||||
|  |     // Since we're not passing in any text, the only reason this could fail
 | ||||||
|  |     // is if we fail to parse the rules. Since the rules are hardcoded,
 | ||||||
|  |     // that would be a bug in this class.
 | ||||||
|  |     NOTREACHED() << "failed to open iterator (broken rules)"; | ||||||
|  |     return false; | ||||||
|  |   } | ||||||
|  |   iterator_ = iterator.Pass(); | ||||||
|  | 
 | ||||||
|  |   // Set the character attributes so we can normalize the words extracted by
 | ||||||
|  |   // this iterator.
 | ||||||
|  |   attribute_ = attribute; | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckWordIterator::IsInitialized() const { | ||||||
|  |   // Return true iff we have an iterator.
 | ||||||
|  |   return !!iterator_; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { | ||||||
|  |   DCHECK(!!iterator_); | ||||||
|  | 
 | ||||||
|  |   // Set the text to be split by this iterator.
 | ||||||
|  |   if (!iterator_->SetText(text, length)) { | ||||||
|  |     LOG(ERROR) << "failed to set text"; | ||||||
|  |     return false; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   text_ = text; | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckWordIterator::GetNextWord(base::string16* word_string, | ||||||
|  |                                          int* word_start, | ||||||
|  |                                          int* word_length) { | ||||||
|  |   DCHECK(!!text_); | ||||||
|  | 
 | ||||||
|  |   word_string->clear(); | ||||||
|  |   *word_start = 0; | ||||||
|  |   *word_length = 0; | ||||||
|  | 
 | ||||||
|  |   if (!text_) { | ||||||
|  |     return false; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Find a word that can be checked for spelling. Our rule sets filter out
 | ||||||
|  |   // invalid words (e.g. numbers and characters not supported by the
 | ||||||
|  |   // spellchecker language) so this ubrk_getRuleStatus() call returns
 | ||||||
|  |   // UBRK_WORD_NONE when this iterator finds an invalid word. So, we skip such
 | ||||||
|  |   // words until we can find a valid word or reach the end of the input string.
 | ||||||
|  |   while (iterator_->Advance()) { | ||||||
|  |     const size_t start = iterator_->prev(); | ||||||
|  |     const size_t length = iterator_->pos() - start; | ||||||
|  |     if (iterator_->IsWord()) { | ||||||
|  |       if (Normalize(start, length, word_string)) { | ||||||
|  |         *word_start = start; | ||||||
|  |         *word_length = length; | ||||||
|  |         return true; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // There aren't any more words in the given text.
 | ||||||
|  |   return false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void SpellcheckWordIterator::Reset() { | ||||||
|  |   iterator_.reset(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | bool SpellcheckWordIterator::Normalize(int input_start, | ||||||
|  |                                        int input_length, | ||||||
|  |                                        base::string16* output_string) const { | ||||||
|  |   // We use NFKC (Normalization Form, Compatible decomposition, followed by
 | ||||||
|  |   // canonical Composition) defined in Unicode Standard Annex #15 to normalize
 | ||||||
|  |   // this token because it it the most suitable normalization algorithm for our
 | ||||||
|  |   // spellchecker. Nevertheless, it is not a perfect algorithm for our
 | ||||||
|  |   // spellchecker and we need manual normalization as well. The normalized
 | ||||||
|  |   // text does not have to be NUL-terminated since its characters are copied to
 | ||||||
|  |   // string16, which adds a NUL character when we need.
 | ||||||
|  |   icu::UnicodeString input(FALSE, &text_[input_start], input_length); | ||||||
|  |   UErrorCode status = U_ZERO_ERROR; | ||||||
|  |   icu::UnicodeString output; | ||||||
|  |   icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); | ||||||
|  |   if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | ||||||
|  |     return false; | ||||||
|  | 
 | ||||||
|  |   // Copy the normalized text to the output.
 | ||||||
|  |   icu::StringCharacterIterator it(output); | ||||||
|  |   for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | ||||||
|  |     attribute_->OutputChar(c, output_string); | ||||||
|  | 
 | ||||||
|  |   return !output_string->empty(); | ||||||
|  | } | ||||||
|  | @ -0,0 +1,175 @@ | ||||||
|  | // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 | ||||||
|  | // Use of this source code is governed by a BSD-style license that can be
 | ||||||
|  | // found in the LICENSE file.
 | ||||||
|  | 
 | ||||||
|  | // Defines an iterator class that enumerates words supported by our spellchecker
 | ||||||
|  | // from multi-language text. This class is used for filtering out characters
 | ||||||
|  | // not supported by our spellchecker.
 | ||||||
|  | 
 | ||||||
|  | #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | ||||||
|  | #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ | ||||||
|  | 
 | ||||||
|  | #include <string> | ||||||
|  | 
 | ||||||
|  | #include "base/basictypes.h" | ||||||
|  | #include "base/memory/scoped_ptr.h" | ||||||
|  | #include "base/strings/string16.h" | ||||||
|  | #include "third_party/icu/source/common/unicode/uscript.h" | ||||||
|  | 
 | ||||||
|  | namespace base { | ||||||
|  | namespace i18n { | ||||||
|  | class BreakIterator; | ||||||
|  | } // namespace i18n
 | ||||||
|  | } // namespace base
 | ||||||
|  | 
 | ||||||
|  | // A class which encapsulates language-specific operations used by
 | ||||||
|  | // SpellcheckWordIterator. When we set the spellchecker language, this class
 | ||||||
|  | // creates rule sets that filter out the characters not supported by the
 | ||||||
|  | // spellchecker. (Please read the comment in the SpellcheckWordIterator class
 | ||||||
|  | // about how to use this class.)
 | ||||||
|  | class SpellcheckCharAttribute { | ||||||
|  |  public: | ||||||
|  |   SpellcheckCharAttribute(); | ||||||
|  |   ~SpellcheckCharAttribute(); | ||||||
|  | 
 | ||||||
|  |   // Sets the language of the spellchecker. When this function is called with an
 | ||||||
|  |   // ISO language code, this function creates the custom rule-sets used by
 | ||||||
|  |   // the ICU break iterator so it can extract only words used by the language.
 | ||||||
|  |   // GetRuleSet() returns the rule-sets created in this function.
 | ||||||
|  |   void SetDefaultLanguage(const std::string& language); | ||||||
|  | 
 | ||||||
|  |   // Returns a custom rule-set string used by the ICU break iterator. This class
 | ||||||
|  |   // has two rule-sets, one splits a contraction and the other does not, so we
 | ||||||
|  |   // can split a concaticated word (e.g. "seven-year-old") into words (e.g.
 | ||||||
|  |   // "seven", "year", and "old") and check their spellings. The result stirng is
 | ||||||
|  |   // encoded in UTF-16 since ICU needs UTF-16 strings.
 | ||||||
|  |   base::string16 GetRuleSet(bool allow_contraction) const; | ||||||
|  | 
 | ||||||
|  |   // Outputs a character only if it is a word character. (Please read the
 | ||||||
|  |   // comments in CreateRuleSets() why we need this function.)
 | ||||||
|  |   bool OutputChar(UChar c, base::string16* output) const; | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   // Creates the rule-sets that return words possibly used by the given
 | ||||||
|  |   // language. Unfortunately, these rule-sets are not perfect and have some
 | ||||||
|  |   // false-positives. For example, they return combined accent marks even though
 | ||||||
|  |   // we need English words only. We call OutputCharacter() to filter out such
 | ||||||
|  |   // false-positive characters.
 | ||||||
|  |   void CreateRuleSets(const std::string& language); | ||||||
|  | 
 | ||||||
|  |   // Outputs a character only if it is one used by the given language. These
 | ||||||
|  |   // functions are called from OutputChar().
 | ||||||
|  |   bool OutputArabic(UChar c, base::string16* output) const; | ||||||
|  |   bool OutputHangul(UChar c, base::string16* output) const; | ||||||
|  |   bool OutputHebrew(UChar c, base::string16* output) const; | ||||||
|  |   bool OutputDefault(UChar c, base::string16* output) const; | ||||||
|  | 
 | ||||||
|  |   // The custom rule-set strings used by ICU break iterator. Since it is not so
 | ||||||
|  |   // easy to create custom rule-sets from an ISO language code, this class
 | ||||||
|  |   // saves these rule-set strings created when we set the language.
 | ||||||
|  |   base::string16 ruleset_allow_contraction_; | ||||||
|  |   base::string16 ruleset_disallow_contraction_; | ||||||
|  | 
 | ||||||
|  |   // The script code used by this language.
 | ||||||
|  |   UScriptCode script_code_; | ||||||
|  | 
 | ||||||
|  |   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // A class which extracts words that can be checked for spelling from a
 | ||||||
|  | // multi-language string. The ICU word-break iterator does not discard some
 | ||||||
|  | // punctuation characters attached to a word. For example, when we set a word
 | ||||||
|  | // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
 | ||||||
|  | // it discard characters not used by the language. For example, it returns
 | ||||||
|  | // Russian words even though we need English words only. To extract only the
 | ||||||
|  | // words that our spellchecker can check their spellings, this class uses custom
 | ||||||
|  | // rule-sets created by the SpellcheckCharAttribute class. Also, this class
 | ||||||
|  | // normalizes extracted words so our spellchecker can check the spellings of
 | ||||||
|  | // words that include ligatures, combined characters, full-width characters,
 | ||||||
|  | // etc. This class uses UTF-16 strings as its input and output strings since
 | ||||||
|  | // UTF-16 is the native encoding of ICU and avoid unnecessary conversions
 | ||||||
|  | // when changing the encoding of this string for our spellchecker. (Chrome can
 | ||||||
|  | // use two or more spellcheckers and we cannot assume their encodings.)
 | ||||||
|  | // The following snippet is an example that extracts words with this class.
 | ||||||
|  | //
 | ||||||
|  | //   // Creates the language-specific attributes for US English.
 | ||||||
|  | //   SpellcheckCharAttribute attribute;
 | ||||||
|  | //   attribute.SetDefaultLanguage("en-US");
 | ||||||
|  | //
 | ||||||
|  | //   // Set up a SpellcheckWordIterator object which extracts English words,
 | ||||||
|  | //   // and retrieve them.
 | ||||||
|  | //   SpellcheckWordIterator iterator;
 | ||||||
|  | //   base::string16 text(base::UTF8ToUTF16("this is a test."));
 | ||||||
|  | //   iterator.Initialize(&attribute, true);
 | ||||||
|  | //   iterator.SetText(text.c_str(), text_.length());
 | ||||||
|  | //
 | ||||||
|  | //   base::string16 word;
 | ||||||
|  | //   int offset;
 | ||||||
|  | //   int length;
 | ||||||
|  | //   while (iterator.GetNextWord(&word, &offset, &length)) {
 | ||||||
|  | //     ...
 | ||||||
|  | //   }
 | ||||||
|  | //
 | ||||||
|  | class SpellcheckWordIterator { | ||||||
|  |  public: | ||||||
|  |   SpellcheckWordIterator(); | ||||||
|  |   ~SpellcheckWordIterator(); | ||||||
|  | 
 | ||||||
|  |   // Initializes a word-iterator object with the language-specific attribute. If
 | ||||||
|  |   // we need to split contractions and concatenated words, call this function
 | ||||||
|  |   // with its 'allow_contraction' parameter false. (This function uses lots of
 | ||||||
|  |   // temporal memory to compile a custom word-break rule into an automaton.)
 | ||||||
|  |   bool Initialize(const SpellcheckCharAttribute* attribute, | ||||||
|  |                   bool allow_contraction); | ||||||
|  | 
 | ||||||
|  |   // Returns whether this word iterator is initialized.
 | ||||||
|  |   bool IsInitialized() const; | ||||||
|  | 
 | ||||||
|  |   // Set text to be iterated. (This text does not have to be NULL-terminated.)
 | ||||||
|  |   // This function also resets internal state so we can reuse this iterator
 | ||||||
|  |   // without calling Initialize().
 | ||||||
|  |   bool SetText(const base::char16* text, size_t length); | ||||||
|  | 
 | ||||||
|  |   // Retrieves a word (or a contraction), stores its copy to 'word_string', and
 | ||||||
|  |   // stores the position and the length for input word to 'word_start'. Since
 | ||||||
|  |   // this function normalizes the output word, the length of 'word_string' may
 | ||||||
|  |   // be different from the 'word_length'. Therefore, when we call functions that
 | ||||||
|  |   // changes the input text, such as string16::replace(), we need to use
 | ||||||
|  |   // 'word_start' and 'word_length' as listed in the following snippet.
 | ||||||
|  |   //
 | ||||||
|  |   //   while(iterator.GetNextWord(&word, &offset, &length))
 | ||||||
|  |   //     text.replace(offset, length, word);
 | ||||||
|  |   //
 | ||||||
|  |   bool GetNextWord(base::string16* word_string, | ||||||
|  |                    int* word_start, | ||||||
|  |                    int* word_length); | ||||||
|  | 
 | ||||||
|  |   // Releases all the resources attached to this object.
 | ||||||
|  |   void Reset(); | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   // Normalizes a non-terminated string returned from an ICU word-break
 | ||||||
|  |   // iterator. A word returned from an ICU break iterator may include characters
 | ||||||
|  |   // not supported by our spellchecker, e.g. ligatures, combining/ characters,
 | ||||||
|  |   // full-width letters, etc. This function replaces such characters with
 | ||||||
|  |   // alternative characters supported by our spellchecker. This function also
 | ||||||
|  |   // calls SpellcheckWordIterator::OutputChar() to filter out false-positive
 | ||||||
|  |   // characters.
 | ||||||
|  |   bool Normalize(int input_start, | ||||||
|  |                  int input_length, | ||||||
|  |                  base::string16* output_string) const; | ||||||
|  | 
 | ||||||
|  |   // The pointer to the input string from which we are extracting words.
 | ||||||
|  |   const base::char16* text_; | ||||||
|  | 
 | ||||||
|  |   // The language-specific attributes used for filtering out non-word
 | ||||||
|  |   // characters.
 | ||||||
|  |   const SpellcheckCharAttribute* attribute_; | ||||||
|  | 
 | ||||||
|  |   // The break iterator.
 | ||||||
|  |   scoped_ptr<base::i18n::BreakIterator> iterator_; | ||||||
|  | 
 | ||||||
|  |   DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | #endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
 | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Cheng Zhao
				Cheng Zhao