Changeset 117628 in webkit
- Timestamp:
- May 18, 2012 2:30:52 PM (12 years ago)
- Location:
- trunk/Source/WebKit/gtk
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebKit/gtk/ChangeLog
r117612 r117628 1 2012-05-18 Martin Robinson <mrobinson@igalia.com> 2 3 Spell checker doesn't recognize contractions (apostrophes) 4 https://bugs.webkit.org/show_bug.cgi?id=86118 5 6 Reviewed by Gustavo Noronha Silva. 7 8 The Enchant spell checker was breaking words on apostrophes, because 9 apparently they were always being detected as Pango word-end 10 characters. In reality, knowing whether or not the apostrophe is a 11 word-end character requires looking at a string with a larger 12 granularity than one character. In fact, WebCore has already 13 ensured that any apostrophes in the string belong to contractions by 14 using the WordBreakIterator. 15 16 Simplify the way we break strings, by just manually trimming off 17 all non-graphable characters from the string and then finding the end 18 of the first word by looking for the next non-graphable character. 19 This has the side effect of removing the dependency on Pango and 20 eliminating one copy. 21 22 This change also cleans up some misbehavior on the part of the 23 WebCoreSupport layer which was not converting from Unicode character 24 offsets to UTF-16. These offsets can be different if any of the 25 characters in the UTF-16 string are surrogate pairs (non-BMP 26 characters). 27 28 * WebCoreSupport/TextCheckerClientGtk.cpp: 29 (WebKit::TextCheckerClientGtk::checkSpellingOfString): Properly 30 convert from Unicode offsets to UTF-16 offsets. 31 * webkit/webkitspellcheckerenchant.cpp: 32 (findByteOffsetToFirstNonGraphableCharacter): Added this helper. 33 (checkSpellingOfString): Don't split words on apostrophes. 34 1 35 2012-05-18 Martin Robinson <mrobinson@igalia.com> 2 36 -
trunk/Source/WebKit/gtk/WebCoreSupport/TextCheckerClientGtk.cpp
r117470 r117628 64 64 GOwnPtr<gchar> utf8Text(g_utf16_to_utf8(const_cast<gunichar2*>(text), length, 0, 0, 0)); 65 65 webkit_spell_checker_check_spelling_of_string(m_spellChecker.get(), utf8Text.get(), misspellingLocation, misspellingLength); 66 67 // We have the offset and length in Unicode characters, but we need to convert them to UTF-16 offsets. 68 // Unfortunately there doesn't seem to be a simple way to do this. 69 if (!*misspellingLength) 70 return; 71 72 bool pastStartOfWord = false; 73 for (int i = 0; i < length; i++) { 74 if (i >= *misspellingLocation + *misspellingLength) 75 return; 76 if (!pastStartOfWord && i > *misspellingLocation) 77 pastStartOfWord = true; 78 79 // If this character is part of a surrogate pair, we need to skip the next character (the trail) 80 // and to increase our offsets. 81 if (!U16_IS_SINGLE(text[i])) { 82 i++; 83 (*misspellingLength)++; 84 *misspellingLocation += pastStartOfWord ? 0 : 1; 85 } 86 } 66 87 } 67 88 -
trunk/Source/WebKit/gtk/webkit/webkitspellcheckerenchant.cpp
r111696 r117628 89 89 } 90 90 91 static size_t findByteOffsetToFirstNonGraphableCharacter(const char* utf8String) 92 { 93 const char* firstNonGraphableCharacter = utf8String; 94 while (firstNonGraphableCharacter && g_unichar_isgraph(g_utf8_get_char(firstNonGraphableCharacter))) 95 firstNonGraphableCharacter = g_utf8_find_next_char(firstNonGraphableCharacter, 0); 96 return firstNonGraphableCharacter - utf8String; 97 } 98 91 99 static void checkSpellingOfString(WebKitSpellChecker* checker, const char* string, int* misspellingLocation, int* misspellingLength) 92 100 { … … 97 105 return; 98 106 99 int length = g_utf8_strlen(string, -1); 100 101 PangoLanguage* language(pango_language_get_default()); 102 GOwnPtr<PangoLogAttr> attrs(g_new(PangoLogAttr, length + 1)); 103 104 // pango_get_log_attrs uses an aditional position at the end of the text. 105 pango_get_log_attrs(string, -1, -1, language, attrs.get(), length + 1); 106 107 for (int i = 0; i < length + 1; i++) { 108 // We go through each character until we find an is_word_start, 109 // then we get into an inner loop to find the is_word_end corresponding 110 // to it. 111 if (attrs.get()[i].is_word_start) { 112 int start = i; 113 int end = i; 114 int wordLength; 115 116 while (attrs.get()[end].is_word_end < 1) 117 end++; 118 119 wordLength = end - start; 120 // Set the iterator to be at the current word end, so we don't 121 // check characters twice. 
122 i = end; 123 124 gchar* cstart = g_utf8_offset_to_pointer(string, start); 125 gint bytes = static_cast<gint>(g_utf8_offset_to_pointer(string, end) - cstart); 126 GOwnPtr<gchar> word(g_new0(gchar, bytes + 1)); 127 128 g_utf8_strncpy(word.get(), cstart, wordLength); 129 130 for (; dicts; dicts = dicts->next) { 131 EnchantDict* dict = static_cast<EnchantDict*>(dicts->data); 132 if (enchant_dict_check(dict, word.get(), wordLength)) { 133 *misspellingLocation = start; 134 *misspellingLength = wordLength; 135 } else { 136 // Stop checking, this word is ok in at least one dict. 137 *misspellingLocation = -1; 138 *misspellingLength = 0; 139 break; 140 } 141 } 142 } 107 // At the time this code was written, WebCore only sends us one word at a 108 // time during spellchecking, with a chance of having some small amount of 109 // leading and trailing whitespace. For this reason we can merely chop off 110 // the whitespace and send the word directly to Enchant. 111 const char* firstWord = string; 112 while (firstWord && !g_unichar_isgraph(g_utf8_get_char(firstWord))) 113 firstWord = g_utf8_find_next_char(firstWord, NULL); 114 115 // Either the string only had whitespace characters or no characters at all. 116 if (!firstWord) 117 return; 118 119 size_t byteOffsetToEndOfFirstWord = findByteOffsetToFirstNonGraphableCharacter(firstWord); 120 for (; dicts; dicts = dicts->next) { 121 EnchantDict* dict = static_cast<EnchantDict*>(dicts->data); 122 int result = enchant_dict_check(dict, firstWord, byteOffsetToEndOfFirstWord); 123 124 if (result < 0) // Error during checking. 125 continue; 126 if (!result) { // Stop checking, as this word is correct for at least one dictionary. 127 *misspellingLocation = -1; 128 *misspellingLength = 0; 129 return; 130 } 131 132 *misspellingLocation = g_utf8_pointer_to_offset(string, firstWord); 133 *misspellingLength = g_utf8_pointer_to_offset(string, firstWord + byteOffsetToEndOfFirstWord) - *misspellingLocation; 143 134 } 144 135 }
Note: See TracChangeset
for help on using the changeset viewer.