Context Navigation

← Previous Changeset
Next Changeset →

Changeset 119113 in webkit

Timestamp:

May 31, 2012 9:36:29 AM (12 years ago)

Author:

commit-queue@webkit.org

Message:

Spell checker doesn't recognize contractions (apostrophes)
https://bugs.webkit.org/show_bug.cgi?id=86118

Patch by commit-queue@webkit.org <commit-queue@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc> on 2012-05-31
Reviewed by Gustavo Noronha Silva.

The Enchant spell checker was breaking words on apostrophes, because
apparently they were always being detected as Pango word-end
characters. To know whether or not the apostrophe is a
word end character requires looking at a string with a larger
granularity than one character.

Simplify the way we break strings by searching for non-graphable
characters manually to find word starts and ends. This has the side
effect of removing the dependency on Pango and eliminating one copy.

This change also cleans up some misbehavior on the part of the
WebCoreSupport layer which was not converting from Unicode character
offsets to UTF-16. These offsets can be different if any of the
characters in the UTF-16 string are surrogate pairs (non BMP
characters).

WebCoreSupport/TextCheckerClientGtk.cpp:

(WebKit::TextCheckerClientGtk::checkSpellingOfString): Properly
convert from Unicode offsets to UTF-16 offsets.

webkit/webkitspellcheckerenchant.cpp:

(findByteOffsetToFirstNonGraphableCharacter): Added this helper.
(getExtentsOfNextWord): Ditto.
(wordIsSpelledCorrectlyInAtLeastOneDictionary): Ditto.
(checkSpellingOfString): Don't split words on apostrophes.

Location:

trunk/Source/WebKit/gtk

Files:

: 3 edited

ChangeLog (modified) (1 diff)
WebCoreSupport/TextCheckerClientGtk.cpp (modified) (1 diff)
webkit/webkitspellcheckerenchant.cpp (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/WebKit/gtk/ChangeLog

-                      r119063
+                      r119113
+2012-05-31  commit-queue@webkit.org  <commit-queue@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
+        Spell checker doesn't recognize contractions (apostrophes)
+        https://bugs.webkit.org/show_bug.cgi?id=86118
+        Reviewed by Gustavo Noronha Silva.
+        The Enchant spell checker was breaking words on apostrophes, because
+        apparently they were always being detected as Pango word-end
+        characters. To know whether or not the apostrophe is a
+        word end character requires looking at a string with a larger
+        granularity than one character.
+        Simplify the way we break strings by searching for non-graphable
+        characters manually to find word starts and ends. This has the side
+        effect of removing the dependency on Pango and eliminating one copy.
+        This change also cleans up some misbehavior on the part of the
+        WebCoreSupport layer which was not converting from Unicode character
+        offsets to UTF-16. These offsets can be different if any of the
+        characters in the UTF-16 string are surrogate pairs (non BMP
+        characters).
+        * WebCoreSupport/TextCheckerClientGtk.cpp:
+        (WebKit::TextCheckerClientGtk::checkSpellingOfString): Properly
+        convert from Unicode offsets to UTF-16 offsets.
+        * webkit/webkitspellcheckerenchant.cpp:
+        (findByteOffsetToFirstNonGraphableCharacter): Added this helper.
+        (getExtentsOfNextWord): Ditto.
+        (wordIsSpelledCorrectlyInAtLeastOneDictionary): Ditto.
+        (checkSpellingOfString): Don't split words on apostrophes.
 2012-05-30  Sudarsana Nagineni  <sudarsana.nagineni@linux.intel.com>

trunk/Source/WebKit/gtk/WebCoreSupport/TextCheckerClientGtk.cpp

-                      r117671
+                      r119113
     GOwnPtr<gchar> utf8Text(g_utf16_to_utf8(const_cast<gunichar2*>(text), length, 0, 0, 0));
     webkit_spell_checker_check_spelling_of_string(m_spellChecker.get(), utf8Text.get(), misspellingLocation, misspellingLength);
+    // We have the offset and length in Unicode characters, but we need to convert them to UTF-16 offsets.
+    // Unfortunately there doesn't seem to be a simple way to do this.
+    if (!*misspellingLength)
+        return;
+    bool pastStartOfWord = false;
+    for (int i = 0; i < length; i++) {
+        if (i >= *misspellingLocation + *misspellingLength)
+            return;
+        if (!pastStartOfWord && i > *misspellingLocation)
+            pastStartOfWord = true;
+        // If this character is part of a surrogate pair, we need to skip the next character (the trail)
+        // and to increase our offsets.
+        if (!U16_IS_SINGLE(text[i])) {
+            i++;
+            (*misspellingLength)++;
+            *misspellingLocation += pastStartOfWord ? 0 : 1;
+        }
+    }
+}

trunk/Source/WebKit/gtk/webkit/webkitspellcheckerenchant.cpp

-                      r117671
+                      r119113
+}
+// Returns the number of bytes between the start of |utf8String| and the first
+// character for which g_unichar_isgraph() is false (or the terminating NUL).
+// A null cursor is treated as an error and yields an offset of zero.
+static size_t findByteOffsetToFirstNonGraphableCharacter(const char* utf8String)
+{
+    const char* cursor = utf8String;
+    for (;;) {
+        if (!cursor) // If there was an error, the offset will be zero.
+            return 0;
+        // Stop at the end of the string or at the first non-graphable character.
+        if (!*cursor || !g_unichar_isgraph(g_utf8_get_char(cursor)))
+            break;
+        cursor = g_utf8_find_next_char(cursor, 0);
+    }
+    return cursor - utf8String;
+}
+// Locates the next word in |utf8String|, where a word is a run of characters
+// accepted by g_unichar_isgraph(). On return |wordStart| points at the first
+// graphable character found and |length| holds the word's size in bytes;
+// |length| is zero when no further word exists.
+static void getExtentsOfNextWord(const char* utf8String, const char*& wordStart, size_t& length)
+{
+    // Skip over any leading non-graphable characters.
+    const char* cursor = utf8String;
+    while (cursor && *cursor) {
+        if (g_unichar_isgraph(g_utf8_get_char(cursor)))
+            break;
+        cursor = g_utf8_find_next_char(cursor, NULL);
+    }
+    wordStart = cursor;
+    if (cursor && *cursor)
+        length = findByteOffsetToFirstNonGraphableCharacter(cursor);
+    else
+        length = 0;
+}
+// Returns true as soon as one dictionary in |dictionaries| accepts the first
+// |length| bytes of |word|. A dictionary whose check reports an error (negative
+// result) or a misspelling (positive result) is skipped in favour of the next one.
+// |misspellingLocation| and |misspellingLength| are not touched here.
+static bool wordIsSpelledCorrectlyInAtLeastOneDictionary(GSList* dictionaries, const char* word, size_t length, int* misspellingLocation, int* misspellingLength)
+{
+    GSList* node = dictionaries;
+    while (node) {
+        EnchantDict* dictionary = static_cast<EnchantDict*>(node->data);
+        node = node->next;
+        // Only a zero result means the word is correct; anything else moves on.
+        if (!enchant_dict_check(dictionary, word, length))
+            return true;
+    }
+    return false;
+}
 // Reports the first misspelled word of |string| through |misspellingLocation| and
 // |misspellingLength|. The values written by the word-extent code path are UTF-8
 // character offsets (not byte offsets); -1 and 0 are stored up front as the
 // "nothing misspelled" result.
 // NOTE(review): the add/remove markers in this diff view look unreliable. The body
 // below appears to interleave the removed Pango-based implementation with the added
 // word-extent implementation - confirm against the repository before relying on it.
 static void checkSpellingOfString(WebKitSpellChecker* checker, const char* string, int* misspellingLocation, int* misspellingLength)
+{
     WebKitSpellCheckerEnchantPrivate* priv = WEBKIT_SPELL_CHECKER_ENCHANT(checker)->priv;
+    GSList* dicts = priv->enchantDicts;
+    if (!dicts)
+    *misspellingLocation = -1;
+    *misspellingLength = 0;
+    GSList* dictionaries = priv->enchantDicts;
+    if (!dictionaries)
         return;
+    int length = g_utf8_strlen(string, -1);
+    PangoLanguage* language(pango_language_get_default());
+    GOwnPtr<PangoLogAttr> attrs(g_new(PangoLogAttr, length + 1));
+    // pango_get_log_attrs uses an additional position at the end of the text.
+    pango_get_log_attrs(string, -1, -1, language, attrs.get(), length + 1);
+    for (int i = 0; i < length + 1; i++) {
+        // We go through each character until we find an is_word_start,
+        // then we get into an inner loop to find the is_word_end corresponding
+        // to it.
+        if (attrs.get()[i].is_word_start) {
+            int start = i;
+            int end = i;
+            int wordLength;
+            while (attrs.get()[end].is_word_end < 1)
+                end++;
+            wordLength = end - start;
+            // Set the iterator to be at the current word end, so we don't
+            // check characters twice.
+            i = end;
+            gchar* cstart = g_utf8_offset_to_pointer(string, start);
+            gint bytes = static_cast<gint>(g_utf8_offset_to_pointer(string, end) - cstart);
+            GOwnPtr<gchar> word(g_new0(gchar, bytes + 1));
+            g_utf8_strncpy(word.get(), cstart, wordLength);
+            for (; dicts; dicts = dicts->next) {
+                EnchantDict* dict = static_cast<EnchantDict*>(dicts->data);
+                if (enchant_dict_check(dict, word.get(), wordLength)) {
+                    *misspellingLocation = start;
+                    *misspellingLength = wordLength;
+                } else {
+                    // Stop checking, this word is ok in at least one dict.
+                    *misspellingLocation = -1;
+                    *misspellingLength = 0;
+                    break;
+                }
+            }
+        }
     // Walk the string one word at a time and stop at the first word that no dictionary accepts.
+    const char* wordStart = string;
+    size_t wordLength;
+    getExtentsOfNextWord(wordStart, wordStart, wordLength);
+    while (wordLength > 0) {
+        if (!wordIsSpelledCorrectlyInAtLeastOneDictionary(dictionaries, wordStart, wordLength, misspellingLocation, misspellingLength)) {
             // The word extents are byte-based; convert them to character offsets for the caller.
+            *misspellingLocation = g_utf8_pointer_to_offset(string, wordStart);
+            *misspellingLength = g_utf8_pointer_to_offset(string, wordStart + wordLength) - *misspellingLocation;
+            return;
+        }
+        getExtentsOfNextWord(wordStart + wordLength, wordStart, wordLength);
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.