Context Navigation

← Previous Changeset
Next Changeset →

Changeset 117628 in webkit

Timestamp:

May 18, 2012 2:30:52 PM (12 years ago)

Author:

commit-queue@webkit.org

Message:

Spell checker doesn't recognize contractions (apostrophes)
https://bugs.webkit.org/show_bug.cgi?id=86118

Patch by Martin Robinson <mrobinson@igalia.com> on 2012-05-18
Reviewed by Gustavo Noronha Silva.

The Enchant spell checker was breaking words on apostrophes, because
apparently they were always being detected as Pango word-end
characters. In reality, to know whether or not the apostrophe is a
word end character requires looking at a string with a larger
granularity than one character. In fact, WebCore has already
ensured that any apostrophes in the string belong to contractions by
using the WordBreakIterator.

Simplify the way we break strings, by just manually trimming off
all non-graphable characters from the string and then finding the end
of the first word by looking for the next non-graphable character.
This has the side effect of removing the dependency on Pango and
eliminating one copy.

This change also cleans up some misbehavior on the part of the
WebCoreSupport layer which was not converting from Unicode character
offsets to UTF-16. These offsets can be different if any of the
characters in the UTF-16 string are surrogate pairs (non BMP
characters).

WebCoreSupport/TextCheckerClientGtk.cpp:

(WebKit::TextCheckerClientGtk::checkSpellingOfString): Properly
convert from Unicode offsets to UTF-16 offsets.

webkit/webkitspellcheckerenchant.cpp:

(findByteOffsetToFirstNonGraphableCharacter): Added this helper.
(checkSpellingOfString): Don't split words on apostrophes.

Location:

trunk/Source/WebKit/gtk

Files:

: 3 edited

ChangeLog (modified) (1 diff)
WebCoreSupport/TextCheckerClientGtk.cpp (modified) (1 diff)
webkit/webkitspellcheckerenchant.cpp (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/Source/WebKit/gtk/ChangeLog

-                      r117612
+                      r117628
+-05-18  Martin Robinson  <mrobinson@igalia.com>
+        Spell checker doesn't recognize contractions (apostrophes)
+        https://bugs.webkit.org/show_bug.cgi?id=86118
+        Reviewed by Gustavo Noronha Silva.
+        The Enchant spell checker was breaking words on apostrophes, because
+        apparently they were always being detected as Pango word-end
+        characters. In reality, to know whether or not the apostrophe is a
+        word end character requires looking at a string with a larger
+        granularity than one character. In fact, WebCore has already
+        ensured that any apostrophes in the string belong to contractions by
+        using the WordBreakIterator.
+        Simplify the way we break strings, by just manually trimming off
+        all non-graphable characters from the string and then finding the end
+        of the first word by looking for the next non-graphable character.
+        This has the side effect of removing the dependency on Pango and
+        eliminating one copy.
+        This change also cleans up some misbehavior on the part of the
+        WebCoreSupport layer which was not converting from Unicode character
+        offsets to UTF-16. These offsets can be different if any of the
+        characters in the UTF-16 string are surrogate pairs (non BMP
+        characters).
+        * WebCoreSupport/TextCheckerClientGtk.cpp:
+        (WebKit::TextCheckerClientGtk::checkSpellingOfString): Properly
+        convert from Unicode offsets to UTF-16 offsets.
+        * webkit/webkitspellcheckerenchant.cpp:
+        (findByteOffsetToFirstNonGraphableCharacter): Added this helper.
+        (checkSpellingOfString): Don't split words on apostrophes.
 -05-18  Martin Robinson  <mrobinson@igalia.com>

trunk/Source/WebKit/gtk/WebCoreSupport/TextCheckerClientGtk.cpp

-                      r117470
+                      r117628
     GOwnPtr<gchar> utf8Text(g_utf16_to_utf8(const_cast<gunichar2*>(text), length, 0, 0, 0));
     webkit_spell_checker_check_spelling_of_string(m_spellChecker.get(), utf8Text.get(), misspellingLocation, misspellingLength);
+    // We have the offset and length in Unicode characters, but we need to convert them to UTF-16 offsets.
+    // Unfortunately there doesn't seem to be a simple way to do this.
+    if (!*misspellingLength)
+        return;
+    bool pastStartOfWord = false;
+    for (int i = 0; i < length; i++) {
+        if (i >= *misspellingLocation + *misspellingLength)
+            return;
+        if (!pastStartOfWord && i > *misspellingLocation)
+            pastStartOfWord = true;
+        // If this character is part of a surrogate pair, we need to skip the next character (the trail)
+        // and to increase our offsets.
+        if (!U16_IS_SINGLE(text[i])) {
+            i++;
+            (*misspellingLength)++;
+            *misspellingLocation += pastStartOfWord ? 0 : 1;
+        }
+    }
+}

trunk/Source/WebKit/gtk/webkit/webkitspellcheckerenchant.cpp

-                      r111696
+                      r117628
+}
+static size_t findByteOffsetToFirstNonGraphableCharacter(const char* utf8String)
+{
+    const char* firstNonGraphableCharacter = utf8String;
+    while (firstNonGraphableCharacter && g_unichar_isgraph(g_utf8_get_char(firstNonGraphableCharacter)))
+        firstNonGraphableCharacter = g_utf8_find_next_char(firstNonGraphableCharacter, 0);
+    return firstNonGraphableCharacter - utf8String;
+}
 static void checkSpellingOfString(WebKitSpellChecker* checker, const char* string, int* misspellingLocation, int* misspellingLength)
+{
 …
         return;
+    int length = g_utf8_strlen(string, -1);
+    PangoLanguage* language(pango_language_get_default());
+    GOwnPtr<PangoLogAttr> attrs(g_new(PangoLogAttr, length + 1));
+    // pango_get_log_attrs uses an aditional position at the end of the text.
+    pango_get_log_attrs(string, -1, -1, language, attrs.get(), length + 1);
+    for (int i = 0; i < length + 1; i++) {
+        // We go through each character until we find an is_word_start,
+        // then we get into an inner loop to find the is_word_end corresponding
+        // to it.
+        if (attrs.get()[i].is_word_start) {
+            int start = i;
+            int end = i;
+            int wordLength;
+            while (attrs.get()[end].is_word_end < 1)
+                end++;
+            wordLength = end - start;
+            // Set the iterator to be at the current word end, so we don't
+            // check characters twice.
+            i = end;
+            gchar* cstart = g_utf8_offset_to_pointer(string, start);
+            gint bytes = static_cast<gint>(g_utf8_offset_to_pointer(string, end) - cstart);
+            GOwnPtr<gchar> word(g_new0(gchar, bytes + 1));
+            g_utf8_strncpy(word.get(), cstart, wordLength);
+            for (; dicts; dicts = dicts->next) {
+                EnchantDict* dict = static_cast<EnchantDict*>(dicts->data);
+                if (enchant_dict_check(dict, word.get(), wordLength)) {
+                    *misspellingLocation = start;
+                    *misspellingLength = wordLength;
+                } else {
+                    // Stop checking, this word is ok in at least one dict.
+                    *misspellingLocation = -1;
+                    *misspellingLength = 0;
+                    break;
+                }
+            }
+        }
+    // At the time this code was written, WebCore only sends us one word at a
+    // time during spellchecking, with a chance of having some small amount of
+    // leading and trailing whitespace. For this reason we can merely chop off
+    // the whitespace and send the word directly to Enchant.
+    const char* firstWord = string;
+    while (firstWord && !g_unichar_isgraph(g_utf8_get_char(firstWord)))
+        firstWord = g_utf8_find_next_char(firstWord, NULL);
+    // Either the string only had whitespace characters or no characters at all.
+    if (!firstWord)
+        return;
+    size_t byteOffsetToEndOfFirstWord = findByteOffsetToFirstNonGraphableCharacter(firstWord);
+    for (; dicts; dicts = dicts->next) {
+        EnchantDict* dict = static_cast<EnchantDict*>(dicts->data);
+        int result = enchant_dict_check(dict, firstWord, byteOffsetToEndOfFirstWord);
+        if (result < 0) // Error during checking.
+            continue;
+        if (!result) { // Stop checking, as this word is correct for at least one dictionary.
+            *misspellingLocation = -1;
+            *misspellingLength = 0;
+            return;
+        }
+        *misspellingLocation = g_utf8_pointer_to_offset(string, firstWord);
+        *misspellingLength = g_utf8_pointer_to_offset(string, firstWord + byteOffsetToEndOfFirstWord) - *misspellingLocation;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.