Changeset 266457 in webkit


Ignore:
Timestamp:
Sep 2, 2020, 7:45:32 AM (5 years ago)
Author:
achristensen@apple.com
Message:

Align UTF-16 decoder with Chrome, Firefox, and specification
https://bugs.webkit.org/show_bug.cgi?id=216058

Reviewed by Youenn Fablet.

LayoutTests/imported/w3c:

  • web-platform-tests/encoding/streams/decode-non-utf8.any-expected.txt:
  • web-platform-tests/encoding/streams/decode-non-utf8.any.worker-expected.txt:
  • web-platform-tests/encoding/textdecoder-fatal-streaming.any-expected.txt:
  • web-platform-tests/encoding/textdecoder-fatal-streaming.any.worker-expected.txt:
  • web-platform-tests/encoding/textdecoder-utf16-surrogates.any-expected.txt:
  • web-platform-tests/encoding/textdecoder-utf16-surrogates.any.worker-expected.txt:

Source/WebCore:

Covered by web platform tests.

  • platform/text/TextCodecUTF16.cpp:

(WebCore::TextCodecUTF16::decode):

  • platform/text/TextCodecUTF16.h:

(): Deleted.

LayoutTests:

  • imported/blink/fast/encoding/utf-16-odd-byte-expected.txt:
Location:
trunk
Files:
13 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/ChangeLog

    r266453 r266457  
     1 2020-09-02  Alex Christensen  <achristensen@webkit.org>
     2
     3        Align UTF-16 decoder with Chrome, Firefox, and specification
     4        https://bugs.webkit.org/show_bug.cgi?id=216058
     5
     6        Reviewed by Youenn Fablet.
     7
     8        * imported/blink/fast/encoding/utf-16-odd-byte-expected.txt:
     9
    1 10 2020-09-02  Diego Pino Garcia  <dpino@igalia.com>
    2 11
  • trunk/LayoutTests/fast/parser/test-unicode-characters-in-attribute-name-expected.txt

    r28908 r266457  
    441.  title : http://www.315.com.cn/newwebsite/images0821/315_ca_03.gif
    552. la ng : zh-cn
    6 
     6
  • trunk/LayoutTests/imported/blink/fast/encoding/utf-16-odd-byte-expected.txt

    r190629 r266457  
    1 This document has an odd number of bytes - the UTF-16 decoder should emit a replacement character here:
     1This document has an odd number of bytes - the UTF-16 decoder should emit a replacement character here: �
  • trunk/LayoutTests/imported/w3c/ChangeLog

    r266448 r266457  
     1 2020-09-02  Alex Christensen  <achristensen@webkit.org>
     2
     3        Align UTF-16 decoder with Chrome, Firefox, and specification
     4        https://bugs.webkit.org/show_bug.cgi?id=216058
     5
     6        Reviewed by Youenn Fablet.
     7
     8        * web-platform-tests/encoding/streams/decode-non-utf8.any-expected.txt:
     9        * web-platform-tests/encoding/streams/decode-non-utf8.any.worker-expected.txt:
     10        * web-platform-tests/encoding/textdecoder-fatal-streaming.any-expected.txt:
     11        * web-platform-tests/encoding/textdecoder-fatal-streaming.any.worker-expected.txt:
     12        * web-platform-tests/encoding/textdecoder-utf16-surrogates.any-expected.txt:
     13        * web-platform-tests/encoding/textdecoder-utf16-surrogates.any.worker-expected.txt:
     14
    1 15 2020-09-01  Alex Christensen  <achristensen@webkit.org>
    2 16
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-non-utf8.any-expected.txt

    r266348 r266457  
    11
    22PASS TextDecoderStream should be able to decode UTF-16BE
    3 FAIL TextDecoderStream should be able to decode invalid sequences in UTF-16BE assert_equals: output should be replacement character expected "\ufffd" but got "�"
    4 FAIL TextDecoderStream should be able to reject invalid sequences in UTF-16BE assert_unreached: Should have rejected: readable should be errored Reached unreachable code
     3PASS TextDecoderStream should be able to decode invalid sequences in UTF-16BE
     4PASS TextDecoderStream should be able to reject invalid sequences in UTF-16BE
    55PASS TextDecoderStream should be able to decode UTF-16LE
    6 FAIL TextDecoderStream should be able to decode invalid sequences in UTF-16LE assert_equals: output should be replacement character expected "\ufffd" but got "�"
    7 FAIL TextDecoderStream should be able to reject invalid sequences in UTF-16LE assert_unreached: Should have rejected: readable should be errored Reached unreachable code
     6PASS TextDecoderStream should be able to decode invalid sequences in UTF-16LE
     7PASS TextDecoderStream should be able to reject invalid sequences in UTF-16LE
    88PASS TextDecoderStream should be able to decode Shift_JIS
    99FAIL TextDecoderStream should be able to decode invalid sequences in Shift_JIS assert_equals: output should be replacement character expected "\ufffd" but got "\x1a"
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-non-utf8.any.worker-expected.txt

    r266348 r266457  
    11
    22PASS TextDecoderStream should be able to decode UTF-16BE
    3 FAIL TextDecoderStream should be able to decode invalid sequences in UTF-16BE assert_equals: output should be replacement character expected "\ufffd" but got "�"
    4 FAIL TextDecoderStream should be able to reject invalid sequences in UTF-16BE assert_unreached: Should have rejected: readable should be errored Reached unreachable code
     3PASS TextDecoderStream should be able to decode invalid sequences in UTF-16BE
     4PASS TextDecoderStream should be able to reject invalid sequences in UTF-16BE
    55PASS TextDecoderStream should be able to decode UTF-16LE
    6 FAIL TextDecoderStream should be able to decode invalid sequences in UTF-16LE assert_equals: output should be replacement character expected "\ufffd" but got "�"
    7 FAIL TextDecoderStream should be able to reject invalid sequences in UTF-16LE assert_unreached: Should have rejected: readable should be errored Reached unreachable code
     6PASS TextDecoderStream should be able to decode invalid sequences in UTF-16LE
     7PASS TextDecoderStream should be able to reject invalid sequences in UTF-16LE
    88PASS TextDecoderStream should be able to decode Shift_JIS
    99FAIL TextDecoderStream should be able to decode invalid sequences in Shift_JIS assert_equals: output should be replacement character expected "\ufffd" but got "\x1a"
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-fatal-streaming.any-expected.txt

    r256730 r266457  
    11
    2 FAIL Fatal flag, non-streaming cases assert_equals: Unterminated UTF-8 sequence should emit replacement character if fatal flag is unset expected "\ufffd" but got ""
     2PASS Fatal flag, non-streaming cases
    33FAIL Fatal flag, streaming cases assert_equals: expected "\0" but got ""
    44
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-fatal-streaming.any.worker-expected.txt

    r256730 r266457  
    11
    2 FAIL Fatal flag, non-streaming cases assert_equals: Unterminated UTF-8 sequence should emit replacement character if fatal flag is unset expected "\ufffd" but got ""
     2PASS Fatal flag, non-streaming cases
    33FAIL Fatal flag, streaming cases assert_equals: expected "\0" but got ""
    44
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-utf16-surrogates.any-expected.txt

    r256730 r266457  
    11
    2 FAIL utf-16le - lone surrogate lead assert_equals: expected "\ufffd" but got "�"
    3 FAIL utf-16le - lone surrogate lead (fatal flag set) assert_throws_js: function "function () {
    4             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    5         }" did not throw
    6 FAIL utf-16le - lone surrogate trail assert_equals: expected "\ufffd" but got "�"
    7 FAIL utf-16le - lone surrogate trail (fatal flag set) assert_throws_js: function "function () {
    8             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    9         }" did not throw
    10 FAIL utf-16le - unmatched surrogate lead assert_equals: expected "\ufffd\0" but got "�\0"
    11 FAIL utf-16le - unmatched surrogate lead (fatal flag set) assert_throws_js: function "function () {
    12             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    13         }" did not throw
    14 FAIL utf-16le - unmatched surrogate trail assert_equals: expected "\ufffd\0" but got "�\0"
    15 FAIL utf-16le - unmatched surrogate trail (fatal flag set) assert_throws_js: function "function () {
    16             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    17         }" did not throw
    18 FAIL utf-16le - swapped surrogate pair assert_equals: expected "\ufffd\ufffd" but got "��"
    19 FAIL utf-16le - swapped surrogate pair (fatal flag set) assert_throws_js: function "function () {
    20             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    21         }" did not throw
     2PASS utf-16le - lone surrogate lead
     3PASS utf-16le - lone surrogate lead (fatal flag set)
     4PASS utf-16le - lone surrogate trail
     5PASS utf-16le - lone surrogate trail (fatal flag set)
     6PASS utf-16le - unmatched surrogate lead
     7PASS utf-16le - unmatched surrogate lead (fatal flag set)
     8PASS utf-16le - unmatched surrogate trail
     9PASS utf-16le - unmatched surrogate trail (fatal flag set)
     10PASS utf-16le - swapped surrogate pair
     11PASS utf-16le - swapped surrogate pair (fatal flag set)
    2212
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-utf16-surrogates.any.worker-expected.txt

    r256730 r266457  
    11
    2 FAIL utf-16le - lone surrogate lead assert_equals: expected "\ufffd" but got "�"
    3 FAIL utf-16le - lone surrogate lead (fatal flag set) assert_throws_js: function "function () {
    4             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    5         }" did not throw
    6 FAIL utf-16le - lone surrogate trail assert_equals: expected "\ufffd" but got "�"
    7 FAIL utf-16le - lone surrogate trail (fatal flag set) assert_throws_js: function "function () {
    8             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    9         }" did not throw
    10 FAIL utf-16le - unmatched surrogate lead assert_equals: expected "\ufffd\0" but got "�\0"
    11 FAIL utf-16le - unmatched surrogate lead (fatal flag set) assert_throws_js: function "function () {
    12             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    13         }" did not throw
    14 FAIL utf-16le - unmatched surrogate trail assert_equals: expected "\ufffd\0" but got "�\0"
    15 FAIL utf-16le - unmatched surrogate trail (fatal flag set) assert_throws_js: function "function () {
    16             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    17         }" did not throw
    18 FAIL utf-16le - swapped surrogate pair assert_equals: expected "\ufffd\ufffd" but got "��"
    19 FAIL utf-16le - swapped surrogate pair (fatal flag set) assert_throws_js: function "function () {
    20             new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
    21         }" did not throw
     2PASS utf-16le - lone surrogate lead
     3PASS utf-16le - lone surrogate lead (fatal flag set)
     4PASS utf-16le - lone surrogate trail
     5PASS utf-16le - lone surrogate trail (fatal flag set)
     6PASS utf-16le - unmatched surrogate lead
     7PASS utf-16le - unmatched surrogate lead (fatal flag set)
     8PASS utf-16le - unmatched surrogate trail
     9PASS utf-16le - unmatched surrogate trail (fatal flag set)
     10PASS utf-16le - swapped surrogate pair
     11PASS utf-16le - swapped surrogate pair (fatal flag set)
    2212
  • trunk/Source/WebCore/ChangeLog

    r266454 r266457  
     1 2020-09-02  Alex Christensen  <achristensen@webkit.org>
     2
     3        Align UTF-16 decoder with Chrome, Firefox, and specification
     4        https://bugs.webkit.org/show_bug.cgi?id=216058
     5
     6        Reviewed by Youenn Fablet.
     7
     8        Covered by web platform tests.
     9
     10        * platform/text/TextCodecUTF16.cpp:
     11        (WebCore::TextCodecUTF16::decode):
     12        * platform/text/TextCodecUTF16.h:
     13        (): Deleted.
     14
    1 15 2020-09-02  Youenn Fablet  <youenn@apple.com>
    2 16
  • trunk/Source/WebCore/platform/text/TextCodecUTF16.cpp

    r248846 r266457  
    2828
    2929#include <wtf/text/CString.h>
     30#include <wtf/text/StringBuilder.h>
    3031#include <wtf/text/WTFString.h>
    3132
     
    6263}
    6364
    64 String TextCodecUTF16::decode(const char* bytes, size_t length, bool, bool, bool&)
     65// https://encoding.spec.whatwg.org/#shared-utf-16-decoder
     66String TextCodecUTF16::decode(const char* bytes, size_t length, bool flush, bool, bool& sawError)
    6567{
    66     if (!length)
    67         return String();
     68    const auto* p = reinterpret_cast<const uint8_t*>(bytes);
     69    const auto* const end = p + length;
     70    const auto* const endMinusOneOrNull = end ? end - 1 : nullptr;
    6871
    69     // FIXME: This should generate an error if there is an unpaired surrogate.
     72    StringBuilder result;
     73    result.reserveCapacity(length / 2);
    7074
    71     const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
    72     size_t numBytes = length + m_haveBufferedByte;
    73     size_t numCodeUnits = numBytes / 2;
    74     RELEASE_ASSERT(numCodeUnits <= std::numeric_limits<unsigned>::max());
     75    Function<void(UChar)> processBytesShared;
     76    processBytesShared = [&] (UChar codeUnit) {
     77        if (m_leadSurrogate) {
     78            auto leadSurrogate = *std::exchange(m_leadSurrogate, WTF::nullopt);
     79            if (codeUnit >= 0xDC00 && codeUnit <= 0xDFFF) {
     80                result.appendCharacter(0x10000 + ((leadSurrogate - 0xD800) << 10) + codeUnit - 0xDC00);
     81                return;
     82            }
     83            sawError = true;
     84            result.append(replacementCharacter);
     85            processBytesShared(codeUnit);
     86            return;
     87        }
     88        if (codeUnit >= 0xD800 && codeUnit <= 0xDBFF) {
     89            m_leadSurrogate = codeUnit;
     90            return;
     91        }
     92        if (codeUnit >= 0xDC00 && codeUnit <=0xDFFF) {
     93            sawError = true;
     94            result.append(replacementCharacter);
     95            return;
     96        }
     97        result.append(codeUnit);
     98    };
     99    auto processBytesLE = [&] (uint8_t first, uint8_t second) {
     100        processBytesShared(first | (second << 8));
     101    };
     102    auto processBytesBE = [&] (uint8_t first, uint8_t second) {
     103        processBytesShared((first << 8) | second);
     104    };
    75105
    76     UChar* q;
    77     auto result = String::createUninitialized(numCodeUnits, q);
    78 
    79     if (m_haveBufferedByte) {
    80         UChar c;
     106    if (m_leadByte && p < end) {
     107        auto leadByte = *std::exchange(m_leadByte, WTF::nullopt);
    81108        if (m_littleEndian)
    82             c = m_bufferedByte | (p[0] << 8);
     109            processBytesLE(leadByte, p[0]);
    83110        else
    84             c = (m_bufferedByte << 8) | p[0];
    85         *q++ = c;
    86         m_haveBufferedByte = false;
    87         p += 1;
    88         numCodeUnits -= 1;
     111            processBytesBE(leadByte, p[0]);
     112        p++;
    89113    }
    90114
    91115    if (m_littleEndian) {
    92         for (size_t i = 0; i < numCodeUnits; ++i) {
    93             UChar c = p[0] | (p[1] << 8);
     116        while (p < endMinusOneOrNull) {
     117            processBytesLE(p[0], p[1]);
    94118            p += 2;
    95             *q++ = c;
    96119        }
    97120    } else {
    98         for (size_t i = 0; i < numCodeUnits; ++i) {
    99             UChar c = (p[0] << 8) | p[1];
     121        while (p < endMinusOneOrNull) {
     122            processBytesBE(p[0], p[1]);
    100123            p += 2;
    101             *q++ = c;
    102124        }
    103125    }
    104126
    105     if (numBytes & 1) {
    106         ASSERT(!m_haveBufferedByte);
    107         m_haveBufferedByte = true;
    108         m_bufferedByte = p[0];
     127    if (p && p == endMinusOneOrNull) {
     128        ASSERT(!m_leadByte);
     129        m_leadByte = p[0];
     130    } else
     131        ASSERT(!p || p == end);
     132   
     133    if (flush && (m_leadByte || m_leadSurrogate)) {
     134        m_leadByte = WTF::nullopt;
     135        m_leadSurrogate = WTF::nullopt;
     136        sawError = true;
     137        result.append(replacementCharacter);
    109138    }
    110139
    111     return result;
     140    return result.toString();
    112141}
    113142
  • trunk/Source/WebCore/platform/text/TextCodecUTF16.h

    r225618 r266457  
    4242
    4343    bool m_littleEndian;
    44     bool m_haveBufferedByte { false };
    45     unsigned char m_bufferedByte;
     44    Optional<uint8_t> m_leadByte;
     45    Optional<UChar> m_leadSurrogate;
    4646};
    4747
Note: See TracChangeset for help on using the changeset viewer.