Changeset 266528 in webkit


Ignore:
Timestamp:
Sep 3, 2020, 9:57:49 AM (5 years ago)
Author:
achristensen@apple.com
Message:

TextDecoder should ignore byte-order-mark like other browsers and spec
https://bugs.webkit.org/show_bug.cgi?id=216108

Reviewed by Darin Adler.

LayoutTests/imported/w3c:

  • web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt:
  • web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt:

Source/WebCore:

Covered by newly passing web platform tests.

  • dom/TextDecoder.cpp:

(WebCore::TextDecoder::ignoreBOMIfNecessary):
(WebCore::TextDecoder::decode):
(WebCore::TextDecoder::prependBOMIfNecessary): Deleted.

  • dom/TextDecoder.h:
Location:
trunk
Files:
10 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/imported/w3c/ChangeLog

    r266527 r266528  
     1 2020-09-03  Alex Christensen  <achristensen@webkit.org>
     2
     3        TextDecoder should ignore byte-order-mark like other browsers and spec
     4        https://bugs.webkit.org/show_bug.cgi?id=216108
     5
     6        Reviewed by Darin Adler.
     7
     8        * web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt:
     9        * web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt:
     10
    1 11 2020-09-03  Alex Christensen  <achristensen@webkit.org>
    212
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any-expected.txt

    r266348 r266528  
    11
    22PASS ignoreBOM should work for encoding utf-8, split at character 0
    3 FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
    4 FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     3PASS ignoreBOM should work for encoding utf-8, split at character 1
     4PASS ignoreBOM should work for encoding utf-8, split at character 2
    55PASS ignoreBOM should work for encoding utf-8, split at character 3
    66PASS ignoreBOM should work for encoding utf-16le, split at character 0
    7 FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     7PASS ignoreBOM should work for encoding utf-16le, split at character 1
    88PASS ignoreBOM should work for encoding utf-16le, split at character 2
    9 FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "\uFEFFabc" but got "abc"
     9PASS ignoreBOM should work for encoding utf-16le, split at character 3
    1010PASS ignoreBOM should work for encoding utf-16be, split at character 0
    11 FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     11PASS ignoreBOM should work for encoding utf-16be, split at character 1
    1212PASS ignoreBOM should work for encoding utf-16be, split at character 2
    13 FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "\uFEFFabc" but got "abc"
     13PASS ignoreBOM should work for encoding utf-16be, split at character 3
    1414
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/streams/decode-ignore-bom.any.worker-expected.txt

    r266348 r266528  
    11
    22PASS ignoreBOM should work for encoding utf-8, split at character 0
    3 FAIL ignoreBOM should work for encoding utf-8, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
    4 FAIL ignoreBOM should work for encoding utf-8, split at character 2 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     3PASS ignoreBOM should work for encoding utf-8, split at character 1
     4PASS ignoreBOM should work for encoding utf-8, split at character 2
    55PASS ignoreBOM should work for encoding utf-8, split at character 3
    66PASS ignoreBOM should work for encoding utf-16le, split at character 0
    7 FAIL ignoreBOM should work for encoding utf-16le, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     7PASS ignoreBOM should work for encoding utf-16le, split at character 1
    88PASS ignoreBOM should work for encoding utf-16le, split at character 2
    9 FAIL ignoreBOM should work for encoding utf-16le, split at character 3 assert_equals: BOM should be preserved expected "\uFEFFabc" but got "abc"
     9PASS ignoreBOM should work for encoding utf-16le, split at character 3
    1010PASS ignoreBOM should work for encoding utf-16be, split at character 0
    11 FAIL ignoreBOM should work for encoding utf-16be, split at character 1 assert_equals: BOM should be stripped expected "abc" but got "\uFEFFabc"
     11PASS ignoreBOM should work for encoding utf-16be, split at character 1
    1212PASS ignoreBOM should work for encoding utf-16be, split at character 2
    13 FAIL ignoreBOM should work for encoding utf-16be, split at character 3 assert_equals: BOM should be preserved expected "\uFEFFabc" but got "abc"
     13PASS ignoreBOM should work for encoding utf-16be, split at character 3
    1414
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any-expected.txt

    r264561 r266528  
    11
    2 FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "\uFEFF@"
    3 FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "\uFEFF@"
     2PASS Modify buffer after passing it in (ArrayBuffer)
     3PASS Modify buffer after passing it in (SharedArrayBuffer)
    44
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-copy.any.worker-expected.txt

    r264561 r266528  
    11
    2 FAIL Modify buffer after passing it in (ArrayBuffer) assert_equals: expected "@" but got "\uFEFF@"
    3 FAIL Modify buffer after passing it in (SharedArrayBuffer) assert_equals: expected "@" but got "\uFEFF@"
     2PASS Modify buffer after passing it in (ArrayBuffer)
     3PASS Modify buffer after passing it in (SharedArrayBuffer)
    44
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any-expected.txt

    r256730 r266528  
    11
    2 FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
    3 FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
    4 FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
     2PASS BOM is ignored if ignoreBOM option is specified: utf-8
     3PASS BOM is ignored if ignoreBOM option is specified: utf-16le
     4PASS BOM is ignored if ignoreBOM option is specified: utf-16be
    55PASS The ignoreBOM attribute of TextDecoder
    66
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-ignorebom.any.worker-expected.txt

    r256730 r266528  
    11
    2 FAIL BOM is ignored if ignoreBOM option is specified: utf-8 assert_equals: utf-8: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
    3 FAIL BOM is ignored if ignoreBOM option is specified: utf-16le assert_equals: utf-16le: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
    4 FAIL BOM is ignored if ignoreBOM option is specified: utf-16be assert_equals: utf-16be: BOM should be present in decoded string if ignored by a reused decoder expected "\uFEFFabc" but got "abc"
     2PASS BOM is ignored if ignoreBOM option is specified: utf-8
     3PASS BOM is ignored if ignoreBOM option is specified: utf-16le
     4PASS BOM is ignored if ignoreBOM option is specified: utf-16be
    55PASS The ignoreBOM attribute of TextDecoder
    66
  • trunk/Source/WebCore/ChangeLog

    r266527 r266528  
     1 2020-09-03  Alex Christensen  <achristensen@webkit.org>
     2
     3        TextDecoder should ignore byte-order-mark like other browsers and spec
     4        https://bugs.webkit.org/show_bug.cgi?id=216108
     5
     6        Reviewed by Darin Adler.
     7
     8        Covered by newly passing web platform tests.
     9
     10        * dom/TextDecoder.cpp:
     11        (WebCore::TextDecoder::ignoreBOMIfNecessary):
     12        (WebCore::TextDecoder::decode):
     13        (WebCore::TextDecoder::prependBOMIfNecessary): Deleted.
     14        * dom/TextDecoder.h:
     15
    1 16 2020-09-03  Alex Christensen  <achristensen@webkit.org>
    217
  • trunk/Source/WebCore/dom/TextDecoder.cpp

    r243163 r266528  
    4949}
    5050
    51 void TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length)
     51constexpr uint8_t utf8BOMBytes[3] { 0xEF, 0xBB, 0xBF };
     52constexpr uint8_t utf16BEBOMBytes[2] { 0xFE, 0xFF };
     53constexpr uint8_t utf16LEBOMBytes[2] { 0xFF, 0xFE };
     54
     55size_t TextDecoder::bytesNeededForFullBOMIgnoreCheck() const
    5256{
    53     const uint8_t utf8BOMBytes[3] = {0xEF, 0xBB, 0xBF};
    54     const uint8_t utf16BEBOMBytes[2] = {0xFE, 0xFF};
    55     const uint8_t utf16LEBOMBytes[2] = {0xFF, 0xFE};
     57    if (m_textEncoding == UTF8Encoding())
     58        return sizeof(utf8BOMBytes);
     59    if (m_textEncoding == UTF16BigEndianEncoding())
     60        return sizeof(utf16BEBOMBytes);
     61    if (m_textEncoding == UTF16LittleEndianEncoding())
     62        return sizeof(utf16LEBOMBytes);
     63    return 0;
     64}
     65
     66bool TextDecoder::isBeginningOfIncompleteBOM(const uint8_t* bytes, size_t length) const
     67{
     68    if (!length)
     69        return true;
     70
     71    if (m_textEncoding == UTF8Encoding()) {
     72        if (length == 1)
     73            return bytes[0] == utf8BOMBytes[0];
     74        return length == 2 && bytes[0] == utf8BOMBytes[0] && bytes[1] == utf8BOMBytes[1];
     75    }
     76    if (m_textEncoding == UTF16BigEndianEncoding())
     77        return length == 1 && bytes[0] == utf16BEBOMBytes[0];
     78    if (m_textEncoding == UTF16LittleEndianEncoding())
     79        return length == 1 && bytes[0] == utf16LEBOMBytes[0];
     80
     81    return false;
     82}
     83
     84auto TextDecoder::ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream) -> WaitForMoreBOMBytes
     85{
     86    if (m_bomIgnoredIfNecessary || m_options.ignoreBOM)
     87        return WaitForMoreBOMBytes::No;
     88
     89    if (stream && length < bytesNeededForFullBOMIgnoreCheck()) {
     90        if (isBeginningOfIncompleteBOM(data, length))
     91            return WaitForMoreBOMBytes::Yes;
     92        m_bomIgnoredIfNecessary = true;
     93        return WaitForMoreBOMBytes::No;
     94    }
    5695
    5796    if (m_textEncoding == UTF8Encoding()
     
    75114        length -= sizeof(utf16LEBOMBytes);
    76115    }
    77 }
    78 
    79 String TextDecoder::prependBOMIfNecessary(const String& decoded)
    80 {
    81     if (m_hasDecoded || !m_options.ignoreBOM)
    82         return decoded;
    83     const UChar utf16BEBOM[2] = {0xFEFF, '\0'};
    84 
    85     // FIXME: Make TextCodec::decode take a flag for prepending BOM so we don't need to do this extra allocation and copy.
    86     return makeString(utf16BEBOM, decoded);
     116    m_bomIgnoredIfNecessary = true;
     117    return WaitForMoreBOMBytes::No;
    87118}
    88119
     
    103134    }
    104135
    105     ignoreBOMIfNecessary(data, length);
     136    if (!options.stream)
     137        m_bomIgnoredIfNecessary = false;
    106138
     139    bool alreadyBuffered = false;
    107140    if (m_buffer.size()) {
    108141        m_buffer.append(data, length);
    109142        data = m_buffer.data();
    110143        length = m_buffer.size();
     144        alreadyBuffered = true;
     145    }
     146
     147    if (ignoreBOMIfNecessary(data, length, options.stream) == WaitForMoreBOMBytes::Yes) {
     148        ASSERT(options.stream);
     149        if (!alreadyBuffered)
     150            m_buffer.append(data, length);
     151        return String();
    111152    }
    112153
     
    118159    String result;
    119160    if (!sawError)
    120         result = prependBOMIfNecessary(m_textEncoding.decode(charData, length, stopOnError, sawError));
     161        result = m_textEncoding.decode(charData, length, stopOnError, sawError);
    121162
    122163    if (sawError) {
     
    128169            if (m_options.fatal)
    129170                return Exception { TypeError };
    130             result = prependBOMIfNecessary(m_textEncoding.decode(charData, length));
     171            result = m_textEncoding.decode(charData, length);
    131172        }
    132173    } else
    133174        m_buffer.clear();
    134175
    135     m_hasDecoded = true;
    136176    return result;
    137177}
  • trunk/Source/WebCore/dom/TextDecoder.h

    r242776 r266528  
    5151
    5252private:
    53     String prependBOMIfNecessary(const String&);
    54     void ignoreBOMIfNecessary(const uint8_t*& data, size_t& length);
    5553    TextDecoder(const char*, Options);
     54
     55    enum class WaitForMoreBOMBytes : bool { No, Yes };
     56    WaitForMoreBOMBytes ignoreBOMIfNecessary(const uint8_t*& data, size_t& length, bool stream);
     57    size_t bytesNeededForFullBOMIgnoreCheck() const;
     58    bool isBeginningOfIncompleteBOM(const uint8_t*, size_t) const;
     59
    5660    TextEncoding m_textEncoding;
    5761    Options m_options;
    58     bool m_hasDecoded { false };
    5962    Vector<uint8_t> m_buffer;
     63    bool m_bomIgnoredIfNecessary { false };
    6064};
    6165
Note: See TracChangeset for help on using the changeset viewer.