Changeset 287024 in webkit


Ignore:
Timestamp:
Dec 14, 2021 8:29:11 AM (7 months ago)
Author:
commit-queue@webkit.org
Message:

TextDecoder doesn't detect invalid UTF-8 sequences early enough
https://bugs.webkit.org/show_bug.cgi?id=233921

Patch by Andreu Botella <andreu@andreubotella.com> on 2021-12-14
Reviewed by Darin Adler.

LayoutTests/imported/w3c:

Import WPT tests from
https://github.com/web-platform-tests/wpt/pull/31537.

  • web-platform-tests/encoding/textdecoder-eof.any.js:

(test):

  • web-platform-tests/encoding/textdecoder-streaming.any-expected.txt:
  • web-platform-tests/encoding/textdecoder-streaming.any.js:

(string_appeared_here.forEach.):
(string_appeared_here.forEach.test):
(string_appeared_here.forEach):

  • web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt:

Source/WebCore/PAL:

In streaming mode, when TextCodecUTF8 found a lead byte for which a
valid sequence would span longer than the currently available bytes, it
used to defer any processing of that sequence until all such bytes were
available, even if errors could be detected earlier. Additionally, if
the stream was flushed at that point, it would emit a single replacement
character, regardless of whether the remaining bytes formed a valid
sequence, even if they had lead bytes, resulting in skipped characters.
Both issues are solved by always checking the validity of partial
sequences.

The approach used in this patch uses decodeNonASCIISequence to find
the length of the maximal subpart of a partial sequence, and if the
length is equal to the partial sequence size and we're not at EOF, we
don't emit the error. This is enough to handle the missing characters at
EOF, and when combined with changing the condition of the outer do-while
loops in the decode method from flush && m_partialSequenceSize to
only m_partialSequenceSize, it also fixes the streaming issue.

This patch is a port of
https://chromium-review.googlesource.com/c/chromium/src/+/3263938

Tests: imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.html

imported/w3c/web-platform-tests/encoding/textdecoder-stream.any.html

  • pal/text/TextCodecUTF8.cpp:

(PAL::TextCodecUTF8::handlePartialSequence): Changed to always process
partial sequences.
(PAL::TextCodecUTF8::decode): Changed the loop condition of the outer
do-while loops to not depend on flush.

Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/imported/w3c/ChangeLog

    r287023 r287024  
     12021-12-14  Andreu Botella  <andreu@andreubotella.com>
     2
     3        TextDecoder doesn't detect invalid UTF-8 sequences early enough
     4        https://bugs.webkit.org/show_bug.cgi?id=233921
     5
     6        Reviewed by Darin Adler.
     7
     8        Import WPT tests from
     9        https://github.com/web-platform-tests/wpt/pull/31537.
     10
     11        * web-platform-tests/encoding/textdecoder-eof.any.js:
     12        (test):
     13        * web-platform-tests/encoding/textdecoder-streaming.any-expected.txt:
     14        * web-platform-tests/encoding/textdecoder-streaming.any.js:
     15        (string_appeared_here.forEach.):
     16        (string_appeared_here.forEach.test):
     17        (string_appeared_here.forEach):
     18        * web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt:
     19
    1202021-12-14  Rob Buis  <rbuis@igalia.com>
    221
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.js

    r279513 r287024  
    11test(() => {
     2  // Truncated sequences
    23  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0])), "\uFFFD");
    34  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F])), "\uFFFD");
    45  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x92])), "\uFFFD");
     6
     7  // Errors near end-of-queue
     8  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x41])), "\uFFFDA");
     9  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0x42])), "\uFFFDAB");
     10  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0xF0])), "\uFFFDA\uFFFD");
     11  assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x8F, 0x92])), "\uFFFD\uFFFD\uFFFD");
    512}, "TextDecoder end-of-queue handling");
    613
     
    1623  decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true });
    1724  assert_equals(decoder.decode(new Uint8Array([0x92])), "\uFFFD");
     25
     26  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true }), "");
     27  assert_equals(decoder.decode(new Uint8Array([0x41]), { stream: true }), "\uFFFDA");
     28  assert_equals(decoder.decode(), "");
     29
     30  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0x42]), { stream: true }), "\uFFFDAB");
     31  assert_equals(decoder.decode(), "");
     32
     33  assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0xF0]), { stream: true }), "\uFFFDA");
     34  assert_equals(decoder.decode(), "\uFFFD");
     35
     36  assert_equals(decoder.decode(new Uint8Array([0xF0]), { stream: true }), "");
     37  assert_equals(decoder.decode(new Uint8Array([0x8F]), { stream: true }), "\uFFFD\uFFFD");
     38  assert_equals(decoder.decode(new Uint8Array([0x92]), { stream: true }), "\uFFFD");
     39  assert_equals(decoder.decode(), "");
    1840}, "TextDecoder end-of-queue handling using stream: true");
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any-expected.txt

    r267659 r287024  
    1515PASS Streaming decode: utf-16be, 4 byte window (ArrayBuffer)
    1616PASS Streaming decode: utf-16be, 5 byte window (ArrayBuffer)
     17PASS Streaming decode: UTF-8 chunk tests (ArrayBuffer)
    1718PASS Streaming decode: utf-8, 1 byte window (SharedArrayBuffer)
    1819PASS Streaming decode: utf-8, 2 byte window (SharedArrayBuffer)
     
    3031PASS Streaming decode: utf-16be, 4 byte window (SharedArrayBuffer)
    3132PASS Streaming decode: utf-16be, 5 byte window (SharedArrayBuffer)
     33PASS Streaming decode: UTF-8 chunk tests (SharedArrayBuffer)
    3234
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.js

    r264561 r287024  
    2929                for (var i = 0; i < encoded.length; i += len) {
    3030                    var sub = [];
    31                     for (var j = i; j < encoded.length && j < i + len; ++j)
     31                    for (var j = i; j < encoded.length && j < i + len; ++j) {
    3232                        sub.push(encoded[j]);
    33                         var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length));
    34                         uintArray.set(sub);
     33                    }
     34                    var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length));
     35                    uintArray.set(sub);
    3536                    out += decoder.decode(uintArray, {stream: true});
    3637                }
     
    4041        }
    4142    });
     43
     44    test(() => {
     45        function bytes(byteArray) {
     46            const view = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, byteArray.length));
     47            view.set(byteArray);
     48            return view;
     49        }
     50
     51        const decoder = new TextDecoder();
     52
     53        assert_equals(decoder.decode(bytes([0xC1]), {stream: true}), "\uFFFD");
     54        assert_equals(decoder.decode(), "");
     55
     56        assert_equals(decoder.decode(bytes([0xF5]), {stream: true}), "\uFFFD");
     57        assert_equals(decoder.decode(), "");
     58
     59        assert_equals(decoder.decode(bytes([0xE0, 0x41]), {stream: true}), "\uFFFDA");
     60        assert_equals(decoder.decode(bytes([0x42])), "B");
     61
     62        assert_equals(decoder.decode(bytes([0xE0, 0x80]), {stream: true}), "\uFFFD\uFFFD");
     63        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
     64
     65        assert_equals(decoder.decode(bytes([0xED, 0xA0]), {stream: true}), "\uFFFD\uFFFD");
     66        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
     67
     68        assert_equals(decoder.decode(bytes([0xF0, 0x41]), {stream: true}), "\uFFFDA");
     69        assert_equals(decoder.decode(bytes([0x42]), {stream: true}), "B");
     70        assert_equals(decoder.decode(bytes([0x43])), "C");
     71
     72        assert_equals(decoder.decode(bytes([0xF0, 0x80]), {stream: true}), "\uFFFD\uFFFD");
     73        assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD");
     74        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
     75
     76        assert_equals(decoder.decode(bytes([0xF4, 0xA0]), {stream: true}), "\uFFFD\uFFFD");
     77        assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD");
     78        assert_equals(decoder.decode(bytes([0x80])), "\uFFFD");
     79
     80        assert_equals(decoder.decode(bytes([0xF0, 0x90, 0x41]), {stream: true}), "\uFFFDA");
     81        assert_equals(decoder.decode(bytes([0x42])), "B");
     82
     83        // 4-byte UTF-8 sequences always correspond to non-BMP characters. Here
     84        // we make sure that, although the first 3 bytes are enough to emit the
     85        // lead surrogate, it only gets emitted when the fourth byte is read.
     86        assert_equals(decoder.decode(bytes([0xF0, 0x9F, 0x92]), {stream: true}), "");
     87        assert_equals(decoder.decode(bytes([0xA9])), "\u{1F4A9}");
     88    }, `Streaming decode: UTF-8 chunk tests (${arrayBufferOrSharedArrayBuffer})`);
    4289})
  • trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt

    r267659 r287024  
    1515PASS Streaming decode: utf-16be, 4 byte window (ArrayBuffer)
    1616PASS Streaming decode: utf-16be, 5 byte window (ArrayBuffer)
     17PASS Streaming decode: UTF-8 chunk tests (ArrayBuffer)
    1718PASS Streaming decode: utf-8, 1 byte window (SharedArrayBuffer)
    1819PASS Streaming decode: utf-8, 2 byte window (SharedArrayBuffer)
     
    3031PASS Streaming decode: utf-16be, 4 byte window (SharedArrayBuffer)
    3132PASS Streaming decode: utf-16be, 5 byte window (SharedArrayBuffer)
     33PASS Streaming decode: UTF-8 chunk tests (SharedArrayBuffer)
    3234
  • trunk/Source/WebCore/PAL/ChangeLog

    r287015 r287024  
     12021-12-14  Andreu Botella  <andreu@andreubotella.com>
     2
     3        TextDecoder doesn't detect invalid UTF-8 sequences early enough
     4        https://bugs.webkit.org/show_bug.cgi?id=233921
     5
     6        Reviewed by Darin Adler.
     7
     8        In streaming mode, when TextCodecUTF8 found a lead byte for which a
     9        valid sequence would span longer than the currently available bytes, it
     10        used to defer any processing of that sequence until all such bytes were
     11        available, even if errors could be detected earlier. Additionally, if
     12        the stream was flushed at that point, it would emit a single replacement
     13        character, regardless of whether the remaining bytes formed a valid
     14        sequence, even if they had lead bytes, resulting in skipped characters.
     15        Both issues are solved by always checking the validity of partial
     16        sequences.
     17
     18        The approach used in this patch uses `decodeNonASCIISequence` to find
     19        the length of the maximal subpart of a partial sequence, and if the
     20        length is equal to the partial sequence size and we're not at EOF, we
     21        don't emit the error. This is enough to handle the missing characters at
     22        EOF, and when combined with changing the condition of the outer do-while
     23        loops in the `decode` method from `flush && m_partialSequenceSize` to
     24        only `m_partialSequenceSize`, it also fixes the streaming issue.
     25
     26        This patch is a port of
     27        https://chromium-review.googlesource.com/c/chromium/src/+/3263938
     28
     29        Tests: imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.html
     30               imported/w3c/web-platform-tests/encoding/textdecoder-stream.any.html
     31
     32        * pal/text/TextCodecUTF8.cpp:
     33        (PAL::TextCodecUTF8::handlePartialSequence): Changed to always process
     34        partial sequences.
     35        (PAL::TextCodecUTF8::decode): Changed the loop condition of the outer
     36        do-while loops to not depend on `flush`.
     37
    1382021-12-14  Ben Nham  <nham@apple.com>
    239
  • trunk/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp

    r286772 r287024  
    190190            return true;
    191191
     192        // Copy from `source` until we have `count` bytes.
     193        if (count > m_partialSequenceSize && end > source) {
     194            size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, end - source);
     195            memcpy(m_partialSequence + m_partialSequenceSize, source, additionalBytes);
     196            source += additionalBytes;
     197            m_partialSequenceSize += additionalBytes;
     198        }
     199
     200        // If we still don't have `count` bytes, fill the rest with zeros (any
     201        // other lead byte would do), so we can run `decodeNonASCIISequence` to
     202        // tell if the chunk that we have is valid. These bytes are not part of
     203        // the partial sequence, so don't increment `m_partialSequenceSize`.
     204        bool partialSequenceIsTooShort = false;
    192205        if (count > m_partialSequenceSize) {
    193             if (count - m_partialSequenceSize > end - source) {
    194                 if (!flush) {
    195                     // The new data is not enough to complete the sequence, so
    196                     // add it to the existing partial sequence.
    197                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    198                     m_partialSequenceSize += end - source;
    199                     return false;
    200                 }
    201                 // An incomplete partial sequence at the end is an error, but it will create
    202                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
    203                 // the error.
    204                 return true;
    205             }
    206             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    207             source += count - m_partialSequenceSize;
    208             m_partialSequenceSize = count;
    209         }
     206            partialSequenceIsTooShort = true;
     207            memset(m_partialSequence + m_partialSequenceSize, 0, count - m_partialSequenceSize);
     208        }
     209
    210210        int character = decodeNonASCIISequence(m_partialSequence, count);
     211        if (partialSequenceIsTooShort) {
     212            ASSERT(character == nonCharacter);
     213            ASSERT(count <= m_partialSequenceSize);
     214            // If we're not at the end, and the partial sequence that we have is
     215            // incomplete but otherwise valid, a non-character is not an error.
     216            if (!flush && count == m_partialSequenceSize)
     217                return false;
     218        }
     219
    211220        if (!isLatin1(character))
    212221            return true;
     
    237246            continue;
    238247        }
     248
     249        // Copy from `source` until we have `count` bytes.
     250        if (count > m_partialSequenceSize && end > source) {
     251            size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, end - source);
     252            memcpy(m_partialSequence + m_partialSequenceSize, source, additionalBytes);
     253            source += additionalBytes;
     254            m_partialSequenceSize += additionalBytes;
     255        }
     256
     257        // If we still don't have `count` bytes, fill the rest with zeros (any
     258        // other lead byte would do), so we can run `decodeNonASCIISequence` to
     259        // tell if the chunk that we have is valid. These bytes are not part of
     260        // the partial sequence, so don't increment `m_partialSequenceSize`.
     261        bool partialSequenceIsTooShort = false;
    239262        if (count > m_partialSequenceSize) {
    240             if (count - m_partialSequenceSize > end - source) {
    241                 if (!flush) {
    242                     // The new data is not enough to complete the sequence, so
    243                     // add it to the existing partial sequence.
    244                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    245                     m_partialSequenceSize += end - source;
    246                     return;
    247                 }
    248                 // An incomplete partial sequence at the end is an error.
    249                 sawError = true;
    250                 if (stopOnError)
    251                     return;
    252                 *destination++ = replacementCharacter;
    253                 m_partialSequenceSize = 0;
    254                 source = end;
    255                 continue;
    256             }
    257             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    258             source += count - m_partialSequenceSize;
    259             m_partialSequenceSize = count;
    260         }
     263            partialSequenceIsTooShort = true;
     264            memset(m_partialSequence + m_partialSequenceSize, 0, count - m_partialSequenceSize);
     265        }
     266
    261267        int character = decodeNonASCIISequence(m_partialSequence, count);
     268        if (partialSequenceIsTooShort) {
     269            ASSERT(character == nonCharacter);
     270            ASSERT(count <= m_partialSequenceSize);
     271            // If we're not at the end, and the partial sequence that we have is
     272            // incomplete but otherwise valid, a non-character is not an error.
     273            if (!flush && count == m_partialSequenceSize)
     274                return;
     275        }
     276
    262277        if (character == nonCharacter) {
    263278            sawError = true;
     
    354369            *destination++ = character;
    355370        }
    356     } while (flush && m_partialSequenceSize);
     371    } while (m_partialSequenceSize);
    357372
    358373    buffer.shrink(destination - buffer.characters());
     
    434449            destination16 = appendCharacter(destination16, character);
    435450        }
    436     } while (flush && m_partialSequenceSize);
     451    } while (m_partialSequenceSize);
    437452
    438453    buffer16.shrink(destination16 - buffer16.characters());
Note: See TracChangeset for help on using the changeset viewer.