Changeset 287024 in webkit
- Timestamp:
- Dec 14, 2021 8:29:11 AM (7 months ago)
- Location:
- trunk
- Files:
-
- 7 edited
-
LayoutTests/imported/w3c/ChangeLog (modified) (1 diff)
-
LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.js (modified) (2 diffs)
-
LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any-expected.txt (modified) (2 diffs)
-
LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.js (modified) (2 diffs)
-
LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt (modified) (2 diffs)
-
Source/WebCore/PAL/ChangeLog (modified) (1 diff)
-
Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/LayoutTests/imported/w3c/ChangeLog
r287023 r287024 1 2021-12-14 Andreu Botella <andreu@andreubotella.com> 2 3 TextDecoder doesn't detect invalid UTF-8 sequences early enough 4 https://bugs.webkit.org/show_bug.cgi?id=233921 5 6 Reviewed by Darin Adler. 7 8 Import WPT tests from 9 https://github.com/web-platform-tests/wpt/pull/31537. 10 11 * web-platform-tests/encoding/textdecoder-eof.any.js: 12 (test): 13 * web-platform-tests/encoding/textdecoder-streaming.any-expected.txt: 14 * web-platform-tests/encoding/textdecoder-streaming.any.js: 15 (string_appeared_here.forEach.): 16 (string_appeared_here.forEach.test): 17 (string_appeared_here.forEach): 18 * web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt: 19 1 20 2021-12-14 Rob Buis <rbuis@igalia.com> 2 21 -
trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.js
r279513 r287024 1 1 test(() => { 2 // Truncated sequences 2 3 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0])), "\uFFFD"); 3 4 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F])), "\uFFFD"); 4 5 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x92])), "\uFFFD"); 6 7 // Errors near end-of-queue 8 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x9F, 0x41])), "\uFFFDA"); 9 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0x42])), "\uFFFDAB"); 10 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x41, 0xF0])), "\uFFFDA\uFFFD"); 11 assert_equals(new TextDecoder().decode(new Uint8Array([0xF0, 0x8F, 0x92])), "\uFFFD\uFFFD\uFFFD"); 5 12 }, "TextDecoder end-of-queue handling"); 6 13 … … 16 23 decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true }); 17 24 assert_equals(decoder.decode(new Uint8Array([0x92])), "\uFFFD"); 25 26 assert_equals(decoder.decode(new Uint8Array([0xF0, 0x9F]), { stream: true }), ""); 27 assert_equals(decoder.decode(new Uint8Array([0x41]), { stream: true }), "\uFFFDA"); 28 assert_equals(decoder.decode(), ""); 29 30 assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0x42]), { stream: true }), "\uFFFDAB"); 31 assert_equals(decoder.decode(), ""); 32 33 assert_equals(decoder.decode(new Uint8Array([0xF0, 0x41, 0xF0]), { stream: true }), "\uFFFDA"); 34 assert_equals(decoder.decode(), "\uFFFD"); 35 36 assert_equals(decoder.decode(new Uint8Array([0xF0]), { stream: true }), ""); 37 assert_equals(decoder.decode(new Uint8Array([0x8F]), { stream: true }), "\uFFFD\uFFFD"); 38 assert_equals(decoder.decode(new Uint8Array([0x92]), { stream: true }), "\uFFFD"); 39 assert_equals(decoder.decode(), ""); 18 40 }, "TextDecoder end-of-queue handling using stream: true"); -
trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any-expected.txt
r267659 r287024 15 15 PASS Streaming decode: utf-16be, 4 byte window (ArrayBuffer) 16 16 PASS Streaming decode: utf-16be, 5 byte window (ArrayBuffer) 17 PASS Streaming decode: UTF-8 chunk tests (ArrayBuffer) 17 18 PASS Streaming decode: utf-8, 1 byte window (SharedArrayBuffer) 18 19 PASS Streaming decode: utf-8, 2 byte window (SharedArrayBuffer) … … 30 31 PASS Streaming decode: utf-16be, 4 byte window (SharedArrayBuffer) 31 32 PASS Streaming decode: utf-16be, 5 byte window (SharedArrayBuffer) 33 PASS Streaming decode: UTF-8 chunk tests (SharedArrayBuffer) 32 34 -
trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.js
r264561 r287024 29 29 for (var i = 0; i < encoded.length; i += len) { 30 30 var sub = []; 31 for (var j = i; j < encoded.length && j < i + len; ++j) 31 for (var j = i; j < encoded.length && j < i + len; ++j) { 32 32 sub.push(encoded[j]); 33 var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length)); 34 uintArray.set(sub); 33 } 34 var uintArray = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, sub.length)); 35 uintArray.set(sub); 35 36 out += decoder.decode(uintArray, {stream: true}); 36 37 } … … 40 41 } 41 42 }); 43 44 test(() => { 45 function bytes(byteArray) { 46 const view = new Uint8Array(createBuffer(arrayBufferOrSharedArrayBuffer, byteArray.length)); 47 view.set(byteArray); 48 return view; 49 } 50 51 const decoder = new TextDecoder(); 52 53 assert_equals(decoder.decode(bytes([0xC1]), {stream: true}), "\uFFFD"); 54 assert_equals(decoder.decode(), ""); 55 56 assert_equals(decoder.decode(bytes([0xF5]), {stream: true}), "\uFFFD"); 57 assert_equals(decoder.decode(), ""); 58 59 assert_equals(decoder.decode(bytes([0xE0, 0x41]), {stream: true}), "\uFFFDA"); 60 assert_equals(decoder.decode(bytes([0x42])), "B"); 61 62 assert_equals(decoder.decode(bytes([0xE0, 0x80]), {stream: true}), "\uFFFD\uFFFD"); 63 assert_equals(decoder.decode(bytes([0x80])), "\uFFFD"); 64 65 assert_equals(decoder.decode(bytes([0xED, 0xA0]), {stream: true}), "\uFFFD\uFFFD"); 66 assert_equals(decoder.decode(bytes([0x80])), "\uFFFD"); 67 68 assert_equals(decoder.decode(bytes([0xF0, 0x41]), {stream: true}), "\uFFFDA"); 69 assert_equals(decoder.decode(bytes([0x42]), {stream: true}), "B"); 70 assert_equals(decoder.decode(bytes([0x43])), "C"); 71 72 assert_equals(decoder.decode(bytes([0xF0, 0x80]), {stream: true}), "\uFFFD\uFFFD"); 73 assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD"); 74 assert_equals(decoder.decode(bytes([0x80])), "\uFFFD"); 75 76 assert_equals(decoder.decode(bytes([0xF4, 0xA0]), {stream: true}), "\uFFFD\uFFFD"); 77 assert_equals(decoder.decode(bytes([0x80]), {stream: true}), "\uFFFD"); 78 assert_equals(decoder.decode(bytes([0x80])), "\uFFFD"); 79 80 assert_equals(decoder.decode(bytes([0xF0, 0x90, 0x41]), {stream: true}), "\uFFFDA"); 81 assert_equals(decoder.decode(bytes([0x42])), "B"); 82 83 // 4-byte UTF-8 sequences always correspond to non-BMP characters. Here 84 // we make sure that, although the first 3 bytes are enough to emit the 85 // lead surrogate, it only gets emitted when the fourth byte is read. 86 assert_equals(decoder.decode(bytes([0xF0, 0x9F, 0x92]), {stream: true}), ""); 87 assert_equals(decoder.decode(bytes([0xA9])), "\u{1F4A9}"); 88 }, `Streaming decode: UTF-8 chunk tests (${arrayBufferOrSharedArrayBuffer})`); 42 89 }) -
trunk/LayoutTests/imported/w3c/web-platform-tests/encoding/textdecoder-streaming.any.worker-expected.txt
r267659 r287024 15 15 PASS Streaming decode: utf-16be, 4 byte window (ArrayBuffer) 16 16 PASS Streaming decode: utf-16be, 5 byte window (ArrayBuffer) 17 PASS Streaming decode: UTF-8 chunk tests (ArrayBuffer) 17 18 PASS Streaming decode: utf-8, 1 byte window (SharedArrayBuffer) 18 19 PASS Streaming decode: utf-8, 2 byte window (SharedArrayBuffer) … … 30 31 PASS Streaming decode: utf-16be, 4 byte window (SharedArrayBuffer) 31 32 PASS Streaming decode: utf-16be, 5 byte window (SharedArrayBuffer) 33 PASS Streaming decode: UTF-8 chunk tests (SharedArrayBuffer) 32 34 -
trunk/Source/WebCore/PAL/ChangeLog
r287015 r287024 1 2021-12-14 Andreu Botella <andreu@andreubotella.com> 2 3 TextDecoder doesn't detect invalid UTF-8 sequences early enough 4 https://bugs.webkit.org/show_bug.cgi?id=233921 5 6 Reviewed by Darin Adler. 7 8 In streaming mode, when TextCodecUTF8 found a lead byte for which a 9 valid sequence would span longer than the currently available bytes, it 10 used to defer any processing of that sequence until all such bytes were 11 available, even if errors could be detected earlier. Additionally, if 12 the stream was flushed at that point, it would emit a single replacement 13 character, regardless of whether the remaining bytes formed a valid 14 sequence, even if they had lead bytes, resulting in skipped characters. 15 Both issues are solved by always checking the validity of partial 16 sequences. 17 18 The approach used in this patch uses `decodeNonASCIISequence` to find 19 the length of the maximal subpart of a partial sequence, and if the 20 length is equal to the partial sequence size and we're not at EOF, we 21 don't emit the error. This is enough to handle the missing characters at 22 EOF, and when combined with changing the condition of the outer do-while 23 loops in the `decode` method from `flush && m_partialSequenceSize` to 24 only `m_partialSequenceSize`, it also fixes the streaming issue. 25 26 This patch is a port of 27 https://chromium-review.googlesource.com/c/chromium/src/+/3263938 28 29 Tests: imported/w3c/web-platform-tests/encoding/textdecoder-eof.any.html 30 imported/w3c/web-platform-tests/encoding/textdecoder-stream.any.html 31 32 * pal/text/TextCodecUTF8.cpp: 33 (PAL::TextCodecUTF8::handlePartialSequence): Changed to always process 34 partial sequences. 35 (PAL::TextCodecUTF8::decode): Changed the loop condition of the outer 36 do-while loops to not depend on `flush`. 37 1 38 2021-12-14 Ben Nham <nham@apple.com> 2 39 -
trunk/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp
r286772 r287024 190 190 return true; 191 191 192 // Copy from `source` until we have `count` bytes. 193 if (count > m_partialSequenceSize && end > source) { 194 size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, end - source); 195 memcpy(m_partialSequence + m_partialSequenceSize, source, additionalBytes); 196 source += additionalBytes; 197 m_partialSequenceSize += additionalBytes; 198 } 199 200 // If we still don't have `count` bytes, fill the rest with zeros (any 201 // other lead byte would do), so we can run `decodeNonASCIISequence` to 202 // tell if the chunk that we have is valid. These bytes are not part of 203 // the partial sequence, so don't increment `m_partialSequenceSize`. 204 bool partialSequenceIsTooShort = false; 192 205 if (count > m_partialSequenceSize) { 193 if (count - m_partialSequenceSize > end - source) { 194 if (!flush) { 195 // The new data is not enough to complete the sequence, so 196 // add it to the existing partial sequence. 197 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 198 m_partialSequenceSize += end - source; 199 return false; 200 } 201 // An incomplete partial sequence at the end is an error, but it will create 202 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 203 // the error. 204 return true; 205 } 206 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 207 source += count - m_partialSequenceSize; 208 m_partialSequenceSize = count; 209 } 206 partialSequenceIsTooShort = true; 207 memset(m_partialSequence + m_partialSequenceSize, 0, count - m_partialSequenceSize); 208 } 209 210 210 int character = decodeNonASCIISequence(m_partialSequence, count); 211 if (partialSequenceIsTooShort) { 212 ASSERT(character == nonCharacter); 213 ASSERT(count <= m_partialSequenceSize); 214 // If we're not at the end, and the partial sequence that we have is 215 // incomplete but otherwise valid, a non-character is not an error. 216 if (!flush && count == m_partialSequenceSize) 217 return false; 218 } 219 211 220 if (!isLatin1(character)) 212 221 return true; … … 237 246 continue; 238 247 } 248 249 // Copy from `source` until we have `count` bytes. 250 if (count > m_partialSequenceSize && end > source) { 251 size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, end - source); 252 memcpy(m_partialSequence + m_partialSequenceSize, source, additionalBytes); 253 source += additionalBytes; 254 m_partialSequenceSize += additionalBytes; 255 } 256 257 // If we still don't have `count` bytes, fill the rest with zeros (any 258 // other lead byte would do), so we can run `decodeNonASCIISequence` to 259 // tell if the chunk that we have is valid. These bytes are not part of 260 // the partial sequence, so don't increment `m_partialSequenceSize`. 261 bool partialSequenceIsTooShort = false; 239 262 if (count > m_partialSequenceSize) { 240 if (count - m_partialSequenceSize > end - source) { 241 if (!flush) { 242 // The new data is not enough to complete the sequence, so 243 // add it to the existing partial sequence. 244 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 245 m_partialSequenceSize += end - source; 246 return; 247 } 248 // An incomplete partial sequence at the end is an error. 249 sawError = true; 250 if (stopOnError) 251 return; 252 *destination++ = replacementCharacter; 253 m_partialSequenceSize = 0; 254 source = end; 255 continue; 256 } 257 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 258 source += count - m_partialSequenceSize; 259 m_partialSequenceSize = count; 260 } 263 partialSequenceIsTooShort = true; 264 memset(m_partialSequence + m_partialSequenceSize, 0, count - m_partialSequenceSize); 265 } 266 261 267 int character = decodeNonASCIISequence(m_partialSequence, count); 268 if (partialSequenceIsTooShort) { 269 ASSERT(character == nonCharacter); 270 ASSERT(count <= m_partialSequenceSize); 271 // If we're not at the end, and the partial sequence that we have is 272 // incomplete but otherwise valid, a non-character is not an error. 273 if (!flush && count == m_partialSequenceSize) 274 return; 275 } 276 262 277 if (character == nonCharacter) { 263 278 sawError = true; … … 354 369 *destination++ = character; 355 370 } 356 } while ( flush &&m_partialSequenceSize);371 } while (m_partialSequenceSize); 357 372 358 373 buffer.shrink(destination - buffer.characters()); … … 434 449 destination16 = appendCharacter(destination16, character); 435 450 } 436 } while ( flush &&m_partialSequenceSize);451 } while (m_partialSequenceSize); 437 452 438 453 buffer16.shrink(destination16 - buffer16.characters());
Note: See TracChangeset
for help on using the changeset viewer.