Changeset 24052 in webkit


Ignore:
Timestamp:
Jul 6, 2007 3:00:45 AM (17 years ago)
Author:
bdash
Message:

2007-07-06 Jungshik Shin <jungshik.shin@gmail.com>

Reviewed by Alexey.

  • Add UTF-32 encoding support: http://bugs.webkit.org/show_bug.cgi?id=13415

Test:

  • fast/encoding/utf-32-big-endian-bom.html
  • fast/encoding/utf-32-big-endian-nobom.xml
  • fast/encoding/utf-32-little-endian-bom.html
  • fast/encoding/utf-32-little-endian-nobom.xml

  • loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForBOM): (WebCore::TextResourceDecoder::checkForHeadCharset):
  • platform/TextDecoder.cpp: (WebCore::TextDecoder::checkForBOM):
  • platform/TextDecoder.h:
  • platform/TextEncoding.cpp: (WebCore::UTF32BigEndianEncoding): (WebCore::UTF32LittleEndianEncoding):
  • platform/TextEncoding.h:

2007-07-06 Jungshik Shin <jungshik.shin@gmail.com>

Reviewed by Alexey.

  • test for http://bugs.webkit.org/show_bug.cgi?id=13415

  • fast/encoding/utf-32-big-endian-bom-expected.txt: Added.
  • fast/encoding/utf-32-big-endian-bom.html: Added.
  • fast/encoding/utf-32-big-endian-nobom-expected.txt: Added.
  • fast/encoding/utf-32-big-endian-nobom.xml: Added.
  • fast/encoding/utf-32-little-endian-bom-expected.txt: Added.
  • fast/encoding/utf-32-little-endian-bom.html: Added.
  • fast/encoding/utf-32-little-endian-nobom-expected.txt: Added.
  • fast/encoding/utf-32-little-endian-nobom.xml: Added.
Location:
trunk
Files:
8 added
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/ChangeLog

    r24048 r24052  
     1 2007-07-06  Jungshik Shin <jungshik.shin@gmail.com>
     2
     3        Reviewed by Alexey.
     4
     5       - test for http://bugs.webkit.org/show_bug.cgi?id=13415
     6
     7        * fast/encoding/utf-32-big-endian-bom-expected.txt: Added.
     8        * fast/encoding/utf-32-big-endian-bom.html: Added.
     9        * fast/encoding/utf-32-big-endian-nobom-expected.txt: Added.
     10        * fast/encoding/utf-32-big-endian-nobom.xml: Added.
     11        * fast/encoding/utf-32-little-endian-bom-expected.txt: Added.
     12        * fast/encoding/utf-32-little-endian-bom.html: Added.
     13        * fast/encoding/utf-32-little-endian-nobom-expected.txt: Added.
     14        * fast/encoding/utf-32-little-endian-nobom.xml: Added.
     15
    1 16 2007-07-06  Rob Buis  <buis@kde.org>
    2 17
  • trunk/WebCore/ChangeLog

    r24051 r24052  
     1 2007-07-06  Jungshik Shin  <jungshik.shin@gmail.com>
     2
     3        Reviewed by Alexey.
     4
     5        - Add UTF-32 encoding support
     6          http://bugs.webkit.org/show_bug.cgi?id=13415
     7
     8        Test:
     9         - fast/encoding/utf-32-big-endian-bom.html
     10         - fast/encoding/utf-32-big-endian-nobom.xml
     11         - fast/encoding/utf-32-little-endian-bom.html
     12         - fast/encoding/utf-32-little-endian-nobom.xml
     13
     14        * loader/TextResourceDecoder.cpp:
     15        (WebCore::TextResourceDecoder::checkForBOM):
     16        (WebCore::TextResourceDecoder::checkForHeadCharset):
     17        * platform/TextDecoder.cpp:
     18        (WebCore::TextDecoder::checkForBOM):
     19        * platform/TextDecoder.h:
     20        * platform/TextEncoding.cpp:
     21        (WebCore::UTF32BigEndianEncoding):
     22        (WebCore::UTF32LittleEndianEncoding):
     23        * platform/TextEncoding.h:
     24
    1 25 2007-07-06  Holger Hans Peter Freyther  <zecke@selfish.org>
    2 26
  • trunk/WebCore/loader/TextResourceDecoder.cpp

    r23906 r24052  
    347347void TextResourceDecoder::checkForBOM(const char* data, size_t len)
    348348{
    349     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
     349    // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    350350
    351351    if (m_source == UserChosenEncoding) {
     
    357357    // Check if we have enough data.
    358358    size_t bufferLength = m_buffer.size();
    359     if (bufferLength + len < 3)
     359    if (bufferLength + len < 4)
    360360        return;
    361361
    362362    m_checkedForBOM = true;
    363363
    364     // Extract the first three bytes.
     364    // Extract the first four bytes.
    365365    // Handle the case where some of bytes are already in the buffer.
    366366    // The last byte is always guaranteed to not be in the buffer.
     
    368368    unsigned char c1 = bufferLength >= 1 ? m_buffer[0] : *udata++;
    369369    unsigned char c2 = bufferLength >= 2 ? m_buffer[1] : *udata++;
    370     ASSERT(bufferLength < 3);
    371     unsigned char c3 = *udata;
     370    unsigned char c3 = bufferLength >= 3 ? m_buffer[2] : *udata++;
     371    ASSERT(bufferLength < 4);
     372    unsigned char c4 = *udata;
    372373
    373374    // Check for the BOM.
    374     if (c1 == 0xFE && c2 == 0xFF)
    375         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
    376     else if (c1 == 0xFF && c2 == 0xFE)
    377         setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
     375    if (c1 == 0xFF && c2 == 0xFE) {
     376        if (c3 !=0 || c4 != 0)
     377            setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
     378        else
     379            setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
     380    }
    378381    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
    379382        setEncoding(UTF8Encoding(), AutoDetectedEncoding);
     383    else if (c1 == 0xFE && c2 == 0xFF)
     384        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
     385    else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
     386        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
    380387}
    381388
     
    520527            } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
    521528                // UTF-16 without BOM
    522                 setEncoding(((ptr - m_buffer.data()) % 2) ? "UTF-16LE" : "UTF-16BE", AutoDetectedEncoding);
     529                setEncoding(((ptr - m_buffer.data()) % 2) ? UTF16LittleEndianEncoding() : UTF16BigEndianEncoding(), AutoDetectedEncoding);
     530                return true;
     531            } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 'x') {
     532                // UTF-32 without BOM
     533                setEncoding(((ptr - m_buffer.data()) % 4) ? UTF32LittleEndianEncoding() : UTF32BigEndianEncoding(), AutoDetectedEncoding);
    523534                return true;
    524535            }
  • trunk/WebCore/platform/TextDecoder.cpp

    r16245 r24052  
    5858    unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    5959    unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
    60     unsigned char c3 = buf2Len ? (--buf2Len, *buf2++) : 0;
     60    unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
     61    unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
    6162
    6263    const TextEncoding* encodingConsideringBOM = &m_encoding;
    63     if (c1 == 0xFF && c2 == 0xFE)
    64         encodingConsideringBOM = &UTF16LittleEndianEncoding();
     64    bool foundBOM = true;
     65    if (c1 == 0xFF && c2 == 0xFE) {
     66        if (c3 != 0 || c4 != 0)
     67            encodingConsideringBOM = &UTF16LittleEndianEncoding();
     68        else if (numBufferedBytes + length > sizeof(m_bufferedBytes))
     69            encodingConsideringBOM = &UTF32LittleEndianEncoding();
     70        else
     71            foundBOM = false;
     72    }
     73    else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
     74        encodingConsideringBOM = &UTF8Encoding();
    6575    else if (c1 == 0xFE && c2 == 0xFF)
    6676        encodingConsideringBOM = &UTF16BigEndianEncoding();
    67     else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF)
    68         encodingConsideringBOM = &UTF8Encoding();
    69     else if (numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
     77    else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF)
     78        encodingConsideringBOM = &UTF32BigEndianEncoding();
     79    else
     80        foundBOM = false;
     81    if (!foundBOM && numBufferedBytes + length <= sizeof(m_bufferedBytes) && !flush) {
    7082        // Continue to look for the BOM.
    7183        memcpy(&m_bufferedBytes[numBufferedBytes], data, length);
  • trunk/WebCore/platform/TextDecoder.h

    r17431 r24052  
    5757        bool m_checkedForBOM;
    5858        unsigned char m_numBufferedBytes;
    59         unsigned char m_bufferedBytes[2];
     59        unsigned char m_bufferedBytes[3];
    6060    };
    6161
  • trunk/WebCore/platform/TextEncoding.cpp

    r21227 r24052  
    186186}
    187187
     188const TextEncoding& UTF32BigEndianEncoding()
     189{
     190    static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
     191    return globalUTF32BigEndianEncoding;
     192}
     193
     194const TextEncoding& UTF32LittleEndianEncoding()
     195{
     196    static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
     197    return globalUTF32LittleEndianEncoding;
     198}
     199
     200
    188201const TextEncoding& UTF8Encoding()
    189202{
  • trunk/WebCore/platform/TextEncoding.h

    r18712 r24052  
    6161    const TextEncoding& UTF16BigEndianEncoding();
    6262    const TextEncoding& UTF16LittleEndianEncoding();
     63    const TextEncoding& UTF32BigEndianEncoding();
     64    const TextEncoding& UTF32LittleEndianEncoding();
    6365    const TextEncoding& UTF8Encoding();
    6466    const TextEncoding& WindowsLatin1Encoding();
Note: See TracChangeset for help on using the changeset viewer.