Changeset 25066 in webkit


Ignore:
Timestamp:
Aug 13, 2007 10:24:05 PM (17 years ago)
Author:
ap
Message:

http://bugs.webkit.org/show_bug.cgi?id=14951
REGRESSION: page interpreted as UTF-8 because of stray <?xml> after <head>

Test: fast/encoding/misplaced-xml-declaration.html

  • loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForHeadCharset): Only honor XML declaration at the very beginning of the file.
Location:
trunk
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/ChangeLog

    r25062 r25066  
     1 2007-08-13  Alexey Proskuryakov  <ap@webkit.org>
     2
     3        Reviewed by Darin.
     4
     5        http://bugs.webkit.org/show_bug.cgi?id=14951
     6        REGRESSION: page interpreted as UTF-8 because of stray <?xml> after <head>
     7
     8        * fast/encoding/misplaced-xml-declaration-expected.txt: Added.
     9        * fast/encoding/misplaced-xml-declaration.html: Added.
     10
    1 11 2007-08-13  Sam Weinig  <sam@webkit.org>
    212
  • trunk/WebCore/ChangeLog

    r25065 r25066  
     1 2007-08-13  Alexey Proskuryakov  <ap@webkit.org>
     2
     3        Reviewed by Darin.
     4
     5        http://bugs.webkit.org/show_bug.cgi?id=14951
     6        REGRESSION: page interpreted as UTF-8 because of stray <?xml> after <head>
     7
     8        Test: fast/encoding/misplaced-xml-declaration.html
     9
     10        * loader/TextResourceDecoder.cpp:
     11        (WebCore::TextResourceDecoder::checkForHeadCharset): Only honor XML declaration
     12        at the very beginning of the file.
     13
    1 14 2007-08-13  Oliver Hunt  <oliver@apple.com>
    215
  • trunk/WebCore/loader/TextResourceDecoder.cpp

    r24052 r25066  
    44    Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
    55    Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc.
    6     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
     6    Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
    77
    88    This library is free software; you can redistribute it and/or
     
    481481
    482482    movedDataToBuffer = true;
    483    
     483
     484    const char* ptr = m_buffer.data();
     485    const char* pEnd = ptr + m_buffer.size();
     486
     487    // Is there enough data available to check for XML declaration?
     488    if (m_buffer.size() < 8)
     489        return false;
     490
     491    // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
     492    // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
     493    if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
     494        const char* xmlDeclarationEnd = ptr;
     495        while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
     496            ++xmlDeclarationEnd;
     497        if (xmlDeclarationEnd == pEnd)
     498            return false;
     499        DeprecatedCString str(ptr, xmlDeclarationEnd - ptr); // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
     500        int len = 0;
     501        int pos = findXMLEncoding(str, len);
     502        if (pos != -1)
     503            setEncoding(TextEncoding(str.mid(pos, len)), EncodingFromXMLHeader);
     504        // continue looking for a charset - it may be specified in an HTTP-Equiv meta
     505    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
     506        setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
     507        return true;
     508    } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
     509        setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
     510        return true;
     511    } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
     512        setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
     513        return true;
     514    } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
     515        setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
     516        return true;
     517    }
     518
    484519    // we still don't have an encoding, and are in the head
    485520    // the following tags are allowed in <head>:
     
    497532    AtomicStringImpl* enclosingTagName = 0;
    498533
    499     const char* ptr = m_buffer.data();
    500     const char* pEnd = ptr + m_buffer.size();
    501     while (ptr + 7 < pEnd) { // +7 guarantees that "<!--" and "<?xml" fit in the buffer - and certainly we aren't going to lose any "charset" that way.
     534    while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way.
    502535        if (*ptr == '<') {
    503536            bool end = false;
     
    509542                skipComment(ptr, pEnd);
    510543                continue;
    511             }
    512 
    513             // Handle XML declaration, which can have encoding in it.
    514             // This encoding is honored even for HTML documents.
    515             if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
    516                 const char* xmlDeclarationEnd = ptr;
    517                 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
    518                     ++xmlDeclarationEnd;
    519                 if (xmlDeclarationEnd == pEnd)
    520                     return false;
    521                 DeprecatedCString str(ptr, xmlDeclarationEnd - ptr); // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
    522                 int len = 0;
    523                 int pos = findXMLEncoding(str, len);
    524                 if (pos != -1)
    525                     setEncoding(TextEncoding(str.mid(pos, len)), EncodingFromXMLHeader);
    526                 // continue looking for a charset - it may be specified in an HTTP-Equiv meta
    527             } else if (ptr[0] == 0 && ptr[1] == '?' && ptr[2] == 0 && ptr[3] == 'x' && ptr[4] == 0 && ptr[5] == 'm' && ptr[6] == 0 && ptr[7] == 'l') {
    528                 // UTF-16 without BOM
    529                 setEncoding(((ptr - m_buffer.data()) % 2) ? UTF16LittleEndianEncoding() : UTF16BigEndianEncoding(), AutoDetectedEncoding);
    530                 return true;
    531             } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 'x') {
    532                 // UTF-32 without BOM
    533                 setEncoding(((ptr - m_buffer.data()) % 4) ? UTF32LittleEndianEncoding() : UTF32BigEndianEncoding(), AutoDetectedEncoding);
    534                 return true;
    535544            }
    536545
Note: See TracChangeset for help on using the changeset viewer.