Changeset 60790 in webkit


Ignore:
Timestamp:
Jun 7, 2010 11:34:07 AM (14 years ago)
Author:
abarth@webkit.org
Message:

2010-06-07 Adam Barth <abarth@webkit.org>

Reviewed by Eric Seidel.

HTML5 parser should normalize line endings
https://bugs.webkit.org/show_bug.cgi?id=40199

When reading characters from the source, we need to normalize line
endings, which means treating \r as \n and folding \r\n sequences into
a single \n. Doing this requires a slightly tighter pattern for
advancing the source (to make sure we update our state machine
correctly).

Fixes fast/css/first-child-pseudo-class.html and hundreds of other
LayoutTests in the HTML5 parser.

  • html/HTML5Lexer.cpp: (WebCore::HTML5Lexer::nextToken):
  • html/HTML5Lexer.h: (WebCore::HTML5Lexer::InputStreamPreprocessor::LineEndingNormalizer):
    • Introduced InputStreamPreprocessor to keep track of our line endings state machine. This class will grow to eventually do all the input stream preprocessing required by the HTML5 spec.

(WebCore::HTML5Lexer::InputStreamPreprocessor::nextInputCharacter):

  • We need to be careful not to read *source more than once because we need to know whether we've transformed the character. Keeping the next input character in this state machine requires some careful coding, but seems advantageous.

(WebCore::HTML5Lexer::InputStreamPreprocessor::peek):
(WebCore::HTML5Lexer::InputStreamPreprocessor::advance):

Location:
trunk/WebCore
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/WebCore/ChangeLog

    r60789 r60790  
     1 2010-06-07  Adam Barth  <abarth@webkit.org>
     2
     3        Reviewed by Eric Seidel.
     4
     5        HTML5 parser should normalize line endings
     6        https://bugs.webkit.org/show_bug.cgi?id=40199
     7
     8        When reading characters from the source, we need to normalize line
     9        endings, which means treating \r as \n and folding \r\n sequences into
     10        a single \n.  Doing this requires a slightly tighter pattern for
     11        advancing the source (to make sure we update our state machine
     12        correctly).
     13
     14        Fixes fast/css/first-child-pseudo-class.html and hundreds of other
     15        LayoutTests in the HTML5 parser.
     16
     17        * html/HTML5Lexer.cpp:
     18        (WebCore::HTML5Lexer::nextToken):
     19        * html/HTML5Lexer.h:
     20        (WebCore::HTML5Lexer::InputStreamPreprocessor::LineEndingNormalizer):
     21            - Introduced InputStreamPreprocessor to keep track of our line
     22              endings state machine.  This class will grow to eventually do
     23              all the input stream preprocessing required by the HTML5 spec.
     24        (WebCore::HTML5Lexer::InputStreamPreprocessor::nextInputCharacter):
     25            - We need to be careful not to read *source more than
     26              once because we need to know whether we've transformed the
     27              character.  Keeping the next input character in this state
     28              machine requires some careful coding, but seems advantageous.
     29        (WebCore::HTML5Lexer::InputStreamPreprocessor::peek):
     30        (WebCore::HTML5Lexer::InputStreamPreprocessor::advance):
     31
    1 32 2010-06-07  Andrei Popescu  <andreip@google.com>
    233
  • trunk/WebCore/html/HTML5Lexer.cpp

    r60739 r60790  
    327327#define END_STATE() ASSERT_NOT_REACHED(); break;
    328328
     329// We use this macro when the HTML5 spec says "reconsume the current input
     330// character in the <mumble> state."
    329331#define RECONSUME_IN(stateName)                                            \
    330332    do {                                                                   \
    331333        m_state = stateName;                                               \
    332         cc = *source;                                                      \
    333334        goto stateName;                                                    \
    334335    } while (false)
    335336
     337// We use this macro when the HTML5 spec says "consume the next input
     338// character ... and switch to the <mumble> state."
    336339#define ADVANCE_TO(stateName)                                              \
    337340    do {                                                                   \
    338341        m_state = stateName;                                               \
    339         source.advance(m_lineNumber);                                      \
    340         if (source.isEmpty())                                              \
     342        if (!m_inputStreamPreprocessor.advance(source, m_lineNumber))      \
    341343            return shouldEmitBufferedCharacterToken(source);               \
    342         cc = *source;                                                      \
     344        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
    343345        goto stateName;                                                    \
    344346    } while (false)
    345347
     348// Sometimes there's more complicated logic in the spec that separates when
     349// we consume the next input character and when we switch to a particular
     350// state.  We handle those cases by advancing the source directly and using
     351// this macro to switch to the indicated state.
     352#define SWITCH_TO(stateName)                                               \
     353    do {                                                                   \
     354        m_state = stateName;                                               \
     355        if (!m_inputStreamPreprocessor.peek(source, m_lineNumber))         \
     356            return shouldEmitBufferedCharacterToken(source);               \
     357        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
     358        goto stateName;                                                    \
     359    } while (false)
     360
     361// We use this macro when the HTML5 spec says "Emit the current <mumble>
     362// token. Switch to the <mumble> state."  We use the word "resume" instead of
     363// switch to indicate that this macro actually returns and that we'll end up
     364// in the state when we "resume" (i.e., are called again).
    346365#define EMIT_AND_RESUME_IN(stateName)                                      \
    347366    do {                                                                   \
     
    367386        m_state = stateName;                                               \
    368387        _FLUSH_BUFFERED_END_TAG();                                         \
    369         if (source.isEmpty())                                              \
     388        if (source.isEmpty()                                               \
     389            || !m_inputStreamPreprocessor.peek(source, m_lineNumber))      \
    370390            return shouldEmitBufferedCharacterToken(source);               \
    371         cc = *source;                                                      \
     391        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
    372392        goto stateName;                                                    \
    373393    } while (false)
     
    398418    }
    399419
     420    if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
     421        return shouldEmitBufferedCharacterToken(source);
     422    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
     423
    400424    // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
    401     if (m_skipLeadingNewLineForListing && m_state == DataState && !source.isEmpty() && *source == '\x0A')
    402         source.advanceAndASSERT('\x0A');
    403     m_skipLeadingNewLineForListing = false;
    404 
    405     if (source.isEmpty())
    406         return shouldEmitBufferedCharacterToken(source);
     425    // Note that this logic is different than the generic \r\n collapsing
     426    // handled in the input stream preprocessor.  This logic is here as an
     427    // "authoring convenience" so folks can write:
     428    //
     429    // <pre>
     430    // lorem ipsum
     431    // lorem ipsum
     432    // </pre>
     433    //
     434    // without getting an extra newline at the start of their <pre> element.
     435    if (m_skipLeadingNewLineForListing) {
     436        m_skipLeadingNewLineForListing = false;
     437        if (m_state == DataState && cc == '\n')
     438            ADVANCE_TO(DataState);
     439    }
    407440
    408441    // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
    409     UChar cc = *source;
    410442    switch (m_state) {
    411443    BEGIN_STATE(DataState) {
     
    429461        if (!processEntity(source))
    430462            return shouldEmitBufferedCharacterToken(source);
    431         RECONSUME_IN(DataState);
     463        SWITCH_TO(DataState);
    432464    }
    433465    END_STATE()
     
    448480        if (!processEntity(source))
    449481            return shouldEmitBufferedCharacterToken(source);
    450         RECONSUME_IN(RCDATAState);
     482        SWITCH_TO(RCDATAState);
    451483    }
    452484    END_STATE()
     
    11011133        // state can be determined by m_additionalAllowedCharacter.
    11021134        if (m_additionalAllowedCharacter == '"')
    1103             RECONSUME_IN(AttributeValueDoubleQuotedState);
     1135            SWITCH_TO(AttributeValueDoubleQuotedState);
    11041136        else if (m_additionalAllowedCharacter == '\'')
    1105             RECONSUME_IN(AttributeValueSingleQuotedState);
     1137            SWITCH_TO(AttributeValueSingleQuotedState);
    11061138        else if (m_additionalAllowedCharacter == '>')
    1107             RECONSUME_IN(AttributeValueUnquotedState);
     1139            SWITCH_TO(AttributeValueUnquotedState);
    11081140        else
    11091141            ASSERT_NOT_REACHED();
     
    11431175        m_token->beginComment();
    11441176        while (!source.isEmpty()) {
    1145             cc = *source;
     1177            cc = m_inputStreamPreprocessor.nextInputCharacter();
    11461178            if (cc == '>')
    11471179                EMIT_AND_RESUME_IN(DataState);
    11481180            m_token->appendToComment(cc);
    1149             source.advance(m_lineNumber);
     1181            m_inputStreamPreprocessor.advance(source, m_lineNumber);
     1182            // We ignore the return value (which indicates that |source| is
     1183            // empty) because it's checked by the loop condition above.
    11501184        }
    11511185        m_state = DataState;
     
    11641198                source.advanceAndASSERT('-');
    11651199                m_token->beginComment();
    1166                 RECONSUME_IN(CommentStartState);
     1200                SWITCH_TO(CommentStartState);
    11671201            } else if (result == SegmentedString::NotEnoughCharacters)
    11681202                return shouldEmitBufferedCharacterToken(source);
     
    11711205            if (result == SegmentedString::DidMatch) {
    11721206                advanceStringAndASSERTIgnoringCase(source, "doctype");
    1173                 RECONSUME_IN(DOCTYPEState);
     1207                SWITCH_TO(DOCTYPEState);
    11741208            } else if (result == SegmentedString::NotEnoughCharacters)
    11751209                return shouldEmitBufferedCharacterToken(source);
     
    13561390                if (result == SegmentedString::DidMatch) {
    13571391                    advanceStringAndASSERTIgnoringCase(source, "public");
    1358                     RECONSUME_IN(AfterDOCTYPEPublicKeywordState);
     1392                    SWITCH_TO(AfterDOCTYPEPublicKeywordState);
    13591393                } else if (result == SegmentedString::NotEnoughCharacters)
    13601394                    return shouldEmitBufferedCharacterToken(source);
     
    13631397                if (result == SegmentedString::DidMatch) {
    13641398                    advanceStringAndASSERTIgnoringCase(source, "system");
    1365                     RECONSUME_IN(AfterDOCTYPESystemKeywordState);
     1399                    SWITCH_TO(AfterDOCTYPESystemKeywordState);
    13661400                } else if (result == SegmentedString::NotEnoughCharacters)
    13671401                    return shouldEmitBufferedCharacterToken(source);
  • trunk/WebCore/html/HTML5Lexer.h

    r60739 r60790  
    131131
    132132    private:
     133        // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     134        class InputStreamPreprocessor : public Noncopyable {
     135        public:
     136            InputStreamPreprocessor()
     137                : m_nextInputCharacter('\0')
     138                , m_skipNextNewLine(false)
     139            {
     140            }
     141
     142            UChar nextInputCharacter() const { return m_nextInputCharacter; }
     143
     144            // Returns whether we succeeded in peeking at the next character.
     145            // The only way we can fail to peek is if there are no more
     146            // characters in |source| (after collapsing \r\n, etc).
     147            bool peek(SegmentedString& source, int& lineNumber)
     148            {
     149                m_nextInputCharacter = *source;
     150                if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
     151                    m_skipNextNewLine = false;
     152                    source.advancePastNewline(lineNumber);
     153                    if (source.isEmpty())
     154                        return false;
     155                    m_nextInputCharacter = *source;
     156                }
     157                if (m_nextInputCharacter == '\r') {
     158                    m_nextInputCharacter = '\n';
     159                    m_skipNextNewLine = true;
     160                } else
     161                    m_skipNextNewLine = false;
     162                return true;
     163            }
     164
     165            // Returns whether there are more characters in |source| after advancing.
     166            bool advance(SegmentedString& source, int& lineNumber)
     167            {
     168                source.advance(lineNumber);
     169                if (source.isEmpty())
     170                    return false;
     171                return peek(source, lineNumber);
     172            }
     173
     174        private:
     175            // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
     176            UChar m_nextInputCharacter;
     177            bool m_skipNextNewLine;
     178        };
     179
    133180        inline void emitCharacter(UChar);
    134181        inline void emitParseError();
     
    170217        // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
    171218        UChar m_additionalAllowedCharacter;
     219
     220        // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
     221        InputStreamPreprocessor m_inputStreamPreprocessor;
    172222    };
    173223
Note: See TracChangeset for help on using the changeset viewer.