Context Navigation

← Previous Changeset
Next Changeset →

Changeset 60790 in webkit

Timestamp:

Jun 7, 2010 11:34:07 AM (14 years ago)

Author:

abarth@webkit.org

Message:

2010-06-07 Adam Barth <abarth@webkit.org>

Reviewed by Eric Seidel.

HTML5 parser should normalize line endings
https://bugs.webkit.org/show_bug.cgi?id=40199

When reading characters from the source, we need to normalize line
endings, which means treating \r as \n and folding \r\n sequences into
a single \n. Doing this requires a slightly tighter pattern for
advancing the source (to make sure we update our state machine
correctly).

Fixes fast/css/first-child-pseudo-class.html and hundreds of other
LayoutTests in the HTML5 parser.

html/HTML5Lexer.cpp: (WebCore::HTML5Lexer::nextToken):
html/HTML5Lexer.h: (WebCore::HTML5Lexer::InputStreamPreprocessor::LineEndingNormalizer):
- Introduced InputStreamPreprocessor to keep track of our line endings state machine. This class will grow to eventually do all the input stream preprocessing required by the HTML5 spec.

(WebCore::HTML5Lexer::InputStreamPreprocessor::nextInputCharacter):

We need to be careful not to read *source more than once because we need to know whether we've transformed the character. Keeping the next input character in this state machine requires some careful coding, but seems advantageous.

(WebCore::HTML5Lexer::InputStreamPreprocessor::peek):
(WebCore::HTML5Lexer::InputStreamPreprocessor::advance):

Location:

trunk/WebCore

Files:

: 3 edited

ChangeLog (modified) (1 diff)
html/HTML5Lexer.cpp (modified) (11 diffs)
html/HTML5Lexer.h (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/WebCore/ChangeLog

-                      r60789
+                      r60790
+2010-06-07  Adam Barth  <abarth@webkit.org>
+        Reviewed by Eric Seidel.
+        HTML5 parser should normalize line endings
+        https://bugs.webkit.org/show_bug.cgi?id=40199
+        When reading characters from the source, we need to normalize line
+        endings, which means treating \r as \n and folding \r\n sequences into
+        a single \n.  Doing this requires a slightly tighter pattern for
+        advancing the source (to make sure we update our state machine
+        correctly).
+        Fixes fast/css/first-child-pseudo-class.html and hundreds of other
+        LayoutTests in the HTML5 parser.
+        * html/HTML5Lexer.cpp:
+        (WebCore::HTML5Lexer::nextToken):
+        * html/HTML5Lexer.h:
+        (WebCore::HTML5Lexer::InputStreamPreprocessor::LineEndingNormalizer):
+            - Introduced InputStreamPreprocessor to keep track of our line
+              endings state machine.  This class will grow to eventually do
+              all the input stream preprocessing required by the HTML5 spec.
+        (WebCore::HTML5Lexer::InputStreamPreprocessor::nextInputCharacter):
+            - We need to be careful not to read *source more than
+              once because we need to know whether we've transformed the
+              character.  Keeping the next input character in this state
+              machine requires some careful coding, but seems advantageous.
+        (WebCore::HTML5Lexer::InputStreamPreprocessor::peek):
+        (WebCore::HTML5Lexer::InputStreamPreprocessor::advance):
 2010-06-07  Andrei Popescu  <andreip@google.com>

trunk/WebCore/html/HTML5Lexer.cpp

-                      r60739
+                      r60790
 #define END_STATE() ASSERT_NOT_REACHED(); break;
+// We use this macro when the HTML5 spec says "reconsume the current input
+// character in the <mumble> state."
 #define RECONSUME_IN(stateName)                                            \
     do {                                                                   \
         m_state = stateName;                                               \
-        cc = *source;                                                      \
         goto stateName;                                                    \
     } while (false)
+// We use this macro when the HTML5 spec says "consume the next input
+// character ... and switch to the <mumble> state."
 #define ADVANCE_TO(stateName)                                              \
     do {                                                                   \
         m_state = stateName;                                               \
-        source.advance(m_lineNumber);                                      \
-        if (source.isEmpty())                                              \
+        if (!m_inputStreamPreprocessor.advance(source, m_lineNumber))      \
             return shouldEmitBufferedCharacterToken(source);               \
-        cc = *source;                                                      \
+        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
         goto stateName;                                                    \
     } while (false)
+// Sometimes there's more complicated logic in the spec that separates when
+// we consume the next input character and when we switch to a particular
+// state.  We handle those cases by advancing the source directly and using
+// this macro to switch to the indicated state.
+#define SWITCH_TO(stateName)                                               \
+    do {                                                                   \
+        m_state = stateName;                                               \
+        if (!m_inputStreamPreprocessor.peek(source, m_lineNumber))         \
+            return shouldEmitBufferedCharacterToken(source);               \
+        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
+        goto stateName;                                                    \
+    } while (false)
+// We use this macro when the HTML5 spec says "Emit the current <mumble>
+// token. Switch to the <mumble> state."  We use the word "resume" instead of
+// switch to indicate that this macro actually returns and that we'll end up
+// in the state when we "resume" (i.e., are called again).
 #define EMIT_AND_RESUME_IN(stateName)                                      \
     do {                                                                   \
 …
         m_state = stateName;                                               \
         _FLUSH_BUFFERED_END_TAG();                                         \
-        if (source.isEmpty())                                              \
+        if (source.isEmpty()                                               \
+            || !m_inputStreamPreprocessor.peek(source, m_lineNumber))      \
             return shouldEmitBufferedCharacterToken(source);               \
-        cc = *source;                                                      \
+        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
         goto stateName;                                                    \
     } while (false)
 …
+    }
+    if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
+        return shouldEmitBufferedCharacterToken(source);
+    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
-    if (m_skipLeadingNewLineForListing && m_state == DataState && !source.isEmpty() && *source == '\x0A')
-        source.advanceAndASSERT('\x0A');
-    m_skipLeadingNewLineForListing = false;
-    if (source.isEmpty())
-        return shouldEmitBufferedCharacterToken(source);
+    // Note that this logic is different than the generic \r\n collapsing
+    // handled in the input stream preprocessor.  This logic is here as an
+    // "authoring convenience" so folks can write:
+    //
+    // <pre>
+    // lorem ipsum
+    // lorem ipsum
+    // </pre>
+    //
+    // without getting an extra newline at the start of their <pre> element.
+    if (m_skipLeadingNewLineForListing) {
+        m_skipLeadingNewLineForListing = false;
+        if (m_state == DataState && cc == '\n')
+            ADVANCE_TO(DataState);
+    }
     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
-    UChar cc = *source;
     switch (m_state) {
     BEGIN_STATE(DataState) {
 …
         if (!processEntity(source))
             return shouldEmitBufferedCharacterToken(source);
-        RECONSUME_IN(DataState);
+        SWITCH_TO(DataState);
+    }
     END_STATE()
 …
         if (!processEntity(source))
             return shouldEmitBufferedCharacterToken(source);
-        RECONSUME_IN(RCDATAState);
+        SWITCH_TO(RCDATAState);
+    }
     END_STATE()
 …
         // state can be determined by m_additionalAllowedCharacter.
         if (m_additionalAllowedCharacter == '"')
-            RECONSUME_IN(AttributeValueDoubleQuotedState);
+            SWITCH_TO(AttributeValueDoubleQuotedState);
         else if (m_additionalAllowedCharacter == '\'')
-            RECONSUME_IN(AttributeValueSingleQuotedState);
+            SWITCH_TO(AttributeValueSingleQuotedState);
         else if (m_additionalAllowedCharacter == '>')
-            RECONSUME_IN(AttributeValueUnquotedState);
+            SWITCH_TO(AttributeValueUnquotedState);
         else
             ASSERT_NOT_REACHED();
 …
         m_token->beginComment();
         while (!source.isEmpty()) {
-            cc = *source;
+            cc = m_inputStreamPreprocessor.nextInputCharacter();
             if (cc == '>')
                 EMIT_AND_RESUME_IN(DataState);
             m_token->appendToComment(cc);
-            source.advance(m_lineNumber);
+            m_inputStreamPreprocessor.advance(source, m_lineNumber);
+            // We ignore the return value (which indicates that |source| is
+            // empty) because it's checked by the loop condition above.
+        }
         m_state = DataState;
 …
                 source.advanceAndASSERT('-');
                 m_token->beginComment();
-                RECONSUME_IN(CommentStartState);
+                SWITCH_TO(CommentStartState);
             } else if (result == SegmentedString::NotEnoughCharacters)
                 return shouldEmitBufferedCharacterToken(source);
 …
             if (result == SegmentedString::DidMatch) {
                 advanceStringAndASSERTIgnoringCase(source, "doctype");
-                RECONSUME_IN(DOCTYPEState);
+                SWITCH_TO(DOCTYPEState);
             } else if (result == SegmentedString::NotEnoughCharacters)
                 return shouldEmitBufferedCharacterToken(source);
 …
                 if (result == SegmentedString::DidMatch) {
                     advanceStringAndASSERTIgnoringCase(source, "public");
-                    RECONSUME_IN(AfterDOCTYPEPublicKeywordState);
+                    SWITCH_TO(AfterDOCTYPEPublicKeywordState);
                 } else if (result == SegmentedString::NotEnoughCharacters)
                     return shouldEmitBufferedCharacterToken(source);
 …
                 if (result == SegmentedString::DidMatch) {
                     advanceStringAndASSERTIgnoringCase(source, "system");
-                    RECONSUME_IN(AfterDOCTYPESystemKeywordState);
+                    SWITCH_TO(AfterDOCTYPESystemKeywordState);
                 } else if (result == SegmentedString::NotEnoughCharacters)
                     return shouldEmitBufferedCharacterToken(source);

trunk/WebCore/html/HTML5Lexer.h

-                      r60739
+                      r60790
     private:
+        // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
+        class InputStreamPreprocessor : public Noncopyable { // Normalizes \r and \r\n to a single \n while reading the source.
+        public:
+            InputStreamPreprocessor()
+                : m_nextInputCharacter('\0')
+                , m_skipNextNewLine(false)
+            {
+            }
+            UChar nextInputCharacter() const { return m_nextInputCharacter; } // Value cached by the most recent peek()/advance().
+            // Caches the normalized character at the front of |source| and returns true; returns
+            // false only if |source| runs out after dropping the \n half of a \r\n pair. Note that
+            // *source is read first without an emptiness check. NOTE(review): presumably callers must ensure |source| is non-empty - confirm.
+            bool peek(SegmentedString& source, int& lineNumber)
+            {
+                m_nextInputCharacter = *source;
+                if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { // This \n follows a \r that was already reported as \n: drop it.
+                    m_skipNextNewLine = false;
+                    source.advancePastNewline(lineNumber);
+                    if (source.isEmpty())
+                        return false;
+                    m_nextInputCharacter = *source;
+                }
+                if (m_nextInputCharacter == '\r') { // Report \r as \n and remember to skip a \n that directly follows it.
+                    m_nextInputCharacter = '\n';
+                    m_skipNextNewLine = true;
+                } else
+                    m_skipNextNewLine = false;
+                return true;
+            }
+            // Advances |source| by one character and then peeks; returns false if no input character remains afterwards.
+            bool advance(SegmentedString& source, int& lineNumber)
+            {
+                source.advance(lineNumber);
+                if (source.isEmpty())
+                    return false;
+                return peek(source, lineNumber);
+            }
+        private:
+            // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
+            UChar m_nextInputCharacter;
+            bool m_skipNextNewLine; // True when the last peeked character was \r, so an immediately following \n is skipped.
+        };
         inline void emitCharacter(UChar);
         inline void emitParseError();
 …
         // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
         UChar m_additionalAllowedCharacter;
+        // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
+        InputStreamPreprocessor m_inputStreamPreprocessor;
     };

Note: See TracChangeset for help on using the changeset viewer.