Changeset 91025 in webkit


Ignore:
Timestamp:
Jul 14, 2011 1:08:08 PM (13 years ago)
Author:
jpfau@apple.com
Message:

Character reference parser for new XML parser
https://bugs.webkit.org/show_bug.cgi?id=64398

Reviewed by Adam Barth.

Refactored out the HTML entity parser and added a common template for parsing character references.

  • WebCore.xcodeproj/project.pbxproj:
  • html/parser/HTMLEntityParser.cpp:

(WebCore::consumeHTMLEntity):

  • xml/parser/CharacterReferenceParserInlineMethods.h: Copied from Source/WebCore/html/parser/HTMLEntityParser.cpp.

(WebCore::isHexDigit):
(WebCore::unconsumeCharacters):
(WebCore::consumeCharacterReference):

  • xml/parser/XMLCharacterReferenceParser.cpp: Added.

(WebCore::consumeXMLCharacterReference):

  • xml/parser/XMLCharacterReferenceParser.h: Added.
Location:
trunk/Source/WebCore
Files:
3 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r91024 r91025  
     1 2011-07-14  Jeffrey Pfau  <jpfau@apple.com>
     2
     3        Character reference parser for new XML parser
     4        https://bugs.webkit.org/show_bug.cgi?id=64398
     5
     6        Reviewed by Adam Barth.
     7
     8        Refactored out the HTML entity parser and added a common template for parsing character references.
     9
     10        * WebCore.xcodeproj/project.pbxproj:
     11        * html/parser/HTMLEntityParser.cpp:
     12        (WebCore::consumeHTMLEntity):
     13        * xml/parser/CharacterReferenceParserInlineMethods.h: Copied from Source/WebCore/html/parser/HTMLEntityParser.cpp.
     14        (WebCore::isHexDigit):
     15        (WebCore::unconsumeCharacters):
     16        (WebCore::consumeCharacterReference):
     17        * xml/parser/XMLCharacterReferenceParser.cpp: Added.
     18        (WebCore::consumeXMLCharacterReference):
     19        * xml/parser/XMLCharacterReferenceParser.h: Added.
     20
    1 21 2011-07-14  Ilya Tikhonovsky  <loislo@chromium.org>
    222
  • trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj

    r91008 r91025  
    5656                00CA93B213C6691600F7FE95 /* NewXMLDocumentParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */; };
    5757                00CA93B513C6697C00F7FE95 /* NewXMLDocumentParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */; };
     58                00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */; };
     59                00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */; };
     60                00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */ = {isa = PBXBuildFile; fileRef = 00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */; };
    5861                052BFCE9128ABF1500FD338D /* GeolocationClientMock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */; };
    5962                052BFCEB128ABF2100FD338D /* GeolocationClientMock.h in Headers */ = {isa = PBXBuildFile; fileRef = 052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */; settings = {ATTRIBUTES = (Private, ); }; };
     
    64126415                00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NewXMLDocumentParser.h; sourceTree = "<group>"; };
    64136416                00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NewXMLDocumentParser.cpp; sourceTree = "<group>"; };
     6417                00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = XMLCharacterReferenceParser.cpp; sourceTree = "<group>"; };
     6418                00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = XMLCharacterReferenceParser.h; sourceTree = "<group>"; };
     6419                00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CharacterReferenceParserInlineMethods.h; sourceTree = "<group>"; };
    64146420                052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = GeolocationClientMock.cpp; path = mock/GeolocationClientMock.cpp; sourceTree = "<group>"; };
    64156421                052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = GeolocationClientMock.h; path = mock/GeolocationClientMock.h; sourceTree = "<group>"; };
     
    1269812704                        isa = PBXGroup;
    1269912705                        children = (
     12706                                00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */,
    1270012707                                00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */,
    1270112708                                00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */,
     12709                                00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */,
     12710                                00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */,
    1270212711                                00B9318113BA867F0035A948 /* XMLDocumentParser.cpp */,
    1270312712                                00B9318213BA867F0035A948 /* XMLDocumentParser.h */,
     
    2027920288                                514185EE0CD65F0400763C99 /* ChangeVersionWrapper.h in Headers */,
    2028020289                                6550B6A0099DF0270090D781 /* CharacterData.h in Headers */,
     20290                                00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */,
    2028120291                                B2C3DA2A0D006C1D00EF6F26 /* CharsetData.h in Headers */,
    2028220292                                F55B3DB21251F12D003EF269 /* CheckboxInputType.h in Headers */,
     
    2304923059                                9BAF3B2412C1A39800014BF1 /* WritingDirection.h in Headers */,
    2305023060                                6565820209D1508D000E61D7 /* XLinkNames.h in Headers */,
     23061                                00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */,
    2305123062                                00B9318813BA8DBA0035A948 /* XMLDocumentParser.h in Headers */,
    2305223063                                00B9318C13BA8DCC0035A948 /* XMLDocumentParserScope.h in Headers */,
     
    2581825829                                93309E23099E64920056E581 /* WrapContentsInDummySpanCommand.cpp in Sources */,
    2581925830                                A833C7CC0A2CF07400D57664 /* XLinkNames.cpp in Sources */,
     25831                                00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */,
    2582025832                                00B9318713BA8DB30035A948 /* XMLDocumentParser.cpp in Sources */,
    2582125833                                00B9318913BA8DBC0035A948 /* XMLDocumentParserLibxml2.cpp in Sources */,
  • trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp

    r66359 r91025  
    2929#include "HTMLEntityParser.h"
    3030
     31#include "CharacterReferenceParserInlineMethods.h"
    3132#include "HTMLEntitySearch.h"
    3233#include "HTMLEntityTable.h"
     
    4647};
    4748
    48 inline UChar adjustEntity(UChar32 value)
    49 {
    50     if ((value & ~0x1F) != 0x0080)
    51         return value;
    52     return windowsLatin1ExtensionArray[value - 0x80];
    53 }
    54 
    55 inline UChar32 legalEntityFor(UChar32 value)
    56 {
    57     // FIXME: A number of specific entity values generate parse errors.
    58     if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
    59         return 0xFFFD;
    60     if (U_IS_BMP(value))
    61         return adjustEntity(value);
    62     return value;
    63 }
    64 
    65 inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
    66 {
    67     if (U_IS_BMP(value)) {
    68         UChar character = static_cast<UChar>(value);
    69         ASSERT(character == value);
    70         decodedEntity.append(character);
    71         return true;
    72     }
    73     decodedEntity.append(U16_LEAD(value));
    74     decodedEntity.append(U16_TRAIL(value));
    75     return true;
    76 }
    77 
    78 inline bool isHexDigit(UChar cc)
    79 {
    80     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
    81 }
    82 
    8349inline bool isAlphaNumeric(UChar cc)
    8450{
     
    8652}
    8753
    88 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
    89 {
    90     if (consumedCharacters.size() == 1)
    91         source.push(consumedCharacters[0]);
    92     else if (consumedCharacters.size() == 2) {
    93         source.push(consumedCharacters[0]);
    94         source.push(consumedCharacters[1]);
    95     } else
    96         source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
    97 }
     54class HTMLEntityParser {
     55public:
     56    inline static UChar adjustEntity(UChar32 value)
     57    {
     58        if ((value & ~0x1F) != 0x0080)
     59            return value;
     60        return windowsLatin1ExtensionArray[value - 0x80];
     61    }
     62
     63    inline static UChar32 legalEntityFor(UChar32 value)
     64    {
     65        // FIXME: A number of specific entity values generate parse errors.
     66        if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
     67            return 0xFFFD;
     68        if (U_IS_BMP(value))
     69            return adjustEntity(value);
     70        return value;
     71    }
     72
     73    inline static bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
     74    {
     75        if (U_IS_BMP(value)) {
     76            UChar character = static_cast<UChar>(value);
     77            ASSERT(character == value);
     78            decodedEntity.append(character);
     79            return true;
     80        }
     81        decodedEntity.append(U16_LEAD(value));
     82        decodedEntity.append(U16_TRAIL(value));
     83        return true;
     84    }
     85
     86    inline static bool acceptMalformed() { return true; }
     87
     88    inline static bool consumeNamedEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
     89    {
     90        Vector<UChar, 10> consumedCharacters;
     91        HTMLEntitySearch entitySearch;
     92        while (!source.isEmpty()) {
     93            cc = *source;
     94            entitySearch.advance(cc);
     95            if (!entitySearch.isEntityPrefix())
     96                break;
     97            consumedCharacters.append(cc);
     98            source.advanceAndASSERT(cc);
     99        }
     100        notEnoughCharacters = source.isEmpty();
     101        if (notEnoughCharacters) {
     102            // We can't consume an entity because there might be a longer entity
     103            // that we could match if we had more data.
     104            unconsumeCharacters(source, consumedCharacters);
     105            return false;
     106        }
     107        if (!entitySearch.mostRecentMatch()) {
     108            ASSERT(!entitySearch.currentValue());
     109            unconsumeCharacters(source, consumedCharacters);
     110            return false;
     111        }
     112        if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
     113            // We've consumed too many characters. We need to walk the
     114            // source back to the point at which we had consumed an
     115            // actual entity.
     116            unconsumeCharacters(source, consumedCharacters);
     117            consumedCharacters.clear();
     118            const int length = entitySearch.mostRecentMatch()->length;
     119            const UChar* reference = entitySearch.mostRecentMatch()->entity;
     120            for (int i = 0; i < length; ++i) {
     121                cc = *source;
     122                ASSERT_UNUSED(reference, cc == *reference++);
     123                consumedCharacters.append(cc);
     124                source.advanceAndASSERT(cc);
     125                ASSERT(!source.isEmpty());
     126            }
     127            cc = *source;
     128        }
     129        if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
     130            || !additionalAllowedCharacter
     131            || !(isAlphaNumeric(cc) || cc == '=')) {
     132            return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
     133        }
     134        unconsumeCharacters(source, consumedCharacters);
     135        return false;
     136    }
     137};
    98138
    99139}
    100140
     141
    101142bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
    102143{
    103     ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
    104     ASSERT(!notEnoughCharacters);
    105     ASSERT(decodedEntity.isEmpty());
    106 
    107     enum EntityState {
    108         Initial,
    109         Number,
    110         MaybeHexLowerCaseX,
    111         MaybeHexUpperCaseX,
    112         Hex,
    113         Decimal,
    114         Named
    115     };
    116     EntityState entityState = Initial;
    117     UChar32 result = 0;
    118     Vector<UChar, 10> consumedCharacters;
    119 
    120     while (!source.isEmpty()) {
    121         UChar cc = *source;
    122         switch (entityState) {
    123         case Initial: {
    124             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
    125                 return false;
    126             if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
    127                 return false;
    128             if (cc == '#') {
    129                 entityState = Number;
    130                 break;
    131             }
    132             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
    133                 entityState = Named;
    134                 continue;
    135             }
    136             return false;
    137         }
    138         case Number: {
    139             if (cc == 'x') {
    140                 entityState = MaybeHexLowerCaseX;
    141                 break;
    142             }
    143             if (cc == 'X') {
    144                 entityState = MaybeHexUpperCaseX;
    145                 break;
    146             }
    147             if (cc >= '0' && cc <= '9') {
    148                 entityState = Decimal;
    149                 continue;
    150             }
    151             source.push('#');
    152             return false;
    153         }
    154         case MaybeHexLowerCaseX: {
    155             if (isHexDigit(cc)) {
    156                 entityState = Hex;
    157                 continue;
    158             }
    159             source.push('#');
    160             source.push('x');
    161             return false;
    162         }
    163         case MaybeHexUpperCaseX: {
    164             if (isHexDigit(cc)) {
    165                 entityState = Hex;
    166                 continue;
    167             }
    168             source.push('#');
    169             source.push('X');
    170             return false;
    171         }
    172         case Hex: {
    173             if (cc >= '0' && cc <= '9')
    174                 result = result * 16 + cc - '0';
    175             else if (cc >= 'a' && cc <= 'f')
    176                 result = result * 16 + 10 + cc - 'a';
    177             else if (cc >= 'A' && cc <= 'F')
    178                 result = result * 16 + 10 + cc - 'A';
    179             else {
    180                 if (cc == ';')
    181                     source.advanceAndASSERT(cc);
    182                 return convertToUTF16(legalEntityFor(result), decodedEntity);
    183             }
    184             break;
    185         }
    186         case Decimal: {
    187             if (cc >= '0' && cc <= '9')
    188                 result = result * 10 + cc - '0';
    189             else {
    190                 if (cc == ';')
    191                     source.advanceAndASSERT(cc);
    192                 return convertToUTF16(legalEntityFor(result), decodedEntity);
    193             }
    194             break;
    195         }
    196         case Named: {
    197             HTMLEntitySearch entitySearch;
    198             while (!source.isEmpty()) {
    199                 cc = *source;
    200                 entitySearch.advance(cc);
    201                 if (!entitySearch.isEntityPrefix())
    202                     break;
    203                 consumedCharacters.append(cc);
    204                 source.advanceAndASSERT(cc);
    205             }
    206             notEnoughCharacters = source.isEmpty();
    207             if (notEnoughCharacters) {
    208                 // We can't consume an entity because there might be a longer entity
    209                 // that we could match if we had more data.
    210                 unconsumeCharacters(source, consumedCharacters);
    211                 return false;
    212             }
    213             if (!entitySearch.mostRecentMatch()) {
    214                 ASSERT(!entitySearch.currentValue());
    215                 unconsumeCharacters(source, consumedCharacters);
    216                 return false;
    217             }
    218             if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
    219                 // We've consumed too many characters.  We need to walk the
    220                 // source back to the point at which we had consumed an
    221                 // actual entity.
    222                 unconsumeCharacters(source, consumedCharacters);
    223                 consumedCharacters.clear();
    224                 const int length = entitySearch.mostRecentMatch()->length;
    225                 const UChar* reference = entitySearch.mostRecentMatch()->entity;
    226                 for (int i = 0; i < length; ++i) {
    227                     cc = *source;
    228                     ASSERT_UNUSED(reference, cc == *reference++);
    229                     consumedCharacters.append(cc);
    230                     source.advanceAndASSERT(cc);
    231                     ASSERT(!source.isEmpty());
    232                 }
    233                 cc = *source;
    234             }
    235             if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
    236                 || !additionalAllowedCharacter
    237                 || !(isAlphaNumeric(cc) || cc == '=')) {
    238                 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
    239             }
    240             unconsumeCharacters(source, consumedCharacters);
    241             return false;
    242         }
    243         }
    244         consumedCharacters.append(cc);
    245         source.advanceAndASSERT(cc);
    246     }
    247     ASSERT(source.isEmpty());
    248     notEnoughCharacters = true;
    249     unconsumeCharacters(source, consumedCharacters);
    250     return false;
     144    return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
    251145}
    252146
Note: See TracChangeset for help on using the changeset viewer.