Changeset 91025 in webkit
- Timestamp: Jul 14, 2011 1:08:08 PM (13 years ago)
- Location: trunk/Source/WebCore
- Files: 3 added, 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r91024 r91025 1 2011-07-14 Jeffrey Pfau <jpfau@apple.com> 2 3 Character reference parser for new XML parser 4 https://bugs.webkit.org/show_bug.cgi?id=64398 5 6 Reviewed by Adam Barth. 7 8 Refactored out the HTML entity parser and added a common template for parsing character references. 9 10 * WebCore.xcodeproj/project.pbxproj: 11 * html/parser/HTMLEntityParser.cpp: 12 (WebCore::consumeHTMLEntity): 13 * xml/parser/CharacterReferenceParserInlineMethods.h: Copied from Source/WebCore/html/parser/HTMLEntityParser.cpp. 14 (WebCore::isHexDigit): 15 (WebCore::unconsumeCharacters): 16 (WebCore::consumeCharacterReference): 17 * xml/parser/XMLCharacterReferenceParser.cpp: Added. 18 (WebCore::consumeXMLCharacterReference): 19 * xml/parser/XMLCharacterReferenceParser.h: Added. 20 1 21 2011-07-14 Ilya Tikhonovsky <loislo@chromium.org> 2 22 -
trunk/Source/WebCore/WebCore.xcodeproj/project.pbxproj
r91008 r91025 56 56 00CA93B213C6691600F7FE95 /* NewXMLDocumentParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */; }; 57 57 00CA93B513C6697C00F7FE95 /* NewXMLDocumentParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */; }; 58 00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */; }; 59 00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */ = {isa = PBXBuildFile; fileRef = 00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */; }; 60 00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */ = {isa = PBXBuildFile; fileRef = 00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */; }; 58 61 052BFCE9128ABF1500FD338D /* GeolocationClientMock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */; }; 59 62 052BFCEB128ABF2100FD338D /* GeolocationClientMock.h in Headers */ = {isa = PBXBuildFile; fileRef = 052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */; settings = {ATTRIBUTES = (Private, ); }; }; … … 6412 6415 00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = NewXMLDocumentParser.h; sourceTree = "<group>"; }; 6413 6416 00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NewXMLDocumentParser.cpp; sourceTree = "<group>"; }; 6417 00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = XMLCharacterReferenceParser.cpp; sourceTree = "<group>"; }; 6418 00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = XMLCharacterReferenceParser.h; sourceTree = "<group>"; }; 6419 00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CharacterReferenceParserInlineMethods.h; sourceTree = "<group>"; }; 6414 6420 052BFCE8128ABF1500FD338D /* GeolocationClientMock.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = GeolocationClientMock.cpp; path = mock/GeolocationClientMock.cpp; sourceTree = "<group>"; }; 6415 6421 052BFCEA128ABF2100FD338D /* GeolocationClientMock.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = GeolocationClientMock.h; path = mock/GeolocationClientMock.h; sourceTree = "<group>"; }; … … 12698 12704 isa = PBXGroup; 12699 12705 children = ( 12706 00022E6813CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h */, 12700 12707 00CA93B413C6697C00F7FE95 /* NewXMLDocumentParser.cpp */, 12701 12708 00CA93B113C6691600F7FE95 /* NewXMLDocumentParser.h */, 12709 00D0464813C4D14500326FCC /* XMLCharacterReferenceParser.cpp */, 12710 00D0464913C4D14500326FCC /* XMLCharacterReferenceParser.h */, 12702 12711 00B9318113BA867F0035A948 /* XMLDocumentParser.cpp */, 12703 12712 00B9318213BA867F0035A948 /* XMLDocumentParser.h */, … … 20279 20288 514185EE0CD65F0400763C99 /* ChangeVersionWrapper.h in Headers */, 20280 20289 6550B6A0099DF0270090D781 /* CharacterData.h in Headers */, 20290 00022E6913CE1BBA00282D5B /* CharacterReferenceParserInlineMethods.h in Headers */, 20281 20291 B2C3DA2A0D006C1D00EF6F26 /* CharsetData.h in Headers */, 20282 20292 F55B3DB21251F12D003EF269 /* CheckboxInputType.h in Headers */, … … 23049 23059 9BAF3B2412C1A39800014BF1 /* WritingDirection.h in Headers */, 23050 23060 6565820209D1508D000E61D7 /* XLinkNames.h in Headers */, 23061 00D0464B13C4D14500326FCC /* XMLCharacterReferenceParser.h in Headers */, 
23051 23062 00B9318813BA8DBA0035A948 /* XMLDocumentParser.h in Headers */, 23052 23063 00B9318C13BA8DCC0035A948 /* XMLDocumentParserScope.h in Headers */, … … 25818 25829 93309E23099E64920056E581 /* WrapContentsInDummySpanCommand.cpp in Sources */, 25819 25830 A833C7CC0A2CF07400D57664 /* XLinkNames.cpp in Sources */, 25831 00D0464A13C4D14500326FCC /* XMLCharacterReferenceParser.cpp in Sources */, 25820 25832 00B9318713BA8DB30035A948 /* XMLDocumentParser.cpp in Sources */, 25821 25833 00B9318913BA8DBC0035A948 /* XMLDocumentParserLibxml2.cpp in Sources */, -
trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp
r66359 r91025 29 29 #include "HTMLEntityParser.h" 30 30 31 #include "CharacterReferenceParserInlineMethods.h" 31 32 #include "HTMLEntitySearch.h" 32 33 #include "HTMLEntityTable.h" … … 46 47 }; 47 48 48 inline UChar adjustEntity(UChar32 value)49 {50 if ((value & ~0x1F) != 0x0080)51 return value;52 return windowsLatin1ExtensionArray[value - 0x80];53 }54 55 inline UChar32 legalEntityFor(UChar32 value)56 {57 // FIXME: A number of specific entity values generate parse errors.58 if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))59 return 0xFFFD;60 if (U_IS_BMP(value))61 return adjustEntity(value);62 return value;63 }64 65 inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)66 {67 if (U_IS_BMP(value)) {68 UChar character = static_cast<UChar>(value);69 ASSERT(character == value);70 decodedEntity.append(character);71 return true;72 }73 decodedEntity.append(U16_LEAD(value));74 decodedEntity.append(U16_TRAIL(value));75 return true;76 }77 78 inline bool isHexDigit(UChar cc)79 {80 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');81 }82 83 49 inline bool isAlphaNumeric(UChar cc) 84 50 { … … 86 52 } 87 53 88 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters) 89 { 90 if (consumedCharacters.size() == 1) 91 source.push(consumedCharacters[0]); 92 else if (consumedCharacters.size() == 2) { 93 source.push(consumedCharacters[0]); 94 source.push(consumedCharacters[1]); 95 } else 96 source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size()))); 97 } 54 class HTMLEntityParser { 55 public: 56 inline static UChar adjustEntity(UChar32 value) 57 { 58 if ((value & ~0x1F) != 0x0080) 59 return value; 60 return windowsLatin1ExtensionArray[value - 0x80]; 61 } 62 63 inline static UChar32 legalEntityFor(UChar32 value) 64 { 65 // FIXME: A number of specific entity values generate parse errors. 
66 if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) 67 return 0xFFFD; 68 if (U_IS_BMP(value)) 69 return adjustEntity(value); 70 return value; 71 } 72 73 inline static bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity) 74 { 75 if (U_IS_BMP(value)) { 76 UChar character = static_cast<UChar>(value); 77 ASSERT(character == value); 78 decodedEntity.append(character); 79 return true; 80 } 81 decodedEntity.append(U16_LEAD(value)); 82 decodedEntity.append(U16_TRAIL(value)); 83 return true; 84 } 85 86 inline static bool acceptMalformed() { return true; } 87 88 inline static bool consumeNamedEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) 89 { 90 Vector<UChar, 10> consumedCharacters; 91 HTMLEntitySearch entitySearch; 92 while (!source.isEmpty()) { 93 cc = *source; 94 entitySearch.advance(cc); 95 if (!entitySearch.isEntityPrefix()) 96 break; 97 consumedCharacters.append(cc); 98 source.advanceAndASSERT(cc); 99 } 100 notEnoughCharacters = source.isEmpty(); 101 if (notEnoughCharacters) { 102 // We can't an entity because there might be a longer entity 103 // that we could match if we had more data. 104 unconsumeCharacters(source, consumedCharacters); 105 return false; 106 } 107 if (!entitySearch.mostRecentMatch()) { 108 ASSERT(!entitySearch.currentValue()); 109 unconsumeCharacters(source, consumedCharacters); 110 return false; 111 } 112 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 113 // We've consumed too many characters. We need to walk the 114 // source back to the point at which we had consumed an 115 // actual entity. 
116 unconsumeCharacters(source, consumedCharacters); 117 consumedCharacters.clear(); 118 const int length = entitySearch.mostRecentMatch()->length; 119 const UChar* reference = entitySearch.mostRecentMatch()->entity; 120 for (int i = 0; i < length; ++i) { 121 cc = *source; 122 ASSERT_UNUSED(reference, cc == *reference++); 123 consumedCharacters.append(cc); 124 source.advanceAndASSERT(cc); 125 ASSERT(!source.isEmpty()); 126 } 127 cc = *source; 128 } 129 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 130 || !additionalAllowedCharacter 131 || !(isAlphaNumeric(cc) || cc == '=')) { 132 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); 133 } 134 unconsumeCharacters(source, consumedCharacters); 135 return false; 136 } 137 }; 98 138 99 139 } 100 140 141 101 142 bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 102 143 { 103 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 104 ASSERT(!notEnoughCharacters); 105 ASSERT(decodedEntity.isEmpty()); 106 107 enum EntityState { 108 Initial, 109 Number, 110 MaybeHexLowerCaseX, 111 MaybeHexUpperCaseX, 112 Hex, 113 Decimal, 114 Named 115 }; 116 EntityState entityState = Initial; 117 UChar32 result = 0; 118 Vector<UChar, 10> consumedCharacters; 119 120 while (!source.isEmpty()) { 121 UChar cc = *source; 122 switch (entityState) { 123 case Initial: { 124 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') 125 return false; 126 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) 127 return false; 128 if (cc == '#') { 129 entityState = Number; 130 break; 131 } 132 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { 133 entityState = Named; 134 continue; 135 } 136 return false; 137 } 138 case Number: { 139 if (cc == 'x') { 140 entityState = 
MaybeHexLowerCaseX; 141 break; 142 } 143 if (cc == 'X') { 144 entityState = MaybeHexUpperCaseX; 145 break; 146 } 147 if (cc >= '0' && cc <= '9') { 148 entityState = Decimal; 149 continue; 150 } 151 source.push('#'); 152 return false; 153 } 154 case MaybeHexLowerCaseX: { 155 if (isHexDigit(cc)) { 156 entityState = Hex; 157 continue; 158 } 159 source.push('#'); 160 source.push('x'); 161 return false; 162 } 163 case MaybeHexUpperCaseX: { 164 if (isHexDigit(cc)) { 165 entityState = Hex; 166 continue; 167 } 168 source.push('#'); 169 source.push('X'); 170 return false; 171 } 172 case Hex: { 173 if (cc >= '0' && cc <= '9') 174 result = result * 16 + cc - '0'; 175 else if (cc >= 'a' && cc <= 'f') 176 result = result * 16 + 10 + cc - 'a'; 177 else if (cc >= 'A' && cc <= 'F') 178 result = result * 16 + 10 + cc - 'A'; 179 else { 180 if (cc == ';') 181 source.advanceAndASSERT(cc); 182 return convertToUTF16(legalEntityFor(result), decodedEntity); 183 } 184 break; 185 } 186 case Decimal: { 187 if (cc >= '0' && cc <= '9') 188 result = result * 10 + cc - '0'; 189 else { 190 if (cc == ';') 191 source.advanceAndASSERT(cc); 192 return convertToUTF16(legalEntityFor(result), decodedEntity); 193 } 194 break; 195 } 196 case Named: { 197 HTMLEntitySearch entitySearch; 198 while (!source.isEmpty()) { 199 cc = *source; 200 entitySearch.advance(cc); 201 if (!entitySearch.isEntityPrefix()) 202 break; 203 consumedCharacters.append(cc); 204 source.advanceAndASSERT(cc); 205 } 206 notEnoughCharacters = source.isEmpty(); 207 if (notEnoughCharacters) { 208 // We can't an entity because there might be a longer entity 209 // that we could match if we had more data. 
210 unconsumeCharacters(source, consumedCharacters); 211 return false; 212 } 213 if (!entitySearch.mostRecentMatch()) { 214 ASSERT(!entitySearch.currentValue()); 215 unconsumeCharacters(source, consumedCharacters); 216 return false; 217 } 218 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 219 // We've consumed too many characters. We need to walk the 220 // source back to the point at which we had consumed an 221 // actual entity. 222 unconsumeCharacters(source, consumedCharacters); 223 consumedCharacters.clear(); 224 const int length = entitySearch.mostRecentMatch()->length; 225 const UChar* reference = entitySearch.mostRecentMatch()->entity; 226 for (int i = 0; i < length; ++i) { 227 cc = *source; 228 ASSERT_UNUSED(reference, cc == *reference++); 229 consumedCharacters.append(cc); 230 source.advanceAndASSERT(cc); 231 ASSERT(!source.isEmpty()); 232 } 233 cc = *source; 234 } 235 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 236 || !additionalAllowedCharacter 237 || !(isAlphaNumeric(cc) || cc == '=')) { 238 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); 239 } 240 unconsumeCharacters(source, consumedCharacters); 241 return false; 242 } 243 } 244 consumedCharacters.append(cc); 245 source.advanceAndASSERT(cc); 246 } 247 ASSERT(source.isEmpty()); 248 notEnoughCharacters = true; 249 unconsumeCharacters(source, consumedCharacters); 250 return false; 144 return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter); 251 145 } 252 146
Note: See TracChangeset for help on using the changeset viewer.