Changeset 112143 in webkit
- Timestamp:
- Mar 26, 2012 1:13:39 PM (12 years ago)
- Location:
- trunk
- Files:
-
- 6 added
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/LayoutTests/ChangeLog
r112137 r112143 1 2012-03-25 Gavin Barraclough <barraclough@apple.com> 2 3 Greek sigma is handled wrong in case independent regexp. 4 https://bugs.webkit.org/show_bug.cgi?id=82063 5 6 Reviewed by Oliver Hunt. 7 8 * fast/regex/script-tests/unicodeCaseInsensitive.js: Added. 9 (shouldBeTrue.ucs2CodePoint): 10 * fast/regex/unicodeCaseInsensitive-expected.txt: Added. 11 * fast/regex/unicodeCaseInsensitive.html: Added. 12 - Added test cases for case-insensitive matches of non-ascii characters. 13 1 14 2012-03-26 Emil A Eklund <eae@chromium.org> 2 15 -
trunk/Source/JavaScriptCore/CMakeLists.txt
r111974 r112143 220 220 tools/CodeProfiling.cpp 221 221 222 yarr/YarrCanonicalizeUCS2.cpp 222 223 yarr/YarrPattern.cpp 223 224 yarr/YarrInterpreter.cpp -
trunk/Source/JavaScriptCore/ChangeLog
r112123 r112143 1 2012-03-25 Gavin Barraclough <barraclough@apple.com> 2 3 Greek sigma is handled wrong in case independent regexp. 4 https://bugs.webkit.org/show_bug.cgi?id=82063 5 6 Reviewed by Oliver Hunt. 7 8 The bug here is that we assume that any given codepoint has at most one additional value it 9 should match under a case insensitive match, and that the pair of codepoints that match (if 10 a codepoint does not only match itself) can be determined by calling toUpper/toLower on the 11 given codepoint. Life is not that simple. 12 13 Instead, pre-calculate a set of tables mapping from a UCS2 codepoint to the set of characters 14 it may match, under the ES5.1 case-insensitive matching rules. Since unicode is fairly regular 15 we can pack this table quite nicely, and get it down to 364 entries. This means we can use a 16 simple binary search to find an entry in typically eight compares. 17 18 * CMakeLists.txt: 19 * GNUmakefile.list.am: 20 * JavaScriptCore.gypi: 21 * JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.vcproj: 22 * JavaScriptCore.xcodeproj/project.pbxproj: 23 * yarr/yarr.pri: 24 - Added new files to build systems. 25 * yarr/YarrCanonicalizeUCS2.cpp: Added. 26 - New - autogenerated, UCS2 canonicalized comparison tables. 27 * yarr/YarrCanonicalizeUCS2.h: Added. 28 (JSC::Yarr::rangeInfoFor): 29 - Look up the canonicalization info for a UCS2 character. 30 (JSC::Yarr::getCanonicalPair): 31 - For a UCS2 character with a single equivalent value, look it up. 32 (JSC::Yarr::isCanonicallyUnique): 33 - Returns true if no other UCS2 code points are canonically equal. 34 (JSC::Yarr::areCanonicallyEquivalent): 35 - Compare two values, under canonicalization rules. 36 * yarr/YarrCanonicalizeUCS2.js: Added. 37 - script used to generate YarrCanonicalizeUCS2.cpp. 38 * yarr/YarrInterpreter.cpp: 39 (JSC::Yarr::Interpreter::tryConsumeBackReference): 40 - Use areCanonicallyEquivalent, rather than Unicode toUpper/toLower. 
41 * yarr/YarrJIT.cpp: 42 (JSC::Yarr::YarrGenerator::jumpIfCharNotEquals): 43 (JSC::Yarr::YarrGenerator::generatePatternCharacterOnce): 44 (JSC::Yarr::YarrGenerator::generatePatternCharacterFixed): 45 - Use isCanonicallyUnique, rather than Unicode toUpper/toLower. 46 * yarr/YarrPattern.cpp: 47 (JSC::Yarr::CharacterClassConstructor::putChar): 48 - Updated to determine canonical equivalents correctly. 49 (JSC::Yarr::CharacterClassConstructor::putUnicodeIgnoreCase): 50 - Added, used to put a non-ascii, non-unique character in a case-insensitive match. 51 (JSC::Yarr::CharacterClassConstructor::putRange): 52 - Updated to determine canonical equivalents correctly. 53 (JSC::Yarr::YarrPatternConstructor::atomPatternCharacter): 54 - Changed to call putUnicodeIgnoreCase, instead of putChar, to avoid a double lookup of rangeInfo. 55 1 56 2012-03-26 Kevin Ollivier <kevino@theolliviers.com> 2 57 -
trunk/Source/JavaScriptCore/GNUmakefile.list.am
r112082 r112143 579 579 Source/JavaScriptCore/tools/TieredMMapArray.h \ 580 580 Source/JavaScriptCore/yarr/Yarr.h \ 581 Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.cpp \ 582 Source/JavaScriptCore/yarr/YarrCanonicalizeUCS2.h \ 581 583 Source/JavaScriptCore/yarr/YarrInterpreter.cpp \ 582 584 Source/JavaScriptCore/yarr/YarrInterpreter.h \ -
trunk/Source/JavaScriptCore/JavaScriptCore.gypi
r111889 r112143 139 139 'runtime/WriteBarrier.h', 140 140 'yarr/Yarr.h', 141 'yarr/YarrCanonicalizeUCS2.h', 141 142 'yarr/YarrInterpreter.h', 142 143 'yarr/YarrPattern.h', … … 437 438 'runtime/UString.cpp', 438 439 'runtime/UStringConcatenate.h', 440 'yarr/YarrCanonicalizeUCS2.cpp', 439 441 'yarr/YarrInterpreter.cpp', 440 442 'yarr/YarrJIT.cpp', -
trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.vcproj
r111889 r112143 1735 1735 </File> 1736 1736 <File 1737 RelativePath="..\..\yarr\YarrCanonicalizeUCS2.cpp" 1738 > 1739 </File> 1740 <File 1741 RelativePath="..\..\yarr\YarrCanonicalizeUCS2.h" 1742 > 1743 </File> 1744 <File 1737 1745 RelativePath="..\..\yarr\YarrInterpreter.cpp" 1738 1746 > -
trunk/Source/JavaScriptCore/JavaScriptCore.xcodeproj/project.pbxproj
r112040 r112143 341 341 8642C510151C06A90046D4EF /* RegExpCachedResult.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 86F75EFB151C062F007C9BA3 /* RegExpCachedResult.cpp */; }; 342 342 8642C512151C083D0046D4EF /* RegExpMatchesArray.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 86F75EFD151C062F007C9BA3 /* RegExpMatchesArray.cpp */; }; 343 8642C547151FE26B0046D4EF /* YarrCanonicalizeUCS2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 8642C544151FE26B0046D4EF /* YarrCanonicalizeUCS2.cpp */; }; 344 8642C548151FE26B0046D4EF /* YarrCanonicalizeUCS2.h in Headers */ = {isa = PBXBuildFile; fileRef = 8642C545151FE26B0046D4EF /* YarrCanonicalizeUCS2.h */; }; 343 345 865A30F1135007E100CDB49E /* JSValueInlineMethods.h in Headers */ = {isa = PBXBuildFile; fileRef = 865A30F0135007E100CDB49E /* JSValueInlineMethods.h */; settings = {ATTRIBUTES = (Private, ); }; }; 344 346 865F408810E7D56300947361 /* APIShims.h in Headers */ = {isa = PBXBuildFile; fileRef = 865F408710E7D56300947361 /* APIShims.h */; settings = {ATTRIBUTES = (Private, ); }; }; … … 977 979 8604F503143CE1C100B295F5 /* JSGlobalThis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSGlobalThis.h; sourceTree = "<group>"; }; 978 980 863B23DF0FC60E6200703AA4 /* MacroAssemblerCodeRef.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MacroAssemblerCodeRef.h; sourceTree = "<group>"; }; 981 8642C544151FE26B0046D4EF /* YarrCanonicalizeUCS2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = YarrCanonicalizeUCS2.cpp; path = yarr/YarrCanonicalizeUCS2.cpp; sourceTree = "<group>"; }; 982 8642C545151FE26B0046D4EF /* YarrCanonicalizeUCS2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = YarrCanonicalizeUCS2.h; path = yarr/YarrCanonicalizeUCS2.h; sourceTree = "<group>"; }; 983 8642C546151FE26B0046D4EF /* YarrCanonicalizeUCS2.js */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.javascript; name = YarrCanonicalizeUCS2.js; path = yarr/YarrCanonicalizeUCS2.js; sourceTree = "<group>"; }; 979 984 865A30F0135007E100CDB49E /* JSValueInlineMethods.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSValueInlineMethods.h; sourceTree = "<group>"; }; 980 985 865F408710E7D56300947361 /* APIShims.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = APIShims.h; sourceTree = "<group>"; }; … … 1953 1958 children = ( 1954 1959 451539B812DC994500EF7AC4 /* Yarr.h */, 1960 8642C544151FE26B0046D4EF /* YarrCanonicalizeUCS2.cpp */, 1961 8642C545151FE26B0046D4EF /* YarrCanonicalizeUCS2.h */, 1962 8642C546151FE26B0046D4EF /* YarrCanonicalizeUCS2.js */, 1955 1963 86704B7D12DBA33700A9FE7B /* YarrInterpreter.cpp */, 1956 1964 86704B7E12DBA33700A9FE7B /* YarrInterpreter.h */, … … 2503 2511 0FA581BB150E953000B9A2D9 /* DFGNodeFlags.h in Headers */, 2504 2512 0FA581BC150E953000B9A2D9 /* DFGNodeType.h in Headers */, 2513 <<<<<<< .mine 2514 8642C548151FE26B0046D4EF /* YarrCanonicalizeUCS2.h in Headers */, 2515 ======= 2505 2516 0F2BDC16151C5D4F00CD8910 /* DFGFixupPhase.h in Headers */, 2506 2517 0F2BDC21151E803B00CD8910 /* DFGInsertionSet.h in Headers */, 2507 2518 0F2BDC2C151FDE9100CD8910 /* Operands.h in Headers */, 2519 >>>>>>> .r112137 2508 2520 ); 2509 2521 runOnlyForDeploymentPostprocessing = 0; … … 3063 3075 8642C510151C06A90046D4EF /* RegExpCachedResult.cpp in Sources */, 3064 3076 8642C512151C083D0046D4EF /* RegExpMatchesArray.cpp in Sources */, 3077 8642C547151FE26B0046D4EF /* YarrCanonicalizeUCS2.cpp in Sources */, 3065 3078 ); 3066 3079 runOnlyForDeploymentPostprocessing = 0; -
trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
r108858 r112143 30 30 #include "UString.h" 31 31 #include "Yarr.h" 32 #include "YarrCanonicalizeUCS2.h" 32 33 #include <wtf/BumpPointerAllocator.h> 33 34 #include <wtf/DataLog.h> … … 384 385 if (pattern->m_ignoreCase) { 385 386 for (unsigned i = 0; i < matchSize; ++i) { 386 int ch = input.reread(matchBegin + i); 387 388 int lo = Unicode::toLower(ch); 389 int hi = Unicode::toUpper(ch); 390 391 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, negativeInputOffset + matchSize - i)) : (!checkCharacter(ch, negativeInputOffset + matchSize - i))) { 392 input.uncheckInput(matchSize); 393 return false; 394 } 387 int oldCh = input.reread(matchBegin + i); 388 int ch = input.readChecked(negativeInputOffset + matchSize - i); 389 390 if (oldCh == ch) 391 continue; 392 393 // The definition for canonicalize (see ES 5.1, 15.10.2.8) means that 394 // unicode values are never allowed to match against ascii ones. 395 if (isASCII(oldCh) || isASCII(ch)) { 396 if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) 397 continue; 398 } else if (areCanonicallyEquivalent(oldCh, ch)) 399 continue; 400 401 input.uncheckInput(matchSize); 402 return false; 395 403 } 396 404 } else { -
trunk/Source/JavaScriptCore/yarr/YarrJIT.cpp
r110033 r112143 30 30 #include "LinkBuffer.h" 31 31 #include "Yarr.h" 32 #include "YarrCanonicalizeUCS2.h" 32 33 33 34 #if ENABLE(YARR_JIT) … … 263 264 // For case-insesitive compares, non-ascii characters that have different 264 265 // upper & lower case representations are converted to a character class. 265 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));266 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); 266 267 if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { 267 or32(TrustedImm32( 32), character);268 ch = Unicode::toLower(ch);268 or32(TrustedImm32(0x20), character); 269 ch |= 0x20; 269 270 } 270 271 … … 686 687 // For case-insesitive compares, non-ascii characters that have different 687 688 // upper & lower case representations are converted to a character class. 688 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));689 690 if ( (m_pattern.m_ignoreCase) && (isASCIIAlpha(ch)))689 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); 690 691 if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) 691 692 ignoreCaseMask |= 32; 692 693 … … 714 715 // For case-insesitive compares, non-ascii characters that have different 715 716 // upper & lower case representations are converted to a character class. 716 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || (Unicode::toLower(currentCharacter) == Unicode::toUpper(currentCharacter)));717 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(currentCharacter) || isCanonicallyUnique(currentCharacter)); 717 718 718 719 allCharacters |= (currentCharacter << shiftAmount); … … 791 792 // For case-insesitive compares, non-ascii characters that have different 792 793 // upper & lower case representations are converted to a character class. 
793 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || (Unicode::toLower(ch) == Unicode::toUpper(ch)));794 ASSERT(!m_pattern.m_ignoreCase || isASCIIAlpha(ch) || isCanonicallyUnique(ch)); 794 795 if (m_pattern.m_ignoreCase && isASCIIAlpha(ch)) { 795 or32(TrustedImm32( 32), character);796 ch = Unicode::toLower(ch);796 or32(TrustedImm32(0x20), character); 797 ch |= 0x20; 797 798 } 798 799 -
trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
r106748 r112143 29 29 30 30 #include "Yarr.h" 31 #include "YarrCanonicalizeUCS2.h" 31 32 #include "YarrParser.h" 32 33 #include <wtf/Vector.h> … … 67 68 void putChar(UChar ch) 68 69 { 70 // Handle ascii cases. 69 71 if (ch <= 0x7f) { 70 72 if (m_isCaseInsensitive && isASCIIAlpha(ch)) { … … 73 75 } else 74 76 addSorted(m_matches, ch); 77 return; 78 } 79 80 // Simple case, not a case-insensitive match. 81 if (!m_isCaseInsensitive) { 82 addSorted(m_matchesUnicode, ch); 83 return; 84 } 85 86 // Add multiple matches, if necessary. 87 UCS2CanonicalizationRange* info = rangeInfoFor(ch); 88 if (info->type == CanonicalizeUnique) 89 addSorted(m_matchesUnicode, ch); 90 else 91 putUnicodeIgnoreCase(ch, info); 92 } 93 94 void putUnicodeIgnoreCase(UChar ch, UCS2CanonicalizationRange* info) 95 { 96 ASSERT(m_isCaseInsensitive); 97 ASSERT(ch > 0x7f); 98 ASSERT(ch >= info->begin && ch <= info->end); 99 ASSERT(info->type != CanonicalizeUnique); 100 if (info->type == CanonicalizeSet) { 101 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 102 addSorted(m_matchesUnicode, ch); 75 103 } else { 76 UChar upper, lower; 77 if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { 78 addSorted(m_matchesUnicode, upper); 79 addSorted(m_matchesUnicode, lower); 80 } else 81 addSorted(m_matchesUnicode, ch); 82 } 83 } 84 85 // returns true if this character has another case, and 'ch' is the upper case form. 86 static inline bool isUnicodeUpper(UChar ch) 87 { 88 return ch != Unicode::toLower(ch); 89 } 90 91 // returns true if this character has another case, and 'ch' is the lower case form. 
92 static inline bool isUnicodeLower(UChar ch) 93 { 94 return ch != Unicode::toUpper(ch); 104 addSorted(m_matchesUnicode, ch); 105 addSorted(m_matchesUnicode, getCanonicalPair(info, ch)); 106 } 95 107 } 96 108 … … 109 121 } 110 122 } 111 if (hi >= 0x80) { 112 uint32_t unicodeCurr = std::max(lo, (UChar)0x80); 113 addSortedRange(m_rangesUnicode, unicodeCurr, hi); 114 115 if (m_isCaseInsensitive) { 116 while (unicodeCurr <= hi) { 117 // If the upper bound of the range (hi) is 0xffff, the increments to 118 // unicodeCurr in this loop may take it to 0x10000. This is fine 119 // (if so we won't re-enter the loop, since the loop condition above 120 // will definitely fail) - but this does mean we cannot use a UChar 121 // to represent unicodeCurr, we must use a 32-bit value instead. 122 ASSERT(unicodeCurr <= 0xffff); 123 124 if (isUnicodeUpper(unicodeCurr)) { 125 UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr); 126 UChar lowerCaseRangeEnd = lowerCaseRangeBegin; 127 while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1))) 128 lowerCaseRangeEnd++; 129 addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd); 130 } else if (isUnicodeLower(unicodeCurr)) { 131 UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr); 132 UChar upperCaseRangeEnd = upperCaseRangeBegin; 133 while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1))) 134 upperCaseRangeEnd++; 135 addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd); 136 } else 137 ++unicodeCurr; 138 } 139 } 140 } 123 if (hi <= 0x7f) 124 return; 125 126 lo = std::max(lo, (UChar)0x80); 127 addSortedRange(m_rangesUnicode, lo, hi); 128 129 if (!m_isCaseInsensitive) 130 return; 131 132 UCS2CanonicalizationRange* info = rangeInfoFor(lo); 133 while (true) { 134 // Handle the range [lo .. 
end] 135 UChar end = std::min(info->end, hi); 136 137 switch (info->type) { 138 case CanonicalizeUnique: 139 // Nothing to do - no canonical equivalents. 140 break; 141 case CanonicalizeSet: { 142 UChar ch; 143 for (uint16_t* set = characterSetInfo[info->value]; (ch = *set); ++set) 144 addSorted(m_matchesUnicode, ch); 145 break; 146 } 147 case CanonicalizeRangeLo: 148 addSortedRange(m_rangesUnicode, lo + info->value, end + info->value); 149 break; 150 case CanonicalizeRangeHi: 151 addSortedRange(m_rangesUnicode, lo - info->value, end - info->value); 152 break; 153 case CanonicalizeAlternatingAligned: 154 // Use addSortedRange since there is likely an abutting range to combine with. 155 if (lo & 1) 156 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 157 if (!(end & 1)) 158 addSortedRange(m_rangesUnicode, end + 1, end + 1); 159 break; 160 case CanonicalizeAlternatingUnaligned: 161 // Use addSortedRange since there is likely an abutting range to combine with. 162 if (!(lo & 1)) 163 addSortedRange(m_rangesUnicode, lo - 1, lo - 1); 164 if (end & 1) 165 addSortedRange(m_rangesUnicode, end + 1, end + 1); 166 break; 167 } 168 169 if (hi == end) 170 return; 171 172 ++info; 173 lo = info->begin; 174 }; 175 141 176 } 142 177 … … 281 316 // We handle case-insensitive checking of unicode characters which do have both 282 317 // cases by handling them as if they were defined using a CharacterClass. 
283 if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { 284 atomCharacterClassBegin(); 285 atomCharacterClassAtom(ch); 286 atomCharacterClassEnd(); 287 } else 318 if (!m_pattern.m_ignoreCase || isASCII(ch)) { 288 319 m_alternative->m_terms.append(PatternTerm(ch)); 320 return; 321 } 322 323 UCS2CanonicalizationRange* info = rangeInfoFor(ch); 324 if (info->type == CanonicalizeUnique) { 325 m_alternative->m_terms.append(PatternTerm(ch)); 326 return; 327 } 328 329 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); 330 CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); 331 m_pattern.m_userCharacterClasses.append(newCharacterClass); 332 m_alternative->m_terms.append(PatternTerm(newCharacterClass, false)); 289 333 } 290 334 -
trunk/Source/JavaScriptCore/yarr/yarr.pri
r102237 r112143 8 8 $$PWD/YarrInterpreter.cpp \ 9 9 $$PWD/YarrPattern.cpp \ 10 $$PWD/YarrSyntaxChecker.cpp 10 $$PWD/YarrSyntaxChecker.cpp \ 11 $$PWD/YarrCanonicalizeUCS2.cpp 11 12 12 13 # For UString.h
Note: See TracChangeset
for help on using the changeset viewer.