Changeset 225683 in webkit
- Timestamp:
- Dec 8, 2017 10:27:18 AM (6 years ago)
- Location:
- trunk/Source/JavaScriptCore
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/JavaScriptCore/ChangeLog
r225664 r225683 1 2017-12-08 Michael Saboff <msaboff@apple.com> 2 3 YARR: Coalesce constructed character classes 4 https://bugs.webkit.org/show_bug.cgi?id=180537 5 6 Reviewed by JF Bastien. 7 8 When adding characters or character ranges to a character class being constructed, 9 we now coalesce adjacent characters and character ranges. When we create a 10 character class after construction is complete, we do a final coalescing pass 11 across the character list and ranges to catch any remaining coalescing 12 opportunities. 13 14 Added an optimization for character classes that will match any character. 15 This is somewhat common in code created before the /s (dotAll) flag was added 16 to the engine. 17 18 * yarr/YarrInterpreter.cpp: 19 (JSC::Yarr::Interpreter::checkCharacterClass): 20 * yarr/YarrJIT.cpp: 21 (JSC::Yarr::YarrGenerator::generateCharacterClassOnce): 22 (JSC::Yarr::YarrGenerator::generateCharacterClassFixed): 23 (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy): 24 (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy): 25 * yarr/YarrPattern.cpp: 26 (JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor): 27 (JSC::Yarr::CharacterClassConstructor::reset): 28 (JSC::Yarr::CharacterClassConstructor::charClass): 29 (JSC::Yarr::CharacterClassConstructor::addSorted): 30 (JSC::Yarr::CharacterClassConstructor::addSortedRange): 31 (JSC::Yarr::CharacterClassConstructor::mergeRangesFrom): 32 (JSC::Yarr::CharacterClassConstructor::coalesceTables): 33 (JSC::Yarr::CharacterClassConstructor::anyCharacter): 34 (JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd): 35 (JSC::Yarr::PatternTerm::dump): 36 (JSC::Yarr::anycharCreate): 37 * yarr/YarrPattern.h: 38 (JSC::Yarr::CharacterClass::CharacterClass): 39 1 40 2017-12-07 Saam Barati <sbarati@apple.com> 2 41 -
trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp
r224072 r225683 409 409 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset) 410 410 { 411 if (characterClass->m_anyCharacter) 412 return !invert; 413 411 414 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset)); 412 415 return invert ? !match : match; -
trunk/Source/JavaScriptCore/yarr/YarrJIT.cpp
r225271 r225683 1172 1172 // If we are matching the "any character" builtin class we only need to read the 1173 1173 // character and don't need to match as it will always succeed. 1174 if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {1174 if (term->invert() || !term->characterClass->m_anyCharacter) { 1175 1175 matchCharacterClass(character, matchDest, term->characterClass); 1176 1176 … … 1221 1221 // If we are matching the "any character" builtin class we only need to read the 1222 1222 // character and don't need to match as it will always succeed. 1223 if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {1223 if (term->invert() || !term->characterClass->m_anyCharacter) { 1224 1224 matchCharacterClass(character, matchDest, term->characterClass); 1225 1225 … … 1273 1273 // If we are matching the "any character" builtin class we only need to read the 1274 1274 // character and don't need to match as it will always succeed. 1275 if ( term->characterClass != m_pattern.anyCharacterClass()) {1275 if (!term->characterClass->m_anyCharacter) { 1276 1276 matchCharacterClass(character, matchDest, term->characterClass); 1277 1277 failures.append(jump()); … … 1379 1379 // If we are matching the "any character" builtin class we only need to read the 1380 1380 // character and don't need to match as it will always succeed. 1381 if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {1381 if (term->invert() || !term->characterClass->m_anyCharacter) { 1382 1382 matchCharacterClass(character, matchDest, term->characterClass); 1383 1383 -
trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp
r223142 r225683 49 49 : m_isCaseInsensitive(isCaseInsensitive) 50 50 , m_hasNonBMPCharacters(false) 51 , m_anyCharacter(false) 51 52 , m_canonicalMode(canonicalMode) 52 53 { … … 60 61 m_rangesUnicode.clear(); 61 62 m_hasNonBMPCharacters = false; 63 m_anyCharacter = false; 62 64 } 63 65 … … 239 241 std::unique_ptr<CharacterClass> charClass() 240 242 { 243 coalesceTables(); 244 241 245 auto characterClass = std::make_unique<CharacterClass>(); 242 246 … … 246 250 characterClass->m_rangesUnicode.swap(m_rangesUnicode); 247 251 characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters(); 252 characterClass->m_anyCharacter = anyCharacter(); 253 254 m_hasNonBMPCharacters = false; 255 m_anyCharacter = false; 248 256 249 257 return characterClass; … … 271 279 if (!val) 272 280 return; 273 else if (val > 0) 281 else if (val > 0) { 282 if (val == 1) { 283 UChar32 lo = ch; 284 UChar32 hi = ch + 1; 285 matches.remove(pos + index); 286 if (pos + index > 0 && matches[pos + index - 1] == ch - 1) { 287 lo = ch - 1; 288 matches.remove(pos + index - 1); 289 } 290 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); 291 return; 292 } 274 293 range = index; 275 else { 294 } else { 295 if (val == -1) { 296 UChar32 lo = ch - 1; 297 UChar32 hi = ch; 298 matches.remove(pos + index); 299 if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) { 300 hi = ch + 1; 301 matches.remove(pos + index + 1); 302 } 303 addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); 304 return; 305 } 276 306 pos += (index+1); 277 307 range -= (index+1); … … 287 317 void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi) 288 318 { 289 unsignedend = ranges.size();319 size_t end = ranges.size(); 290 320 291 321 if (!U_IS_BMP(hi)) … … 294 324 // Simple linear scan - I doubt there are that many ranges anyway... 295 325 // feel free to fix this with something faster (eg binary chop). 296 for ( unsignedi = 0; i < end; ++i) {326 for (size_t i = 0; i < end; ++i) { 297 327 // does the new range fall before the current position in the array 298 328 if (hi < ranges[i].begin) { 299 // optional optimization: concatenate appending ranges? - may not be worthwhile.329 // Concatenate appending ranges. 300 330 if (hi == (ranges[i].begin - 1)) { 301 331 ranges[i].begin = lo; … … 313 343 ranges[i].end = std::max(ranges[i].end, hi); 314 344 315 // now check if the new range can subsume any subsequent ranges. 316 unsigned next = i+1; 317 // each iteration of the loop we will either remove something from the list, or break the loop. 318 while (next < ranges.size()) { 319 if (ranges[next].begin <= (ranges[i].end + 1)) { 320 // the next entry now overlaps / concatenates this one. 321 ranges[i].end = std::max(ranges[i].end, ranges[next].end); 322 ranges.remove(next); 323 } else 324 break; 325 } 326 345 mergeRangesFrom(ranges, i); 327 346 return; 328 347 } … … 333 352 } 334 353 354 void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index) 355 { 356 unsigned next = index + 1; 357 358 // each iteration of the loop we will either remove something from the list, or break out of the loop. 359 while (next < ranges.size()) { 360 if (ranges[next].begin <= (ranges[index].end + 1)) { 361 // the next entry now overlaps / concatenates with this one. 362 ranges[index].end = std::max(ranges[index].end, ranges[next].end); 363 ranges.remove(next); 364 } else 365 break; 366 } 367 368 } 369 370 void coalesceTables() 371 { 372 auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) { 373 374 size_t matchesIndex = 0; 375 size_t rangesIndex = 0; 376 377 while (matchesIndex < matches.size() && rangesIndex < ranges.size()) { 378 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1) 379 matchesIndex++; 380 381 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) { 382 ranges[rangesIndex].begin = matches[matchesIndex]; 383 matches.remove(matchesIndex); 384 } 385 386 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1) 387 matchesIndex++; 388 389 if (matchesIndex < matches.size()) { 390 if (matches[matchesIndex] == ranges[rangesIndex].end + 1) { 391 ranges[rangesIndex].end = matches[matchesIndex]; 392 matches.remove(matchesIndex); 393 394 mergeRangesFrom(ranges, rangesIndex); 395 } else 396 matchesIndex++; 397 } 398 } 399 }; 400 401 coalesceMatchesAndRanges(m_matches, m_ranges); 402 coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode); 403 404 if (!m_matches.size() && !m_matchesUnicode.size() 405 && m_ranges.size() == 1 && m_rangesUnicode.size() == 1 406 && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f 407 && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff) 408 m_anyCharacter = true; 409 } 410 335 411 bool hasNonBMPCharacters() 336 412 { … … 338 414 } 339 415 340 bool m_isCaseInsensitive; 341 bool m_hasNonBMPCharacters; 416 bool anyCharacter() 417 { 418 return m_anyCharacter; 419 } 420 421 bool m_isCaseInsensitive : 1; 422 bool m_hasNonBMPCharacters : 1; 423 bool m_anyCharacter : 1; 342 424 CanonicalMode m_canonicalMode; 343 425 … … 490 572 { 491 573 auto newCharacterClass = m_characterClassConstructor.charClass(); 574 575 if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) { 576 m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false)); 577 return; 578 } 492 579 m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); 493 580 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); … … 1181 1268 case TypeCharacterClass: 1182 1269 out.print("character class "); 1183 if (characterClass == thisPattern->anyCharacterClass())1270 if (characterClass->m_anyCharacter) 1184 1271 out.print("<any character>"); 1185 1272 else if (characterClass == thisPattern->newlineCharacterClass()) … … 1384 1471 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff)); 1385 1472 characterClass->m_hasNonBMPCharacters = true; 1473 characterClass->m_anyCharacter = true; 1386 1474 return characterClass; 1387 1475 } -
trunk/Source/JavaScriptCore/yarr/YarrPattern.h
r223081 r225683 60 60 : m_table(0) 61 61 , m_hasNonBMPCharacters(false) 62 , m_anyCharacter(false) 62 63 { 63 64 } … … 66 67 , m_tableInverted(inverted) 67 68 , m_hasNonBMPCharacters(false) 69 , m_anyCharacter(false) 68 70 { 69 71 } … … 76 78 , m_tableInverted(false) 77 79 , m_hasNonBMPCharacters(false) 80 , m_anyCharacter(false) 78 81 { 79 82 } … … 85 88 86 89 const char* m_table; 87 bool m_tableInverted; 88 bool m_hasNonBMPCharacters; 90 bool m_tableInverted : 1; 91 bool m_hasNonBMPCharacters : 1; 92 bool m_anyCharacter : 1; 89 93 }; 90 94
Note: See TracChangeset
for help on using the changeset viewer.