Changeset 225683 in webkit


Ignore:
Timestamp:
Dec 8, 2017 10:27:18 AM (6 years ago)
Author:
msaboff@apple.com
Message:

YARR: Coalesce constructed character classes
https://bugs.webkit.org/show_bug.cgi?id=180537

Reviewed by JF Bastien.

When adding characters or character ranges to a character class being constructed,
we now coalesce adjacent characters and character ranges. When we create a
character class after construction is complete, we do a final coalescing pass
across the character list and ranges to catch any remaining coalescing
opportunities.

Added an optimization for character classes that will match any character.
This is somewhat common in code created before the /s (dotAll) flag was added
to the engine.

  • yarr/YarrInterpreter.cpp:

(JSC::Yarr::Interpreter::checkCharacterClass):

  • yarr/YarrJIT.cpp:

(JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
(JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
(JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
(JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):

  • yarr/YarrPattern.cpp:

(JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
(JSC::Yarr::CharacterClassConstructor::reset):
(JSC::Yarr::CharacterClassConstructor::charClass):
(JSC::Yarr::CharacterClassConstructor::addSorted):
(JSC::Yarr::CharacterClassConstructor::addSortedRange):
(JSC::Yarr::CharacterClassConstructor::mergeRangesFrom):
(JSC::Yarr::CharacterClassConstructor::coalesceTables):
(JSC::Yarr::CharacterClassConstructor::anyCharacter):
(JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd):
(JSC::Yarr::PatternTerm::dump):
(JSC::Yarr::anycharCreate):

  • yarr/YarrPattern.h:

(JSC::Yarr::CharacterClass::CharacterClass):

Location:
trunk/Source/JavaScriptCore
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/JavaScriptCore/ChangeLog

    r225664 r225683  
     12017-12-08  Michael Saboff  <msaboff@apple.com>
     2
     3        YARR: Coalesce constructed character classes
     4        https://bugs.webkit.org/show_bug.cgi?id=180537
     5
     6        Reviewed by JF Bastien.
     7
     8        When adding characters or character ranges to a character class being constructed,
     9        we now coalesce adjacent characters and character ranges.  When we create a
     10        character class after construction is complete, we do a final coalescing pass
     11        across the character list and ranges to catch any remaining coalescing
     12        opportunities.
     13
     14        Added an optimization for character classes that will match any character.
     15        This is somewhat common in code created before the /s (dotAll) flag was added
     16        to the engine.
     17
     18        * yarr/YarrInterpreter.cpp:
     19        (JSC::Yarr::Interpreter::checkCharacterClass):
     20        * yarr/YarrJIT.cpp:
     21        (JSC::Yarr::YarrGenerator::generateCharacterClassOnce):
     22        (JSC::Yarr::YarrGenerator::generateCharacterClassFixed):
     23        (JSC::Yarr::YarrGenerator::generateCharacterClassGreedy):
     24        (JSC::Yarr::YarrGenerator::backtrackCharacterClassNonGreedy):
     25        * yarr/YarrPattern.cpp:
     26        (JSC::Yarr::CharacterClassConstructor::CharacterClassConstructor):
     27        (JSC::Yarr::CharacterClassConstructor::reset):
     28        (JSC::Yarr::CharacterClassConstructor::charClass):
     29        (JSC::Yarr::CharacterClassConstructor::addSorted):
     30        (JSC::Yarr::CharacterClassConstructor::addSortedRange):
     31        (JSC::Yarr::CharacterClassConstructor::mergeRangesFrom):
     32        (JSC::Yarr::CharacterClassConstructor::coalesceTables):
     33        (JSC::Yarr::CharacterClassConstructor::anyCharacter):
     34        (JSC::Yarr::YarrPatternConstructor::atomCharacterClassEnd):
     35        (JSC::Yarr::PatternTerm::dump):
     36        (JSC::Yarr::anycharCreate):
     37        * yarr/YarrPattern.h:
     38        (JSC::Yarr::CharacterClass::CharacterClass):
     39
    1402017-12-07  Saam Barati  <sbarati@apple.com>
    241
  • trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp

    r224072 r225683  
    409409    bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
    410410    {
     411        if (characterClass->m_anyCharacter)
     412            return !invert;
     413
    411414        bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
    412415        return invert ? !match : match;
  • trunk/Source/JavaScriptCore/yarr/YarrJIT.cpp

    r225271 r225683  
    11721172        // If we are matching the "any character" builtin class we only need to read the
    11731173        // character and don't need to match as it will always succeed.
    1174         if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
     1174        if (term->invert() || !term->characterClass->m_anyCharacter) {
    11751175            matchCharacterClass(character, matchDest, term->characterClass);
    11761176
     
    12211221        // If we are matching the "any character" builtin class we only need to read the
    12221222        // character and don't need to match as it will always succeed.
    1223         if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
     1223        if (term->invert() || !term->characterClass->m_anyCharacter) {
    12241224            matchCharacterClass(character, matchDest, term->characterClass);
    12251225
     
    12731273            // If we are matching the "any character" builtin class we only need to read the
    12741274            // character and don't need to match as it will always succeed.
    1275             if (term->characterClass != m_pattern.anyCharacterClass()) {
     1275            if (!term->characterClass->m_anyCharacter) {
    12761276                matchCharacterClass(character, matchDest, term->characterClass);
    12771277                failures.append(jump());
     
    13791379        // If we are matching the "any character" builtin class we only need to read the
    13801380        // character and don't need to match as it will always succeed.
    1381         if (term->invert() || term->characterClass != m_pattern.anyCharacterClass()) {
     1381        if (term->invert() || !term->characterClass->m_anyCharacter) {
    13821382            matchCharacterClass(character, matchDest, term->characterClass);
    13831383
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r223142 r225683  
    4949        : m_isCaseInsensitive(isCaseInsensitive)
    5050        , m_hasNonBMPCharacters(false)
     51        , m_anyCharacter(false)
    5152        , m_canonicalMode(canonicalMode)
    5253    {
     
    6061        m_rangesUnicode.clear();
    6162        m_hasNonBMPCharacters = false;
     63        m_anyCharacter = false;
    6264    }
    6365
     
    239241    std::unique_ptr<CharacterClass> charClass()
    240242    {
     243        coalesceTables();
     244
    241245        auto characterClass = std::make_unique<CharacterClass>();
    242246
     
    246250        characterClass->m_rangesUnicode.swap(m_rangesUnicode);
    247251        characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
     252        characterClass->m_anyCharacter = anyCharacter();
     253
     254        m_hasNonBMPCharacters = false;
     255        m_anyCharacter = false;
    248256
    249257        return characterClass;
     
    271279            if (!val)
    272280                return;
    273             else if (val > 0)
     281            else if (val > 0) {
     282                if (val == 1) {
     283                    UChar32 lo = ch;
     284                    UChar32 hi = ch + 1;
     285                    matches.remove(pos + index);
     286                    if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
     287                        lo = ch - 1;
     288                        matches.remove(pos + index - 1);
     289                    }
     290                    addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
     291                    return;
     292                }
    274293                range = index;
    275             else {
     294            } else {
     295                if (val == -1) {
     296                    UChar32 lo = ch - 1;
     297                    UChar32 hi = ch;
     298                    matches.remove(pos + index);
     299                    if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
     300                        hi = ch + 1;
     301                        matches.remove(pos + index + 1);
     302                    }
     303                    addSortedRange(isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
     304                    return;
     305                }
    276306                pos += (index+1);
    277307                range -= (index+1);
     
    287317    void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
    288318    {
    289         unsigned end = ranges.size();
     319        size_t end = ranges.size();
    290320
    291321        if (!U_IS_BMP(hi))
     
    294324        // Simple linear scan - I doubt there are that many ranges anyway...
    295325        // feel free to fix this with something faster (eg binary chop).
    296         for (unsigned i = 0; i < end; ++i) {
     326        for (size_t i = 0; i < end; ++i) {
    297327            // does the new range fall before the current position in the array
    298328            if (hi < ranges[i].begin) {
    299                 // optional optimization: concatenate appending ranges? - may not be worthwhile.
     329                // Concatenate appending ranges.
    300330                if (hi == (ranges[i].begin - 1)) {
    301331                    ranges[i].begin = lo;
     
    313343                ranges[i].end = std::max(ranges[i].end, hi);
    314344
    315                 // now check if the new range can subsume any subsequent ranges.
    316                 unsigned next = i+1;
    317                 // each iteration of the loop we will either remove something from the list, or break the loop.
    318                 while (next < ranges.size()) {
    319                     if (ranges[next].begin <= (ranges[i].end + 1)) {
    320                         // the next entry now overlaps / concatenates this one.
    321                         ranges[i].end = std::max(ranges[i].end, ranges[next].end);
    322                         ranges.remove(next);
    323                     } else
    324                         break;
    325                 }
    326                
     345                mergeRangesFrom(ranges, i);
    327346                return;
    328347            }
     
    333352    }
    334353
     354    void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
     355    {
     356        unsigned next = index + 1;
     357
     358        // each iteration of the loop we will either remove something from the list, or break out of the loop.
     359        while (next < ranges.size()) {
     360            if (ranges[next].begin <= (ranges[index].end + 1)) {
     361                // the next entry now overlaps / concatenates with this one.
     362                ranges[index].end = std::max(ranges[index].end, ranges[next].end);
     363                ranges.remove(next);
     364            } else
     365                break;
     366        }
     367
     368    }
     369
     370    void coalesceTables()
     371    {
     372        auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
     373
     374            size_t matchesIndex = 0;
     375            size_t rangesIndex = 0;
     376
     377            while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
     378                while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
     379                    matchesIndex++;
     380
     381                if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
     382                    ranges[rangesIndex].begin = matches[matchesIndex];
     383                    matches.remove(matchesIndex);
     384                }
     385
     386                while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
     387                    matchesIndex++;
     388
     389                if (matchesIndex < matches.size()) {
     390                    if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
     391                        ranges[rangesIndex].end = matches[matchesIndex];
     392                        matches.remove(matchesIndex);
     393
     394                        mergeRangesFrom(ranges, rangesIndex);
     395                    } else
     396                        matchesIndex++;
     397                }
     398            }
     399        };
     400
     401        coalesceMatchesAndRanges(m_matches, m_ranges);
     402        coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
     403
     404        if (!m_matches.size() && !m_matchesUnicode.size()
     405            && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
     406            && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
     407            && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
     408            m_anyCharacter = true;
     409    }
     410
    335411    bool hasNonBMPCharacters()
    336412    {
     
    338414    }
    339415
    340     bool m_isCaseInsensitive;
    341     bool m_hasNonBMPCharacters;
     416    bool anyCharacter()
     417    {
     418        return m_anyCharacter;
     419    }
     420
     421    bool m_isCaseInsensitive : 1;
     422    bool m_hasNonBMPCharacters : 1;
     423    bool m_anyCharacter : 1;
    342424    CanonicalMode m_canonicalMode;
    343425
     
    490572    {
    491573        auto newCharacterClass = m_characterClassConstructor.charClass();
     574
     575        if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
     576            m_alternative->m_terms.append(PatternTerm(m_pattern.anyCharacterClass(), false));
     577            return;
     578        }
    492579        m_alternative->m_terms.append(PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
    493580        m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
     
    11811268    case TypeCharacterClass:
    11821269        out.print("character class ");
    1183         if (characterClass == thisPattern->anyCharacterClass())
     1270        if (characterClass->m_anyCharacter)
    11841271            out.print("<any character>");
    11851272        else if (characterClass == thisPattern->newlineCharacterClass())
     
    13841471    characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
    13851472    characterClass->m_hasNonBMPCharacters = true;
     1473    characterClass->m_anyCharacter = true;
    13861474    return characterClass;
    13871475}
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.h

    r223081 r225683  
    6060        : m_table(0)
    6161        , m_hasNonBMPCharacters(false)
     62        , m_anyCharacter(false)
    6263    {
    6364    }
     
    6667        , m_tableInverted(inverted)
    6768        , m_hasNonBMPCharacters(false)
     69        , m_anyCharacter(false)
    6870    {
    6971    }
     
    7678        , m_tableInverted(false)
    7779        , m_hasNonBMPCharacters(false)
     80        , m_anyCharacter(false)
    7881    {
    7982    }
     
    8588
    8689    const char* m_table;
    87     bool m_tableInverted;
    88     bool m_hasNonBMPCharacters;
     90    bool m_tableInverted : 1;
     91    bool m_hasNonBMPCharacters : 1;
     92    bool m_anyCharacter : 1;
    8993};
    9094
Note: See TracChangeset for help on using the changeset viewer.