Changeset 199523 in webkit


Ignore:
Timestamp:
Apr 13, 2016 5:47:40 PM (8 years ago)
Author:
msaboff@apple.com
Message:

Some tests fail with ES6 u (Unicode) flag for regular expressions
https://bugs.webkit.org/show_bug.cgi?id=151597

Reviewed by Geoffrey Garen.

Source/JavaScriptCore:

Added two new tables to handle the anomalies of \w and \W CharacterClassEscapes
when specified in RegExps with both the unicode and ignoreCase flags. Given the
case folding rules described in the standard via the meta function Canonicalize(),
which allow cross ASCII case folding when unicode is specified, the Unicode characters
\u017f (Latin small letter long s) and \u212a (Kelvin sign) are part of the \w (word) CharacterClassEscape.
This is true because they case fold to 's' and 'k' respectively. Because they case fold
to lower case letters, the corresponding letters, 'k', 'K', 's' and 'S', are also matched by
\W with the unicode and ignoreCase flags.

  • create_regex_tables:
  • yarr/YarrPattern.cpp:

(JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
(JSC::Yarr::YarrPatternConstructor::atomCharacterClassBuiltIn):
(JSC::Yarr::YarrPattern::YarrPattern):

  • yarr/YarrPattern.h:

(JSC::Yarr::YarrPattern::wordcharCharacterClass):
(JSC::Yarr::YarrPattern::wordUnicodeIgnoreCaseCharCharacterClass):
(JSC::Yarr::YarrPattern::nonwordcharCharacterClass):
(JSC::Yarr::YarrPattern::nonwordUnicodeIgnoreCaseCharCharacterClass):

LayoutTests:

Updated tests.

  • js/regexp-unicode-expected.txt:
  • js/script-tests/regexp-unicode.js:
Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/ChangeLog

    r199521 r199523  
     12016-04-13  Michael Saboff  <msaboff@apple.com>
     2
     3        Some tests fail with ES6 `u` (Unicode) flag for regular expressions
     4        https://bugs.webkit.org/show_bug.cgi?id=151597
     5
     6        Reviewed by Geoffrey Garen.
     7
     8        Updated tests.
     9
     10        * js/regexp-unicode-expected.txt:
     11        * js/script-tests/regexp-unicode.js:
     12
    1132016-04-13  Chris Dumez  <cdumez@apple.com>
    214
  • trunk/LayoutTests/js/regexp-unicode-expected.txt

    r198866 r199523  
    4040PASS "a𐄣X".match(/a𐄣b|a𐄣x/iu)[0].length is 4
    4141PASS "Ťx".match(/ťx/iu)[0].length is 2
     42PASS /\w/iu.test("ſ") is true
     43PASS /\w/iu.test("K") is true
     44PASS /!\w/iu.test("ſ") is false
     45PASS /!\w/iu.test("K") is false
     46PASS /\W/iu.test("ſ") is true
     47PASS /\W/iu.test("K") is true
     48PASS /!\W/iu.test("ſ") is false
     49PASS /!\W/iu.test("K") is false
     50PASS /[\w\d]/iu.test("ſ") is true
     51PASS /[\w\d]/iu.test("K") is true
     52PASS /[^\w\d]/iu.test("ſ") is false
     53PASS /[^\w\d]/iu.test("K") is false
     54PASS /[\W\d]/iu.test("ſ") is true
     55PASS /[\W\d]/iu.test("K") is true
     56PASS /[^\W\d]/iu.test("ſ") is false
     57PASS /[^\W\d]/iu.test("K") is false
     58PASS /\w/iu.test("S") is true
     59PASS /\w/iu.test("K") is true
     60PASS /!\w/iu.test("S") is false
     61PASS /!\w/iu.test("K") is false
     62PASS /\W/iu.test("S") is true
     63PASS /\W/iu.test("K") is true
     64PASS /!\W/iu.test("S") is false
     65PASS /!\W/iu.test("K") is false
     66PASS /[\w\d]/iu.test("S") is true
     67PASS /[\w\d]/iu.test("K") is true
     68PASS /[^\w\d]/iu.test("S") is false
     69PASS /[^\w\d]/iu.test("K") is false
     70PASS /[\W\d]/iu.test("S") is true
     71PASS /[\W\d]/iu.test("K") is true
     72PASS /[^\W\d]/iu.test("S") is false
     73PASS /[^\W\d]/iu.test("K") is false
    4274PASS "𝌆".match(/^.$/u)[0].length is 2
    4375PASS "It is 78°".match(/.*/u)[0].length is 9
  • trunk/LayoutTests/js/script-tests/regexp-unicode.js

    r198866 r199523  
    4444shouldBe('"a\u{10123}X".match(/a\u{10123}b|a\u{10123}x/iu)[0].length', '4');
    4545shouldBe('"\u0164x".match(/\u0165x/iu)[0].length', '2');
     46shouldBeTrue('/\\w/iu.test("\u017f")');
     47shouldBeTrue('/\\w/iu.test("\u212a")');
     48shouldBeFalse('/!\\w/iu.test("\u017f")');
     49shouldBeFalse('/!\\w/iu.test("\u212a")');
     50shouldBeTrue('/\\W/iu.test("\u017f")');
     51shouldBeTrue('/\\W/iu.test("\u212a")');
     52shouldBeFalse('/!\\W/iu.test("\u017f")');
     53shouldBeFalse('/!\\W/iu.test("\u212a")');
     54shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
     55shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
     56shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
     57shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
     58shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
     59shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
     60shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
     61shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
     62shouldBeTrue('/\\w/iu.test("S")');
     63shouldBeTrue('/\\w/iu.test("K")');
     64shouldBeFalse('/!\\w/iu.test("S")');
     65shouldBeFalse('/!\\w/iu.test("K")');
     66shouldBeTrue('/\\W/iu.test("S")');
     67shouldBeTrue('/\\W/iu.test("K")');
     68shouldBeFalse('/!\\W/iu.test("S")');
     69shouldBeFalse('/!\\W/iu.test("K")');
     70shouldBeTrue('/[\\w\\d]/iu.test("S")');
     71shouldBeTrue('/[\\w\\d]/iu.test("K")');
     72shouldBeFalse('/[^\\w\\d]/iu.test("S")');
     73shouldBeFalse('/[^\\w\\d]/iu.test("K")');
     74shouldBeTrue('/[\\W\\d]/iu.test("S")');
     75shouldBeTrue('/[\\W\\d]/iu.test("K")');
     76shouldBeFalse('/[^\\W\\d]/iu.test("S")');
     77shouldBeFalse('/[^\\W\\d]/iu.test("K")');
     78
    4679
    4780// Test . matches with Unicode flag
  • trunk/Source/JavaScriptCore/ChangeLog

    r199514 r199523  
     12016-04-13  Michael Saboff  <msaboff@apple.com>
     2
     3        Some tests fail with ES6 `u` (Unicode) flag for regular expressions
     4        https://bugs.webkit.org/show_bug.cgi?id=151597
     5
     6        Reviewed by Geoffrey Garen.
     7
     8        Added two new tables to handle the anomalies of \w and \W CharacterClassEscapes
     9        when specified in RegExps with both the unicode and ignoreCase flags.  Given the
     10        case folding rules described in the standard via the meta function Canonicalize(),
     11        which allow cross ASCII case folding when unicode is specified, the Unicode characters
     12        \u017f (Latin small letter long s) and \u212a (Kelvin sign) are part of the \w (word) CharacterClassEscape.
     13        This is true because they case fold to 's' and 'k' respectively.  Because they case fold
     14        to lower case letters, the corresponding letters, 'k', 'K', 's' and 'S', are also matched by
     15        \W with the unicode and ignoreCase flags.
     16
     17        * create_regex_tables:
     18        * yarr/YarrPattern.cpp:
     19        (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
     20        (JSC::Yarr::YarrPatternConstructor::atomCharacterClassBuiltIn):
     21        (JSC::Yarr::YarrPattern::YarrPattern):
     22        * yarr/YarrPattern.h:
     23        (JSC::Yarr::YarrPattern::wordcharCharacterClass):
     24        (JSC::Yarr::YarrPattern::wordUnicodeIgnoreCaseCharCharacterClass):
     25        (JSC::Yarr::YarrPattern::nonwordcharCharacterClass):
     26        (JSC::Yarr::YarrPattern::nonwordUnicodeIgnoreCaseCharCharacterClass):
     27
    1282016-04-13  Commit Queue  <commit-queue@webkit.org>
    229
  • trunk/Source/JavaScriptCore/create_regex_tables

    r194470 r199523  
    2626types = {
    2727    "wordchar": { "UseTable" : True, "data": ['_', ('0','9'), ('A', 'Z'), ('a','z')]},
    28     "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0xffff)]},
     28    "wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
     29    "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
     30    "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordchar", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
    2931    "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
    3032    "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
    31     "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0xffff)]},
     33    "nonspaces": { "UseTable" : True, "Inverse": "spaces", "data": [(0, ord('\t') - 1), (ord('\r') + 1, ord(' ') - 1), (ord(' ') + 1, 0x009f), (0x00a1, 0x167f), (0x1681, 0x180d), (0x180f, 0x1fff), (0x200b, 0x2027), (0x202a, 0x202e), (0x2030, 0x205e), (0x2060, 0x2fff), (0x3001, 0xfefe), (0xff00, 0x10ffff)]},
    3234    "digits": { "UseTable" : False, "data": [('0', '9')]},
    33     "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0xffff)] }
     35    "nondigits": { "UseTable" : False, "Inverse": "digits", "data": [(0, ord('0') - 1), (ord('9') + 1, 0x10ffff)] }
    3436}
    3537entriesPerLine = 50
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r197869 r199523  
    350350            break;
    351351        case WordClassID:
    352             m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
     352            if (m_pattern.unicode() && m_pattern.ignoreCase()) {
     353                if (invert)
     354                    m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
     355                else
     356                    m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
     357            } else
     358                m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
    353359            break;
    354360        case NewlineClassID:
     
    387393       
    388394        case WordClassID:
    389             m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
     395            if (m_pattern.unicode() && m_pattern.ignoreCase())
     396                m_characterClassConstructor.append(invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
     397            else
     398                m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
    390399            break;
    391400       
     
    885894    , spacesCached(0)
    886895    , wordcharCached(0)
     896    , wordUnicodeIgnoreCaseCharCached(0)
    887897    , nondigitsCached(0)
    888898    , nonspacesCached(0)
    889899    , nonwordcharCached(0)
     900    , nonwordUnicodeIgnoreCasecharCached(0)
    890901{
    891902    *error = compile(pattern);
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.h

    r197869 r199523  
    288288std::unique_ptr<CharacterClass> spacesCreate();
    289289std::unique_ptr<CharacterClass> wordcharCreate();
     290std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate();
    290291std::unique_ptr<CharacterClass> nondigitsCreate();
    291292std::unique_ptr<CharacterClass> nonspacesCreate();
    292293std::unique_ptr<CharacterClass> nonwordcharCreate();
     294std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate();
    293295
    294296struct TermChain {
     
    318320        spacesCached = 0;
    319321        wordcharCached = 0;
     322        wordUnicodeIgnoreCaseCharCached = 0;
    320323        nondigitsCached = 0;
    321324        nonspacesCached = 0;
    322325        nonwordcharCached = 0;
     326        nonwordUnicodeIgnoreCasecharCached = 0;
    323327
    324328        m_disjunctions.clear();
     
    368372        return wordcharCached;
    369373    }
     374    CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass()
     375    {
     376        if (!wordUnicodeIgnoreCaseCharCached) {
     377            m_userCharacterClasses.append(wordUnicodeIgnoreCaseCharCreate());
     378            wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get();
     379        }
     380        return wordUnicodeIgnoreCaseCharCached;
     381    }
    370382    CharacterClass* nondigitsCharacterClass()
    371383    {
     
    391403        }
    392404        return nonwordcharCached;
     405    }
     406    CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass()
     407    {
     408        if (!nonwordUnicodeIgnoreCasecharCached) {
     409            m_userCharacterClasses.append(nonwordUnicodeIgnoreCaseCharCreate());
     410            nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get();
     411        }
     412        return nonwordUnicodeIgnoreCasecharCached;
    393413    }
    394414
     
    415435    CharacterClass* spacesCached;
    416436    CharacterClass* wordcharCached;
     437    CharacterClass* wordUnicodeIgnoreCaseCharCached;
    417438    CharacterClass* nondigitsCached;
    418439    CharacterClass* nonspacesCached;
    419440    CharacterClass* nonwordcharCached;
     441    CharacterClass* nonwordUnicodeIgnoreCasecharCached;
    420442};
    421443
Note: See TracChangeset for help on using the changeset viewer.