Changeset 202490 in webkit


Ignore:
Timestamp:
Jun 27, 2016 10:38:55 AM (8 years ago)
Author:
msaboff@apple.com
Message:

ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
https://bugs.webkit.org/show_bug.cgi?id=158505

Reviewed by Geoffrey Garen.

Source/JavaScriptCore:

This change makes it so that the CharacterClassEscape \w matches the inverse of
\W and vice versa for unicode, ignore case RegExp's.

Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
This was due to how the ES6 standard defined matching of character classes,
specifically that the abstract operation "Canonicalize()" is called for the
character to be matched AND for the characters in the character class we are
matching against. This change is to make \W always be the inverse of \w.
It is still the case that the characters that match against \w change
depending on a regular expression's flags.

The only real changes occur for regular expressions with both the unicode and
ignore case flags set. Updated the character class generator to make
nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
Changed BytecodePattern.wordcharCharacterClass to use the correct
word character class for the flags. Simplified character class setup
in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
invert as appropriate when unicode and ignore case are both set.

  • create_regex_tables:
  • yarr/YarrInterpreter.h:

(JSC::Yarr::BytecodePattern::BytecodePattern):

  • yarr/YarrPattern.cpp:

(JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):

LayoutTests:

Updated and added test cases.

  • js/regexp-unicode-expected.txt:
  • js/script-tests/regexp-unicode.js:
Location:
trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/LayoutTests/ChangeLog

    r202489 r202490  
     12016-06-27  Michael Saboff  <msaboff@apple.com>
     2
     3        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
     4        https://bugs.webkit.org/show_bug.cgi?id=158505
     5
     6        Reviewed by Geoffrey Garen.
     7
     8        Updated and added test cases.
     9
     10        * js/regexp-unicode-expected.txt:
     11        * js/script-tests/regexp-unicode.js:
     12
    1132016-06-27  Frederic Wang  <fwang@igalia.com>
    214
  • trunk/LayoutTests/js/regexp-unicode-expected.txt

    r201714 r202490  
    4242PASS /\w/iu.test("ſ") is true
    4343PASS /\w/iu.test("K") is true
    44 PASS /!\w/iu.test("ſ") is false
    45 PASS /!\w/iu.test("K") is false
    46 PASS /\W/iu.test("ſ") is true
    47 PASS /\W/iu.test("K") is true
    48 PASS /!\W/iu.test("ſ") is false
    49 PASS /!\W/iu.test("K") is false
     44PASS /\W/iu.test("ſ") is false
     45PASS /\W/iu.test("K") is false
    5046PASS /[\w\d]/iu.test("ſ") is true
    5147PASS /[\w\d]/iu.test("K") is true
    5248PASS /[^\w\d]/iu.test("ſ") is false
    5349PASS /[^\w\d]/iu.test("K") is false
    54 PASS /[\W\d]/iu.test("ſ") is true
    55 PASS /[\W\d]/iu.test("K") is true
    56 PASS /[^\W\d]/iu.test("ſ") is false
    57 PASS /[^\W\d]/iu.test("K") is false
     50PASS /[\W\d]/iu.test("ſ") is false
     51PASS /[\W\d]/iu.test("K") is false
     52PASS /[^\W\d]/iu.test("ſ") is true
     53PASS /[^\W\d]/iu.test("K") is true
    5854PASS /\w/iu.test("S") is true
    5955PASS /\w/iu.test("K") is true
    60 PASS /!\w/iu.test("S") is false
    61 PASS /!\w/iu.test("K") is false
    62 PASS /\W/iu.test("S") is true
    63 PASS /\W/iu.test("K") is true
    64 PASS /!\W/iu.test("S") is false
    65 PASS /!\W/iu.test("K") is false
     56PASS /\W/iu.test("S") is false
     57PASS /\W/iu.test("K") is false
    6658PASS /[\w\d]/iu.test("S") is true
    6759PASS /[\w\d]/iu.test("K") is true
    6860PASS /[^\w\d]/iu.test("S") is false
    6961PASS /[^\w\d]/iu.test("K") is false
    70 PASS /[\W\d]/iu.test("S") is true
    71 PASS /[\W\d]/iu.test("K") is true
    72 PASS /[^\W\d]/iu.test("S") is false
    73 PASS /[^\W\d]/iu.test("K") is false
     62PASS /[\W\d]/iu.test("S") is false
     63PASS /[\W\d]/iu.test("K") is false
     64PASS /[^\W\d]/iu.test("S") is true
     65PASS /[^\W\d]/iu.test("K") is true
     66PASS "Grasſoden is old German for grass".match(/.*?\Bs\u017foden/iu)[0] is "Grasſoden"
     67PASS "Grasſoden is old German for grass".match(/.*?\B\u017foden/iu)[0] is "Grasſoden"
     68PASS "Grasſoden is old German for grass".match(/.*?\Boden/iu)[0] is "Grasſoden"
     69PASS "Grasſoden is old German for grass".match(/.*?\Bden/iu)[0] is "Grasſoden"
     70PASS "Water freezes at 273K which is 0C.".split(/\b\s/iu) is ["Water","freezes","at","273K","which","is","0C."]
    7471PASS "𝌆".match(/^.$/u)[0].length is 2
    7572PASS "It is 78°".match(/.*/u)[0].length is 9
  • trunk/LayoutTests/js/script-tests/regexp-unicode.js

    r201714 r202490  
    4646shouldBeTrue('/\\w/iu.test("\u017f")');
    4747shouldBeTrue('/\\w/iu.test("\u212a")');
    48 shouldBeFalse('/!\\w/iu.test("\u017f")');
    49 shouldBeFalse('/!\\w/iu.test("\u212a")');
    50 shouldBeTrue('/\\W/iu.test("\u017f")');
    51 shouldBeTrue('/\\W/iu.test("\u212a")');
    52 shouldBeFalse('/!\\W/iu.test("\u017f")');
    53 shouldBeFalse('/!\\W/iu.test("\u212a")');
     48shouldBeFalse('/\\W/iu.test("\u017f")');
     49shouldBeFalse('/\\W/iu.test("\u212a")');
    5450shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
    5551shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
    5652shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
    5753shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
    58 shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
    59 shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
    60 shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
    61 shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
     54shouldBeFalse('/[\\W\\d]/iu.test("\u017f")');
     55shouldBeFalse('/[\\W\\d]/iu.test("\u212a")');
     56shouldBeTrue('/[^\\W\\d]/iu.test("\u017f")');
     57shouldBeTrue('/[^\\W\\d]/iu.test("\u212a")');
    6258shouldBeTrue('/\\w/iu.test("S")');
    6359shouldBeTrue('/\\w/iu.test("K")');
    64 shouldBeFalse('/!\\w/iu.test("S")');
    65 shouldBeFalse('/!\\w/iu.test("K")');
    66 shouldBeTrue('/\\W/iu.test("S")');
    67 shouldBeTrue('/\\W/iu.test("K")');
    68 shouldBeFalse('/!\\W/iu.test("S")');
    69 shouldBeFalse('/!\\W/iu.test("K")');
     60shouldBeFalse('/\\W/iu.test("S")');
     61shouldBeFalse('/\\W/iu.test("K")');
    7062shouldBeTrue('/[\\w\\d]/iu.test("S")');
    7163shouldBeTrue('/[\\w\\d]/iu.test("K")');
    7264shouldBeFalse('/[^\\w\\d]/iu.test("S")');
    7365shouldBeFalse('/[^\\w\\d]/iu.test("K")');
    74 shouldBeTrue('/[\\W\\d]/iu.test("S")');
    75 shouldBeTrue('/[\\W\\d]/iu.test("K")');
    76 shouldBeFalse('/[^\\W\\d]/iu.test("S")');
    77 shouldBeFalse('/[^\\W\\d]/iu.test("K")');
    78 
     66shouldBeFalse('/[\\W\\d]/iu.test("S")');
     67shouldBeFalse('/[\\W\\d]/iu.test("K")');
     68shouldBeTrue('/[^\\W\\d]/iu.test("S")');
     69shouldBeTrue('/[^\\W\\d]/iu.test("K")');
     70shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bs\\u017foden/iu)[0]', '"Gras\u017foden"');
     71shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\B\\u017foden/iu)[0]', '"Gras\u017foden"');
     72shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Boden/iu)[0]', '"Gras\u017foden"');
     73shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bden/iu)[0]', '"Gras\u017foden"');
     74shouldBe('"Water freezes at 273\u212a which is 0C.".split(/\\b\\s/iu)', '["Water","freezes","at","273\u212a","which","is","0C."]');
    7975
    8076// Test . matches with Unicode flag
  • trunk/Source/JavaScriptCore/ChangeLog

    r202487 r202490  
     12016-06-27  Michael Saboff  <msaboff@apple.com>
     2
     3        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
     4        https://bugs.webkit.org/show_bug.cgi?id=158505
     5
     6        Reviewed by Geoffrey Garen.
     7
     8        This change makes it so that the CharacterClassEscape \w matches the inverse of
     9        \W and vice versa for unicode, ignore case RegExp's.
     10
     11        Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
     12        k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
     13        This was due to how the ES6 standard defined matching of character classes
     14        specifically that the abstract operation "Canonicalize()" is called for the
     15        character to be matched AND for the characters in the character class we are
     16        matching against.  This change is to make \W always be the inverse of \w.
     17        It is still the case that the characters that match against \w changes
     18        depending on a regular expression's flags.
     19
     20        The only real changes occur for regular expressions with both the unicode and
     21        ignore case flags set.  Updated the character class generator to make
     22        nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
     23        Changed BytecodePattern.wordcharCharacterClass to use the correct
     24        word character class for the flags.  Simplfied character class set up in
     25        in the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
     26        invert as appropriate when unicode and ignore case are both set.
     27
     28        * create_regex_tables:
     29        * yarr/YarrInterpreter.h:
     30        (JSC::Yarr::BytecodePattern::BytecodePattern):
     31        * yarr/YarrPattern.cpp:
     32        (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
     33
    1342016-06-25  Keith Miller  <keith_miller@apple.com>
    235
  • trunk/Source/JavaScriptCore/create_regex_tables

    r199523 r202490  
    2828    "wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
    2929    "nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
    30     "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordchar", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
     30    "nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]},
    3131    "newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
    3232    "spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},
  • trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h

    r199075 r202490  
    348348
    349349        newlineCharacterClass = pattern.newlineCharacterClass();
    350         wordcharCharacterClass = pattern.wordcharCharacterClass();
     350        if (unicode() && ignoreCase())
     351            wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
     352        else
     353            wordcharCharacterClass = pattern.wordcharCharacterClass();
    351354
    352355        m_allParenthesesInfo.swap(parenthesesInfoToAdopt);
  • trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

    r201412 r202490  
    352352            break;
    353353        case WordClassID:
    354             if (m_pattern.unicode() && m_pattern.ignoreCase()) {
    355                 if (invert)
    356                     m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
    357                 else
    358                     m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
    359             } else
     354            if (m_pattern.unicode() && m_pattern.ignoreCase())
     355                m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
     356            else
    360357                m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
    361358            break;
Note: See TracChangeset for help on using the changeset viewer.