Context Navigation

← Previous Changeset
Next Changeset →

Changeset 202490 in webkit

Timestamp:

Jun 27, 2016 10:38:55 AM (8 years ago)

Author:

msaboff@apple.com

Message:

ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
https://bugs.webkit.org/show_bug.cgi?id=158505

Reviewed by Geoffrey Garen.

Source/JavaScriptCore:

This change makes it so that the CharacterClassEscape \w matches the inverse of
\W and vice versa for unicode, ignore case RegExp's.

Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
This was due to how the ES6 standard defined matching of character classes
specifically that the abstract operation "Canonicalize()" is called for the
character to be matched AND for the characters in the character class we are
matching against. This change is to make \W always be the inverse of \w.
It is still the case that the characters that match against \w change
depending on a regular expression's flags.

The only real changes occur for regular expressions with both the unicode and
ignore case flags set. Updated the character class generator to make
nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
Changed BytecodePattern.wordcharCharacterClass to use the correct
word character class for the flags. Simplified character class set up in
the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
invert as appropriate when unicode and ignore case are both set.

create_regex_tables:
yarr/YarrInterpreter.h:

(JSC::Yarr::BytecodePattern::BytecodePattern):

yarr/YarrPattern.cpp:

(JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):

LayoutTests:

Updated and added test cases.

js/regexp-unicode-expected.txt:
js/script-tests/regexp-unicode.js:

Location:

trunk

Files:

: 7 edited

LayoutTests/ChangeLog (modified) (1 diff)
LayoutTests/js/regexp-unicode-expected.txt (modified) (1 diff)
LayoutTests/js/script-tests/regexp-unicode.js (modified) (1 diff)
Source/JavaScriptCore/ChangeLog (modified) (1 diff)
Source/JavaScriptCore/create_regex_tables (modified) (1 diff)
Source/JavaScriptCore/yarr/YarrInterpreter.h (modified) (1 diff)
Source/JavaScriptCore/yarr/YarrPattern.cpp (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/LayoutTests/ChangeLog

-                      r202489
+                      r202490
+2016-06-27  Michael Saboff  <msaboff@apple.com>
+        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+        https://bugs.webkit.org/show_bug.cgi?id=158505
+        Reviewed by Geoffrey Garen.
+        Updated and added test cases.
+        * js/regexp-unicode-expected.txt:
+        * js/script-tests/regexp-unicode.js:
 2016-06-27  Frederic Wang  <fwang@igalia.com>

trunk/LayoutTests/js/regexp-unicode-expected.txt

-                      r201714
+                      r202490
 PASS /\w/iu.test("ſ") is true
 PASS /\w/iu.test("K") is true
+PASS /!\w/iu.test("ſ") is false
+PASS /!\w/iu.test("K") is false
+PASS /\W/iu.test("ſ") is true
+PASS /\W/iu.test("K") is true
+PASS /!\W/iu.test("ſ") is false
+PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("ſ") is false
+PASS /\W/iu.test("K") is false
 PASS /[\w\d]/iu.test("ſ") is true
 PASS /[\w\d]/iu.test("K") is true
 PASS /[^\w\d]/iu.test("ſ") is false
 PASS /[^\w\d]/iu.test("K") is false
 PASS /[\W\d]/iu.test("ſ") is true
 PASS /[\W\d]/iu.test("K") is true
 PASS /[^\W\d]/iu.test("ſ") is false
 PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("ſ") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("ſ") is true
+PASS /[^\W\d]/iu.test("K") is true
 PASS /\w/iu.test("S") is true
 PASS /\w/iu.test("K") is true
+PASS /!\w/iu.test("S") is false
+PASS /!\w/iu.test("K") is false
+PASS /\W/iu.test("S") is true
+PASS /\W/iu.test("K") is true
+PASS /!\W/iu.test("S") is false
+PASS /!\W/iu.test("K") is false
+PASS /\W/iu.test("S") is false
+PASS /\W/iu.test("K") is false
 PASS /[\w\d]/iu.test("S") is true
 PASS /[\w\d]/iu.test("K") is true
 PASS /[^\w\d]/iu.test("S") is false
 PASS /[^\w\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("S") is true
+PASS /[\W\d]/iu.test("K") is true
+PASS /[^\W\d]/iu.test("S") is false
+PASS /[^\W\d]/iu.test("K") is false
+PASS /[\W\d]/iu.test("S") is false
+PASS /[\W\d]/iu.test("K") is false
+PASS /[^\W\d]/iu.test("S") is true
+PASS /[^\W\d]/iu.test("K") is true
+PASS "Grasſoden is old German for grass".match(/.*?\Bs\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\B\u017foden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Boden/iu)[0] is "Grasſoden"
+PASS "Grasſoden is old German for grass".match(/.*?\Bden/iu)[0] is "Grasſoden"
+PASS "Water freezes at 273K which is 0C.".split(/\b\s/iu) is ["Water","freezes","at","273K","which","is","0C."]
 PASS "𝌆".match(/^.$/u)[0].length is 2
 PASS "It is 78°".match(/.*/u)[0].length is 9

trunk/LayoutTests/js/script-tests/regexp-unicode.js

-                      r201714
+                      r202490
 shouldBeTrue('/\\w/iu.test("\u017f")');
 shouldBeTrue('/\\w/iu.test("\u212a")');
+shouldBeFalse('/!\\w/iu.test("\u017f")');
+shouldBeFalse('/!\\w/iu.test("\u212a")');
+shouldBeTrue('/\\W/iu.test("\u017f")');
+shouldBeTrue('/\\W/iu.test("\u212a")');
+shouldBeFalse('/!\\W/iu.test("\u017f")');
+shouldBeFalse('/!\\W/iu.test("\u212a")');
+shouldBeFalse('/\\W/iu.test("\u017f")');
+shouldBeFalse('/\\W/iu.test("\u212a")');
 shouldBeTrue('/[\\w\\d]/iu.test("\u017f")');
 shouldBeTrue('/[\\w\\d]/iu.test("\u212a")');
 shouldBeFalse('/[^\\w\\d]/iu.test("\u017f")');
 shouldBeFalse('/[^\\w\\d]/iu.test("\u212a")');
 shouldBeTrue('/[\\W\\d]/iu.test("\u017f")');
 shouldBeTrue('/[\\W\\d]/iu.test("\u212a")');
 shouldBeFalse('/[^\\W\\d]/iu.test("\u017f")');
 shouldBeFalse('/[^\\W\\d]/iu.test("\u212a")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u017f")');
+shouldBeFalse('/[\\W\\d]/iu.test("\u212a")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u017f")');
+shouldBeTrue('/[^\\W\\d]/iu.test("\u212a")');
 shouldBeTrue('/\\w/iu.test("S")');
 shouldBeTrue('/\\w/iu.test("K")');
+shouldBeFalse('/!\\w/iu.test("S")');
+shouldBeFalse('/!\\w/iu.test("K")');
+shouldBeTrue('/\\W/iu.test("S")');
+shouldBeTrue('/\\W/iu.test("K")');
+shouldBeFalse('/!\\W/iu.test("S")');
+shouldBeFalse('/!\\W/iu.test("K")');
+shouldBeFalse('/\\W/iu.test("S")');
+shouldBeFalse('/\\W/iu.test("K")');
 shouldBeTrue('/[\\w\\d]/iu.test("S")');
 shouldBeTrue('/[\\w\\d]/iu.test("K")');
 shouldBeFalse('/[^\\w\\d]/iu.test("S")');
 shouldBeFalse('/[^\\w\\d]/iu.test("K")');
+shouldBeTrue('/[\\W\\d]/iu.test("S")');
+shouldBeTrue('/[\\W\\d]/iu.test("K")');
+shouldBeFalse('/[^\\W\\d]/iu.test("S")');
+shouldBeFalse('/[^\\W\\d]/iu.test("K")');
+shouldBeFalse('/[\\W\\d]/iu.test("S")');
+shouldBeFalse('/[\\W\\d]/iu.test("K")');
+shouldBeTrue('/[^\\W\\d]/iu.test("S")');
+shouldBeTrue('/[^\\W\\d]/iu.test("K")');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bs\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\B\\u017foden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Boden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Gras\u017foden is old German for grass".match(/.*?\\Bden/iu)[0]', '"Gras\u017foden"');
+shouldBe('"Water freezes at 273\u212a which is 0C.".split(/\\b\\s/iu)', '["Water","freezes","at","273\u212a","which","is","0C."]');
 // Test . matches with Unicode flag

trunk/Source/JavaScriptCore/ChangeLog

-                      r202487
+                      r202490
+2016-06-27  Michael Saboff  <msaboff@apple.com>
+        ES6 Change: Unify handling of RegExp CharacterClassEscapes \w and \W and Word Asserts \b and \B
+        https://bugs.webkit.org/show_bug.cgi?id=158505
+        Reviewed by Geoffrey Garen.
+        This change makes it so that the CharacterClassEscape \w matches the inverse of
+        \W and vice versa for unicode, ignore case RegExp's.
+        Before this change, both /\w/ui and /\W/ui RegExp's would match the characters
+        k, K, s, S, \u017f (Latin Small Letter Long S) and \u212a (Kelvin Sign).
+        This was due to how the ES6 standard defined matching of character classes
+        specifically that the abstract operation "Canonicalize()" is called for the
+        character to be matched AND for the characters in the character class we are
+        matching against.  This change is to make \W always be the inverse of \w.
+        It is still the case that the characters that match against \w change
+        depending on a regular expression's flags.
+        The only real changes occur for regular expressions with both the unicode and
+        ignore case flags set.  Updated the character class generator to make
+        nonwordUnicodeIgnoreCaseChar not include k, K, s, S, \u017f and \u212a.
+        Changed BytecodePattern.wordcharCharacterClass to use the correct
+        word character class for the flags.  Simplified character class set up in
+        the pattern to use m_pattern.wordUnicodeIgnoreCaseCharCharacterClass and
+        invert as appropriate when unicode and ignore case are both set.
+        * create_regex_tables:
+        * yarr/YarrInterpreter.h:
+        (JSC::Yarr::BytecodePattern::BytecodePattern):
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::YarrPatternConstructor::atomBuiltInCharacterClass):
 2016-06-25  Keith Miller  <keith_miller@apple.com>

trunk/Source/JavaScriptCore/create_regex_tables

r199523	r202490
28	28	"wordUnicodeIgnoreCaseChar": { "UseTable" : False, "data": ['_', ('0', '9'), ('A', 'Z'), ('a', 'z'), 0x017f, 0x212a]},
29	29	"nonwordchar": { "UseTable" : True, "Inverse": "wordchar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x10ffff)]},
30		"nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "word~~char", "data": ['k', 'K', 's', 'S', '`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1~~, 0x10ffff)]},
	30	"nonwordUnicodeIgnoreCaseChar": { "UseTable" : False, "Inverse": "wordUnicodeIgnoreCaseChar", "data": ['`', (0, ord('0') - 1), (ord('9') + 1, ord('A') - 1), (ord('Z') + 1, ord('_') - 1), (ord('z') + 1, 0x017e), (0x0180, 0x2129), (0x212b, 0x10ffff)]},
31	31	"newline": { "UseTable" : False, "data": ['\n', '\r', 0x2028, 0x2029]},
32	32	"spaces": { "UseTable" : True, "data": [' ', ('\t', '\r'), 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000, (0x2000, 0x200a), 0xfeff]},

trunk/Source/JavaScriptCore/yarr/YarrInterpreter.h

-                      r199075
+                      r202490
         newlineCharacterClass = pattern.newlineCharacterClass();
+        wordcharCharacterClass = pattern.wordcharCharacterClass();
+        if (unicode() && ignoreCase())
+            wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass();
+        else
+            wordcharCharacterClass = pattern.wordcharCharacterClass();
         m_allParenthesesInfo.swap(parenthesesInfoToAdopt);

trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp

-                      r201412
+                      r202490
             break;
         case WordClassID:
+            if (m_pattern.unicode() && m_pattern.ignoreCase()) {
+                if (invert)
+                    m_alternative->m_terms.append(PatternTerm(m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass(), false));
+                else
+                    m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), false));
+            } else
+            if (m_pattern.unicode() && m_pattern.ignoreCase())
+                m_alternative->m_terms.append(PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
+            else
                 m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
             break;

Note: See TracChangeset for help on using the changeset viewer.