Changeset 275554 in webkit


Ignore:
Timestamp:
Apr 6, 2021, 2:28:04 PM (4 years ago)
Author:
commit-queue@webkit.org
Message:

Add U+0581 and U+0585 to list of Armenian characters that look like Latin characters
https://bugs.webkit.org/show_bug.cgi?id=224219
Source/WTF:

<rdar://75896365>

Patch by Alex Christensen <achristensen@webkit.org> on 2021-04-06
Reviewed by Brent Fulgham.

These code points are allowed when adjacent to other Armenian code points and punctuation, but not when adjacent to code points from other scripts.
This was already implemented for the other lookalike code points; I consolidated the list into a single location so it is no longer duplicated.

  • wtf/URLHelpers.cpp:

(WTF::URLHelpers::isArmenianLookalikeCharacter):
(WTF::URLHelpers::isArmenianLookalikeSequence):
(WTF::URLHelpers::isLookalikeCharacter):

Tools:

Patch by Alex Christensen <achristensen@webkit.org> on 2021-04-06
Reviewed by Brent Fulgham.

  • TestWebKitAPI/Tests/WTF/cocoa/URLExtras.mm:

(TestWebKitAPI::TEST):

Location:
trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WTF/ChangeLog

    r275542 r275554  
     12021-04-06  Alex Christensen  <achristensen@webkit.org>
     2
     3        Add U+0581 and U+0585 to list of Armenian characters that look like Latin characters
     4        https://bugs.webkit.org/show_bug.cgi?id=224219
     5        <rdar://75896365>
     6
     7        Reviewed by Brent Fulgham.
     8
     9        These code points are allowed in the context of other Armenian code points and punctuation, but not other script code points.
     10        This was already implemented for the others, but I consolidated the list to one location to avoid having two locations for the list.
     11
     12        * wtf/URLHelpers.cpp:
     13        (WTF::URLHelpers::isArmenianLookalikeCharacter):
     14        (WTF::URLHelpers::isArmenianLookalikeSequence):
     15        (WTF::URLHelpers::isLookalikeCharacter):
     16
    1172021-04-06  Yusuke Suzuki  <ysuzuki@apple.com>
    218
  • trunk/Source/WTF/wtf/URLHelpers.cpp

    r271499 r275554  
    7070static bool isArmenianLookalikeCharacter(UChar32 codePoint)
    7171{
    72     return codePoint == 0x0548 || codePoint == 0x054D || codePoint == 0x0578 || codePoint == 0x057D;
     72    switch (codePoint) {
     73    case 0x0548: /* ARMENIAN CAPITAL LETTER VO */
     74    case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */
     75    case 0x0551: /* ARMENIAN CAPITAL LETTER CO */
     76    case 0x0555: /* ARMENIAN CAPITAL LETTER OH */
     77    case 0x0578: /* ARMENIAN SMALL LETTER VO */
     78    case 0x057D: /* ARMENIAN SMALL LETTER SEH */
     79    case 0x0581: /* ARMENIAN SMALL LETTER CO */
     80    case 0x0585: /* ARMENIAN SMALL LETTER OH */
     81        return true;
     82    default:
     83        return false;
     84    }
    7385}
    7486
     
    107119}
    108120
    109 static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
     121static bool isArmenianLookalikeSequence(const Optional<UChar32>& previousCodePoint, UChar32 codePoint)
     122{
     123    if (!previousCodePoint || *previousCodePoint == '/')
     124        return false;
     125
     126    auto isArmenianLookalikePair = [] (UChar first, UChar second) {
     127        return isArmenianLookalikeCharacter(first) && !(isArmenianScriptCharacter(second) || isASCIIDigitOrValidHostCharacter(second));
     128    };
     129    return isArmenianLookalikePair(codePoint, *previousCodePoint)
     130        || isArmenianLookalikePair(*previousCodePoint, codePoint);
     131}
     132
     133static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 codePoint)
    110134{
    111135    // This function treats the following as unsafe, lookalike characters:
     
    120144    // on characters that have not been processed by ICU, so they are needed here.
    121145   
    122     if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
     146    if (!u_isprint(codePoint) || u_isUWhiteSpace(codePoint) || u_hasBinaryProperty(codePoint, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
    123147        return true;
    124148   
    125     switch (charCode) {
     149    switch (codePoint) {
    126150    case 0x00BC: /* VULGAR FRACTION ONE QUARTER */
    127151    case 0x00BD: /* VULGAR FRACTION ONE HALF */
     
    251275            || previousCodePoint == 0x0131 /* LATIN SMALL LETTER DOTLESS I */
    252276            || previousCodePoint == 0x05D5; /* HEBREW LETTER VAV */
    253     case 0x0548: /* ARMENIAN CAPITAL LETTER VO */
    254     case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */
    255     case 0x0578: /* ARMENIAN SMALL LETTER VO */
    256     case 0x057D: /* ARMENIAN SMALL LETTER SEH */
    257         return previousCodePoint
    258             && !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
    259             && !isArmenianScriptCharacter(previousCodePoint.value());
    260277    case '.':
    261278        return false;
    262279    default:
    263         return previousCodePoint
    264             && isArmenianLookalikeCharacter(previousCodePoint.value())
    265             && !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode));
     280        return isArmenianLookalikeSequence(previousCodePoint, codePoint);
    266281    }
    267282}
  • trunk/Tools/ChangeLog

    r275546 r275554  
     12021-04-06  Alex Christensen  <achristensen@webkit.org>
     2
     3        Add U+0581 and U+0585 to list of Armenian characters that look like Latin characters
     4        https://bugs.webkit.org/show_bug.cgi?id=224219
     5
     6        Reviewed by Brent Fulgham.
     7
     8        * TestWebKitAPI/Tests/WTF/cocoa/URLExtras.mm:
     9        (TestWebKitAPI::TEST):
     10
    1112021-04-06  Wenson Hsieh  <wenson_hsieh@apple.com>
    212
  • trunk/Tools/TestWebKitAPI/Tests/WTF/cocoa/URLExtras.mm

    r271499 r275554  
    116116        "xn--koa", // U+0274
    117117        "xn--tma", // U+0237
     118        "xn--o-pdc", // U+0585 'o'
     119        "xn--o-qdc", // 'o' U+0585
     120        "xn--g-hdc", // U+0581 'g'
     121        "xn--g-idc", // 'g' U+0581
    118122    };
    119123    for (const String& host : punycodedSpoofHosts) {
     
    131135    EXPECT_STREQ("https://2\u0573_\u0574\u0578.\u0570\u0561\u0575", userVisibleString(literalURL("https://2\u0573_\u0574\u0578.\u0570\u0561\u0575")));
    132136    EXPECT_STREQ("https://\u0573_\u0574\u05783.\u0570\u0561\u0575", userVisibleString(literalURL("https://\u0573_\u0574\u05783.\u0570\u0561\u0575")));
    133     EXPECT_STREQ("https://got\u0551\u0535\u0543.com", userVisibleString(literalURL("https://got\u0551\u0535\u0543.com")));
     137    EXPECT_STREQ("https://got%D5%91\u0535\u0543.com", userVisibleString(literalURL("https://got\u0551\u0535\u0543.com")));
    134138    EXPECT_STREQ("https://\u0551\u0535\u0543fans.net", userVisibleString(literalURL("https://\u0551\u0535\u0543fans.net")));
    135139    EXPECT_STREQ("https://\u0551\u0535or\u0575\u0543.biz", userVisibleString(literalURL("https://\u0551\u0535or\u0575\u0543.biz")));
    136140    EXPECT_STREQ("https://\u0551\u0535and!$^&*()-~+={}or<>,.?\u0575\u0543.biz", userVisibleString(literalURL("https://\u0551\u0535and!$^&*()-~+={}or<>,.?\u0575\u0543.biz")));
     141    EXPECT_STREQ("https://\u0551%67/", userVisibleString(literalURL("https://\u0551g/")));
     142    EXPECT_STREQ("https://\u0581%67/", userVisibleString(literalURL("https://\u0581g/")));
     143    EXPECT_STREQ("https://o%D5%95%2F", userVisibleString(literalURL("https://o\u0555/")));
     144    EXPECT_STREQ("https://o%D6%85%2F", userVisibleString(literalURL("https://o\u0585/")));
    137145}
    138146
Note: See TracChangeset for help on using the changeset viewer.