Context Navigation

← Previous Changeset
Next Changeset →

Changeset 64817 in webkit

Timestamp:

Aug 5, 2010, 10:36:28 PM (15 years ago)

Author:

ap@apple.com

Message:

Reviewed by Darin Adler.

https://bugs.webkit.org/show_bug.cgi?id=43554
Way too many encoding aliases are treated as valid

<rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.

<rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com

Test: http/tests/misc/bad-charset-alias.html

loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset): Fix encoding name length computation. Previously, a trailing quote was ignored by TextEncodingRegistry.

platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames): Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.

platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames): Don't register 8859-1, other browsers do not support this encoding name.

platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding): "Latin-1" is not a real encoding name, it's not known to Firefox or IE.

platform/text/TextEncodingRegistry.cpp: (WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters. There is a good chance that we'll be missing support for some necessary alias names, but other browsers don't ignore any characters when matching names. (WebCore::TextEncodingNameHash::hash): Ditto. (WebCore::checkExistingName): Re-formatted a line. (WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas. (WebCore::addToTextEncodingNameMap): Used it. (WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.

Location:

trunk

Files:

: 3 added
: 9 edited

LayoutTests/ChangeLog (modified) (1 diff)
LayoutTests/fast/encoding/char-encoding-expected.txt (modified) (1 diff)
LayoutTests/fast/encoding/char-encoding.html (modified) (1 diff)
LayoutTests/http/tests/misc/bad-charset-alias-expected.txt (added)
LayoutTests/http/tests/misc/bad-charset-alias.html (added)
LayoutTests/http/tests/misc/resources/bad-charset-alias.php (added)
WebCore/ChangeLog (modified) (1 diff)
WebCore/loader/TextResourceDecoder.cpp (modified) (1 diff)
WebCore/platform/text/TextCodecICU.cpp (modified) (3 diffs)
WebCore/platform/text/TextCodecLatin1.cpp (modified) (1 diff)
WebCore/platform/text/TextEncoding.cpp (modified) (1 diff)
WebCore/platform/text/TextEncodingRegistry.cpp (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/LayoutTests/ChangeLog

-              r64815
+              r64817
+2010-08-05  Alexey Proskuryakov  <ap@apple.com>
+        Reviewed by Darin Adler.
+        https://bugs.webkit.org/show_bug.cgi?id=43554
+        Way too many encoding aliases are treated as valid
+        <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+        <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+        * fast/encoding/char-encoding-expected.txt:
+        * fast/encoding/char-encoding.html:
+        Use a correct name for GB_2312-80. At least Firefox doesn't know GB-2312-80.
+        * http/tests/misc/bad-charset-alias-expected.txt: Added.
+        * http/tests/misc/bad-charset-alias.html: Added.
+        * http/tests/misc/resources/bad-charset-alias.php: Added.
+        Check that certain encoding names are unknown. Both Firefox and IE don't know these.
 2010-08-05  W. James MacLean  <wjmaclean@chromium.org>

trunk/LayoutTests/fast/encoding/char-encoding-expected.txt

-              r39787
+              r64817
 PASS encode('GBK', 'U+00A5') is '%A3%A4'
 PASS encode('gb2312', 'U+00A5') is '%A3%A4'
-PASS encode('GB-2312-80', 'U+00A5') is '%A3%A4'
+PASS encode('GB_2312-80', 'U+00A5') is '%A3%A4'
 PASS encode('EUC-CN', 'U+00A5') is '%A3%A4'
 PASS encode('GBK', 'U+20AC') is '%80'
 PASS encode('gb2312', 'U+20AC') is '%80'
-PASS encode('GB-2312-80', 'U+20AC') is '%80'
+PASS encode('GB_2312-80', 'U+20AC') is '%80'
 PASS encode('EUC-CN', 'U+20AC') is '%80'
 PASS encode('GBK', 'U+01F9') is '%A8%BF'

trunk/LayoutTests/fast/encoding/char-encoding.html

-              r51088
+              r64817
 testEncode('GBK', 'U+00A5', '%A3%A4');
 testEncode('gb2312', 'U+00A5', '%A3%A4');
-testEncode('GB-2312-80', 'U+00A5', '%A3%A4');
+testEncode('GB_2312-80', 'U+00A5', '%A3%A4');
 testEncode('EUC-CN', 'U+00A5', '%A3%A4');
 //Euro symbol in gbk
 testEncode('GBK', 'U+20AC', '%80');
 testEncode('gb2312', 'U+20AC', '%80');
-testEncode('GB-2312-80', 'U+20AC', '%80');
+testEncode('GB_2312-80', 'U+20AC', '%80');
 testEncode('EUC-CN', 'U+20AC', '%80');
 //Misc symbols from TEC specific GBK translation

trunk/WebCore/ChangeLog

-              r64816
+              r64817
+2010-08-05  Alexey Proskuryakov  <ap@apple.com>
+        Reviewed by Darin Adler.
+        https://bugs.webkit.org/show_bug.cgi?id=43554
+        Way too many encoding aliases are treated as valid
+        <rdar://problem/7863399> Garbage characters displayed in some yesky.com pages.
+        <rdar://problem/7859068> Garbage characters displayed for most text at ceping.zhaopin.com
+        Test: http/tests/misc/bad-charset-alias.html
+        * loader/TextResourceDecoder.cpp: (WebCore::TextResourceDecoder::checkForCSSCharset):
+        Fix encoding name length computation. Previously, a trailing quote was ignored by
+        TextEncodingRegistry.
+        * platform/text/TextCodecICU.cpp: (WebCore::TextCodecICU::registerExtendedEncodingNames):
+        Added dashes to alias names that didn't have them. Added aliases prompted by regression tests.
+        * platform/text/TextCodecLatin1.cpp: (WebCore::TextCodecLatin1::registerEncodingNames):
+        Don't register 8859-1, other browsers do not support this encoding name.
+        * platform/text/TextEncoding.cpp: (WebCore::Latin1Encoding):
+        "Latin-1" is not a real encoding name, it's not known to Firefox or IE.
+        * platform/text/TextEncodingRegistry.cpp:
+        (WebCore::TextEncodingNameHash::equal): Changed to no longer ignore non-alphanumeric characters.
+        There is a good chance that we'll be missing support for some necessary alias names, but other
+        browsers don't ignore any characters when matching names.
+        (WebCore::TextEncodingNameHash::hash): Ditto.
+        (WebCore::checkExistingName): Re-formatted a line.
+        (WebCore::isUndesiredAlias): Added a filter to reject "8859_1" and any names containing commas.
+        (WebCore::addToTextEncodingNameMap): Used it.
+        (WebCore::atomicCanonicalTextEncodingName): Changed to no longer ignore non-alphanumeric characters.
 2010-08-05  Simon Hausmann  <simon.hausmann@nokia.com>

trunk/WebCore/loader/TextResourceDecoder.cpp

r62551	r64817
489	489	return false;
490	490
491		int encodingNameLength = pos - dataStart ~~+ 1~~;
	491	int encodingNameLength = pos - dataStart;
492	492
493	493	++pos;

trunk/WebCore/platform/text/TextCodecICU.cpp

-              r56825
+              r64817
+}
-// FIXME: Registering all the encodings we get from ucnv_getAvailableName
-// includes encodings we don't want or need. For example, all
-// the encodings with commas and version numbers.
 void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar)
 {
 …
     // Perhaps we can prove these are not used on the web and remove them.
     // Or perhaps we can get them added to ICU.
-    registrar("xmacroman", "macintosh");
-    registrar("xmacukrainian", "x-mac-cyrillic");
-    registrar("cnbig5", "Big5");
-    registrar("xxbig5", "Big5");
-    registrar("cngb", "GBK");
+    registrar("x-mac-roman", "macintosh");
+    registrar("x-mac-ukrainian", "x-mac-cyrillic");
+    registrar("cn-big5", "Big5");
+    registrar("x-x-big5", "Big5");
+    registrar("cn-gb", "GBK");
     registrar("csgb231280", "GBK");
-    registrar("xeuccn", "GBK");
-    registrar("xgbk", "GBK");
-    registrar("csISO88598I", "ISO_8859-8-I");
+    registrar("x-euc-cn", "GBK");
+    registrar("x-gbk", "GBK");
+    registrar("csISO88598I", "ISO-8859-8-I");
     registrar("koi", "KOI8-R");
     registrar("logical", "ISO-8859-8-I");
     registrar("unicode11utf8", "UTF-8");
     registrar("unicode20utf8", "UTF-8");
-    registrar("xunicode20utf8", "UTF-8");
+    registrar("x-unicode20utf8", "UTF-8");
     registrar("visual", "ISO-8859-8");
     registrar("winarabic", "windows-1256");
     registrar("winbaltic", "windows-1257");
     registrar("wincyrillic", "windows-1251");
-    registrar("iso885911", "windows-874");
-    registrar("dos874", "windows-874");
+    registrar("iso-8859-11", "windows-874");
+    registrar("iso8859-11", "windows-874");
+    registrar("dos-874", "windows-874");
     registrar("wingreek", "windows-1253");
     registrar("winhebrew", "windows-1255");
 …
     registrar("winturkish", "windows-1254");
     registrar("winvietnamese", "windows-1258");
-    registrar("xcp1250", "windows-1250");
-    registrar("xcp1251", "windows-1251");
-    registrar("xeuc", "EUC-JP");
-    registrar("xwindows949", "windows-949");
-    registrar("xuhc", "windows-949");
+    registrar("x-cp1250", "windows-1250");
+    registrar("x-cp1251", "windows-1251");
+    registrar("x-euc", "EUC-JP");
+    registrar("x-windows-949", "windows-949");
+    registrar("x-uhc", "windows-949");
+    registrar("utf8", "UTF-8");
     // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names.
     // They are not present in ICU 3.2.
-    registrar("dos720", "cp864");
+    registrar("dos-720", "cp864");
     registrar("jis7", "ISO-2022-JP");
+    // Alternative spelling of ISO encoding names.
+    registrar("ISO8859-1", "ISO-8859-1");
+    registrar("ISO8859-2", "ISO-8859-2");
+    registrar("ISO8859-3", "ISO-8859-3");
+    registrar("ISO8859-4", "ISO-8859-4");
+    registrar("ISO8859-5", "ISO-8859-5");
+    registrar("ISO8859-6", "ISO-8859-6");
+    registrar("ISO8859-7", "ISO-8859-7");
+    registrar("ISO8859-8", "ISO-8859-8");
+    registrar("ISO8859-8-I", "ISO-8859-8-I");
+    registrar("ISO8859-9", "ISO-8859-9");
+    registrar("ISO8859-10", "ISO-8859-10");
+    registrar("ISO8859-13", "ISO-8859-13");
+    registrar("ISO8859-14", "ISO-8859-14");
+    registrar("ISO8859-15", "ISO-8859-15");
+    registrar("ISO8859-16", "ISO-8859-16");
 }

trunk/WebCore/platform/text/TextCodecLatin1.cpp

r56825	r64817
80	80	registrar("ibm-1252_P100-2000", "windows-1252");
81	81
82		~~registrar("8859-1", "ISO-8859-1");~~
83	82	registrar("CP819", "ISO-8859-1");
84	83	registrar("IBM819", "ISO-8859-1");

trunk/WebCore/platform/text/TextEncoding.cpp

r56825	r64817
249	249	const TextEncoding& Latin1Encoding()
250	250	{
251		static TextEncoding globalLatin1Encoding("~~Latin-~~1");
	251	static TextEncoding globalLatin1Encoding("latin1");
252	252	return globalLatin1Encoding;
253	253	}

trunk/WebCore/platform/text/TextEncodingRegistry.cpp

-              r63036
+              r64817
 const size_t maxEncodingNameLength = 63;
-// Hash for all-ASCII strings that does case folding and skips any characters
-// that are not alphanumeric. If passed any non-ASCII characters, depends on
-// the behavior of isalnum -- if that returns false as it does on OS X, then
-// it will properly skip those characters too.
+// Hash for all-ASCII strings that does case folding.
 struct TextEncodingNameHash {
 …
         char c2;
         do {
-            do
-                c1 = *s1++;
-            while (c1 && !isASCIIAlphanumeric(c1));
-            do
-                c2 = *s2++;
-            while (c2 && !isASCIIAlphanumeric(c2));
+            c1 = *s1++;
+            c2 = *s2++;
             if (toASCIILower(c1) != toASCIILower(c2))
                 return false;
 …
         unsigned h = WTF::stringHashingStartValue;
         for (;;) {
-            char c;
-            do {
-                c = *s++;
-                if (!c) {
-                    h += (h << 3);
-                    h ^= (h >> 11);
-                    h += (h << 15);
-                    return h;
-                }
-            } while (!isASCIIAlphanumeric(c));
+            char c = *s++;
+            if (!c) {
+                h += (h << 3);
+                h ^= (h >> 11);
+                h += (h << 15);
+                return h;
+            }
             h += toASCIILower(c);
             h += (h << 10);
 …
             && strcasecmp(atomicName, "iso-8859-8") == 0)
         return;
-    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s",
-        alias, oldAtomicName, atomicName);
-}
-#endif
+    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
+}
+#endif
+static bool isUndesiredAlias(const char* alias)
+{
+    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
+    for (const char* p = alias; *p; ++p) {
+        if (*p == ',')
+            return true;
+    }
+    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
+    // problem, see bug 43554.
+    if (0 == strcmp(alias, "8859_1"))
+        return true;
+    return false;
+}
 static void addToTextEncodingNameMap(const char* alias, const char* name)
 {
     ASSERT(strlen(alias) <= maxEncodingNameLength);
+    if (isUndesiredAlias(alias))
+        return;
     const char* atomicName = textEncodingNameMap->get(name);
     ASSERT(strcmp(alias, name) == 0 || atomicName);
 …
     for (size_t i = 0; i < length; ++i) {
         UChar c = characters[i];
-        if (isASCIIAlphanumeric(c)) {
-            if (j == maxEncodingNameLength)
-                return 0;
-            buffer[j++] = c;
-        }
+        if (j == maxEncodingNameLength)
+            return 0;
+        buffer[j++] = c;
     }
     buffer[j] = 0;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 64817 in webkit

Legend:

Download in other formats: