Changeset 149947 in webkit
- Timestamp:
- May 11, 2013 5:53:25 PM (11 years ago)
- Location:
- trunk/Source/WebCore
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Source/WebCore/ChangeLog
r149942 r149947 1 2013-05-11 Alexey Proskuryakov <ap@apple.com> 2 3 <rdar://problem/13823864> TextCodecICU complains about ambiguous codec names with current ICU release 4 https://bugs.webkit.org/show_bug.cgi?id=115953 5 6 Reviewed by Darin Adler. 7 8 Store and use canonical converter name to create converters. 9 10 As a side effect, we now actually reuse cached converters - previously we would compare 11 a standard encoding name to internal canonical one, which rarely match. 12 13 * platform/text/TextCodecICU.h: 14 * platform/text/TextCodecICU.cpp: 15 (WebCore::TextCodecICU::create): Pass canonical ICU converter name to constructor. 16 (WebCore::TextCodecICU::registerEncodingNames): 17 - Updated terminology. 18 - Added a comment that special cases should be kept in sync between registerEncodingNames 19 and registerCodecs. 20 - Moved maccyrillic alias to a correct section. It's not present in ICU even today. 21 - Changed a few aliases to actually map to standard name, not to an overridden one 22 (this doesn't change behavior since addToTextEncodingNameMap looks up canonical 23 name, but is clearer). 24 (WebCore::TextCodecICU::registerCodecs): Store a converter name to use with each 25 canonical encoding name. 26 (WebCore::TextCodecICU::TextCodecICU): Ditto. 27 (WebCore::TextCodecICU::releaseICUConverter): Reset the converter to remove any 28 leftover data. 29 (WebCore::TextCodecICU::createICUConverter): 30 - Compare converter name to converter name, not to another alias name. 31 - Use proper string comparison instead of pointer comparison. 32 - When creating a converter, assert that the name is not ambiguous - canonical 33 converter names should never be, otherwise there would be no way to create 34 the converter without ambiguity. 35 1 36 2013-05-11 Antoine Quint <graouts@apple.com> 2 37 -
trunk/Source/WebCore/platform/text/TextCodecICU.cpp
r149924 r149947 58 58 } 59 59 60 PassOwnPtr<TextCodec> TextCodecICU::create(const TextEncoding& encoding, const void* )61 { 62 // TextEncoding name is always one from atomicCanonicalTextEncodingName, guaranteed to never bedeleted.63 return adoptPtr(new TextCodecICU(encoding.name() ));60 PassOwnPtr<TextCodec> TextCodecICU::create(const TextEncoding& encoding, const void* additionalData) 61 { 62 // Name strings are persistently kept in TextEncodingRegistry maps, so they are never deleted. 63 return adoptPtr(new TextCodecICU(encoding.name(), static_cast<const char*>(additionalData))); 64 64 } 65 65 … … 72 72 registrar("ISO-8859-8-I", "ISO-8859-8-I"); 73 73 74 int32_t num Encodings = ucnv_countAvailable();75 for (int32_t i = 0; i < num Encodings; ++i) {76 const char* name = ucnv_getAvailableName(i);74 int32_t numConverters = ucnv_countAvailable(); 75 for (int32_t i = 0; i < numConverters; ++i) { 76 const char* canonicalConverterName = ucnv_getAvailableName(i); 77 77 UErrorCode error = U_ZERO_ERROR; 78 78 // Try MIME before trying IANA to pick up commonly used names like 79 79 // 'EUC-JP' instead of horrendously long names like 80 80 // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 81 const char* standardName = ucnv_getStandardName(name, "MIME", &error);82 if (!U_SUCCESS(error) || ! standardName) {81 const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error); 82 if (!U_SUCCESS(error) || !webStandardName) { 83 83 error = U_ZERO_ERROR; 84 84 // Try IANA to pick up 'windows-12xx' and other names 85 85 // which are not preferred MIME names but are widely used. 86 standardName = ucnv_getStandardName(name, "IANA", &error);87 if (!U_SUCCESS(error) || ! standardName)86 webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error); 87 if (!U_SUCCESS(error) || !webStandardName) 88 88 continue; 89 89 } 90 91 // Any standard encoding overrides should match checks in registerCodecs() below. 90 92 91 93 // 1. 
Treat GB2312 encoding as GBK (its more modern superset), to match other browsers. 92 94 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding 93 95 // for encoding GB_2312-80 and several others. So, we need to override this behavior, too. 94 if (strcmp( standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0)95 standardName = "GBK";96 if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0) 97 webStandardName = "GBK"; 96 98 // Similarly, EUC-KR encodings all map to an extended version. 97 else if (strcmp( standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0)98 standardName = "windows-949";99 else if (strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0 || strcmp(webStandardName, "cp1363") == 0) 100 webStandardName = "windows-949"; 99 101 // And so on. 100 else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6. 101 standardName = "windows-1254"; 102 else if (strcmp(standardName, "TIS-620") == 0) 103 standardName = "windows-874"; 104 105 registrar(standardName, standardName); 106 107 uint16_t numAliases = ucnv_countAliases(name, &error); 102 // FIXME: strcasecmp is locale sensitive, we should not be using it. 103 else if (strcasecmp(webStandardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6. 
104 webStandardName = "windows-1254"; 105 else if (strcmp(webStandardName, "TIS-620") == 0) 106 webStandardName = "windows-874"; 107 108 registrar(webStandardName, webStandardName); 109 110 uint16_t numAliases = ucnv_countAliases(canonicalConverterName, &error); 108 111 ASSERT(U_SUCCESS(error)); 109 112 if (U_SUCCESS(error)) 110 113 for (uint16_t j = 0; j < numAliases; ++j) { 111 114 error = U_ZERO_ERROR; 112 const char* alias = ucnv_getAlias( name, j, &error);115 const char* alias = ucnv_getAlias(canonicalConverterName, j, &error); 113 116 ASSERT(U_SUCCESS(error)); 114 if (U_SUCCESS(error) && alias != standardName)115 registrar(alias, standardName);117 if (U_SUCCESS(error) && alias != webStandardName) 118 registrar(alias, webStandardName); 116 119 } 117 120 } 118 121 119 122 // Additional aliases. 120 // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4). 123 // macroman is present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4). 124 // FIXME: Do any ports still use such old versions? 121 125 registrar("macroman", "macintosh"); 122 registrar("maccyrillic", "x-mac-cyrillic");123 126 124 127 // Additional aliases that historically were present in the encoding … … 127 130 // Or perhaps we can get them added to ICU. 
128 131 registrar("x-mac-roman", "macintosh"); 132 registrar("maccyrillic", "x-mac-cyrillic"); 129 133 registrar("x-mac-ukrainian", "x-mac-cyrillic"); 130 134 registrar("cn-big5", "Big5"); … … 153 157 registrar("x-euc", "EUC-JP"); 154 158 registrar("x-windows-949", "windows-949"); 155 registrar("KSC5601", " KSC_5601");159 registrar("KSC5601", "windows-949"); 156 160 registrar("x-uhc", "windows-949"); 157 161 registrar("shift-jis", "Shift_JIS"); … … 172 176 registrar("ISO8859-8", "ISO-8859-8"); 173 177 registrar("ISO8859-8-I", "ISO-8859-8-I"); 174 registrar("ISO8859-9", " ISO-8859-9");178 registrar("ISO8859-9", "windows-1254"); 175 179 registrar("ISO8859-10", "ISO-8859-10"); 176 180 registrar("ISO8859-13", "ISO-8859-13"); … … 184 188 { 185 189 // See comment above in registerEncodingNames. 186 registrar("ISO-8859-8-I", create, 0); 187 188 int32_t numEncodings = ucnv_countAvailable(); 189 for (int32_t i = 0; i < numEncodings; ++i) { 190 const char* name = ucnv_getAvailableName(i); 191 UErrorCode error = U_ZERO_ERROR; 192 const char* standardName = ucnv_getStandardName(name, "MIME", &error); 193 if (!U_SUCCESS(error) || !standardName) { 190 UErrorCode error = U_ZERO_ERROR; 191 const char* canonicalConverterName = ucnv_getCanonicalName("ISO-8859-8-I", "IANA", &error); 192 ASSERT(U_SUCCESS(error)); 193 registrar("ISO-8859-8-I", create, canonicalConverterName); 194 195 int32_t numConverters = ucnv_countAvailable(); 196 for (int32_t i = 0; i < numConverters; ++i) { 197 canonicalConverterName = ucnv_getAvailableName(i); 198 error = U_ZERO_ERROR; 199 const char* webStandardName = ucnv_getStandardName(canonicalConverterName, "MIME", &error); 200 if (!U_SUCCESS(error) || !webStandardName) { 194 201 error = U_ZERO_ERROR; 195 standardName = ucnv_getStandardName(name, "IANA", &error);196 if (!U_SUCCESS(error) || ! 
standardName)202 webStandardName = ucnv_getStandardName(canonicalConverterName, "IANA", &error); 203 if (!U_SUCCESS(error) || !webStandardName) 197 204 continue; 198 205 } 199 registrar(standardName, create, 0); 200 } 201 } 202 203 TextCodecICU::TextCodecICU(const char* encoding) 206 207 // Don't register codecs for overridden encodings. 208 if (strcmp(webStandardName, "GB2312") == 0 || strcmp(webStandardName, "GB_2312-80") == 0 209 || strcmp(webStandardName, "KSC_5601") == 0 || strcmp(webStandardName, "EUC-KR") == 0 210 || strcmp(webStandardName, "cp1363") == 0 211 || strcasecmp(webStandardName, "iso-8859-9") == 0 212 || strcmp(webStandardName, "TIS-620") == 0) 213 continue; 214 215 registrar(webStandardName, create, fastStrDup(canonicalConverterName)); 216 } 217 218 // These encodings currently don't have standard names, so we need to register encoders manually. 219 // FIXME: Is there a good way to determine the most up to date variant programmatically? 220 registrar("windows-874", create, "windows-874-2000"); 221 registrar("windows-949", create, "windows-949-2000"); 222 } 223 224 TextCodecICU::TextCodecICU(const char* encoding, const char* canonicalConverterName) 204 225 : m_encodingName(encoding) 226 , m_canonicalConverterName(canonicalConverterName) 205 227 , m_converterICU(0) 206 228 , m_needsGBKFallbacks(false) … … 219 241 if (cachedConverter) 220 242 ucnv_close(cachedConverter); 243 ucnv_reset(m_converterICU); 221 244 cachedConverter = m_converterICU; 222 245 m_converterICU = 0; … … 228 251 ASSERT(!m_converterICU); 229 252 230 m_needsGBKFallbacks = m_encodingName[0] == 'G' && m_encodingName[1] == 'B' && m_encodingName[2] == 'K' && !m_encodingName[3];231 232 253 UErrorCode err; 254 255 m_needsGBKFallbacks = !strcmp(m_encodingName, "GBK"); 233 256 234 257 UConverter*& cachedConverter = cachedConverterICU(); 235 258 if (cachedConverter) { 236 259 err = U_ZERO_ERROR; 237 const char* cached Name = ucnv_getName(cachedConverter, &err);238 if (U_SUCCESS(err) && 
m_encodingName == cachedName) {260 const char* cachedConverterName = ucnv_getName(cachedConverter, &err); 261 if (U_SUCCESS(err) && !strcmp(m_canonicalConverterName, cachedConverterName)) { 239 262 m_converterICU = cachedConverter; 240 263 cachedConverter = 0; … … 244 267 245 268 err = U_ZERO_ERROR; 246 m_converterICU = ucnv_open(m_encodingName, &err); 247 #if !LOG_DISABLED 248 if (err == U_AMBIGUOUS_ALIAS_WARNING) 249 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encodingName); 250 #endif 269 m_converterICU = ucnv_open(m_canonicalConverterName, &err); 270 ASSERT(U_SUCCESS(err)); 251 271 if (m_converterICU) 252 272 ucnv_setFallback(m_converterICU, TRUE); … … 337 357 // <http://bugs.webkit.org/show_bug.cgi?id=17014> 338 358 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. 359 // FIXME: strcasecmp is locale sensitive, we should not be using it. 339 360 if (strcmp(m_encodingName, "GBK") == 0 || strcasecmp(m_encodingName, "gb18030") == 0) 340 361 resultString.replace(0xE5E5, ideographicSpace); -
trunk/Source/WebCore/platform/text/TextCodecICU.h
r149924 r149947 43 43 44 44 private: 45 explicit TextCodecICU(const char* encoding);46 static PassOwnPtr<TextCodec> create(const TextEncoding&, const void* );45 TextCodecICU(const char* encoding, const char* canonicalConverterName); 46 static PassOwnPtr<TextCodec> create(const TextEncoding&, const void* additionalData); 47 47 48 48 virtual String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError); … … 58 58 59 59 const char* const m_encodingName; 60 const char* const m_canonicalConverterName; 60 61 mutable UConverter* m_converterICU; 61 62 mutable bool m_needsGBKFallbacks;
Note: See TracChangeset
for help on using the changeset viewer.