Changeset 73201 in webkit


Ignore:
Timestamp:
Dec 2, 2010 2:45:36 PM (13 years ago)
Author:
Patrick Gansterer
Message:

2010-12-02 Patrick Gansterer <paroga@webkit.org>

Reviewed by Darin Adler.

Add AtomicString::fromUTF8
https://bugs.webkit.org/show_bug.cgi?id=45594

Unicode::calculateStringHashFromUTF8 creates a StringHash out of UTF-8 input data and
calculates the required length for the UTF-16 conversion in one step.
This is then used in a specialized translator for the string table of AtomicString.

  • JavaScriptCore.exp:
  • JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
  • wtf/text/AtomicString.cpp: (WTF::CStringTranslator::equal): (WTF::HashAndUTF8CharactersTranslator::hash): (WTF::HashAndUTF8CharactersTranslator::equal): (WTF::HashAndUTF8CharactersTranslator::translate): (WTF::AtomicString::add): (WTF::AtomicString::addSlowCase): (WTF::AtomicString::find): (WTF::AtomicString::fromUTF8):
  • wtf/text/AtomicString.h:
  • wtf/text/StringImpl.h:
  • wtf/text/WTFString.h:
  • wtf/unicode/UTF8.cpp: (WTF::Unicode::readUTF8Sequence): (WTF::Unicode::convertUTF8ToUTF16): (WTF::Unicode::calculateStringHashFromUTF8): (WTF::Unicode::equalUTF16WithUTF8):
  • wtf/unicode/UTF8.h:

2010-12-02 Patrick Gansterer <paroga@webkit.org>

Reviewed by Darin Adler.

Add AtomicString::fromUTF8
https://bugs.webkit.org/show_bug.cgi?id=45594

Use AtomicString::fromUTF8 directly in the libxml2 parser.

  • dom/XMLDocumentParserLibxml2.cpp: (WebCore::toAtomicString):
Location:
trunk
Files:
11 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/ChangeLog

    r73198 r73201  
     12010-12-02  Patrick Gansterer  <paroga@webkit.org>
     2
     3        Reviewed by Darin Adler.
     4
     5        Add AtomicString::fromUTF8
     6        https://bugs.webkit.org/show_bug.cgi?id=45594
     7
     8        Unicode::calculateStringHashFromUTF8 creates a StringHash out of UTF8 input data and
     9        calculates the required length for the UTF16 conversion in one step.
     10        This is then used in a specialized translator for the string table of AtomicString.
     11
     12        * JavaScriptCore.exp:
     13        * JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
     14        * wtf/text/AtomicString.cpp:
     15        (WTF::CStringTranslator::equal):
     16        (WTF::HashAndUTF8CharactersTranslator::hash):
     17        (WTF::HashAndUTF8CharactersTranslator::equal):
     18        (WTF::HashAndUTF8CharactersTranslator::translate):
     19        (WTF::AtomicString::add):
     20        (WTF::AtomicString::addSlowCase):
     21        (WTF::AtomicString::find):
     22        (WTF::AtomicString::fromUTF8):
     23        * wtf/text/AtomicString.h:
     24        * wtf/text/StringImpl.h:
     25        * wtf/text/WTFString.h:
     26        * wtf/unicode/UTF8.cpp:
     27        (WTF::Unicode::readUTF8Sequence):
     28        (WTF::Unicode::convertUTF8ToUTF16):
     29        (WTF::Unicode::calculateStringHashFromUTF8):
     30        (WTF::Unicode::equalUTF16WithUTF8):
     31        * wtf/unicode/UTF8.h:
     32
    1332010-12-02  Geoffrey Garen  <ggaren@apple.com>
    234
  • trunk/JavaScriptCore/JavaScriptCore.exp

    r73095 r73201  
    374374__ZN3WTF12AtomicString4findEPKtjj
    375375__ZN3WTF12AtomicString4initEv
     376__ZN3WTF12AtomicString8fromUTF8EPKc
     377__ZN3WTF12AtomicString8fromUTF8EPKcm
    376378__ZN3WTF12createThreadEPFPvS0_ES0_
    377379__ZN3WTF12createThreadEPFPvS0_ES0_PKc
  • trunk/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def

    r73099 r73201  
    6060    ?bufferLengthForStringDecimal@DecimalNumber@WTF@@QBEIXZ
    6161    ?calculateDSTOffset@WTF@@YANNN@Z
     62    ?calculateStringHashFromUTF8@Unicode@WTF@@YAIPBD0AAI@Z
    6263    ?calculateUTCOffset@WTF@@YAHXZ
    6364    ?calculatedFunctionName@DebuggerCallFrame@JSC@@QBE?AVUString@2@XZ
     
    140141    ?enumerable@PropertyDescriptor@JSC@@QBE_NXZ
    141142    ?equal@Identifier@JSC@@SA_NPBVStringImpl@WTF@@PBD@Z
     143    ?equalUTF16WithUTF8@Unicode@WTF@@YA_NPB_W0PBD1@Z
    142144    ?evaluate@DebuggerCallFrame@JSC@@QBE?AVJSValue@2@ABVUString@2@AAV32@@Z
    143145    ?evaluate@JSC@@YA?AVCompletion@1@PAVExecState@1@AAVScopeChain@1@ABVSourceCode@1@VJSValue@1@@Z
  • trunk/JavaScriptCore/wtf/text/AtomicString.cpp

    r71375 r73201  
    11/*
    22 * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
     3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
    34 *
    45 * This library is free software; you can redistribute it and/or
     
    2728#include <wtf/Threading.h>
    2829#include <wtf/WTFThreadData.h>
     30#include <wtf/unicode/UTF8.h>
    2931
    3032namespace WTF {
     33
     34using namespace Unicode;
    3135
    3236COMPILE_ASSERT(sizeof(AtomicString) == sizeof(String), atomic_string_and_string_must_be_same_size);
     
    8690                return false;
    8791        }
    88         return s[length] == 0;
     92        return !s[length];
    8993    }
    9094
     
    207211};
    208212
     213struct HashAndUTF8Characters {
     214    unsigned hash;
     215    const char* characters;
     216    unsigned length;
     217    unsigned utf16Length;
     218};
     219
     220struct HashAndUTF8CharactersTranslator {
     221    static unsigned hash(const HashAndUTF8Characters& buffer)
     222    {
     223        return buffer.hash;
     224    }
     225
     226    static bool equal(StringImpl* const& string, const HashAndUTF8Characters& buffer)
     227    {
     228        return equalUTF16WithUTF8(string->characters(), string->characters() + string->length(), buffer.characters, buffer.characters + buffer.length);
     229    }
     230
     231    static void translate(StringImpl*& location, const HashAndUTF8Characters& buffer, unsigned hash)
     232    {
     233        UChar* target;
     234        location = StringImpl::createUninitialized(buffer.utf16Length, target).releaseRef();
     235
     236        const char* source = buffer.characters;
     237        if (convertUTF8ToUTF16(&source, source + buffer.length, &target, target + buffer.utf16Length) != conversionOK)
     238            ASSERT_NOT_REACHED();
     239
     240        location->setHash(hash);
     241        location->setIsAtomic(true);
     242    }
     243};
     244
    209245PassRefPtr<StringImpl> AtomicString::add(const UChar* s, unsigned length)
    210246{
     
    212248        return 0;
    213249
    214     if (length == 0)
     250    if (!length)
    215251        return StringImpl::empty();
    216252   
     
    228264    ASSERT(existingHash);
    229265
    230     if (length == 0)
     266    if (!length)
    231267        return StringImpl::empty();
    232268   
     
    247283        length++;
    248284
    249     if (length == 0)
     285    if (!length)
    250286        return StringImpl::empty();
    251287
     
    263299        return r;
    264300
    265     if (r->length() == 0)
     301    if (!r->length())
    266302        return StringImpl::empty();
    267303
     
    277313    ASSERT(existingHash);
    278314
    279     if (length == 0)
     315    if (!length)
    280316        return static_cast<AtomicStringImpl*>(StringImpl::empty());
    281317
     
    291327    stringTable().remove(r);
    292328}
    293    
     329
    294330AtomicString AtomicString::lower() const
    295331{
     
    304340}
    305341
    306 }
     342AtomicString AtomicString::fromUTF8(const char* characters, size_t length)
     343{
     344    if (!characters)
     345        return AtomicString();
     346
     347    if (!length)
     348        return emptyAtom;
     349
     350    HashAndUTF8Characters buffer;
     351    buffer.characters = characters;
     352    buffer.length = length;
     353    buffer.hash = calculateStringHashFromUTF8(characters, characters + length, buffer.utf16Length);
     354
     355    if (!buffer.hash)
     356        return AtomicString();
     357
     358    pair<HashSet<StringImpl*>::iterator, bool> addResult = stringTable().add<HashAndUTF8Characters, HashAndUTF8CharactersTranslator>(buffer);
     359
     360    // If the string is newly-translated, then we need to adopt it.
     361    // The boolean in the pair tells us if that is so.
     362    AtomicString atomicString;
     363    atomicString.m_string = addResult.second ? adoptRef(*addResult.first) : *addResult.first;
     364    return atomicString;
     365}
     366
     367AtomicString AtomicString::fromUTF8(const char* characters)
     368{
     369    if (!characters)
     370        return AtomicString();
     371    return fromUTF8(characters, strlen(characters));
     372}
     373
     374} // namespace WTF
  • trunk/JavaScriptCore/wtf/text/AtomicString.h

    r68422 r73201  
    109109#endif
    110110
     111    // AtomicString::fromUTF8 will return a null string if
     112    // the input data contains invalid UTF-8 sequences.
     113    static AtomicString fromUTF8(const char*, size_t);
     114    static AtomicString fromUTF8(const char*);
     115
    111116private:
    112117    String m_string;
  • trunk/JavaScriptCore/wtf/text/StringImpl.h

    r72477 r73201  
    5454struct CStringTranslator;
    5555struct HashAndCharactersTranslator;
     56struct HashAndUTF8CharactersTranslator;
    5657struct UCharBufferTranslator;
    5758
     
    6768    friend struct WTF::CStringTranslator;
    6869    friend struct WTF::HashAndCharactersTranslator;
     70    friend struct WTF::HashAndUTF8CharactersTranslator;
    6971    friend struct WTF::UCharBufferTranslator;
    7072    friend class AtomicStringImpl;
  • trunk/JavaScriptCore/wtf/text/WTFString.h

    r70288 r73201  
    310310#endif
    311311
     312    // String::fromUTF8 will return a null string if
     313    // the input data contains invalid UTF-8 sequences.
    312314    static String fromUTF8(const char*, size_t);
    313315    static String fromUTF8(const char*);
  • trunk/JavaScriptCore/wtf/unicode/UTF8.cpp

    r72979 r73201  
    11/*
    22 * Copyright (C) 2007 Apple Inc.  All rights reserved.
     3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
    34 *
    45 * Redistribution and use in source and binary forms, with or without
     
    2627#include "config.h"
    2728#include "UTF8.h"
     29#include <wtf/StringHasher.h>
    2830
    2931#include "ASCIICType.h"
     
    3335
    3436// FIXME: Use definition from CharacterNames.h.
    35 const UChar replacementCharacter = 0xFFFD;
     37static const UChar replacementCharacter = 0xFFFD;
    3638
    3739inline int inlineUTF8SequenceLengthNonASCII(char b0)
     
    315317}
    316318
     319unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
     320{
     321    if (!data)
     322        return 0;
     323
     324    WTF::StringHasher stringHasher;
     325    utf16Length = 0;
     326
     327    while (data < dataEnd) {
     328        if (isASCII(*data)) {
     329            stringHasher.addCharacter(*data++);
     330            utf16Length++;
     331            continue;
     332        }
     333
     334        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
     335
     336        if (dataEnd - data < utf8SequenceLength)
     337            return false;
     338
     339        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
     340            return 0;
     341
     342        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
     343        ASSERT(!isASCII(character));
     344
     345        if (U_IS_BMP(character)) {
     346            // UTF-16 surrogate values are illegal in UTF-32
     347            if (U_IS_SURROGATE(character))
     348                return 0;
     349            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
     350            utf16Length++;
     351        } else if (U_IS_SUPPLEMENTARY(character)) {
     352            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
     353                                       static_cast<UChar>(U16_TRAIL(character)));
     354            utf16Length += 2;
     355        } else
     356            return 0;
     357    }
     358
     359    return stringHasher.hash();
     360}
     361
     362bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
     363{
     364    while (b < bEnd) {
     365        if (isASCII(*b)) {
     366            if (*a++ != *b++)
     367                return false;
     368            continue;
     369        }
     370
     371        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
     372
     373        if (bEnd - b < utf8SequenceLength)
     374            return false;
     375
     376        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
     377            return 0;
     378
     379        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
     380        ASSERT(!isASCII(character));
     381
     382        if (U_IS_BMP(character)) {
     383            // UTF-16 surrogate values are illegal in UTF-32
     384            if (U_IS_SURROGATE(character))
     385                return false;
     386            if (*a++ != character)
     387                return false;
     388        } else if (U_IS_SUPPLEMENTARY(character)) {
     389            if (*a++ != U16_LEAD(character))
     390                return false;
     391            if (*a++ != U16_TRAIL(character))
     392                return false;
     393        } else
     394            return false;
     395    }
     396
     397    return a == aEnd;
     398}
     399
    317400} // namespace Unicode
    318401} // namespace WTF
  • trunk/JavaScriptCore/wtf/unicode/UTF8.h

    r27810 r73201  
    3030
    3131namespace WTF {
    32   namespace Unicode {
     32namespace Unicode {
    3333
    3434    // Given a first byte, gives the length of the UTF-8 sequence it begins.
     
    7070                    const UChar** sourceStart, const UChar* sourceEnd,
    7171                    char** targetStart, char* targetEnd, bool strict = true);
    72   }
    73 }
     72
     73    unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length);
     74
     75    bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd);
     76
     77} // namespace Unicode
     78} // namespace WTF
    7479
    7580#endif // WTF_UTF8_h
  • trunk/WebCore/ChangeLog

    r73197 r73201  
     12010-12-02  Patrick Gansterer  <paroga@webkit.org>
     2
     3        Reviewed by Darin Adler.
     4
     5        Add AtomicString::fromUTF8
     6        https://bugs.webkit.org/show_bug.cgi?id=45594
     7
     8        Use AtomicString::fromUTF8 directly in the libxml2 parser.
     9
     10        * dom/XMLDocumentParserLibxml2.cpp:
     11        (WebCore::toAtomicString):
     12
    1132010-12-02  Andy Estes  <aestes@apple.com>
    214
  • trunk/WebCore/dom/XMLDocumentParserLibxml2.cpp

    r71895 r73201  
    687687static inline AtomicString toAtomicString(const xmlChar* string, size_t size)
    688688{
    689     // FIXME: Use AtomicString::fromUTF8.
    690     return AtomicString(toString(string, size));
     689    return AtomicString::fromUTF8(reinterpret_cast<const char*>(string), size);
    691690}
    692691
    693692static inline AtomicString toAtomicString(const xmlChar* string)
    694693{
    695     // FIXME: Use AtomicString::fromUTF8.
    696     return AtomicString(toString(string));
     694    return AtomicString::fromUTF8(reinterpret_cast<const char*>(string));
    697695}
    698696
Note: See TracChangeset for help on using the changeset viewer.