Changeset 206125 in webkit


Ignore:
Timestamp:
Sep 19, 2016 4:03:02 PM (8 years ago)
Author:
achristensen@apple.com
Message:

URLParser should parse serialized valid URLs faster than unknown input
https://bugs.webkit.org/show_bug.cgi?id=162228

Reviewed by Chris Dumez.

The URL constructor with ParsedURLStringTag is almost twice as fast as the other URL constructors.
When parsing a serialized URL, assuming that there are no tabs or newlines and that characters are already encoded decreases the URLParser
runtime by over 25%; it also adds infrastructure for more optimizations.

No new tests. No change in behaviour.

  • platform/URL.cpp:

(WebCore::URL::URL):

  • platform/URLParser.cpp:

(WebCore::utf8PercentEncode):
(WebCore::utf8PercentEncodeQuery):
(WebCore::URLParser::parse):
(WebCore::URLParser::parseSerializedURL):
(WebCore::incrementIterator):
(WebCore::URLParser::parseAuthority):
(WebCore::URLParser::parsePort):
(WebCore::URLParser::parseHost):

  • platform/URLParser.h:

(WebCore::URLParser::parse): Deleted.

Location:
trunk/Source/WebCore
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/Source/WebCore/ChangeLog

    r206122 r206125  
     1 2016-09-19  Alex Christensen  <achristensen@webkit.org>
     2
     3        URLParser should parse serialized valid URLs faster than unknown input
     4        https://bugs.webkit.org/show_bug.cgi?id=162228
     5
     6        Reviewed by Chris Dumez.
     7
     8        The URL constructor with ParsedURLStringTag is almost twice as fast as the other URL constructors.
     9        When parsing a serialized URL, assuming that there are no tabs or newlines and that characters are already encoded decreases the URLParser
     10        runtime by over 25%; it also adds infrastructure for more optimizations.
     11
     12        No new tests. No change in behaviour.
     13
     14        * platform/URL.cpp:
     15        (WebCore::URL::URL):
     16        * platform/URLParser.cpp:
     17        (WebCore::utf8PercentEncode):
     18        (WebCore::utf8PercentEncodeQuery):
     19        (WebCore::URLParser::parse):
     20        (WebCore::URLParser::parseSerializedURL):
     21        (WebCore::incrementIterator):
     22        (WebCore::URLParser::parseAuthority):
     23        (WebCore::URLParser::parsePort):
     24        (WebCore::URLParser::parseHost):
     25        * platform/URLParser.h:
     26        (WebCore::URLParser::parse): Deleted.
     27
    1 28 2016-09-19  Daniel Bates  <dabates@apple.com>
    2 29
  • trunk/Source/WebCore/platform/URL.cpp

    r206076 r206125  
    443443    if (URLParser::enabled()) {
    444444        URLParser parser;
    445         *this = parser.parse(url);
    446         ASSERT((url.isEmpty() && m_string.isEmpty()) || url == m_string); // FIXME: Investigate parsing non-null empty ParsedURLStrings.
    447     } else {
     445        *this = parser.parseSerializedURL(url);
     446    } else
    448447        parse(url);
    449448#if OS(WINDOWS)
     
    453452        ASSERT(url == m_string);
    454453#endif
    455     }
    456454}
    457455
  • trunk/Source/WebCore/platform/URLParser.cpp

    r206076 r206125  
    436436}
    437437
     438template<bool serialized>
    438439inline static void utf8PercentEncode(UChar32 codePoint, Vector<LChar>& destination, bool(*isInCodeSet)(UChar32))
    439440{
    440     if (isInCodeSet(codePoint)) {
     441    if (serialized) {
     442        ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
     443        ASSERT_WITH_SECURITY_IMPLICATION(!isInCodeSet(codePoint));
     444        destination.append(codePoint);
     445    } else {
     446        if (isInCodeSet(codePoint)) {
     447            uint8_t buffer[U8_MAX_LENGTH];
     448            int32_t offset = 0;
     449            UBool error = false;
     450            U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
     451            // FIXME: Check error.
     452            for (int32_t i = 0; i < offset; ++i)
     453                percentEncode(buffer[i], destination);
     454        } else {
     455            ASSERT_WITH_MESSAGE(isASCII(codePoint), "isInCodeSet should always return true for non-ASCII characters");
     456            destination.append(codePoint);
     457        }
     458    }
     459}
     460
     461template<bool serialized>
     462inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& destination)
     463{
     464    if (serialized) {
     465        ASSERT_WITH_SECURITY_IMPLICATION(isASCII(codePoint));
     466        ASSERT_WITH_SECURITY_IMPLICATION(!shouldPercentEncodeQueryByte(codePoint));
     467        destination.append(codePoint);
     468    } else {
    441469        uint8_t buffer[U8_MAX_LENGTH];
    442470        int32_t offset = 0;
    443471        UBool error = false;
    444472        U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
     473        ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
    445474        // FIXME: Check error.
    446         for (int32_t i = 0; i < offset; ++i)
    447             percentEncode(buffer[i], destination);
    448     } else {
    449         ASSERT_WITH_MESSAGE(isASCII(codePoint), "isInCodeSet should always return true for non-ASCII characters");
    450         destination.append(codePoint);
    451     }
    452 }
    453 
    454 inline static void utf8PercentEncodeQuery(UChar32 codePoint, Vector<LChar>& destination)
    455 {
    456     uint8_t buffer[U8_MAX_LENGTH];
    457     int32_t offset = 0;
    458     UBool error = false;
    459     U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, error);
    460     ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer)));
    461     // FIXME: Check error.
    462     for (int32_t i = 0; i < offset; ++i) {
    463         auto byte = buffer[i];
    464         if (shouldPercentEncodeQueryByte(byte))
    465             percentEncode(byte, destination);
    466         else
    467             destination.append(byte);
     475        for (int32_t i = 0; i < offset; ++i) {
     476            auto byte = buffer[i];
     477            if (shouldPercentEncodeQueryByte(byte))
     478                percentEncode(byte, destination);
     479            else
     480                destination.append(byte);
     481        }
    468482    }
    469483}
     
    864878URL URLParser::parse(const String& input, const URL& base, const TextEncoding& encoding)
    865879{
     880    const bool serialized = false;
    866881    if (input.is8Bit())
    867         return parse(input.characters8(), input.length(), base, encoding);
    868     return parse(input.characters16(), input.length(), base, encoding);
    869 }
    870 
    871 template<typename CharacterType>
     882        return parse<serialized>(input.characters8(), input.length(), base, encoding);
     883    return parse<serialized>(input.characters16(), input.length(), base, encoding);
     884}
     885
     886URL URLParser::parseSerializedURL(const String& input)
     887{
     888    const bool serialized = true;
     889    if (input.is8Bit())
     890        return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
     891    return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
     892}
     893
     894template<bool serialized, typename CharacterType>
     895void incrementIteratorSkippingTabAndNewLine(CodePointIterator<CharacterType>& iterator)
     896{
     897    ++iterator;
     898    while (!serialized && !iterator.atEnd() && isTabOrNewline(*iterator))
     899        ++iterator;
     900}
     901   
     902template<bool serialized, typename CharacterType>
    872903URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
    873904{
     
    917948    State state = State::SchemeStart;
    918949    while (!c.atEnd()) {
    919         if (isTabOrNewline(*c)) {
     950        if (!serialized && isTabOrNewline(*c)) {
    920951            ++c;
    921952            continue;
     
    961992                    m_url.m_portEnd = m_url.m_userStart;
    962993                    auto maybeSlash = c;
    963                     ++maybeSlash;
    964                     while (!maybeSlash.atEnd() && isTabOrNewline(*maybeSlash))
    965                         ++maybeSlash;
     994                    incrementIteratorSkippingTabAndNewLine<serialized>(maybeSlash);
    966995                    if (!maybeSlash.atEnd() && *maybeSlash == '/') {
    967996                        m_asciiBuffer.append('/');
     
    9841013                break;
    9851014            }
    986             ++c;
    987             while (!c.atEnd() && isTabOrNewline(*c))
    988                 ++c;
     1015            incrementIteratorSkippingTabAndNewLine<serialized>(c);
    9891016            if (c.atEnd()) {
    9901017                m_asciiBuffer.clear();
     
    10161043            if (*c == '/') {
    10171044                m_asciiBuffer.append('/');
    1018                 ++c;
    1019                 while (!c.atEnd() && isTabOrNewline(*c))
    1020                     ++c;
     1045                incrementIteratorSkippingTabAndNewLine<serialized>(c);
    10211046                if (c.atEnd())
    10221047                    return failure(input, length);
     
    10841109            m_asciiBuffer.append("//", 2);
    10851110            if (*c == '/' || *c == '\\') {
    1086                 ++c;
    1087                 while (!c.atEnd() && isTabOrNewline(*c))
    1088                     ++c;
     1111                incrementIteratorSkippingTabAndNewLine<serialized>(c);
    10891112                if (!c.atEnd() && (*c == '/' || *c == '\\'))
    10901113                    ++c;
     
    11061129            {
    11071130                if (*c == '@') {
    1108                     parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, c));
    1109                     ++c;
    1110                     while (!c.atEnd() && isTabOrNewline(*c))
    1111                         ++c;
     1131                    parseAuthority<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c));
     1132                    incrementIteratorSkippingTabAndNewLine<serialized>(c);
    11121133                    authorityOrHostBegin = c;
    11131134                    state = State::Host;
     
    11191140                    m_url.m_userEnd = m_asciiBuffer.size();
    11201141                    m_url.m_passwordEnd = m_url.m_userEnd;
    1121                     if (!parseHost(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
     1142                    if (!parseHost<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
    11221143                        return failure(input, length);
    11231144                    if (!isSlash) {
     
    11361157            LOG_STATE("Host");
    11371158            if (*c == '/' || *c == '?' || *c == '#') {
    1138                 if (!parseHost(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
     1159                if (!parseHost<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
    11391160                    return failure(input, length);
    11401161                state = State::Path;
     
    12661287                    break;
    12671288                }
    1268                 if (!parseHost(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
     1289                if (!parseHost<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
    12691290                    return failure(input, length);
    12701291               
     
    13281349                break;
    13291350            }
    1330             utf8PercentEncode(*c, m_asciiBuffer, isInDefaultEncodeSet);
     1351            utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInDefaultEncodeSet);
    13311352            ++c;
    13321353            break;
     
    13411362                state = State::Fragment;
    13421363            } else {
    1343                 utf8PercentEncode(*c, m_asciiBuffer, isInSimpleEncodeSet);
     1364                utf8PercentEncode<serialized>(*c, m_asciiBuffer, isInSimpleEncodeSet);
    13441365                ++c;
    13451366            }
     
    13551376            }
    13561377            if (isUTF8Encoding)
    1357                 utf8PercentEncodeQuery(*c, m_asciiBuffer);
     1378                utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
    13581379            else
    13591380                queryBuffer.append(*c);
     
    14311452        if (state == State::Host)
    14321453            LOG_FINAL_STATE("Host");
    1433         if (!parseHost(authorityOrHostBegin))
     1454        if (!parseHost<serialized>(authorityOrHostBegin))
    14341455            return failure(input, length);
    14351456        m_asciiBuffer.append('/');
     
    14851506        }
    14861507
    1487         if (!parseHost(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
     1508        if (!parseHost<serialized>(CodePointIterator<CharacterType>(authorityOrHostBegin, c)))
    14881509            return failure(input, length);
    14891510
     
    15431564}
    15441565
    1545 template<typename CharacterType>
     1566template<bool serialized, typename CharacterType>
    15461567void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator)
    15471568{
     
    15641585            break;
    15651586        }
    1566         utf8PercentEncode(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
     1587        utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
    15671588    }
    15681589    for (; !iterator.atEnd(); ++iterator)
    1569         utf8PercentEncode(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
     1590        utf8PercentEncode<serialized>(*iterator, m_asciiBuffer, isInUserInfoEncodeSet);
    15701591    m_url.m_passwordEnd = m_asciiBuffer.size();
    15711592    if (!m_url.m_userEnd)
     
    19301951}
    19311952
    1932 template<typename CharacterType>
     1953template<bool serialized, typename CharacterType>
    19331954bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator)
    19341955{
     
    19401961    m_asciiBuffer.append(':');
    19411962    for (; !iterator.atEnd(); ++iterator) {
    1942         if (isTabOrNewline(*iterator))
     1963        if (!serialized && isTabOrNewline(*iterator))
    19431964            continue;
    19441965        if (isASCIIDigit(*iterator)) {
     
    19601981}
    19611982
    1962 template<typename CharacterType>
     1983template<bool serialized, typename CharacterType>
    19631984bool URLParser::parseHost(CodePointIterator<CharacterType> iterator)
    19641985{
     
    19771998                if (!ipv6End.atEnd() && *ipv6End == ':') {
    19781999                    ++ipv6End;
    1979                     return parsePort(ipv6End);
     2000                    return parsePort<serialized>(ipv6End);
    19802001                }
    19812002                m_url.m_portEnd = m_asciiBuffer.size();
     
    19892010        auto hostIterator = iterator;
    19902011        for (; !iterator.atEnd(); ++iterator) {
    1991             if (isTabOrNewline(*iterator))
     2012            if (!serialized && isTabOrNewline(*iterator))
    19922013                continue;
    19932014            if (*iterator == ':')
     
    20042025            }
    20052026            ++iterator;
    2006             return parsePort(iterator);
     2027            return parsePort<serialized>(iterator);
    20072028        }
    20082029        for (; hostIterator != iterator; ++hostIterator) {
    2009             if (!isTabOrNewline(*hostIterator))
     2030            if (serialized || !isTabOrNewline(*hostIterator))
    20102031                m_asciiBuffer.append(toASCIILower(*hostIterator));
    20112032        }
     
    20132034        if (!hostIterator.atEnd()) {
    20142035            ASSERT(*hostIterator == ':');
    2015             ++hostIterator;
    2016             while (!hostIterator.atEnd() && isTabOrNewline(*hostIterator))
    2017                 ++hostIterator;
    2018             return parsePort(hostIterator);
     2036            incrementIteratorSkippingTabAndNewLine<serialized>(hostIterator);
     2037            return parsePort<serialized>(hostIterator);
    20192038        }
    20202039        m_url.m_portEnd = m_asciiBuffer.size();
     
    20252044    StringBuilder utf8Encoded;
    20262045    for (; !iterator.atEnd(); ++iterator) {
    2027         if (isTabOrNewline(*iterator))
     2046        if (!serialized && isTabOrNewline(*iterator))
    20282047            continue;
    20292048        if (*iterator == ':')
     
    20562075        }
    20572076        ++iterator;
    2058         return parsePort(iterator);
     2077        return parsePort<serialized>(iterator);
    20592078    }
    20602079
     
    20632082    if (!iterator.atEnd()) {
    20642083        ASSERT(*iterator == ':');
    2065         ++iterator;
    2066         while (!iterator.atEnd() && isTabOrNewline(*iterator))
    2067             ++iterator;
    2068         return parsePort(iterator);
     2084        incrementIteratorSkippingTabAndNewLine<serialized>(iterator);
     2085        return parsePort<serialized>(iterator);
    20692086    }
    20702087    m_url.m_portEnd = m_asciiBuffer.size();
  • trunk/Source/WebCore/platform/URLParser.h

    r206044 r206125  
    3838public:
    3939    WEBCORE_EXPORT URL parse(const String&, const URL& = { }, const TextEncoding& = UTF8Encoding());
     40    WEBCORE_EXPORT URL parseSerializedURL(const String&);
    4041    WEBCORE_EXPORT static bool allValuesEqual(const URL&, const URL&);
    4142
     
    5455    bool m_hostHasPercentOrNonASCII { false };
    5556
    56     template<typename CharacterType> URL parse(const CharacterType*, const unsigned length, const URL&, const TextEncoding&);
    57     template<typename CharacterType> void parseAuthority(CodePointIterator<CharacterType>);
    58     template<typename CharacterType> bool parseHost(CodePointIterator<CharacterType>);
    59     template<typename CharacterType> bool parsePort(CodePointIterator<CharacterType>&);
     57    template<bool serialized, typename CharacterType> URL parse(const CharacterType*, const unsigned length, const URL&, const TextEncoding&);
     58    template<bool serialized, typename CharacterType> void parseAuthority(CodePointIterator<CharacterType>);
     59    template<bool serialized, typename CharacterType> bool parseHost(CodePointIterator<CharacterType>);
     60    template<bool serialized, typename CharacterType> bool parsePort(CodePointIterator<CharacterType>&);
    6061    template<typename CharacterType> URL failure(const CharacterType*, unsigned length);
    6162
Note: See TracChangeset for help on using the changeset viewer.