Changeset 28181 in webkit


Ignore:
Timestamp:
Nov 29, 2007 3:37:40 AM (16 years ago)
Author:
eric@webkit.org
Message:

2007-11-26 Eric Seidel <eric@webkit.org>

Reviewed by Sam.

Pull first_byte and req_byte optimizations out into separate static funtions, SunSpider reported this as a win.

  • pcre/pcre_exec.cpp: (tryFirstByteOptimization): (tryRequiredByteOptimization): (jsRegExpExecute):
  • pcre/pcre_internal.h:
Location:
trunk/JavaScriptCore
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/JavaScriptCore/ChangeLog

    r28180 r28181  
    3939        * pcre/pcre_exec.cpp:
    4040        (match):
     41
     422007-11-26  Eric Seidel  <eric@webkit.org>
     43
     44        Reviewed by Sam.
     45
     46        Pull first_byte and req_byte optimizations out into separate static funtions, SunSpider reported this as a win.
     47
     48        * pcre/pcre_exec.cpp:
     49        (tryFirstByteOptimization):
     50        (tryRequiredByteOptimization):
     51        (jsRegExpExecute):
     52        * pcre/pcre_internal.h:
    4153
    42542007-11-26  Eric Seidel  <eric@webkit.org>
  • trunk/JavaScriptCore/pcre/pcre_exec.cpp

    r28180 r28181  
    19721972*/
    19731973
     1974static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
     1975{
     1976    // If first_byte is set, try scanning to the first instance of that byte
     1977    // no need to try and match against any earlier part of the subject string.
     1978    if (first_byte >= 0) {
     1979        UChar first_char = first_byte;
     1980        if (first_byte_caseless)
     1981            while (subjectPtr < endSubject) {
     1982                int c = *subjectPtr;
     1983                if (c > 127)
     1984                    break;
     1985                if (toLowerCase(c) == first_char)
     1986                    break;
     1987                subjectPtr++;
     1988            }
     1989        else {
     1990            while (subjectPtr < endSubject && *subjectPtr != first_char)
     1991                subjectPtr++;
     1992        }
     1993    } else if (useMultiLineFirstCharOptimization) {
     1994        /* Or to just after \n for a multiline match if possible */
     1995        // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
     1996        if (subjectPtr > originalSubjectStart) {
     1997            while (subjectPtr < endSubject && !isNewline(subjectPtr[-1]))
     1998                subjectPtr++;
     1999        }
     2000    }
     2001}
     2002
     2003static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByte, const UChar*& req_byte_ptr)
     2004{
     2005    /* If req_byte is set, we know that that character must appear in the subject
     2006     for the match to succeed. If the first character is set, req_byte must be
     2007     later in the subject; otherwise the test starts at the match point. This
     2008     optimization can save a huge amount of backtracking in patterns with nested
     2009     unlimited repeats that aren't going to match. Writing separate code for
     2010     cased/caseless versions makes it go faster, as does using an autoincrement
     2011     and backing off on a match.
     2012     
     2013     HOWEVER: when the subject string is very, very long, searching to its end can
     2014     take a long time, and give bad performance on quite ordinary patterns. This
     2015     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
     2016     don't do this when the string is sufficiently long.
     2017    */
     2018
     2019    if (req_byte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
     2020        const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
     2021
     2022        /* We don't need to repeat the search if we haven't yet reached the
     2023         place we found it at last time. */
     2024
     2025        if (p > req_byte_ptr) {
     2026            if (req_byte_caseless) {
     2027                while (p < endSubject) {
     2028                    int pp = *p++;
     2029                    if (pp == req_byte || pp == req_byte2) {
     2030                        p--;
     2031                        break;
     2032                    }
     2033                }
     2034            } else {
     2035                while (p < endSubject) {
     2036                    if (*p++ == req_byte) {
     2037                        p--;
     2038                        break;
     2039                    }
     2040                }
     2041            }
     2042
     2043            /* If we can't find the required character, break the matching loop */
     2044
     2045            if (p >= endSubject)
     2046                return true;
     2047
     2048            /* If we have found the required character, save the point where we
     2049             found it, so that we don't search again next time round the loop if
     2050             the start hasn't passed this character yet. */
     2051
     2052            req_byte_ptr = p;
     2053        }
     2054    }
     2055    return false;
     2056}
     2057
    19742058int jsRegExpExecute(const JSRegExp* re,
    19752059                    const UChar* subject, int length, int start_offset, int* offsets,
     
    20542138    int req_byte2 = -1;
    20552139    if (re->options & PCRE_REQCHSET) {
    2056         req_byte = re->req_byte & 255;
     2140        req_byte = re->req_byte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
    20572141        req_byte_caseless = (re->req_byte & REQ_IGNORE_CASE);
    20582142        req_byte2 = flipCase(req_byte);
     
    20672151   
    20682152    do {
    2069         const UChar* save_end_subject = end_subject;
    2070        
    20712153        /* Reset the maximum number of extractions we might see. */
    2072        
    20732154        if (match_block.offset_vector) {
    20742155            int* iptr = match_block.offset_vector;
     
    20782159        }
    20792160       
    2080         /* Advance to a unique first char if possible. If firstline is true, the
    2081          start of the match is constrained to the first line of a multiline string.
    2082          Implement this by temporarily adjusting end_subject so that we stop scanning
    2083          at a newline. If the match fails at the newline, later code breaks this loop.
    2084          */
    2085        
    2086         /* Now test for a unique first byte */
    2087        
    2088         if (first_byte >= 0) {
    2089             UChar first_char = first_byte;
    2090             if (first_byte_caseless)
    2091                 while (start_match < end_subject) {
    2092                     int sm = *start_match;
    2093                     if (sm > 127)
    2094                         break;
    2095                     if (toLowerCase(sm) == first_char)
    2096                         break;
    2097                     start_match++;
    2098                 }
    2099             else
    2100                 while (start_match < end_subject && *start_match != first_char)
    2101                     start_match++;
    2102         }
    2103        
    2104         /* Or to just after \n for a multiline match if possible */
    2105         else if (useMultiLineFirstCharOptimization) {
    2106             if (start_match > match_block.start_subject + start_offset) {
    2107                 while (start_match < end_subject && !isNewline(start_match[-1]))
    2108                     start_match++;
    2109             }
    2110         }
    2111        
    2112         /* Restore fudged end_subject */
    2113        
    2114         end_subject = save_end_subject;
    2115        
    2116 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
    2117         printf(">>>> Match against: ");
    2118         pchars(start_match, end_subject - start_match, true, &match_block);
    2119         printf("\n");
    2120 #endif
    2121        
    2122         /* If req_byte is set, we know that that character must appear in the subject
    2123          for the match to succeed. If the first character is set, req_byte must be
    2124          later in the subject; otherwise the test starts at the match point. This
    2125          optimization can save a huge amount of backtracking in patterns with nested
    2126          unlimited repeats that aren't going to match. Writing separate code for
    2127          cased/caseless versions makes it go faster, as does using an autoincrement
    2128          and backing off on a match.
    2129          
    2130          HOWEVER: when the subject string is very, very long, searching to its end can
    2131          take a long time, and give bad performance on quite ordinary patterns. This
    2132          showed up when somebody was matching /^C/ on a 32-megabyte string... so we
    2133          don't do this when the string is sufficiently long.
    2134          
    2135          ALSO: this processing is disabled when partial matching is requested.
    2136          */
    2137        
    2138         if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) {
    2139             const UChar* p = start_match + ((first_byte >= 0) ? 1 : 0);
    2140            
    2141             /* We don't need to repeat the search if we haven't yet reached the
    2142              place we found it at last time. */
    2143            
    2144             if (p > req_byte_ptr) {
    2145                 if (req_byte_caseless) {
    2146                     while (p < end_subject) {
    2147                         int pp = *p++;
    2148                         if (pp == req_byte || pp == req_byte2) {
    2149                             p--;
    2150                             break;
    2151                         }
    2152                     }
    2153                 } else {
    2154                     while (p < end_subject) {
    2155                         if (*p++ == req_byte) {
    2156                             p--;
    2157                             break;
    2158                         }
    2159                     }
    2160                 }
    2161                
    2162                 /* If we can't find the required character, break the matching loop */
    2163                
    2164                 if (p >= end_subject)
    2165                     break;
    2166                
    2167                 /* If we have found the required character, save the point where we
    2168                  found it, so that we don't search again next time round the loop if
    2169                  the start hasn't passed this character yet. */
    2170                
    2171                 req_byte_ptr = p;
    2172             }
    2173         }
    2174        
     2161        tryFirstByteOptimization(start_match, end_subject, first_byte, first_byte_caseless, useMultiLineFirstCharOptimization, match_block.start_subject + start_offset);
     2162        if (tryRequiredByteOptimization(start_match, end_subject, req_byte, req_byte2, req_byte_caseless, first_byte >= 0, req_byte_ptr))
     2163            break;
     2164               
    21752165        /* When a match occurs, substrings will be set for all internal extractions;
    21762166         we just need to set up the whole thing as substring 0 before returning. If
  • trunk/JavaScriptCore/pcre/pcre_internal.h

    r28180 r28181  
    550550
    551551struct JSRegExp {
    552   pcre_uint32 size;               /* Total that was malloced */
    553   pcre_uint32 options;
    554 
    555   pcre_uint16 top_bracket;
    556   pcre_uint16 top_backref;
    557   pcre_uint16 first_byte;
    558   pcre_uint16 req_byte;
     552    pcre_uint32 size;               /* Total that was malloced */
     553    pcre_uint32 options;
     554
     555    pcre_uint16 top_bracket;
     556    pcre_uint16 top_backref;
     557   
     558    // jsRegExpExecute && jsRegExpCompile currently only how to handle ASCII
     559    // chars for thse optimizations, however it would be trivial to add support
     560    // for optimized UChar first_byte/req_byte scans
     561    unsigned char first_byte;
     562    unsigned char req_byte;
    559563};
    560564
Note: See TracChangeset for help on using the changeset viewer.