Changeset 28181 in webkit
- Timestamp:
- Nov 29, 2007 3:37:40 AM (16 years ago)
- Location:
- trunk/JavaScriptCore
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/JavaScriptCore/ChangeLog
r28180 r28181 39 39 * pcre/pcre_exec.cpp: 40 40 (match): 41 42 2007-11-26 Eric Seidel <eric@webkit.org> 43 44 Reviewed by Sam. 45 46 Pull first_byte and req_byte optimizations out into separate static funtions, SunSpider reported this as a win. 47 48 * pcre/pcre_exec.cpp: 49 (tryFirstByteOptimization): 50 (tryRequiredByteOptimization): 51 (jsRegExpExecute): 52 * pcre/pcre_internal.h: 41 53 42 54 2007-11-26 Eric Seidel <eric@webkit.org> -
trunk/JavaScriptCore/pcre/pcre_exec.cpp
r28180 r28181 1972 1972 */ 1973 1973 1974 static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int first_byte, bool first_byte_caseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) 1975 { 1976 // If first_byte is set, try scanning to the first instance of that byte 1977 // no need to try and match against any earlier part of the subject string. 1978 if (first_byte >= 0) { 1979 UChar first_char = first_byte; 1980 if (first_byte_caseless) 1981 while (subjectPtr < endSubject) { 1982 int c = *subjectPtr; 1983 if (c > 127) 1984 break; 1985 if (toLowerCase(c) == first_char) 1986 break; 1987 subjectPtr++; 1988 } 1989 else { 1990 while (subjectPtr < endSubject && *subjectPtr != first_char) 1991 subjectPtr++; 1992 } 1993 } else if (useMultiLineFirstCharOptimization) { 1994 /* Or to just after \n for a multiline match if possible */ 1995 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 1996 if (subjectPtr > originalSubjectStart) { 1997 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1])) 1998 subjectPtr++; 1999 } 2000 } 2001 } 2002 2003 static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int req_byte, int req_byte2, bool req_byte_caseless, bool hasFirstByte, const UChar*& req_byte_ptr) 2004 { 2005 /* If req_byte is set, we know that that character must appear in the subject 2006 for the match to succeed. If the first character is set, req_byte must be 2007 later in the subject; otherwise the test starts at the match point. This 2008 optimization can save a huge amount of backtracking in patterns with nested 2009 unlimited repeats that aren't going to match. Writing separate code for 2010 cased/caseless versions makes it go faster, as does using an autoincrement 2011 and backing off on a match. 2012 2013 HOWEVER: when the subject string is very, very long, searching to its end can 2014 take a long time, and give bad performance on quite ordinary patterns. This 2015 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 2016 don't do this when the string is sufficiently long. 2017 */ 2018 2019 if (req_byte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { 2020 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); 2021 2022 /* We don't need to repeat the search if we haven't yet reached the 2023 place we found it at last time. */ 2024 2025 if (p > req_byte_ptr) { 2026 if (req_byte_caseless) { 2027 while (p < endSubject) { 2028 int pp = *p++; 2029 if (pp == req_byte || pp == req_byte2) { 2030 p--; 2031 break; 2032 } 2033 } 2034 } else { 2035 while (p < endSubject) { 2036 if (*p++ == req_byte) { 2037 p--; 2038 break; 2039 } 2040 } 2041 } 2042 2043 /* If we can't find the required character, break the matching loop */ 2044 2045 if (p >= endSubject) 2046 return true; 2047 2048 /* If we have found the required character, save the point where we 2049 found it, so that we don't search again next time round the loop if 2050 the start hasn't passed this character yet. */ 2051 2052 req_byte_ptr = p; 2053 } 2054 } 2055 return false; 2056 } 2057 1974 2058 int jsRegExpExecute(const JSRegExp* re, 1975 2059 const UChar* subject, int length, int start_offset, int* offsets, … … 2054 2138 int req_byte2 = -1; 2055 2139 if (re->options & PCRE_REQCHSET) { 2056 req_byte = re->req_byte & 255; 2140 req_byte = re->req_byte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... 2057 2141 req_byte_caseless = (re->req_byte & REQ_IGNORE_CASE); 2058 2142 req_byte2 = flipCase(req_byte); … … 2067 2151 2068 2152 do { 2069 const UChar* save_end_subject = end_subject;2070 2071 2153 /* Reset the maximum number of extractions we might see. */ 2072 2073 2154 if (match_block.offset_vector) { 2074 2155 int* iptr = match_block.offset_vector; … … 2078 2159 } 2079 2160 2080 /* Advance to a unique first char if possible. If firstline is true, the 2081 start of the match is constrained to the first line of a multiline string. 2082 Implement this by temporarily adjusting end_subject so that we stop scanning 2083 at a newline. If the match fails at the newline, later code breaks this loop. 2084 */ 2085 2086 /* Now test for a unique first byte */ 2087 2088 if (first_byte >= 0) { 2089 UChar first_char = first_byte; 2090 if (first_byte_caseless) 2091 while (start_match < end_subject) { 2092 int sm = *start_match; 2093 if (sm > 127) 2094 break; 2095 if (toLowerCase(sm) == first_char) 2096 break; 2097 start_match++; 2098 } 2099 else 2100 while (start_match < end_subject && *start_match != first_char) 2101 start_match++; 2102 } 2103 2104 /* Or to just after \n for a multiline match if possible */ 2105 else if (useMultiLineFirstCharOptimization) { 2106 if (start_match > match_block.start_subject + start_offset) { 2107 while (start_match < end_subject && !isNewline(start_match[-1])) 2108 start_match++; 2109 } 2110 } 2111 2112 /* Restore fudged end_subject */ 2113 2114 end_subject = save_end_subject; 2115 2116 #ifdef DEBUG /* Sigh. Some compilers never learn. */ 2117 printf(">>>> Match against: "); 2118 pchars(start_match, end_subject - start_match, true, &match_block); 2119 printf("\n"); 2120 #endif 2121 2122 /* If req_byte is set, we know that that character must appear in the subject 2123 for the match to succeed. If the first character is set, req_byte must be 2124 later in the subject; otherwise the test starts at the match point. This 2125 optimization can save a huge amount of backtracking in patterns with nested 2126 unlimited repeats that aren't going to match. Writing separate code for 2127 cased/caseless versions makes it go faster, as does using an autoincrement 2128 and backing off on a match. 2129 2130 HOWEVER: when the subject string is very, very long, searching to its end can 2131 take a long time, and give bad performance on quite ordinary patterns. This 2132 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 2133 don't do this when the string is sufficiently long. 2134 2135 ALSO: this processing is disabled when partial matching is requested. 2136 */ 2137 2138 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) { 2139 const UChar* p = start_match + ((first_byte >= 0) ? 1 : 0); 2140 2141 /* We don't need to repeat the search if we haven't yet reached the 2142 place we found it at last time. */ 2143 2144 if (p > req_byte_ptr) { 2145 if (req_byte_caseless) { 2146 while (p < end_subject) { 2147 int pp = *p++; 2148 if (pp == req_byte || pp == req_byte2) { 2149 p--; 2150 break; 2151 } 2152 } 2153 } else { 2154 while (p < end_subject) { 2155 if (*p++ == req_byte) { 2156 p--; 2157 break; 2158 } 2159 } 2160 } 2161 2162 /* If we can't find the required character, break the matching loop */ 2163 2164 if (p >= end_subject) 2165 break; 2166 2167 /* If we have found the required character, save the point where we 2168 found it, so that we don't search again next time round the loop if 2169 the start hasn't passed this character yet. */ 2170 2171 req_byte_ptr = p; 2172 } 2173 } 2174 2161 tryFirstByteOptimization(start_match, end_subject, first_byte, first_byte_caseless, useMultiLineFirstCharOptimization, match_block.start_subject + start_offset); 2162 if (tryRequiredByteOptimization(start_match, end_subject, req_byte, req_byte2, req_byte_caseless, first_byte >= 0, req_byte_ptr)) 2163 break; 2164 2175 2165 /* When a match occurs, substrings will be set for all internal extractions; 2176 2166 we just need to set up the whole thing as substring 0 before returning. If -
trunk/JavaScriptCore/pcre/pcre_internal.h
r28180 r28181 550 550 551 551 struct JSRegExp { 552 pcre_uint32 size; /* Total that was malloced */ 553 pcre_uint32 options; 554 555 pcre_uint16 top_bracket; 556 pcre_uint16 top_backref; 557 pcre_uint16 first_byte; 558 pcre_uint16 req_byte; 552 pcre_uint32 size; /* Total that was malloced */ 553 pcre_uint32 options; 554 555 pcre_uint16 top_bracket; 556 pcre_uint16 top_backref; 557 558 // jsRegExpExecute && jsRegExpCompile currently only how to handle ASCII 559 // chars for thse optimizations, however it would be trivial to add support 560 // for optimized UChar first_byte/req_byte scans 561 unsigned char first_byte; 562 unsigned char req_byte; 559 563 }; 560 564
Note: See TracChangeset
for help on using the changeset viewer.