Changeset 181107 in webkit
- Timestamp:
- Mar 5, 2015 3:47:21 PM (9 years ago)
- Location:
- trunk
- Files:
-
- 5 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/LayoutTests/ChangeLog
r181105 r181107 1 2015-03-05 Benjamin Poulain <bpoulain@apple.com> 2 3 Add basic support for character sets to the URL Filter parser 4 https://bugs.webkit.org/show_bug.cgi?id=142257 5 6 Reviewed by Alex Christensen. 7 8 * http/tests/usercontentfilter/character-set-basic-support-expected.txt: Added. 9 * http/tests/usercontentfilter/character-set-basic-support.html: Added. 10 * http/tests/usercontentfilter/character-set-basic-support.html.json: Added. 11 * http/tests/usercontentfilter/resources/url-blocking-test.js: Added. 12 1 13 2015-03-05 Chris Dumez <cdumez@apple.com> 2 14 -
trunk/Source/WebCore/ChangeLog
r181100 r181107 1 2015-03-05 Benjamin Poulain <bpoulain@apple.com> 2 3 Add basic support for character sets to the URL Filter parser 4 https://bugs.webkit.org/show_bug.cgi?id=142257 5 6 Reviewed by Alex Christensen. 7 8 This patch is a first step toward making the URL filter parser a bit 9 more powerful: it adds support for character set atom. 10 11 I did not attempt to integrate that into the prefix tree in this patch, 12 instead, the GraphBuilder gets a two modes of generating the NFA: 13 PrefixTree and DirectGeneration. 14 15 As long as we only see trivial atoms, we use the PrefixTree generation 16 to minimize the number of nodes we need. As soon as we start a character 17 class, we switch to DirectGeneration and we generate the NFA from the input 18 without merging with previously seen patterns. 19 20 To differentiate between Trivial atoms and CharacterSet, we also gain 21 an AtomType state. 22 23 The character set themself are very simple: each character is represented by 24 a bit in a 16bytes bit vector. 25 26 * contentextensions/URLFilterParser.cpp: 27 (WebCore::ContentExtensions::quantifyTrivialAtom): 28 (WebCore::ContentExtensions::GraphBuilder::atomPatternCharacter): 29 (WebCore::ContentExtensions::GraphBuilder::atomBuiltInCharacterClass): 30 (WebCore::ContentExtensions::GraphBuilder::quantifyAtom): 31 (WebCore::ContentExtensions::GraphBuilder::atomBackReference): 32 (WebCore::ContentExtensions::GraphBuilder::atomCharacterClassBegin): 33 (WebCore::ContentExtensions::GraphBuilder::atomCharacterClassAtom): 34 (WebCore::ContentExtensions::GraphBuilder::atomCharacterClassRange): 35 (WebCore::ContentExtensions::GraphBuilder::atomCharacterClassEnd): 36 (WebCore::ContentExtensions::GraphBuilder::atomCharacterClassBuiltIn): 37 (WebCore::ContentExtensions::GraphBuilder::sinkAtom): 38 (WebCore::ContentExtensions::GraphBuilder::generateTransition): 39 (WebCore::ContentExtensions::GraphBuilder::sinkTrivialAtom): 40 (WebCore::ContentExtensions::GraphBuilder::sinkCharacterSet): 41 (WebCore::ContentExtensions::GraphBuilder::sinkPendingAtomIfNecessary): 42 1 43 2015-03-05 Roger Fong <roger_fong@apple.com> 2 44 -
trunk/Source/WebCore/contentextensions/URLFilterParser.cpp
r178857 r181107 31 31 #include "NFA.h" 32 32 #include <JavaScriptCore/YarrParser.h> 33 #include <wtf/BitVector.h> 33 34 34 35 namespace WebCore { … … 51 52 } 52 53 53 enum class TrivialAtomQuantifier : uint16_t { 54 enum class AtomQuantifier : uint16_t { 55 One = 0, 54 56 ZeroOrOne = 0x1000, 55 Zero ToMany= 0x2000,56 One ToMany= 0x400057 ZeroOrMore = 0x2000, 58 OneOrMore = 0x4000 57 59 }; 58 60 59 static void quantifyTrivialAtom(TrivialAtom& trivialAtom, TrivialAtomQuantifier quantifier)61 static void quantifyTrivialAtom(TrivialAtom& trivialAtom, AtomQuantifier quantifier) 60 62 { 61 63 ASSERT(trivialAtom & (hasNonCharacterMask | characterMask)); … … 64 66 } 65 67 68 static AtomQuantifier trivialAtomQuantifier(TrivialAtom trivialAtom) 69 { 70 switch (trivialAtom & 0xf000) { 71 case static_cast<unsigned>(AtomQuantifier::One): 72 return AtomQuantifier::One; 73 case static_cast<unsigned>(AtomQuantifier::ZeroOrOne): 74 return AtomQuantifier::ZeroOrOne; 75 case static_cast<unsigned>(AtomQuantifier::ZeroOrMore): 76 return AtomQuantifier::ZeroOrMore; 77 case static_cast<unsigned>(AtomQuantifier::OneOrMore): 78 return AtomQuantifier::OneOrMore; 79 } 80 ASSERT_NOT_REACHED(); 81 return AtomQuantifier::One; 82 } 83 66 84 static TrivialAtom trivialAtomForNewlineClassIDBuiltin() 67 85 { … … 70 88 71 89 class GraphBuilder { 72 private:73 90 struct BoundedSubGraph { 74 91 unsigned start; 75 92 unsigned end; 76 93 }; 94 77 95 public: 78 96 GraphBuilder(NFA& nfa, PrefixTreeEntry& prefixTreeRoot, bool patternIsCaseSensitive, uint64_t patternId) … … 105 123 void atomPatternCharacter(UChar character) 106 124 { 125 if (hasError()) 126 return; 127 107 128 if (!isASCII(character)) { 108 129 fail(ASCIILiteral("Only ASCII characters are supported in pattern.")); … … 110 131 } 111 132 112 if (hasError())113 return;114 115 133 sinkPendingAtomIfNecessary(); 134 ASSERT(m_floatingAtomType == FloatingAtomType::Invalid); 135 ASSERT(!m_pendingTrivialAtom); 116 136 117 137 char asciiChararacter = static_cast<char>(character); 118 m_hasValidAtom = true;119 120 ASSERT(m_lastPrefixTreeEntry);121 138 m_pendingTrivialAtom = trivialAtomFromASCIICharacter(asciiChararacter, m_patternIsCaseSensitive); 139 m_floatingAtomType = FloatingAtomType::Trivial; 122 140 } 123 141 … … 128 146 129 147 sinkPendingAtomIfNecessary(); 148 ASSERT(m_floatingAtomType == FloatingAtomType::Invalid); 149 ASSERT(!m_pendingTrivialAtom); 130 150 131 151 if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted) { 132 m_hasValidAtom = true;133 ASSERT(m_lastPrefixTreeEntry);134 152 m_pendingTrivialAtom = trivialAtomForNewlineClassIDBuiltin(); 153 m_floatingAtomType = FloatingAtomType::Trivial; 135 154 } else 136 155 fail(ASCIILiteral("Character class is not supported.")); … … 142 161 return; 143 162 144 ASSERT(m_hasValidAtom);145 if (!m_hasValidAtom) {163 switch (m_floatingAtomType) { 164 case FloatingAtomType::Invalid: 146 165 fail(ASCIILiteral("Quantifier without corresponding atom to quantify.")); 147 return; 148 } 149 150 ASSERT(m_lastPrefixTreeEntry); 151 if (!minimum && maximum == 1) 152 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroOrOne); 153 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) 154 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::ZeroToMany); 155 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite) 156 quantifyTrivialAtom(m_pendingTrivialAtom, TrivialAtomQuantifier::OneToMany); 157 else 158 fail(ASCIILiteral("Arbitrary atom repetitions are not supported.")); 159 } 160 161 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) 166 break; 167 168 case FloatingAtomType::Trivial: 169 if (!minimum && maximum == 1) 170 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::ZeroOrOne); 171 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) 172 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::ZeroOrMore); 173 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite) 174 quantifyTrivialAtom(m_pendingTrivialAtom, AtomQuantifier::OneOrMore); 175 else 176 fail(ASCIILiteral("Arbitrary atom repetitions are not supported.")); 177 break; 178 case FloatingAtomType::CharacterSet: { 179 ASSERT(m_characterSetQuantifier == AtomQuantifier::One); 180 if (!minimum && maximum == 1) 181 m_characterSetQuantifier = AtomQuantifier::ZeroOrOne; 182 else if (!minimum && maximum == JSC::Yarr::quantifyInfinite) 183 m_characterSetQuantifier = AtomQuantifier::ZeroOrMore; 184 else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite) 185 m_characterSetQuantifier = AtomQuantifier::OneOrMore; 186 else 187 fail(ASCIILiteral("Arbitrary character set repetitions are not supported.")); 188 break; 189 } 190 } 191 } 192 193 void atomBackReference(unsigned) 162 194 { 163 195 fail(ASCIILiteral("Patterns cannot contain backreferences.")); 164 ASSERT_NOT_REACHED();165 }166 167 void atomCharacterClassAtom(UChar)168 {169 fail(ASCIILiteral("Character class atoms are not supported yet."));170 196 } 171 197 … … 185 211 } 186 212 187 void atomCharacterClassBegin(bool = false) 188 { 189 fail(ASCIILiteral("Character class atoms are not supported yet.")); 190 } 191 192 void atomCharacterClassRange(UChar, UChar) 193 { 194 fail(ASCIILiteral("Character class ranges are not supported yet.")); 213 void atomCharacterClassBegin(bool inverted = false) 214 { 215 if (hasError()) 216 return; 217 218 sinkPendingAtomIfNecessary(); 219 220 ASSERT_WITH_MESSAGE(!m_pendingCharacterSet.bitCount(), "We should not have nested character classes."); 221 ASSERT(m_floatingAtomType == FloatingAtomType::Invalid); 222 223 m_buildMode = BuildMode::DirectGeneration; 224 m_lastPrefixTreeEntry = nullptr; 225 226 m_isInvertedCharacterSet = inverted; 227 m_floatingAtomType = FloatingAtomType::CharacterSet; 228 } 229 230 void atomCharacterClassAtom(UChar character) 231 { 232 if (hasError()) 233 return; 234 235 ASSERT(m_floatingAtomType == FloatingAtomType::CharacterSet); 236 237 if (!isASCII(character)) { 238 fail(ASCIILiteral("Non ASCII Character in a character set.")); 239 return; 240 } 241 m_pendingCharacterSet.set(character); 242 } 243 244 void atomCharacterClassRange(UChar a, UChar b) 245 { 246 if (hasError()) 247 return; 248 249 ASSERT(m_floatingAtomType == FloatingAtomType::CharacterSet); 250 251 if (!a || !b || !isASCII(a) || !isASCII(b)) { 252 fail(ASCIILiteral("Non ASCII Character in a character range of a character set.")); 253 return; 254 } 255 for (unsigned i = a; i <= b; ++i) 256 m_pendingCharacterSet.set(i); 257 } 258 259 void atomCharacterClassEnd() 260 { 261 // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily. 195 262 } 196 263 197 264 void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool) 198 265 { 199 fail(ASCIILiteral("Buildins character class atoms are not supported yet.")); 200 } 201 202 void atomCharacterClassEnd() 203 { 204 fail(ASCIILiteral("Character class are not supported yet.")); 266 fail(ASCIILiteral("Builtins character class atoms are not supported yet.")); 205 267 } 206 268 … … 240 302 241 303 m_errorMessage = errorMessage; 304 } 305 306 BoundedSubGraph sinkAtom(std::function<void(unsigned, unsigned)> transitionFunction, AtomQuantifier quantifier, unsigned start) 307 { 308 switch (quantifier) { 309 case AtomQuantifier::One: { 310 unsigned newEnd = m_nfa.createNode(); 311 m_nfa.addRuleId(newEnd, m_patternId); 312 transitionFunction(start, newEnd); 313 return { start, newEnd }; 314 } 315 case AtomQuantifier::ZeroOrOne: { 316 unsigned newEnd = m_nfa.createNode(); 317 m_nfa.addRuleId(newEnd, m_patternId); 318 transitionFunction(start, newEnd); 319 return { start, newEnd }; 320 } 321 case AtomQuantifier::ZeroOrMore: { 322 unsigned repeatStart = m_nfa.createNode(); 323 m_nfa.addRuleId(repeatStart, m_patternId); 324 unsigned repeatEnd = m_nfa.createNode(); 325 m_nfa.addRuleId(repeatEnd, m_patternId); 326 327 transitionFunction(repeatStart, repeatEnd); 328 m_nfa.addEpsilonTransition(repeatEnd, repeatStart); 329 330 m_nfa.addEpsilonTransition(start, repeatStart); 331 332 unsigned kleenEnd = m_nfa.createNode(); 333 m_nfa.addRuleId(kleenEnd, m_patternId); 334 m_nfa.addEpsilonTransition(repeatEnd, kleenEnd); 335 m_nfa.addEpsilonTransition(start, kleenEnd); 336 return { start, kleenEnd }; 337 } 338 case AtomQuantifier::OneOrMore: { 339 unsigned repeatStart = m_nfa.createNode(); 340 m_nfa.addRuleId(repeatStart, m_patternId); 341 unsigned repeatEnd = m_nfa.createNode(); 342 m_nfa.addRuleId(repeatEnd, m_patternId); 343 344 transitionFunction(repeatStart, repeatEnd); 345 m_nfa.addEpsilonTransition(repeatEnd, repeatStart); 346 347 m_nfa.addEpsilonTransition(start, repeatStart); 348 349 unsigned afterRepeat = m_nfa.createNode(); 350 m_nfa.addRuleId(afterRepeat, m_patternId); 351 m_nfa.addEpsilonTransition(repeatEnd, afterRepeat); 352 return { start, afterRepeat }; 353 } 354 } 242 355 } 243 356 … … 259 372 BoundedSubGraph sinkTrivialAtom(TrivialAtom trivialAtom, unsigned start) 260 373 { 261 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroOrOne)) { 262 unsigned newEnd = m_nfa.createNode(); 263 m_nfa.addRuleId(newEnd, m_patternId); 264 generateTransition(trivialAtom, start, newEnd); 265 m_nfa.addEpsilonTransition(start, newEnd); 266 return { start, newEnd }; 267 } 268 269 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::ZeroToMany)) { 270 unsigned repeatStart = m_nfa.createNode(); 271 m_nfa.addRuleId(repeatStart, m_patternId); 272 unsigned repeatEnd = m_nfa.createNode(); 273 m_nfa.addRuleId(repeatEnd, m_patternId); 274 275 generateTransition(trivialAtom, repeatStart, repeatEnd); 276 m_nfa.addEpsilonTransition(repeatEnd, repeatStart); 277 278 m_nfa.addEpsilonTransition(start, repeatStart); 279 280 unsigned kleenEnd = m_nfa.createNode(); 281 m_nfa.addRuleId(kleenEnd, m_patternId); 282 m_nfa.addEpsilonTransition(repeatEnd, kleenEnd); 283 m_nfa.addEpsilonTransition(start, kleenEnd); 284 return { start, kleenEnd }; 285 } 286 287 if (trivialAtom & static_cast<uint16_t>(TrivialAtomQuantifier::OneToMany)) { 288 unsigned repeatStart = m_nfa.createNode(); 289 m_nfa.addRuleId(repeatStart, m_patternId); 290 unsigned repeatEnd = m_nfa.createNode(); 291 m_nfa.addRuleId(repeatEnd, m_patternId); 292 293 generateTransition(trivialAtom, repeatStart, repeatEnd); 294 m_nfa.addEpsilonTransition(repeatEnd, repeatStart); 295 296 m_nfa.addEpsilonTransition(start, repeatStart); 297 298 unsigned afterRepeat = m_nfa.createNode(); 299 m_nfa.addRuleId(afterRepeat, m_patternId); 300 m_nfa.addEpsilonTransition(repeatEnd, afterRepeat); 301 return { start, afterRepeat }; 302 } 303 304 unsigned newEnd = m_nfa.createNode(); 305 m_nfa.addRuleId(newEnd, m_patternId); 306 generateTransition(trivialAtom, start, newEnd); 307 return { start, newEnd }; 374 auto transitionFunction = [this, trivialAtom](unsigned source, unsigned target) 375 { 376 generateTransition(trivialAtom, source, target); 377 }; 378 return sinkAtom(transitionFunction, trivialAtomQuantifier(trivialAtom), start); 379 } 380 381 void generateTransition(const BitVector& characterSet, bool isInverted, unsigned source, unsigned target) 382 { 383 ASSERT(characterSet.bitCount()); 384 if (!isInverted) { 385 for (const auto& characterIterator : characterSet.setBits()) { 386 char character = static_cast<char>(characterIterator); 387 if (!m_patternIsCaseSensitive && isASCIIAlpha(character)) { 388 m_nfa.addTransition(source, target, toASCIIUpper(character)); 389 m_nfa.addTransition(source, target, toASCIILower(character)); 390 } else 391 m_nfa.addTransition(source, target, character); 392 } 393 } else { 394 for (unsigned i = 1; i < characterSet.size(); ++i) { 395 if (characterSet.get(i)) 396 continue; 397 char character = static_cast<char>(i); 398 399 if (!m_patternIsCaseSensitive && (characterSet.get(toASCIIUpper(character)) || characterSet.get(toASCIILower(character)))) 400 continue; 401 402 m_nfa.addTransition(source, target, character); 403 } 404 } 405 } 406 407 BoundedSubGraph sinkCharacterSet(const BitVector& characterSet, bool isInverted, unsigned start) 408 { 409 auto transitionFunction = [this, &characterSet, isInverted](unsigned source, unsigned target) 410 { 411 generateTransition(characterSet, isInverted, source, target); 412 }; 413 return sinkAtom(transitionFunction, m_characterSetQuantifier, start); 308 414 } 309 415 310 416 void sinkPendingAtomIfNecessary() 311 417 { 312 ASSERT(m_lastPrefixTreeEntry); 313 314 if (!m_hasValidAtom) 315 return; 316 317 ASSERT(m_pendingTrivialAtom); 318 319 auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom); 320 if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) { 321 m_lastPrefixTreeEntry = nextEntry->value.get(); 322 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId); 323 } else { 324 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>(); 325 326 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode); 327 nextPrefixTreeEntry->nfaNode = newSubGraph.end; 328 329 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry)); 330 ASSERT(addResult.isNewEntry); 331 332 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry; 333 m_newPrefixStaringPoint = m_pendingTrivialAtom; 334 335 m_lastPrefixTreeEntry = addResult.iterator->value.get(); 336 } 337 ASSERT(m_lastPrefixTreeEntry); 338 339 m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode; 418 if (m_floatingAtomType == FloatingAtomType::Invalid) 419 return; 420 421 switch (m_buildMode) { 422 case BuildMode::PrefixTree: { 423 ASSERT(m_lastPrefixTreeEntry); 424 ASSERT_WITH_MESSAGE(m_floatingAtomType == FloatingAtomType::Trivial, "Only trivial atoms are handled with a prefix tree."); 425 426 auto nextEntry = m_lastPrefixTreeEntry->nextPattern.find(m_pendingTrivialAtom); 427 if (nextEntry != m_lastPrefixTreeEntry->nextPattern.end()) { 428 m_lastPrefixTreeEntry = nextEntry->value.get(); 429 m_nfa.addRuleId(m_lastPrefixTreeEntry->nfaNode, m_patternId); 430 } else { 431 std::unique_ptr<PrefixTreeEntry> nextPrefixTreeEntry = std::make_unique<PrefixTreeEntry>(); 432 433 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_lastPrefixTreeEntry->nfaNode); 434 nextPrefixTreeEntry->nfaNode = newSubGraph.end; 435 436 auto addResult = m_lastPrefixTreeEntry->nextPattern.set(m_pendingTrivialAtom, WTF::move(nextPrefixTreeEntry)); 437 ASSERT(addResult.isNewEntry); 438 439 if (!m_newPrefixSubtreeRoot) { 440 m_newPrefixSubtreeRoot = m_lastPrefixTreeEntry; 441 m_newPrefixStaringPoint = m_pendingTrivialAtom; 442 } 443 444 m_lastPrefixTreeEntry = addResult.iterator->value.get(); 445 } 446 m_activeGroup.end = m_lastPrefixTreeEntry->nfaNode; 447 ASSERT(m_lastPrefixTreeEntry); 448 break; 449 } 450 case BuildMode::DirectGeneration: { 451 ASSERT(!m_lastPrefixTreeEntry); 452 453 switch (m_floatingAtomType) { 454 case FloatingAtomType::Invalid: 455 ASSERT_NOT_REACHED(); 456 break; 457 case FloatingAtomType::Trivial: { 458 BoundedSubGraph newSubGraph = sinkTrivialAtom(m_pendingTrivialAtom, m_activeGroup.end); 459 m_activeGroup.end = newSubGraph.end; 460 break; 461 } 462 case FloatingAtomType::CharacterSet: 463 if (!m_pendingCharacterSet.bitCount()) { 464 fail(ASCIILiteral("Empty character set.")); 465 return; 466 } 467 BoundedSubGraph newSubGraph = sinkCharacterSet(m_pendingCharacterSet, m_isInvertedCharacterSet, m_activeGroup.end); 468 m_activeGroup.end = newSubGraph.end; 469 470 m_isInvertedCharacterSet = false; 471 m_characterSetQuantifier = AtomQuantifier::One; 472 m_pendingCharacterSet.clearAll(); 473 break; 474 } 475 break; 476 } 477 } 478 340 479 m_pendingTrivialAtom = 0; 341 m_ hasValidAtom = false;480 m_floatingAtomType = FloatingAtomType::Invalid; 342 481 } 343 482 … … 349 488 350 489 PrefixTreeEntry* m_lastPrefixTreeEntry; 351 bool m_hasValidAtom = false; 490 enum class FloatingAtomType { 491 Invalid, 492 Trivial, 493 CharacterSet 494 }; 495 FloatingAtomType m_floatingAtomType { FloatingAtomType::Invalid }; 352 496 TrivialAtom m_pendingTrivialAtom = 0; 497 498 bool m_isInvertedCharacterSet { false }; 499 BitVector m_pendingCharacterSet { 128 }; 500 AtomQuantifier m_characterSetQuantifier { AtomQuantifier::One }; 501 502 enum class BuildMode { 503 PrefixTree, 504 DirectGeneration 505 }; 506 BuildMode m_buildMode { BuildMode::PrefixTree }; 353 507 354 508 PrefixTreeEntry* m_newPrefixSubtreeRoot = nullptr;
Note: See TracChangeset
for help on using the changeset viewer.