Changeset 85792 in webkit
- Timestamp: May 4, 2011 3:02:34 PM (13 years ago)
- Location: trunk/Tools
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Tools/ChangeLog
r85786  r85792
             1  2011-05-04 Eric Seidel <eric@webkit.org>
             2
             3  Reviewed by Ojan Vafai.
             4
             5  sheriff-bot can't parse results.html
             6  https://bugs.webkit.org/show_bug.cgi?id=60196
             7
             8  The problem was that the NRWT bots are currently core
             9  and so sheriff-bot was trying to diagnose failures on
            10  them. It was trying to parse their results.html file
            11  and BeautifulSoup was throwing an exception.
            12
            13  It seems our version of BeautifulSoup was quite old
            14  and unable to handle <script> tags correctly.
            15  This moves us to 3.2 which doesn't seem to have this bug.
            16
            17  * Scripts/webkitpy/thirdparty/BeautifulSoup.py:
            18
     1      19  2011-05-04 Adam Barth <abarth@webkit.org>
     2      20
-
trunk/Tools/Scripts/webkitpy/thirdparty/BeautifulSoup.py
r55055 r85792 43 43 Here, have some legalese: 44 44 45 Copyright (c) 2004-20 09, Leonard Richardson45 Copyright (c) 2004-2010, Leonard Richardson 46 46 47 47 All rights reserved. … … 80 80 81 81 __author__ = "Leonard Richardson (leonardr@segfault.org)" 82 __version__ = "3. 1.0.1"83 __copyright__ = "Copyright (c) 2004-20 09Leonard Richardson"82 __version__ = "3.2.0" 83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" 84 84 __license__ = "New-style BSD" 85 85 86 from sgmllib import SGMLParser, SGMLParseError 86 87 import codecs 87 88 import markupbase 88 89 import types 89 90 import re 90 from HTMLParser import HTMLParser, HTMLParseError 91 import sgmllib 91 92 try: 92 93 from htmlentitydefs import name2codepoint 93 94 except ImportError: 94 95 name2codepoint = {} 95 96 try: 96 97 set … … 99 100 100 101 #These hacks make Beautiful Soup able to parse XML with namespaces 102 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 101 103 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 102 104 103 105 DEFAULT_OUTPUT_ENCODING = "utf-8" 104 106 107 def _match_css_class(str): 108 """Build a RE to match the given CSS class.""" 109 return re.compile(r"(^|.*\s)%s($|\s)" % str) 110 105 111 # First, the classes that represent markup elements. 
106 112 107 def sob(unicode, encoding): 108 """Returns either the given Unicode string or its encoding.""" 109 if encoding is None: 110 return unicode 111 else: 112 return unicode.encode(encoding) 113 114 class PageElement: 113 class PageElement(object): 115 114 """Contains the navigational information for some part of the page 116 115 (either a tag or a piece of text)""" … … 130 129 def replaceWith(self, replaceWith): 131 130 oldParent = self.parent 132 myIndex = self.parent.contents.index(self) 133 if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: 131 myIndex = self.parent.index(self) 132 if hasattr(replaceWith, "parent")\ 133 and replaceWith.parent is self.parent: 134 134 # We're replacing this element with one of its siblings. 135 index = self.parent.contents.index(replaceWith)135 index = replaceWith.parent.index(replaceWith) 136 136 if index and index < myIndex: 137 137 # Furthermore, it comes before this element. That … … 142 142 oldParent.insert(myIndex, replaceWith) 143 143 144 def replaceWithChildren(self): 145 myParent = self.parent 146 myIndex = self.parent.index(self) 147 self.extract() 148 reversedChildren = list(self.contents) 149 reversedChildren.reverse() 150 for child in reversedChildren: 151 myParent.insert(myIndex, child) 152 144 153 def extract(self): 145 154 """Destructively rips this element out of the tree.""" 146 155 if self.parent: 147 156 try: 148 self.parent.contents.remove(self)157 del self.parent.contents[self.parent.index(self)] 149 158 except ValueError: 150 159 pass … … 179 188 180 189 def insert(self, position, newChild): 181 if (isinstance(newChild, basestring) 182 or isinstance(newChild, unicode)) \ 190 if isinstance(newChild, basestring) \ 183 191 and not isinstance(newChild, NavigableString): 184 192 newChild = NavigableString(newChild) 185 193 186 194 position = min(position, len(self.contents)) 187 if hasattr(newChild, 'parent') and newChild.parent !=None:195 if hasattr(newChild, 'parent') and 
newChild.parent is not None: 188 196 # We're 'inserting' an element that's already one 189 197 # of this object's children. 190 if newChild.parent ==self:191 index = self. find(newChild)192 if index and index <position:198 if newChild.parent is self: 199 index = self.index(newChild) 200 if index > position: 193 201 # Furthermore we're moving it further down the 194 202 # list of this object's children. That means that … … 328 336 if isinstance(name, SoupStrainer): 329 337 strainer = name 338 # (Possibly) special case some findAll*(...) searches 339 elif text is None and not limit and not attrs and not kwargs: 340 # findAll*(True) 341 if name is True: 342 return [element for element in generator() 343 if isinstance(element, Tag)] 344 # findAll*('tag-name') 345 elif isinstance(name, basestring): 346 return [element for element in generator() 347 if isinstance(element, Tag) and 348 element.name == name] 349 else: 350 strainer = SoupStrainer(name, attrs, text, **kwargs) 351 # Build a SoupStrainer 330 352 else: 331 # Build a SoupStrainer332 353 strainer = SoupStrainer(name, attrs, text, **kwargs) 333 354 results = ResultSet(strainer) … … 350 371 def nextGenerator(self): 351 372 i = self 352 while i :373 while i is not None: 353 374 i = i.next 354 375 yield i … … 356 377 def nextSiblingGenerator(self): 357 378 i = self 358 while i :379 while i is not None: 359 380 i = i.nextSibling 360 381 yield i … … 362 383 def previousGenerator(self): 363 384 i = self 364 while i :385 while i is not None: 365 386 i = i.previous 366 387 yield i … … 368 389 def previousSiblingGenerator(self): 369 390 i = self 370 while i :391 while i is not None: 371 392 i = i.previousSibling 372 393 yield i … … 374 395 def parentGenerator(self): 375 396 i = self 376 while i :397 while i is not None: 377 398 i = i.parent 378 399 yield i … … 416 437 417 438 def __getnewargs__(self): 418 return ( unicode(self),)439 return (NavigableString.__str__(self),) 419 440 420 441 def __getattr__(self, attr): … … 
427 448 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 428 449 429 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): 430 return self.decode().encode(encoding) 431 432 def decodeGivenEventualEncoding(self, eventualEncoding): 433 return self 450 def __unicode__(self): 451 return str(self).decode(DEFAULT_OUTPUT_ENCODING) 452 453 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 454 if encoding: 455 return self.encode(encoding) 456 else: 457 return self 434 458 435 459 class CData(NavigableString): 436 460 437 def decodeGivenEventualEncoding(self, eventualEncoding):438 return u'<![CDATA[' + self + u']]>'461 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 462 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) 439 463 440 464 class ProcessingInstruction(NavigableString): 441 442 def decodeGivenEventualEncoding(self, eventualEncoding): 465 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 443 466 output = self 444 if u'%SOUP-ENCODING%'in output:445 output = self.substituteEncoding(output, e ventualEncoding)446 return u'<?' + output + u'?>'467 if "%SOUP-ENCODING%" in output: 468 output = self.substituteEncoding(output, encoding) 469 return "<?%s?>" % self.toEncoding(output, encoding) 447 470 448 471 class Comment(NavigableString): 449 def decodeGivenEventualEncoding(self, eventualEncoding):450 return u'<!--' + self + u'-->'472 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 473 return "<!--%s-->" % NavigableString.__str__(self, encoding) 451 474 452 475 class Declaration(NavigableString): 453 def decodeGivenEventualEncoding(self, eventualEncoding):454 return u'<!' 
+ self + u'>'476 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 477 return "<!%s>" % NavigableString.__str__(self, encoding) 455 478 456 479 class Tag(PageElement): … … 507 530 self.isSelfClosing = parser.isSelfClosingTag(name) 508 531 self.name = name 509 if attrs ==None:532 if attrs is None: 510 533 attrs = [] 534 elif isinstance(attrs, dict): 535 attrs = attrs.items() 511 536 self.attrs = attrs 512 537 self.contents = [] … … 518 543 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 519 544 520 def convert(kval): 521 "Converts HTML, XML and numeric entities in the attribute value." 522 k, val = kval 523 if val is None: 524 return kval 525 return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 526 self._convertEntities, val)) 545 # Convert any HTML, XML, or numeric entities in the attribute values. 546 convert = lambda(k, val): (k, 547 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 548 self._convertEntities, 549 val)) 527 550 self.attrs = map(convert, self.attrs) 551 552 def getString(self): 553 if (len(self.contents) == 1 554 and isinstance(self.contents[0], NavigableString)): 555 return self.contents[0] 556 557 def setString(self, string): 558 """Replace the contents of the tag with a string""" 559 self.clear() 560 self.append(string) 561 562 string = property(getString, setString) 563 564 def getText(self, separator=u""): 565 if not len(self.contents): 566 return u"" 567 stopNode = self._lastRecursiveChild().next 568 strings = [] 569 current = self.contents[0] 570 while current is not stopNode: 571 if isinstance(current, NavigableString): 572 strings.append(current.strip()) 573 current = current.next 574 return separator.join(strings) 575 576 text = property(getText) 528 577 529 578 def get(self, key, default=None): … … 532 581 attribute.""" 533 582 return self._getAttrMap().get(key, default) 583 584 def clear(self): 585 """Extract all children.""" 586 for child in self.contents[:]: 587 child.extract() 588 589 def index(self, element): 590 for i, child 
in enumerate(self.contents): 591 if child is element: 592 return i 593 raise ValueError("Tag.index: element not in tag") 534 594 535 595 def has_key(self, key): … … 601 661 NOTE: right now this will return false if two tags have the 602 662 same attributes in a different order. Should this be fixed?""" 663 if other is self: 664 return True 603 665 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 604 666 return False … … 615 677 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 616 678 """Renders this tag as a string.""" 617 return self.decode(eventualEncoding=encoding) 679 return self.__str__(encoding) 680 681 def __unicode__(self): 682 return self.__str__(None) 618 683 619 684 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" … … 626 691 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 627 692 628 def __unicode__(self): 629 return self.decode() 630 631 def __str__(self): 632 return self.encode() 633 634 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 635 prettyPrint=False, indentLevel=0): 636 return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) 637 638 def decode(self, prettyPrint=False, indentLevel=0, 639 eventualEncoding=DEFAULT_OUTPUT_ENCODING): 693 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 694 prettyPrint=False, indentLevel=0): 640 695 """Returns a string or Unicode representation of this tag and 641 its contents. To get Unicode, pass None for encoding.""" 696 its contents. To get Unicode, pass None for encoding. 
697 698 NOTE: since Python's HTML parser consumes whitespace, this 699 method is not certain to reproduce the whitespace present in 700 the original string.""" 701 702 encodedName = self.toEncoding(self.name, encoding) 642 703 643 704 attrs = [] … … 645 706 for key, val in self.attrs: 646 707 fmt = '%s="%s"' 647 if isString(val): 648 if (self.containsSubstitutions 649 and eventualEncoding is not None 650 and '%SOUP-ENCODING%' in val): 651 val = self.substituteEncoding(val, eventualEncoding) 708 if isinstance(val, basestring): 709 if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 710 val = self.substituteEncoding(val, encoding) 652 711 653 712 # The attribute value either: … … 678 737 # to escape those to XML entities too. 679 738 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 680 if val is None: 681 # Handle boolean attributes. 682 decoded = key 683 else: 684 decoded = fmt % (key, val) 685 attrs.append(decoded) 739 740 attrs.append(fmt % (self.toEncoding(key, encoding), 741 self.toEncoding(val, encoding))) 686 742 close = '' 687 743 closeTag = '' … … 689 745 close = ' /' 690 746 else: 691 closeTag = '</%s>' % self.name747 closeTag = '</%s>' % encodedName 692 748 693 749 indentTag, indentContents = 0, 0 … … 696 752 space = (' ' * (indentTag-1)) 697 753 indentContents = indentTag + 1 698 contents = self.decodeContents(prettyPrint, indentContents, 699 eventualEncoding) 754 contents = self.renderContents(encoding, prettyPrint, indentContents) 700 755 if self.hidden: 701 756 s = contents … … 707 762 if prettyPrint: 708 763 s.append(space) 709 s.append('<%s%s%s>' % ( self.name, attributeString, close))764 s.append('<%s%s%s>' % (encodedName, attributeString, close)) 710 765 if prettyPrint: 711 766 s.append("\n") … … 723 778 def decompose(self): 724 779 """Recursively destroys the contents of this tree.""" 725 contents = [i for i in self.contents]726 for i in contents:727 if isinstance(i, Tag):728 i.decompose()729 else:730 i.extract()731 780 
self.extract() 781 if len(self.contents) == 0: 782 return 783 current = self.contents[0] 784 while current is not None: 785 next = current.next 786 if isinstance(current, Tag): 787 del current.contents[:] 788 current.parent = None 789 current.previous = None 790 current.previousSibling = None 791 current.next = None 792 current.nextSibling = None 793 current = next 732 794 733 795 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 734 return self. encode(encoding, True)735 736 def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,796 return self.__str__(encoding, True) 797 798 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 737 799 prettyPrint=False, indentLevel=0): 738 return self.decodeContents(prettyPrint, indentLevel).encode(encoding)739 740 def decodeContents(self, prettyPrint=False, indentLevel=0,741 eventualEncoding=DEFAULT_OUTPUT_ENCODING):742 800 """Renders the contents of this tag as a string in the given 743 801 encoding. If encoding is None, returns a Unicode string..""" … … 746 804 text = None 747 805 if isinstance(c, NavigableString): 748 text = c. decodeGivenEventualEncoding(eventualEncoding)806 text = c.__str__(encoding) 749 807 elif isinstance(c, Tag): 750 s.append(c. decode(prettyPrint, indentLevel, eventualEncoding))808 s.append(c.__str__(encoding, prettyPrint, indentLevel)) 751 809 if text and prettyPrint: 752 810 text = text.strip() … … 789 847 findChildren = findAll 790 848 791 # Pre-3.x compatibility methods . Will go away in 4.0.849 # Pre-3.x compatibility methods 792 850 first = find 793 851 fetch = findAll … … 798 856 def firstText(self, text=None, recursive=True): 799 857 return self.find(text=text, recursive=recursive) 800 801 # 3.x compatibility methods. 
Will go away in 4.0.802 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,803 prettyPrint=False, indentLevel=0):804 if encoding is None:805 return self.decodeContents(prettyPrint, indentLevel, encoding)806 else:807 return self.encodeContents(encoding, prettyPrint, indentLevel)808 809 858 810 859 #Private methods … … 820 869 821 870 #Generator methods 871 def childGenerator(self): 872 # Just use the iterator from the contents 873 return iter(self.contents) 874 822 875 def recursiveChildGenerator(self): 823 876 if not len(self.contents): … … 829 882 current = current.next 830 883 831 def childGenerator(self):832 if not len(self.contents):833 raise StopIteration834 current = self.contents[0]835 while current:836 yield current837 current = current.nextSibling838 raise StopIteration839 884 840 885 # Next, a couple classes to represent queries and their results. … … 845 890 def __init__(self, name=None, attrs={}, text=None, **kwargs): 846 891 self.name = name 847 if is String(attrs):848 kwargs['class'] = attrs892 if isinstance(attrs, basestring): 893 kwargs['class'] = _match_css_class(attrs) 849 894 attrs = None 850 895 if kwargs: … … 905 950 # If given a list of items, scan it for a text element that 906 951 # matches. 907 if isList(markup) and not isinstance(markup, Tag): 952 if hasattr(markup, "__iter__") \ 953 and not isinstance(markup, Tag): 908 954 for element in markup: 909 955 if isinstance(element, NavigableString) \ … … 918 964 # If it's text, make sure the text matches. 
919 965 elif isinstance(markup, NavigableString) or \ 920 is String(markup):966 isinstance(markup, basestring): 921 967 if self._matches(markup, self.text): 922 968 found = markup … … 929 975 #print "Matching %s against %s" % (markup, matchAgainst) 930 976 result = False 931 if matchAgainst == True and type(matchAgainst) == types.BooleanType:932 result = markup !=None977 if matchAgainst is True: 978 result = markup is not None 933 979 elif callable(matchAgainst): 934 980 result = matchAgainst(markup) … … 938 984 if isinstance(markup, Tag): 939 985 markup = markup.name 940 if markup is not None and not isString(markup):986 if markup and not isinstance(markup, basestring): 941 987 markup = unicode(markup) 942 988 #Now we know that chunk is either a string, or None. … … 944 990 # It's a regexp object. 945 991 result = markup and matchAgainst.search(markup) 946 elif (isList(matchAgainst) 947 and (markup is not None or not isString(matchAgainst))): 992 elif hasattr(matchAgainst, '__iter__'): # list-like 948 993 result = markup in matchAgainst 949 994 elif hasattr(matchAgainst, 'items'): 950 995 result = markup.has_key(matchAgainst) 951 elif matchAgainst and is String(markup):996 elif matchAgainst and isinstance(markup, basestring): 952 997 if isinstance(markup, unicode): 953 998 matchAgainst = unicode(matchAgainst) … … 967 1012 968 1013 # Now, some helper functions. 
969 970 def isList(l):971 """Convenience method that works with all 2.x versions of Python972 to determine whether or not something is listlike."""973 return ((hasattr(l, '__iter__') and not isString(l))974 or (type(l) in (types.ListType, types.TupleType)))975 976 def isString(s):977 """Convenience method that works with all 2.x versions of Python978 to determine whether or not something is stringlike."""979 try:980 return isinstance(s, unicode) or isinstance(s, basestring)981 except NameError:982 return isinstance(s, str)983 1014 984 1015 def buildTagMap(default, *args): … … 992 1023 for k,v in portion.items(): 993 1024 built[k] = v 994 elif isList(portion) and not isString(portion):1025 elif hasattr(portion, '__iter__'): # is a list 995 1026 #It's a list. Map each item to the default. 996 1027 for k in portion: … … 1003 1034 # Now, the parser classes. 1004 1035 1005 class HTMLParserBuilder(HTMLParser): 1006 1007 def __init__(self, soup): 1008 HTMLParser.__init__(self) 1009 self.soup = soup 1010 1011 # We inherit feed() and reset(). 
1012 1013 def handle_starttag(self, name, attrs): 1014 if name == 'meta': 1015 self.soup.extractCharsetFromMeta(attrs) 1016 else: 1017 self.soup.unknown_starttag(name, attrs) 1018 1019 def handle_endtag(self, name): 1020 self.soup.unknown_endtag(name) 1021 1022 def handle_data(self, content): 1023 self.soup.handle_data(content) 1024 1025 def _toStringSubclass(self, text, subclass): 1026 """Adds a certain piece of text to the tree as a NavigableString 1027 subclass.""" 1028 self.soup.endData() 1029 self.handle_data(text) 1030 self.soup.endData(subclass) 1031 1032 def handle_pi(self, text): 1033 """Handle a processing instruction as a ProcessingInstruction 1034 object, possibly one with a %SOUP-ENCODING% slot into which an 1035 encoding will be plugged later.""" 1036 if text[:3] == "xml": 1037 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1038 self._toStringSubclass(text, ProcessingInstruction) 1039 1040 def handle_comment(self, text): 1041 "Handle comments as Comment objects." 1042 self._toStringSubclass(text, Comment) 1043 1044 def handle_charref(self, ref): 1045 "Handle character references as data." 1046 if self.soup.convertEntities: 1047 data = unichr(int(ref)) 1048 else: 1049 data = '&#%s;' % ref 1050 self.handle_data(data) 1051 1052 def handle_entityref(self, ref): 1053 """Handle entity references as data, possibly converting known 1054 HTML and/or XML entity references to the corresponding Unicode 1055 characters.""" 1056 data = None 1057 if self.soup.convertHTMLEntities: 1058 try: 1059 data = unichr(name2codepoint[ref]) 1060 except KeyError: 1061 pass 1062 1063 if not data and self.soup.convertXMLEntities: 1064 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) 1065 1066 if not data and self.soup.convertHTMLEntities and \ 1067 not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): 1068 # TODO: We've got a problem here. We're told this is 1069 # an entity reference, but it's not an XML entity 1070 # reference or an HTML entity reference. 
Nonetheless, 1071 # the logical thing to do is to pass it through as an 1072 # unrecognized entity reference. 1073 # 1074 # Except: when the input is "&carol;" this function 1075 # will be called with input "carol". When the input is 1076 # "AT&T", this function will be called with input 1077 # "T". We have no way of knowing whether a semicolon 1078 # was present originally, so we don't know whether 1079 # this is an unknown entity or just a misplaced 1080 # ampersand. 1081 # 1082 # The more common case is a misplaced ampersand, so I 1083 # escape the ampersand and omit the trailing semicolon. 1084 data = "&%s" % ref 1085 if not data: 1086 # This case is different from the one above, because we 1087 # haven't already gone through a supposedly comprehensive 1088 # mapping of entities to Unicode characters. We might not 1089 # have gone through any mapping at all. So the chances are 1090 # very high that this is a real entity, and not a 1091 # misplaced ampersand. 1092 data = "&%s;" % ref 1093 self.handle_data(data) 1094 1095 def handle_decl(self, data): 1096 "Handle DOCTYPEs and the like as Declaration objects." 1097 self._toStringSubclass(data, Declaration) 1098 1099 def parse_declaration(self, i): 1100 """Treat a bogus SGML declaration as raw data. Treat a CDATA 1101 declaration as a CData object.""" 1102 j = None 1103 if self.rawdata[i:i+9] == '<![CDATA[': 1104 k = self.rawdata.find(']]>', i) 1105 if k == -1: 1106 k = len(self.rawdata) 1107 data = self.rawdata[i+9:k] 1108 j = k+3 1109 self._toStringSubclass(data, CData) 1110 else: 1111 try: 1112 j = HTMLParser.parse_declaration(self, i) 1113 except HTMLParseError: 1114 toHandle = self.rawdata[i:] 1115 self.handle_data(toHandle) 1116 j = i + len(toHandle) 1117 return j 1118 1119 1120 class BeautifulStoneSoup(Tag): 1036 class BeautifulStoneSoup(Tag, SGMLParser): 1121 1037 1122 1038 """This class contains the basic parser and search code. 
It defines … … 1164 1080 def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, 1165 1081 markupMassage=True, smartQuotesTo=XML_ENTITIES, 1166 convertEntities=None, selfClosingTags=None, isHTML=False, 1167 builder=HTMLParserBuilder): 1082 convertEntities=None, selfClosingTags=None, isHTML=False): 1168 1083 """The Soup object is initialized as the 'root tag', and the 1169 1084 provided markup (which can be a string or a file-like object) 1170 1085 is fed into the underlying parser. 1171 1086 1172 HTMLParserwill process most bad HTML, and the BeautifulSoup1087 sgmllib will process most bad HTML, and the BeautifulSoup 1173 1088 class has some tricks for dealing with some HTML that kills 1174 HTMLParser, but Beautiful Soup can nonetheless choke or lose data1089 sgmllib, but Beautiful Soup can nonetheless choke or lose data 1175 1090 if your data uses self-closing tags or declarations 1176 1091 incorrectly. … … 1182 1097 1183 1098 The default parser massage techniques fix the two most common 1184 instances of invalid HTML that choke HTMLParser:1099 instances of invalid HTML that choke sgmllib: 1185 1100 1186 1101 <br/> (No space between name of closing tag and tag close) … … 1220 1135 1221 1136 self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) 1222 self.builder = builder(self) 1223 self.reset() 1137 SGMLParser.__init__(self) 1224 1138 1225 1139 if hasattr(markup, 'read'): # It's a file-type object. … … 1231 1145 except StopParsing: 1232 1146 pass 1233 self.markup = None # The markup can now be GCed. 1234 self.builder = None # So can the builder. 
1147 self.markup = None # The markup can now be GCed 1148 1149 def convert_charref(self, name): 1150 """This method fixes a bug in Python's SGMLParser.""" 1151 try: 1152 n = int(name) 1153 except ValueError: 1154 return 1155 if not 0 <= n <= 127 : # ASCII ends at 127, not 255 1156 return 1157 return self.convert_codepoint(n) 1235 1158 1236 1159 def _feed(self, inDocumentEncoding=None, isHTML=False): … … 1249 1172 if markup: 1250 1173 if self.markupMassage: 1251 if not isList(self.markupMassage):1174 if not hasattr(self.markupMassage, "__iter__"): 1252 1175 self.markupMassage = self.MARKUP_MASSAGE 1253 1176 for fix, m in self.markupMassage: … … 1259 1182 # might cause problems. 1260 1183 del(self.markupMassage) 1261 self. builder.reset()1262 1263 self.builder.feed(markup)1184 self.reset() 1185 1186 SGMLParser.feed(self, markup) 1264 1187 # Close out any unfinished strings and close all the open tags. 1265 1188 self.endData() 1266 1189 while self.currentTag.name != self.ROOT_TAG_NAME: 1267 1190 self.popTag() 1191 1192 def __getattr__(self, methodName): 1193 """This method routes method call requests to either the SGMLParser 1194 superclass or the Tag superclass, depending on the method name.""" 1195 #print "__getattr__ called on %s.%s" % (self.__class__, methodName) 1196 1197 if methodName.startswith('start_') or methodName.startswith('end_') \ 1198 or methodName.startswith('do_'): 1199 return SGMLParser.__getattr__(self, methodName) 1200 elif not methodName.startswith('__'): 1201 return Tag.__getattr__(self, methodName) 1202 else: 1203 raise AttributeError 1268 1204 1269 1205 def isSelfClosingTag(self, name): … … 1276 1212 Tag.__init__(self, self, self.ROOT_TAG_NAME) 1277 1213 self.hidden = 1 1278 self.builder.reset()1214 SGMLParser.reset(self) 1279 1215 self.currentData = [] 1280 1216 self.currentTag = None … … 1285 1221 def popTag(self): 1286 1222 tag = self.tagStack.pop() 1287 # Tags with just one string-owning child get the child as a1288 # 'string' property, so 
that soup.tag.string is shorthand for1289 # soup.tag.contents[0]1290 if len(self.currentTag.contents) == 1 and \1291 isinstance(self.currentTag.contents[0], NavigableString):1292 self.currentTag.string = self.currentTag.contents[0]1293 1223 1294 1224 #print "Pop", tag.name … … 1379 1309 popTo = name 1380 1310 break 1381 if (nestingResetTriggers !=None1311 if (nestingResetTriggers is not None 1382 1312 and p.name in nestingResetTriggers) \ 1383 or (nestingResetTriggers ==None and isResetNesting1313 or (nestingResetTriggers is None and isResetNesting 1384 1314 and self.RESET_NESTING_TAGS.has_key(p.name)): 1385 1315 … … 1400 1330 #This is not a real tag. 1401 1331 #print "<%s> is not real!" % name 1402 attrs = ''.join( map(lambda(x, y): ' %s="%s"' % (x, y), attrs))1332 attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) 1403 1333 self.handle_data('<%s%s>' % (name, attrs)) 1404 1334 return … … 1441 1371 self.currentData.append(data) 1442 1372 1443 def extractCharsetFromMeta(self, attrs): 1444 self.unknown_starttag('meta', attrs) 1445 1373 def _toStringSubclass(self, text, subclass): 1374 """Adds a certain piece of text to the tree as a NavigableString 1375 subclass.""" 1376 self.endData() 1377 self.handle_data(text) 1378 self.endData(subclass) 1379 1380 def handle_pi(self, text): 1381 """Handle a processing instruction as a ProcessingInstruction 1382 object, possibly one with a %SOUP-ENCODING% slot into which an 1383 encoding will be plugged later.""" 1384 if text[:3] == "xml": 1385 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1386 self._toStringSubclass(text, ProcessingInstruction) 1387 1388 def handle_comment(self, text): 1389 "Handle comments as Comment objects." 1390 self._toStringSubclass(text, Comment) 1391 1392 def handle_charref(self, ref): 1393 "Handle character references as data." 
1394 if self.convertEntities: 1395 data = unichr(int(ref)) 1396 else: 1397 data = '&#%s;' % ref 1398 self.handle_data(data) 1399 1400 def handle_entityref(self, ref): 1401 """Handle entity references as data, possibly converting known 1402 HTML and/or XML entity references to the corresponding Unicode 1403 characters.""" 1404 data = None 1405 if self.convertHTMLEntities: 1406 try: 1407 data = unichr(name2codepoint[ref]) 1408 except KeyError: 1409 pass 1410 1411 if not data and self.convertXMLEntities: 1412 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) 1413 1414 if not data and self.convertHTMLEntities and \ 1415 not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): 1416 # TODO: We've got a problem here. We're told this is 1417 # an entity reference, but it's not an XML entity 1418 # reference or an HTML entity reference. Nonetheless, 1419 # the logical thing to do is to pass it through as an 1420 # unrecognized entity reference. 1421 # 1422 # Except: when the input is "&carol;" this function 1423 # will be called with input "carol". When the input is 1424 # "AT&T", this function will be called with input 1425 # "T". We have no way of knowing whether a semicolon 1426 # was present originally, so we don't know whether 1427 # this is an unknown entity or just a misplaced 1428 # ampersand. 1429 # 1430 # The more common case is a misplaced ampersand, so I 1431 # escape the ampersand and omit the trailing semicolon. 1432 data = "&%s" % ref 1433 if not data: 1434 # This case is different from the one above, because we 1435 # haven't already gone through a supposedly comprehensive 1436 # mapping of entities to Unicode characters. We might not 1437 # have gone through any mapping at all. So the chances are 1438 # very high that this is a real entity, and not a 1439 # misplaced ampersand. 1440 data = "&%s;" % ref 1441 self.handle_data(data) 1442 1443 def handle_decl(self, data): 1444 "Handle DOCTYPEs and the like as Declaration objects." 
1445 self._toStringSubclass(data, Declaration) 1446 1447 def parse_declaration(self, i): 1448 """Treat a bogus SGML declaration as raw data. Treat a CDATA 1449 declaration as a CData object.""" 1450 j = None 1451 if self.rawdata[i:i+9] == '<![CDATA[': 1452 k = self.rawdata.find(']]>', i) 1453 if k == -1: 1454 k = len(self.rawdata) 1455 data = self.rawdata[i+9:k] 1456 j = k+3 1457 self._toStringSubclass(data, CData) 1458 else: 1459 try: 1460 j = SGMLParser.parse_declaration(self, i) 1461 except SGMLParseError: 1462 toHandle = self.rawdata[i:] 1463 self.handle_data(toHandle) 1464 j = i + len(toHandle) 1465 return j 1446 1466 1447 1467 class BeautifulSoup(BeautifulStoneSoup): … … 1500 1520 1501 1521 SELF_CLOSING_TAGS = buildTagMap(None, 1502 ['br' , 'hr', 'input', 'img', 'meta',1503 'spacer', 'link', 'frame', 'base' ])1522 ('br' , 'hr', 'input', 'img', 'meta', 1523 'spacer', 'link', 'frame', 'base', 'col')) 1504 1524 1505 1525 PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) … … 1510 1530 #contain another tag of the same type. Furthermore, it's common 1511 1531 #to actually use these tags this way. 1512 NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',1513 'center' ]1532 NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 1533 'center') 1514 1534 1515 1535 #According to the HTML standard, these block tags can contain 1516 1536 #another tag of the same type. Furthermore, it's common 1517 1537 #to actually use these tags this way. 1518 NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']1538 NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') 1519 1539 1520 1540 #Lists can contain other lists, but there are restrictions. 
… … 1536 1556 } 1537 1557 1538 NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']1558 NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') 1539 1559 1540 1560 #If one of these tags is encountered, all tags up to the next tag of … … 1551 1571 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 1552 1572 1553 def extractCharsetFromMeta(self, attrs):1573 def start_meta(self, attrs): 1554 1574 """Beautiful Soup can detect a charset included in a META tag, 1555 1575 try to convert the document to that charset, and re-parse the … … 1598 1618 tag.containsSubstitutions = True 1599 1619 1600 1601 1620 class StopParsing(Exception): 1602 1621 pass … … 1628 1647 1629 1648 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ 1630 ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',1649 ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 1631 1650 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 1632 'big' ]1633 1634 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']1651 'big') 1652 1653 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) 1635 1654 1636 1655 NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, … … 1779 1798 if not u: self.originalEncoding = None 1780 1799 1781 def _subMSChar(self, match):1800 def _subMSChar(self, orig): 1782 1801 """Changes a MS smart quote character to an XML or HTML 1783 1802 entity.""" 1784 orig = match.group(1)1785 1803 sub = self.MS_CHARS.get(orig) 1786 if type(sub) == types.TupleType:1804 if isinstance(sub, tuple): 1787 1805 if self.smartQuotesTo == 'xml': 1788 sub = '&#x '.encode() + sub[1].encode() + ';'.encode()1806 sub = '&#x%s;' % sub[1] 1789 1807 else: 1790 sub = '&'.encode() + sub[0].encode() + ';'.encode() 1791 else: 1792 sub = sub.encode() 1808 sub = '&%s;' % sub[0] 1793 1809 return sub 1794 1810 … … 1805 1821 "iso-8859-1", 1806 1822 "iso-8859-2"): 1807 smart_quotes_re = "([\x80-\x9f])"1808 smart_quotes_compiled = re.compile(smart_quotes_re)1809 markup = 
smart_quotes_compiled.sub(self._subMSChar,markup)1823 markup = re.compile("([\x80-\x9f])").sub \ 1824 (lambda(x): self._subMSChar(x.group(1)), 1825 markup) 1810 1826 1811 1827 try: … … 1896 1912 except: 1897 1913 xml_encoding_match = None 1898 xml_encoding_ re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()1899 xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)1914 xml_encoding_match = re.compile( 1915 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) 1900 1916 if not xml_encoding_match and isHTML: 1901 meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() 1902 regexp = re.compile(meta_re, re.I) 1917 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) 1903 1918 xml_encoding_match = regexp.search(xml_data) 1904 1919 if xml_encoding_match is not None: 1905 xml_encoding = xml_encoding_match.groups()[0].decode( 1906 'ascii').lower() 1920 xml_encoding = xml_encoding_match.groups()[0].lower() 1907 1921 if isHTML: 1908 1922 self.declaredHTMLEncoding = xml_encoding
Note: See TracChangeset for help on using the changeset viewer.