Changeset 85792 in webkit


Ignore:
Timestamp:
May 4, 2011 3:02:34 PM (13 years ago)
Author:
eric@webkit.org
Message:

2011-05-04 Eric Seidel <eric@webkit.org>

Reviewed by Ojan Vafai.

sheriff-bot can't parse results.html
https://bugs.webkit.org/show_bug.cgi?id=60196

The problem was that the NRWT bots are currently core
and so sheriff-bot was trying to diagnose failures on
them. It was trying to parse their results.html file
and BeautifulSoup was throwing an exception.

It seems our version of BeautifulSoup was quite old
and unable to handle <script> tags correctly.
This moves us to 3.2 which doesn't seem to have this bug.

  • Scripts/webkitpy/thirdparty/BeautifulSoup.py:
Location:
trunk/Tools
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/Tools/ChangeLog

    r85786 r85792  
     1 2011-05-04  Eric Seidel  <eric@webkit.org>
     2
     3        Reviewed by Ojan Vafai.
     4
     5        sheriff-bot can't parse results.html
     6        https://bugs.webkit.org/show_bug.cgi?id=60196
     7
     8        The problem was that the NRWT bots are currently core
     9        and so sheriff-bot was trying to diagnose failures on
     10        them.  It was trying to parse their results.html file
     11        and BeautifulSoup was throwing an exception.
     12
     13        It seems our version of BeautifulSoup was quite old
     14        and unable to handle <script> tags correctly.
     15        This moves us to 3.2 which doesn't seem to have this bug.
     16
     17        * Scripts/webkitpy/thirdparty/BeautifulSoup.py:
     18
    1 19 2011-05-04  Adam Barth  <abarth@webkit.org>
    2 20
  • trunk/Tools/Scripts/webkitpy/thirdparty/BeautifulSoup.py

    r55055 r85792  
    4343Here, have some legalese:
    4444
    45 Copyright (c) 2004-2009, Leonard Richardson
     45Copyright (c) 2004-2010, Leonard Richardson
    4646
    4747All rights reserved.
     
    8080
    8181__author__ = "Leonard Richardson (leonardr@segfault.org)"
    82 __version__ = "3.1.0.1"
    83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
     82__version__ = "3.2.0"
     83__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
    8484__license__ = "New-style BSD"
    8585
     86from sgmllib import SGMLParser, SGMLParseError
    8687import codecs
    8788import markupbase
    8889import types
    8990import re
    90 from HTMLParser import HTMLParser, HTMLParseError
     91import sgmllib
    9192try:
    92     from htmlentitydefs import name2codepoint
     93  from htmlentitydefs import name2codepoint
    9394except ImportError:
    94     name2codepoint = {}
     95  name2codepoint = {}
    9596try:
    9697    set
     
    99100
    100101#These hacks make Beautiful Soup able to parse XML with namespaces
     102sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    101103markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
    102104
    103105DEFAULT_OUTPUT_ENCODING = "utf-8"
    104106
     107def _match_css_class(str):
     108    """Build a RE to match the given CSS class."""
     109    return re.compile(r"(^|.*\s)%s($|\s)" % str)
     110
    105111# First, the classes that represent markup elements.
    106112
    107 def sob(unicode, encoding):
    108     """Returns either the given Unicode string or its encoding."""
    109     if encoding is None:
    110         return unicode
    111     else:
    112         return unicode.encode(encoding)
    113 
    114 class PageElement:
     113class PageElement(object):
    115114    """Contains the navigational information for some part of the page
    116115    (either a tag or a piece of text)"""
     
    130129    def replaceWith(self, replaceWith):
    131130        oldParent = self.parent
    132         myIndex = self.parent.contents.index(self)
    133         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
     131        myIndex = self.parent.index(self)
     132        if hasattr(replaceWith, "parent")\
     133                  and replaceWith.parent is self.parent:
    134134            # We're replacing this element with one of its siblings.
    135             index = self.parent.contents.index(replaceWith)
     135            index = replaceWith.parent.index(replaceWith)
    136136            if index and index < myIndex:
    137137                # Furthermore, it comes before this element. That
     
    142142        oldParent.insert(myIndex, replaceWith)
    143143
     144    def replaceWithChildren(self):
     145        myParent = self.parent
     146        myIndex = self.parent.index(self)
     147        self.extract()
     148        reversedChildren = list(self.contents)
     149        reversedChildren.reverse()
     150        for child in reversedChildren:
     151            myParent.insert(myIndex, child)
     152
    144153    def extract(self):
    145154        """Destructively rips this element out of the tree."""
    146155        if self.parent:
    147156            try:
    148                 self.parent.contents.remove(self)
     157                del self.parent.contents[self.parent.index(self)]
    149158            except ValueError:
    150159                pass
     
    179188
    180189    def insert(self, position, newChild):
    181         if (isinstance(newChild, basestring)
    182             or isinstance(newChild, unicode)) \
     190        if isinstance(newChild, basestring) \
    183191            and not isinstance(newChild, NavigableString):
    184192            newChild = NavigableString(newChild)
    185193
    186194        position =  min(position, len(self.contents))
    187         if hasattr(newChild, 'parent') and newChild.parent != None:
     195        if hasattr(newChild, 'parent') and newChild.parent is not None:
    188196            # We're 'inserting' an element that's already one
    189197            # of this object's children.
    190             if newChild.parent == self:
    191                 index = self.find(newChild)
    192                 if index and index < position:
     198            if newChild.parent is self:
     199                index = self.index(newChild)
     200                if index > position:
    193201                    # Furthermore we're moving it further down the
    194202                    # list of this object's children. That means that
     
    328336        if isinstance(name, SoupStrainer):
    329337            strainer = name
     338        # (Possibly) special case some findAll*(...) searches
     339        elif text is None and not limit and not attrs and not kwargs:
     340            # findAll*(True)
     341            if name is True:
     342                return [element for element in generator()
     343                        if isinstance(element, Tag)]
     344            # findAll*('tag-name')
     345            elif isinstance(name, basestring):
     346                return [element for element in generator()
     347                        if isinstance(element, Tag) and
     348                        element.name == name]
     349            else:
     350                strainer = SoupStrainer(name, attrs, text, **kwargs)
     351        # Build a SoupStrainer
    330352        else:
    331             # Build a SoupStrainer
    332353            strainer = SoupStrainer(name, attrs, text, **kwargs)
    333354        results = ResultSet(strainer)
     
    350371    def nextGenerator(self):
    351372        i = self
    352         while i:
     373        while i is not None:
    353374            i = i.next
    354375            yield i
     
    356377    def nextSiblingGenerator(self):
    357378        i = self
    358         while i:
     379        while i is not None:
    359380            i = i.nextSibling
    360381            yield i
     
    362383    def previousGenerator(self):
    363384        i = self
    364         while i:
     385        while i is not None:
    365386            i = i.previous
    366387            yield i
     
    368389    def previousSiblingGenerator(self):
    369390        i = self
    370         while i:
     391        while i is not None:
    371392            i = i.previousSibling
    372393            yield i
     
    374395    def parentGenerator(self):
    375396        i = self
    376         while i:
     397        while i is not None:
    377398            i = i.parent
    378399            yield i
     
    416437
    417438    def __getnewargs__(self):
    418         return (unicode(self),)
     439        return (NavigableString.__str__(self),)
    419440
    420441    def __getattr__(self, attr):
     
    427448            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
    428449
    429     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
    430         return self.decode().encode(encoding)
    431 
    432     def decodeGivenEventualEncoding(self, eventualEncoding):
    433         return self
     450    def __unicode__(self):
     451        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
     452
     453    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
     454        if encoding:
     455            return self.encode(encoding)
     456        else:
     457            return self
    434458
    435459class CData(NavigableString):
    436460
    437     def decodeGivenEventualEncoding(self, eventualEncoding):
    438         return u'<![CDATA[' + self + u']]>'
     461    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
     462        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
    439463
    440464class ProcessingInstruction(NavigableString):
    441 
    442     def decodeGivenEventualEncoding(self, eventualEncoding):
     465    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    443466        output = self
    444         if u'%SOUP-ENCODING%' in output:
    445             output = self.substituteEncoding(output, eventualEncoding)
    446         return u'<?' + output + u'?>'
     467        if "%SOUP-ENCODING%" in output:
     468            output = self.substituteEncoding(output, encoding)
     469        return "<?%s?>" % self.toEncoding(output, encoding)
    447470
    448471class Comment(NavigableString):
    449     def decodeGivenEventualEncoding(self, eventualEncoding):
    450         return u'<!--' + self + u'-->'
     472    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
     473        return "<!--%s-->" % NavigableString.__str__(self, encoding)
    451474
    452475class Declaration(NavigableString):
    453     def decodeGivenEventualEncoding(self, eventualEncoding):
    454         return u'<!' + self + u'>'
     476    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
     477        return "<!%s>" % NavigableString.__str__(self, encoding)
    455478
    456479class Tag(PageElement):
     
    507530        self.isSelfClosing = parser.isSelfClosingTag(name)
    508531        self.name = name
    509         if attrs == None:
     532        if attrs is None:
    510533            attrs = []
     534        elif isinstance(attrs, dict):
     535            attrs = attrs.items()
    511536        self.attrs = attrs
    512537        self.contents = []
     
    518543        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
    519544
    520         def convert(kval):
    521             "Converts HTML, XML and numeric entities in the attribute value."
    522             k, val = kval
    523             if val is None:
    524                 return kval
    525             return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
    526                               self._convertEntities, val))
     545        # Convert any HTML, XML, or numeric entities in the attribute values.
     546        convert = lambda(k, val): (k,
     547                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
     548                                          self._convertEntities,
     549                                          val))
    527550        self.attrs = map(convert, self.attrs)
     551
     552    def getString(self):
     553        if (len(self.contents) == 1
     554            and isinstance(self.contents[0], NavigableString)):
     555            return self.contents[0]
     556
     557    def setString(self, string):
     558        """Replace the contents of the tag with a string"""
     559        self.clear()
     560        self.append(string)
     561
     562    string = property(getString, setString)
     563
     564    def getText(self, separator=u""):
     565        if not len(self.contents):
     566            return u""
     567        stopNode = self._lastRecursiveChild().next
     568        strings = []
     569        current = self.contents[0]
     570        while current is not stopNode:
     571            if isinstance(current, NavigableString):
     572                strings.append(current.strip())
     573            current = current.next
     574        return separator.join(strings)
     575
     576    text = property(getText)
    528577
    529578    def get(self, key, default=None):
     
    532581        attribute."""
    533582        return self._getAttrMap().get(key, default)
     583
     584    def clear(self):
     585        """Extract all children."""
     586        for child in self.contents[:]:
     587            child.extract()
     588
     589    def index(self, element):
     590        for i, child in enumerate(self.contents):
     591            if child is element:
     592                return i
     593        raise ValueError("Tag.index: element not in tag")
    534594
    535595    def has_key(self, key):
     
    601661        NOTE: right now this will return false if two tags have the
    602662        same attributes in a different order. Should this be fixed?"""
     663        if other is self:
     664            return True
    603665        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
    604666            return False
     
    615677    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    616678        """Renders this tag as a string."""
    617         return self.decode(eventualEncoding=encoding)
     679        return self.__str__(encoding)
     680
     681    def __unicode__(self):
     682        return self.__str__(None)
    618683
    619684    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
     
    626691        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
    627692
    628     def __unicode__(self):
    629         return self.decode()
    630 
    631     def __str__(self):
    632         return self.encode()
    633 
    634     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
    635                prettyPrint=False, indentLevel=0):
    636         return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
    637 
    638     def decode(self, prettyPrint=False, indentLevel=0,
    639                eventualEncoding=DEFAULT_OUTPUT_ENCODING):
     693    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
     694                prettyPrint=False, indentLevel=0):
    640695        """Returns a string or Unicode representation of this tag and
    641         its contents. To get Unicode, pass None for encoding."""
     696        its contents. To get Unicode, pass None for encoding.
     697
     698        NOTE: since Python's HTML parser consumes whitespace, this
     699        method is not certain to reproduce the whitespace present in
     700        the original string."""
     701
     702        encodedName = self.toEncoding(self.name, encoding)
    642703
    643704        attrs = []
     
    645706            for key, val in self.attrs:
    646707                fmt = '%s="%s"'
    647                 if isString(val):
    648                     if (self.containsSubstitutions
    649                         and eventualEncoding is not None
    650                         and '%SOUP-ENCODING%' in val):
    651                         val = self.substituteEncoding(val, eventualEncoding)
     708                if isinstance(val, basestring):
     709                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
     710                        val = self.substituteEncoding(val, encoding)
    652711
    653712                    # The attribute value either:
     
    678737                    # to escape those to XML entities too.
    679738                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
    680                 if val is None:
    681                     # Handle boolean attributes.
    682                     decoded = key
    683                 else:
    684                     decoded = fmt % (key, val)
    685                 attrs.append(decoded)
     739
     740                attrs.append(fmt % (self.toEncoding(key, encoding),
     741                                    self.toEncoding(val, encoding)))
    686742        close = ''
    687743        closeTag = ''
     
    689745            close = ' /'
    690746        else:
    691             closeTag = '</%s>' % self.name
     747            closeTag = '</%s>' % encodedName
    692748
    693749        indentTag, indentContents = 0, 0
     
    696752            space = (' ' * (indentTag-1))
    697753            indentContents = indentTag + 1
    698         contents = self.decodeContents(prettyPrint, indentContents,
    699                                        eventualEncoding)
     754        contents = self.renderContents(encoding, prettyPrint, indentContents)
    700755        if self.hidden:
    701756            s = contents
     
    707762            if prettyPrint:
    708763                s.append(space)
    709             s.append('<%s%s%s>' % (self.name, attributeString, close))
     764            s.append('<%s%s%s>' % (encodedName, attributeString, close))
    710765            if prettyPrint:
    711766                s.append("\n")
     
    723778    def decompose(self):
    724779        """Recursively destroys the contents of this tree."""
    725         contents = [i for i in self.contents]
    726         for i in contents:
    727             if isinstance(i, Tag):
    728                 i.decompose()
    729             else:
    730                 i.extract()
    731780        self.extract()
     781        if len(self.contents) == 0:
     782            return
     783        current = self.contents[0]
     784        while current is not None:
     785            next = current.next
     786            if isinstance(current, Tag):
     787                del current.contents[:]
     788            current.parent = None
     789            current.previous = None
     790            current.previousSibling = None
     791            current.next = None
     792            current.nextSibling = None
     793            current = next
    732794
    733795    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    734         return self.encode(encoding, True)
    735 
    736     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
     796        return self.__str__(encoding, True)
     797
     798    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
    737799                       prettyPrint=False, indentLevel=0):
    738         return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
    739 
    740     def decodeContents(self, prettyPrint=False, indentLevel=0,
    741                        eventualEncoding=DEFAULT_OUTPUT_ENCODING):
    742800        """Renders the contents of this tag as a string in the given
    743801        encoding. If encoding is None, returns a Unicode string.."""
     
    746804            text = None
    747805            if isinstance(c, NavigableString):
    748                 text = c.decodeGivenEventualEncoding(eventualEncoding)
     806                text = c.__str__(encoding)
    749807            elif isinstance(c, Tag):
    750                 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
     808                s.append(c.__str__(encoding, prettyPrint, indentLevel))
    751809            if text and prettyPrint:
    752810                text = text.strip()
     
    789847    findChildren = findAll
    790848
    791     # Pre-3.x compatibility methods. Will go away in 4.0.
     849    # Pre-3.x compatibility methods
    792850    first = find
    793851    fetch = findAll
     
    798856    def firstText(self, text=None, recursive=True):
    799857        return self.find(text=text, recursive=recursive)
    800 
    801     # 3.x compatibility methods. Will go away in 4.0.
    802     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
    803                        prettyPrint=False, indentLevel=0):
    804         if encoding is None:
    805             return self.decodeContents(prettyPrint, indentLevel, encoding)
    806         else:
    807             return self.encodeContents(encoding, prettyPrint, indentLevel)
    808 
    809858
    810859    #Private methods
     
    820869
    821870    #Generator methods
     871    def childGenerator(self):
     872        # Just use the iterator from the contents
     873        return iter(self.contents)
     874
    822875    def recursiveChildGenerator(self):
    823876        if not len(self.contents):
     
    829882            current = current.next
    830883
    831     def childGenerator(self):
    832         if not len(self.contents):
    833             raise StopIteration
    834         current = self.contents[0]
    835         while current:
    836             yield current
    837             current = current.nextSibling
    838         raise StopIteration
    839884
    840885# Next, a couple classes to represent queries and their results.
     
    845890    def __init__(self, name=None, attrs={}, text=None, **kwargs):
    846891        self.name = name
    847         if isString(attrs):
    848             kwargs['class'] = attrs
     892        if isinstance(attrs, basestring):
     893            kwargs['class'] = _match_css_class(attrs)
    849894            attrs = None
    850895        if kwargs:
     
    905950        # If given a list of items, scan it for a text element that
    906951        # matches.
    907         if isList(markup) and not isinstance(markup, Tag):
     952        if hasattr(markup, "__iter__") \
     953                and not isinstance(markup, Tag):
    908954            for element in markup:
    909955                if isinstance(element, NavigableString) \
     
    918964        # If it's text, make sure the text matches.
    919965        elif isinstance(markup, NavigableString) or \
    920                  isString(markup):
     966                 isinstance(markup, basestring):
    921967            if self._matches(markup, self.text):
    922968                found = markup
     
    929975        #print "Matching %s against %s" % (markup, matchAgainst)
    930976        result = False
    931         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
    932             result = markup != None
     977        if matchAgainst is True:
     978            result = markup is not None
    933979        elif callable(matchAgainst):
    934980            result = matchAgainst(markup)
     
    938984            if isinstance(markup, Tag):
    939985                markup = markup.name
    940             if markup is not None and not isString(markup):
     986            if markup and not isinstance(markup, basestring):
    941987                markup = unicode(markup)
    942988            #Now we know that chunk is either a string, or None.
     
    944990                # It's a regexp object.
    945991                result = markup and matchAgainst.search(markup)
    946             elif (isList(matchAgainst)
    947                   and (markup is not None or not isString(matchAgainst))):
     992            elif hasattr(matchAgainst, '__iter__'): # list-like
    948993                result = markup in matchAgainst
    949994            elif hasattr(matchAgainst, 'items'):
    950995                result = markup.has_key(matchAgainst)
    951             elif matchAgainst and isString(markup):
     996            elif matchAgainst and isinstance(markup, basestring):
    952997                if isinstance(markup, unicode):
    953998                    matchAgainst = unicode(matchAgainst)
     
    9671012
    9681013# Now, some helper functions.
    969 
    970 def isList(l):
    971     """Convenience method that works with all 2.x versions of Python
    972     to determine whether or not something is listlike."""
    973     return ((hasattr(l, '__iter__') and not isString(l))
    974             or (type(l) in (types.ListType, types.TupleType)))
    975 
    976 def isString(s):
    977     """Convenience method that works with all 2.x versions of Python
    978     to determine whether or not something is stringlike."""
    979     try:
    980         return isinstance(s, unicode) or isinstance(s, basestring)
    981     except NameError:
    982         return isinstance(s, str)
    9831014
    9841015def buildTagMap(default, *args):
     
    9921023            for k,v in portion.items():
    9931024                built[k] = v
    994         elif isList(portion) and not isString(portion):
     1025        elif hasattr(portion, '__iter__'): # is a list
    9951026            #It's a list. Map each item to the default.
    9961027            for k in portion:
     
    10031034# Now, the parser classes.
    10041035
    1005 class HTMLParserBuilder(HTMLParser):
    1006 
    1007     def __init__(self, soup):
    1008         HTMLParser.__init__(self)
    1009         self.soup = soup
    1010 
    1011     # We inherit feed() and reset().
    1012 
    1013     def handle_starttag(self, name, attrs):
    1014         if name == 'meta':
    1015             self.soup.extractCharsetFromMeta(attrs)
    1016         else:
    1017             self.soup.unknown_starttag(name, attrs)
    1018 
    1019     def handle_endtag(self, name):
    1020         self.soup.unknown_endtag(name)
    1021 
    1022     def handle_data(self, content):
    1023         self.soup.handle_data(content)
    1024 
    1025     def _toStringSubclass(self, text, subclass):
    1026         """Adds a certain piece of text to the tree as a NavigableString
    1027         subclass."""
    1028         self.soup.endData()
    1029         self.handle_data(text)
    1030         self.soup.endData(subclass)
    1031 
    1032     def handle_pi(self, text):
    1033         """Handle a processing instruction as a ProcessingInstruction
    1034         object, possibly one with a %SOUP-ENCODING% slot into which an
    1035         encoding will be plugged later."""
    1036         if text[:3] == "xml":
    1037             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    1038         self._toStringSubclass(text, ProcessingInstruction)
    1039 
    1040     def handle_comment(self, text):
    1041         "Handle comments as Comment objects."
    1042         self._toStringSubclass(text, Comment)
    1043 
    1044     def handle_charref(self, ref):
    1045         "Handle character references as data."
    1046         if self.soup.convertEntities:
    1047             data = unichr(int(ref))
    1048         else:
    1049             data = '&#%s;' % ref
    1050         self.handle_data(data)
    1051 
    1052     def handle_entityref(self, ref):
    1053         """Handle entity references as data, possibly converting known
    1054         HTML and/or XML entity references to the corresponding Unicode
    1055         characters."""
    1056         data = None
    1057         if self.soup.convertHTMLEntities:
    1058             try:
    1059                 data = unichr(name2codepoint[ref])
    1060             except KeyError:
    1061                 pass
    1062 
    1063         if not data and self.soup.convertXMLEntities:
    1064                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
    1065 
    1066         if not data and self.soup.convertHTMLEntities and \
    1067             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
    1068                 # TODO: We've got a problem here. We're told this is
    1069                 # an entity reference, but it's not an XML entity
    1070                 # reference or an HTML entity reference. Nonetheless,
    1071                 # the logical thing to do is to pass it through as an
    1072                 # unrecognized entity reference.
    1073                 #
    1074                 # Except: when the input is "&carol;" this function
    1075                 # will be called with input "carol". When the input is
    1076                 # "AT&T", this function will be called with input
    1077                 # "T". We have no way of knowing whether a semicolon
    1078                 # was present originally, so we don't know whether
    1079                 # this is an unknown entity or just a misplaced
    1080                 # ampersand.
    1081                 #
    1082                 # The more common case is a misplaced ampersand, so I
    1083                 # escape the ampersand and omit the trailing semicolon.
    1084                 data = "&amp;%s" % ref
    1085         if not data:
    1086             # This case is different from the one above, because we
    1087             # haven't already gone through a supposedly comprehensive
    1088             # mapping of entities to Unicode characters. We might not
    1089             # have gone through any mapping at all. So the chances are
    1090             # very high that this is a real entity, and not a
    1091             # misplaced ampersand.
    1092             data = "&%s;" % ref
    1093         self.handle_data(data)
    1094 
    1095     def handle_decl(self, data):
    1096         "Handle DOCTYPEs and the like as Declaration objects."
    1097         self._toStringSubclass(data, Declaration)
    1098 
    1099     def parse_declaration(self, i):
    1100         """Treat a bogus SGML declaration as raw data. Treat a CDATA
    1101         declaration as a CData object."""
    1102         j = None
    1103         if self.rawdata[i:i+9] == '<![CDATA[':
    1104              k = self.rawdata.find(']]>', i)
    1105              if k == -1:
    1106                  k = len(self.rawdata)
    1107              data = self.rawdata[i+9:k]
    1108              j = k+3
    1109              self._toStringSubclass(data, CData)
    1110         else:
    1111             try:
    1112                 j = HTMLParser.parse_declaration(self, i)
    1113             except HTMLParseError:
    1114                 toHandle = self.rawdata[i:]
    1115                 self.handle_data(toHandle)
    1116                 j = i + len(toHandle)
    1117         return j
    1118 
    1119 
    1120 class BeautifulStoneSoup(Tag):
     1036class BeautifulStoneSoup(Tag, SGMLParser):
    11211037
    11221038    """This class contains the basic parser and search code. It defines
     
    11641080    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
    11651081                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
    1166                  convertEntities=None, selfClosingTags=None, isHTML=False,
    1167                  builder=HTMLParserBuilder):
     1082                 convertEntities=None, selfClosingTags=None, isHTML=False):
    11681083        """The Soup object is initialized as the 'root tag', and the
    11691084        provided markup (which can be a string or a file-like object)
    11701085        is fed into the underlying parser.
    11711086
    1172         HTMLParser will process most bad HTML, and the BeautifulSoup
     1087        sgmllib will process most bad HTML, and the BeautifulSoup
    11731088        class has some tricks for dealing with some HTML that kills
    1174         HTMLParser, but Beautiful Soup can nonetheless choke or lose data
     1089        sgmllib, but Beautiful Soup can nonetheless choke or lose data
    11751090        if your data uses self-closing tags or declarations
    11761091        incorrectly.
     
    11821097
    11831098        The default parser massage techniques fix the two most common
    1184         instances of invalid HTML that choke HTMLParser:
     1099        instances of invalid HTML that choke sgmllib:
    11851100
    11861101         <br/> (No space between name of closing tag and tag close)
     
    12201135
    12211136        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
    1222         self.builder = builder(self)
    1223         self.reset()
     1137        SGMLParser.__init__(self)
    12241138
    12251139        if hasattr(markup, 'read'):        # It's a file-type object.
     
    12311145        except StopParsing:
    12321146            pass
    1233         self.markup = None                 # The markup can now be GCed.
    1234         self.builder = None                # So can the builder.
     1147        self.markup = None                 # The markup can now be GCed
     1148
     1149    def convert_charref(self, name):
     1150        """This method fixes a bug in Python's SGMLParser."""
     1151        try:
     1152            n = int(name)
     1153        except ValueError:
     1154            return
     1155        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
     1156            return
     1157        return self.convert_codepoint(n)
    12351158
    12361159    def _feed(self, inDocumentEncoding=None, isHTML=False):
     
    12491172        if markup:
    12501173            if self.markupMassage:
    1251                 if not isList(self.markupMassage):
     1174                if not hasattr(self.markupMassage, "__iter__"):
    12521175                    self.markupMassage = self.MARKUP_MASSAGE
    12531176                for fix, m in self.markupMassage:
     
    12591182                # might cause problems.
    12601183                del(self.markupMassage)
    1261         self.builder.reset()
    1262 
    1263         self.builder.feed(markup)
     1184        self.reset()
     1185
     1186        SGMLParser.feed(self, markup)
    12641187        # Close out any unfinished strings and close all the open tags.
    12651188        self.endData()
    12661189        while self.currentTag.name != self.ROOT_TAG_NAME:
    12671190            self.popTag()
     1191
     1192    def __getattr__(self, methodName):
     1193        """This method routes method call requests to either the SGMLParser
     1194        superclass or the Tag superclass, depending on the method name."""
     1195        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
     1196
     1197        if methodName.startswith('start_') or methodName.startswith('end_') \
     1198               or methodName.startswith('do_'):
     1199            return SGMLParser.__getattr__(self, methodName)
     1200        elif not methodName.startswith('__'):
     1201            return Tag.__getattr__(self, methodName)
     1202        else:
     1203            raise AttributeError
    12681204
    12691205    def isSelfClosingTag(self, name):
     
    12761212        Tag.__init__(self, self, self.ROOT_TAG_NAME)
    12771213        self.hidden = 1
    1278         self.builder.reset()
     1214        SGMLParser.reset(self)
    12791215        self.currentData = []
    12801216        self.currentTag = None
     
    12851221    def popTag(self):
    12861222        tag = self.tagStack.pop()
    1287         # Tags with just one string-owning child get the child as a
    1288         # 'string' property, so that soup.tag.string is shorthand for
    1289         # soup.tag.contents[0]
    1290         if len(self.currentTag.contents) == 1 and \
    1291            isinstance(self.currentTag.contents[0], NavigableString):
    1292             self.currentTag.string = self.currentTag.contents[0]
    12931223
    12941224        #print "Pop", tag.name
     
    13791309                popTo = name
    13801310                break
    1381             if (nestingResetTriggers != None
     1311            if (nestingResetTriggers is not None
    13821312                and p.name in nestingResetTriggers) \
    1383                 or (nestingResetTriggers == None and isResetNesting
     1313                or (nestingResetTriggers is None and isResetNesting
    13841314                    and self.RESET_NESTING_TAGS.has_key(p.name)):
    13851315
     
    14001330            #This is not a real tag.
    14011331            #print "<%s> is not real!" % name
    1402             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
     1332            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
    14031333            self.handle_data('<%s%s>' % (name, attrs))
    14041334            return
     
    14411371        self.currentData.append(data)
    14421372
    1443     def extractCharsetFromMeta(self, attrs):
    1444         self.unknown_starttag('meta', attrs)
    1445 
     1373    def _toStringSubclass(self, text, subclass):
     1374        """Adds a certain piece of text to the tree as a NavigableString
     1375        subclass."""
     1376        self.endData()
     1377        self.handle_data(text)
     1378        self.endData(subclass)
     1379
     1380    def handle_pi(self, text):
     1381        """Handle a processing instruction as a ProcessingInstruction
     1382        object, possibly one with a %SOUP-ENCODING% slot into which an
     1383        encoding will be plugged later."""
     1384        if text[:3] == "xml":
     1385            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
     1386        self._toStringSubclass(text, ProcessingInstruction)
     1387
     1388    def handle_comment(self, text):
     1389        "Handle comments as Comment objects."
     1390        self._toStringSubclass(text, Comment)
     1391
     1392    def handle_charref(self, ref):
     1393        "Handle character references as data."
     1394        if self.convertEntities:
     1395            data = unichr(int(ref))
     1396        else:
     1397            data = '&#%s;' % ref
     1398        self.handle_data(data)
     1399
     1400    def handle_entityref(self, ref):
     1401        """Handle entity references as data, possibly converting known
     1402        HTML and/or XML entity references to the corresponding Unicode
     1403        characters."""
     1404        data = None
     1405        if self.convertHTMLEntities:
     1406            try:
     1407                data = unichr(name2codepoint[ref])
     1408            except KeyError:
     1409                pass
     1410
     1411        if not data and self.convertXMLEntities:
     1412                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
     1413
     1414        if not data and self.convertHTMLEntities and \
     1415            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
     1416                # TODO: We've got a problem here. We're told this is
     1417                # an entity reference, but it's not an XML entity
     1418                # reference or an HTML entity reference. Nonetheless,
     1419                # the logical thing to do is to pass it through as an
     1420                # unrecognized entity reference.
     1421                #
     1422                # Except: when the input is "&carol;" this function
     1423                # will be called with input "carol". When the input is
     1424                # "AT&T", this function will be called with input
     1425                # "T". We have no way of knowing whether a semicolon
     1426                # was present originally, so we don't know whether
     1427                # this is an unknown entity or just a misplaced
     1428                # ampersand.
     1429                #
     1430                # The more common case is a misplaced ampersand, so I
     1431                # escape the ampersand and omit the trailing semicolon.
     1432                data = "&amp;%s" % ref
     1433        if not data:
     1434            # This case is different from the one above, because we
     1435            # haven't already gone through a supposedly comprehensive
     1436            # mapping of entities to Unicode characters. We might not
     1437            # have gone through any mapping at all. So the chances are
     1438            # very high that this is a real entity, and not a
     1439            # misplaced ampersand.
     1440            data = "&%s;" % ref
     1441        self.handle_data(data)
     1442
     1443    def handle_decl(self, data):
     1444        "Handle DOCTYPEs and the like as Declaration objects."
     1445        self._toStringSubclass(data, Declaration)
     1446
     1447    def parse_declaration(self, i):
     1448        """Treat a bogus SGML declaration as raw data. Treat a CDATA
     1449        declaration as a CData object."""
     1450        j = None
     1451        if self.rawdata[i:i+9] == '<![CDATA[':
     1452             k = self.rawdata.find(']]>', i)
     1453             if k == -1:
     1454                 k = len(self.rawdata)
     1455             data = self.rawdata[i+9:k]
     1456             j = k+3
     1457             self._toStringSubclass(data, CData)
     1458        else:
     1459            try:
     1460                j = SGMLParser.parse_declaration(self, i)
     1461            except SGMLParseError:
     1462                toHandle = self.rawdata[i:]
     1463                self.handle_data(toHandle)
     1464                j = i + len(toHandle)
     1465        return j
    14461466
    14471467class BeautifulSoup(BeautifulStoneSoup):
     
    15001520
    15011521    SELF_CLOSING_TAGS = buildTagMap(None,
    1502                                     ['br' , 'hr', 'input', 'img', 'meta',
    1503                                     'spacer', 'link', 'frame', 'base'])
     1522                                    ('br' , 'hr', 'input', 'img', 'meta',
     1523                                    'spacer', 'link', 'frame', 'base', 'col'))
    15041524
    15051525    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
     
    15101530    #contain another tag of the same type. Furthermore, it's common
    15111531    #to actually use these tags this way.
    1512     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
    1513                             'center']
     1532    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
     1533                            'center')
    15141534
    15151535    #According to the HTML standard, these block tags can contain
    15161536    #another tag of the same type. Furthermore, it's common
    15171537    #to actually use these tags this way.
    1518     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
     1538    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
    15191539
    15201540    #Lists can contain other lists, but there are restrictions.
     
    15361556                           }
    15371557
    1538     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
     1558    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
    15391559
    15401560    #If one of these tags is encountered, all tags up to the next tag of
     
    15511571    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
    15521572
    1553     def extractCharsetFromMeta(self, attrs):
     1573    def start_meta(self, attrs):
    15541574        """Beautiful Soup can detect a charset included in a META tag,
    15551575        try to convert the document to that charset, and re-parse the
     
    15981618            tag.containsSubstitutions = True
    15991619
    1600 
    16011620class StopParsing(Exception):
    16021621    pass
     
    16281647
    16291648    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
    1630      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
     1649     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
    16311650      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
    1632       'big']
    1633 
    1634     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
     1651      'big')
     1652
     1653    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
    16351654
    16361655    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
     
    17791798        if not u: self.originalEncoding = None
    17801799
    1781     def _subMSChar(self, match):
     1800    def _subMSChar(self, orig):
    17821801        """Changes a MS smart quote character to an XML or HTML
    17831802        entity."""
    1784         orig = match.group(1)
    17851803        sub = self.MS_CHARS.get(orig)
    1786         if type(sub) == types.TupleType:
     1804        if isinstance(sub, tuple):
    17871805            if self.smartQuotesTo == 'xml':
    1788                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
     1806                sub = '&#x%s;' % sub[1]
    17891807            else:
    1790                 sub = '&'.encode() + sub[0].encode() + ';'.encode()
    1791         else:
    1792             sub = sub.encode()
     1808                sub = '&%s;' % sub[0]
    17931809        return sub
    17941810
     
    18051821                                                      "iso-8859-1",
    18061822                                                      "iso-8859-2"):
    1807             smart_quotes_re = "([\x80-\x9f])"
    1808             smart_quotes_compiled = re.compile(smart_quotes_re)
    1809             markup = smart_quotes_compiled.sub(self._subMSChar, markup)
     1823            markup = re.compile("([\x80-\x9f])").sub \
     1824                     (lambda(x): self._subMSChar(x.group(1)),
     1825                      markup)
    18101826
    18111827        try:
     
    18961912        except:
    18971913            xml_encoding_match = None
    1898         xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
    1899         xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
     1914        xml_encoding_match = re.compile(
     1915            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    19001916        if not xml_encoding_match and isHTML:
    1901             meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
    1902             regexp = re.compile(meta_re, re.I)
     1917            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
    19031918            xml_encoding_match = regexp.search(xml_data)
    19041919        if xml_encoding_match is not None:
    1905             xml_encoding = xml_encoding_match.groups()[0].decode(
    1906                 'ascii').lower()
     1920            xml_encoding = xml_encoding_match.groups()[0].lower()
    19071921            if isHTML:
    19081922                self.declaredHTMLEncoding = xml_encoding
Note: See TracChangeset for help on using the changeset viewer.