Context Navigation

← Previous Changeset
Next Changeset →

Changeset 85792 in webkit

Timestamp:

May 4, 2011, 3:02:34 PM (14 years ago)

Author:

eric@webkit.org

Message:

2011-05-04 Eric Seidel <eric@webkit.org>

Reviewed by Ojan Vafai.

sheriff-bot can't parse results.html
https://bugs.webkit.org/show_bug.cgi?id=60196

The problem was that the NRWT bots are currently core
and so sheriff-bot was trying to diagnose failures on
them. It was trying to parse their results.html file
and BeautifulSoup was throwing an exception.

It seems our version of BeautifulSoup was quite old
and unable to handle <script> tags correctly.
This moves us to 3.2 which doesn't seem to have this bug.

Scripts/webkitpy/thirdparty/BeautifulSoup.py:

Location:

trunk/Tools

Files:

2 edited

ChangeLog (modified) (1 diff)
Scripts/webkitpy/thirdparty/BeautifulSoup.py (modified) (60 diffs)

Legend:

Unmodified
Added
Removed

trunk/Tools/ChangeLog

-              r85786
+              r85792
+2011-05-04  Eric Seidel  <eric@webkit.org>
+        Reviewed by Ojan Vafai.
+        sheriff-bot can't parse results.html
+        https://bugs.webkit.org/show_bug.cgi?id=60196
+        The problem was that the NRWT bots are currently core
+        and so sheriff-bot was trying to diagnose failures on
+        them.  It was trying to parse their results.html file
+        and BeautifulSoup was throwing an exception.
+        It seems our version of BeautifulSoup was quite old
+        and unable to handle <script> tags correctly.
+        This moves us to 3.2 which doesn't seem to have this bug.
+        * Scripts/webkitpy/thirdparty/BeautifulSoup.py:
 2011-05-04  Adam Barth  <abarth@webkit.org>

trunk/Tools/Scripts/webkitpy/thirdparty/BeautifulSoup.py

-              r55055
+              r85792
 Here, have some legalese:
 Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2010, Leonard Richardson
 All rights reserved.
 …
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
 __version__ = "3.1.0.1"
 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
 __license__ = "New-style BSD"
+from sgmllib import SGMLParser, SGMLParseError
 import codecs
 import markupbase
 import types
 import re
+from HTMLParser import HTMLParser, HTMLParseError
+import sgmllib
 try:
     from htmlentitydefs import name2codepoint
+  from htmlentitydefs import name2codepoint
 except ImportError:
     name2codepoint = {}
+  name2codepoint = {}
 try:
     set
 …
 #These hacks make Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
 DEFAULT_OUTPUT_ENCODING = "utf-8"
+def _match_css_class(str):
+    """Build a RE to match the given CSS class."""
+    return re.compile(r"(^|.*\s)%s($|\s)" % str)
 # First, the classes that represent markup elements.
+def sob(unicode, encoding):
+    """Returns either the given Unicode string or its encoding."""
+    if encoding is None:
+        return unicode
+    else:
+        return unicode.encode(encoding)
+class PageElement:
+class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
 …
     def replaceWith(self, replaceWith):
         oldParent = self.parent
+        myIndex = self.parent.contents.index(self)
+        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
+        myIndex = self.parent.index(self)
+        if hasattr(replaceWith, "parent")\
+                  and replaceWith.parent is self.parent:
             # We're replacing this element with one of its siblings.
             index = self.parent.contents.index(replaceWith)
+            index = replaceWith.parent.index(replaceWith)
             if index and index < myIndex:
                 # Furthermore, it comes before this element. That
 …
         oldParent.insert(myIndex, replaceWith)
+    def replaceWithChildren(self):
+        myParent = self.parent
+        myIndex = self.parent.index(self)
+        self.extract()
+        reversedChildren = list(self.contents)
+        reversedChildren.reverse()
+        for child in reversedChildren:
+            myParent.insert(myIndex, child)
     def extract(self):
         """Destructively rips this element out of the tree."""
         if self.parent:
             try:
                 self.parent.contents.remove(self)
+                del self.parent.contents[self.parent.index(self)]
             except ValueError:
                 pass
 …
     def insert(self, position, newChild):
+        if (isinstance(newChild, basestring)
+            or isinstance(newChild, unicode)) \
+        if isinstance(newChild, basestring) \
             and not isinstance(newChild, NavigableString):
             newChild = NavigableString(newChild)
         position =  min(position, len(self.contents))
         if hasattr(newChild, 'parent') and newChild.parent != None:
+        if hasattr(newChild, 'parent') and newChild.parent is not None:
             # We're 'inserting' an element that's already one
             # of this object's children.
             if newChild.parent == self:
                 index = self.find(newChild)
                 if index and index < position:
+            if newChild.parent is self:
+                index = self.index(newChild)
+                if index > position:
                     # Furthermore we're moving it further down the
                     # list of this object's children. That means that
 …
         if isinstance(name, SoupStrainer):
             strainer = name
+        # (Possibly) special case some findAll*(...) searches
+        elif text is None and not limit and not attrs and not kwargs:
+            # findAll*(True)
+            if name is True:
+                return [element for element in generator()
+                        if isinstance(element, Tag)]
+            # findAll*('tag-name')
+            elif isinstance(name, basestring):
+                return [element for element in generator()
+                        if isinstance(element, Tag) and
+                        element.name == name]
+            else:
+                strainer = SoupStrainer(name, attrs, text, **kwargs)
+        # Build a SoupStrainer
         else:
-            # Build a SoupStrainer
             strainer = SoupStrainer(name, attrs, text, **kwargs)
         results = ResultSet(strainer)
 …
     def nextGenerator(self):
         i = self
         while i:
+        while i is not None:
             i = i.next
             yield i
 …
     def nextSiblingGenerator(self):
         i = self
         while i:
+        while i is not None:
             i = i.nextSibling
             yield i
 …
     def previousGenerator(self):
         i = self
         while i:
+        while i is not None:
             i = i.previous
             yield i
 …
     def previousSiblingGenerator(self):
         i = self
         while i:
+        while i is not None:
             i = i.previousSibling
             yield i
 …
     def parentGenerator(self):
         i = self
         while i:
+        while i is not None:
             i = i.parent
             yield i
 …
     def __getnewargs__(self):
         return (unicode(self),)
+        return (NavigableString.__str__(self),)
     def __getattr__(self, attr):
 …
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.decode().encode(encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return self
+    def __unicode__(self):
+        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        if encoding:
+            return self.encode(encoding)
+        else:
+            return self
 class CData(NavigableString):
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<![CDATA[' + self + u']]>'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 class ProcessingInstruction(NavigableString):
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         output = self
         if u'%SOUP-ENCODING%' in output:
             output = self.substituteEncoding(output, eventualEncoding)
         return u'<?' + output + u'?>'
+        if "%SOUP-ENCODING%" in output:
+            output = self.substituteEncoding(output, encoding)
+        return "<?%s?>" % self.toEncoding(output, encoding)
 class Comment(NavigableString):
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<!--' + self + u'-->'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
 class Declaration(NavigableString):
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<!' + self + u'>'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!%s>" % NavigableString.__str__(self, encoding)
 class Tag(PageElement):
 …
         self.isSelfClosing = parser.isSelfClosingTag(name)
         self.name = name
         if attrs == None:
+        if attrs is None:
             attrs = []
+        elif isinstance(attrs, dict):
+            attrs = attrs.items()
         self.attrs = attrs
         self.contents = []
 …
         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+        def convert(kval):
+            "Converts HTML, XML and numeric entities in the attribute value."
+            k, val = kval
+            if val is None:
+                return kval
+            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                              self._convertEntities, val))
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        convert = lambda(k, val): (k,
+                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                                          self._convertEntities,
+                                          val))
         self.attrs = map(convert, self.attrs)
+    def getString(self):
+        if (len(self.contents) == 1
+            and isinstance(self.contents[0], NavigableString)):
+            return self.contents[0]
+    def setString(self, string):
+        """Replace the contents of the tag with a string"""
+        self.clear()
+        self.append(string)
+    string = property(getString, setString)
+    def getText(self, separator=u""):
+        if not len(self.contents):
+            return u""
+        stopNode = self._lastRecursiveChild().next
+        strings = []
+        current = self.contents[0]
+        while current is not stopNode:
+            if isinstance(current, NavigableString):
+                strings.append(current.strip())
+            current = current.next
+        return separator.join(strings)
+    text = property(getText)
     def get(self, key, default=None):
 …
         attribute."""
         return self._getAttrMap().get(key, default)
+    def clear(self):
+        """Extract all children."""
+        for child in self.contents[:]:
+            child.extract()
+    def index(self, element):
+        for i, child in enumerate(self.contents):
+            if child is element:
+                return i
+        raise ValueError("Tag.index: element not in tag")
     def has_key(self, key):
 …
         NOTE: right now this will return false if two tags have the
         same attributes in a different order. Should this be fixed?"""
+        if other is self:
+            return True
         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
             return False
 …
     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders this tag as a string."""
+        return self.decode(eventualEncoding=encoding)
+        return self.__str__(encoding)
+    def __unicode__(self):
+        return self.__str__(None)
     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 …
         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+    def __unicode__(self):
+        return self.decode()
+    def __str__(self):
+        return self.encode()
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+               prettyPrint=False, indentLevel=0):
+        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+    def decode(self, prettyPrint=False, indentLevel=0,
+               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                prettyPrint=False, indentLevel=0):
         """Returns a string or Unicode representation of this tag and
+        its contents. To get Unicode, pass None for encoding."""
+        its contents. To get Unicode, pass None for encoding.
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
+        encodedName = self.toEncoding(self.name, encoding)
         attrs = []
 …
             for key, val in self.attrs:
                 fmt = '%s="%s"'
+                if isString(val):
+                    if (self.containsSubstitutions
+                        and eventualEncoding is not None
+                        and '%SOUP-ENCODING%' in val):
+                        val = self.substituteEncoding(val, eventualEncoding)
+                if isinstance(val, basestring):
+                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+                        val = self.substituteEncoding(val, encoding)
                     # The attribute value either:
 …
                     # to escape those to XML entities too.
                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+                if val is None:
+                    # Handle boolean attributes.
+                    decoded = key
+                else:
+                    decoded = fmt % (key, val)
+                attrs.append(decoded)
+                attrs.append(fmt % (self.toEncoding(key, encoding),
+                                    self.toEncoding(val, encoding)))
         close = ''
         closeTag = ''
 …
             close = ' /'
         else:
             closeTag = '</%s>' % self.name
+            closeTag = '</%s>' % encodedName
         indentTag, indentContents = 0, 0
 …
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
+        contents = self.decodeContents(prettyPrint, indentContents,
+                                       eventualEncoding)
+        contents = self.renderContents(encoding, prettyPrint, indentContents)
         if self.hidden:
             s = contents
 …
             if prettyPrint:
                 s.append(space)
             s.append('<%s%s%s>' % (self.name, attributeString, close))
+            s.append('<%s%s%s>' % (encodedName, attributeString, close))
             if prettyPrint:
                 s.append("\n")
 …
     def decompose(self):
         """Recursively destroys the contents of this tree."""
-        contents = [i for i in self.contents]
-        for i in contents:
-            if isinstance(i, Tag):
-                i.decompose()
-            else:
-                i.extract()
         self.extract()
+        if len(self.contents) == 0:
+            return
+        current = self.contents[0]
+        while current is not None:
+            next = current.next
+            if isinstance(current, Tag):
+                del current.contents[:]
+            current.parent = None
+            current.previous = None
+            current.previousSibling = None
+            current.next = None
+            current.nextSibling = None
+            current = next
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
         return self.encode(encoding, True)
     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+        return self.__str__(encoding, True)
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                        prettyPrint=False, indentLevel=0):
-        return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
-    def decodeContents(self, prettyPrint=False, indentLevel=0,
-                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
 …
             text = None
             if isinstance(c, NavigableString):
                 text = c.decodeGivenEventualEncoding(eventualEncoding)
+                text = c.__str__(encoding)
             elif isinstance(c, Tag):
                 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
+                s.append(c.__str__(encoding, prettyPrint, indentLevel))
             if text and prettyPrint:
                 text = text.strip()
 …
     findChildren = findAll
     # Pre-3.x compatibility methods. Will go away in 4.0.
+    # Pre-3.x compatibility methods
     first = find
     fetch = findAll
 …
     def firstText(self, text=None, recursive=True):
         return self.find(text=text, recursive=recursive)
-    # 3.x compatibility methods. Will go away in 4.0.
-    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       prettyPrint=False, indentLevel=0):
-        if encoding is None:
-            return self.decodeContents(prettyPrint, indentLevel, encoding)
-        else:
-            return self.encodeContents(encoding, prettyPrint, indentLevel)
     #Private methods
 …
     #Generator methods
+    def childGenerator(self):
+        # Just use the iterator from the contents
+        return iter(self.contents)
     def recursiveChildGenerator(self):
         if not len(self.contents):
 …
             current = current.next
-    def childGenerator(self):
-        if not len(self.contents):
-            raise StopIteration
-        current = self.contents[0]
-        while current:
-            yield current
-            current = current.nextSibling
-        raise StopIteration
 # Next, a couple classes to represent queries and their results.
 …
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
         self.name = name
         if isString(attrs):
             kwargs['class'] = attrs
+        if isinstance(attrs, basestring):
+            kwargs['class'] = _match_css_class(attrs)
             attrs = None
         if kwargs:
 …
         # If given a list of items, scan it for a text element that
         # matches.
+        if isList(markup) and not isinstance(markup, Tag):
+        if hasattr(markup, "__iter__") \
+                and not isinstance(markup, Tag):
             for element in markup:
                 if isinstance(element, NavigableString) \
 …
         # If it's text, make sure the text matches.
         elif isinstance(markup, NavigableString) or \
                  isString(markup):
+                 isinstance(markup, basestring):
             if self._matches(markup, self.text):
                 found = markup
 …
         #print "Matching %s against %s" % (markup, matchAgainst)
         result = False
         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
             result = markup != None
+        if matchAgainst is True:
+            result = markup is not None
         elif callable(matchAgainst):
             result = matchAgainst(markup)
 …
             if isinstance(markup, Tag):
                 markup = markup.name
             if markup is not None and not isString(markup):
+            if markup and not isinstance(markup, basestring):
                 markup = unicode(markup)
             #Now we know that chunk is either a string, or None.
 …
                 # It's a regexp object.
                 result = markup and matchAgainst.search(markup)
+            elif (isList(matchAgainst)
+                  and (markup is not None or not isString(matchAgainst))):
+            elif hasattr(matchAgainst, '__iter__'): # list-like
                 result = markup in matchAgainst
             elif hasattr(matchAgainst, 'items'):
                 result = markup.has_key(matchAgainst)
             elif matchAgainst and isString(markup):
+            elif matchAgainst and isinstance(markup, basestring):
                 if isinstance(markup, unicode):
                     matchAgainst = unicode(matchAgainst)
 …
 # Now, some helper functions.
-def isList(l):
-    """Convenience method that works with all 2.x versions of Python
-    to determine whether or not something is listlike."""
-    return ((hasattr(l, '__iter__') and not isString(l))
-            or (type(l) in (types.ListType, types.TupleType)))
-def isString(s):
-    """Convenience method that works with all 2.x versions of Python
-    to determine whether or not something is stringlike."""
-    try:
-        return isinstance(s, unicode) or isinstance(s, basestring)
-    except NameError:
-        return isinstance(s, str)
 def buildTagMap(default, *args):
 …
             for k,v in portion.items():
                 built[k] = v
         elif isList(portion) and not isString(portion):
+        elif hasattr(portion, '__iter__'): # is a list
             #It's a list. Map each item to the default.
             for k in portion:
 …
 # Now, the parser classes.
+class HTMLParserBuilder(HTMLParser):
+    def __init__(self, soup):
+        HTMLParser.__init__(self)
+        self.soup = soup
+    # We inherit feed() and reset().
+    def handle_starttag(self, name, attrs):
+        if name == 'meta':
+            self.soup.extractCharsetFromMeta(attrs)
+        else:
+            self.soup.unknown_starttag(name, attrs)
+    def handle_endtag(self, name):
+        self.soup.unknown_endtag(name)
+    def handle_data(self, content):
+        self.soup.handle_data(content)
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.soup.endData()
+        self.handle_data(text)
+        self.soup.endData(subclass)
+    def handle_pi(self, text):
+        """Handle a processing instruction as a ProcessingInstruction
+        object, possibly one with a %SOUP-ENCODING% slot into which an
+        encoding will be plugged later."""
+        if text[:3] == "xml":
+            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+        self._toStringSubclass(text, ProcessingInstruction)
+    def handle_comment(self, text):
+        "Handle comments as Comment objects."
+        self._toStringSubclass(text, Comment)
+    def handle_charref(self, ref):
+        "Handle character references as data."
+        if self.soup.convertEntities:
+            data = unichr(int(ref))
+        else:
+            data = '&#%s;' % ref
+        self.handle_data(data)
+    def handle_entityref(self, ref):
+        """Handle entity references as data, possibly converting known
+        HTML and/or XML entity references to the corresponding Unicode
+        characters."""
+        data = None
+        if self.soup.convertHTMLEntities:
+            try:
+                data = unichr(name2codepoint[ref])
+            except KeyError:
+                pass
+        if not data and self.soup.convertXMLEntities:
+                data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+        if not data and self.soup.convertHTMLEntities and \
+            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+                # TODO: We've got a problem here. We're told this is
+                # an entity reference, but it's not an XML entity
+                # reference or an HTML entity reference. Nonetheless,
+                # the logical thing to do is to pass it through as an
+                # unrecognized entity reference.
+                #
+                # Except: when the input is "&carol;" this function
+                # will be called with input "carol". When the input is
+                # "AT&T", this function will be called with input
+                # "T". We have no way of knowing whether a semicolon
+                # was present originally, so we don't know whether
+                # this is an unknown entity or just a misplaced
+                # ampersand.
+                #
+                # The more common case is a misplaced ampersand, so I
+                # escape the ampersand and omit the trailing semicolon.
+                data = "&amp;%s" % ref
+        if not data:
+            # This case is different from the one above, because we
+            # haven't already gone through a supposedly comprehensive
+            # mapping of entities to Unicode characters. We might not
+            # have gone through any mapping at all. So the chances are
+            # very high that this is a real entity, and not a
+            # misplaced ampersand.
+            data = "&%s;" % ref
+        self.handle_data(data)
+    def handle_decl(self, data):
+        "Handle DOCTYPEs and the like as Declaration objects."
+        self._toStringSubclass(data, Declaration)
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as a CData object."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self._toStringSubclass(data, CData)
+        else:
+            try:
+                j = HTMLParser.parse_declaration(self, i)
+            except HTMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+class BeautifulStoneSoup(Tag):
+class BeautifulStoneSoup(Tag, SGMLParser):
     """This class contains the basic parser and search code. It defines
 …
     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
+                 convertEntities=None, selfClosingTags=None, isHTML=False,
+                 builder=HTMLParserBuilder):
+                 convertEntities=None, selfClosingTags=None, isHTML=False):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser.
         HTMLParser will process most bad HTML, and the BeautifulSoup
+        sgmllib will process most bad HTML, and the BeautifulSoup
         class has some tricks for dealing with some HTML that kills
         HTMLParser, but Beautiful Soup can nonetheless choke or lose data
+        sgmllib, but Beautiful Soup can nonetheless choke or lose data
         if your data uses self-closing tags or declarations
         incorrectly.
 …
         The default parser massage techniques fix the two most common
         instances of invalid HTML that choke HTMLParser:
+        instances of invalid HTML that choke sgmllib:
          <br/> (No space between name of closing tag and tag close)
 …
         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
+        self.builder = builder(self)
+        self.reset()
+        SGMLParser.__init__(self)
         if hasattr(markup, 'read'):        # It's a file-type object.
 …
         except StopParsing:
             pass
+        self.markup = None                 # The markup can now be GCed.
+        self.builder = None                # So can the builder.
+        self.markup = None                 # The markup can now be GCed
+    def convert_charref(self, name):
+        """This method fixes a bug in Python's SGMLParser."""
+        try:
+            n = int(name)
+        except ValueError:
+            return
+        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+            return
+        return self.convert_codepoint(n)
     def _feed(self, inDocumentEncoding=None, isHTML=False):
 …
         if markup:
             if self.markupMassage:
                 if not isList(self.markupMassage):
+                if not hasattr(self.markupMassage, "__iter__"):
                     self.markupMassage = self.MARKUP_MASSAGE
                 for fix, m in self.markupMassage:
 …
                 # might cause problems.
                 del(self.markupMassage)
         self.builder.reset()
         self.builder.feed(markup)
+        self.reset()
+        SGMLParser.feed(self, markup)
         # Close out any unfinished strings and close all the open tags.
         self.endData()
         while self.currentTag.name != self.ROOT_TAG_NAME:
             self.popTag()
+    def __getattr__(self, methodName):
+        """This method routes method call requests to either the SGMLParser
+        superclass or the Tag superclass, depending on the method name."""
+        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+        if methodName.startswith('start_') or methodName.startswith('end_') \
+               or methodName.startswith('do_'):
+            return SGMLParser.__getattr__(self, methodName)
+        elif not methodName.startswith('__'):
+            return Tag.__getattr__(self, methodName)
+        else:
+            raise AttributeError
     def isSelfClosingTag(self, name):
 …
         Tag.__init__(self, self, self.ROOT_TAG_NAME)
         self.hidden = 1
         self.builder.reset()
+        SGMLParser.reset(self)
         self.currentData = []
         self.currentTag = None
 …
     def popTag(self):
         tag = self.tagStack.pop()
-        # Tags with just one string-owning child get the child as a
-        # 'string' property, so that soup.tag.string is shorthand for
-        # soup.tag.contents[0]
-        if len(self.currentTag.contents) == 1 and \
-           isinstance(self.currentTag.contents[0], NavigableString):
-            self.currentTag.string = self.currentTag.contents[0]
         #print "Pop", tag.name
 …
                 popTo = name
                 break
             if (nestingResetTriggers != None
+            if (nestingResetTriggers is not None
                 and p.name in nestingResetTriggers) \
                 or (nestingResetTriggers == None and isResetNesting
+                or (nestingResetTriggers is None and isResetNesting
                     and self.RESET_NESTING_TAGS.has_key(p.name)):
 …
             #This is not a real tag.
             #print "<%s> is not real!" % name
             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
             self.handle_data('<%s%s>' % (name, attrs))
             return
 …
         self.currentData.append(data)
+    def extractCharsetFromMeta(self, attrs):
+        self.unknown_starttag('meta', attrs)
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.endData()
+        self.handle_data(text)
+        self.endData(subclass)
+    def handle_pi(self, text):
+        """Handle a processing instruction as a ProcessingInstruction
+        object, possibly one with a %SOUP-ENCODING% slot into which an
+        encoding will be plugged later."""
+        if text[:3] == "xml":
+            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+        self._toStringSubclass(text, ProcessingInstruction)
+    def handle_comment(self, text):
+        "Handle comments as Comment objects."
+        self._toStringSubclass(text, Comment)
+    def handle_charref(self, ref):
+        "Handle character references as data."
+        if self.convertEntities:
+            data = unichr(int(ref))
+        else:
+            data = '&#%s;' % ref
+        self.handle_data(data)
+    def handle_entityref(self, ref):
+        """Handle entity references as data, possibly converting known
+        HTML and/or XML entity references to the corresponding Unicode
+        characters."""
+        data = None
+        if self.convertHTMLEntities:
+            try:
+                data = unichr(name2codepoint[ref])
+            except KeyError:
+                pass
+        if not data and self.convertXMLEntities:
+                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+        if not data and self.convertHTMLEntities and \
+            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+                # TODO: We've got a problem here. We're told this is
+                # an entity reference, but it's not an XML entity
+                # reference or an HTML entity reference. Nonetheless,
+                # the logical thing to do is to pass it through as an
+                # unrecognized entity reference.
+                #
+                # Except: when the input is "&carol;" this function
+                # will be called with input "carol". When the input is
+                # "AT&T", this function will be called with input
+                # "T". We have no way of knowing whether a semicolon
+                # was present originally, so we don't know whether
+                # this is an unknown entity or just a misplaced
+                # ampersand.
+                #
+                # The more common case is a misplaced ampersand, so I
+                # escape the ampersand and omit the trailing semicolon.
+                data = "&amp;%s" % ref
+        if not data:
+            # This case is different from the one above, because we
+            # haven't already gone through a supposedly comprehensive
+            # mapping of entities to Unicode characters. We might not
+            # have gone through any mapping at all. So the chances are
+            # very high that this is a real entity, and not a
+            # misplaced ampersand.
+            data = "&%s;" % ref
+        self.handle_data(data)
+    def handle_decl(self, data):
+        "Handle DOCTYPEs and the like as Declaration objects."
+        self._toStringSubclass(data, Declaration)
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as a CData object."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self._toStringSubclass(data, CData)
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
 class BeautifulSoup(BeautifulStoneSoup):
 …
     SELF_CLOSING_TAGS = buildTagMap(None,
                                     ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])
+                                    ('br' , 'hr', 'input', 'img', 'meta',
+                                    'spacer', 'link', 'frame', 'base', 'col'))
     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
 …
     #contain another tag of the same type. Furthermore, it's common
     #to actually use these tags this way.
     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                             'center']
+    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+                            'center')
     #According to the HTML standard, these block tags can contain
     #another tag of the same type. Furthermore, it's common
     #to actually use these tags this way.
     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
     #Lists can contain other lists, but there are restrictions.
 …
+                           }
     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
     #If one of these tags is encountered, all tags up to the next tag of
 …
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
     def extractCharsetFromMeta(self, attrs):
+    def start_meta(self, attrs):
         """Beautiful Soup can detect a charset included in a META tag,
         try to convert the document to that charset, and re-parse the
 …
             tag.containsSubstitutions = True
 class StopParsing(Exception):
     pass
 …
     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
       'big']
     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+      'big')
+    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
 …
         if not u: self.originalEncoding = None
     def _subMSChar(self, match):
+    def _subMSChar(self, orig):
         """Changes a MS smart quote character to an XML or HTML
         entity."""
-        orig = match.group(1)
         sub = self.MS_CHARS.get(orig)
         if type(sub) == types.TupleType:
+        if isinstance(sub, tuple):
             if self.smartQuotesTo == 'xml':
                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+                sub = '&#x%s;' % sub[1]
             else:
+                sub = '&'.encode() + sub[0].encode() + ';'.encode()
+        else:
+            sub = sub.encode()
+                sub = '&%s;' % sub[0]
         return sub
 …
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
             smart_quotes_re = "([\x80-\x9f])"
             smart_quotes_compiled = re.compile(smart_quotes_re)
             markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+            markup = re.compile("([\x80-\x9f])").sub \
+                     (lambda(x): self._subMSChar(x.group(1)),
+                      markup)
         try:
 …
         except:
             xml_encoding_match = None
         xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
         xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+        xml_encoding_match = re.compile(
+            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
         if not xml_encoding_match and isHTML:
+            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+            regexp = re.compile(meta_re, re.I)
+            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
             xml_encoding_match = regexp.search(xml_data)
         if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].decode(
+                'ascii').lower()
+            xml_encoding = xml_encoding_match.groups()[0].lower()
             if isHTML:
                 self.declaredHTMLEncoding = xml_encoding

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 85792 in webkit

Legend:

trunk/Tools/ChangeLog

trunk/Tools/Scripts/webkitpy/thirdparty/BeautifulSoup.py

Download in other formats: