Update included_dependencies to beautifulsoup4-4.6.1

2025-12-06 08:52:55 +01:00 · 2018-08-08 10:05:48 -05:00 · 2018-08-08 10:05:48 -05:00 · 67698baf11
commit 67698baf11
parent 5be511916b
7 changed files with 374 additions and 94 deletions
--- a/included_dependencies/bs4/__init__.py
+++ b/included_dependencies/bs4/__init__.py
@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 # found in the LICENSE file.

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.5.3"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__version__ = "4.6.1"
+__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
 __license__ = "MIT"

 __all__ = ['BeautifulSoup']

 import os
 import re
+import sys
 import traceback
 import warnings

@ -82,14 +83,46 @@ class BeautifulSoup(Tag):

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
+        """Constructor.
+
+        :param markup: A string or a file-like object representing
+        markup to be parsed.
+
+        :param features: Desirable features of the parser to be used. This
+        may be the name of a specific parser ("lxml", "lxml-xml",
+        "html.parser", or "html5lib") or it may be the type of markup
+        to be used ("html", "html5", "xml"). It's recommended that you
+        name a specific parser, so that Beautiful Soup gives you the
+        same results across platforms and virtual environments.
+
+        :param builder: A specific TreeBuilder to use instead of looking one
+        up based on `features`. You shouldn't need to use this.
+
+        :param parse_only: A SoupStrainer. Only parts of the document
+        matching the SoupStrainer will be considered. This is useful
+        when parsing part of a document that would otherwise be too
+        large to fit into memory.
+
+        :param from_encoding: A string indicating the encoding of the
+        document to be parsed. Pass this in if Beautiful Soup is
+        guessing wrongly about the document's encoding.
+
+        :param exclude_encodings: A list of strings indicating
+        encodings known to be wrong. Pass this in if you don't know
+        the document's encoding but you know Beautiful Soup's guess is
+        wrong.
+
+        :param kwargs: For backwards compatibility purposes, the
+        constructor accepts certain keyword arguments used in
+        Beautiful Soup 3. None of these arguments do anything in
+        Beautiful Soup 4 and there's no need to actually pass keyword
+        arguments into the constructor.
+        """

        if 'convertEntities' in kwargs:
            warnings.warn(
@ -171,14 +204,35 @@ class BeautifulSoup(Tag):
                else:
                    markup_type = "HTML"

-                caller = traceback.extract_stack()[0]
-                filename = caller[0]
-                line_number = caller[1]
-                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                # This code adapted from warnings.py so that we get the same line
+                # of code as our warnings.warn() call gets, even if the answer is wrong
+                # (as it may be in a multithreading situation).
+                caller = None
+                try:
+                    caller = sys._getframe(1)
+                except ValueError:
+                    pass
+                if caller:
+                    globals = caller.f_globals
+                    line_number = caller.f_lineno
+                else:
+                    globals = sys.__dict__
+                    line_number= 1                    
+                filename = globals.get('__file__')
+                if filename:
+                    fnl = filename.lower()
+                    if fnl.endswith((".pyc", ".pyo")):
+                        filename = filename[:-1]
+                if filename:
+                    # If there is no filename at all, the user is most likely in a REPL,
+                    # and the warning is not necessary.
+                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
-                    markup_type=markup_type))
+                        markup_type=markup_type
+                    )
+                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)

        self.builder = builder
        self.is_xml = builder.is_xml
@ -302,9 +356,10 @@ class BeautifulSoup(Tag):
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
        """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+        kwattrs.update(attrs)
+        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
--- a/included_dependencies/bs4/builder/__init__.py
+++ b/included_dependencies/bs4/builder/__init__.py
@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
    """

    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                              'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set([
+        # These are from HTML5.
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+        
+        # These are from earlier versions of HTML and are removed in HTML5.
+        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+    ])
    
    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
--- a/included_dependencies/bs4/builder/_htmlparser.py
+++ b/included_dependencies/bs4/builder/_htmlparser.py
@ -1,3 +1,4 @@
+# encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

 # Use of this source code is governed by a BSD-style license that can be
@ -52,7 +53,42 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'

 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+
+    def __init__(self, *args, **kwargs):
+        HTMLParser.__init__(self, *args, **kwargs)
+
+        # Keep a list of empty-element tags that were encountered
+        # without an explicit closing tag. If we encounter a closing tag
+        # of this type, we'll associate it with one of those entries.
+        #
+        # This isn't a stack because we don't care about the
+        # order. It's a list of closing tags we've already handled and
+        # will ignore, assuming they ever show up.
+        self.already_closed_empty_element = []
+
+    def error(self, msg):
+        """In Python 3, HTMLParser subclasses must implement error(), although this
+        requirement doesn't appear to be documented.
+
+        In Python 2, HTMLParser implements error() as raising an exception.
+
+        In any event, this method is called only on very strange markup and our best strategy
+        is to pretend it didn't happen and keep going.
+        """
+        warnings.warn(msg)
+        
+    def handle_startendtag(self, name, attrs):
+        # This is only called when the markup looks like
+        # <tag/>.
+
+        # is_startend() tells handle_starttag not to close the tag
+        # just because its name matches a known empty-element tag. We
+        # know that this is an empty-element tag and we want to call
+        # handle_endtag ourselves.
+        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+        self.handle_endtag(name)
+        
+    def handle_starttag(self, name, attrs, handle_empty_element=True):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
@ -62,9 +98,33 @@ class BeautifulSoupHTMLParser(HTMLParser):
                value = ''
            attr_dict[key] = value
            attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
+        #print "START", name
+        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        if tag and tag.is_empty_element and handle_empty_element:
+            # Unlike other parsers, html.parser doesn't send separate end tag
+            # events for empty-element tags. (It's handled in
+            # handle_startendtag, but only if the original markup looked like
+            # <tag/>.)
+            #
+            # So we need to call handle_endtag() ourselves. Since we
+            # know the start event is identical to the end event, we
+            # don't want handle_endtag() to cross off any previous end
+            # events for tags of this name.
+            self.handle_endtag(name, check_already_closed=False)

-    def handle_endtag(self, name):
+            # But we might encounter an explicit closing tag for this tag
+            # later on. If so, we want to ignore it.
+            self.already_closed_empty_element.append(name)
+            
+    def handle_endtag(self, name, check_already_closed=True):
+        #print "END", name
+        if check_already_closed and name in self.already_closed_empty_element:
+            # This is a redundant end tag for an empty-element tag.
+            # We've already called handle_endtag() for it, so just
+            # check it off the list.
+            # print "ALREADY CLOSED", name
+            self.already_closed_empty_element.remove(name)
+        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
@ -81,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
        else:
            real_name = int(name)

+        data = None
+        if real_name < 256:
+            # HTML numeric entities are supposed to reference Unicode
+            # code points, but sometimes they reference code points in
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
+            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
+            # code tries to detect this situation and compensate.
+            for encoding in (self.soup.original_encoding, 'windows-1252'):
+                if not encoding:
+                    continue
+                try:
+                    data = bytearray([real_name]).decode(encoding)
+                except UnicodeDecodeError, e:
+                    pass
+        if not data:
            try:
                data = unichr(real_name)
            except (ValueError, OverflowError), e:
-            data = u"\N{REPLACEMENT CHARACTER}"
-
+                pass
+        data = data or u"\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
@ -93,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
        if character is not None:
            data = character
        else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
+            # was a character entity reference with a missing
+            # semicolon or the literal string "&foo". Since this is
+            # HTML, we have a complete list of all character entity references,
+            # and this one wasn't found, so assume it's the literal string "&foo".
+            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
@ -165,10 +245,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser.soup = self.soup
        try:
            parser.feed(markup)
+            parser.close()
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
+        parser.already_closed_empty_element = []

 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
--- a/included_dependencies/bs4/builder/_lxml.py
+++ b/included_dependencies/bs4/builder/_lxml.py
@ -5,9 +5,13 @@ __all__ = [
    'LXMLTreeBuilder',
    ]

+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
+
 from io import BytesIO
 from StringIO import StringIO
-import collections
 from lxml import etree
 from bs4.element import (
    Comment,
@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # Use the default parser.
        parser = self.default_parser(encoding)

-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser
@ -147,7 +151,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
                # There are no new namespaces for this tag, but
                # non-default namespaces are in play, so we need a
                # separate tag stack to know when they end.
--- a/included_dependencies/bs4/dammit.py
+++ b/included_dependencies/bs4/dammit.py
@ -46,9 +46,9 @@ except ImportError:
    pass

 xml_encoding_re = re.compile(
-    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

 class EntitySubstitution(object):

@ -82,7 +82,7 @@ class EntitySubstitution(object):
        }

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")
--- a/included_dependencies/bs4/diagnose.py
+++ b/included_dependencies/bs4/diagnose.py
@ -37,7 +37,7 @@ def diagnose(data):
                name)

    if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
+        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@ -56,21 +56,27 @@ def diagnose(data):

    if hasattr(data, 'read'):
        data = data.read()
-    elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
-        with open(data) as fp:
-            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
+    else:
+        try:
+            if os.path.exists(data):
+                print '"%s" looks like a filename. Reading data from the file.' % data
+                with open(data) as fp:
+                    data = fp.read()
+        except ValueError:
+            # This can happen on some platforms when the 'filename' is
+            # too long. Assume it's data and not a filename.
+            pass
        print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
--- a/included_dependencies/bs4/element.py
+++ b/included_dependencies/bs4/element.py
@ -2,7 +2,10 @@
 # found in the LICENSE file.
 __license__ = "MIT"

-import collections
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
 import re
 import shlex
 import sys
@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)

-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")

 def _alias(attr):
    """Alias one attribute name to another for backward compatibility"""
@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    The value of the 'content' attribute will be one of these objects.
    """

-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_xml)

+class Formatter(object):
+    """Contains information about how to format a parse tree."""
+    
+    # By default, represent void elements as <tag/> rather than <tag>
+    void_element_close_prefix = '/'
+
+    def substitute_entities(self, *args, **kwargs):
+        """Transform certain characters into named entities."""
+        raise NotImplementedError()
+
+class HTMLFormatter(Formatter):
+    """The default HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+class MinimalHTMLFormatter(Formatter):
+    """A minimal HTML formatter."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
+    
+class HTML5Formatter(HTMLFormatter):
+    """An HTML formatter that omits the slash in a void tag."""
+    void_element_close_prefix = None
+
+class XMLFormatter(Formatter):
+    """Substitute only the essential XML entities."""
+    def substitute(self, *args, **kwargs):
+        return EntitySubstitution.substitute_xml(*args, **kwargs)
+
+class HTMLXMLFormatter(Formatter):
+    """Format XML using HTML rules."""
+    def substitute(self, *args, **kwargs):
+        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+
+    
 class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""
@ -132,39 +170,48 @@ class PageElement(object):
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
+    # "html5" - The same as "html", but empty void tags are represented as
+    #   <tag> rather than <tag/>
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
-    # A function - This function will be called on every string that
+    # A callable function - it will be called on every string that needs to undergo entity substitution.
+    # A Formatter instance - Formatter.substitute(string) will be called on every string that
    #  needs to undergo entity substitution.
    #

-    # In an HTML document, the default "html" and "minimal" functions
-    # will leave the contents of <script> and <style> tags alone. For
-    # an XML document, all tags will be given the same treatment.
+    # In an HTML document, the default "html", "html5", and "minimal"
+    # functions will leave the contents of <script> and <style> tags
+    # alone. For an XML document, all tags will be given the same
+    # treatment.

    HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        "html" : HTMLFormatter(),
+        "html5" : HTML5Formatter(),
+        "minimal" : MinimalHTMLFormatter(),
        None : None
        }

    XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
-        "minimal" : EntitySubstitution.substitute_xml,
+        "html" : HTMLXMLFormatter(),
+        "minimal" : XMLFormatter(),
        None : None
        }

    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if isinstance(formatter, basestring):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            output = s
        else:
+            if callable(formatter):
+                # Backwards compatibility -- you used to pass in a formatting method.
                output = formatter(s)
+            else:
+                output = formatter.substitute(s)
        return output

    @property
@ -194,11 +241,9 @@ class PageElement(object):
    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        if self._is_xml:
-            return self.XML_FORMATTERS.get(
-                name, EntitySubstitution.substitute_xml)
+            return self.XML_FORMATTERS.get(name, XMLFormatter())
        else:
-            return self.HTML_FORMATTERS.get(
-                name, HTMLAwareEntitySubstitution.substitute_xml)
+            return self.HTML_FORMATTERS.get(name, HTMLFormatter())

    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
@ -316,6 +361,14 @@ class PageElement(object):
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

+        from bs4 import BeautifulSoup
+        if isinstance(new_child, BeautifulSoup):
+            # We don't want to end up with a situation where one BeautifulSoup
+            # object contains another. Insert the children one at a time.
+            for subchild in list(new_child.contents):
+                self.insert(position, subchild)
+                position += 1
+            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
@ -535,9 +588,23 @@ class PageElement(object):
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
+                if name.count(':') == 1:
+                    # This is a name with a prefix. If this is a namespace-aware document,
+                    # we need to match the local name against tag.name. If not,
+                    # we need to match the fully-qualified name against tag.name.
+                    prefix, local_name = name.split(':', 1)
+                else:
+                    prefix = None
+                    local_name = name
                result = (element for element in generator
                          if isinstance(element, Tag)
-                            and element.name == name)
+                          and (
+                              element.name == name
+                          ) or (
+                              element.name == local_name
+                              and (prefix is None or element.prefix == prefix)
+                          )
+                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        while True:
@ -863,7 +930,7 @@ class Tag(PageElement):
        Its contents are a copy of the old Tag's contents.
        """
        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs, is_xml=self._is_xml)
+                           self.prefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -985,6 +1052,13 @@ class Tag(PageElement):
        attribute."""
        return self.attrs.get(key, default)

+    def get_attribute_list(self, key, default=None):
+        """The same as get(), but always returns a list."""
+        value = self.get(key, default)
+        if not isinstance(value, list):
+            value = [value]
+        return value
+    
    def has_attr(self, key):
        return key in self.attrs

@ -1032,8 +1106,10 @@ class Tag(PageElement):
            # BS3: soup.aTag -> "soup.find("a")
            tag_name = tag[:-3]
            warnings.warn(
-                '.%sTag is deprecated, use .find("%s") instead.' % (
-                    tag_name, tag_name))
+                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
+                    name=tag_name
+                )
+            )
            return self.find(tag_name)
        # We special case contents to avoid recursion.
        elif not tag.startswith("__") and not tag == "contents":
@ -1115,11 +1191,10 @@ class Tag(PageElement):
           encoding.
        """

-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
            formatter = self._formatter_for_name(formatter)
-
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
@ -1148,7 +1223,7 @@ class Tag(PageElement):
            prefix = self.prefix + ":"

        if self.is_empty_element:
-            close = '/'
+            close = formatter.void_element_close_prefix or ''
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

@ -1219,9 +1294,9 @@ class Tag(PageElement):
        :param formatter: The output formatter responsible for converting
           entities to Unicode characters.
        """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        pretty_print = (indent_level is not None)
@ -1334,15 +1409,29 @@ class Tag(PageElement):
        # Handle grouping selectors if ',' exists, ie: p,a
        if ',' in selector:
            context = []
-            for partial_selector in selector.split(','):
-                partial_selector = partial_selector.strip()
+            selectors = [x.strip() for x in selector.split(",")]
+
+            # If a selector is mentioned multiple times we don't want
+            # to use it more than once.
+            used_selectors = set()
+
+            # We also don't want to select the same element more than once,
+            # if it's matched by multiple selectors.
+            selected_object_ids = set()
+            for partial_selector in selectors:
                if partial_selector == '':
                    raise ValueError('Invalid group selection syntax: %s' % selector)
+                if partial_selector in used_selectors:
+                    continue
+                used_selectors.add(partial_selector)
                candidates = self.select(partial_selector, limit=limit)
                for candidate in candidates:
-                    if candidate not in context:
+                    # This lets us distinguish between distinct tags that
+                    # represent the same markup.
+                    object_id = id(candidate)
+                    if object_id not in selected_object_ids:
                        context.append(candidate)
-
+                        selected_object_ids.add(object_id)
                if limit and len(context) >= limit:
                    break
            return context
@ -1404,7 +1493,7 @@ class Tag(PageElement):
                if tag_name == '':
                    raise ValueError(
                        "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                found = []
                if pseudo_attributes is None:
                    pseudo_type = pseudo
@ -1638,7 +1727,7 @@ class SoupStrainer(object):
            markup = markup_name
            markup_attrs = markup
        call_function_with_tag_data = (
-            isinstance(self.name, collections.Callable)
+            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
@ -1698,7 +1787,7 @@ class SoupStrainer(object):
                "I don't know how to match against a %s" % markup.__class__)
        return found

-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
        # print u"Matching %s against %s" % (markup, match_against)
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
@ -1718,11 +1807,12 @@ class SoupStrainer(object):
            # True matches any non-None value.
            return markup is not None

-        if isinstance(match_against, collections.Callable):
+        if isinstance(match_against, Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
+        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name

@ -1733,18 +1823,51 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

-        if isinstance(match_against, unicode):
-            # Exact string match
-            return markup == match_against
+        if (hasattr(match_against, '__iter__')
+            and not isinstance(match_against, basestring)):
+            # We're asked to match against an iterable of items.
+            # The markup must match at least one item in the
+            # iterable. We'll try each one in turn.
+            #
+            # To avoid infinite recursion we need to keep track of
+            # items we've already seen.
+            if not already_tried:
+                already_tried = set()
+            for item in match_against:
+                if item.__hash__:
+                    key = item
+                else:
+                    key = id(item)
+                if key in already_tried:
+                    continue
+                else:
+                    already_tried.add(key)
+                    if self._matches(original_markup, item, already_tried):
+                        return True
+            else:
+                return False
        
-        if hasattr(match_against, 'match'):
+        # Beyond this point we might need to run the test twice: once against
+        # the tag's name and once against its prefixed name.
+        match = False
+        
+        if not match and isinstance(match_against, unicode):
+            # Exact string match
+            match = markup == match_against
+
+        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)

-        if hasattr(match_against, '__iter__'):
-            # The markup must be an exact match against something
-            # in the iterable.
-            return markup in match_against
+        if (not match
+            and isinstance(original_markup, Tag)
+            and original_markup.prefix):
+            # Try the whole thing again with the prefixed tag name.
+            return self._matches(
+                original_markup.prefix + ':' + original_markup.name, match_against
+            )
+
+        return match


 class ResultSet(list):
@ -1753,3 +1876,8 @@ class ResultSet(list):
    def __init__(self, source, result=()):
        super(ResultSet, self).__init__(result)
        self.source = source
+
+    def __getattr__(self, key):
+        raise AttributeError(
+            "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+        )