Update included_dependencies to beautifulsoup4-4.6.1

2025-12-06 08:52:55 +01:00 · 2018-08-08 10:05:48 -05:00 · 2018-08-08 10:05:48 -05:00 · 67698baf11
commit 67698baf11
parent 5be511916b
7 changed files with 374 additions and 94 deletions
--- a/included_dependencies/bs4/init.py
+++ b/included_dependencies/bs4/init.py
@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 # found in the LICENSE file.
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.5.3"
+__version__ = "4.6.1"
-__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
+__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
 __license__ = "MIT"
 __all__ = ['BeautifulSoup']
 import os
 import re
 import sys
 import traceback
 import warnings
@ -82,14 +83,46 @@ class BeautifulSoup(Tag):
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
+        """Constructor.
-        provided markup (which can be a string or a file-like object)
+
-        is fed into the underlying parser."""
+        :param markup: A string or a file-like object representing
        markup to be parsed.
        :param features: Desirable features of the parser to be used. This
        may be the name of a specific parser ("lxml", "lxml-xml",
        "html.parser", or "html5lib") or it may be the type of markup
        to be used ("html", "html5", "xml"). It's recommended that you
        name a specific parser, so that Beautiful Soup gives you the
        same results across platforms and virtual environments.
        :param builder: A specific TreeBuilder to use instead of looking one
        up based on `features`. You shouldn't need to use this.
        :param parse_only: A SoupStrainer. Only parts of the document
        matching the SoupStrainer will be considered. This is useful
        when parsing part of a document that would otherwise be too
        large to fit into memory.
        :param from_encoding: A string indicating the encoding of the
        document to be parsed. Pass this in if Beautiful Soup is
        guessing wrongly about the document's encoding.
        :param exclude_encodings: A list of strings indicating
        encodings known to be wrong. Pass this in if you don't know
        the document's encoding but you know Beautiful Soup's guess is
        wrong.
        :param kwargs: For backwards compatibility purposes, the
        constructor accepts certain keyword arguments used in
        Beautiful Soup 3. None of these arguments do anything in
        Beautiful Soup 4 and there's no need to actually pass keyword
        arguments into the constructor.
        """
        if 'convertEntities' in kwargs:
            warnings.warn(
@ -171,14 +204,35 @@ class BeautifulSoup(Tag):
                else:
                    markup_type = "HTML"
-                caller = traceback.extract_stack()[0]
+                # This code adapted from warnings.py so that we get the same line
-                filename = caller[0]
+                # of code as our warnings.warn() call gets, even if the answer is wrong
-                line_number = caller[1]
+                # (as it may be in a multithreading situation).
-                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                caller = None
-                    filename=filename,
+                try:
-                    line_number=line_number,
+                    caller = sys._getframe(1)
-                    parser=builder.NAME,
+                except ValueError:
-                    markup_type=markup_type))
+                    pass
                if caller:
                    globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    globals = sys.__dict__
                    line_number= 1                    
                filename = globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
        self.builder = builder
        self.is_xml = builder.is_xml
@ -215,8 +269,8 @@ class BeautifulSoup(Tag):
                    markup = markup.encode("utf8")
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
-                    'probably open this file and pass the filehandle into'
+                    ' probably open this file and pass the filehandle into'
-                    'Beautiful Soup.' % markup)
+                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)
        for (self.markup, self.original_encoding, self.declared_html_encoding,
@ -302,9 +356,10 @@ class BeautifulSoup(Tag):
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)
-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
        """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+        kwattrs.update(attrs)
        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
--- a/included_dependencies/bs4/builder/init.py
+++ b/included_dependencies/bs4/builder/init.py
@ -93,7 +93,7 @@ class TreeBuilder(object):
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
-
+    
    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}
@ -125,7 +125,7 @@ class TreeBuilder(object):
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
-
+        
    def feed(self, markup):
        raise NotImplementedError()
@ -232,9 +232,14 @@ class HTMLTreeBuilder(TreeBuilder):
    """
    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+    empty_element_tags = set([
-                              'spacer', 'link', 'frame', 'base'])
+        # These are from HTML5.
-
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
    ])
    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
--- a/included_dependencies/bs4/builder/_htmlparser.py
+++ b/included_dependencies/bs4/builder/_htmlparser.py
@ -1,3 +1,4 @@
 # encoding: utf-8
 """Use the HTMLParser library to parse HTML files that aren't too bad."""
 # Use of this source code is governed by a BSD-style license that can be
@ -52,7 +53,42 @@ from bs4.builder import (
 HTMLPARSER = 'html.parser'
 class BeautifulSoupHTMLParser(HTMLParser):
-    def handle_starttag(self, name, attrs):
+
    def __init__(self, *args, **kwargs):
        HTMLParser.__init__(self, *args, **kwargs)
        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []
    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although this
        requirement doesn't appear to be documented.
        In Python 2, HTMLParser implements error() as raising an exception.
        In any event, this method is called only on very strange markup and our best strategy
        is to pretend it didn't happen and keep going.
        """
        warnings.warn(msg)
    def handle_startendtag(self, name, attrs):
        # This is only called when the markup looks like
        # <tag/>.
        # handle_empty_element=False tells handle_starttag not to close the tag
        # just because its name matches a known empty-element tag. We
        # know that this is an empty-element tag and we want to call
        # handle_endtag ourselves.
        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)
    def handle_starttag(self, name, attrs, handle_empty_element=True):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
@ -62,10 +98,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
                value = ''
            attr_dict[key] = value
            attrvalue = '""'
-        self.soup.handle_starttag(name, None, None, attr_dict)
+        #print "START", name
        tag = self.soup.handle_starttag(name, None, None, attr_dict)
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)
-    def handle_endtag(self, name):
+            # But we might encounter an explicit closing tag for this tag
-        self.soup.handle_endtag(name)
+            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)
    def handle_endtag(self, name, check_already_closed=True):
        #print "END", name
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            # print "ALREADY CLOSED", name
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)
    def handle_data(self, data):
        self.soup.handle_data(data)
@ -81,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
        else:
            real_name = int(name)
-        try:
+        data = None
-            data = unichr(real_name)
+        if real_name < 256:
-        except (ValueError, OverflowError), e:
+            # HTML numeric entities are supposed to reference Unicode
-            data = u"\N{REPLACEMENT CHARACTER}"
+            # code points, but sometimes they reference code points in
-
+            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError, e:
                    pass
        if not data:
            try:
                data = unichr(real_name)
            except (ValueError, OverflowError), e:
                pass
        data = data or u"\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)
    def handle_entityref(self, name):
@ -93,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
        if character is not None:
            data = character
        else:
-            data = "&%s;" % name
+            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)
    def handle_comment(self, data):
@ -165,10 +245,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
        parser.already_closed_empty_element = []
 # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
 # 3.2.3 code. This ensures they don't treat markup like <p></p> as a
--- a/included_dependencies/bs4/builder/_lxml.py
+++ b/included_dependencies/bs4/builder/_lxml.py
@ -5,9 +5,13 @@ __all__ = [
    'LXMLTreeBuilder',
    ]
 try:
    from collections.abc import Callable # Python 3.3+
 except ImportError , e:
    from collections import Callable
 from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
 from bs4.element import (
    Comment,
@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        # Use the default parser.
        parser = self.default_parser(encoding)
-        if isinstance(parser, collections.Callable):
+        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser
@ -147,11 +151,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
-        if len(self.nsmaps) > 1:
+        if len(nsmap) == 0 and len(self.nsmaps) > 1:
-            # There are no new namespaces for this tag, but
+                # There are no new namespaces for this tag, but
-            # non-default namespaces are in play, so we need a
+                # non-default namespaces are in play, so we need a
-            # separate tag stack to know when they end.
+                # separate tag stack to know when they end.
-            self.nsmaps.append(None)
+                self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
--- a/included_dependencies/bs4/dammit.py
+++ b/included_dependencies/bs4/dammit.py
@ -46,9 +46,9 @@ except ImportError:
    pass
 xml_encoding_re = re.compile(
-    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
 html_meta_re = re.compile(
-    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 class EntitySubstitution(object):
@ -82,7 +82,7 @@ class EntitySubstitution(object):
        }
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
-                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")
--- a/included_dependencies/bs4/diagnose.py
+++ b/included_dependencies/bs4/diagnose.py
@ -37,7 +37,7 @@ def diagnose(data):
                name)
    if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
+        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
@ -56,21 +56,27 @@ def diagnose(data):
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
-    print
+    else:
        try:
            if os.path.exists(data):
                print '"%s" looks like a filename. Reading data from the file.' % data
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print
    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
--- a/included_dependencies/bs4/element.py
+++ b/included_dependencies/bs4/element.py
@ -2,7 +2,10 @@
 # found in the LICENSE file.
 __license__ = "MIT"
-import collections
+try:
    from collections.abc import Callable # Python 3.3+
 except ImportError , e:
    from collections import Callable
 import re
 import shlex
 import sys
@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile("\s+")
+whitespace_re = re.compile(r"\s+")
 def _alias(attr):
    """Alias one attribute name to another for backward compatibility"""
@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    The value of the 'content' attribute will be one of these objects.
    """
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_xml)
 class Formatter(object):
    """Contains information about how to format a parse tree."""
    # By default, represent void elements as <tag/> rather than <tag>
    void_element_close_prefix = '/'
    def substitute_entities(self, *args, **kwargs):
        """Transform certain characters into named entities."""
        raise NotImplementedError()
 class HTMLFormatter(Formatter):
    """The default HTML formatter."""
    def substitute(self, *args, **kwargs):
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
 class MinimalHTMLFormatter(Formatter):
    """A minimal HTML formatter."""
    def substitute(self, *args, **kwargs):
        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
 class HTML5Formatter(HTMLFormatter):
    """An HTML formatter that omits the slash in a void tag."""
    void_element_close_prefix = None
 class XMLFormatter(Formatter):
    """Substitute only the essential XML entities."""
    def substitute(self, *args, **kwargs):
        return EntitySubstitution.substitute_xml(*args, **kwargs)
 class HTMLXMLFormatter(Formatter):
    """Format XML using HTML rules."""
    def substitute(self, *args, **kwargs):
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
 class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""
@ -132,39 +170,48 @@ class PageElement(object):
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "html5" - The same as "html", but empty void tags are represented as
    #   <tag> rather than <tag/>
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
-    # A function - This function will be called on every string that
+    # A callable function - it will be called on every string that needs to undergo entity substitution.
    # A Formatter instance - Formatter.substitute(string) will be called on every string that
    #  needs to undergo entity substitution.
    #
-    # In an HTML document, the default "html" and "minimal" functions
+    # In an HTML document, the default "html", "html5", and "minimal"
-    # will leave the contents of <script> and <style> tags alone. For
+    # functions will leave the contents of <script> and <style> tags
-    # an XML document, all tags will be given the same treatment.
+    # alone. For an XML document, all tags will be given the same
    # treatment.
    HTML_FORMATTERS = {
-        "html" : HTMLAwareEntitySubstitution.substitute_html,
+        "html" : HTMLFormatter(),
-        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+        "html5" : HTML5Formatter(),
        "minimal" : MinimalHTMLFormatter(),
        None : None
        }
    XML_FORMATTERS = {
-        "html" : EntitySubstitution.substitute_html,
+        "html" : HTMLXMLFormatter(),
-        "minimal" : EntitySubstitution.substitute_xml,
+        "minimal" : XMLFormatter(),
        None : None
        }
    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
-        if not callable(formatter):
+        if isinstance(formatter, basestring):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            output = s
        else:
-            output = formatter(s)
+            if callable(formatter):
                # Backwards compatibility -- you used to pass in a formatting method.
                output = formatter(s)
            else:
                output = formatter.substitute(s)
        return output
    @property
@ -194,11 +241,9 @@ class PageElement(object):
    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        if self._is_xml:
-            return self.XML_FORMATTERS.get(
+            return self.XML_FORMATTERS.get(name, XMLFormatter())
                name, EntitySubstitution.substitute_xml)
        else:
-            return self.HTML_FORMATTERS.get(
+            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
                name, HTMLAwareEntitySubstitution.substitute_xml)
    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
@ -316,6 +361,14 @@ class PageElement(object):
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
@ -535,9 +588,23 @@ class PageElement(object):
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
                if name.count(':') == 1:
                    # This is a name with a prefix. If this is a namespace-aware document,
                    # we need to match the local name against tag.name. If not,
                    # we need to match the fully-qualified name against tag.name.
                    prefix, local_name = name.split(':', 1)
                else:
                    prefix = None
                    local_name = name
                result = (element for element in generator
                          if isinstance(element, Tag)
-                            and element.name == name)
+                          and (
                              element.name == name
                          ) or (
                              element.name == local_name
                              and (prefix is None or element.prefix == prefix)
                          )
                )
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        while True:
@ -855,7 +922,7 @@ class Tag(PageElement):
            self.can_be_empty_element = builder.can_be_empty_element(name)
        else:
            self.can_be_empty_element = False
-
+            
    parserClass = _alias("parser_class")  # BS3
    def __copy__(self):
@ -863,7 +930,7 @@ class Tag(PageElement):
        Its contents are a copy of the old Tag's contents.
        """
        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs, is_xml=self._is_xml)
+                           self.prefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -985,6 +1052,13 @@ class Tag(PageElement):
        attribute."""
        return self.attrs.get(key, default)
    def get_attribute_list(self, key, default=None):
        """The same as get(), but always returns a list."""
        value = self.get(key, default)
        if not isinstance(value, list):
            value = [value]
        return value
    def has_attr(self, key):
        return key in self.attrs
@ -1032,8 +1106,10 @@ class Tag(PageElement):
            # BS3: soup.aTag -> soup.find("a")
            tag_name = tag[:-3]
            warnings.warn(
-                '.%sTag is deprecated, use .find("%s") instead.' % (
+                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
-                    tag_name, tag_name))
+                    name=tag_name
                )
            )
            return self.find(tag_name)
        # We special case contents to avoid recursion.
        elif not tag.startswith("__") and not tag == "contents":
@ -1115,11 +1191,10 @@ class Tag(PageElement):
           encoding.
        """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
@ -1148,7 +1223,7 @@ class Tag(PageElement):
            prefix = self.prefix + ":"
        if self.is_empty_element:
-            close = '/'
+            close = formatter.void_element_close_prefix or ''
        else:
            closeTag = '</%s%s>' % (prefix, self.name)
@ -1219,9 +1294,9 @@ class Tag(PageElement):
        :param formatter: The output formatter responsible for converting
           entities to Unicode characters.
        """
-        # First off, turn a string formatter into a function. This
+        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not callable(formatter):
+        if not isinstance(formatter, Formatter) and not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        pretty_print = (indent_level is not None)
@ -1334,15 +1409,29 @@ class Tag(PageElement):
        # Handle grouping selectors if ',' exists, ie: p,a
        if ',' in selector:
            context = []
-            for partial_selector in selector.split(','):
+            selectors = [x.strip() for x in selector.split(",")]
-                partial_selector = partial_selector.strip()
+
            # If a selector is mentioned multiple times we don't want
            # to use it more than once.
            used_selectors = set()
            # We also don't want to select the same element more than once,
            # if it's matched by multiple selectors.
            selected_object_ids = set()
            for partial_selector in selectors:
                if partial_selector == '':
                    raise ValueError('Invalid group selection syntax: %s' % selector)
                if partial_selector in used_selectors:
                    continue
                used_selectors.add(partial_selector)
                candidates = self.select(partial_selector, limit=limit)
                for candidate in candidates:
-                    if candidate not in context:
+                    # This lets us distinguish between distinct tags that
                    # represent the same markup.
                    object_id = id(candidate)
                    if object_id not in selected_object_ids:
                        context.append(candidate)
-
+                        selected_object_ids.add(object_id)
                if limit and len(context) >= limit:
                    break
            return context
@ -1404,7 +1493,7 @@ class Tag(PageElement):
                if tag_name == '':
                    raise ValueError(
                        "A pseudo-class must be prefixed with a tag name.")
-                pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
+                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                found = []
                if pseudo_attributes is None:
                    pseudo_type = pseudo
@ -1638,7 +1727,7 @@ class SoupStrainer(object):
            markup = markup_name
            markup_attrs = markup
        call_function_with_tag_data = (
-            isinstance(self.name, collections.Callable)
+            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))
        if ((not self.name)
@ -1698,7 +1787,7 @@ class SoupStrainer(object):
                "I don't know how to match against a %s" % markup.__class__)
        return found
-    def _matches(self, markup, match_against):
+    def _matches(self, markup, match_against, already_tried=None):
        # print u"Matching %s against %s" % (markup, match_against)
        result = False
        if isinstance(markup, list) or isinstance(markup, tuple):
@ -1713,16 +1802,17 @@ class SoupStrainer(object):
            if self._matches(' '.join(markup), match_against):
                return True
            return False
-
+        
        if match_against is True:
            # True matches any non-None value.
            return markup is not None
-        if isinstance(match_against, collections.Callable):
+        if isinstance(match_against, Callable):
            return match_against(markup)
        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        original_markup = markup
        if isinstance(markup, Tag):
            markup = markup.name
@ -1733,18 +1823,51 @@ class SoupStrainer(object):
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against
-        if isinstance(match_against, unicode):
+        if (hasattr(match_against, '__iter__')
            and not isinstance(match_against, basestring)):
            # We're asked to match against an iterable of items.
            # The markup must match at least one item in the
            # iterable. We'll try each one in turn.
            #
            # To avoid infinite recursion we need to keep track of
            # items we've already seen.
            if not already_tried:
                already_tried = set()
            for item in match_against:
                if item.__hash__:
                    key = item
                else:
                    key = id(item)
                if key in already_tried:
                    continue
                else:
                    already_tried.add(key)
                    if self._matches(original_markup, item, already_tried):
                        return True
            else:
                return False
        # Beyond this point we might need to run the test twice: once against
        # the tag's name and once against its prefixed name.
        match = False
        if not match and isinstance(match_against, unicode):
            # Exact string match
-            return markup == match_against
+            match = markup == match_against
-        if hasattr(match_against, 'match'):
+        if not match and hasattr(match_against, 'search'):
            # Regexp match
            return match_against.search(markup)
-        if hasattr(match_against, '__iter__'):
+        if (not match
-            # The markup must be an exact match against something
+            and isinstance(original_markup, Tag)
-            # in the iterable.
+            and original_markup.prefix):
-            return markup in match_against
+            # Try the whole thing again with the prefixed tag name.
            return self._matches(
                original_markup.prefix + ':' + original_markup.name, match_against
            )
        return match
 class ResultSet(list):
@ -1753,3 +1876,8 @@ class ResultSet(list):
    def __init__(self, source, result=()):
        """Build the list from `result` and remember where it came from.

        :param source: Stored unchanged on the instance as `self.source`.
            Presumably the object that produced these results; confirm
            against the callers.
        :param result: Iterable of items used to populate the list;
            defaults to an empty tuple, giving an empty ResultSet.
        """
        # Explicit two-argument super() keeps the call valid on Python 2.
        super(ResultSet, self).__init__(result)
        self.source = source
    def __getattr__(self, key):
        """Raise a descriptive AttributeError for an unknown attribute.

        Python only calls `__getattr__` after normal attribute lookup has
        failed, so real list attributes and `self.source` are unaffected.

        :param key: Name of the attribute that could not be found.
        :raises AttributeError: Always. The message hints that the caller
            may be treating a list of results as a single item.
        """
        raise AttributeError(
            "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
        )