mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Update included_dependencies to beautifulsoup4-4.6.1
This commit is contained in:
parent
5be511916b
commit
67698baf11
7 changed files with 374 additions and 94 deletions
|
|
@ -21,14 +21,15 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
# found in the LICENSE file.
|
# found in the LICENSE file.
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.5.3"
|
__version__ = "4.6.1"
|
||||||
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
|
@ -82,14 +83,46 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
"""Constructor.
|
||||||
provided markup (which can be a string or a file-like object)
|
|
||||||
is fed into the underlying parser."""
|
:param markup: A string or a file-like object representing
|
||||||
|
markup to be parsed.
|
||||||
|
|
||||||
|
:param features: Desirable features of the parser to be used. This
|
||||||
|
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||||
|
"html.parser", or "html5lib") or it may be the type of markup
|
||||||
|
to be used ("html", "html5", "xml"). It's recommended that you
|
||||||
|
name a specific parser, so that Beautiful Soup gives you the
|
||||||
|
same results across platforms and virtual environments.
|
||||||
|
|
||||||
|
:param builder: A specific TreeBuilder to use instead of looking one
|
||||||
|
up based on `features`. You shouldn't need to use this.
|
||||||
|
|
||||||
|
:param parse_only: A SoupStrainer. Only parts of the document
|
||||||
|
matching the SoupStrainer will be considered. This is useful
|
||||||
|
when parsing part of a document that would otherwise be too
|
||||||
|
large to fit into memory.
|
||||||
|
|
||||||
|
:param from_encoding: A string indicating the encoding of the
|
||||||
|
document to be parsed. Pass this in if Beautiful Soup is
|
||||||
|
guessing wrongly about the document's encoding.
|
||||||
|
|
||||||
|
:param exclude_encodings: A list of strings indicating
|
||||||
|
encodings known to be wrong. Pass this in if you don't know
|
||||||
|
the document's encoding but you know Beautiful Soup's guess is
|
||||||
|
wrong.
|
||||||
|
|
||||||
|
:param kwargs: For backwards compatibility purposes, the
|
||||||
|
constructor accepts certain keyword arguments used in
|
||||||
|
Beautiful Soup 3. None of these arguments do anything in
|
||||||
|
Beautiful Soup 4 and there's no need to actually pass keyword
|
||||||
|
arguments into the constructor.
|
||||||
|
"""
|
||||||
|
|
||||||
if 'convertEntities' in kwargs:
|
if 'convertEntities' in kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
|
|
@ -171,14 +204,35 @@ class BeautifulSoup(Tag):
|
||||||
else:
|
else:
|
||||||
markup_type = "HTML"
|
markup_type = "HTML"
|
||||||
|
|
||||||
caller = traceback.extract_stack()[0]
|
# This code adapted from warnings.py so that we get the same line
|
||||||
filename = caller[0]
|
# of code as our warnings.warn() call gets, even if the answer is wrong
|
||||||
line_number = caller[1]
|
# (as it may be in a multithreading situation).
|
||||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
caller = None
|
||||||
filename=filename,
|
try:
|
||||||
line_number=line_number,
|
caller = sys._getframe(1)
|
||||||
parser=builder.NAME,
|
except ValueError:
|
||||||
markup_type=markup_type))
|
pass
|
||||||
|
if caller:
|
||||||
|
globals = caller.f_globals
|
||||||
|
line_number = caller.f_lineno
|
||||||
|
else:
|
||||||
|
globals = sys.__dict__
|
||||||
|
line_number= 1
|
||||||
|
filename = globals.get('__file__')
|
||||||
|
if filename:
|
||||||
|
fnl = filename.lower()
|
||||||
|
if fnl.endswith((".pyc", ".pyo")):
|
||||||
|
filename = filename[:-1]
|
||||||
|
if filename:
|
||||||
|
# If there is no filename at all, the user is most likely in a REPL,
|
||||||
|
# and the warning is not necessary.
|
||||||
|
values = dict(
|
||||||
|
filename=filename,
|
||||||
|
line_number=line_number,
|
||||||
|
parser=builder.NAME,
|
||||||
|
markup_type=markup_type
|
||||||
|
)
|
||||||
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
|
|
@ -215,8 +269,8 @@ class BeautifulSoup(Tag):
|
||||||
markup = markup.encode("utf8")
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should'
|
'"%s" looks like a filename, not markup. You should'
|
||||||
'probably open this file and pass the filehandle into'
|
' probably open this file and pass the filehandle into'
|
||||||
'Beautiful Soup.' % markup)
|
' Beautiful Soup.' % markup)
|
||||||
self._check_markup_is_url(markup)
|
self._check_markup_is_url(markup)
|
||||||
|
|
||||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||||
|
|
@ -302,9 +356,10 @@ class BeautifulSoup(Tag):
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new tag associated with this soup."""
|
||||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
kwattrs.update(attrs)
|
||||||
|
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
|
||||||
|
|
||||||
def new_string(self, s, subclass=NavigableString):
|
def new_string(self, s, subclass=NavigableString):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
|
|
|
||||||
|
|
@ -93,7 +93,7 @@ class TreeBuilder(object):
|
||||||
preserve_whitespace_tags = set()
|
preserve_whitespace_tags = set()
|
||||||
empty_element_tags = None # A tag will be considered an empty-element
|
empty_element_tags = None # A tag will be considered an empty-element
|
||||||
# tag when and only when it has no contents.
|
# tag when and only when it has no contents.
|
||||||
|
|
||||||
# A value for these tag/attribute combinations is a space- or
|
# A value for these tag/attribute combinations is a space- or
|
||||||
# comma-separated list of CDATA, rather than a single CDATA.
|
# comma-separated list of CDATA, rather than a single CDATA.
|
||||||
cdata_list_attributes = {}
|
cdata_list_attributes = {}
|
||||||
|
|
@ -125,7 +125,7 @@ class TreeBuilder(object):
|
||||||
if self.empty_element_tags is None:
|
if self.empty_element_tags is None:
|
||||||
return True
|
return True
|
||||||
return tag_name in self.empty_element_tags
|
return tag_name in self.empty_element_tags
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
@ -232,9 +232,14 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
empty_element_tags = set([
|
||||||
'spacer', 'link', 'frame', 'base'])
|
# These are from HTML5.
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
|
||||||
|
# These are from earlier versions of HTML and are removed in HTML5.
|
||||||
|
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||||
|
])
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
# class="foo bar" means that the 'class' attribute has two values,
|
# class="foo bar" means that the 'class' attribute has two values,
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
|
|
@ -52,7 +53,42 @@ from bs4.builder import (
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
def handle_starttag(self, name, attrs):
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
|
# Keep a list of empty-element tags that were encountered
|
||||||
|
# without an explicit closing tag. If we encounter a closing tag
|
||||||
|
# of this type, we'll associate it with one of those entries.
|
||||||
|
#
|
||||||
|
# This isn't a stack because we don't care about the
|
||||||
|
# order. It's a list of closing tags we've already handled and
|
||||||
|
# will ignore, assuming they ever show up.
|
||||||
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
|
def error(self, msg):
|
||||||
|
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
||||||
|
requirement doesn't appear to be documented.
|
||||||
|
|
||||||
|
In Python 2, HTMLParser implements error() as raising an exception.
|
||||||
|
|
||||||
|
In any event, this method is called only on very strange markup and our best strategy
|
||||||
|
is to pretend it didn't happen and keep going.
|
||||||
|
"""
|
||||||
|
warnings.warn(msg)
|
||||||
|
|
||||||
|
def handle_startendtag(self, name, attrs):
|
||||||
|
# This is only called when the markup looks like
|
||||||
|
# <tag/>.
|
||||||
|
|
||||||
|
# handle_empty_element=False tells handle_starttag not to close the tag
|
||||||
|
# just because its name matches a known empty-element tag. We
|
||||||
|
# know that this is an empty-element tag and we want to call
|
||||||
|
# handle_endtag ourselves.
|
||||||
|
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||||
|
self.handle_endtag(name)
|
||||||
|
|
||||||
|
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||||
# XXX namespace
|
# XXX namespace
|
||||||
attr_dict = {}
|
attr_dict = {}
|
||||||
for key, value in attrs:
|
for key, value in attrs:
|
||||||
|
|
@ -62,10 +98,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
value = ''
|
value = ''
|
||||||
attr_dict[key] = value
|
attr_dict[key] = value
|
||||||
attrvalue = '""'
|
attrvalue = '""'
|
||||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
#print "START", name
|
||||||
|
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||||
|
if tag and tag.is_empty_element and handle_empty_element:
|
||||||
|
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||||
|
# events for empty-element tags. (It's handled in
|
||||||
|
# handle_startendtag, but only if the original markup looked like
|
||||||
|
# <tag/>.)
|
||||||
|
#
|
||||||
|
# So we need to call handle_endtag() ourselves. Since we
|
||||||
|
# know the start event is identical to the end event, we
|
||||||
|
# don't want handle_endtag() to cross off any previous end
|
||||||
|
# events for tags of this name.
|
||||||
|
self.handle_endtag(name, check_already_closed=False)
|
||||||
|
|
||||||
def handle_endtag(self, name):
|
# But we might encounter an explicit closing tag for this tag
|
||||||
self.soup.handle_endtag(name)
|
# later on. If so, we want to ignore it.
|
||||||
|
self.already_closed_empty_element.append(name)
|
||||||
|
|
||||||
|
def handle_endtag(self, name, check_already_closed=True):
|
||||||
|
#print "END", name
|
||||||
|
if check_already_closed and name in self.already_closed_empty_element:
|
||||||
|
# This is a redundant end tag for an empty-element tag.
|
||||||
|
# We've already called handle_endtag() for it, so just
|
||||||
|
# check it off the list.
|
||||||
|
# print "ALREADY CLOSED", name
|
||||||
|
self.already_closed_empty_element.remove(name)
|
||||||
|
else:
|
||||||
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
self.soup.handle_data(data)
|
self.soup.handle_data(data)
|
||||||
|
|
@ -81,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
else:
|
else:
|
||||||
real_name = int(name)
|
real_name = int(name)
|
||||||
|
|
||||||
try:
|
data = None
|
||||||
data = unichr(real_name)
|
if real_name < 256:
|
||||||
except (ValueError, OverflowError), e:
|
# HTML numeric entities are supposed to reference Unicode
|
||||||
data = u"\N{REPLACEMENT CHARACTER}"
|
# code points, but sometimes they reference code points in
|
||||||
|
# some other encoding (ahem, Windows-1252). E.g. &#147;
|
||||||
|
# instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
|
||||||
|
# code tries to detect this situation and compensate.
|
||||||
|
for encoding in (self.soup.original_encoding, 'windows-1252'):
|
||||||
|
if not encoding:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
data = bytearray([real_name]).decode(encoding)
|
||||||
|
except UnicodeDecodeError, e:
|
||||||
|
pass
|
||||||
|
if not data:
|
||||||
|
try:
|
||||||
|
data = unichr(real_name)
|
||||||
|
except (ValueError, OverflowError), e:
|
||||||
|
pass
|
||||||
|
data = data or u"\N{REPLACEMENT CHARACTER}"
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
|
@ -93,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
if character is not None:
|
if character is not None:
|
||||||
data = character
|
data = character
|
||||||
else:
|
else:
|
||||||
data = "&%s;" % name
|
# If this were XML, it would be ambiguous whether "&foo"
|
||||||
|
# was a character entity reference with a missing
|
||||||
|
# semicolon or the literal string "&foo". Since this is
|
||||||
|
# HTML, we have a complete list of all character entity references,
|
||||||
|
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||||
|
data = "&%s" % name
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
|
|
@ -165,10 +245,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
try:
|
try:
|
||||||
parser.feed(markup)
|
parser.feed(markup)
|
||||||
|
parser.close()
|
||||||
except HTMLParseError, e:
|
except HTMLParseError, e:
|
||||||
warnings.warn(RuntimeWarning(
|
warnings.warn(RuntimeWarning(
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
raise e
|
raise e
|
||||||
|
parser.already_closed_empty_element = []
|
||||||
|
|
||||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,13 @@ __all__ = [
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections.abc import Callable # Python 3.6
|
||||||
|
except ImportError , e:
|
||||||
|
from collections import Callable
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
import collections
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
|
|
@ -58,7 +62,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# Use the default parser.
|
# Use the default parser.
|
||||||
parser = self.default_parser(encoding)
|
parser = self.default_parser(encoding)
|
||||||
|
|
||||||
if isinstance(parser, collections.Callable):
|
if isinstance(parser, Callable):
|
||||||
# Instantiate the parser with default arguments
|
# Instantiate the parser with default arguments
|
||||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||||
return parser
|
return parser
|
||||||
|
|
@ -147,11 +151,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
nsprefix = None
|
nsprefix = None
|
||||||
# Invert each namespace map as it comes in.
|
# Invert each namespace map as it comes in.
|
||||||
if len(self.nsmaps) > 1:
|
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||||
# There are no new namespaces for this tag, but
|
# There are no new namespaces for this tag, but
|
||||||
# non-default namespaces are in play, so we need a
|
# non-default namespaces are in play, so we need a
|
||||||
# separate tag stack to know when they end.
|
# separate tag stack to know when they end.
|
||||||
self.nsmaps.append(None)
|
self.nsmaps.append(None)
|
||||||
elif len(nsmap) > 0:
|
elif len(nsmap) > 0:
|
||||||
# A new namespace mapping has come into play.
|
# A new namespace mapping has come into play.
|
||||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||||
|
|
|
||||||
|
|
@ -46,9 +46,9 @@ except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
xml_encoding_re = re.compile(
|
xml_encoding_re = re.compile(
|
||||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
|
||||||
html_meta_re = re.compile(
|
html_meta_re = re.compile(
|
||||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||||
|
|
||||||
class EntitySubstitution(object):
|
class EntitySubstitution(object):
|
||||||
|
|
||||||
|
|
@ -82,7 +82,7 @@ class EntitySubstitution(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
|
||||||
")")
|
")")
|
||||||
|
|
||||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ def diagnose(data):
|
||||||
name)
|
name)
|
||||||
|
|
||||||
if 'lxml' in basic_parsers:
|
if 'lxml' in basic_parsers:
|
||||||
basic_parsers.append(["lxml", "xml"])
|
basic_parsers.append("lxml-xml")
|
||||||
try:
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||||
|
|
@ -56,21 +56,27 @@ def diagnose(data):
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif os.path.exists(data):
|
|
||||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
|
||||||
with open(data) as fp:
|
|
||||||
data = fp.read()
|
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||||
return
|
return
|
||||||
print
|
else:
|
||||||
|
try:
|
||||||
|
if os.path.exists(data):
|
||||||
|
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||||
|
with open(data) as fp:
|
||||||
|
data = fp.read()
|
||||||
|
except ValueError:
|
||||||
|
# This can happen on some platforms when the 'filename' is
|
||||||
|
# too long. Assume it's data and not a filename.
|
||||||
|
pass
|
||||||
|
print
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print "Trying to parse your markup with %s" % parser
|
print "Trying to parse your markup with %s" % parser
|
||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, features=parser)
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
print "%s could not parse the markup." % parser
|
print "%s could not parse the markup." % parser
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,10 @@
|
||||||
# found in the LICENSE file.
|
# found in the LICENSE file.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
import collections
|
try:
|
||||||
|
from collections.abc import Callable # Python 3.6
|
||||||
|
except ImportError , e:
|
||||||
|
from collections import Callable
|
||||||
import re
|
import re
|
||||||
import shlex
|
import shlex
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -12,7 +15,7 @@ from bs4.dammit import EntitySubstitution
|
||||||
DEFAULT_OUTPUT_ENCODING = "utf-8"
|
DEFAULT_OUTPUT_ENCODING = "utf-8"
|
||||||
PY3K = (sys.version_info[0] > 2)
|
PY3K = (sys.version_info[0] > 2)
|
||||||
|
|
||||||
whitespace_re = re.compile("\s+")
|
whitespace_re = re.compile(r"\s+")
|
||||||
|
|
||||||
def _alias(attr):
|
def _alias(attr):
|
||||||
"""Alias one attribute name to another for backward compatibility"""
|
"""Alias one attribute name to another for backward compatibility"""
|
||||||
|
|
@ -69,7 +72,7 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
|
||||||
The value of the 'content' attribute will be one of these objects.
|
The value of the 'content' attribute will be one of these objects.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
|
CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
|
||||||
|
|
||||||
def __new__(cls, original_value):
|
def __new__(cls, original_value):
|
||||||
match = cls.CHARSET_RE.search(original_value)
|
match = cls.CHARSET_RE.search(original_value)
|
||||||
|
|
@ -123,6 +126,41 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
|
||||||
return cls._substitute_if_appropriate(
|
return cls._substitute_if_appropriate(
|
||||||
ns, EntitySubstitution.substitute_xml)
|
ns, EntitySubstitution.substitute_xml)
|
||||||
|
|
||||||
|
class Formatter(object):
|
||||||
|
"""Contains information about how to format a parse tree."""
|
||||||
|
|
||||||
|
# By default, represent void elements as <tag/> rather than <tag>
|
||||||
|
void_element_close_prefix = '/'
|
||||||
|
|
||||||
|
def substitute_entities(self, *args, **kwargs):
|
||||||
|
"""Transform certain characters into named entities."""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class HTMLFormatter(Formatter):
|
||||||
|
"""The default HTML formatter."""
|
||||||
|
def substitute(self, *args, **kwargs):
|
||||||
|
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
|
||||||
|
|
||||||
|
class MinimalHTMLFormatter(Formatter):
|
||||||
|
"""A minimal HTML formatter."""
|
||||||
|
def substitute(self, *args, **kwargs):
|
||||||
|
return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
|
||||||
|
|
||||||
|
class HTML5Formatter(HTMLFormatter):
|
||||||
|
"""An HTML formatter that omits the slash in a void tag."""
|
||||||
|
void_element_close_prefix = None
|
||||||
|
|
||||||
|
class XMLFormatter(Formatter):
|
||||||
|
"""Substitute only the essential XML entities."""
|
||||||
|
def substitute(self, *args, **kwargs):
|
||||||
|
return EntitySubstitution.substitute_xml(*args, **kwargs)
|
||||||
|
|
||||||
|
class HTMLXMLFormatter(Formatter):
|
||||||
|
"""Format XML using HTML rules."""
|
||||||
|
def substitute(self, *args, **kwargs):
|
||||||
|
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class PageElement(object):
|
class PageElement(object):
|
||||||
"""Contains the navigational information for some part of the page
|
"""Contains the navigational information for some part of the page
|
||||||
(either a tag or a piece of text)"""
|
(either a tag or a piece of text)"""
|
||||||
|
|
@ -132,39 +170,48 @@ class PageElement(object):
|
||||||
#
|
#
|
||||||
# "html" - All Unicode characters with corresponding HTML entities
|
# "html" - All Unicode characters with corresponding HTML entities
|
||||||
# are converted to those entities on output.
|
# are converted to those entities on output.
|
||||||
|
# "html5" - The same as "html", but empty void tags are represented as
|
||||||
|
# <tag> rather than <tag/>
|
||||||
# "minimal" - Bare ampersands and angle brackets are converted to
|
# "minimal" - Bare ampersands and angle brackets are converted to
|
||||||
# XML entities: &amp; &lt; &gt;
|
# XML entities: &amp; &lt; &gt;
|
||||||
# None - The null formatter. Unicode characters are never
|
# None - The null formatter. Unicode characters are never
|
||||||
# converted to entities. This is not recommended, but it's
|
# converted to entities. This is not recommended, but it's
|
||||||
# faster than "minimal".
|
# faster than "minimal".
|
||||||
# A function - This function will be called on every string that
|
# A callable function - it will be called on every string that needs to undergo entity substitution.
|
||||||
|
# A Formatter instance - Formatter.substitute(string) will be called on every string that
|
||||||
# needs to undergo entity substitution.
|
# needs to undergo entity substitution.
|
||||||
#
|
#
|
||||||
|
|
||||||
# In an HTML document, the default "html" and "minimal" functions
|
# In an HTML document, the default "html", "html5", and "minimal"
|
||||||
# will leave the contents of <script> and <style> tags alone. For
|
# functions will leave the contents of <script> and <style> tags
|
||||||
# an XML document, all tags will be given the same treatment.
|
# alone. For an XML document, all tags will be given the same
|
||||||
|
# treatment.
|
||||||
|
|
||||||
HTML_FORMATTERS = {
|
HTML_FORMATTERS = {
|
||||||
"html" : HTMLAwareEntitySubstitution.substitute_html,
|
"html" : HTMLFormatter(),
|
||||||
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
|
"html5" : HTML5Formatter(),
|
||||||
|
"minimal" : MinimalHTMLFormatter(),
|
||||||
None : None
|
None : None
|
||||||
}
|
}
|
||||||
|
|
||||||
XML_FORMATTERS = {
|
XML_FORMATTERS = {
|
||||||
"html" : EntitySubstitution.substitute_html,
|
"html" : HTMLXMLFormatter(),
|
||||||
"minimal" : EntitySubstitution.substitute_xml,
|
"minimal" : XMLFormatter(),
|
||||||
None : None
|
None : None
|
||||||
}
|
}
|
||||||
|
|
||||||
def format_string(self, s, formatter='minimal'):
|
def format_string(self, s, formatter='minimal'):
|
||||||
"""Format the given string using the given formatter."""
|
"""Format the given string using the given formatter."""
|
||||||
if not callable(formatter):
|
if isinstance(formatter, basestring):
|
||||||
formatter = self._formatter_for_name(formatter)
|
formatter = self._formatter_for_name(formatter)
|
||||||
if formatter is None:
|
if formatter is None:
|
||||||
output = s
|
output = s
|
||||||
else:
|
else:
|
||||||
output = formatter(s)
|
if callable(formatter):
|
||||||
|
# Backwards compatibility -- you used to pass in a formatting method.
|
||||||
|
output = formatter(s)
|
||||||
|
else:
|
||||||
|
output = formatter.substitute(s)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -194,11 +241,9 @@ class PageElement(object):
|
||||||
def _formatter_for_name(self, name):
|
def _formatter_for_name(self, name):
|
||||||
"Look up a formatter function based on its name and the tree."
|
"Look up a formatter function based on its name and the tree."
|
||||||
if self._is_xml:
|
if self._is_xml:
|
||||||
return self.XML_FORMATTERS.get(
|
return self.XML_FORMATTERS.get(name, XMLFormatter())
|
||||||
name, EntitySubstitution.substitute_xml)
|
|
||||||
else:
|
else:
|
||||||
return self.HTML_FORMATTERS.get(
|
return self.HTML_FORMATTERS.get(name, HTMLFormatter())
|
||||||
name, HTMLAwareEntitySubstitution.substitute_xml)
|
|
||||||
|
|
||||||
def setup(self, parent=None, previous_element=None, next_element=None,
|
def setup(self, parent=None, previous_element=None, next_element=None,
|
||||||
previous_sibling=None, next_sibling=None):
|
previous_sibling=None, next_sibling=None):
|
||||||
|
|
@ -316,6 +361,14 @@ class PageElement(object):
|
||||||
and not isinstance(new_child, NavigableString)):
|
and not isinstance(new_child, NavigableString)):
|
||||||
new_child = NavigableString(new_child)
|
new_child = NavigableString(new_child)
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
if isinstance(new_child, BeautifulSoup):
|
||||||
|
# We don't want to end up with a situation where one BeautifulSoup
|
||||||
|
# object contains another. Insert the children one at a time.
|
||||||
|
for subchild in list(new_child.contents):
|
||||||
|
self.insert(position, subchild)
|
||||||
|
position += 1
|
||||||
|
return
|
||||||
position = min(position, len(self.contents))
|
position = min(position, len(self.contents))
|
||||||
if hasattr(new_child, 'parent') and new_child.parent is not None:
|
if hasattr(new_child, 'parent') and new_child.parent is not None:
|
||||||
# We're 'inserting' an element that's already one
|
# We're 'inserting' an element that's already one
|
||||||
|
|
@ -535,9 +588,23 @@ class PageElement(object):
|
||||||
return ResultSet(strainer, result)
|
return ResultSet(strainer, result)
|
||||||
elif isinstance(name, basestring):
|
elif isinstance(name, basestring):
|
||||||
# Optimization to find all tags with a given name.
|
# Optimization to find all tags with a given name.
|
||||||
|
if name.count(':') == 1:
|
||||||
|
# This is a name with a prefix. If this is a namespace-aware document,
|
||||||
|
# we need to match the local name against tag.name. If not,
|
||||||
|
# we need to match the fully-qualified name against tag.name.
|
||||||
|
prefix, local_name = name.split(':', 1)
|
||||||
|
else:
|
||||||
|
prefix = None
|
||||||
|
local_name = name
|
||||||
result = (element for element in generator
|
result = (element for element in generator
|
||||||
if isinstance(element, Tag)
|
if isinstance(element, Tag)
|
||||||
and element.name == name)
|
and (
|
||||||
|
element.name == name
|
||||||
|
) or (
|
||||||
|
element.name == local_name
|
||||||
|
and (prefix is None or element.prefix == prefix)
|
||||||
|
)
|
||||||
|
)
|
||||||
return ResultSet(strainer, result)
|
return ResultSet(strainer, result)
|
||||||
results = ResultSet(strainer)
|
results = ResultSet(strainer)
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -855,7 +922,7 @@ class Tag(PageElement):
|
||||||
self.can_be_empty_element = builder.can_be_empty_element(name)
|
self.can_be_empty_element = builder.can_be_empty_element(name)
|
||||||
else:
|
else:
|
||||||
self.can_be_empty_element = False
|
self.can_be_empty_element = False
|
||||||
|
|
||||||
parserClass = _alias("parser_class") # BS3
|
parserClass = _alias("parser_class") # BS3
|
||||||
|
|
||||||
def __copy__(self):
|
def __copy__(self):
|
||||||
|
|
@ -863,7 +930,7 @@ class Tag(PageElement):
|
||||||
Its contents are a copy of the old Tag's contents.
|
Its contents are a copy of the old Tag's contents.
|
||||||
"""
|
"""
|
||||||
clone = type(self)(None, self.builder, self.name, self.namespace,
|
clone = type(self)(None, self.builder, self.name, self.namespace,
|
||||||
self.nsprefix, self.attrs, is_xml=self._is_xml)
|
self.prefix, self.attrs, is_xml=self._is_xml)
|
||||||
for attr in ('can_be_empty_element', 'hidden'):
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
setattr(clone, attr, getattr(self, attr))
|
setattr(clone, attr, getattr(self, attr))
|
||||||
for child in self.contents:
|
for child in self.contents:
|
||||||
|
|
@ -985,6 +1052,13 @@ class Tag(PageElement):
|
||||||
attribute."""
|
attribute."""
|
||||||
return self.attrs.get(key, default)
|
return self.attrs.get(key, default)
|
||||||
|
|
||||||
|
def get_attribute_list(self, key, default=None):
|
||||||
|
"""The same as get(), but always returns a list."""
|
||||||
|
value = self.get(key, default)
|
||||||
|
if not isinstance(value, list):
|
||||||
|
value = [value]
|
||||||
|
return value
|
||||||
|
|
||||||
def has_attr(self, key):
|
def has_attr(self, key):
|
||||||
return key in self.attrs
|
return key in self.attrs
|
||||||
|
|
||||||
|
|
@ -1032,8 +1106,10 @@ class Tag(PageElement):
|
||||||
# BS3: soup.aTag -> "soup.find("a")
|
# BS3: soup.aTag -> "soup.find("a")
|
||||||
tag_name = tag[:-3]
|
tag_name = tag[:-3]
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'.%sTag is deprecated, use .find("%s") instead.' % (
|
'.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
|
||||||
tag_name, tag_name))
|
name=tag_name
|
||||||
|
)
|
||||||
|
)
|
||||||
return self.find(tag_name)
|
return self.find(tag_name)
|
||||||
# We special case contents to avoid recursion.
|
# We special case contents to avoid recursion.
|
||||||
elif not tag.startswith("__") and not tag == "contents":
|
elif not tag.startswith("__") and not tag == "contents":
|
||||||
|
|
@ -1115,11 +1191,10 @@ class Tag(PageElement):
|
||||||
encoding.
|
encoding.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# First off, turn a string formatter into a function. This
|
# First off, turn a string formatter into a Formatter object. This
|
||||||
# will stop the lookup from happening over and over again.
|
# will stop the lookup from happening over and over again.
|
||||||
if not callable(formatter):
|
if not isinstance(formatter, Formatter) and not callable(formatter):
|
||||||
formatter = self._formatter_for_name(formatter)
|
formatter = self._formatter_for_name(formatter)
|
||||||
|
|
||||||
attrs = []
|
attrs = []
|
||||||
if self.attrs:
|
if self.attrs:
|
||||||
for key, val in sorted(self.attrs.items()):
|
for key, val in sorted(self.attrs.items()):
|
||||||
|
|
@ -1148,7 +1223,7 @@ class Tag(PageElement):
|
||||||
prefix = self.prefix + ":"
|
prefix = self.prefix + ":"
|
||||||
|
|
||||||
if self.is_empty_element:
|
if self.is_empty_element:
|
||||||
close = '/'
|
close = formatter.void_element_close_prefix or ''
|
||||||
else:
|
else:
|
||||||
closeTag = '</%s%s>' % (prefix, self.name)
|
closeTag = '</%s%s>' % (prefix, self.name)
|
||||||
|
|
||||||
|
|
@ -1219,9 +1294,9 @@ class Tag(PageElement):
|
||||||
:param formatter: The output formatter responsible for converting
|
:param formatter: The output formatter responsible for converting
|
||||||
entities to Unicode characters.
|
entities to Unicode characters.
|
||||||
"""
|
"""
|
||||||
# First off, turn a string formatter into a function. This
|
# First off, turn a string formatter into a Formatter object. This
|
||||||
# will stop the lookup from happening over and over again.
|
# will stop the lookup from happening over and over again.
|
||||||
if not callable(formatter):
|
if not isinstance(formatter, Formatter) and not callable(formatter):
|
||||||
formatter = self._formatter_for_name(formatter)
|
formatter = self._formatter_for_name(formatter)
|
||||||
|
|
||||||
pretty_print = (indent_level is not None)
|
pretty_print = (indent_level is not None)
|
||||||
|
|
@ -1334,15 +1409,29 @@ class Tag(PageElement):
|
||||||
# Handle grouping selectors if ',' exists, ie: p,a
|
# Handle grouping selectors if ',' exists, ie: p,a
|
||||||
if ',' in selector:
|
if ',' in selector:
|
||||||
context = []
|
context = []
|
||||||
for partial_selector in selector.split(','):
|
selectors = [x.strip() for x in selector.split(",")]
|
||||||
partial_selector = partial_selector.strip()
|
|
||||||
|
# If a selector is mentioned multiple times we don't want
|
||||||
|
# to use it more than once.
|
||||||
|
used_selectors = set()
|
||||||
|
|
||||||
|
# We also don't want to select the same element more than once,
|
||||||
|
# if it's matched by multiple selectors.
|
||||||
|
selected_object_ids = set()
|
||||||
|
for partial_selector in selectors:
|
||||||
if partial_selector == '':
|
if partial_selector == '':
|
||||||
raise ValueError('Invalid group selection syntax: %s' % selector)
|
raise ValueError('Invalid group selection syntax: %s' % selector)
|
||||||
|
if partial_selector in used_selectors:
|
||||||
|
continue
|
||||||
|
used_selectors.add(partial_selector)
|
||||||
candidates = self.select(partial_selector, limit=limit)
|
candidates = self.select(partial_selector, limit=limit)
|
||||||
for candidate in candidates:
|
for candidate in candidates:
|
||||||
if candidate not in context:
|
# This lets us distinguish between distinct tags that
|
||||||
|
# represent the same markup.
|
||||||
|
object_id = id(candidate)
|
||||||
|
if object_id not in selected_object_ids:
|
||||||
context.append(candidate)
|
context.append(candidate)
|
||||||
|
selected_object_ids.add(object_id)
|
||||||
if limit and len(context) >= limit:
|
if limit and len(context) >= limit:
|
||||||
break
|
break
|
||||||
return context
|
return context
|
||||||
|
|
@ -1404,7 +1493,7 @@ class Tag(PageElement):
|
||||||
if tag_name == '':
|
if tag_name == '':
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"A pseudo-class must be prefixed with a tag name.")
|
"A pseudo-class must be prefixed with a tag name.")
|
||||||
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||||
found = []
|
found = []
|
||||||
if pseudo_attributes is None:
|
if pseudo_attributes is None:
|
||||||
pseudo_type = pseudo
|
pseudo_type = pseudo
|
||||||
|
|
@ -1638,7 +1727,7 @@ class SoupStrainer(object):
|
||||||
markup = markup_name
|
markup = markup_name
|
||||||
markup_attrs = markup
|
markup_attrs = markup
|
||||||
call_function_with_tag_data = (
|
call_function_with_tag_data = (
|
||||||
isinstance(self.name, collections.Callable)
|
isinstance(self.name, Callable)
|
||||||
and not isinstance(markup_name, Tag))
|
and not isinstance(markup_name, Tag))
|
||||||
|
|
||||||
if ((not self.name)
|
if ((not self.name)
|
||||||
|
|
@ -1698,7 +1787,7 @@ class SoupStrainer(object):
|
||||||
"I don't know how to match against a %s" % markup.__class__)
|
"I don't know how to match against a %s" % markup.__class__)
|
||||||
return found
|
return found
|
||||||
|
|
||||||
def _matches(self, markup, match_against):
|
def _matches(self, markup, match_against, already_tried=None):
|
||||||
# print u"Matching %s against %s" % (markup, match_against)
|
# print u"Matching %s against %s" % (markup, match_against)
|
||||||
result = False
|
result = False
|
||||||
if isinstance(markup, list) or isinstance(markup, tuple):
|
if isinstance(markup, list) or isinstance(markup, tuple):
|
||||||
|
|
@ -1713,16 +1802,17 @@ class SoupStrainer(object):
|
||||||
if self._matches(' '.join(markup), match_against):
|
if self._matches(' '.join(markup), match_against):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if match_against is True:
|
if match_against is True:
|
||||||
# True matches any non-None value.
|
# True matches any non-None value.
|
||||||
return markup is not None
|
return markup is not None
|
||||||
|
|
||||||
if isinstance(match_against, collections.Callable):
|
if isinstance(match_against, Callable):
|
||||||
return match_against(markup)
|
return match_against(markup)
|
||||||
|
|
||||||
# Custom callables take the tag as an argument, but all
|
# Custom callables take the tag as an argument, but all
|
||||||
# other ways of matching match the tag name as a string.
|
# other ways of matching match the tag name as a string.
|
||||||
|
original_markup = markup
|
||||||
if isinstance(markup, Tag):
|
if isinstance(markup, Tag):
|
||||||
markup = markup.name
|
markup = markup.name
|
||||||
|
|
||||||
|
|
@ -1733,18 +1823,51 @@ class SoupStrainer(object):
|
||||||
# None matches None, False, an empty string, an empty list, and so on.
|
# None matches None, False, an empty string, an empty list, and so on.
|
||||||
return not match_against
|
return not match_against
|
||||||
|
|
||||||
if isinstance(match_against, unicode):
|
if (hasattr(match_against, '__iter__')
|
||||||
|
and not isinstance(match_against, basestring)):
|
||||||
|
# We're asked to match against an iterable of items.
|
||||||
|
# The markup must be match at least one item in the
|
||||||
|
# iterable. We'll try each one in turn.
|
||||||
|
#
|
||||||
|
# To avoid infinite recursion we need to keep track of
|
||||||
|
# items we've already seen.
|
||||||
|
if not already_tried:
|
||||||
|
already_tried = set()
|
||||||
|
for item in match_against:
|
||||||
|
if item.__hash__:
|
||||||
|
key = item
|
||||||
|
else:
|
||||||
|
key = id(item)
|
||||||
|
if key in already_tried:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
already_tried.add(key)
|
||||||
|
if self._matches(original_markup, item, already_tried):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Beyond this point we might need to run the test twice: once against
|
||||||
|
# the tag's name and once against its prefixed name.
|
||||||
|
match = False
|
||||||
|
|
||||||
|
if not match and isinstance(match_against, unicode):
|
||||||
# Exact string match
|
# Exact string match
|
||||||
return markup == match_against
|
match = markup == match_against
|
||||||
|
|
||||||
if hasattr(match_against, 'match'):
|
if not match and hasattr(match_against, 'search'):
|
||||||
# Regexp match
|
# Regexp match
|
||||||
return match_against.search(markup)
|
return match_against.search(markup)
|
||||||
|
|
||||||
if hasattr(match_against, '__iter__'):
|
if (not match
|
||||||
# The markup must be an exact match against something
|
and isinstance(original_markup, Tag)
|
||||||
# in the iterable.
|
and original_markup.prefix):
|
||||||
return markup in match_against
|
# Try the whole thing again with the prefixed tag name.
|
||||||
|
return self._matches(
|
||||||
|
original_markup.prefix + ':' + original_markup.name, match_against
|
||||||
|
)
|
||||||
|
|
||||||
|
return match
|
||||||
|
|
||||||
|
|
||||||
class ResultSet(list):
|
class ResultSet(list):
|
||||||
|
|
@ -1753,3 +1876,8 @@ class ResultSet(list):
|
||||||
def __init__(self, source, result=()):
|
def __init__(self, source, result=()):
|
||||||
super(ResultSet, self).__init__(result)
|
super(ResultSet, self).__init__(result)
|
||||||
self.source = source
|
self.source = source
|
||||||
|
|
||||||
|
def __getattr__(self, key):
|
||||||
|
raise AttributeError(
|
||||||
|
"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue