mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Update included bs4 to 4.7.1
This commit is contained in:
parent
c38bfcf1da
commit
f234cd2e78
8 changed files with 214 additions and 423 deletions
|
|
@ -17,12 +17,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|||
|
||||
"""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.6.1"
|
||||
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
|
||||
__version__ = "4.7.1"
|
||||
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
|
@ -237,10 +235,11 @@ class BeautifulSoup(Tag):
|
|||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.known_xml = self.is_xml
|
||||
self.builder.soup = self
|
||||
|
||||
self._namespaces = dict()
|
||||
self.parse_only = parse_only
|
||||
|
||||
self.builder.initialize_soup(self)
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
elif len(markup) <= 256 and (
|
||||
|
|
@ -382,7 +381,7 @@ class BeautifulSoup(Tag):
|
|||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag:
|
||||
if self.currentTag is not None:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
|
|
@ -421,60 +420,71 @@ class BeautifulSoup(Tag):
|
|||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
parent = parent or self.currentTag
|
||||
previous_element = most_recent_element or self._most_recent_element
|
||||
if parent is None:
|
||||
parent = self.currentTag
|
||||
if most_recent_element is not None:
|
||||
previous_element = most_recent_element
|
||||
else:
|
||||
previous_element = self._most_recent_element
|
||||
|
||||
next_element = previous_sibling = next_sibling = None
|
||||
if isinstance(o, Tag):
|
||||
next_element = o.next_element
|
||||
next_sibling = o.next_sibling
|
||||
previous_sibling = o.previous_sibling
|
||||
if not previous_element:
|
||||
if previous_element is None:
|
||||
previous_element = o.previous_element
|
||||
|
||||
fix = parent.next_element is not None
|
||||
|
||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
if parent.next_sibling:
|
||||
# This node is being inserted into an element that has
|
||||
# already been parsed. Deal with any dangling references.
|
||||
index = len(parent.contents)-1
|
||||
while index >= 0:
|
||||
if parent.contents[index] is o:
|
||||
break
|
||||
index -= 1
|
||||
else:
|
||||
raise ValueError(
|
||||
"Error building tree: supposedly %r was inserted "
|
||||
"into %r after the fact, but I don't see it!" % (
|
||||
o, parent
|
||||
)
|
||||
)
|
||||
if index == 0:
|
||||
previous_element = parent
|
||||
previous_sibling = None
|
||||
else:
|
||||
previous_element = previous_sibling = parent.contents[index-1]
|
||||
if index == len(parent.contents)-1:
|
||||
next_element = parent.next_sibling
|
||||
next_sibling = None
|
||||
else:
|
||||
next_element = next_sibling = parent.contents[index+1]
|
||||
# Check if we are inserting into an already parsed node.
|
||||
if fix:
|
||||
self._linkage_fixer(parent)
|
||||
|
||||
o.previous_element = previous_element
|
||||
if previous_element:
|
||||
previous_element.next_element = o
|
||||
o.next_element = next_element
|
||||
if next_element:
|
||||
next_element.previous_element = o
|
||||
o.next_sibling = next_sibling
|
||||
if next_sibling:
|
||||
next_sibling.previous_sibling = o
|
||||
o.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
previous_sibling.next_sibling = o
|
||||
def _linkage_fixer(self, el):
|
||||
"""Make sure linkage of this fragment is sound."""
|
||||
|
||||
first = el.contents[0]
|
||||
child = el.contents[-1]
|
||||
descendant = child
|
||||
|
||||
if child is first and el.parent is not None:
|
||||
# Parent should be linked to first child
|
||||
el.next_element = child
|
||||
# We are no longer linked to whatever this element is
|
||||
prev_el = child.previous_element
|
||||
if prev_el is not None and prev_el is not el:
|
||||
prev_el.next_element = None
|
||||
# First child should be linked to the parent, and no previous siblings.
|
||||
child.previous_element = el
|
||||
child.previous_sibling = None
|
||||
|
||||
# We have no sibling as we've been appended as the last.
|
||||
child.next_sibling = None
|
||||
|
||||
# This index is a tag, dig deeper for a "last descendant"
|
||||
if isinstance(child, Tag) and child.contents:
|
||||
descendant = child._last_descendant(False)
|
||||
|
||||
# As the final step, link last descendant. It should be linked
|
||||
# to the parent's next sibling (if found), else walk up the chain
|
||||
# and find a parent with a sibling. It should have no next sibling.
|
||||
descendant.next_element = None
|
||||
descendant.next_sibling = None
|
||||
target = el
|
||||
while True:
|
||||
if target is None:
|
||||
break
|
||||
elif target.next_sibling is not None:
|
||||
descendant.next_element = target.next_sibling
|
||||
target.next_sibling.previous_element = child
|
||||
break
|
||||
target = target.parent
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
|
|
@ -520,7 +530,7 @@ class BeautifulSoup(Tag):
|
|||
self.currentTag, self._most_recent_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self._most_recent_element:
|
||||
if self._most_recent_element is not None:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
|
|
@ -8,7 +8,7 @@ from bs4.element import (
|
|||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
HTMLAwareEntitySubstitution,
|
||||
whitespace_re
|
||||
nonwhitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -102,6 +102,12 @@ class TreeBuilder(object):
|
|||
def __init__(self):
|
||||
self.soup = None
|
||||
|
||||
def initialize_soup(self, soup):
|
||||
"""The BeautifulSoup object has been initialized and is now
|
||||
being associated with the TreeBuilder.
|
||||
"""
|
||||
self.soup = soup
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
|
|
@ -167,7 +173,7 @@ class TreeBuilder(object):
|
|||
# values. Split it into a list.
|
||||
value = attrs[attr]
|
||||
if isinstance(value, basestring):
|
||||
values = whitespace_re.split(value)
|
||||
values = nonwhitespace_re.findall(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
# for the same tag when rearranging the parse
|
||||
|
|
@ -239,6 +245,12 @@ class HTMLTreeBuilder(TreeBuilder):
|
|||
# These are from earlier versions of HTML and are removed in HTML5.
|
||||
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||
])
|
||||
|
||||
# The HTML standard defines these as block-level elements. Beautiful
|
||||
# Soup does not treat these elements differently from other elements,
|
||||
# but it may do so eventually, and this information is available if
|
||||
# you need to use it.
|
||||
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
|
|
@ -15,7 +15,7 @@ from bs4.builder import (
|
|||
)
|
||||
from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
whitespace_re,
|
||||
nonwhitespace_re,
|
||||
)
|
||||
import html5lib
|
||||
from html5lib.constants import (
|
||||
|
|
@ -206,7 +206,7 @@ class AttrList(object):
|
|||
# A node that is being cloned may have already undergone
|
||||
# this procedure.
|
||||
if not isinstance(value, list):
|
||||
value = whitespace_re.split(value)
|
||||
value = nonwhitespace_re.findall(value)
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return list(self.attrs.items())
|
||||
|
|
@ -249,7 +249,7 @@ class Element(treebuilder_base.Node):
|
|||
if not isinstance(child, basestring) and child.parent is not None:
|
||||
node.element.extract()
|
||||
|
||||
if (string_child and self.element.contents
|
||||
if (string_child is not None and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# We are appending a string onto another string.
|
||||
# TODO This has O(n^2) performance, for input like
|
||||
|
|
@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
|
|||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
if new_parents_last_descendant:
|
||||
if new_parents_last_descendant is not None:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant:
|
||||
if new_parents_last_descendant is not None:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child:
|
||||
if new_parents_last_child is not None:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Find the very last element being moved. It is now the
|
||||
|
|
@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
|
|||
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||
|
||||
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||
if new_parents_last_descendant_next_element:
|
||||
if new_parents_last_descendant_next_element is not None:
|
||||
# TODO: This code has no test coverage and I'm not sure
|
||||
# how to get html5lib to go through this path, but it's
|
||||
# just the other side of the previous line.
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
# encoding: utf-8
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
|
|
@ -32,6 +33,10 @@ from bs4.dammit import EncodingDetector
|
|||
|
||||
LXML = 'lxml'
|
||||
|
||||
def _invert(d):
|
||||
"Invert a dictionary."
|
||||
return dict((v,k) for k, v in d.items())
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
|
|
@ -48,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
|
||||
|
||||
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
|
||||
|
||||
def initialize_soup(self, soup):
|
||||
"""Let the BeautifulSoup object know about the standard namespace
|
||||
mapping.
|
||||
"""
|
||||
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||
|
||||
def _register_namespaces(self, mapping):
|
||||
"""Let the BeautifulSoup object know about namespaces encountered
|
||||
while parsing the document.
|
||||
|
||||
This might be useful later on when creating CSS selectors.
|
||||
"""
|
||||
for key, value in mapping.items():
|
||||
if key and key not in self.soup._namespaces:
|
||||
# Let the BeautifulSoup object know about a new namespace.
|
||||
# If there are multiple namespaces defined with the same
|
||||
# prefix, the first one in the document takes precedence.
|
||||
self.soup._namespaces[key] = value
|
||||
|
||||
def default_parser(self, encoding):
|
||||
# This can either return a parser object or a class, which
|
||||
|
|
@ -75,8 +102,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
|
|
@ -144,7 +171,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
|
|
@ -158,8 +185,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
|||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
|
||||
# First, Let the BeautifulSoup object know about it.
|
||||
self._register_namespaces(nsmap)
|
||||
|
||||
# Then, add it to our running list of inverted namespace
|
||||
# mappings.
|
||||
self.nsmaps.append(_invert(nsmap))
|
||||
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
|
|
|
|||
|
|
@ -6,8 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
|
|||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import codecs
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
try:
|
||||
|
|
@ -7,14 +6,25 @@ try:
|
|||
except ImportError , e:
|
||||
from collections import Callable
|
||||
import re
|
||||
import shlex
|
||||
import sys
|
||||
import warnings
|
||||
try:
|
||||
import soupsieve
|
||||
except ImportError, e:
|
||||
soupsieve = None
|
||||
warnings.warn(
|
||||
'The soupsieve package is not installed. CSS selectors cannot be used.'
|
||||
)
|
||||
|
||||
from bs4.dammit import EntitySubstitution
|
||||
|
||||
DEFAULT_OUTPUT_ENCODING = "utf-8"
|
||||
PY3K = (sys.version_info[0] > 2)
|
||||
|
||||
nonwhitespace_re = re.compile(r"\S+")
|
||||
|
||||
# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
|
||||
# the off chance someone imported it for their own use.
|
||||
whitespace_re = re.compile(r"\s+")
|
||||
|
||||
def _alias(attr):
|
||||
|
|
@ -207,7 +217,7 @@ class PageElement(object):
|
|||
if formatter is None:
|
||||
output = s
|
||||
else:
|
||||
if callable(formatter):
|
||||
if isinstance(formatter, Callable):
|
||||
# Backwards compatibility -- you used to pass in a formatting method.
|
||||
output = formatter(s)
|
||||
else:
|
||||
|
|
@ -256,26 +266,26 @@ class PageElement(object):
|
|||
self.previous_element.next_element = self
|
||||
|
||||
self.next_element = next_element
|
||||
if self.next_element:
|
||||
if self.next_element is not None:
|
||||
self.next_element.previous_element = self
|
||||
|
||||
self.next_sibling = next_sibling
|
||||
if self.next_sibling:
|
||||
if self.next_sibling is not None:
|
||||
self.next_sibling.previous_sibling = self
|
||||
|
||||
if (not previous_sibling
|
||||
if (previous_sibling is None
|
||||
and self.parent is not None and self.parent.contents):
|
||||
previous_sibling = self.parent.contents[-1]
|
||||
|
||||
self.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
if previous_sibling is not None:
|
||||
self.previous_sibling.next_sibling = self
|
||||
|
||||
nextSibling = _alias("next_sibling") # BS3
|
||||
previousSibling = _alias("previous_sibling") # BS3
|
||||
|
||||
def replace_with(self, replace_with):
|
||||
if not self.parent:
|
||||
if self.parent is None:
|
||||
raise ValueError(
|
||||
"Cannot replace one element with another when the"
|
||||
"element to be replaced is not part of a tree.")
|
||||
|
|
@ -292,7 +302,7 @@ class PageElement(object):
|
|||
|
||||
def unwrap(self):
|
||||
my_parent = self.parent
|
||||
if not self.parent:
|
||||
if self.parent is None:
|
||||
raise ValueError(
|
||||
"Cannot replace an element with its contents when that"
|
||||
"element is not part of a tree.")
|
||||
|
|
@ -340,7 +350,7 @@ class PageElement(object):
|
|||
|
||||
def _last_descendant(self, is_initialized=True, accept_self=True):
|
||||
"Finds the last element beneath this object to be parsed."
|
||||
if is_initialized and self.next_sibling:
|
||||
if is_initialized and self.next_sibling is not None:
|
||||
last_child = self.next_sibling.previous_element
|
||||
else:
|
||||
last_child = self
|
||||
|
|
@ -430,43 +440,54 @@ class PageElement(object):
|
|||
"""Appends the given tag to the contents of this tag."""
|
||||
self.insert(len(self.contents), tag)
|
||||
|
||||
def insert_before(self, predecessor):
|
||||
"""Makes the given element the immediate predecessor of this one.
|
||||
def extend(self, tags):
|
||||
"""Appends the given tags to the contents of this tag."""
|
||||
for tag in tags:
|
||||
self.append(tag)
|
||||
|
||||
The two elements will have the same parent, and the given element
|
||||
def insert_before(self, *args):
|
||||
"""Makes the given element(s) the immediate predecessor of this one.
|
||||
|
||||
The elements will have the same parent, and the given elements
|
||||
will be immediately before this one.
|
||||
"""
|
||||
if self is predecessor:
|
||||
raise ValueError("Can't insert an element before itself.")
|
||||
parent = self.parent
|
||||
if parent is None:
|
||||
raise ValueError(
|
||||
"Element has no parent, so 'before' has no meaning.")
|
||||
# Extract first so that the index won't be screwed up if they
|
||||
# are siblings.
|
||||
if isinstance(predecessor, PageElement):
|
||||
predecessor.extract()
|
||||
index = parent.index(self)
|
||||
parent.insert(index, predecessor)
|
||||
if any(x is self for x in args):
|
||||
raise ValueError("Can't insert an element before itself.")
|
||||
for predecessor in args:
|
||||
# Extract first so that the index won't be screwed up if they
|
||||
# are siblings.
|
||||
if isinstance(predecessor, PageElement):
|
||||
predecessor.extract()
|
||||
index = parent.index(self)
|
||||
parent.insert(index, predecessor)
|
||||
|
||||
def insert_after(self, successor):
|
||||
"""Makes the given element the immediate successor of this one.
|
||||
def insert_after(self, *args):
|
||||
"""Makes the given element(s) the immediate successor of this one.
|
||||
|
||||
The two elements will have the same parent, and the given element
|
||||
The elements will have the same parent, and the given elements
|
||||
will be immediately after this one.
|
||||
"""
|
||||
if self is successor:
|
||||
raise ValueError("Can't insert an element after itself.")
|
||||
# Do all error checking before modifying the tree.
|
||||
parent = self.parent
|
||||
if parent is None:
|
||||
raise ValueError(
|
||||
"Element has no parent, so 'after' has no meaning.")
|
||||
# Extract first so that the index won't be screwed up if they
|
||||
# are siblings.
|
||||
if isinstance(successor, PageElement):
|
||||
successor.extract()
|
||||
index = parent.index(self)
|
||||
parent.insert(index+1, successor)
|
||||
if any(x is self for x in args):
|
||||
raise ValueError("Can't insert an element after itself.")
|
||||
|
||||
offset = 0
|
||||
for successor in args:
|
||||
# Extract first so that the index won't be screwed up if they
|
||||
# are siblings.
|
||||
if isinstance(successor, PageElement):
|
||||
successor.extract()
|
||||
index = parent.index(self)
|
||||
parent.insert(index+1+offset, successor)
|
||||
offset += 1
|
||||
|
||||
def find_next(self, name=None, attrs={}, text=None, **kwargs):
|
||||
"""Returns the first item that matches the given criteria and
|
||||
|
|
@ -657,82 +678,6 @@ class PageElement(object):
|
|||
yield i
|
||||
i = i.parent
|
||||
|
||||
# Methods for supporting CSS selectors.
|
||||
|
||||
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
|
||||
|
||||
# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
|
||||
# \---------------------------/ \---/\-------------/ \-------/
|
||||
# | | | |
|
||||
# | | | The value
|
||||
# | | ~,|,^,$,* or =
|
||||
# | Attribute
|
||||
# Tag
|
||||
attribselect_re = re.compile(
|
||||
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
|
||||
r'=?"?(?P<value>[^\]"]*)"?\]$'
|
||||
)
|
||||
|
||||
def _attr_value_as_string(self, value, default=None):
|
||||
"""Force an attribute value into a string representation.
|
||||
|
||||
A multi-valued attribute will be converted into a
|
||||
space-separated stirng.
|
||||
"""
|
||||
value = self.get(value, default)
|
||||
if isinstance(value, list) or isinstance(value, tuple):
|
||||
value =" ".join(value)
|
||||
return value
|
||||
|
||||
def _tag_name_matches_and(self, function, tag_name):
|
||||
if not tag_name:
|
||||
return function
|
||||
else:
|
||||
def _match(tag):
|
||||
return tag.name == tag_name and function(tag)
|
||||
return _match
|
||||
|
||||
def _attribute_checker(self, operator, attribute, value=''):
|
||||
"""Create a function that performs a CSS selector operation.
|
||||
|
||||
Takes an operator, attribute and optional value. Returns a
|
||||
function that will return True for elements that match that
|
||||
combination.
|
||||
"""
|
||||
if operator == '=':
|
||||
# string representation of `attribute` is equal to `value`
|
||||
return lambda el: el._attr_value_as_string(attribute) == value
|
||||
elif operator == '~':
|
||||
# space-separated list representation of `attribute`
|
||||
# contains `value`
|
||||
def _includes_value(element):
|
||||
attribute_value = element.get(attribute, [])
|
||||
if not isinstance(attribute_value, list):
|
||||
attribute_value = attribute_value.split()
|
||||
return value in attribute_value
|
||||
return _includes_value
|
||||
elif operator == '^':
|
||||
# string representation of `attribute` starts with `value`
|
||||
return lambda el: el._attr_value_as_string(
|
||||
attribute, '').startswith(value)
|
||||
elif operator == '$':
|
||||
# string representation of `attribute` ends with `value`
|
||||
return lambda el: el._attr_value_as_string(
|
||||
attribute, '').endswith(value)
|
||||
elif operator == '*':
|
||||
# string representation of `attribute` contains `value`
|
||||
return lambda el: value in el._attr_value_as_string(attribute, '')
|
||||
elif operator == '|':
|
||||
# string representation of `attribute` is either exactly
|
||||
# `value` or starts with `value` and then a dash.
|
||||
def _is_or_starts_with_dash(element):
|
||||
attribute_value = element._attr_value_as_string(attribute, '')
|
||||
return (attribute_value == value or attribute_value.startswith(
|
||||
value + '-'))
|
||||
return _is_or_starts_with_dash
|
||||
else:
|
||||
return lambda el: el.has_attr(attribute)
|
||||
|
||||
# Old non-property versions of the generators, for backwards
|
||||
# compatibility with BS3.
|
||||
def nextGenerator(self):
|
||||
|
|
@ -1193,7 +1138,7 @@ class Tag(PageElement):
|
|||
|
||||
# First off, turn a string formatter into a Formatter object. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not isinstance(formatter, Formatter) and not callable(formatter):
|
||||
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
attrs = []
|
||||
if self.attrs:
|
||||
|
|
@ -1223,7 +1168,9 @@ class Tag(PageElement):
|
|||
prefix = self.prefix + ":"
|
||||
|
||||
if self.is_empty_element:
|
||||
close = formatter.void_element_close_prefix or ''
|
||||
close = ''
|
||||
if isinstance(formatter, Formatter):
|
||||
close = formatter.void_element_close_prefix or close
|
||||
else:
|
||||
closeTag = '</%s%s>' % (prefix, self.name)
|
||||
|
||||
|
|
@ -1296,7 +1243,7 @@ class Tag(PageElement):
|
|||
"""
|
||||
# First off, turn a string formatter into a Formatter object. This
|
||||
# will stop the lookup from happening over and over again.
|
||||
if not isinstance(formatter, Formatter) and not callable(formatter):
|
||||
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
|
||||
formatter = self._formatter_for_name(formatter)
|
||||
|
||||
pretty_print = (indent_level is not None)
|
||||
|
|
@ -1392,250 +1339,41 @@ class Tag(PageElement):
|
|||
current = current.next_element
|
||||
|
||||
# CSS selector code
|
||||
|
||||
_selector_combinators = ['>', '+', '~']
|
||||
_select_debug = False
|
||||
quoted_colon = re.compile('"[^"]*:[^"]*"')
|
||||
def select_one(self, selector):
|
||||
def select_one(self, selector, namespaces=None, **kwargs):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
value = self.select(selector, limit=1)
|
||||
value = self.select(selector, namespaces, 1, **kwargs)
|
||||
if value:
|
||||
return value[0]
|
||||
return None
|
||||
|
||||
def select(self, selector, _candidate_generator=None, limit=None):
|
||||
"""Perform a CSS selection operation on the current element."""
|
||||
def select(self, selector, namespaces=None, limit=None, **kwargs):
|
||||
"""Perform a CSS selection operation on the current element.
|
||||
|
||||
# Handle grouping selectors if ',' exists, ie: p,a
|
||||
if ',' in selector:
|
||||
context = []
|
||||
selectors = [x.strip() for x in selector.split(",")]
|
||||
This uses the SoupSieve library.
|
||||
|
||||
# If a selector is mentioned multiple times we don't want
|
||||
# to use it more than once.
|
||||
used_selectors = set()
|
||||
:param selector: A string containing a CSS selector.
|
||||
|
||||
# We also don't want to select the same element more than once,
|
||||
# if it's matched by multiple selectors.
|
||||
selected_object_ids = set()
|
||||
for partial_selector in selectors:
|
||||
if partial_selector == '':
|
||||
raise ValueError('Invalid group selection syntax: %s' % selector)
|
||||
if partial_selector in used_selectors:
|
||||
continue
|
||||
used_selectors.add(partial_selector)
|
||||
candidates = self.select(partial_selector, limit=limit)
|
||||
for candidate in candidates:
|
||||
# This lets us distinguish between distinct tags that
|
||||
# represent the same markup.
|
||||
object_id = id(candidate)
|
||||
if object_id not in selected_object_ids:
|
||||
context.append(candidate)
|
||||
selected_object_ids.add(object_id)
|
||||
if limit and len(context) >= limit:
|
||||
break
|
||||
return context
|
||||
tokens = shlex.split(selector)
|
||||
current_context = [self]
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will use the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
if tokens[-1] in self._selector_combinators:
|
||||
raise ValueError(
|
||||
'Final combinator "%s" is missing an argument.' % tokens[-1])
|
||||
:param limit: After finding this number of results, stop looking.
|
||||
|
||||
if self._select_debug:
|
||||
print 'Running CSS selector "%s"' % selector
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
new_context = []
|
||||
new_context_ids = set([])
|
||||
|
||||
if tokens[index-1] in self._selector_combinators:
|
||||
# This token was consumed by the previous combinator. Skip it.
|
||||
if self._select_debug:
|
||||
print ' Token was consumed by the previous combinator.'
|
||||
continue
|
||||
|
||||
if self._select_debug:
|
||||
print ' Considering token "%s"' % token
|
||||
recursive_candidate_generator = None
|
||||
tag_name = None
|
||||
|
||||
# Each operation corresponds to a checker function, a rule
|
||||
# for determining whether a candidate matches the
|
||||
# selector. Candidates are generated by the active
|
||||
# iterator.
|
||||
checker = None
|
||||
|
||||
m = self.attribselect_re.match(token)
|
||||
if m is not None:
|
||||
# Attribute selector
|
||||
tag_name, attribute, operator, value = m.groups()
|
||||
checker = self._attribute_checker(operator, attribute, value)
|
||||
|
||||
elif '#' in token:
|
||||
# ID selector
|
||||
tag_name, tag_id = token.split('#', 1)
|
||||
def id_matches(tag):
|
||||
return tag.get('id', None) == tag_id
|
||||
checker = id_matches
|
||||
|
||||
elif '.' in token:
|
||||
# Class selector
|
||||
tag_name, klass = token.split('.', 1)
|
||||
classes = set(klass.split('.'))
|
||||
def classes_match(candidate):
|
||||
return classes.issubset(candidate.get('class', []))
|
||||
checker = classes_match
|
||||
|
||||
elif ':' in token and not self.quoted_colon.search(token):
|
||||
# Pseudo-class
|
||||
tag_name, pseudo = token.split(':', 1)
|
||||
if tag_name == '':
|
||||
raise ValueError(
|
||||
"A pseudo-class must be prefixed with a tag name.")
|
||||
pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
|
||||
found = []
|
||||
if pseudo_attributes is None:
|
||||
pseudo_type = pseudo
|
||||
pseudo_value = None
|
||||
else:
|
||||
pseudo_type, pseudo_value = pseudo_attributes.groups()
|
||||
if pseudo_type == 'nth-of-type':
|
||||
try:
|
||||
pseudo_value = int(pseudo_value)
|
||||
except:
|
||||
raise NotImplementedError(
|
||||
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
|
||||
if pseudo_value < 1:
|
||||
raise ValueError(
|
||||
'nth-of-type pseudo-class value must be at least 1.')
|
||||
class Counter(object):
    """Stateful checker that accepts only the Nth candidate it is shown.

    Backs the ``nth-of-type`` pseudo-class: each call counts one more
    candidate, and only the call whose running count equals
    ``destination`` reports a match.
    """

    def __init__(self, destination):
        # Number of candidates seen so far.
        self.count = 0
        # 1-based position of the candidate that should match.
        self.destination = destination

    def nth_child_of_type(self, tag):
        """Count one more candidate and report whether it is the Nth.

        ``tag`` is not inspected; only the number of calls matters.
        Returns True exactly once, on call number ``destination``.
        """
        self.count += 1
        return self.count == self.destination
|
||||
checker = Counter(pseudo_value).nth_child_of_type
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
'Only the following pseudo-classes are implemented: nth-of-type.')
|
||||
|
||||
elif token == '*':
|
||||
# Star selector -- matches everything
|
||||
pass
|
||||
elif token == '>':
|
||||
# Run the next token as a CSS selector against the
|
||||
# direct children of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.children
|
||||
elif token == '~':
|
||||
# Run the next token as a CSS selector against the
|
||||
# siblings of each tag in the current context.
|
||||
recursive_candidate_generator = lambda tag: tag.next_siblings
|
||||
elif token == '+':
|
||||
# For each tag in the current context, run the next
|
||||
# token as a CSS selector against the tag's next
|
||||
# sibling that's a tag.
|
||||
def next_tag_sibling(tag):
    """Yield the next sibling of ``tag`` that is itself a tag, if any.

    Yields nothing when ``find_next_sibling(True)`` finds no such
    sibling, so the consumer is never handed ``None`` as a candidate.
    """
    sibling = tag.find_next_sibling(True)
    if sibling is not None:
        yield sibling
|
||||
recursive_candidate_generator = next_tag_sibling
|
||||
|
||||
elif self.tag_name_re.match(token):
|
||||
# Just a tag name.
|
||||
tag_name = token
|
||||
else:
|
||||
raise ValueError(
|
||||
'Unsupported or invalid CSS selector: "%s"' % token)
|
||||
if recursive_candidate_generator:
|
||||
# This happens when the selector looks like "> foo".
|
||||
#
|
||||
# The generator calls select() recursively on every
|
||||
# member of the current context, passing in a different
|
||||
# candidate generator and a different selector.
|
||||
#
|
||||
# In the case of "> foo", the candidate generator is
|
||||
# one that yields a tag's direct children (">"), and
|
||||
# the selector is "foo".
|
||||
next_token = tokens[index+1]
|
||||
def recursive_select(tag):
|
||||
if self._select_debug:
|
||||
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
|
||||
print '-' * 40
|
||||
for i in tag.select(next_token, recursive_candidate_generator):
|
||||
if self._select_debug:
|
||||
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
|
||||
yield i
|
||||
if self._select_debug:
|
||||
print '-' * 40
|
||||
_use_candidate_generator = recursive_select
|
||||
elif _candidate_generator is None:
|
||||
# By default, a tag's candidates are all of its
|
||||
# children. If tag_name is defined, only yield tags
|
||||
# with that name.
|
||||
if self._select_debug:
|
||||
if tag_name:
|
||||
check = "[any]"
|
||||
else:
|
||||
check = tag_name
|
||||
print ' Default candidate generator, tag name="%s"' % check
|
||||
if self._select_debug:
|
||||
# This is redundant with later code, but it stops
|
||||
# a bunch of bogus tags from cluttering up the
|
||||
# debug log.
|
||||
def default_candidate_generator(tag):
|
||||
for child in tag.descendants:
|
||||
if not isinstance(child, Tag):
|
||||
continue
|
||||
if tag_name and not child.name == tag_name:
|
||||
continue
|
||||
yield child
|
||||
_use_candidate_generator = default_candidate_generator
|
||||
else:
|
||||
_use_candidate_generator = lambda tag: tag.descendants
|
||||
else:
|
||||
_use_candidate_generator = _candidate_generator
|
||||
|
||||
count = 0
|
||||
for tag in current_context:
|
||||
if self._select_debug:
|
||||
print " Running candidate generator on %s %s" % (
|
||||
tag.name, repr(tag.attrs))
|
||||
for candidate in _use_candidate_generator(tag):
|
||||
if not isinstance(candidate, Tag):
|
||||
continue
|
||||
if tag_name and candidate.name != tag_name:
|
||||
continue
|
||||
if checker is not None:
|
||||
try:
|
||||
result = checker(candidate)
|
||||
except StopIteration:
|
||||
# The checker has decided we should no longer
|
||||
# run the generator.
|
||||
break
|
||||
if checker is None or result:
|
||||
if self._select_debug:
|
||||
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
if id(candidate) not in new_context_ids:
|
||||
# If a tag matches a selector more than once,
|
||||
# don't include it in the context more than once.
|
||||
new_context.append(candidate)
|
||||
new_context_ids.add(id(candidate))
|
||||
elif self._select_debug:
|
||||
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
|
||||
|
||||
current_context = new_context
|
||||
if limit and len(current_context) >= limit:
|
||||
current_context = current_context[:limit]
|
||||
|
||||
if self._select_debug:
|
||||
print "Final verdict:"
|
||||
for i in current_context:
|
||||
print " %s %s" % (i.name, i.attrs)
|
||||
return current_context
|
||||
:param kwargs: Any extra arguments you'd like to pass in to
|
||||
soupsieve.select().
|
||||
"""
|
||||
if namespaces is None:
|
||||
namespaces = self._namespaces
|
||||
|
||||
if limit is None:
|
||||
limit = 0
|
||||
if soupsieve is None:
|
||||
raise NotImplementedError(
|
||||
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
||||
)
|
||||
|
||||
return soupsieve.select(selector, self, namespaces, limit, **kwargs)
|
||||
|
||||
# Old names for backwards compatibility
|
||||
def childGenerator(self):
|
||||
|
|
@ -1687,7 +1425,7 @@ class SoupStrainer(object):
|
|||
def _normalize_search_value(self, value):
|
||||
# Leave it alone if it's a Unicode string, a callable, a
|
||||
# regular expression, a boolean, or None.
|
||||
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
|
||||
if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match')
|
||||
or isinstance(value, bool) or value is None):
|
||||
return value
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue