Roll included soupsieve back--newest isn't py2 compat.

This commit is contained in:
Jim Miller 2020-12-22 14:03:03 -06:00
parent 34dc2e14b2
commit 9112346f41
6 changed files with 366 additions and 253 deletions

View file

@ -25,16 +25,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE. SOFTWARE.
""" """
from __future__ import unicode_literals
from .__meta__ import __version__, __version_info__ # noqa: F401 from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp from . import css_parser as cp
from . import css_match as cm from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import DEBUG, SelectorSyntaxError # noqa: F401 from .util import DEBUG, _QUIRKS, deprecated, SelectorSyntaxError # noqa: F401
__all__ = ( __all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve', 'DEBUG', "_QUIRKS", 'SelectorSyntaxError', 'SoupSieve',
'closest', 'compile', 'filter', 'iselect', 'closest', 'comments', 'compile', 'filter', 'icomments',
'match', 'select', 'select_one' 'iselect', 'match', 'select', 'select_one'
) )
SoupSieve = cm.SoupSieve SoupSieve = cm.SoupSieve
@ -86,6 +87,21 @@ def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
return compile(select, namespaces, flags, **kwargs).filter(iterable) return compile(select, namespaces, flags, **kwargs).filter(iterable)
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
def comments(tag, limit=0, flags=0, **kwargs):
    """
    Collect the comments found under `tag` into a list.

    `limit` is forwarded to `CommentsMatch.get_comments`. `flags` and any
    extra keyword arguments are accepted but not used by this function.
    """

    matcher = cm.CommentsMatch(tag)
    return list(matcher.get_comments(limit))
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
def icomments(tag, limit=0, flags=0, **kwargs):
    """
    Lazily iterate the comments found under `tag`.

    `limit` is forwarded to `CommentsMatch.get_comments`. `flags` and any
    extra keyword arguments are accepted but not used by this function.
    """

    matcher = cm.CommentsMatch(tag)
    # Explicit loop instead of `yield from` to keep the module Python 2 compatible.
    for found in matcher.get_comments(limit):
        yield found
def select_one(select, tag, namespaces=None, flags=0, **kwargs): def select_one(select, tag, namespaces=None, flags=0, **kwargs):
"""Select a single tag.""" """Select a single tag."""

View file

@ -1,4 +1,5 @@
"""Meta related things.""" """Meta related things."""
from __future__ import unicode_literals
from collections import namedtuple from collections import namedtuple
import re import re
@ -185,5 +186,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev) return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(2, 1, 0, "final") __version_info__ = Version(1, 9, 1, "final")
__version__ = __version_info__._get_canonical() __version__ = __version_info__._get_canonical()

View file

@ -1,12 +1,11 @@
"""CSS matcher.""" """CSS matcher."""
from __future__ import unicode_literals
from datetime import datetime from datetime import datetime
from . import util from . import util
import re import re
from .import css_types as ct from .import css_types as ct
import unicodedata import unicodedata
import bs4
# Empty tag pattern (whitespace okay) # Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -44,7 +43,6 @@ RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2}
RE_DATETIME = re.compile( RE_DATETIME = re.compile(
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
) )
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
FEB = 2 FEB = 2
@ -55,7 +53,7 @@ FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7 DAYS_IN_WEEK = 7
class _FakeParent(object): class FakeParent(object):
""" """
Fake parent class. Fake parent class.
@ -75,7 +73,7 @@ class _FakeParent(object):
return len(self.contents) return len(self.contents)
class _DocumentNav(object): class Document(object):
"""Navigate a Beautiful Soup document.""" """Navigate a Beautiful Soup document."""
@classmethod @classmethod
@ -89,37 +87,58 @@ class _DocumentNav(object):
@staticmethod @staticmethod
def is_doc(obj): def is_doc(obj):
"""Is `BeautifulSoup` object.""" """Is `BeautifulSoup` object."""
import bs4
return isinstance(obj, bs4.BeautifulSoup) return isinstance(obj, bs4.BeautifulSoup)
@staticmethod @staticmethod
def is_tag(obj): def is_tag(obj):
"""Is tag.""" """Is tag."""
import bs4
return isinstance(obj, bs4.Tag) return isinstance(obj, bs4.Tag)
@staticmethod
def is_comment(obj):
    """Report whether `obj` is an instance of `bs4.Comment`."""

    # Deferred import: `bs4` is only loaded when the check actually runs.
    from bs4 import Comment
    return isinstance(obj, Comment)
@staticmethod @staticmethod
def is_declaration(obj): # pragma: no cover def is_declaration(obj): # pragma: no cover
"""Is declaration.""" """Is declaration."""
import bs4
return isinstance(obj, bs4.Declaration) return isinstance(obj, bs4.Declaration)
@staticmethod
def is_cdata(obj):  # pragma: no cover
    """
    Report whether `obj` is a CDATA node.

    Returns `True` only for instances of `bs4.CData`.
    """

    # Deferred import: `bs4` is only loaded when the check actually runs.
    import bs4
    # Must test `CData`: checking `Declaration` here would make this a
    # duplicate of `is_declaration` and never recognize CDATA sections.
    return isinstance(obj, bs4.CData)
@staticmethod @staticmethod
def is_processing_instruction(obj): # pragma: no cover def is_processing_instruction(obj): # pragma: no cover
"""Is processing instruction.""" """Is processing instruction."""
import bs4
return isinstance(obj, bs4.ProcessingInstruction) return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod @staticmethod
def is_navigable_string(obj): def is_navigable_string(obj):
"""Is navigable string.""" """Is navigable string."""
import bs4
return isinstance(obj, bs4.NavigableString) return isinstance(obj, bs4.NavigableString)
@staticmethod @staticmethod
def is_special_string(obj): def is_special_string(obj):
"""Is special string.""" """Is special string."""
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
import bs4
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction))
@classmethod @classmethod
def is_content_string(cls, obj): def is_content_string(cls, obj):
@ -131,7 +150,7 @@ class _DocumentNav(object):
def create_fake_parent(el): def create_fake_parent(el):
"""Create fake parent for a given element.""" """Create fake parent for a given element."""
return _FakeParent(el) return FakeParent(el)
@staticmethod @staticmethod
def is_xml_tree(el): def is_xml_tree(el):
@ -198,13 +217,10 @@ class _DocumentNav(object):
is_tag = self.is_tag(child) is_tag = self.is_tag(child)
if no_iframe and is_tag and self.is_iframe(child): if no_iframe and is_tag and self.is_iframe(child):
if child.next_sibling is not None: last_child = child
next_good = child.next_sibling while self.is_tag(last_child) and last_child.contents:
else: last_child = last_child.contents[-1]
last_child = child next_good = last_child.next_element
while self.is_tag(last_child) and last_child.contents:
last_child = last_child.contents[-1]
next_good = last_child.next_element
yield child yield child
if next_good is None: if next_good is None:
break break
@ -234,27 +250,21 @@ class _DocumentNav(object):
return el.prefix return el.prefix
@staticmethod
def get_uri(el):
"""Get namespace `URI`."""
return el.namespace
@classmethod @classmethod
def get_next(cls, el, tags=True): def get_next_tag(cls, el):
"""Get next sibling tag.""" """Get next sibling tag."""
sibling = el.next_sibling sibling = el.next_sibling
while tags and not cls.is_tag(sibling) and sibling is not None: while not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.next_sibling sibling = sibling.next_sibling
return sibling return sibling
@classmethod @classmethod
def get_previous(cls, el, tags=True): def get_previous_tag(cls, el):
"""Get previous sibling tag.""" """Get previous sibling tag."""
sibling = el.previous_sibling sibling = el.previous_sibling
while tags and not cls.is_tag(sibling) and sibling is not None: while not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.previous_sibling sibling = sibling.previous_sibling
return sibling return sibling
@ -305,7 +315,7 @@ class _DocumentNav(object):
"""Get classes.""" """Get classes."""
classes = cls.get_attribute_by_name(el, 'class', []) classes = cls.get_attribute_by_name(el, 'class', [])
if isinstance(classes, str): if isinstance(classes, util.ustr):
classes = RE_NOT_WS.findall(classes) classes = RE_NOT_WS.findall(classes)
return classes return classes
@ -316,11 +326,6 @@ class _DocumentNav(object):
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
) )
def get_own_text(self, el, no_iframe=False):
"""Get Own Text."""
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
class Inputs(object): class Inputs(object):
"""Class for parsing and validating input items.""" """Class for parsing and validating input items."""
@ -423,7 +428,7 @@ class Inputs(object):
return parsed return parsed
class _Match(object): class CSSMatch(Document, object):
"""Perform CSS matching.""" """Perform CSS matching."""
def __init__(self, selectors, scope, namespaces, flags): def __init__(self, selectors, scope, namespaces, flags):
@ -471,7 +476,7 @@ class _Match(object):
if self.supports_namespaces(): if self.supports_namespaces():
namespace = '' namespace = ''
ns = self.get_uri(el) ns = el.namespace
if ns: if ns:
namespace = ns namespace = ns
else: else:
@ -531,57 +536,6 @@ class _Match(object):
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
return None return None
def extended_language_filter(self, lang_range, lang_tag):
"""Filter the language tags."""
match = True
lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
ranges = lang_range.split('-')
subtags = lang_tag.lower().split('-')
length = len(ranges)
rindex = 0
sindex = 0
r = ranges[rindex]
s = subtags[sindex]
# Primary tag needs to match
if r != '*' and r != s:
match = False
rindex += 1
sindex += 1
# Match until we run out of ranges
while match and rindex < length:
r = ranges[rindex]
try:
s = subtags[sindex]
except IndexError:
# Ran out of subtags,
# but we still have ranges
match = False
continue
# Empty range
if not r:
match = False
continue
# Matched range
elif s == r:
rindex += 1
# Implicit wildcard cannot match
# singletons
elif len(s) == 1:
match = False
continue
# Implicitly matched, so grab next subtag
sindex += 1
return match
def match_attribute_name(self, el, attr, prefix): def match_attribute_name(self, el, attr, prefix):
"""Match attribute name and return value if it exists.""" """Match attribute name and return value if it exists."""
@ -706,12 +660,12 @@ class _Match(object):
if parent: if parent:
found = self.match_selectors(parent, relation) found = self.match_selectors(parent, relation)
elif relation[0].rel_type == REL_SIBLING: elif relation[0].rel_type == REL_SIBLING:
sibling = self.get_previous(el) sibling = self.get_previous_tag(el)
while not found and sibling: while not found and sibling:
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
sibling = self.get_previous(sibling) sibling = self.get_previous_tag(sibling)
elif relation[0].rel_type == REL_CLOSE_SIBLING: elif relation[0].rel_type == REL_CLOSE_SIBLING:
sibling = self.get_previous(el) sibling = self.get_previous_tag(el)
if sibling and self.is_tag(sibling): if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
return found return found
@ -736,12 +690,12 @@ class _Match(object):
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
found = self.match_future_child(el, relation) found = self.match_future_child(el, relation)
elif relation[0].rel_type == REL_HAS_SIBLING: elif relation[0].rel_type == REL_HAS_SIBLING:
sibling = self.get_next(el) sibling = self.get_next_tag(el)
while not found and sibling: while not found and sibling:
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
sibling = self.get_next(sibling) sibling = self.get_next_tag(sibling)
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
sibling = self.get_next(el) sibling = self.get_next_tag(el)
if sibling and self.is_tag(sibling): if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation) found = self.match_selectors(sibling, relation)
return found return found
@ -782,28 +736,7 @@ class _Match(object):
def match_root(self, el): def match_root(self, el):
"""Match element as root.""" """Match element as root."""
is_root = self.is_root(el) return self.is_root(el)
if is_root:
sibling = self.get_previous(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_previous(sibling, tags=False)
if is_root:
sibling = self.get_next(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_next(sibling, tags=False)
return is_root
def match_scope(self, el): def match_scope(self, el):
"""Match element as scope.""" """Match element as scope."""
@ -948,23 +881,12 @@ class _Match(object):
content = None content = None
for contain_list in contains: for contain_list in contains:
if content is None: if content is None:
if contain_list.own: content = self.get_text(el, no_iframe=self.is_html)
content = self.get_own_text(el, no_iframe=self.is_html)
else:
content = self.get_text(el, no_iframe=self.is_html)
found = False found = False
for text in contain_list.text: for text in contain_list.text:
if contain_list.own: if text in content:
for c in content: found = True
if text in c: break
found = True
break
if found:
break
else:
if text in content:
found = True
break
if not found: if not found:
match = False match = False
return match return match
@ -1148,7 +1070,7 @@ class _Match(object):
for patterns in langs: for patterns in langs:
match = False match = False
for pattern in patterns: for pattern in patterns:
if self.extended_language_filter(pattern, found_lang): if pattern.match(found_lang):
match = True match = True
if not match: if not match:
break break
@ -1230,7 +1152,7 @@ class _Match(object):
out_of_range = False out_of_range = False
itype = util.lower(self.get_attribute_by_name(el, 'type')) itype = self.get_attribute_by_name(el, 'type').lower()
mn = self.get_attribute_by_name(el, 'min', None) mn = self.get_attribute_by_name(el, 'min', None)
if mn is not None: if mn is not None:
mn = Inputs.parse_value(itype, mn) mn = Inputs.parse_value(itype, mn)
@ -1285,21 +1207,6 @@ class _Match(object):
self.get_prefix(el) is not None self.get_prefix(el) is not None
) )
def match_placeholder_shown(self, el):
"""
Match placeholder shown according to HTML spec.
- text area should be checked if they have content. A single newline does not count as content.
"""
match = False
content = self.get_text(el)
if content in ('', '\n'):
match = True
return match
def match_selectors(self, el, selectors): def match_selectors(self, el, selectors):
"""Check if element matches one of the selectors.""" """Check if element matches one of the selectors."""
@ -1332,9 +1239,6 @@ class _Match(object):
# Verify element is scope # Verify element is scope
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
continue continue
# Verify element has placeholder shown
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
continue
# Verify `nth` matches # Verify `nth` matches
if not self.match_nth(el, selector.nth): if not self.match_nth(el, selector.nth):
continue continue
@ -1421,8 +1325,28 @@ class _Match(object):
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
class CSSMatch(_DocumentNav, _Match): class CommentsMatch(Document, object):
"""The Beautiful Soup CSS match class.""" """Comments matcher."""
def __init__(self, el):
    """Validate `el`, then keep it as the tag whose comments are searched."""

    # Validation runs first so an invalid input is never stored on the instance.
    self.assert_valid_input(el)
    self.tag = el
def get_comments(self, limit=0):
    """
    Yield the comment nodes among the descendants of the stored tag.

    A `limit` below 1 means "no limit"; otherwise iteration stops once
    `limit` comments have been produced.
    """

    remaining = None if limit < 1 else limit

    for node in self.get_descendants(self.tag, tags=False):
        if not self.is_comment(node):
            continue

        yield node

        if remaining is None:
            continue

        # Count down only when a limit is active; stop once it is used up.
        remaining -= 1
        if remaining < 1:
            break
class SoupSieve(ct.Immutable): class SoupSieve(ct.Immutable):
@ -1468,6 +1392,19 @@ class SoupSieve(ct.Immutable):
else: else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
@util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
def comments(self, tag, limit=0):
    """Collect the comments found under `tag` into a list; `limit` is forwarded to `get_comments`."""

    matcher = CommentsMatch(tag)
    return list(matcher.get_comments(limit))
@util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
def icomments(self, tag, limit=0):
    """Lazily iterate the comments found under `tag`; `limit` is forwarded to `get_comments`."""

    matcher = CommentsMatch(tag)
    # Explicit loop instead of `yield from` to keep the module Python 2 compatible.
    for found in matcher.get_comments(limit):
        yield found
def select_one(self, tag): def select_one(self, tag):
"""Select a single tag.""" """Select a single tag."""

View file

@ -1,11 +1,10 @@
"""CSS selector parser.""" """CSS selector parser."""
from __future__ import unicode_literals
import re import re
from functools import lru_cache
from . import util from . import util
from . import css_match as cm from . import css_match as cm
from . import css_types as ct from . import css_types as ct
from .util import SelectorSyntaxError from .util import SelectorSyntaxError
import warnings
UNICODE_REPLACEMENT_CHAR = 0xFFFD UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -60,8 +59,6 @@ PSEUDO_SIMPLE_NO_MATCH = {
# Complex pseudo classes that take selector lists # Complex pseudo classes that take selector lists
PSEUDO_COMPLEX = { PSEUDO_COMPLEX = {
':contains', ':contains',
':-soup-contains',
':-soup-contains-own',
':has', ':has',
':is', ':is',
':matches', ':matches',
@ -113,6 +110,11 @@ VALUE = r'''
ATTR = r''' ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] (?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE) '''.format(ws=WSC, value=VALUE)
# Definitions for quirks mode
QUIRKS_ATTR_IDENTIFIER = r'(?:(?:{esc}|(?!/\*)[^"\] \t\r\n\f])+?)'.format(esc=CSS_ESCAPES)
QUIRKS_ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=QUIRKS_ATTR_IDENTIFIER)
# Selector patterns # Selector patterns
# IDs (`#id`) # IDs (`#id`)
@ -120,11 +122,13 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`) # Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER) PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`) # Prefix:Tag (`prefix|tag`)
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER) PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.) # Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = r''' PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr} # Quirks attributes, like real attributes, but unquoted values can contain anything but whitespace and closing `]`.
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR) PAT_QUIRKS_ATTR = r'''
\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`) # Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER) PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. # Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
@ -195,13 +199,12 @@ FLG_INDETERMINATE = 0x20
FLG_OPEN = 0x40 FLG_OPEN = 0x40
FLG_IN_RANGE = 0x80 FLG_IN_RANGE = 0x80
FLG_OUT_OF_RANGE = 0x100 FLG_OUT_OF_RANGE = 0x100
FLG_PLACEHOLDER_SHOWN = 0x200
# Maximum cached patterns to store # Maximum cached patterns to store
_MAXCACHE = 500 _MAXCACHE = 500
@lru_cache(maxsize=_MAXCACHE) @util.lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags): def _cached_css_compile(pattern, namespaces, custom, flags):
"""Cached CSS compile.""" """Cached CSS compile."""
@ -250,7 +253,7 @@ def css_unescape(content, string=False):
codepoint = int(m.group(1)[1:], 16) codepoint = int(m.group(1)[1:], 16)
if codepoint == 0: if codepoint == 0:
codepoint = UNICODE_REPLACEMENT_CHAR codepoint = UNICODE_REPLACEMENT_CHAR
value = chr(codepoint) value = util.uchr(codepoint)
elif m.group(2): elif m.group(2):
value = m.group(2)[1:] value = m.group(2)[1:]
elif m.group(3): elif m.group(3):
@ -274,7 +277,7 @@ def escape(ident):
string.append('\\{}'.format(ident)) string.append('\\{}'.format(ident))
else: else:
for index, c in enumerate(ident): for index, c in enumerate(ident):
codepoint = ord(c) codepoint = util.uord(c)
if codepoint == 0x00: if codepoint == 0x00:
string.append('\ufffd') string.append('\ufffd')
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
@ -305,7 +308,12 @@ class SelectorPattern(object):
return self.name return self.name
def match(self, selector, index, flags): def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
"""Match the selector.""" """Match the selector."""
return self.re_pattern.match(selector, index) return self.re_pattern.match(selector, index)
@ -320,7 +328,7 @@ class SpecialPseudoPattern(SelectorPattern):
self.patterns = {} self.patterns = {}
for p in patterns: for p in patterns:
name = p[0] name = p[0]
pattern = p[3](name, p[2]) pattern = SelectorPattern(name, p[2])
for pseudo in p[1]: for pseudo in p[1]:
self.patterns[pseudo] = pattern self.patterns[pseudo] = pattern
@ -332,7 +340,12 @@ class SpecialPseudoPattern(SelectorPattern):
return self.matched_name.get_name() return self.matched_name.get_name()
def match(self, selector, index, flags): def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
"""Match the selector.""" """Match the selector."""
pseudo = None pseudo = None
@ -341,13 +354,22 @@ class SpecialPseudoPattern(SelectorPattern):
name = util.lower(css_unescape(m.group('name'))) name = util.lower(css_unescape(m.group('name')))
pattern = self.patterns.get(name) pattern = self.patterns.get(name)
if pattern: if pattern:
pseudo = pattern.match(selector, index, flags) pseudo = pattern.match(selector, index)
if pseudo: if pseudo:
self.matched_name = pattern self.matched_name = pattern
return pseudo return pseudo
class QuirkPattern(SelectorPattern):
    """Selector pattern that is only active in quirks mode."""

    def enabled(self, flags):
        """Return the `_QUIRKS` bit of `flags`; a non-zero result means the pattern is enabled."""

        quirks_bit = flags & util._QUIRKS
        return quirks_bit
class _Selector(object): class _Selector(object):
""" """
Intermediate selector class. Intermediate selector class.
@ -424,16 +446,11 @@ class CSSParser(object):
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
SpecialPseudoPattern( SpecialPseudoPattern(
( (
( ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
"pseudo_contains", ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
(':contains', ':-soup-contains', ':-soup-contains-own'), ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
PAT_PSEUDO_CONTAINS, ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
SelectorPattern ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
) )
), ),
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
@ -444,6 +461,7 @@ class CSSParser(object):
SelectorPattern("class", PAT_CLASS), SelectorPattern("class", PAT_CLASS),
SelectorPattern("tag", PAT_TAG), SelectorPattern("tag", PAT_TAG),
SelectorPattern("attribute", PAT_ATTR), SelectorPattern("attribute", PAT_ATTR),
QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR),
SelectorPattern("combine", PAT_COMBINE) SelectorPattern("combine", PAT_COMBINE)
) )
@ -453,19 +471,24 @@ class CSSParser(object):
self.pattern = selector.replace('\x00', '\ufffd') self.pattern = selector.replace('\x00', '\ufffd')
self.flags = flags self.flags = flags
self.debug = self.flags & util.DEBUG self.debug = self.flags & util.DEBUG
self.quirks = self.flags & util._QUIRKS
self.custom = {} if custom is None else custom self.custom = {} if custom is None else custom
def parse_attribute_selector(self, sel, m, has_selector): def parse_attribute_selector(self, sel, m, has_selector, quirks):
"""Create attribute selector from the returned regex match.""" """Create attribute selector from the returned regex match."""
inverse = False inverse = False
op = m.group('cmp') op = m.group('cmp')
case = util.lower(m.group('case')) if m.group('case') else None case = util.lower(m.group('case')) if m.group('case') else None
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else '' parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
attr = css_unescape(m.group('attr_name')) ns = ''
is_type = False is_type = False
pattern2 = None pattern2 = None
if len(parts) > 1:
ns = parts[0]
attr = parts[1]
else:
attr = parts[0]
if case: if case:
flags = re.I if case == 'i' else 0 flags = re.I if case == 'i' else 0
elif util.lower(attr) == 'type': elif util.lower(attr) == 'type':
@ -475,7 +498,7 @@ class CSSParser(object):
flags = 0 flags = 0
if op: if op:
if m.group('value').startswith(('"', "'")): if m.group('value').startswith(('"', "'")) and not quirks:
value = css_unescape(m.group('value')[1:-1], True) value = css_unescape(m.group('value')[1:-1], True)
else: else:
value = css_unescape(m.group('value')) value = css_unescape(m.group('value'))
@ -502,12 +525,13 @@ class CSSParser(object):
elif op.startswith('|'): elif op.startswith('|'):
# Value starts with word in dash separated list # Value starts with word in dash separated list
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags) pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
elif op.startswith('!'):
# Equivalent to `:not([attr=value])`
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
inverse = True
else: else:
# Value matches # Value matches
pattern = re.compile(r'^%s$' % re.escape(value), flags) pattern = re.compile(r'^%s$' % re.escape(value), flags)
if op.startswith('!'):
# Equivalent to `:not([attr=value])`
inverse = True
if is_type and pattern: if is_type and pattern:
pattern2 = re.compile(pattern.pattern) pattern2 = re.compile(pattern.pattern)
@ -528,8 +552,13 @@ class CSSParser(object):
def parse_tag_pattern(self, sel, m, has_selector): def parse_tag_pattern(self, sel, m, has_selector):
"""Parse tag pattern from regex match.""" """Parse tag pattern from regex match."""
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None parts = [css_unescape(x) for x in m.group(0).split('|')]
tag = css_unescape(m.group('tag_name')) if len(parts) > 1:
prefix = parts[0]
tag = parts[1]
else:
tag = parts[0]
prefix = None
sel.tag = ct.SelectorTag(tag, prefix) sel.tag = ct.SelectorTag(tag, prefix)
has_selector = True has_selector = True
return has_selector return has_selector
@ -771,11 +800,21 @@ class CSSParser(object):
if not combinator: if not combinator:
combinator = WS_COMBINATOR combinator = WS_COMBINATOR
if not has_selector: if not has_selector:
raise SelectorSyntaxError( # The only way we don't fail is if we are at the root level and quirks mode is enabled,
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), # and we've found no other selectors yet in this compound selector.
if (not self.quirks or is_pseudo or combinator == COMMA_COMBINATOR or relations):
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
util.warn_quirks(
'You have attempted to use a combinator without a selector before it at position {}.'.format(index),
'the :scope pseudo class (or another appropriate selector) should be placed before the combinator.',
self.pattern, self.pattern,
index index
) )
sel.flags |= ct.SEL_SCOPE
if combinator == COMMA_COMBINATOR: if combinator == COMMA_COMBINATOR:
if not sel.tag and not is_pseudo: if not sel.tag and not is_pseudo:
@ -808,14 +847,7 @@ class CSSParser(object):
def parse_pseudo_contains(self, sel, m, has_selector): def parse_pseudo_contains(self, sel, m, has_selector):
"""Parse contains.""" """Parse contains."""
pseudo = util.lower(css_unescape(m.group('name'))) values = m.group('values')
if pseudo == ":contains":
warnings.warn(
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
FutureWarning
)
contains_own = pseudo == ":-soup-contains-own"
values = css_unescape(m.group('values'))
patterns = [] patterns = []
for token in RE_VALUES.finditer(values): for token in RE_VALUES.finditer(values):
if token.group('split'): if token.group('split'):
@ -826,7 +858,7 @@ class CSSParser(object):
else: else:
value = css_unescape(value) value = css_unescape(value)
patterns.append(value) patterns.append(value)
sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own)) sel.contains.append(ct.SelectorContains(tuple(patterns)))
has_selector = True has_selector = True
return has_selector return has_selector
@ -840,12 +872,20 @@ class CSSParser(object):
continue continue
value = token.group('value') value = token.group('value')
if value.startswith(('"', "'")): if value.startswith(('"', "'")):
value = css_unescape(value[1:-1], True) parts = css_unescape(value[1:-1], True).split('-')
else: else:
value = css_unescape(value) parts = css_unescape(value).split('-')
patterns.append(value)
new_parts = []
first = True
for part in parts:
if part == '*' and first:
new_parts.append('(?!x\b)[a-z0-9]+?')
elif part != '*':
new_parts.append(('' if first else '(-(?!x\b)[a-z0-9]+)*?\\-') + re.escape(part))
if first:
first = False
patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
sel.lang.append(ct.SelectorLang(patterns)) sel.lang.append(ct.SelectorLang(patterns))
has_selector = True has_selector = True
@ -877,7 +917,6 @@ class CSSParser(object):
is_indeterminate = bool(flags & FLG_INDETERMINATE) is_indeterminate = bool(flags & FLG_INDETERMINATE)
is_in_range = bool(flags & FLG_IN_RANGE) is_in_range = bool(flags & FLG_IN_RANGE)
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
if self.debug: # pragma: no cover if self.debug: # pragma: no cover
if is_pseudo: if is_pseudo:
@ -898,8 +937,6 @@ class CSSParser(object):
print(' is_in_range: True') print(' is_in_range: True')
if is_out_of_range: if is_out_of_range:
print(' is_out_of_range: True') print(' is_out_of_range: True')
if is_placeholder_shown:
print(' is_placeholder_shown: True')
if is_relative: if is_relative:
selectors.append(_Selector()) selectors.append(_Selector())
@ -916,7 +953,7 @@ class CSSParser(object):
elif key == 'pseudo_class': elif key == 'pseudo_class':
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html) has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
elif key == 'pseudo_element': elif key == 'pseudo_element':
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0))) raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
elif key == 'pseudo_contains': elif key == 'pseudo_contains':
has_selector = self.parse_pseudo_contains(sel, m, has_selector) has_selector = self.parse_pseudo_contains(sel, m, has_selector)
elif key in ('pseudo_nth_type', 'pseudo_nth_child'): elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
@ -952,8 +989,18 @@ class CSSParser(object):
has_selector, sel = self.parse_combinator( has_selector, sel = self.parse_combinator(
sel, m, has_selector, selectors, relations, is_pseudo, index sel, m, has_selector, selectors, relations, is_pseudo, index
) )
elif key == 'attribute': elif key in ('attribute', 'quirks_attribute'):
has_selector = self.parse_attribute_selector(sel, m, has_selector) quirks = key == 'quirks_attribute'
if quirks:
temp_index = index + m.group(0).find('=') + 1
util.warn_quirks(
"You have attempted to use an attribute " +
"value that should have been quoted at position {}.".format(temp_index),
"the attribute value should be quoted.",
self.pattern,
temp_index
)
has_selector = self.parse_attribute_selector(sel, m, has_selector, quirks)
elif key == 'tag': elif key == 'tag':
if has_selector: if has_selector:
raise SelectorSyntaxError( raise SelectorSyntaxError(
@ -1006,8 +1053,6 @@ class CSSParser(object):
selectors[-1].flags = ct.SEL_IN_RANGE selectors[-1].flags = ct.SEL_IN_RANGE
if is_out_of_range: if is_out_of_range:
selectors[-1].flags = ct.SEL_OUT_OF_RANGE selectors[-1].flags = ct.SEL_OUT_OF_RANGE
if is_placeholder_shown:
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
@ -1021,11 +1066,15 @@ class CSSParser(object):
end = (m.start(0) - 1) if m else (len(pattern) - 1) end = (m.start(0) - 1) if m else (len(pattern) - 1)
if self.debug: # pragma: no cover if self.debug: # pragma: no cover
if self.quirks:
print('## QUIRKS MODE: Throwing out the spec!')
print('## PARSING: {!r}'.format(pattern)) print('## PARSING: {!r}'.format(pattern))
while index <= end: while index <= end:
m = None m = None
for v in self.css_tokens: for v in self.css_tokens:
m = v.match(pattern, index, self.flags) if not v.enabled(self.flags): # pragma: no cover
continue
m = v.match(pattern, index)
if m: if m:
name = v.get_name() name = v.get_name()
if self.debug: # pragma: no cover if self.debug: # pragma: no cover
@ -1053,7 +1102,13 @@ class CSSParser(object):
print('## END PARSING') print('## END PARSING')
def process_selectors(self, index=0, flags=0): def process_selectors(self, index=0, flags=0):
"""Process selectors.""" """
Process selectors.
We do our own selectors as BeautifulSoup4 has some annoying quirks,
and we don't really need to do nth selectors or siblings or
descendants etc.
"""
return self.parse_selectors(self.selector_iter(self.pattern), index, flags) return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
@ -1068,7 +1123,8 @@ CSS_LINK = CSSParser(
# CSS pattern for `:checked` # CSS pattern for `:checked`
CSS_CHECKED = CSSParser( CSS_CHECKED = CSSParser(
''' '''
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected] html|*:is(input[type=checkbox], input[type=radio])[checked],
html|select > html|option[selected]
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first) # CSS pattern for `:default` (must compile CSS_CHECKED first)
@ -1094,23 +1150,23 @@ CSS_INDETERMINATE = CSSParser(
This pattern must be at the end. This pattern must be at the end.
Special logic is applied to the last selector. Special logic is applied to the last selector.
*/ */
html|input[type="radio"][name]:not([name='']):not([checked]) html|input[type="radio"][name][name!='']:not([checked])
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled` # CSS pattern for `:disabled`
CSS_DISABLED = CSSParser( CSS_DISABLED = CSSParser(
''' '''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|optgroup[disabled] > html|option, html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset), html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
html|fieldset[disabled] > html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset) html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled` # CSS pattern for `:enabled`
CSS_ENABLED = CSSParser( CSS_ENABLED = CSSParser(
''' '''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required` # CSS pattern for `:required`
@ -1124,20 +1180,22 @@ CSS_OPTIONAL = CSSParser(
# CSS pattern for `:placeholder-shown` # CSS pattern for `:placeholder-shown`
CSS_PLACEHOLDER_SHOWN = CSSParser( CSS_PLACEHOLDER_SHOWN = CSSParser(
''' '''
html|input:is( html|*:is(
:not([type]), input:is(
[type=""], :not([type]),
[type=text], [type=""],
[type=search], [type=text],
[type=url], [type=search],
[type=tel], [type=url],
[type=email], [type=tel],
[type=password], [type=email],
[type=number] [type=password],
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]), [type=number]
html|textarea[placeholder]:not([placeholder='']) ),
textarea
)[placeholder][placeholder!='']
''' '''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) ).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern default for `:nth-child` "of S" feature # CSS pattern default for `:nth-child` "of S" feature
CSS_NTH_OF_S_DEFAULT = CSSParser( CSS_NTH_OF_S_DEFAULT = CSSParser(
'*|*' '*|*'

View file

@ -1,6 +1,6 @@
"""CSS selector structure items.""" """CSS selector structure items."""
import copyreg from __future__ import unicode_literals
from collections.abc import Hashable, Mapping from . import util
__all__ = ( __all__ = (
'Selector', 'Selector',
@ -26,7 +26,6 @@ SEL_DIR_RTL = 0x40
SEL_IN_RANGE = 0x80 SEL_IN_RANGE = 0x80
SEL_OUT_OF_RANGE = 0x100 SEL_OUT_OF_RANGE = 0x100
SEL_DEFINED = 0x200 SEL_DEFINED = 0x200
SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable(object): class Immutable(object):
@ -86,7 +85,7 @@ class Immutable(object):
__str__ = __repr__ __str__ = __repr__
class ImmutableDict(Mapping): class ImmutableDict(util.Mapping):
"""Hashable, immutable dictionary.""" """Hashable, immutable dictionary."""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@ -95,8 +94,8 @@ class ImmutableDict(Mapping):
arg = args[0] if args else kwargs arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict) is_dict = isinstance(arg, dict)
if ( if (
is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg]) not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
): ):
raise TypeError('All values must be hashable') raise TypeError('All values must be hashable')
@ -141,9 +140,9 @@ class Namespaces(ImmutableDict):
# so don't bother checking that. # so don't bother checking that.
arg = args[0] if args else kwargs arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict) is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]): if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
raise TypeError('Namespace keys and values must be Unicode strings') raise TypeError('Namespace keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]): elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
raise TypeError('Namespace keys and values must be Unicode strings') raise TypeError('Namespace keys and values must be Unicode strings')
super(Namespaces, self).__init__(*args, **kwargs) super(Namespaces, self).__init__(*args, **kwargs)
@ -160,9 +159,9 @@ class CustomSelectors(ImmutableDict):
# so don't bother checking that. # so don't bother checking that.
arg = args[0] if args else kwargs arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict) is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]): if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
raise TypeError('CustomSelectors keys and values must be Unicode strings') raise TypeError('CustomSelectors keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]): elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
raise TypeError('CustomSelectors keys and values must be Unicode strings') raise TypeError('CustomSelectors keys and values must be Unicode strings')
super(CustomSelectors, self).__init__(*args, **kwargs) super(CustomSelectors, self).__init__(*args, **kwargs)
@ -239,14 +238,13 @@ class SelectorAttribute(Immutable):
class SelectorContains(Immutable): class SelectorContains(Immutable):
"""Selector contains rule.""" """Selector contains rule."""
__slots__ = ("text", "own", "_hash") __slots__ = ("text", "_hash")
def __init__(self, text, own): def __init__(self, text):
"""Initialize.""" """Initialize."""
super(SelectorContains, self).__init__( super(SelectorContains, self).__init__(
text=text, text=text
own=own
) )
@ -333,7 +331,7 @@ def _pickle(p):
def pickle_register(obj): def pickle_register(obj):
"""Allow object to be pickled.""" """Allow object to be pickled."""
copyreg.pickle(obj, _pickle) util.copyreg.pickle(obj, _pickle)
pickle_register(Selector) pickle_register(Selector)

View file

@ -1,17 +1,47 @@
"""Utility.""" """Utility."""
from functools import wraps, lru_cache from __future__ import unicode_literals
from functools import wraps
import warnings import warnings
import sys
import struct
import os
import re import re
# Directory that contains this module.
MODULE = os.path.dirname(__file__)

# Interpreter version flags, evaluated once at import time.
PY3 = sys.version_info >= (3, 0)
PY35 = sys.version_info >= (3, 5)
PY37 = sys.version_info >= (3, 7)

# Bind one set of names on both major Python versions so other modules can
# reach them through this module regardless of the interpreter in use.
if PY3:
    from functools import lru_cache  # noqa F401
    import copyreg  # noqa F401
    from collections.abc import Hashable, Mapping  # noqa F401
    ustr = str  # text type
    bstr = bytes  # byte-string type
    unichar = chr  # code point -> one-character text string
    string = str  # type to test against for "is a string"
else:
    # NOTE(review): `backports.functools_lru_cache` is not part of the
    # Python 2 standard library; presumably it must be installed separately.
    from backports.functools_lru_cache import lru_cache  # noqa F401
    import copy_reg as copyreg  # noqa F401
    from collections import Hashable, Mapping  # noqa F401
    ustr = unicode  # noqa: F821
    bstr = str
    unichar = unichr  # noqa: F821
    string = basestring  # noqa: F821
DEBUG = 0x00001 DEBUG = 0x00001
_QUIRKS = 0x10000
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$') RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
LC_A = ord('a')
LC_Z = ord('z')
UC_A = ord('A') UC_A = ord('A')
UC_Z = ord('Z') UC_Z = ord('Z')
@lru_cache(maxsize=512)
def lower(string): def lower(string):
"""Lower.""" """Lower."""
@ -22,7 +52,38 @@ def lower(string):
return ''.join(new_string) return ''.join(new_string)
class SelectorSyntaxError(Exception): def upper(string): # pragma: no cover
"""Lower."""
new_string = []
for c in string:
o = ord(c)
new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
return ''.join(new_string)
def uchr(i):
    """
    Return the text character for code point `i`.

    The regular character constructor is tried first; if it rejects the
    value with `ValueError` (as narrow Python builds do), the code point is
    packed as a native integer and decoded as UTF-32 instead.
    """

    try:
        char = unichar(i)
    except ValueError:  # pragma: no cover
        packed = struct.pack('i', i)
        char = packed.decode('utf-32')
    return char
def uord(c):
    """
    Get Unicode ordinal.

    `c` is either a single character or a two-unit UTF-16 surrogate pair
    (high surrogate followed by low surrogate), which is combined into the
    code point it encodes.

    Any other input is handed to `ord`, so a two-character string that is
    not a surrogate pair raises `TypeError` instead of silently yielding a
    meaningless number.
    """

    if len(c) == 2:  # pragma: no cover
        high, low = ord(c[0]), ord(c[1])
        # Only combine genuine high/low surrogate pairs.
        if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
            return (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
    return ord(c)
class SelectorSyntaxError(SyntaxError):
"""Syntax error in a CSS selector.""" """Syntax error in a CSS selector."""
def __init__(self, msg, pattern=None, index=None): def __init__(self, msg, pattern=None, index=None):
@ -108,3 +169,45 @@ def get_pattern_context(pattern, index):
last = m.end(0) last = m.end(0)
return ''.join(text), line, col return ''.join(text), line, col
class QuirksWarning(UserWarning):  # pragma: no cover
    """
    Warning for quirks mode.

    This is the warning category that `warn_quirks` passes to the
    `warnings` machinery.
    """
def warn_quirks(message, recommend, pattern, index):
    """
    Warn quirks.

    Emits a `QuirksWarning` describing a selector construct that is only
    tolerated temporarily. The warning is attributed to the last stack frame
    seen before the first frame whose file lives under this package or
    under `bs4`, so it points at the caller's code.

    message:   description of what was found in the selector.
    recommend: what should be done to conform to the CSS spec.
    pattern:   the full selector pattern being parsed.
    index:     position in `pattern` the warning refers to.
    """

    import traceback
    import bs4  # noqa: F401

    # Acquire source code line context
    paths = (MODULE, sys.modules['bs4'].__path__[0])
    previous = None
    for entry in traceback.extract_stack():
        entry_file = entry.filename if PY35 else entry[0]
        if entry_file.startswith(paths):
            break
        previous = entry

    # Format pattern to show line and column position
    context, line = get_pattern_context(pattern, index)[0:2]

    text = (
        "\nCSS selector pattern:\n" +
        "    {}\n".format(message) +
        "    This behavior is only allowed temporarily for Beautiful Soup's transition to Soup Sieve.\n" +
        "    In order to conform to the CSS spec, {}\n".format(recommend) +
        "    It is strongly recommended the selector be altered to conform to the CSS spec " +
        "as an exception will be raised for this case in the future.\n" +
        "pattern line {}:\n{}".format(line, context)
    )

    if previous is None:
        # No frame outside this package/bs4 precedes the call, so there is no
        # caller location to report. `warn_explicit` does not accept `None`
        # for the file name or line number, so issue a regular warning.
        warnings.warn(text, QuirksWarning)
        return

    filename = previous.filename if PY35 else previous[0]
    lineno = previous.lineno if PY35 else previous[1]

    # Display warning
    warnings.warn_explicit(text, QuirksWarning, filename, lineno)