mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 00:43:00 +01:00
Roll included soupsieve back--newest isn't py2 compat.
This commit is contained in:
parent
34dc2e14b2
commit
9112346f41
6 changed files with 366 additions and 253 deletions
|
|
@ -25,16 +25,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
from .__meta__ import __version__, __version_info__ # noqa: F401
|
||||
from . import css_parser as cp
|
||||
from . import css_match as cm
|
||||
from . import css_types as ct
|
||||
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
||||
from .util import DEBUG, _QUIRKS, deprecated, SelectorSyntaxError # noqa: F401
|
||||
|
||||
__all__ = (
|
||||
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||
'closest', 'compile', 'filter', 'iselect',
|
||||
'match', 'select', 'select_one'
|
||||
'DEBUG', "_QUIRKS", 'SelectorSyntaxError', 'SoupSieve',
|
||||
'closest', 'comments', 'compile', 'filter', 'icomments',
|
||||
'iselect', 'match', 'select', 'select_one'
|
||||
)
|
||||
|
||||
SoupSieve = cm.SoupSieve
|
||||
|
|
@ -86,6 +87,21 @@ def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
|
|||
return compile(select, namespaces, flags, **kwargs).filter(iterable)
|
||||
|
||||
|
||||
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
|
||||
def comments(tag, limit=0, flags=0, **kwargs):
|
||||
"""Get comments only."""
|
||||
|
||||
return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)]
|
||||
|
||||
|
||||
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
|
||||
def icomments(tag, limit=0, flags=0, **kwargs):
|
||||
"""Iterate comments only."""
|
||||
|
||||
for comment in cm.CommentsMatch(tag).get_comments(limit):
|
||||
yield comment
|
||||
|
||||
|
||||
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
|
||||
"""Select a single tag."""
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
"""Meta related things."""
|
||||
from __future__ import unicode_literals
|
||||
from collections import namedtuple
|
||||
import re
|
||||
|
||||
|
|
@ -185,5 +186,5 @@ def parse_version(ver, pre=False):
|
|||
return Version(major, minor, micro, release, pre, post, dev)
|
||||
|
||||
|
||||
__version_info__ = Version(2, 1, 0, "final")
|
||||
__version_info__ = Version(1, 9, 1, "final")
|
||||
__version__ = __version_info__._get_canonical()
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
"""CSS matcher."""
|
||||
from __future__ import unicode_literals
|
||||
from datetime import datetime
|
||||
from . import util
|
||||
import re
|
||||
from .import css_types as ct
|
||||
import unicodedata
|
||||
|
||||
import bs4
|
||||
|
||||
# Empty tag pattern (whitespace okay)
|
||||
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||||
|
||||
|
|
@ -44,7 +43,6 @@ RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2}
|
|||
RE_DATETIME = re.compile(
|
||||
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
|
||||
)
|
||||
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
|
||||
|
||||
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
|
||||
FEB = 2
|
||||
|
|
@ -55,7 +53,7 @@ FEB_LEAP_MONTH = 29
|
|||
DAYS_IN_WEEK = 7
|
||||
|
||||
|
||||
class _FakeParent(object):
|
||||
class FakeParent(object):
|
||||
"""
|
||||
Fake parent class.
|
||||
|
||||
|
|
@ -75,7 +73,7 @@ class _FakeParent(object):
|
|||
return len(self.contents)
|
||||
|
||||
|
||||
class _DocumentNav(object):
|
||||
class Document(object):
|
||||
"""Navigate a Beautiful Soup document."""
|
||||
|
||||
@classmethod
|
||||
|
|
@ -89,37 +87,58 @@ class _DocumentNav(object):
|
|||
@staticmethod
|
||||
def is_doc(obj):
|
||||
"""Is `BeautifulSoup` object."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.BeautifulSoup)
|
||||
|
||||
@staticmethod
|
||||
def is_tag(obj):
|
||||
"""Is tag."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.Tag)
|
||||
|
||||
@staticmethod
|
||||
def is_comment(obj):
|
||||
"""Is comment."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.Comment)
|
||||
|
||||
@staticmethod
|
||||
def is_declaration(obj): # pragma: no cover
|
||||
"""Is declaration."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.Declaration)
|
||||
|
||||
@staticmethod
|
||||
def is_cdata(obj):
|
||||
def is_cdata(obj): # pragma: no cover
|
||||
"""Is CDATA."""
|
||||
return isinstance(obj, bs4.CData)
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.Declaration)
|
||||
|
||||
@staticmethod
|
||||
def is_processing_instruction(obj): # pragma: no cover
|
||||
"""Is processing instruction."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.ProcessingInstruction)
|
||||
|
||||
@staticmethod
|
||||
def is_navigable_string(obj):
|
||||
"""Is navigable string."""
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, bs4.NavigableString)
|
||||
|
||||
@staticmethod
|
||||
def is_special_string(obj):
|
||||
"""Is special string."""
|
||||
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
|
||||
|
||||
import bs4
|
||||
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction))
|
||||
|
||||
@classmethod
|
||||
def is_content_string(cls, obj):
|
||||
|
|
@ -131,7 +150,7 @@ class _DocumentNav(object):
|
|||
def create_fake_parent(el):
|
||||
"""Create fake parent for a given element."""
|
||||
|
||||
return _FakeParent(el)
|
||||
return FakeParent(el)
|
||||
|
||||
@staticmethod
|
||||
def is_xml_tree(el):
|
||||
|
|
@ -198,13 +217,10 @@ class _DocumentNav(object):
|
|||
is_tag = self.is_tag(child)
|
||||
|
||||
if no_iframe and is_tag and self.is_iframe(child):
|
||||
if child.next_sibling is not None:
|
||||
next_good = child.next_sibling
|
||||
else:
|
||||
last_child = child
|
||||
while self.is_tag(last_child) and last_child.contents:
|
||||
last_child = last_child.contents[-1]
|
||||
next_good = last_child.next_element
|
||||
last_child = child
|
||||
while self.is_tag(last_child) and last_child.contents:
|
||||
last_child = last_child.contents[-1]
|
||||
next_good = last_child.next_element
|
||||
yield child
|
||||
if next_good is None:
|
||||
break
|
||||
|
|
@ -234,27 +250,21 @@ class _DocumentNav(object):
|
|||
|
||||
return el.prefix
|
||||
|
||||
@staticmethod
|
||||
def get_uri(el):
|
||||
"""Get namespace `URI`."""
|
||||
|
||||
return el.namespace
|
||||
|
||||
@classmethod
|
||||
def get_next(cls, el, tags=True):
|
||||
def get_next_tag(cls, el):
|
||||
"""Get next sibling tag."""
|
||||
|
||||
sibling = el.next_sibling
|
||||
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||||
while not cls.is_tag(sibling) and sibling is not None:
|
||||
sibling = sibling.next_sibling
|
||||
return sibling
|
||||
|
||||
@classmethod
|
||||
def get_previous(cls, el, tags=True):
|
||||
def get_previous_tag(cls, el):
|
||||
"""Get previous sibling tag."""
|
||||
|
||||
sibling = el.previous_sibling
|
||||
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||||
while not cls.is_tag(sibling) and sibling is not None:
|
||||
sibling = sibling.previous_sibling
|
||||
return sibling
|
||||
|
||||
|
|
@ -305,7 +315,7 @@ class _DocumentNav(object):
|
|||
"""Get classes."""
|
||||
|
||||
classes = cls.get_attribute_by_name(el, 'class', [])
|
||||
if isinstance(classes, str):
|
||||
if isinstance(classes, util.ustr):
|
||||
classes = RE_NOT_WS.findall(classes)
|
||||
return classes
|
||||
|
||||
|
|
@ -316,11 +326,6 @@ class _DocumentNav(object):
|
|||
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
|
||||
)
|
||||
|
||||
def get_own_text(self, el, no_iframe=False):
|
||||
"""Get Own Text."""
|
||||
|
||||
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
|
||||
|
||||
|
||||
class Inputs(object):
|
||||
"""Class for parsing and validating input items."""
|
||||
|
|
@ -423,7 +428,7 @@ class Inputs(object):
|
|||
return parsed
|
||||
|
||||
|
||||
class _Match(object):
|
||||
class CSSMatch(Document, object):
|
||||
"""Perform CSS matching."""
|
||||
|
||||
def __init__(self, selectors, scope, namespaces, flags):
|
||||
|
|
@ -471,7 +476,7 @@ class _Match(object):
|
|||
|
||||
if self.supports_namespaces():
|
||||
namespace = ''
|
||||
ns = self.get_uri(el)
|
||||
ns = el.namespace
|
||||
if ns:
|
||||
namespace = ns
|
||||
else:
|
||||
|
|
@ -531,57 +536,6 @@ class _Match(object):
|
|||
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
|
||||
return None
|
||||
|
||||
def extended_language_filter(self, lang_range, lang_tag):
|
||||
"""Filter the language tags."""
|
||||
|
||||
match = True
|
||||
lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
|
||||
ranges = lang_range.split('-')
|
||||
subtags = lang_tag.lower().split('-')
|
||||
length = len(ranges)
|
||||
rindex = 0
|
||||
sindex = 0
|
||||
r = ranges[rindex]
|
||||
s = subtags[sindex]
|
||||
|
||||
# Primary tag needs to match
|
||||
if r != '*' and r != s:
|
||||
match = False
|
||||
|
||||
rindex += 1
|
||||
sindex += 1
|
||||
|
||||
# Match until we run out of ranges
|
||||
while match and rindex < length:
|
||||
r = ranges[rindex]
|
||||
try:
|
||||
s = subtags[sindex]
|
||||
except IndexError:
|
||||
# Ran out of subtags,
|
||||
# but we still have ranges
|
||||
match = False
|
||||
continue
|
||||
|
||||
# Empty range
|
||||
if not r:
|
||||
match = False
|
||||
continue
|
||||
|
||||
# Matched range
|
||||
elif s == r:
|
||||
rindex += 1
|
||||
|
||||
# Implicit wildcard cannot match
|
||||
# singletons
|
||||
elif len(s) == 1:
|
||||
match = False
|
||||
continue
|
||||
|
||||
# Implicitly matched, so grab next subtag
|
||||
sindex += 1
|
||||
|
||||
return match
|
||||
|
||||
def match_attribute_name(self, el, attr, prefix):
|
||||
"""Match attribute name and return value if it exists."""
|
||||
|
||||
|
|
@ -706,12 +660,12 @@ class _Match(object):
|
|||
if parent:
|
||||
found = self.match_selectors(parent, relation)
|
||||
elif relation[0].rel_type == REL_SIBLING:
|
||||
sibling = self.get_previous(el)
|
||||
sibling = self.get_previous_tag(el)
|
||||
while not found and sibling:
|
||||
found = self.match_selectors(sibling, relation)
|
||||
sibling = self.get_previous(sibling)
|
||||
sibling = self.get_previous_tag(sibling)
|
||||
elif relation[0].rel_type == REL_CLOSE_SIBLING:
|
||||
sibling = self.get_previous(el)
|
||||
sibling = self.get_previous_tag(el)
|
||||
if sibling and self.is_tag(sibling):
|
||||
found = self.match_selectors(sibling, relation)
|
||||
return found
|
||||
|
|
@ -736,12 +690,12 @@ class _Match(object):
|
|||
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
|
||||
found = self.match_future_child(el, relation)
|
||||
elif relation[0].rel_type == REL_HAS_SIBLING:
|
||||
sibling = self.get_next(el)
|
||||
sibling = self.get_next_tag(el)
|
||||
while not found and sibling:
|
||||
found = self.match_selectors(sibling, relation)
|
||||
sibling = self.get_next(sibling)
|
||||
sibling = self.get_next_tag(sibling)
|
||||
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
|
||||
sibling = self.get_next(el)
|
||||
sibling = self.get_next_tag(el)
|
||||
if sibling and self.is_tag(sibling):
|
||||
found = self.match_selectors(sibling, relation)
|
||||
return found
|
||||
|
|
@ -782,28 +736,7 @@ class _Match(object):
|
|||
def match_root(self, el):
|
||||
"""Match element as root."""
|
||||
|
||||
is_root = self.is_root(el)
|
||||
if is_root:
|
||||
sibling = self.get_previous(el, tags=False)
|
||||
while is_root and sibling is not None:
|
||||
if (
|
||||
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||||
self.is_cdata(sibling)
|
||||
):
|
||||
is_root = False
|
||||
else:
|
||||
sibling = self.get_previous(sibling, tags=False)
|
||||
if is_root:
|
||||
sibling = self.get_next(el, tags=False)
|
||||
while is_root and sibling is not None:
|
||||
if (
|
||||
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||||
self.is_cdata(sibling)
|
||||
):
|
||||
is_root = False
|
||||
else:
|
||||
sibling = self.get_next(sibling, tags=False)
|
||||
return is_root
|
||||
return self.is_root(el)
|
||||
|
||||
def match_scope(self, el):
|
||||
"""Match element as scope."""
|
||||
|
|
@ -948,23 +881,12 @@ class _Match(object):
|
|||
content = None
|
||||
for contain_list in contains:
|
||||
if content is None:
|
||||
if contain_list.own:
|
||||
content = self.get_own_text(el, no_iframe=self.is_html)
|
||||
else:
|
||||
content = self.get_text(el, no_iframe=self.is_html)
|
||||
content = self.get_text(el, no_iframe=self.is_html)
|
||||
found = False
|
||||
for text in contain_list.text:
|
||||
if contain_list.own:
|
||||
for c in content:
|
||||
if text in c:
|
||||
found = True
|
||||
break
|
||||
if found:
|
||||
break
|
||||
else:
|
||||
if text in content:
|
||||
found = True
|
||||
break
|
||||
if text in content:
|
||||
found = True
|
||||
break
|
||||
if not found:
|
||||
match = False
|
||||
return match
|
||||
|
|
@ -1148,7 +1070,7 @@ class _Match(object):
|
|||
for patterns in langs:
|
||||
match = False
|
||||
for pattern in patterns:
|
||||
if self.extended_language_filter(pattern, found_lang):
|
||||
if pattern.match(found_lang):
|
||||
match = True
|
||||
if not match:
|
||||
break
|
||||
|
|
@ -1230,7 +1152,7 @@ class _Match(object):
|
|||
|
||||
out_of_range = False
|
||||
|
||||
itype = util.lower(self.get_attribute_by_name(el, 'type'))
|
||||
itype = self.get_attribute_by_name(el, 'type').lower()
|
||||
mn = self.get_attribute_by_name(el, 'min', None)
|
||||
if mn is not None:
|
||||
mn = Inputs.parse_value(itype, mn)
|
||||
|
|
@ -1285,21 +1207,6 @@ class _Match(object):
|
|||
self.get_prefix(el) is not None
|
||||
)
|
||||
|
||||
def match_placeholder_shown(self, el):
|
||||
"""
|
||||
Match placeholder shown according to HTML spec.
|
||||
|
||||
- text area should be checked if they have content. A single newline does not count as content.
|
||||
|
||||
"""
|
||||
|
||||
match = False
|
||||
content = self.get_text(el)
|
||||
if content in ('', '\n'):
|
||||
match = True
|
||||
|
||||
return match
|
||||
|
||||
def match_selectors(self, el, selectors):
|
||||
"""Check if element matches one of the selectors."""
|
||||
|
||||
|
|
@ -1332,9 +1239,6 @@ class _Match(object):
|
|||
# Verify element is scope
|
||||
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
|
||||
continue
|
||||
# Verify element has placeholder shown
|
||||
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
|
||||
continue
|
||||
# Verify `nth` matches
|
||||
if not self.match_nth(el, selector.nth):
|
||||
continue
|
||||
|
|
@ -1421,8 +1325,28 @@ class _Match(object):
|
|||
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
|
||||
|
||||
|
||||
class CSSMatch(_DocumentNav, _Match):
|
||||
"""The Beautiful Soup CSS match class."""
|
||||
class CommentsMatch(Document, object):
|
||||
"""Comments matcher."""
|
||||
|
||||
def __init__(self, el):
|
||||
"""Initialize."""
|
||||
|
||||
self.assert_valid_input(el)
|
||||
self.tag = el
|
||||
|
||||
def get_comments(self, limit=0):
|
||||
"""Get comments."""
|
||||
|
||||
if limit < 1:
|
||||
limit = None
|
||||
|
||||
for child in self.get_descendants(self.tag, tags=False):
|
||||
if self.is_comment(child):
|
||||
yield child
|
||||
if limit is not None:
|
||||
limit -= 1
|
||||
if limit < 1:
|
||||
break
|
||||
|
||||
|
||||
class SoupSieve(ct.Immutable):
|
||||
|
|
@ -1468,6 +1392,19 @@ class SoupSieve(ct.Immutable):
|
|||
else:
|
||||
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
|
||||
|
||||
@util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
|
||||
def comments(self, tag, limit=0):
|
||||
"""Get comments only."""
|
||||
|
||||
return [comment for comment in CommentsMatch(tag).get_comments(limit)]
|
||||
|
||||
@util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
|
||||
def icomments(self, tag, limit=0):
|
||||
"""Iterate comments only."""
|
||||
|
||||
for comment in CommentsMatch(tag).get_comments(limit):
|
||||
yield comment
|
||||
|
||||
def select_one(self, tag):
|
||||
"""Select a single tag."""
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
"""CSS selector parser."""
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from . import util
|
||||
from . import css_match as cm
|
||||
from . import css_types as ct
|
||||
from .util import SelectorSyntaxError
|
||||
import warnings
|
||||
|
||||
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||||
|
||||
|
|
@ -60,8 +59,6 @@ PSEUDO_SIMPLE_NO_MATCH = {
|
|||
# Complex pseudo classes that take selector lists
|
||||
PSEUDO_COMPLEX = {
|
||||
':contains',
|
||||
':-soup-contains',
|
||||
':-soup-contains-own',
|
||||
':has',
|
||||
':is',
|
||||
':matches',
|
||||
|
|
@ -113,6 +110,11 @@ VALUE = r'''
|
|||
ATTR = r'''
|
||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
||||
'''.format(ws=WSC, value=VALUE)
|
||||
# Definitions for quirks mode
|
||||
QUIRKS_ATTR_IDENTIFIER = r'(?:(?:{esc}|(?!/\*)[^"\] \t\r\n\f])+?)'.format(esc=CSS_ESCAPES)
|
||||
QUIRKS_ATTR = r'''
|
||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
||||
'''.format(ws=WSC, value=QUIRKS_ATTR_IDENTIFIER)
|
||||
|
||||
# Selector patterns
|
||||
# IDs (`#id`)
|
||||
|
|
@ -120,11 +122,13 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
|
|||
# Classes (`.class`)
|
||||
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
|
||||
# Prefix:Tag (`prefix|tag`)
|
||||
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
|
||||
PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER)
|
||||
# Attributes (`[attr]`, `[attr=value]`, etc.)
|
||||
PAT_ATTR = r'''
|
||||
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
|
||||
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
|
||||
PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
|
||||
# Quirks attributes, like real attributes, but unquoted values can contain anything but whitespace and closing `]`.
|
||||
PAT_QUIRKS_ATTR = r'''
|
||||
\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}
|
||||
'''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR)
|
||||
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
|
||||
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
|
||||
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
|
||||
|
|
@ -195,13 +199,12 @@ FLG_INDETERMINATE = 0x20
|
|||
FLG_OPEN = 0x40
|
||||
FLG_IN_RANGE = 0x80
|
||||
FLG_OUT_OF_RANGE = 0x100
|
||||
FLG_PLACEHOLDER_SHOWN = 0x200
|
||||
|
||||
# Maximum cached patterns to store
|
||||
_MAXCACHE = 500
|
||||
|
||||
|
||||
@lru_cache(maxsize=_MAXCACHE)
|
||||
@util.lru_cache(maxsize=_MAXCACHE)
|
||||
def _cached_css_compile(pattern, namespaces, custom, flags):
|
||||
"""Cached CSS compile."""
|
||||
|
||||
|
|
@ -250,7 +253,7 @@ def css_unescape(content, string=False):
|
|||
codepoint = int(m.group(1)[1:], 16)
|
||||
if codepoint == 0:
|
||||
codepoint = UNICODE_REPLACEMENT_CHAR
|
||||
value = chr(codepoint)
|
||||
value = util.uchr(codepoint)
|
||||
elif m.group(2):
|
||||
value = m.group(2)[1:]
|
||||
elif m.group(3):
|
||||
|
|
@ -274,7 +277,7 @@ def escape(ident):
|
|||
string.append('\\{}'.format(ident))
|
||||
else:
|
||||
for index, c in enumerate(ident):
|
||||
codepoint = ord(c)
|
||||
codepoint = util.uord(c)
|
||||
if codepoint == 0x00:
|
||||
string.append('\ufffd')
|
||||
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
|
||||
|
|
@ -305,7 +308,12 @@ class SelectorPattern(object):
|
|||
|
||||
return self.name
|
||||
|
||||
def match(self, selector, index, flags):
|
||||
def enabled(self, flags):
|
||||
"""Enabled."""
|
||||
|
||||
return True
|
||||
|
||||
def match(self, selector, index):
|
||||
"""Match the selector."""
|
||||
|
||||
return self.re_pattern.match(selector, index)
|
||||
|
|
@ -320,7 +328,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
|||
self.patterns = {}
|
||||
for p in patterns:
|
||||
name = p[0]
|
||||
pattern = p[3](name, p[2])
|
||||
pattern = SelectorPattern(name, p[2])
|
||||
for pseudo in p[1]:
|
||||
self.patterns[pseudo] = pattern
|
||||
|
||||
|
|
@ -332,7 +340,12 @@ class SpecialPseudoPattern(SelectorPattern):
|
|||
|
||||
return self.matched_name.get_name()
|
||||
|
||||
def match(self, selector, index, flags):
|
||||
def enabled(self, flags):
|
||||
"""Enabled."""
|
||||
|
||||
return True
|
||||
|
||||
def match(self, selector, index):
|
||||
"""Match the selector."""
|
||||
|
||||
pseudo = None
|
||||
|
|
@ -341,13 +354,22 @@ class SpecialPseudoPattern(SelectorPattern):
|
|||
name = util.lower(css_unescape(m.group('name')))
|
||||
pattern = self.patterns.get(name)
|
||||
if pattern:
|
||||
pseudo = pattern.match(selector, index, flags)
|
||||
pseudo = pattern.match(selector, index)
|
||||
if pseudo:
|
||||
self.matched_name = pattern
|
||||
|
||||
return pseudo
|
||||
|
||||
|
||||
class QuirkPattern(SelectorPattern):
|
||||
"""Selector pattern for quirk mode."""
|
||||
|
||||
def enabled(self, flags):
|
||||
"""Enabled if quirks flag is present."""
|
||||
|
||||
return flags & util._QUIRKS
|
||||
|
||||
|
||||
class _Selector(object):
|
||||
"""
|
||||
Intermediate selector class.
|
||||
|
|
@ -424,16 +446,11 @@ class CSSParser(object):
|
|||
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
|
||||
SpecialPseudoPattern(
|
||||
(
|
||||
(
|
||||
"pseudo_contains",
|
||||
(':contains', ':-soup-contains', ':-soup-contains-own'),
|
||||
PAT_PSEUDO_CONTAINS,
|
||||
SelectorPattern
|
||||
),
|
||||
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
|
||||
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
|
||||
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
|
||||
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
|
||||
("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
|
||||
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
|
||||
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
|
||||
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
|
||||
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
|
||||
)
|
||||
),
|
||||
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
|
||||
|
|
@ -444,6 +461,7 @@ class CSSParser(object):
|
|||
SelectorPattern("class", PAT_CLASS),
|
||||
SelectorPattern("tag", PAT_TAG),
|
||||
SelectorPattern("attribute", PAT_ATTR),
|
||||
QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR),
|
||||
SelectorPattern("combine", PAT_COMBINE)
|
||||
)
|
||||
|
||||
|
|
@ -453,19 +471,24 @@ class CSSParser(object):
|
|||
self.pattern = selector.replace('\x00', '\ufffd')
|
||||
self.flags = flags
|
||||
self.debug = self.flags & util.DEBUG
|
||||
self.quirks = self.flags & util._QUIRKS
|
||||
self.custom = {} if custom is None else custom
|
||||
|
||||
def parse_attribute_selector(self, sel, m, has_selector):
|
||||
def parse_attribute_selector(self, sel, m, has_selector, quirks):
|
||||
"""Create attribute selector from the returned regex match."""
|
||||
|
||||
inverse = False
|
||||
op = m.group('cmp')
|
||||
case = util.lower(m.group('case')) if m.group('case') else None
|
||||
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
|
||||
attr = css_unescape(m.group('attr_name'))
|
||||
parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
|
||||
ns = ''
|
||||
is_type = False
|
||||
pattern2 = None
|
||||
|
||||
if len(parts) > 1:
|
||||
ns = parts[0]
|
||||
attr = parts[1]
|
||||
else:
|
||||
attr = parts[0]
|
||||
if case:
|
||||
flags = re.I if case == 'i' else 0
|
||||
elif util.lower(attr) == 'type':
|
||||
|
|
@ -475,7 +498,7 @@ class CSSParser(object):
|
|||
flags = 0
|
||||
|
||||
if op:
|
||||
if m.group('value').startswith(('"', "'")):
|
||||
if m.group('value').startswith(('"', "'")) and not quirks:
|
||||
value = css_unescape(m.group('value')[1:-1], True)
|
||||
else:
|
||||
value = css_unescape(m.group('value'))
|
||||
|
|
@ -502,12 +525,13 @@ class CSSParser(object):
|
|||
elif op.startswith('|'):
|
||||
# Value starts with word in dash separated list
|
||||
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
||||
elif op.startswith('!'):
|
||||
# Equivalent to `:not([attr=value])`
|
||||
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
||||
inverse = True
|
||||
else:
|
||||
# Value matches
|
||||
pattern = re.compile(r'^%s$' % re.escape(value), flags)
|
||||
if op.startswith('!'):
|
||||
# Equivalent to `:not([attr=value])`
|
||||
inverse = True
|
||||
if is_type and pattern:
|
||||
pattern2 = re.compile(pattern.pattern)
|
||||
|
||||
|
|
@ -528,8 +552,13 @@ class CSSParser(object):
|
|||
def parse_tag_pattern(self, sel, m, has_selector):
|
||||
"""Parse tag pattern from regex match."""
|
||||
|
||||
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
|
||||
tag = css_unescape(m.group('tag_name'))
|
||||
parts = [css_unescape(x) for x in m.group(0).split('|')]
|
||||
if len(parts) > 1:
|
||||
prefix = parts[0]
|
||||
tag = parts[1]
|
||||
else:
|
||||
tag = parts[0]
|
||||
prefix = None
|
||||
sel.tag = ct.SelectorTag(tag, prefix)
|
||||
has_selector = True
|
||||
return has_selector
|
||||
|
|
@ -771,11 +800,21 @@ class CSSParser(object):
|
|||
if not combinator:
|
||||
combinator = WS_COMBINATOR
|
||||
if not has_selector:
|
||||
raise SelectorSyntaxError(
|
||||
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
|
||||
# The only way we don't fail is if we are at the root level and quirks mode is enabled,
|
||||
# and we've found no other selectors yet in this compound selector.
|
||||
if (not self.quirks or is_pseudo or combinator == COMMA_COMBINATOR or relations):
|
||||
raise SelectorSyntaxError(
|
||||
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
|
||||
self.pattern,
|
||||
index
|
||||
)
|
||||
util.warn_quirks(
|
||||
'You have attempted to use a combinator without a selector before it at position {}.'.format(index),
|
||||
'the :scope pseudo class (or another appropriate selector) should be placed before the combinator.',
|
||||
self.pattern,
|
||||
index
|
||||
)
|
||||
sel.flags |= ct.SEL_SCOPE
|
||||
|
||||
if combinator == COMMA_COMBINATOR:
|
||||
if not sel.tag and not is_pseudo:
|
||||
|
|
@ -808,14 +847,7 @@ class CSSParser(object):
|
|||
def parse_pseudo_contains(self, sel, m, has_selector):
|
||||
"""Parse contains."""
|
||||
|
||||
pseudo = util.lower(css_unescape(m.group('name')))
|
||||
if pseudo == ":contains":
|
||||
warnings.warn(
|
||||
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
|
||||
FutureWarning
|
||||
)
|
||||
contains_own = pseudo == ":-soup-contains-own"
|
||||
values = css_unescape(m.group('values'))
|
||||
values = m.group('values')
|
||||
patterns = []
|
||||
for token in RE_VALUES.finditer(values):
|
||||
if token.group('split'):
|
||||
|
|
@ -826,7 +858,7 @@ class CSSParser(object):
|
|||
else:
|
||||
value = css_unescape(value)
|
||||
patterns.append(value)
|
||||
sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own))
|
||||
sel.contains.append(ct.SelectorContains(tuple(patterns)))
|
||||
has_selector = True
|
||||
return has_selector
|
||||
|
||||
|
|
@ -840,12 +872,20 @@ class CSSParser(object):
|
|||
continue
|
||||
value = token.group('value')
|
||||
if value.startswith(('"', "'")):
|
||||
value = css_unescape(value[1:-1], True)
|
||||
parts = css_unescape(value[1:-1], True).split('-')
|
||||
else:
|
||||
value = css_unescape(value)
|
||||
|
||||
patterns.append(value)
|
||||
parts = css_unescape(value).split('-')
|
||||
|
||||
new_parts = []
|
||||
first = True
|
||||
for part in parts:
|
||||
if part == '*' and first:
|
||||
new_parts.append('(?!x\b)[a-z0-9]+?')
|
||||
elif part != '*':
|
||||
new_parts.append(('' if first else '(-(?!x\b)[a-z0-9]+)*?\\-') + re.escape(part))
|
||||
if first:
|
||||
first = False
|
||||
patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
|
||||
sel.lang.append(ct.SelectorLang(patterns))
|
||||
has_selector = True
|
||||
|
||||
|
|
@ -877,7 +917,6 @@ class CSSParser(object):
|
|||
is_indeterminate = bool(flags & FLG_INDETERMINATE)
|
||||
is_in_range = bool(flags & FLG_IN_RANGE)
|
||||
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
|
||||
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
|
||||
|
||||
if self.debug: # pragma: no cover
|
||||
if is_pseudo:
|
||||
|
|
@ -898,8 +937,6 @@ class CSSParser(object):
|
|||
print(' is_in_range: True')
|
||||
if is_out_of_range:
|
||||
print(' is_out_of_range: True')
|
||||
if is_placeholder_shown:
|
||||
print(' is_placeholder_shown: True')
|
||||
|
||||
if is_relative:
|
||||
selectors.append(_Selector())
|
||||
|
|
@ -916,7 +953,7 @@ class CSSParser(object):
|
|||
elif key == 'pseudo_class':
|
||||
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
|
||||
elif key == 'pseudo_element':
|
||||
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
|
||||
raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
|
||||
elif key == 'pseudo_contains':
|
||||
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
|
||||
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
|
||||
|
|
@ -952,8 +989,18 @@ class CSSParser(object):
|
|||
has_selector, sel = self.parse_combinator(
|
||||
sel, m, has_selector, selectors, relations, is_pseudo, index
|
||||
)
|
||||
elif key == 'attribute':
|
||||
has_selector = self.parse_attribute_selector(sel, m, has_selector)
|
||||
elif key in ('attribute', 'quirks_attribute'):
|
||||
quirks = key == 'quirks_attribute'
|
||||
if quirks:
|
||||
temp_index = index + m.group(0).find('=') + 1
|
||||
util.warn_quirks(
|
||||
"You have attempted to use an attribute " +
|
||||
"value that should have been quoted at position {}.".format(temp_index),
|
||||
"the attribute value should be quoted.",
|
||||
self.pattern,
|
||||
temp_index
|
||||
)
|
||||
has_selector = self.parse_attribute_selector(sel, m, has_selector, quirks)
|
||||
elif key == 'tag':
|
||||
if has_selector:
|
||||
raise SelectorSyntaxError(
|
||||
|
|
@ -1006,8 +1053,6 @@ class CSSParser(object):
|
|||
selectors[-1].flags = ct.SEL_IN_RANGE
|
||||
if is_out_of_range:
|
||||
selectors[-1].flags = ct.SEL_OUT_OF_RANGE
|
||||
if is_placeholder_shown:
|
||||
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
|
||||
|
||||
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
|
||||
|
||||
|
|
@ -1021,11 +1066,15 @@ class CSSParser(object):
|
|||
end = (m.start(0) - 1) if m else (len(pattern) - 1)
|
||||
|
||||
if self.debug: # pragma: no cover
|
||||
if self.quirks:
|
||||
print('## QUIRKS MODE: Throwing out the spec!')
|
||||
print('## PARSING: {!r}'.format(pattern))
|
||||
while index <= end:
|
||||
m = None
|
||||
for v in self.css_tokens:
|
||||
m = v.match(pattern, index, self.flags)
|
||||
if not v.enabled(self.flags): # pragma: no cover
|
||||
continue
|
||||
m = v.match(pattern, index)
|
||||
if m:
|
||||
name = v.get_name()
|
||||
if self.debug: # pragma: no cover
|
||||
|
|
@ -1053,7 +1102,13 @@ class CSSParser(object):
|
|||
print('## END PARSING')
|
||||
|
||||
def process_selectors(self, index=0, flags=0):
|
||||
"""Process selectors."""
|
||||
"""
|
||||
Process selectors.
|
||||
|
||||
We do our own selectors as BeautifulSoup4 has some annoying quirks,
|
||||
and we don't really need to do nth selectors or siblings or
|
||||
descendants etc.
|
||||
"""
|
||||
|
||||
return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
|
||||
|
||||
|
|
@ -1068,7 +1123,8 @@ CSS_LINK = CSSParser(
|
|||
# CSS pattern for `:checked`
|
||||
CSS_CHECKED = CSSParser(
|
||||
'''
|
||||
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
|
||||
html|*:is(input[type=checkbox], input[type=radio])[checked],
|
||||
html|select > html|option[selected]
|
||||
'''
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||
# CSS pattern for `:default` (must compile CSS_CHECKED first)
|
||||
|
|
@ -1094,23 +1150,23 @@ CSS_INDETERMINATE = CSSParser(
|
|||
This pattern must be at the end.
|
||||
Special logic is applied to the last selector.
|
||||
*/
|
||||
html|input[type="radio"][name]:not([name='']):not([checked])
|
||||
html|input[type="radio"][name][name!='']:not([checked])
|
||||
'''
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
|
||||
# CSS pattern for `:disabled`
|
||||
CSS_DISABLED = CSSParser(
|
||||
'''
|
||||
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
|
||||
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
|
||||
html|optgroup[disabled] > html|option,
|
||||
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
|
||||
html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
|
||||
html|fieldset[disabled] >
|
||||
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
|
||||
html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
|
||||
'''
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||
# CSS pattern for `:enabled`
|
||||
CSS_ENABLED = CSSParser(
|
||||
'''
|
||||
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
|
||||
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
|
||||
'''
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||
# CSS pattern for `:required`
|
||||
|
|
@ -1124,20 +1180,22 @@ CSS_OPTIONAL = CSSParser(
|
|||
# CSS pattern for `:placeholder-shown`
|
||||
CSS_PLACEHOLDER_SHOWN = CSSParser(
|
||||
'''
|
||||
html|input:is(
|
||||
:not([type]),
|
||||
[type=""],
|
||||
[type=text],
|
||||
[type=search],
|
||||
[type=url],
|
||||
[type=tel],
|
||||
[type=email],
|
||||
[type=password],
|
||||
[type=number]
|
||||
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
|
||||
html|textarea[placeholder]:not([placeholder=''])
|
||||
html|*:is(
|
||||
input:is(
|
||||
:not([type]),
|
||||
[type=""],
|
||||
[type=text],
|
||||
[type=search],
|
||||
[type=url],
|
||||
[type=tel],
|
||||
[type=email],
|
||||
[type=password],
|
||||
[type=number]
|
||||
),
|
||||
textarea
|
||||
)[placeholder][placeholder!='']
|
||||
'''
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
|
||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||
# CSS pattern default for `:nth-child` "of S" feature
|
||||
CSS_NTH_OF_S_DEFAULT = CSSParser(
|
||||
'*|*'
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
"""CSS selector structure items."""
|
||||
import copyreg
|
||||
from collections.abc import Hashable, Mapping
|
||||
from __future__ import unicode_literals
|
||||
from . import util
|
||||
|
||||
__all__ = (
|
||||
'Selector',
|
||||
|
|
@ -26,7 +26,6 @@ SEL_DIR_RTL = 0x40
|
|||
SEL_IN_RANGE = 0x80
|
||||
SEL_OUT_OF_RANGE = 0x100
|
||||
SEL_DEFINED = 0x200
|
||||
SEL_PLACEHOLDER_SHOWN = 0x400
|
||||
|
||||
|
||||
class Immutable(object):
|
||||
|
|
@ -86,7 +85,7 @@ class Immutable(object):
|
|||
__str__ = __repr__
|
||||
|
||||
|
||||
class ImmutableDict(Mapping):
|
||||
class ImmutableDict(util.Mapping):
|
||||
"""Hashable, immutable dictionary."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
|
@ -95,8 +94,8 @@ class ImmutableDict(Mapping):
|
|||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if (
|
||||
is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or
|
||||
not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg])
|
||||
is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
|
||||
not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
|
||||
):
|
||||
raise TypeError('All values must be hashable')
|
||||
|
||||
|
|
@ -141,9 +140,9 @@ class Namespaces(ImmutableDict):
|
|||
# so don't bother checking that.
|
||||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
|
||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
|
||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||
|
||||
super(Namespaces, self).__init__(*args, **kwargs)
|
||||
|
|
@ -160,9 +159,9 @@ class CustomSelectors(ImmutableDict):
|
|||
# so don't bother checking that.
|
||||
arg = args[0] if args else kwargs
|
||||
is_dict = isinstance(arg, dict)
|
||||
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
|
||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
|
||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||
|
||||
super(CustomSelectors, self).__init__(*args, **kwargs)
|
||||
|
|
@ -239,14 +238,13 @@ class SelectorAttribute(Immutable):
|
|||
class SelectorContains(Immutable):
|
||||
"""Selector contains rule."""
|
||||
|
||||
__slots__ = ("text", "own", "_hash")
|
||||
__slots__ = ("text", "_hash")
|
||||
|
||||
def __init__(self, text, own):
|
||||
def __init__(self, text):
|
||||
"""Initialize."""
|
||||
|
||||
super(SelectorContains, self).__init__(
|
||||
text=text,
|
||||
own=own
|
||||
text=text
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -333,7 +331,7 @@ def _pickle(p):
|
|||
def pickle_register(obj):
|
||||
"""Allow object to be pickled."""
|
||||
|
||||
copyreg.pickle(obj, _pickle)
|
||||
util.copyreg.pickle(obj, _pickle)
|
||||
|
||||
|
||||
pickle_register(Selector)
|
||||
|
|
|
|||
|
|
@ -1,17 +1,47 @@
|
|||
"""Utility."""
|
||||
from functools import wraps, lru_cache
|
||||
from __future__ import unicode_literals
|
||||
from functools import wraps
|
||||
import warnings
|
||||
import sys
|
||||
import struct
|
||||
import os
|
||||
import re
|
||||
MODULE = os.path.dirname(__file__)
|
||||
|
||||
PY3 = sys.version_info >= (3, 0)
|
||||
PY35 = sys.version_info >= (3, 5)
|
||||
PY37 = sys.version_info >= (3, 7)
|
||||
|
||||
if PY3:
|
||||
from functools import lru_cache # noqa F401
|
||||
import copyreg # noqa F401
|
||||
from collections.abc import Hashable, Mapping # noqa F401
|
||||
|
||||
ustr = str
|
||||
bstr = bytes
|
||||
unichar = chr
|
||||
string = str
|
||||
else:
|
||||
from backports.functools_lru_cache import lru_cache # noqa F401
|
||||
import copy_reg as copyreg # noqa F401
|
||||
from collections import Hashable, Mapping # noqa F401
|
||||
|
||||
ustr = unicode # noqa: F821
|
||||
bstr = str
|
||||
unichar = unichr # noqa: F821
|
||||
string = basestring # noqa: F821
|
||||
|
||||
DEBUG = 0x00001
|
||||
_QUIRKS = 0x10000
|
||||
|
||||
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
|
||||
|
||||
LC_A = ord('a')
|
||||
LC_Z = ord('z')
|
||||
UC_A = ord('A')
|
||||
UC_Z = ord('Z')
|
||||
|
||||
|
||||
@lru_cache(maxsize=512)
|
||||
def lower(string):
|
||||
"""Lower."""
|
||||
|
||||
|
|
@ -22,7 +52,38 @@ def lower(string):
|
|||
return ''.join(new_string)
|
||||
|
||||
|
||||
class SelectorSyntaxError(Exception):
|
||||
def upper(string):  # pragma: no cover
    """
    Upper.

    Return `string` with the ASCII letters `a`-`z` converted to `A`-`Z`.
    Every other character is copied through unchanged.
    """

    new_string = []
    for c in string:
        o = ord(c)
        # ASCII lowercase letters sit exactly 32 code points above their uppercase forms.
        new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
    return ''.join(new_string)
|
||||
|
||||
|
||||
def uchr(i):
    """Return the Unicode character for code point `i`, including on narrow Python builds."""

    try:
        char = unichar(i)
    except ValueError:  # pragma: no cover
        # `unichar` rejected the value: pack it as a native 32-bit integer and
        # decode those four bytes as UTF-32 instead.
        char = struct.pack('i', i).decode('utf-32')
    return char
|
||||
|
||||
|
||||
def uord(c):
    """
    Return the Unicode ordinal of `c`.

    A two-unit string is treated as a UTF-16 surrogate pair (high unit first)
    and combined into a single code point; anything else is handed to `ord`.
    """

    if len(c) == 2:  # pragma: no cover
        lead, trail = (ord(unit) for unit in c)
        return (lead - 0xD800) * 0x400 + trail - 0xDC00 + 0x10000

    return ord(c)
|
||||
|
||||
|
||||
class SelectorSyntaxError(SyntaxError):
|
||||
"""Syntax error in a CSS selector."""
|
||||
|
||||
def __init__(self, msg, pattern=None, index=None):
|
||||
|
|
@ -108,3 +169,45 @@ def get_pattern_context(pattern, index):
|
|||
last = m.end(0)
|
||||
|
||||
return ''.join(text), line, col
|
||||
|
||||
|
||||
class QuirksWarning(UserWarning):  # pragma: no cover
    """
    Warning for quirks mode.

    Category passed to `warnings.warn_explicit` by `warn_quirks`.
    """
|
||||
|
||||
|
||||
def warn_quirks(message, recommend, pattern, index):
    """
    Warn quirks.

    Emit a `QuirksWarning` describing a selector construct that is only
    tolerated in quirks mode.

    The warning is attributed to the stack entry that immediately precedes the
    first entry whose filename starts with this package's directory or with
    the `bs4` package directory.

    - `message`: description of what was wrong with the selector.
    - `recommend`: text appended after "In order to conform to the CSS spec, ".
    - `pattern`: the full selector pattern being parsed.
    - `index`: offset in `pattern` used to build the line context that is shown.
    """

    import traceback
    import bs4  # noqa: F401

    # Acquire source code line context
    paths = (MODULE, sys.modules['bs4'].__path__[0])
    tb = traceback.extract_stack()
    previous = None
    filename = None
    lineno = None
    for entry in tb:
        # Entries are read by attribute when `PY35` is set and by tuple index otherwise.
        if (PY35 and entry.filename.startswith(paths)) or (not PY35 and entry[0].startswith(paths)):
            break
        previous = entry
    # NOTE(review): if the very first entry already matches, `filename` and
    # `lineno` stay `None` when passed to `warn_explicit` -- confirm this cannot happen.
    if previous:
        filename = previous.filename if PY35 else previous[0]
        lineno = previous.lineno if PY35 else previous[1]

    # Format pattern to show line and column position
    context, line = get_pattern_context(pattern, index)[0:2]

    # Display warning
    warnings.warn_explicit(
        "\nCSS selector pattern:\n" +
        "    {}\n".format(message) +
        "    This behavior is only allowed temporarily for Beautiful Soup's transition to Soup Sieve.\n" +
        "    In order to conform to the CSS spec, {}\n".format(recommend) +
        "    It is strongly recommended the selector be altered to conform to the CSS spec " +
        "as an exception will be raised for this case in the future.\n" +
        "pattern line {}:\n{}".format(line, context),
        QuirksWarning,
        filename,
        lineno
    )
|
||||
|
|
|
|||
Loading…
Reference in a new issue