diff --git a/included_dependencies/soupsieve/__init__.py b/included_dependencies/soupsieve/__init__.py index 1005898a..49c478f7 100644 --- a/included_dependencies/soupsieve/__init__.py +++ b/included_dependencies/soupsieve/__init__.py @@ -25,16 +25,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from __future__ import unicode_literals from .__meta__ import __version__, __version_info__ # noqa: F401 from . import css_parser as cp from . import css_match as cm from . import css_types as ct -from .util import DEBUG, SelectorSyntaxError # noqa: F401 +from .util import DEBUG, _QUIRKS, deprecated, SelectorSyntaxError # noqa: F401 __all__ = ( - 'DEBUG', 'SelectorSyntaxError', 'SoupSieve', - 'closest', 'compile', 'filter', 'iselect', - 'match', 'select', 'select_one' + 'DEBUG', "_QUIRKS", 'SelectorSyntaxError', 'SoupSieve', + 'closest', 'comments', 'compile', 'filter', 'icomments', + 'iselect', 'match', 'select', 'select_one' ) SoupSieve = cm.SoupSieve @@ -86,6 +87,21 @@ def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001 return compile(select, namespaces, flags, **kwargs).filter(iterable) +@deprecated("'comments' is not related to CSS selectors and will be removed in the future.") +def comments(tag, limit=0, flags=0, **kwargs): + """Get comments only.""" + + return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)] + + +@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") +def icomments(tag, limit=0, flags=0, **kwargs): + """Iterate comments only.""" + + for comment in cm.CommentsMatch(tag).get_comments(limit): + yield comment + + def select_one(select, tag, namespaces=None, flags=0, **kwargs): """Select a single tag.""" diff --git a/included_dependencies/soupsieve/__meta__.py b/included_dependencies/soupsieve/__meta__.py index 90a3c747..18f2ea08 100644 --- 
a/included_dependencies/soupsieve/__meta__.py +++ b/included_dependencies/soupsieve/__meta__.py @@ -1,4 +1,5 @@ """Meta related things.""" +from __future__ import unicode_literals from collections import namedtuple import re @@ -185,5 +186,5 @@ def parse_version(ver, pre=False): return Version(major, minor, micro, release, pre, post, dev) -__version_info__ = Version(2, 1, 0, "final") +__version_info__ = Version(1, 9, 1, "final") __version__ = __version_info__._get_canonical() diff --git a/included_dependencies/soupsieve/css_match.py b/included_dependencies/soupsieve/css_match.py index 91aa30c2..632e07f9 100644 --- a/included_dependencies/soupsieve/css_match.py +++ b/included_dependencies/soupsieve/css_match.py @@ -1,12 +1,11 @@ """CSS matcher.""" +from __future__ import unicode_literals from datetime import datetime from . import util import re from .import css_types as ct import unicodedata -import bs4 - # Empty tag pattern (whitespace okay) RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') @@ -44,7 +43,6 @@ RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2} RE_DATETIME = re.compile( r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' ) -RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November FEB = 2 @@ -55,7 +53,7 @@ FEB_LEAP_MONTH = 29 DAYS_IN_WEEK = 7 -class _FakeParent(object): +class FakeParent(object): """ Fake parent class. 
@@ -75,7 +73,7 @@ class _FakeParent(object): return len(self.contents) -class _DocumentNav(object): +class Document(object): """Navigate a Beautiful Soup document.""" @classmethod @@ -89,37 +87,58 @@ class _DocumentNav(object): @staticmethod def is_doc(obj): """Is `BeautifulSoup` object.""" + + import bs4 return isinstance(obj, bs4.BeautifulSoup) @staticmethod def is_tag(obj): """Is tag.""" + + import bs4 return isinstance(obj, bs4.Tag) + @staticmethod + def is_comment(obj): + """Is comment.""" + + import bs4 + return isinstance(obj, bs4.Comment) + @staticmethod def is_declaration(obj): # pragma: no cover """Is declaration.""" + + import bs4 return isinstance(obj, bs4.Declaration) @staticmethod - def is_cdata(obj): + def is_cdata(obj): # pragma: no cover """Is CDATA.""" - return isinstance(obj, bs4.CData) + + import bs4 + return isinstance(obj, bs4.CData) @staticmethod def is_processing_instruction(obj): # pragma: no cover """Is processing instruction.""" + + import bs4 return isinstance(obj, bs4.ProcessingInstruction) @staticmethod def is_navigable_string(obj): """Is navigable string.""" + + import bs4 return isinstance(obj, bs4.NavigableString) @staticmethod def is_special_string(obj): """Is special string.""" - return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) + + import bs4 + return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction)) @classmethod def is_content_string(cls, obj): @@ -131,7 +150,7 @@ class _DocumentNav(object): def create_fake_parent(el): """Create fake parent for a given element.""" - return _FakeParent(el) + return FakeParent(el) @staticmethod def is_xml_tree(el): @@ -198,13 +217,10 @@ class _DocumentNav(object): is_tag = self.is_tag(child) if no_iframe and is_tag and self.is_iframe(child): - if child.next_sibling is not None: - next_good = child.next_sibling - else: - last_child = child - while self.is_tag(last_child) and last_child.contents: - 
last_child = last_child.contents[-1] - next_good = last_child.next_element + last_child = child + while self.is_tag(last_child) and last_child.contents: + last_child = last_child.contents[-1] + next_good = last_child.next_element yield child if next_good is None: break @@ -234,27 +250,21 @@ class _DocumentNav(object): return el.prefix - @staticmethod - def get_uri(el): - """Get namespace `URI`.""" - - return el.namespace - @classmethod - def get_next(cls, el, tags=True): + def get_next_tag(cls, el): """Get next sibling tag.""" sibling = el.next_sibling - while tags and not cls.is_tag(sibling) and sibling is not None: + while not cls.is_tag(sibling) and sibling is not None: sibling = sibling.next_sibling return sibling @classmethod - def get_previous(cls, el, tags=True): + def get_previous_tag(cls, el): """Get previous sibling tag.""" sibling = el.previous_sibling - while tags and not cls.is_tag(sibling) and sibling is not None: + while not cls.is_tag(sibling) and sibling is not None: sibling = sibling.previous_sibling return sibling @@ -305,7 +315,7 @@ class _DocumentNav(object): """Get classes.""" classes = cls.get_attribute_by_name(el, 'class', []) - if isinstance(classes, str): + if isinstance(classes, util.ustr): classes = RE_NOT_WS.findall(classes) return classes @@ -316,11 +326,6 @@ class _DocumentNav(object): [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] ) - def get_own_text(self, el, no_iframe=False): - """Get Own Text.""" - - return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] - class Inputs(object): """Class for parsing and validating input items.""" @@ -423,7 +428,7 @@ class Inputs(object): return parsed -class _Match(object): +class CSSMatch(Document, object): """Perform CSS matching.""" def __init__(self, selectors, scope, namespaces, flags): @@ -471,7 +476,7 @@ class _Match(object): if self.supports_namespaces(): namespace = '' - ns = 
self.get_uri(el) + ns = el.namespace if ns: namespace = ns else: @@ -531,57 +536,6 @@ class _Match(object): return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return None - def extended_language_filter(self, lang_range, lang_tag): - """Filter the language tags.""" - - match = True - lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() - ranges = lang_range.split('-') - subtags = lang_tag.lower().split('-') - length = len(ranges) - rindex = 0 - sindex = 0 - r = ranges[rindex] - s = subtags[sindex] - - # Primary tag needs to match - if r != '*' and r != s: - match = False - - rindex += 1 - sindex += 1 - - # Match until we run out of ranges - while match and rindex < length: - r = ranges[rindex] - try: - s = subtags[sindex] - except IndexError: - # Ran out of subtags, - # but we still have ranges - match = False - continue - - # Empty range - if not r: - match = False - continue - - # Matched range - elif s == r: - rindex += 1 - - # Implicit wildcard cannot match - # singletons - elif len(s) == 1: - match = False - continue - - # Implicitly matched, so grab next subtag - sindex += 1 - - return match - def match_attribute_name(self, el, attr, prefix): """Match attribute name and return value if it exists.""" @@ -706,12 +660,12 @@ class _Match(object): if parent: found = self.match_selectors(parent, relation) elif relation[0].rel_type == REL_SIBLING: - sibling = self.get_previous(el) + sibling = self.get_previous_tag(el) while not found and sibling: found = self.match_selectors(sibling, relation) - sibling = self.get_previous(sibling) + sibling = self.get_previous_tag(sibling) elif relation[0].rel_type == REL_CLOSE_SIBLING: - sibling = self.get_previous(el) + sibling = self.get_previous_tag(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found @@ -736,12 +690,12 @@ class _Match(object): elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: found = self.match_future_child(el, relation) elif relation[0].rel_type == 
REL_HAS_SIBLING: - sibling = self.get_next(el) + sibling = self.get_next_tag(el) while not found and sibling: found = self.match_selectors(sibling, relation) - sibling = self.get_next(sibling) + sibling = self.get_next_tag(sibling) elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: - sibling = self.get_next(el) + sibling = self.get_next_tag(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found @@ -782,28 +736,7 @@ class _Match(object): def match_root(self, el): """Match element as root.""" - is_root = self.is_root(el) - if is_root: - sibling = self.get_previous(el, tags=False) - while is_root and sibling is not None: - if ( - self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or - self.is_cdata(sibling) - ): - is_root = False - else: - sibling = self.get_previous(sibling, tags=False) - if is_root: - sibling = self.get_next(el, tags=False) - while is_root and sibling is not None: - if ( - self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or - self.is_cdata(sibling) - ): - is_root = False - else: - sibling = self.get_next(sibling, tags=False) - return is_root + return self.is_root(el) def match_scope(self, el): """Match element as scope.""" @@ -948,23 +881,12 @@ class _Match(object): content = None for contain_list in contains: if content is None: - if contain_list.own: - content = self.get_own_text(el, no_iframe=self.is_html) - else: - content = self.get_text(el, no_iframe=self.is_html) + content = self.get_text(el, no_iframe=self.is_html) found = False for text in contain_list.text: - if contain_list.own: - for c in content: - if text in c: - found = True - break - if found: - break - else: - if text in content: - found = True - break + if text in content: + found = True + break if not found: match = False return match @@ -1148,7 +1070,7 @@ class _Match(object): for patterns in langs: match = False for pattern in patterns: - if 
self.extended_language_filter(pattern, found_lang): + if pattern.match(found_lang): match = True if not match: break @@ -1230,7 +1152,7 @@ class _Match(object): out_of_range = False - itype = util.lower(self.get_attribute_by_name(el, 'type')) + itype = self.get_attribute_by_name(el, 'type').lower() mn = self.get_attribute_by_name(el, 'min', None) if mn is not None: mn = Inputs.parse_value(itype, mn) @@ -1285,21 +1207,6 @@ class _Match(object): self.get_prefix(el) is not None ) - def match_placeholder_shown(self, el): - """ - Match placeholder shown according to HTML spec. - - - text area should be checked if they have content. A single newline does not count as content. - - """ - - match = False - content = self.get_text(el) - if content in ('', '\n'): - match = True - - return match - def match_selectors(self, el, selectors): """Check if element matches one of the selectors.""" @@ -1332,9 +1239,6 @@ class _Match(object): # Verify element is scope if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): continue - # Verify element has placeholder shown - if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el): - continue # Verify `nth` matches if not self.match_nth(el, selector.nth): continue @@ -1421,8 +1325,28 @@ class _Match(object): return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) -class CSSMatch(_DocumentNav, _Match): - """The Beautiful Soup CSS match class.""" +class CommentsMatch(Document, object): + """Comments matcher.""" + + def __init__(self, el): + """Initialize.""" + + self.assert_valid_input(el) + self.tag = el + + def get_comments(self, limit=0): + """Get comments.""" + + if limit < 1: + limit = None + + for child in self.get_descendants(self.tag, tags=False): + if self.is_comment(child): + yield child + if limit is not None: + limit -= 1 + if limit < 1: + break class SoupSieve(ct.Immutable): @@ -1468,6 +1392,19 @@ class SoupSieve(ct.Immutable): else: return [node for node 
in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] + @util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.") + def comments(self, tag, limit=0): + """Get comments only.""" + + return [comment for comment in CommentsMatch(tag).get_comments(limit)] + + @util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.") + def icomments(self, tag, limit=0): + """Iterate comments only.""" + + for comment in CommentsMatch(tag).get_comments(limit): + yield comment + def select_one(self, tag): """Select a single tag.""" diff --git a/included_dependencies/soupsieve/css_parser.py b/included_dependencies/soupsieve/css_parser.py index 7755cb62..d8244565 100644 --- a/included_dependencies/soupsieve/css_parser.py +++ b/included_dependencies/soupsieve/css_parser.py @@ -1,11 +1,10 @@ """CSS selector parser.""" +from __future__ import unicode_literals import re -from functools import lru_cache from . import util from . import css_match as cm from . 
import css_types as ct from .util import SelectorSyntaxError -import warnings UNICODE_REPLACEMENT_CHAR = 0xFFFD @@ -60,8 +59,6 @@ PSEUDO_SIMPLE_NO_MATCH = { # Complex pseudo classes that take selector lists PSEUDO_COMPLEX = { ':contains', - ':-soup-contains', - ':-soup-contains-own', ':has', ':is', ':matches', @@ -113,6 +110,11 @@ VALUE = r''' ATTR = r''' (?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] '''.format(ws=WSC, value=VALUE) +# Definitions for quirks mode +QUIRKS_ATTR_IDENTIFIER = r'(?:(?:{esc}|(?!/\*)[^"\] \t\r\n\f])+?)'.format(esc=CSS_ESCAPES) +QUIRKS_ATTR = r''' +(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\] +'''.format(ws=WSC, value=QUIRKS_ATTR_IDENTIFIER) # Selector patterns # IDs (`#id`) @@ -120,11 +122,13 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER) # Classes (`.class`) PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER) # Prefix:Tag (`prefix|tag`) -PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER) +PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER) # Attributes (`[attr]`, `[attr=value]`, etc.) -PAT_ATTR = r''' -\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr} -'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR) +PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR) +# Quirks attributes, like real attributes, but unquoted values can contain anything but whitespace and closing `]`. +PAT_QUIRKS_ATTR = r''' +\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr} +'''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR) # Pseudo class (`:pseudo-class`, `:pseudo-class(`) PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER) # Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. 
@@ -195,13 +199,12 @@ FLG_INDETERMINATE = 0x20 FLG_OPEN = 0x40 FLG_IN_RANGE = 0x80 FLG_OUT_OF_RANGE = 0x100 -FLG_PLACEHOLDER_SHOWN = 0x200 # Maximum cached patterns to store _MAXCACHE = 500 -@lru_cache(maxsize=_MAXCACHE) +@util.lru_cache(maxsize=_MAXCACHE) def _cached_css_compile(pattern, namespaces, custom, flags): """Cached CSS compile.""" @@ -250,7 +253,7 @@ def css_unescape(content, string=False): codepoint = int(m.group(1)[1:], 16) if codepoint == 0: codepoint = UNICODE_REPLACEMENT_CHAR - value = chr(codepoint) + value = util.uchr(codepoint) elif m.group(2): value = m.group(2)[1:] elif m.group(3): @@ -274,7 +277,7 @@ def escape(ident): string.append('\\{}'.format(ident)) else: for index, c in enumerate(ident): - codepoint = ord(c) + codepoint = util.uord(c) if codepoint == 0x00: string.append('\ufffd') elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: @@ -305,7 +308,12 @@ class SelectorPattern(object): return self.name - def match(self, selector, index, flags): + def enabled(self, flags): + """Enabled.""" + + return True + + def match(self, selector, index): """Match the selector.""" return self.re_pattern.match(selector, index) @@ -320,7 +328,7 @@ class SpecialPseudoPattern(SelectorPattern): self.patterns = {} for p in patterns: name = p[0] - pattern = p[3](name, p[2]) + pattern = SelectorPattern(name, p[2]) for pseudo in p[1]: self.patterns[pseudo] = pattern @@ -332,7 +340,12 @@ class SpecialPseudoPattern(SelectorPattern): return self.matched_name.get_name() - def match(self, selector, index, flags): + def enabled(self, flags): + """Enabled.""" + + return True + + def match(self, selector, index): """Match the selector.""" pseudo = None @@ -341,13 +354,22 @@ class SpecialPseudoPattern(SelectorPattern): name = util.lower(css_unescape(m.group('name'))) pattern = self.patterns.get(name) if pattern: - pseudo = pattern.match(selector, index, flags) + pseudo = pattern.match(selector, index) if pseudo: self.matched_name = pattern return pseudo +class 
QuirkPattern(SelectorPattern): + """Selector pattern for quirk mode.""" + + def enabled(self, flags): + """Enabled if quirks flag is present.""" + + return flags & util._QUIRKS + + class _Selector(object): """ Intermediate selector class. @@ -424,16 +446,11 @@ class CSSParser(object): SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), SpecialPseudoPattern( ( - ( - "pseudo_contains", - (':contains', ':-soup-contains', ':-soup-contains-own'), - PAT_PSEUDO_CONTAINS, - SelectorPattern - ), - ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern), - ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern), - ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern), - ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern) + ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS), + ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD), + ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE), + ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG), + ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR) ) ), SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), @@ -444,6 +461,7 @@ class CSSParser(object): SelectorPattern("class", PAT_CLASS), SelectorPattern("tag", PAT_TAG), SelectorPattern("attribute", PAT_ATTR), + QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR), SelectorPattern("combine", PAT_COMBINE) ) @@ -453,19 +471,24 @@ class CSSParser(object): self.pattern = selector.replace('\x00', '\ufffd') self.flags = flags self.debug = self.flags & util.DEBUG + self.quirks = self.flags & util._QUIRKS self.custom = {} if custom is None else custom - def parse_attribute_selector(self, sel, m, has_selector): + def parse_attribute_selector(self, sel, m, has_selector, quirks): """Create attribute selector from the returned regex match.""" inverse = False op = m.group('cmp') case = util.lower(m.group('case')) if m.group('case') else None - ns = 
css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else '' - attr = css_unescape(m.group('attr_name')) + parts = [css_unescape(a) for a in m.group('ns_attr').split('|')] + ns = '' is_type = False pattern2 = None - + if len(parts) > 1: + ns = parts[0] + attr = parts[1] + else: + attr = parts[0] if case: flags = re.I if case == 'i' else 0 elif util.lower(attr) == 'type': @@ -475,7 +498,7 @@ class CSSParser(object): flags = 0 if op: - if m.group('value').startswith(('"', "'")): + if m.group('value').startswith(('"', "'")) and not quirks: value = css_unescape(m.group('value')[1:-1], True) else: value = css_unescape(m.group('value')) @@ -502,12 +525,13 @@ class CSSParser(object): elif op.startswith('|'): # Value starts with word in dash separated list pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags) + elif op.startswith('!'): + # Equivalent to `:not([attr=value])` + pattern = re.compile(r'^%s$' % re.escape(value), flags) + inverse = True else: # Value matches pattern = re.compile(r'^%s$' % re.escape(value), flags) - if op.startswith('!'): - # Equivalent to `:not([attr=value])` - inverse = True if is_type and pattern: pattern2 = re.compile(pattern.pattern) @@ -528,8 +552,13 @@ class CSSParser(object): def parse_tag_pattern(self, sel, m, has_selector): """Parse tag pattern from regex match.""" - prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None - tag = css_unescape(m.group('tag_name')) + parts = [css_unescape(x) for x in m.group(0).split('|')] + if len(parts) > 1: + prefix = parts[0] + tag = parts[1] + else: + tag = parts[0] + prefix = None sel.tag = ct.SelectorTag(tag, prefix) has_selector = True return has_selector @@ -771,11 +800,21 @@ class CSSParser(object): if not combinator: combinator = WS_COMBINATOR if not has_selector: - raise SelectorSyntaxError( - "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), + # The only way we don't fail is if we are at the root level 
and quirks mode is enabled, + # and we've found no other selectors yet in this compound selector. + if (not self.quirks or is_pseudo or combinator == COMMA_COMBINATOR or relations): + raise SelectorSyntaxError( + "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), + self.pattern, + index + ) + util.warn_quirks( + 'You have attempted to use a combinator without a selector before it at position {}.'.format(index), + 'the :scope pseudo class (or another appropriate selector) should be placed before the combinator.', self.pattern, index ) + sel.flags |= ct.SEL_SCOPE if combinator == COMMA_COMBINATOR: if not sel.tag and not is_pseudo: @@ -808,14 +847,7 @@ class CSSParser(object): def parse_pseudo_contains(self, sel, m, has_selector): """Parse contains.""" - pseudo = util.lower(css_unescape(m.group('name'))) - if pseudo == ":contains": - warnings.warn( - "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.", - FutureWarning - ) - contains_own = pseudo == ":-soup-contains-own" - values = css_unescape(m.group('values')) + values = m.group('values') patterns = [] for token in RE_VALUES.finditer(values): if token.group('split'): @@ -826,7 +858,7 @@ class CSSParser(object): else: value = css_unescape(value) patterns.append(value) - sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own)) + sel.contains.append(ct.SelectorContains(tuple(patterns))) has_selector = True return has_selector @@ -840,12 +872,20 @@ class CSSParser(object): continue value = token.group('value') if value.startswith(('"', "'")): - value = css_unescape(value[1:-1], True) + parts = css_unescape(value[1:-1], True).split('-') else: - value = css_unescape(value) - - patterns.append(value) + parts = css_unescape(value).split('-') + new_parts = [] + first = True + for part in parts: + if part == '*' and first: + new_parts.append('(?!x\b)[a-z0-9]+?') + elif part != '*': + new_parts.append(('' if first else 
'(-(?!x\b)[a-z0-9]+)*?\\-') + re.escape(part)) + if first: + first = False + patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I)) sel.lang.append(ct.SelectorLang(patterns)) has_selector = True @@ -877,7 +917,6 @@ class CSSParser(object): is_indeterminate = bool(flags & FLG_INDETERMINATE) is_in_range = bool(flags & FLG_IN_RANGE) is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) - is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) if self.debug: # pragma: no cover if is_pseudo: @@ -898,8 +937,6 @@ class CSSParser(object): print(' is_in_range: True') if is_out_of_range: print(' is_out_of_range: True') - if is_placeholder_shown: - print(' is_placeholder_shown: True') if is_relative: selectors.append(_Selector()) @@ -916,7 +953,7 @@ class CSSParser(object): elif key == 'pseudo_class': has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html) elif key == 'pseudo_element': - raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0))) + raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0))) elif key == 'pseudo_contains': has_selector = self.parse_pseudo_contains(sel, m, has_selector) elif key in ('pseudo_nth_type', 'pseudo_nth_child'): @@ -952,8 +989,18 @@ class CSSParser(object): has_selector, sel = self.parse_combinator( sel, m, has_selector, selectors, relations, is_pseudo, index ) - elif key == 'attribute': - has_selector = self.parse_attribute_selector(sel, m, has_selector) + elif key in ('attribute', 'quirks_attribute'): + quirks = key == 'quirks_attribute' + if quirks: + temp_index = index + m.group(0).find('=') + 1 + util.warn_quirks( + "You have attempted to use an attribute " + + "value that should have been quoted at position {}.".format(temp_index), + "the attribute value should be quoted.", + self.pattern, + temp_index + ) + has_selector = self.parse_attribute_selector(sel, m, has_selector, quirks) elif key == 'tag': if has_selector: raise 
SelectorSyntaxError( @@ -1006,8 +1053,6 @@ class CSSParser(object): selectors[-1].flags = ct.SEL_IN_RANGE if is_out_of_range: selectors[-1].flags = ct.SEL_OUT_OF_RANGE - if is_placeholder_shown: - selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) @@ -1021,11 +1066,15 @@ class CSSParser(object): end = (m.start(0) - 1) if m else (len(pattern) - 1) if self.debug: # pragma: no cover + if self.quirks: + print('## QUIRKS MODE: Throwing out the spec!') print('## PARSING: {!r}'.format(pattern)) while index <= end: m = None for v in self.css_tokens: - m = v.match(pattern, index, self.flags) + if not v.enabled(self.flags): # pragma: no cover + continue + m = v.match(pattern, index) if m: name = v.get_name() if self.debug: # pragma: no cover @@ -1053,7 +1102,13 @@ class CSSParser(object): print('## END PARSING') def process_selectors(self, index=0, flags=0): - """Process selectors.""" + """ + Process selectors. + + We do our own selectors as BeautifulSoup4 has some annoying quirks, + and we don't really need to do nth selectors or siblings or + descendants etc. + """ return self.parse_selectors(self.selector_iter(self.pattern), index, flags) @@ -1068,7 +1123,8 @@ CSS_LINK = CSSParser( # CSS pattern for `:checked` CSS_CHECKED = CSSParser( ''' - html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected] + html|*:is(input[type=checkbox], input[type=radio])[checked], + html|select > html|option[selected] ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:default` (must compile CSS_CHECKED first) @@ -1094,23 +1150,23 @@ CSS_INDETERMINATE = CSSParser( This pattern must be at the end. Special logic is applied to the last selector. 
*/ - html|input[type="radio"][name]:not([name='']):not([checked]) + html|input[type="radio"][name][name!='']:not([checked]) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) # CSS pattern for `:disabled` CSS_DISABLED = CSSParser( ''' - html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], + html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], html|optgroup[disabled] > html|option, - html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset), + html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset), html|fieldset[disabled] > - html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset) + html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:enabled` CSS_ENABLED = CSSParser( ''' - html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) + html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:required` @@ -1124,20 +1180,22 @@ CSS_OPTIONAL = CSSParser( # CSS pattern for `:placeholder-shown` CSS_PLACEHOLDER_SHOWN = CSSParser( ''' - html|input:is( - :not([type]), - [type=""], - [type=text], - [type=search], - [type=url], - [type=tel], - [type=email], - [type=password], - [type=number] - )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]), - html|textarea[placeholder]:not([placeholder='']) + html|*:is( + input:is( + :not([type]), + [type=""], + [type=text], + [type=search], + [type=url], + [type=tel], + [type=email], + [type=password], + [type=number] + ), + textarea + )[placeholder][placeholder!=''] ''' 
-).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) +).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern default for `:nth-child` "of S" feature CSS_NTH_OF_S_DEFAULT = CSSParser( '*|*' diff --git a/included_dependencies/soupsieve/css_types.py b/included_dependencies/soupsieve/css_types.py index 35361b1b..d426287a 100644 --- a/included_dependencies/soupsieve/css_types.py +++ b/included_dependencies/soupsieve/css_types.py @@ -1,6 +1,6 @@ """CSS selector structure items.""" -import copyreg -from collections.abc import Hashable, Mapping +from __future__ import unicode_literals +from . import util __all__ = ( 'Selector', @@ -26,7 +26,6 @@ SEL_DIR_RTL = 0x40 SEL_IN_RANGE = 0x80 SEL_OUT_OF_RANGE = 0x100 SEL_DEFINED = 0x200 -SEL_PLACEHOLDER_SHOWN = 0x400 class Immutable(object): @@ -86,7 +85,7 @@ class Immutable(object): __str__ = __repr__ -class ImmutableDict(Mapping): +class ImmutableDict(util.Mapping): """Hashable, immutable dictionary.""" def __init__(self, *args, **kwargs): @@ -95,8 +94,8 @@ class ImmutableDict(Mapping): arg = args[0] if args else kwargs is_dict = isinstance(arg, dict) if ( - is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or - not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg]) + is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or + not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg]) ): raise TypeError('All values must be hashable') @@ -141,9 +140,9 @@ class Namespaces(ImmutableDict): # so don't bother checking that. 
arg = args[0] if args else kwargs is_dict = isinstance(arg, dict) - if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]): + if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]): raise TypeError('Namespace keys and values must be Unicode strings') - elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]): + elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]): raise TypeError('Namespace keys and values must be Unicode strings') super(Namespaces, self).__init__(*args, **kwargs) @@ -160,9 +159,9 @@ class CustomSelectors(ImmutableDict): # so don't bother checking that. arg = args[0] if args else kwargs is_dict = isinstance(arg, dict) - if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]): + if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]): raise TypeError('CustomSelectors keys and values must be Unicode strings') - elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]): + elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]): raise TypeError('CustomSelectors keys and values must be Unicode strings') super(CustomSelectors, self).__init__(*args, **kwargs) @@ -239,14 +238,13 @@ class SelectorAttribute(Immutable): class SelectorContains(Immutable): """Selector contains rule.""" - __slots__ = ("text", "own", "_hash") + __slots__ = ("text", "_hash") - def __init__(self, text, own): + def __init__(self, text): """Initialize.""" super(SelectorContains, self).__init__( - text=text, - own=own + text=text ) @@ -333,7 +331,7 @@ def _pickle(p): def pickle_register(obj): """Allow object to be pickled.""" - copyreg.pickle(obj, _pickle) + util.copyreg.pickle(obj, _pickle) pickle_register(Selector) diff --git 
a/included_dependencies/soupsieve/util.py b/included_dependencies/soupsieve/util.py index 7f5d9f89..b899d3af 100644 --- a/included_dependencies/soupsieve/util.py +++ b/included_dependencies/soupsieve/util.py @@ -1,17 +1,47 @@ """Utility.""" -from functools import wraps, lru_cache +from __future__ import unicode_literals +from functools import wraps import warnings +import sys +import struct +import os import re +MODULE = os.path.dirname(__file__) + +PY3 = sys.version_info >= (3, 0) +PY35 = sys.version_info >= (3, 5) +PY37 = sys.version_info >= (3, 7) + +if PY3: + from functools import lru_cache # noqa F401 + import copyreg # noqa F401 + from collections.abc import Hashable, Mapping # noqa F401 + + ustr = str + bstr = bytes + unichar = chr + string = str +else: + from backports.functools_lru_cache import lru_cache # noqa F401 + import copy_reg as copyreg # noqa F401 + from collections import Hashable, Mapping # noqa F401 + + ustr = unicode # noqa: F821 + bstr = str + unichar = unichr # noqa: F821 + string = basestring # noqa: F821 DEBUG = 0x00001 +_QUIRKS = 0x10000 RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$') +LC_A = ord('a') +LC_Z = ord('z') UC_A = ord('A') UC_Z = ord('Z') -@lru_cache(maxsize=512) def lower(string): """Lower.""" @@ -22,7 +52,38 @@ def lower(string): return ''.join(new_string) -class SelectorSyntaxError(Exception): +def upper(string): # pragma: no cover + """Upper.""" + + new_string = [] + for c in string: + o = ord(c) + new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c) + return ''.join(new_string) + + +def uchr(i): + """Allow getting Unicode character on narrow python builds.""" + + try: + return unichar(i) + except ValueError: # pragma: no cover + return struct.pack('i', i).decode('utf-32') + + +def uord(c): + """Get Unicode ordinal.""" + + if len(c) == 2: # pragma: no cover + high, low = [ord(p) for p in c] + ordinal = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 + else: + ordinal = ord(c) + + return ordinal + + 
+class SelectorSyntaxError(SyntaxError): """Syntax error in a CSS selector.""" def __init__(self, msg, pattern=None, index=None): @@ -108,3 +169,45 @@ def get_pattern_context(pattern, index): last = m.end(0) return ''.join(text), line, col + + +class QuirksWarning(UserWarning): # pragma: no cover + """Warning for quirks mode.""" + + +def warn_quirks(message, recommend, pattern, index): + """Warn quirks.""" + + import traceback + import bs4 # noqa: F401 + + # Acquire source code line context + paths = (MODULE, sys.modules['bs4'].__path__[0]) + tb = traceback.extract_stack() + previous = None + filename = None + lineno = None + for entry in tb: + if (PY35 and entry.filename.startswith(paths)) or (not PY35 and entry[0].startswith(paths)): + break + previous = entry + if previous: + filename = previous.filename if PY35 else previous[0] + lineno = previous.lineno if PY35 else previous[1] + + # Format pattern to show line and column position + context, line = get_pattern_context(pattern, index)[0:2] + + # Display warning + warnings.warn_explicit( + "\nCSS selector pattern:\n" + + " {}\n".format(message) + + " This behavior is only allowed temporarily for Beautiful Soup's transition to Soup Sieve.\n" + + " In order to conform to the CSS spec, {}\n".format(recommend) + + " It is strongly recommended the selector be altered to conform to the CSS spec " + + "as an exception will be raised for this case in the future.\n" + + "pattern line {}:\n{}".format(line, context), + QuirksWarning, + filename, + lineno + )