mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Update old included_dependencies to current versions.
This commit is contained in:
parent
d33decd8f5
commit
7b951d7f4d
23 changed files with 33216 additions and 1655 deletions
|
|
@ -16,11 +16,14 @@
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
|
||||||
from .compat import PY2, PY3
|
|
||||||
from .universaldetector import UniversalDetector
|
from .universaldetector import UniversalDetector
|
||||||
|
from .enums import InputState
|
||||||
from .version import __version__, VERSION
|
from .version import __version__, VERSION
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
|
||||||
|
|
||||||
|
|
||||||
def detect(byte_str):
|
def detect(byte_str):
|
||||||
"""
|
"""
|
||||||
Detect the encoding of the given byte string.
|
Detect the encoding of the given byte string.
|
||||||
|
|
@ -31,9 +34,50 @@ def detect(byte_str):
|
||||||
if not isinstance(byte_str, bytearray):
|
if not isinstance(byte_str, bytearray):
|
||||||
if not isinstance(byte_str, bytes):
|
if not isinstance(byte_str, bytes):
|
||||||
raise TypeError('Expected object of type bytes or bytearray, got: '
|
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||||
'{0}'.format(type(byte_str)))
|
'{}'.format(type(byte_str)))
|
||||||
else:
|
else:
|
||||||
byte_str = bytearray(byte_str)
|
byte_str = bytearray(byte_str)
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector()
|
||||||
detector.feed(byte_str)
|
detector.feed(byte_str)
|
||||||
return detector.close()
|
return detector.close()
|
||||||
|
|
||||||
|
|
||||||
|
def detect_all(byte_str):
|
||||||
|
"""
|
||||||
|
Detect all the possible encodings of the given byte string.
|
||||||
|
|
||||||
|
:param byte_str: The byte sequence to examine.
|
||||||
|
:type byte_str: ``bytes`` or ``bytearray``
|
||||||
|
"""
|
||||||
|
if not isinstance(byte_str, bytearray):
|
||||||
|
if not isinstance(byte_str, bytes):
|
||||||
|
raise TypeError('Expected object of type bytes or bytearray, got: '
|
||||||
|
'{}'.format(type(byte_str)))
|
||||||
|
else:
|
||||||
|
byte_str = bytearray(byte_str)
|
||||||
|
|
||||||
|
detector = UniversalDetector()
|
||||||
|
detector.feed(byte_str)
|
||||||
|
detector.close()
|
||||||
|
|
||||||
|
if detector._input_state == InputState.HIGH_BYTE:
|
||||||
|
results = []
|
||||||
|
for prober in detector._charset_probers:
|
||||||
|
if prober.get_confidence() > detector.MINIMUM_THRESHOLD:
|
||||||
|
charset_name = prober.charset_name
|
||||||
|
lower_charset_name = prober.charset_name.lower()
|
||||||
|
# Use Windows encoding name instead of ISO-8859 if we saw any
|
||||||
|
# extra Windows-specific bytes
|
||||||
|
if lower_charset_name.startswith('iso-8859'):
|
||||||
|
if detector._has_win_bytes:
|
||||||
|
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
|
||||||
|
charset_name)
|
||||||
|
results.append({
|
||||||
|
'encoding': charset_name,
|
||||||
|
'confidence': prober.get_confidence(),
|
||||||
|
'language': prober.language,
|
||||||
|
})
|
||||||
|
if len(results) > 0:
|
||||||
|
return sorted(results, key=lambda result: -result['confidence'])
|
||||||
|
|
||||||
|
return [detector.result]
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,7 @@ class CharSetGroupProber(CharSetProber):
|
||||||
continue
|
continue
|
||||||
if state == ProbingState.FOUND_IT:
|
if state == ProbingState.FOUND_IT:
|
||||||
self._best_guess_prober = prober
|
self._best_guess_prober = prober
|
||||||
|
self._state = ProbingState.FOUND_IT
|
||||||
return self.state
|
return self.state
|
||||||
elif state == ProbingState.NOT_ME:
|
elif state == ProbingState.NOT_ME:
|
||||||
prober.active = False
|
prober.active = False
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,3 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
"""
|
"""
|
||||||
Script which takes one or more file paths and reports on their detected
|
Script which takes one or more file paths and reports on their detected
|
||||||
encodings
|
encodings
|
||||||
|
|
@ -45,10 +44,10 @@ def description_of(lines, name='stdin'):
|
||||||
if PY2:
|
if PY2:
|
||||||
name = name.decode(sys.getfilesystemencoding(), 'ignore')
|
name = name.decode(sys.getfilesystemencoding(), 'ignore')
|
||||||
if result['encoding']:
|
if result['encoding']:
|
||||||
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
|
return '{}: {} with confidence {}'.format(name, result['encoding'],
|
||||||
result['confidence'])
|
result['confidence'])
|
||||||
else:
|
else:
|
||||||
return '{0}: no result'.format(name)
|
return '{}: no result'.format(name)
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv=None):
|
||||||
|
|
@ -69,7 +68,7 @@ def main(argv=None):
|
||||||
type=argparse.FileType('rb'), nargs='*',
|
type=argparse.FileType('rb'), nargs='*',
|
||||||
default=[sys.stdin if PY2 else sys.stdin.buffer])
|
default=[sys.stdin if PY2 else sys.stdin.buffer])
|
||||||
parser.add_argument('--version', action='version',
|
parser.add_argument('--version', action='version',
|
||||||
version='%(prog)s {0}'.format(__version__))
|
version='%(prog)s {}'.format(__version__))
|
||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
for f in args.input:
|
for f in args.input:
|
||||||
|
|
|
||||||
|
|
@ -25,10 +25,12 @@ import sys
|
||||||
if sys.version_info < (3, 0):
|
if sys.version_info < (3, 0):
|
||||||
PY2 = True
|
PY2 = True
|
||||||
PY3 = False
|
PY3 = False
|
||||||
base_str = (str, unicode)
|
string_types = (str, unicode)
|
||||||
text_type = unicode
|
text_type = unicode
|
||||||
|
iteritems = dict.iteritems
|
||||||
else:
|
else:
|
||||||
PY2 = False
|
PY2 = False
|
||||||
PY3 = True
|
PY3 = True
|
||||||
base_str = (bytes, str)
|
string_types = (bytes, str)
|
||||||
text_type = str
|
text_type = str
|
||||||
|
iteritems = dict.items
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
5718
included_dependencies/chardet/langrussianmodel.py
Normal file
5718
included_dependencies/chardet/langrussianmodel.py
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
0
included_dependencies/chardet/metadata/__init__.py
Normal file
0
included_dependencies/chardet/metadata/__init__.py
Normal file
310
included_dependencies/chardet/metadata/languages.py
Normal file
310
included_dependencies/chardet/metadata/languages.py
Normal file
|
|
@ -0,0 +1,310 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Metadata about languages used by our model training code for our
|
||||||
|
SingleByteCharSetProbers. Could be used for other things in the future.
|
||||||
|
|
||||||
|
This code is based on the language metadata from the uchardet project.
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, print_function
|
||||||
|
|
||||||
|
from string import ascii_letters
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Add Ukranian (KOI8-U)
|
||||||
|
|
||||||
|
class Language(object):
|
||||||
|
"""Metadata about a language useful for training models
|
||||||
|
|
||||||
|
:ivar name: The human name for the language, in English.
|
||||||
|
:type name: str
|
||||||
|
:ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
|
||||||
|
or use another catalog as a last resort.
|
||||||
|
:type iso_code: str
|
||||||
|
:ivar use_ascii: Whether or not ASCII letters should be included in trained
|
||||||
|
models.
|
||||||
|
:type use_ascii: bool
|
||||||
|
:ivar charsets: The charsets we want to support and create data for.
|
||||||
|
:type charsets: list of str
|
||||||
|
:ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
|
||||||
|
`True`, you only need to add those not in the ASCII set.
|
||||||
|
:type alphabet: str
|
||||||
|
:ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
|
||||||
|
Wikipedia for training data.
|
||||||
|
:type wiki_start_pages: list of str
|
||||||
|
"""
|
||||||
|
def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
|
||||||
|
alphabet=None, wiki_start_pages=None):
|
||||||
|
super(Language, self).__init__()
|
||||||
|
self.name = name
|
||||||
|
self.iso_code = iso_code
|
||||||
|
self.use_ascii = use_ascii
|
||||||
|
self.charsets = charsets
|
||||||
|
if self.use_ascii:
|
||||||
|
if alphabet:
|
||||||
|
alphabet += ascii_letters
|
||||||
|
else:
|
||||||
|
alphabet = ascii_letters
|
||||||
|
elif not alphabet:
|
||||||
|
raise ValueError('Must supply alphabet if use_ascii is False')
|
||||||
|
self.alphabet = ''.join(sorted(set(alphabet))) if alphabet else None
|
||||||
|
self.wiki_start_pages = wiki_start_pages
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '{}({})'.format(self.__class__.__name__,
|
||||||
|
', '.join('{}={!r}'.format(k, v)
|
||||||
|
for k, v in self.__dict__.items()
|
||||||
|
if not k.startswith('_')))
|
||||||
|
|
||||||
|
|
||||||
|
LANGUAGES = {'Arabic': Language(name='Arabic',
|
||||||
|
iso_code='ar',
|
||||||
|
use_ascii=False,
|
||||||
|
# We only support encodings that use isolated
|
||||||
|
# forms, because the current recommendation is
|
||||||
|
# that the rendering system handles presentation
|
||||||
|
# forms. This means we purposefully skip IBM864.
|
||||||
|
charsets=['ISO-8859-6', 'WINDOWS-1256',
|
||||||
|
'CP720', 'CP864'],
|
||||||
|
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
|
||||||
|
wiki_start_pages=[u'الصفحة_الرئيسية']),
|
||||||
|
'Belarusian': Language(name='Belarusian',
|
||||||
|
iso_code='be',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||||
|
'IBM866', 'MacCyrillic'],
|
||||||
|
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
|
||||||
|
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
|
||||||
|
wiki_start_pages=[u'Галоўная_старонка']),
|
||||||
|
'Bulgarian': Language(name='Bulgarian',
|
||||||
|
iso_code='bg',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||||
|
'IBM855'],
|
||||||
|
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
|
||||||
|
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
|
||||||
|
wiki_start_pages=[u'Начална_страница']),
|
||||||
|
'Czech': Language(name='Czech',
|
||||||
|
iso_code='cz',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
|
||||||
|
wiki_start_pages=[u'Hlavní_strana']),
|
||||||
|
'Danish': Language(name='Danish',
|
||||||
|
iso_code='da',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'æøåÆØÅ',
|
||||||
|
wiki_start_pages=[u'Forside']),
|
||||||
|
'German': Language(name='German',
|
||||||
|
iso_code='de',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||||
|
alphabet=u'äöüßÄÖÜ',
|
||||||
|
wiki_start_pages=[u'Wikipedia:Hauptseite']),
|
||||||
|
'Greek': Language(name='Greek',
|
||||||
|
iso_code='el',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-7', 'WINDOWS-1253'],
|
||||||
|
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
|
||||||
|
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
|
||||||
|
wiki_start_pages=[u'Πύλη:Κύρια']),
|
||||||
|
'English': Language(name='English',
|
||||||
|
iso_code='en',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||||
|
wiki_start_pages=[u'Main_Page']),
|
||||||
|
'Esperanto': Language(name='Esperanto',
|
||||||
|
iso_code='eo',
|
||||||
|
# Q, W, X, and Y not used at all
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-3'],
|
||||||
|
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
|
||||||
|
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
|
||||||
|
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
|
||||||
|
'Spanish': Language(name='Spanish',
|
||||||
|
iso_code='es',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
|
||||||
|
wiki_start_pages=[u'Wikipedia:Portada']),
|
||||||
|
'Estonian': Language(name='Estonian',
|
||||||
|
iso_code='et',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-4', 'ISO-8859-13',
|
||||||
|
'WINDOWS-1257'],
|
||||||
|
# C, F, Š, Q, W, X, Y, Z, Ž are only for
|
||||||
|
# loanwords
|
||||||
|
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
|
||||||
|
u'abdeghijklmnoprstuvõäöü'),
|
||||||
|
wiki_start_pages=[u'Esileht']),
|
||||||
|
'Finnish': Language(name='Finnish',
|
||||||
|
iso_code='fi',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'ÅÄÖŠŽåäöšž',
|
||||||
|
wiki_start_pages=[u'Wikipedia:Etusivu']),
|
||||||
|
'French': Language(name='French',
|
||||||
|
iso_code='fr',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
|
||||||
|
wiki_start_pages=[u'Wikipédia:Accueil_principal',
|
||||||
|
u'Bœuf (animal)']),
|
||||||
|
'Hebrew': Language(name='Hebrew',
|
||||||
|
iso_code='he',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-8', 'WINDOWS-1255'],
|
||||||
|
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
|
||||||
|
wiki_start_pages=[u'עמוד_ראשי']),
|
||||||
|
'Croatian': Language(name='Croatian',
|
||||||
|
iso_code='hr',
|
||||||
|
# Q, W, X, Y are only used for foreign words.
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
|
||||||
|
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||||
|
wiki_start_pages=[u'Glavna_stranica']),
|
||||||
|
'Hungarian': Language(name='Hungarian',
|
||||||
|
iso_code='hu',
|
||||||
|
# Q, W, X, Y are only used for foreign words.
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
|
||||||
|
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
|
||||||
|
wiki_start_pages=[u'Kezdőlap']),
|
||||||
|
'Italian': Language(name='Italian',
|
||||||
|
iso_code='it',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
|
||||||
|
wiki_start_pages=[u'Pagina_principale']),
|
||||||
|
'Lithuanian': Language(name='Lithuanian',
|
||||||
|
iso_code='lt',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||||
|
'ISO-8859-4'],
|
||||||
|
# Q, W, and X not used at all
|
||||||
|
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
|
||||||
|
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
|
||||||
|
wiki_start_pages=[u'Pagrindinis_puslapis']),
|
||||||
|
'Latvian': Language(name='Latvian',
|
||||||
|
iso_code='lv',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-13', 'WINDOWS-1257',
|
||||||
|
'ISO-8859-4'],
|
||||||
|
# Q, W, X, Y are only for loanwords
|
||||||
|
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
|
||||||
|
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
|
||||||
|
wiki_start_pages=[u'Sākumlapa']),
|
||||||
|
'Macedonian': Language(name='Macedonian',
|
||||||
|
iso_code='mk',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||||
|
'MacCyrillic', 'IBM855'],
|
||||||
|
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
|
||||||
|
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
|
||||||
|
wiki_start_pages=[u'Главна_страница']),
|
||||||
|
'Dutch': Language(name='Dutch',
|
||||||
|
iso_code='nl',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'WINDOWS-1252'],
|
||||||
|
wiki_start_pages=[u'Hoofdpagina']),
|
||||||
|
'Polish': Language(name='Polish',
|
||||||
|
iso_code='pl',
|
||||||
|
# Q and X are only used for foreign words.
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
|
||||||
|
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
|
||||||
|
wiki_start_pages=[u'Wikipedia:Strona_główna']),
|
||||||
|
'Portuguese': Language(name='Portuguese',
|
||||||
|
iso_code='pt',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-1', 'ISO-8859-15',
|
||||||
|
'WINDOWS-1252'],
|
||||||
|
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
|
||||||
|
wiki_start_pages=[u'Wikipédia:Página_principal']),
|
||||||
|
'Romanian': Language(name='Romanian',
|
||||||
|
iso_code='ro',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=u'ăâîșțĂÂÎȘȚ',
|
||||||
|
wiki_start_pages=[u'Pagina_principală']),
|
||||||
|
'Russian': Language(name='Russian',
|
||||||
|
iso_code='ru',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||||
|
'KOI8-R', 'MacCyrillic', 'IBM866',
|
||||||
|
'IBM855'],
|
||||||
|
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
|
||||||
|
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
|
||||||
|
wiki_start_pages=[u'Заглавная_страница']),
|
||||||
|
'Slovak': Language(name='Slovak',
|
||||||
|
iso_code='sk',
|
||||||
|
use_ascii=True,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
|
||||||
|
wiki_start_pages=[u'Hlavná_stránka']),
|
||||||
|
'Slovene': Language(name='Slovene',
|
||||||
|
iso_code='sl',
|
||||||
|
# Q, W, X, Y are only used for foreign words.
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-2', 'WINDOWS-1250'],
|
||||||
|
alphabet=(u'abcčdefghijklmnoprsštuvzž'
|
||||||
|
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
|
||||||
|
wiki_start_pages=[u'Glavna_stran']),
|
||||||
|
# Serbian can be written in both Latin and Cyrillic, but there's no
|
||||||
|
# simple way to get the Latin alphabet pages from Wikipedia through
|
||||||
|
# the API, so for now we just support Cyrillic.
|
||||||
|
'Serbian': Language(name='Serbian',
|
||||||
|
iso_code='sr',
|
||||||
|
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
|
||||||
|
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
|
||||||
|
charsets=['ISO-8859-5', 'WINDOWS-1251',
|
||||||
|
'MacCyrillic', 'IBM855'],
|
||||||
|
wiki_start_pages=[u'Главна_страна']),
|
||||||
|
'Thai': Language(name='Thai',
|
||||||
|
iso_code='th',
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
|
||||||
|
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
|
||||||
|
wiki_start_pages=[u'หน้าหลัก']),
|
||||||
|
'Turkish': Language(name='Turkish',
|
||||||
|
iso_code='tr',
|
||||||
|
# Q, W, and X are not used by Turkish
|
||||||
|
use_ascii=False,
|
||||||
|
charsets=['ISO-8859-3', 'ISO-8859-9',
|
||||||
|
'WINDOWS-1254'],
|
||||||
|
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
|
||||||
|
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
|
||||||
|
wiki_start_pages=[u'Ana_Sayfa']),
|
||||||
|
'Vietnamese': Language(name='Vietnamese',
|
||||||
|
iso_code='vi',
|
||||||
|
use_ascii=False,
|
||||||
|
# Windows-1258 is the only common 8-bit
|
||||||
|
# Vietnamese encoding supported by Python.
|
||||||
|
# From Wikipedia:
|
||||||
|
# For systems that lack support for Unicode,
|
||||||
|
# dozens of 8-bit Vietnamese code pages are
|
||||||
|
# available.[1] The most common are VISCII
|
||||||
|
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
|
||||||
|
# Where ASCII is required, such as when
|
||||||
|
# ensuring readability in plain text e-mail,
|
||||||
|
# Vietnamese letters are often encoded
|
||||||
|
# according to Vietnamese Quoted-Readable
|
||||||
|
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
|
||||||
|
# though usage of either variable-width
|
||||||
|
# scheme has declined dramatically following
|
||||||
|
# the adoption of Unicode on the World Wide
|
||||||
|
# Web.
|
||||||
|
charsets=['WINDOWS-1258'],
|
||||||
|
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
|
||||||
|
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
|
||||||
|
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
|
||||||
|
}
|
||||||
|
|
@ -26,10 +26,22 @@
|
||||||
# 02110-1301 USA
|
# 02110-1301 USA
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
from .charsetprober import CharSetProber
|
from .charsetprober import CharSetProber
|
||||||
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
|
||||||
|
|
||||||
|
|
||||||
|
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
|
||||||
|
['charset_name',
|
||||||
|
'language',
|
||||||
|
'char_to_order_map',
|
||||||
|
'language_model',
|
||||||
|
'typical_positive_ratio',
|
||||||
|
'keep_ascii_letters',
|
||||||
|
'alphabet'])
|
||||||
|
|
||||||
|
|
||||||
class SingleByteCharSetProber(CharSetProber):
|
class SingleByteCharSetProber(CharSetProber):
|
||||||
SAMPLE_SIZE = 64
|
SAMPLE_SIZE = 64
|
||||||
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
|
||||||
|
|
@ -65,25 +77,25 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.charset_name
|
return self._name_prober.charset_name
|
||||||
else:
|
else:
|
||||||
return self._model['charset_name']
|
return self._model.charset_name
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language(self):
|
def language(self):
|
||||||
if self._name_prober:
|
if self._name_prober:
|
||||||
return self._name_prober.language
|
return self._name_prober.language
|
||||||
else:
|
else:
|
||||||
return self._model.get('language')
|
return self._model.language
|
||||||
|
|
||||||
def feed(self, byte_str):
|
def feed(self, byte_str):
|
||||||
if not self._model['keep_english_letter']:
|
# TODO: Make filter_international_words keep things in self.alphabet
|
||||||
|
if not self._model.keep_ascii_letters:
|
||||||
byte_str = self.filter_international_words(byte_str)
|
byte_str = self.filter_international_words(byte_str)
|
||||||
if not byte_str:
|
if not byte_str:
|
||||||
return self.state
|
return self.state
|
||||||
char_to_order_map = self._model['char_to_order_map']
|
char_to_order_map = self._model.char_to_order_map
|
||||||
for i, c in enumerate(byte_str):
|
language_model = self._model.language_model
|
||||||
# XXX: Order is in range 1-64, so one would think we want 0-63 here,
|
for char in byte_str:
|
||||||
# but that leads to 27 more test failures than before.
|
order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
|
||||||
order = char_to_order_map[c]
|
|
||||||
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
|
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
|
||||||
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
|
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
|
||||||
# to make it closer to the original intent. The only difference
|
# to make it closer to the original intent. The only difference
|
||||||
|
|
@ -91,20 +103,21 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
# _total_char purposes.
|
# _total_char purposes.
|
||||||
if order < CharacterCategory.CONTROL:
|
if order < CharacterCategory.CONTROL:
|
||||||
self._total_char += 1
|
self._total_char += 1
|
||||||
|
# TODO: Follow uchardet's lead and discount confidence for frequent
|
||||||
|
# control characters.
|
||||||
|
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
|
||||||
if order < self.SAMPLE_SIZE:
|
if order < self.SAMPLE_SIZE:
|
||||||
self._freq_char += 1
|
self._freq_char += 1
|
||||||
if self._last_order < self.SAMPLE_SIZE:
|
if self._last_order < self.SAMPLE_SIZE:
|
||||||
self._total_seqs += 1
|
self._total_seqs += 1
|
||||||
if not self._reversed:
|
if not self._reversed:
|
||||||
i = (self._last_order * self.SAMPLE_SIZE) + order
|
lm_cat = language_model[self._last_order][order]
|
||||||
model = self._model['precedence_matrix'][i]
|
else:
|
||||||
else: # reverse the order of the letters in the lookup
|
lm_cat = language_model[order][self._last_order]
|
||||||
i = (order * self.SAMPLE_SIZE) + self._last_order
|
self._seq_counters[lm_cat] += 1
|
||||||
model = self._model['precedence_matrix'][i]
|
|
||||||
self._seq_counters[model] += 1
|
|
||||||
self._last_order = order
|
self._last_order = order
|
||||||
|
|
||||||
charset_name = self._model['charset_name']
|
charset_name = self._model.charset_name
|
||||||
if self.state == ProbingState.DETECTING:
|
if self.state == ProbingState.DETECTING:
|
||||||
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
|
||||||
confidence = self.get_confidence()
|
confidence = self.get_confidence()
|
||||||
|
|
@ -125,7 +138,7 @@ class SingleByteCharSetProber(CharSetProber):
|
||||||
r = 0.01
|
r = 0.01
|
||||||
if self._total_seqs > 0:
|
if self._total_seqs > 0:
|
||||||
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
|
||||||
self._total_seqs / self._model['typical_positive_ratio'])
|
self._total_seqs / self._model.typical_positive_ratio)
|
||||||
r = r * self._freq_char / self._total_char
|
r = r * self._freq_char / self._total_char
|
||||||
if r >= 1.0:
|
if r >= 1.0:
|
||||||
r = 0.99
|
r = 0.99
|
||||||
|
|
|
||||||
|
|
@ -27,47 +27,57 @@
|
||||||
######################### END LICENSE BLOCK #########################
|
######################### END LICENSE BLOCK #########################
|
||||||
|
|
||||||
from .charsetgroupprober import CharSetGroupProber
|
from .charsetgroupprober import CharSetGroupProber
|
||||||
from .sbcharsetprober import SingleByteCharSetProber
|
|
||||||
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
|
|
||||||
Latin5CyrillicModel, MacCyrillicModel,
|
|
||||||
Ibm866Model, Ibm855Model)
|
|
||||||
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
|
|
||||||
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
|
|
||||||
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
|
|
||||||
from .langthaimodel import TIS620ThaiModel
|
|
||||||
from .langhebrewmodel import Win1255HebrewModel
|
|
||||||
from .hebrewprober import HebrewProber
|
from .hebrewprober import HebrewProber
|
||||||
from .langturkishmodel import Latin5TurkishModel
|
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
|
||||||
|
WINDOWS_1251_BULGARIAN_MODEL)
|
||||||
|
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
|
||||||
|
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
|
||||||
|
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
|
||||||
|
# WINDOWS_1250_HUNGARIAN_MODEL)
|
||||||
|
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
|
||||||
|
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
|
||||||
|
MACCYRILLIC_RUSSIAN_MODEL,
|
||||||
|
WINDOWS_1251_RUSSIAN_MODEL)
|
||||||
|
from .langthaimodel import TIS_620_THAI_MODEL
|
||||||
|
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
|
||||||
|
from .sbcharsetprober import SingleByteCharSetProber
|
||||||
|
|
||||||
|
|
||||||
class SBCSGroupProber(CharSetGroupProber):
|
class SBCSGroupProber(CharSetGroupProber):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(SBCSGroupProber, self).__init__()
|
super(SBCSGroupProber, self).__init__()
|
||||||
|
hebrew_prober = HebrewProber()
|
||||||
|
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||||
|
False, hebrew_prober)
|
||||||
|
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
|
||||||
|
# it's actually the visual one
|
||||||
|
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
|
||||||
|
True, hebrew_prober)
|
||||||
|
hebrew_prober.set_model_probers(logical_hebrew_prober,
|
||||||
|
visual_hebrew_prober)
|
||||||
|
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
|
||||||
|
# and several tests failed that did not before. Some thought
|
||||||
|
# should be put into the ordering, and we should consider making
|
||||||
|
# order not matter here, because that is very counter-intuitive.
|
||||||
self.probers = [
|
self.probers = [
|
||||||
SingleByteCharSetProber(Win1251CyrillicModel),
|
SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(Koi8rModel),
|
SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(Latin5CyrillicModel),
|
SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(MacCyrillicModel),
|
SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(Ibm866Model),
|
SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(Ibm855Model),
|
SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
|
||||||
SingleByteCharSetProber(Latin7GreekModel),
|
SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
|
||||||
SingleByteCharSetProber(Win1253GreekModel),
|
SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
|
||||||
SingleByteCharSetProber(Latin5BulgarianModel),
|
SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
|
||||||
SingleByteCharSetProber(Win1251BulgarianModel),
|
SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
|
||||||
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
|
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
|
||||||
# after we retrain model.
|
# after we retrain model.
|
||||||
# SingleByteCharSetProber(Latin2HungarianModel),
|
# SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
|
||||||
# SingleByteCharSetProber(Win1250HungarianModel),
|
# SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
|
||||||
SingleByteCharSetProber(TIS620ThaiModel),
|
SingleByteCharSetProber(TIS_620_THAI_MODEL),
|
||||||
SingleByteCharSetProber(Latin5TurkishModel),
|
SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
|
||||||
|
hebrew_prober,
|
||||||
|
logical_hebrew_prober,
|
||||||
|
visual_hebrew_prober,
|
||||||
]
|
]
|
||||||
hebrew_prober = HebrewProber()
|
|
||||||
logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel,
|
|
||||||
False, hebrew_prober)
|
|
||||||
visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True,
|
|
||||||
hebrew_prober)
|
|
||||||
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
|
|
||||||
self.probers.extend([hebrew_prober, logical_hebrew_prober,
|
|
||||||
visual_hebrew_prober])
|
|
||||||
|
|
||||||
self.reset()
|
self.reset()
|
||||||
|
|
|
||||||
|
|
@ -266,7 +266,7 @@ class UniversalDetector(object):
|
||||||
'language': max_prober.language}
|
'language': max_prober.language}
|
||||||
|
|
||||||
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
# Log all prober confidences if none met MINIMUM_THRESHOLD
|
||||||
if self.logger.getEffectiveLevel() == logging.DEBUG:
|
if self.logger.getEffectiveLevel() <= logging.DEBUG:
|
||||||
if self.result['encoding'] is None:
|
if self.result['encoding'] is None:
|
||||||
self.logger.debug('no probers hit minimum threshold')
|
self.logger.debug('no probers hit minimum threshold')
|
||||||
for group_prober in self._charset_probers:
|
for group_prober in self._charset_probers:
|
||||||
|
|
@ -280,7 +280,7 @@ class UniversalDetector(object):
|
||||||
prober.get_confidence())
|
prober.get_confidence())
|
||||||
else:
|
else:
|
||||||
self.logger.debug('%s %s confidence = %s',
|
self.logger.debug('%s %s confidence = %s',
|
||||||
prober.charset_name,
|
group_prober.charset_name,
|
||||||
prober.language,
|
group_prober.language,
|
||||||
prober.get_confidence())
|
group_prober.get_confidence())
|
||||||
return self.result
|
return self.result
|
||||||
|
|
|
||||||
|
|
@ -5,5 +5,5 @@ from within setup.py and from chardet subpackages.
|
||||||
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
:author: Dan Blanchard (dan.blanchard@gmail.com)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__version__ = "3.0.4"
|
__version__ = "4.0.0"
|
||||||
VERSION = __version__.split('.')
|
VERSION = __version__.split('.')
|
||||||
|
|
|
||||||
|
|
@ -25,17 +25,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
|
||||||
from .__meta__ import __version__, __version_info__ # noqa: F401
|
from .__meta__ import __version__, __version_info__ # noqa: F401
|
||||||
from . import css_parser as cp
|
from . import css_parser as cp
|
||||||
from . import css_match as cm
|
from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import DEBUG, _QUIRKS, deprecated, SelectorSyntaxError # noqa: F401
|
from .util import DEBUG, SelectorSyntaxError # noqa: F401
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'DEBUG', "_QUIRKS", 'SelectorSyntaxError', 'SoupSieve',
|
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
|
||||||
'closest', 'comments', 'compile', 'filter', 'icomments',
|
'closest', 'compile', 'filter', 'iselect',
|
||||||
'iselect', 'match', 'select', 'select_one'
|
'match', 'select', 'select_one'
|
||||||
)
|
)
|
||||||
|
|
||||||
SoupSieve = cm.SoupSieve
|
SoupSieve = cm.SoupSieve
|
||||||
|
|
@ -87,21 +86,6 @@ def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
|
||||||
return compile(select, namespaces, flags, **kwargs).filter(iterable)
|
return compile(select, namespaces, flags, **kwargs).filter(iterable)
|
||||||
|
|
||||||
|
|
||||||
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
|
|
||||||
def comments(tag, limit=0, flags=0, **kwargs):
|
|
||||||
"""Get comments only."""
|
|
||||||
|
|
||||||
return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)]
|
|
||||||
|
|
||||||
|
|
||||||
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
|
|
||||||
def icomments(tag, limit=0, flags=0, **kwargs):
|
|
||||||
"""Iterate comments only."""
|
|
||||||
|
|
||||||
for comment in cm.CommentsMatch(tag).get_comments(limit):
|
|
||||||
yield comment
|
|
||||||
|
|
||||||
|
|
||||||
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
|
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
|
||||||
"""Select a single tag."""
|
"""Select a single tag."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,4 @@
|
||||||
"""Meta related things."""
|
"""Meta related things."""
|
||||||
from __future__ import unicode_literals
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
@ -186,5 +185,5 @@ def parse_version(ver, pre=False):
|
||||||
return Version(major, minor, micro, release, pre, post, dev)
|
return Version(major, minor, micro, release, pre, post, dev)
|
||||||
|
|
||||||
|
|
||||||
__version_info__ = Version(1, 9, 1, "final")
|
__version_info__ = Version(2, 1, 0, "final")
|
||||||
__version__ = __version_info__._get_canonical()
|
__version__ = __version_info__._get_canonical()
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,12 @@
|
||||||
"""CSS matcher."""
|
"""CSS matcher."""
|
||||||
from __future__ import unicode_literals
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from . import util
|
from . import util
|
||||||
import re
|
import re
|
||||||
from .import css_types as ct
|
from .import css_types as ct
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
|
||||||
|
import bs4
|
||||||
|
|
||||||
# Empty tag pattern (whitespace okay)
|
# Empty tag pattern (whitespace okay)
|
||||||
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||||||
|
|
||||||
|
|
@ -43,6 +44,7 @@ RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2}
|
||||||
RE_DATETIME = re.compile(
|
RE_DATETIME = re.compile(
|
||||||
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
|
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
|
||||||
)
|
)
|
||||||
|
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
|
||||||
|
|
||||||
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
|
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
|
||||||
FEB = 2
|
FEB = 2
|
||||||
|
|
@ -53,7 +55,7 @@ FEB_LEAP_MONTH = 29
|
||||||
DAYS_IN_WEEK = 7
|
DAYS_IN_WEEK = 7
|
||||||
|
|
||||||
|
|
||||||
class FakeParent(object):
|
class _FakeParent(object):
|
||||||
"""
|
"""
|
||||||
Fake parent class.
|
Fake parent class.
|
||||||
|
|
||||||
|
|
@ -73,7 +75,7 @@ class FakeParent(object):
|
||||||
return len(self.contents)
|
return len(self.contents)
|
||||||
|
|
||||||
|
|
||||||
class Document(object):
|
class _DocumentNav(object):
|
||||||
"""Navigate a Beautiful Soup document."""
|
"""Navigate a Beautiful Soup document."""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -87,58 +89,37 @@ class Document(object):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_doc(obj):
|
def is_doc(obj):
|
||||||
"""Is `BeautifulSoup` object."""
|
"""Is `BeautifulSoup` object."""
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.BeautifulSoup)
|
return isinstance(obj, bs4.BeautifulSoup)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_tag(obj):
|
def is_tag(obj):
|
||||||
"""Is tag."""
|
"""Is tag."""
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.Tag)
|
return isinstance(obj, bs4.Tag)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def is_comment(obj):
|
|
||||||
"""Is comment."""
|
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.Comment)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_declaration(obj): # pragma: no cover
|
def is_declaration(obj): # pragma: no cover
|
||||||
"""Is declaration."""
|
"""Is declaration."""
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.Declaration)
|
return isinstance(obj, bs4.Declaration)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_cdata(obj): # pragma: no cover
|
def is_cdata(obj):
|
||||||
"""Is CDATA."""
|
"""Is CDATA."""
|
||||||
|
return isinstance(obj, bs4.CData)
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.Declaration)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_processing_instruction(obj): # pragma: no cover
|
def is_processing_instruction(obj): # pragma: no cover
|
||||||
"""Is processing instruction."""
|
"""Is processing instruction."""
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.ProcessingInstruction)
|
return isinstance(obj, bs4.ProcessingInstruction)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_navigable_string(obj):
|
def is_navigable_string(obj):
|
||||||
"""Is navigable string."""
|
"""Is navigable string."""
|
||||||
|
|
||||||
import bs4
|
|
||||||
return isinstance(obj, bs4.NavigableString)
|
return isinstance(obj, bs4.NavigableString)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_special_string(obj):
|
def is_special_string(obj):
|
||||||
"""Is special string."""
|
"""Is special string."""
|
||||||
|
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
|
||||||
import bs4
|
|
||||||
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction))
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_content_string(cls, obj):
|
def is_content_string(cls, obj):
|
||||||
|
|
@ -150,7 +131,7 @@ class Document(object):
|
||||||
def create_fake_parent(el):
|
def create_fake_parent(el):
|
||||||
"""Create fake parent for a given element."""
|
"""Create fake parent for a given element."""
|
||||||
|
|
||||||
return FakeParent(el)
|
return _FakeParent(el)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_xml_tree(el):
|
def is_xml_tree(el):
|
||||||
|
|
@ -217,10 +198,13 @@ class Document(object):
|
||||||
is_tag = self.is_tag(child)
|
is_tag = self.is_tag(child)
|
||||||
|
|
||||||
if no_iframe and is_tag and self.is_iframe(child):
|
if no_iframe and is_tag and self.is_iframe(child):
|
||||||
last_child = child
|
if child.next_sibling is not None:
|
||||||
while self.is_tag(last_child) and last_child.contents:
|
next_good = child.next_sibling
|
||||||
last_child = last_child.contents[-1]
|
else:
|
||||||
next_good = last_child.next_element
|
last_child = child
|
||||||
|
while self.is_tag(last_child) and last_child.contents:
|
||||||
|
last_child = last_child.contents[-1]
|
||||||
|
next_good = last_child.next_element
|
||||||
yield child
|
yield child
|
||||||
if next_good is None:
|
if next_good is None:
|
||||||
break
|
break
|
||||||
|
|
@ -250,21 +234,27 @@ class Document(object):
|
||||||
|
|
||||||
return el.prefix
|
return el.prefix
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_uri(el):
|
||||||
|
"""Get namespace `URI`."""
|
||||||
|
|
||||||
|
return el.namespace
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_next_tag(cls, el):
|
def get_next(cls, el, tags=True):
|
||||||
"""Get next sibling tag."""
|
"""Get next sibling tag."""
|
||||||
|
|
||||||
sibling = el.next_sibling
|
sibling = el.next_sibling
|
||||||
while not cls.is_tag(sibling) and sibling is not None:
|
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||||||
sibling = sibling.next_sibling
|
sibling = sibling.next_sibling
|
||||||
return sibling
|
return sibling
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_previous_tag(cls, el):
|
def get_previous(cls, el, tags=True):
|
||||||
"""Get previous sibling tag."""
|
"""Get previous sibling tag."""
|
||||||
|
|
||||||
sibling = el.previous_sibling
|
sibling = el.previous_sibling
|
||||||
while not cls.is_tag(sibling) and sibling is not None:
|
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||||||
sibling = sibling.previous_sibling
|
sibling = sibling.previous_sibling
|
||||||
return sibling
|
return sibling
|
||||||
|
|
||||||
|
|
@ -315,7 +305,7 @@ class Document(object):
|
||||||
"""Get classes."""
|
"""Get classes."""
|
||||||
|
|
||||||
classes = cls.get_attribute_by_name(el, 'class', [])
|
classes = cls.get_attribute_by_name(el, 'class', [])
|
||||||
if isinstance(classes, util.ustr):
|
if isinstance(classes, str):
|
||||||
classes = RE_NOT_WS.findall(classes)
|
classes = RE_NOT_WS.findall(classes)
|
||||||
return classes
|
return classes
|
||||||
|
|
||||||
|
|
@ -326,6 +316,11 @@ class Document(object):
|
||||||
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
|
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_own_text(self, el, no_iframe=False):
|
||||||
|
"""Get Own Text."""
|
||||||
|
|
||||||
|
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
|
||||||
|
|
||||||
|
|
||||||
class Inputs(object):
|
class Inputs(object):
|
||||||
"""Class for parsing and validating input items."""
|
"""Class for parsing and validating input items."""
|
||||||
|
|
@ -428,7 +423,7 @@ class Inputs(object):
|
||||||
return parsed
|
return parsed
|
||||||
|
|
||||||
|
|
||||||
class CSSMatch(Document, object):
|
class _Match(object):
|
||||||
"""Perform CSS matching."""
|
"""Perform CSS matching."""
|
||||||
|
|
||||||
def __init__(self, selectors, scope, namespaces, flags):
|
def __init__(self, selectors, scope, namespaces, flags):
|
||||||
|
|
@ -476,7 +471,7 @@ class CSSMatch(Document, object):
|
||||||
|
|
||||||
if self.supports_namespaces():
|
if self.supports_namespaces():
|
||||||
namespace = ''
|
namespace = ''
|
||||||
ns = el.namespace
|
ns = self.get_uri(el)
|
||||||
if ns:
|
if ns:
|
||||||
namespace = ns
|
namespace = ns
|
||||||
else:
|
else:
|
||||||
|
|
@ -536,6 +531,57 @@ class CSSMatch(Document, object):
|
||||||
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
|
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def extended_language_filter(self, lang_range, lang_tag):
|
||||||
|
"""Filter the language tags."""
|
||||||
|
|
||||||
|
match = True
|
||||||
|
lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
|
||||||
|
ranges = lang_range.split('-')
|
||||||
|
subtags = lang_tag.lower().split('-')
|
||||||
|
length = len(ranges)
|
||||||
|
rindex = 0
|
||||||
|
sindex = 0
|
||||||
|
r = ranges[rindex]
|
||||||
|
s = subtags[sindex]
|
||||||
|
|
||||||
|
# Primary tag needs to match
|
||||||
|
if r != '*' and r != s:
|
||||||
|
match = False
|
||||||
|
|
||||||
|
rindex += 1
|
||||||
|
sindex += 1
|
||||||
|
|
||||||
|
# Match until we run out of ranges
|
||||||
|
while match and rindex < length:
|
||||||
|
r = ranges[rindex]
|
||||||
|
try:
|
||||||
|
s = subtags[sindex]
|
||||||
|
except IndexError:
|
||||||
|
# Ran out of subtags,
|
||||||
|
# but we still have ranges
|
||||||
|
match = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Empty range
|
||||||
|
if not r:
|
||||||
|
match = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Matched range
|
||||||
|
elif s == r:
|
||||||
|
rindex += 1
|
||||||
|
|
||||||
|
# Implicit wildcard cannot match
|
||||||
|
# singletons
|
||||||
|
elif len(s) == 1:
|
||||||
|
match = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Implicitly matched, so grab next subtag
|
||||||
|
sindex += 1
|
||||||
|
|
||||||
|
return match
|
||||||
|
|
||||||
def match_attribute_name(self, el, attr, prefix):
|
def match_attribute_name(self, el, attr, prefix):
|
||||||
"""Match attribute name and return value if it exists."""
|
"""Match attribute name and return value if it exists."""
|
||||||
|
|
||||||
|
|
@ -660,12 +706,12 @@ class CSSMatch(Document, object):
|
||||||
if parent:
|
if parent:
|
||||||
found = self.match_selectors(parent, relation)
|
found = self.match_selectors(parent, relation)
|
||||||
elif relation[0].rel_type == REL_SIBLING:
|
elif relation[0].rel_type == REL_SIBLING:
|
||||||
sibling = self.get_previous_tag(el)
|
sibling = self.get_previous(el)
|
||||||
while not found and sibling:
|
while not found and sibling:
|
||||||
found = self.match_selectors(sibling, relation)
|
found = self.match_selectors(sibling, relation)
|
||||||
sibling = self.get_previous_tag(sibling)
|
sibling = self.get_previous(sibling)
|
||||||
elif relation[0].rel_type == REL_CLOSE_SIBLING:
|
elif relation[0].rel_type == REL_CLOSE_SIBLING:
|
||||||
sibling = self.get_previous_tag(el)
|
sibling = self.get_previous(el)
|
||||||
if sibling and self.is_tag(sibling):
|
if sibling and self.is_tag(sibling):
|
||||||
found = self.match_selectors(sibling, relation)
|
found = self.match_selectors(sibling, relation)
|
||||||
return found
|
return found
|
||||||
|
|
@ -690,12 +736,12 @@ class CSSMatch(Document, object):
|
||||||
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
|
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
|
||||||
found = self.match_future_child(el, relation)
|
found = self.match_future_child(el, relation)
|
||||||
elif relation[0].rel_type == REL_HAS_SIBLING:
|
elif relation[0].rel_type == REL_HAS_SIBLING:
|
||||||
sibling = self.get_next_tag(el)
|
sibling = self.get_next(el)
|
||||||
while not found and sibling:
|
while not found and sibling:
|
||||||
found = self.match_selectors(sibling, relation)
|
found = self.match_selectors(sibling, relation)
|
||||||
sibling = self.get_next_tag(sibling)
|
sibling = self.get_next(sibling)
|
||||||
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
|
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
|
||||||
sibling = self.get_next_tag(el)
|
sibling = self.get_next(el)
|
||||||
if sibling and self.is_tag(sibling):
|
if sibling and self.is_tag(sibling):
|
||||||
found = self.match_selectors(sibling, relation)
|
found = self.match_selectors(sibling, relation)
|
||||||
return found
|
return found
|
||||||
|
|
@ -736,7 +782,28 @@ class CSSMatch(Document, object):
|
||||||
def match_root(self, el):
|
def match_root(self, el):
|
||||||
"""Match element as root."""
|
"""Match element as root."""
|
||||||
|
|
||||||
return self.is_root(el)
|
is_root = self.is_root(el)
|
||||||
|
if is_root:
|
||||||
|
sibling = self.get_previous(el, tags=False)
|
||||||
|
while is_root and sibling is not None:
|
||||||
|
if (
|
||||||
|
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||||||
|
self.is_cdata(sibling)
|
||||||
|
):
|
||||||
|
is_root = False
|
||||||
|
else:
|
||||||
|
sibling = self.get_previous(sibling, tags=False)
|
||||||
|
if is_root:
|
||||||
|
sibling = self.get_next(el, tags=False)
|
||||||
|
while is_root and sibling is not None:
|
||||||
|
if (
|
||||||
|
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||||||
|
self.is_cdata(sibling)
|
||||||
|
):
|
||||||
|
is_root = False
|
||||||
|
else:
|
||||||
|
sibling = self.get_next(sibling, tags=False)
|
||||||
|
return is_root
|
||||||
|
|
||||||
def match_scope(self, el):
|
def match_scope(self, el):
|
||||||
"""Match element as scope."""
|
"""Match element as scope."""
|
||||||
|
|
@ -881,12 +948,23 @@ class CSSMatch(Document, object):
|
||||||
content = None
|
content = None
|
||||||
for contain_list in contains:
|
for contain_list in contains:
|
||||||
if content is None:
|
if content is None:
|
||||||
content = self.get_text(el, no_iframe=self.is_html)
|
if contain_list.own:
|
||||||
|
content = self.get_own_text(el, no_iframe=self.is_html)
|
||||||
|
else:
|
||||||
|
content = self.get_text(el, no_iframe=self.is_html)
|
||||||
found = False
|
found = False
|
||||||
for text in contain_list.text:
|
for text in contain_list.text:
|
||||||
if text in content:
|
if contain_list.own:
|
||||||
found = True
|
for c in content:
|
||||||
break
|
if text in c:
|
||||||
|
found = True
|
||||||
|
break
|
||||||
|
if found:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if text in content:
|
||||||
|
found = True
|
||||||
|
break
|
||||||
if not found:
|
if not found:
|
||||||
match = False
|
match = False
|
||||||
return match
|
return match
|
||||||
|
|
@ -1070,7 +1148,7 @@ class CSSMatch(Document, object):
|
||||||
for patterns in langs:
|
for patterns in langs:
|
||||||
match = False
|
match = False
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
if pattern.match(found_lang):
|
if self.extended_language_filter(pattern, found_lang):
|
||||||
match = True
|
match = True
|
||||||
if not match:
|
if not match:
|
||||||
break
|
break
|
||||||
|
|
@ -1152,7 +1230,7 @@ class CSSMatch(Document, object):
|
||||||
|
|
||||||
out_of_range = False
|
out_of_range = False
|
||||||
|
|
||||||
itype = self.get_attribute_by_name(el, 'type').lower()
|
itype = util.lower(self.get_attribute_by_name(el, 'type'))
|
||||||
mn = self.get_attribute_by_name(el, 'min', None)
|
mn = self.get_attribute_by_name(el, 'min', None)
|
||||||
if mn is not None:
|
if mn is not None:
|
||||||
mn = Inputs.parse_value(itype, mn)
|
mn = Inputs.parse_value(itype, mn)
|
||||||
|
|
@ -1207,6 +1285,21 @@ class CSSMatch(Document, object):
|
||||||
self.get_prefix(el) is not None
|
self.get_prefix(el) is not None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def match_placeholder_shown(self, el):
|
||||||
|
"""
|
||||||
|
Match placeholder shown according to HTML spec.
|
||||||
|
|
||||||
|
- text area should be checked if they have content. A single newline does not count as content.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
match = False
|
||||||
|
content = self.get_text(el)
|
||||||
|
if content in ('', '\n'):
|
||||||
|
match = True
|
||||||
|
|
||||||
|
return match
|
||||||
|
|
||||||
def match_selectors(self, el, selectors):
|
def match_selectors(self, el, selectors):
|
||||||
"""Check if element matches one of the selectors."""
|
"""Check if element matches one of the selectors."""
|
||||||
|
|
||||||
|
|
@ -1239,6 +1332,9 @@ class CSSMatch(Document, object):
|
||||||
# Verify element is scope
|
# Verify element is scope
|
||||||
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
|
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
|
||||||
continue
|
continue
|
||||||
|
# Verify element has placeholder shown
|
||||||
|
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
|
||||||
|
continue
|
||||||
# Verify `nth` matches
|
# Verify `nth` matches
|
||||||
if not self.match_nth(el, selector.nth):
|
if not self.match_nth(el, selector.nth):
|
||||||
continue
|
continue
|
||||||
|
|
@ -1325,28 +1421,8 @@ class CSSMatch(Document, object):
|
||||||
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
|
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
|
||||||
|
|
||||||
|
|
||||||
class CommentsMatch(Document, object):
|
class CSSMatch(_DocumentNav, _Match):
|
||||||
"""Comments matcher."""
|
"""The Beautiful Soup CSS match class."""
|
||||||
|
|
||||||
def __init__(self, el):
|
|
||||||
"""Initialize."""
|
|
||||||
|
|
||||||
self.assert_valid_input(el)
|
|
||||||
self.tag = el
|
|
||||||
|
|
||||||
def get_comments(self, limit=0):
|
|
||||||
"""Get comments."""
|
|
||||||
|
|
||||||
if limit < 1:
|
|
||||||
limit = None
|
|
||||||
|
|
||||||
for child in self.get_descendants(self.tag, tags=False):
|
|
||||||
if self.is_comment(child):
|
|
||||||
yield child
|
|
||||||
if limit is not None:
|
|
||||||
limit -= 1
|
|
||||||
if limit < 1:
|
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
class SoupSieve(ct.Immutable):
|
class SoupSieve(ct.Immutable):
|
||||||
|
|
@ -1392,19 +1468,6 @@ class SoupSieve(ct.Immutable):
|
||||||
else:
|
else:
|
||||||
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
|
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
|
||||||
|
|
||||||
@util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
|
|
||||||
def comments(self, tag, limit=0):
|
|
||||||
"""Get comments only."""
|
|
||||||
|
|
||||||
return [comment for comment in CommentsMatch(tag).get_comments(limit)]
|
|
||||||
|
|
||||||
@util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
|
|
||||||
def icomments(self, tag, limit=0):
|
|
||||||
"""Iterate comments only."""
|
|
||||||
|
|
||||||
for comment in CommentsMatch(tag).get_comments(limit):
|
|
||||||
yield comment
|
|
||||||
|
|
||||||
def select_one(self, tag):
|
def select_one(self, tag):
|
||||||
"""Select a single tag."""
|
"""Select a single tag."""
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
"""CSS selector parser."""
|
"""CSS selector parser."""
|
||||||
from __future__ import unicode_literals
|
|
||||||
import re
|
import re
|
||||||
|
from functools import lru_cache
|
||||||
from . import util
|
from . import util
|
||||||
from . import css_match as cm
|
from . import css_match as cm
|
||||||
from . import css_types as ct
|
from . import css_types as ct
|
||||||
from .util import SelectorSyntaxError
|
from .util import SelectorSyntaxError
|
||||||
|
import warnings
|
||||||
|
|
||||||
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
UNICODE_REPLACEMENT_CHAR = 0xFFFD
|
||||||
|
|
||||||
|
|
@ -59,6 +60,8 @@ PSEUDO_SIMPLE_NO_MATCH = {
|
||||||
# Complex pseudo classes that take selector lists
|
# Complex pseudo classes that take selector lists
|
||||||
PSEUDO_COMPLEX = {
|
PSEUDO_COMPLEX = {
|
||||||
':contains',
|
':contains',
|
||||||
|
':-soup-contains',
|
||||||
|
':-soup-contains-own',
|
||||||
':has',
|
':has',
|
||||||
':is',
|
':is',
|
||||||
':matches',
|
':matches',
|
||||||
|
|
@ -110,11 +113,6 @@ VALUE = r'''
|
||||||
ATTR = r'''
|
ATTR = r'''
|
||||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
||||||
'''.format(ws=WSC, value=VALUE)
|
'''.format(ws=WSC, value=VALUE)
|
||||||
# Definitions for quirks mode
|
|
||||||
QUIRKS_ATTR_IDENTIFIER = r'(?:(?:{esc}|(?!/\*)[^"\] \t\r\n\f])+?)'.format(esc=CSS_ESCAPES)
|
|
||||||
QUIRKS_ATTR = r'''
|
|
||||||
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
|
|
||||||
'''.format(ws=WSC, value=QUIRKS_ATTR_IDENTIFIER)
|
|
||||||
|
|
||||||
# Selector patterns
|
# Selector patterns
|
||||||
# IDs (`#id`)
|
# IDs (`#id`)
|
||||||
|
|
@ -122,13 +120,11 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
|
||||||
# Classes (`.class`)
|
# Classes (`.class`)
|
||||||
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
|
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
|
||||||
# Prefix:Tag (`prefix|tag`)
|
# Prefix:Tag (`prefix|tag`)
|
||||||
PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER)
|
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
|
||||||
# Attributes (`[attr]`, `[attr=value]`, etc.)
|
# Attributes (`[attr]`, `[attr=value]`, etc.)
|
||||||
PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
|
PAT_ATTR = r'''
|
||||||
# Quirks attributes, like real attributes, but unquoted values can contain anything but whitespace and closing `]`.
|
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
|
||||||
PAT_QUIRKS_ATTR = r'''
|
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
|
||||||
\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}
|
|
||||||
'''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR)
|
|
||||||
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
|
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
|
||||||
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
|
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
|
||||||
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
|
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
|
||||||
|
|
@ -199,12 +195,13 @@ FLG_INDETERMINATE = 0x20
|
||||||
FLG_OPEN = 0x40
|
FLG_OPEN = 0x40
|
||||||
FLG_IN_RANGE = 0x80
|
FLG_IN_RANGE = 0x80
|
||||||
FLG_OUT_OF_RANGE = 0x100
|
FLG_OUT_OF_RANGE = 0x100
|
||||||
|
FLG_PLACEHOLDER_SHOWN = 0x200
|
||||||
|
|
||||||
# Maximum cached patterns to store
|
# Maximum cached patterns to store
|
||||||
_MAXCACHE = 500
|
_MAXCACHE = 500
|
||||||
|
|
||||||
|
|
||||||
@util.lru_cache(maxsize=_MAXCACHE)
|
@lru_cache(maxsize=_MAXCACHE)
|
||||||
def _cached_css_compile(pattern, namespaces, custom, flags):
|
def _cached_css_compile(pattern, namespaces, custom, flags):
|
||||||
"""Cached CSS compile."""
|
"""Cached CSS compile."""
|
||||||
|
|
||||||
|
|
@ -253,7 +250,7 @@ def css_unescape(content, string=False):
|
||||||
codepoint = int(m.group(1)[1:], 16)
|
codepoint = int(m.group(1)[1:], 16)
|
||||||
if codepoint == 0:
|
if codepoint == 0:
|
||||||
codepoint = UNICODE_REPLACEMENT_CHAR
|
codepoint = UNICODE_REPLACEMENT_CHAR
|
||||||
value = util.uchr(codepoint)
|
value = chr(codepoint)
|
||||||
elif m.group(2):
|
elif m.group(2):
|
||||||
value = m.group(2)[1:]
|
value = m.group(2)[1:]
|
||||||
elif m.group(3):
|
elif m.group(3):
|
||||||
|
|
@ -277,7 +274,7 @@ def escape(ident):
|
||||||
string.append('\\{}'.format(ident))
|
string.append('\\{}'.format(ident))
|
||||||
else:
|
else:
|
||||||
for index, c in enumerate(ident):
|
for index, c in enumerate(ident):
|
||||||
codepoint = util.uord(c)
|
codepoint = ord(c)
|
||||||
if codepoint == 0x00:
|
if codepoint == 0x00:
|
||||||
string.append('\ufffd')
|
string.append('\ufffd')
|
||||||
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
|
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
|
||||||
|
|
@ -308,12 +305,7 @@ class SelectorPattern(object):
|
||||||
|
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
def enabled(self, flags):
|
def match(self, selector, index, flags):
|
||||||
"""Enabled."""
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def match(self, selector, index):
|
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
return self.re_pattern.match(selector, index)
|
return self.re_pattern.match(selector, index)
|
||||||
|
|
@ -328,7 +320,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
self.patterns = {}
|
self.patterns = {}
|
||||||
for p in patterns:
|
for p in patterns:
|
||||||
name = p[0]
|
name = p[0]
|
||||||
pattern = SelectorPattern(name, p[2])
|
pattern = p[3](name, p[2])
|
||||||
for pseudo in p[1]:
|
for pseudo in p[1]:
|
||||||
self.patterns[pseudo] = pattern
|
self.patterns[pseudo] = pattern
|
||||||
|
|
||||||
|
|
@ -340,12 +332,7 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
|
|
||||||
return self.matched_name.get_name()
|
return self.matched_name.get_name()
|
||||||
|
|
||||||
def enabled(self, flags):
|
def match(self, selector, index, flags):
|
||||||
"""Enabled."""
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def match(self, selector, index):
|
|
||||||
"""Match the selector."""
|
"""Match the selector."""
|
||||||
|
|
||||||
pseudo = None
|
pseudo = None
|
||||||
|
|
@ -354,22 +341,13 @@ class SpecialPseudoPattern(SelectorPattern):
|
||||||
name = util.lower(css_unescape(m.group('name')))
|
name = util.lower(css_unescape(m.group('name')))
|
||||||
pattern = self.patterns.get(name)
|
pattern = self.patterns.get(name)
|
||||||
if pattern:
|
if pattern:
|
||||||
pseudo = pattern.match(selector, index)
|
pseudo = pattern.match(selector, index, flags)
|
||||||
if pseudo:
|
if pseudo:
|
||||||
self.matched_name = pattern
|
self.matched_name = pattern
|
||||||
|
|
||||||
return pseudo
|
return pseudo
|
||||||
|
|
||||||
|
|
||||||
class QuirkPattern(SelectorPattern):
|
|
||||||
"""Selector pattern for quirk mode."""
|
|
||||||
|
|
||||||
def enabled(self, flags):
|
|
||||||
"""Enabled if quirks flag is present."""
|
|
||||||
|
|
||||||
return flags & util._QUIRKS
|
|
||||||
|
|
||||||
|
|
||||||
class _Selector(object):
|
class _Selector(object):
|
||||||
"""
|
"""
|
||||||
Intermediate selector class.
|
Intermediate selector class.
|
||||||
|
|
@ -446,11 +424,16 @@ class CSSParser(object):
|
||||||
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
|
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
|
||||||
SpecialPseudoPattern(
|
SpecialPseudoPattern(
|
||||||
(
|
(
|
||||||
("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
|
(
|
||||||
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
|
"pseudo_contains",
|
||||||
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
|
(':contains', ':-soup-contains', ':-soup-contains-own'),
|
||||||
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
|
PAT_PSEUDO_CONTAINS,
|
||||||
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
|
SelectorPattern
|
||||||
|
),
|
||||||
|
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
|
||||||
|
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
|
||||||
|
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
|
||||||
|
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
|
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
|
||||||
|
|
@ -461,7 +444,6 @@ class CSSParser(object):
|
||||||
SelectorPattern("class", PAT_CLASS),
|
SelectorPattern("class", PAT_CLASS),
|
||||||
SelectorPattern("tag", PAT_TAG),
|
SelectorPattern("tag", PAT_TAG),
|
||||||
SelectorPattern("attribute", PAT_ATTR),
|
SelectorPattern("attribute", PAT_ATTR),
|
||||||
QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR),
|
|
||||||
SelectorPattern("combine", PAT_COMBINE)
|
SelectorPattern("combine", PAT_COMBINE)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -471,24 +453,19 @@ class CSSParser(object):
|
||||||
self.pattern = selector.replace('\x00', '\ufffd')
|
self.pattern = selector.replace('\x00', '\ufffd')
|
||||||
self.flags = flags
|
self.flags = flags
|
||||||
self.debug = self.flags & util.DEBUG
|
self.debug = self.flags & util.DEBUG
|
||||||
self.quirks = self.flags & util._QUIRKS
|
|
||||||
self.custom = {} if custom is None else custom
|
self.custom = {} if custom is None else custom
|
||||||
|
|
||||||
def parse_attribute_selector(self, sel, m, has_selector, quirks):
|
def parse_attribute_selector(self, sel, m, has_selector):
|
||||||
"""Create attribute selector from the returned regex match."""
|
"""Create attribute selector from the returned regex match."""
|
||||||
|
|
||||||
inverse = False
|
inverse = False
|
||||||
op = m.group('cmp')
|
op = m.group('cmp')
|
||||||
case = util.lower(m.group('case')) if m.group('case') else None
|
case = util.lower(m.group('case')) if m.group('case') else None
|
||||||
parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
|
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
|
||||||
ns = ''
|
attr = css_unescape(m.group('attr_name'))
|
||||||
is_type = False
|
is_type = False
|
||||||
pattern2 = None
|
pattern2 = None
|
||||||
if len(parts) > 1:
|
|
||||||
ns = parts[0]
|
|
||||||
attr = parts[1]
|
|
||||||
else:
|
|
||||||
attr = parts[0]
|
|
||||||
if case:
|
if case:
|
||||||
flags = re.I if case == 'i' else 0
|
flags = re.I if case == 'i' else 0
|
||||||
elif util.lower(attr) == 'type':
|
elif util.lower(attr) == 'type':
|
||||||
|
|
@ -498,7 +475,7 @@ class CSSParser(object):
|
||||||
flags = 0
|
flags = 0
|
||||||
|
|
||||||
if op:
|
if op:
|
||||||
if m.group('value').startswith(('"', "'")) and not quirks:
|
if m.group('value').startswith(('"', "'")):
|
||||||
value = css_unescape(m.group('value')[1:-1], True)
|
value = css_unescape(m.group('value')[1:-1], True)
|
||||||
else:
|
else:
|
||||||
value = css_unescape(m.group('value'))
|
value = css_unescape(m.group('value'))
|
||||||
|
|
@ -525,13 +502,12 @@ class CSSParser(object):
|
||||||
elif op.startswith('|'):
|
elif op.startswith('|'):
|
||||||
# Value starts with word in dash separated list
|
# Value starts with word in dash separated list
|
||||||
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
||||||
elif op.startswith('!'):
|
|
||||||
# Equivalent to `:not([attr=value])`
|
|
||||||
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
|
|
||||||
inverse = True
|
|
||||||
else:
|
else:
|
||||||
# Value matches
|
# Value matches
|
||||||
pattern = re.compile(r'^%s$' % re.escape(value), flags)
|
pattern = re.compile(r'^%s$' % re.escape(value), flags)
|
||||||
|
if op.startswith('!'):
|
||||||
|
# Equivalent to `:not([attr=value])`
|
||||||
|
inverse = True
|
||||||
if is_type and pattern:
|
if is_type and pattern:
|
||||||
pattern2 = re.compile(pattern.pattern)
|
pattern2 = re.compile(pattern.pattern)
|
||||||
|
|
||||||
|
|
@ -552,13 +528,8 @@ class CSSParser(object):
|
||||||
def parse_tag_pattern(self, sel, m, has_selector):
|
def parse_tag_pattern(self, sel, m, has_selector):
|
||||||
"""Parse tag pattern from regex match."""
|
"""Parse tag pattern from regex match."""
|
||||||
|
|
||||||
parts = [css_unescape(x) for x in m.group(0).split('|')]
|
prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
|
||||||
if len(parts) > 1:
|
tag = css_unescape(m.group('tag_name'))
|
||||||
prefix = parts[0]
|
|
||||||
tag = parts[1]
|
|
||||||
else:
|
|
||||||
tag = parts[0]
|
|
||||||
prefix = None
|
|
||||||
sel.tag = ct.SelectorTag(tag, prefix)
|
sel.tag = ct.SelectorTag(tag, prefix)
|
||||||
has_selector = True
|
has_selector = True
|
||||||
return has_selector
|
return has_selector
|
||||||
|
|
@ -800,21 +771,11 @@ class CSSParser(object):
|
||||||
if not combinator:
|
if not combinator:
|
||||||
combinator = WS_COMBINATOR
|
combinator = WS_COMBINATOR
|
||||||
if not has_selector:
|
if not has_selector:
|
||||||
# The only way we don't fail is if we are at the root level and quirks mode is enabled,
|
raise SelectorSyntaxError(
|
||||||
# and we've found no other selectors yet in this compound selector.
|
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
|
||||||
if (not self.quirks or is_pseudo or combinator == COMMA_COMBINATOR or relations):
|
|
||||||
raise SelectorSyntaxError(
|
|
||||||
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
|
|
||||||
self.pattern,
|
|
||||||
index
|
|
||||||
)
|
|
||||||
util.warn_quirks(
|
|
||||||
'You have attempted to use a combinator without a selector before it at position {}.'.format(index),
|
|
||||||
'the :scope pseudo class (or another appropriate selector) should be placed before the combinator.',
|
|
||||||
self.pattern,
|
self.pattern,
|
||||||
index
|
index
|
||||||
)
|
)
|
||||||
sel.flags |= ct.SEL_SCOPE
|
|
||||||
|
|
||||||
if combinator == COMMA_COMBINATOR:
|
if combinator == COMMA_COMBINATOR:
|
||||||
if not sel.tag and not is_pseudo:
|
if not sel.tag and not is_pseudo:
|
||||||
|
|
@ -847,7 +808,14 @@ class CSSParser(object):
|
||||||
def parse_pseudo_contains(self, sel, m, has_selector):
|
def parse_pseudo_contains(self, sel, m, has_selector):
|
||||||
"""Parse contains."""
|
"""Parse contains."""
|
||||||
|
|
||||||
values = m.group('values')
|
pseudo = util.lower(css_unescape(m.group('name')))
|
||||||
|
if pseudo == ":contains":
|
||||||
|
warnings.warn(
|
||||||
|
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
|
||||||
|
FutureWarning
|
||||||
|
)
|
||||||
|
contains_own = pseudo == ":-soup-contains-own"
|
||||||
|
values = css_unescape(m.group('values'))
|
||||||
patterns = []
|
patterns = []
|
||||||
for token in RE_VALUES.finditer(values):
|
for token in RE_VALUES.finditer(values):
|
||||||
if token.group('split'):
|
if token.group('split'):
|
||||||
|
|
@ -858,7 +826,7 @@ class CSSParser(object):
|
||||||
else:
|
else:
|
||||||
value = css_unescape(value)
|
value = css_unescape(value)
|
||||||
patterns.append(value)
|
patterns.append(value)
|
||||||
sel.contains.append(ct.SelectorContains(tuple(patterns)))
|
sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own))
|
||||||
has_selector = True
|
has_selector = True
|
||||||
return has_selector
|
return has_selector
|
||||||
|
|
||||||
|
|
@ -872,20 +840,12 @@ class CSSParser(object):
|
||||||
continue
|
continue
|
||||||
value = token.group('value')
|
value = token.group('value')
|
||||||
if value.startswith(('"', "'")):
|
if value.startswith(('"', "'")):
|
||||||
parts = css_unescape(value[1:-1], True).split('-')
|
value = css_unescape(value[1:-1], True)
|
||||||
else:
|
else:
|
||||||
parts = css_unescape(value).split('-')
|
value = css_unescape(value)
|
||||||
|
|
||||||
|
patterns.append(value)
|
||||||
|
|
||||||
new_parts = []
|
|
||||||
first = True
|
|
||||||
for part in parts:
|
|
||||||
if part == '*' and first:
|
|
||||||
new_parts.append('(?!x\b)[a-z0-9]+?')
|
|
||||||
elif part != '*':
|
|
||||||
new_parts.append(('' if first else '(-(?!x\b)[a-z0-9]+)*?\\-') + re.escape(part))
|
|
||||||
if first:
|
|
||||||
first = False
|
|
||||||
patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
|
|
||||||
sel.lang.append(ct.SelectorLang(patterns))
|
sel.lang.append(ct.SelectorLang(patterns))
|
||||||
has_selector = True
|
has_selector = True
|
||||||
|
|
||||||
|
|
@ -917,6 +877,7 @@ class CSSParser(object):
|
||||||
is_indeterminate = bool(flags & FLG_INDETERMINATE)
|
is_indeterminate = bool(flags & FLG_INDETERMINATE)
|
||||||
is_in_range = bool(flags & FLG_IN_RANGE)
|
is_in_range = bool(flags & FLG_IN_RANGE)
|
||||||
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
|
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
|
||||||
|
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
|
||||||
|
|
||||||
if self.debug: # pragma: no cover
|
if self.debug: # pragma: no cover
|
||||||
if is_pseudo:
|
if is_pseudo:
|
||||||
|
|
@ -937,6 +898,8 @@ class CSSParser(object):
|
||||||
print(' is_in_range: True')
|
print(' is_in_range: True')
|
||||||
if is_out_of_range:
|
if is_out_of_range:
|
||||||
print(' is_out_of_range: True')
|
print(' is_out_of_range: True')
|
||||||
|
if is_placeholder_shown:
|
||||||
|
print(' is_placeholder_shown: True')
|
||||||
|
|
||||||
if is_relative:
|
if is_relative:
|
||||||
selectors.append(_Selector())
|
selectors.append(_Selector())
|
||||||
|
|
@ -953,7 +916,7 @@ class CSSParser(object):
|
||||||
elif key == 'pseudo_class':
|
elif key == 'pseudo_class':
|
||||||
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
|
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
|
||||||
elif key == 'pseudo_element':
|
elif key == 'pseudo_element':
|
||||||
raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
|
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
|
||||||
elif key == 'pseudo_contains':
|
elif key == 'pseudo_contains':
|
||||||
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
|
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
|
||||||
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
|
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
|
||||||
|
|
@ -989,18 +952,8 @@ class CSSParser(object):
|
||||||
has_selector, sel = self.parse_combinator(
|
has_selector, sel = self.parse_combinator(
|
||||||
sel, m, has_selector, selectors, relations, is_pseudo, index
|
sel, m, has_selector, selectors, relations, is_pseudo, index
|
||||||
)
|
)
|
||||||
elif key in ('attribute', 'quirks_attribute'):
|
elif key == 'attribute':
|
||||||
quirks = key == 'quirks_attribute'
|
has_selector = self.parse_attribute_selector(sel, m, has_selector)
|
||||||
if quirks:
|
|
||||||
temp_index = index + m.group(0).find('=') + 1
|
|
||||||
util.warn_quirks(
|
|
||||||
"You have attempted to use an attribute " +
|
|
||||||
"value that should have been quoted at position {}.".format(temp_index),
|
|
||||||
"the attribute value should be quoted.",
|
|
||||||
self.pattern,
|
|
||||||
temp_index
|
|
||||||
)
|
|
||||||
has_selector = self.parse_attribute_selector(sel, m, has_selector, quirks)
|
|
||||||
elif key == 'tag':
|
elif key == 'tag':
|
||||||
if has_selector:
|
if has_selector:
|
||||||
raise SelectorSyntaxError(
|
raise SelectorSyntaxError(
|
||||||
|
|
@ -1053,6 +1006,8 @@ class CSSParser(object):
|
||||||
selectors[-1].flags = ct.SEL_IN_RANGE
|
selectors[-1].flags = ct.SEL_IN_RANGE
|
||||||
if is_out_of_range:
|
if is_out_of_range:
|
||||||
selectors[-1].flags = ct.SEL_OUT_OF_RANGE
|
selectors[-1].flags = ct.SEL_OUT_OF_RANGE
|
||||||
|
if is_placeholder_shown:
|
||||||
|
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
|
||||||
|
|
||||||
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
|
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
|
||||||
|
|
||||||
|
|
@ -1066,15 +1021,11 @@ class CSSParser(object):
|
||||||
end = (m.start(0) - 1) if m else (len(pattern) - 1)
|
end = (m.start(0) - 1) if m else (len(pattern) - 1)
|
||||||
|
|
||||||
if self.debug: # pragma: no cover
|
if self.debug: # pragma: no cover
|
||||||
if self.quirks:
|
|
||||||
print('## QUIRKS MODE: Throwing out the spec!')
|
|
||||||
print('## PARSING: {!r}'.format(pattern))
|
print('## PARSING: {!r}'.format(pattern))
|
||||||
while index <= end:
|
while index <= end:
|
||||||
m = None
|
m = None
|
||||||
for v in self.css_tokens:
|
for v in self.css_tokens:
|
||||||
if not v.enabled(self.flags): # pragma: no cover
|
m = v.match(pattern, index, self.flags)
|
||||||
continue
|
|
||||||
m = v.match(pattern, index)
|
|
||||||
if m:
|
if m:
|
||||||
name = v.get_name()
|
name = v.get_name()
|
||||||
if self.debug: # pragma: no cover
|
if self.debug: # pragma: no cover
|
||||||
|
|
@ -1102,13 +1053,7 @@ class CSSParser(object):
|
||||||
print('## END PARSING')
|
print('## END PARSING')
|
||||||
|
|
||||||
def process_selectors(self, index=0, flags=0):
|
def process_selectors(self, index=0, flags=0):
|
||||||
"""
|
"""Process selectors."""
|
||||||
Process selectors.
|
|
||||||
|
|
||||||
We do our own selectors as BeautifulSoup4 has some annoying quirks,
|
|
||||||
and we don't really need to do nth selectors or siblings or
|
|
||||||
descendants etc.
|
|
||||||
"""
|
|
||||||
|
|
||||||
return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
|
return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
|
||||||
|
|
||||||
|
|
@ -1123,8 +1068,7 @@ CSS_LINK = CSSParser(
|
||||||
# CSS pattern for `:checked`
|
# CSS pattern for `:checked`
|
||||||
CSS_CHECKED = CSSParser(
|
CSS_CHECKED = CSSParser(
|
||||||
'''
|
'''
|
||||||
html|*:is(input[type=checkbox], input[type=radio])[checked],
|
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
|
||||||
html|select > html|option[selected]
|
|
||||||
'''
|
'''
|
||||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||||
# CSS pattern for `:default` (must compile CSS_CHECKED first)
|
# CSS pattern for `:default` (must compile CSS_CHECKED first)
|
||||||
|
|
@ -1150,23 +1094,23 @@ CSS_INDETERMINATE = CSSParser(
|
||||||
This pattern must be at the end.
|
This pattern must be at the end.
|
||||||
Special logic is applied to the last selector.
|
Special logic is applied to the last selector.
|
||||||
*/
|
*/
|
||||||
html|input[type="radio"][name][name!='']:not([checked])
|
html|input[type="radio"][name]:not([name='']):not([checked])
|
||||||
'''
|
'''
|
||||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
|
||||||
# CSS pattern for `:disabled`
|
# CSS pattern for `:disabled`
|
||||||
CSS_DISABLED = CSSParser(
|
CSS_DISABLED = CSSParser(
|
||||||
'''
|
'''
|
||||||
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
|
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
|
||||||
html|optgroup[disabled] > html|option,
|
html|optgroup[disabled] > html|option,
|
||||||
html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
|
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
|
||||||
html|fieldset[disabled] >
|
html|fieldset[disabled] >
|
||||||
html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
|
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
|
||||||
'''
|
'''
|
||||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||||
# CSS pattern for `:enabled`
|
# CSS pattern for `:enabled`
|
||||||
CSS_ENABLED = CSSParser(
|
CSS_ENABLED = CSSParser(
|
||||||
'''
|
'''
|
||||||
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
|
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
|
||||||
'''
|
'''
|
||||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
||||||
# CSS pattern for `:required`
|
# CSS pattern for `:required`
|
||||||
|
|
@ -1180,22 +1124,20 @@ CSS_OPTIONAL = CSSParser(
|
||||||
# CSS pattern for `:placeholder-shown`
|
# CSS pattern for `:placeholder-shown`
|
||||||
CSS_PLACEHOLDER_SHOWN = CSSParser(
|
CSS_PLACEHOLDER_SHOWN = CSSParser(
|
||||||
'''
|
'''
|
||||||
html|*:is(
|
html|input:is(
|
||||||
input:is(
|
:not([type]),
|
||||||
:not([type]),
|
[type=""],
|
||||||
[type=""],
|
[type=text],
|
||||||
[type=text],
|
[type=search],
|
||||||
[type=search],
|
[type=url],
|
||||||
[type=url],
|
[type=tel],
|
||||||
[type=tel],
|
[type=email],
|
||||||
[type=email],
|
[type=password],
|
||||||
[type=password],
|
[type=number]
|
||||||
[type=number]
|
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
|
||||||
),
|
html|textarea[placeholder]:not([placeholder=''])
|
||||||
textarea
|
|
||||||
)[placeholder][placeholder!='']
|
|
||||||
'''
|
'''
|
||||||
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
|
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
|
||||||
# CSS pattern default for `:nth-child` "of S" feature
|
# CSS pattern default for `:nth-child` "of S" feature
|
||||||
CSS_NTH_OF_S_DEFAULT = CSSParser(
|
CSS_NTH_OF_S_DEFAULT = CSSParser(
|
||||||
'*|*'
|
'*|*'
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"""CSS selector structure items."""
|
"""CSS selector structure items."""
|
||||||
from __future__ import unicode_literals
|
import copyreg
|
||||||
from . import util
|
from collections.abc import Hashable, Mapping
|
||||||
|
|
||||||
__all__ = (
|
__all__ = (
|
||||||
'Selector',
|
'Selector',
|
||||||
|
|
@ -26,6 +26,7 @@ SEL_DIR_RTL = 0x40
|
||||||
SEL_IN_RANGE = 0x80
|
SEL_IN_RANGE = 0x80
|
||||||
SEL_OUT_OF_RANGE = 0x100
|
SEL_OUT_OF_RANGE = 0x100
|
||||||
SEL_DEFINED = 0x200
|
SEL_DEFINED = 0x200
|
||||||
|
SEL_PLACEHOLDER_SHOWN = 0x400
|
||||||
|
|
||||||
|
|
||||||
class Immutable(object):
|
class Immutable(object):
|
||||||
|
|
@ -85,7 +86,7 @@ class Immutable(object):
|
||||||
__str__ = __repr__
|
__str__ = __repr__
|
||||||
|
|
||||||
|
|
||||||
class ImmutableDict(util.Mapping):
|
class ImmutableDict(Mapping):
|
||||||
"""Hashable, immutable dictionary."""
|
"""Hashable, immutable dictionary."""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
@ -94,8 +95,8 @@ class ImmutableDict(util.Mapping):
|
||||||
arg = args[0] if args else kwargs
|
arg = args[0] if args else kwargs
|
||||||
is_dict = isinstance(arg, dict)
|
is_dict = isinstance(arg, dict)
|
||||||
if (
|
if (
|
||||||
is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
|
is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or
|
||||||
not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
|
not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg])
|
||||||
):
|
):
|
||||||
raise TypeError('All values must be hashable')
|
raise TypeError('All values must be hashable')
|
||||||
|
|
||||||
|
|
@ -140,9 +141,9 @@ class Namespaces(ImmutableDict):
|
||||||
# so don't bother checking that.
|
# so don't bother checking that.
|
||||||
arg = args[0] if args else kwargs
|
arg = args[0] if args else kwargs
|
||||||
is_dict = isinstance(arg, dict)
|
is_dict = isinstance(arg, dict)
|
||||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
|
||||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
|
||||||
raise TypeError('Namespace keys and values must be Unicode strings')
|
raise TypeError('Namespace keys and values must be Unicode strings')
|
||||||
|
|
||||||
super(Namespaces, self).__init__(*args, **kwargs)
|
super(Namespaces, self).__init__(*args, **kwargs)
|
||||||
|
|
@ -159,9 +160,9 @@ class CustomSelectors(ImmutableDict):
|
||||||
# so don't bother checking that.
|
# so don't bother checking that.
|
||||||
arg = args[0] if args else kwargs
|
arg = args[0] if args else kwargs
|
||||||
is_dict = isinstance(arg, dict)
|
is_dict = isinstance(arg, dict)
|
||||||
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
|
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
|
||||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||||
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
|
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
|
||||||
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
raise TypeError('CustomSelectors keys and values must be Unicode strings')
|
||||||
|
|
||||||
super(CustomSelectors, self).__init__(*args, **kwargs)
|
super(CustomSelectors, self).__init__(*args, **kwargs)
|
||||||
|
|
@ -238,13 +239,14 @@ class SelectorAttribute(Immutable):
|
||||||
class SelectorContains(Immutable):
|
class SelectorContains(Immutable):
|
||||||
"""Selector contains rule."""
|
"""Selector contains rule."""
|
||||||
|
|
||||||
__slots__ = ("text", "_hash")
|
__slots__ = ("text", "own", "_hash")
|
||||||
|
|
||||||
def __init__(self, text):
|
def __init__(self, text, own):
|
||||||
"""Initialize."""
|
"""Initialize."""
|
||||||
|
|
||||||
super(SelectorContains, self).__init__(
|
super(SelectorContains, self).__init__(
|
||||||
text=text
|
text=text,
|
||||||
|
own=own
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -331,7 +333,7 @@ def _pickle(p):
|
||||||
def pickle_register(obj):
|
def pickle_register(obj):
|
||||||
"""Allow object to be pickled."""
|
"""Allow object to be pickled."""
|
||||||
|
|
||||||
util.copyreg.pickle(obj, _pickle)
|
copyreg.pickle(obj, _pickle)
|
||||||
|
|
||||||
|
|
||||||
pickle_register(Selector)
|
pickle_register(Selector)
|
||||||
|
|
|
||||||
|
|
@ -1,47 +1,17 @@
|
||||||
"""Utility."""
|
"""Utility."""
|
||||||
from __future__ import unicode_literals
|
from functools import wraps, lru_cache
|
||||||
from functools import wraps
|
|
||||||
import warnings
|
import warnings
|
||||||
import sys
|
|
||||||
import struct
|
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
MODULE = os.path.dirname(__file__)
|
|
||||||
|
|
||||||
PY3 = sys.version_info >= (3, 0)
|
|
||||||
PY35 = sys.version_info >= (3, 5)
|
|
||||||
PY37 = sys.version_info >= (3, 7)
|
|
||||||
|
|
||||||
if PY3:
|
|
||||||
from functools import lru_cache # noqa F401
|
|
||||||
import copyreg # noqa F401
|
|
||||||
from collections.abc import Hashable, Mapping # noqa F401
|
|
||||||
|
|
||||||
ustr = str
|
|
||||||
bstr = bytes
|
|
||||||
unichar = chr
|
|
||||||
string = str
|
|
||||||
else:
|
|
||||||
from backports.functools_lru_cache import lru_cache # noqa F401
|
|
||||||
import copy_reg as copyreg # noqa F401
|
|
||||||
from collections import Hashable, Mapping # noqa F401
|
|
||||||
|
|
||||||
ustr = unicode # noqa: F821
|
|
||||||
bstr = str
|
|
||||||
unichar = unichr # noqa: F821
|
|
||||||
string = basestring # noqa: F821
|
|
||||||
|
|
||||||
DEBUG = 0x00001
|
DEBUG = 0x00001
|
||||||
_QUIRKS = 0x10000
|
|
||||||
|
|
||||||
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
|
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
|
||||||
|
|
||||||
LC_A = ord('a')
|
|
||||||
LC_Z = ord('z')
|
|
||||||
UC_A = ord('A')
|
UC_A = ord('A')
|
||||||
UC_Z = ord('Z')
|
UC_Z = ord('Z')
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=512)
|
||||||
def lower(string):
|
def lower(string):
|
||||||
"""Lower."""
|
"""Lower."""
|
||||||
|
|
||||||
|
|
@ -52,38 +22,7 @@ def lower(string):
|
||||||
return ''.join(new_string)
|
return ''.join(new_string)
|
||||||
|
|
||||||
|
|
||||||
def upper(string): # pragma: no cover
|
class SelectorSyntaxError(Exception):
|
||||||
"""Lower."""
|
|
||||||
|
|
||||||
new_string = []
|
|
||||||
for c in string:
|
|
||||||
o = ord(c)
|
|
||||||
new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
|
|
||||||
return ''.join(new_string)
|
|
||||||
|
|
||||||
|
|
||||||
def uchr(i):
|
|
||||||
"""Allow getting Unicode character on narrow python builds."""
|
|
||||||
|
|
||||||
try:
|
|
||||||
return unichar(i)
|
|
||||||
except ValueError: # pragma: no cover
|
|
||||||
return struct.pack('i', i).decode('utf-32')
|
|
||||||
|
|
||||||
|
|
||||||
def uord(c):
|
|
||||||
"""Get Unicode ordinal."""
|
|
||||||
|
|
||||||
if len(c) == 2: # pragma: no cover
|
|
||||||
high, low = [ord(p) for p in c]
|
|
||||||
ordinal = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
|
|
||||||
else:
|
|
||||||
ordinal = ord(c)
|
|
||||||
|
|
||||||
return ordinal
|
|
||||||
|
|
||||||
|
|
||||||
class SelectorSyntaxError(SyntaxError):
|
|
||||||
"""Syntax error in a CSS selector."""
|
"""Syntax error in a CSS selector."""
|
||||||
|
|
||||||
def __init__(self, msg, pattern=None, index=None):
|
def __init__(self, msg, pattern=None, index=None):
|
||||||
|
|
@ -169,45 +108,3 @@ def get_pattern_context(pattern, index):
|
||||||
last = m.end(0)
|
last = m.end(0)
|
||||||
|
|
||||||
return ''.join(text), line, col
|
return ''.join(text), line, col
|
||||||
|
|
||||||
|
|
||||||
class QuirksWarning(UserWarning): # pragma: no cover
|
|
||||||
"""Warning for quirks mode."""
|
|
||||||
|
|
||||||
|
|
||||||
def warn_quirks(message, recommend, pattern, index):
|
|
||||||
"""Warn quirks."""
|
|
||||||
|
|
||||||
import traceback
|
|
||||||
import bs4 # noqa: F401
|
|
||||||
|
|
||||||
# Acquire source code line context
|
|
||||||
paths = (MODULE, sys.modules['bs4'].__path__[0])
|
|
||||||
tb = traceback.extract_stack()
|
|
||||||
previous = None
|
|
||||||
filename = None
|
|
||||||
lineno = None
|
|
||||||
for entry in tb:
|
|
||||||
if (PY35 and entry.filename.startswith(paths)) or (not PY35 and entry[0].startswith(paths)):
|
|
||||||
break
|
|
||||||
previous = entry
|
|
||||||
if previous:
|
|
||||||
filename = previous.filename if PY35 else previous[0]
|
|
||||||
lineno = previous.lineno if PY35 else previous[1]
|
|
||||||
|
|
||||||
# Format pattern to show line and column position
|
|
||||||
context, line = get_pattern_context(pattern, index)[0:2]
|
|
||||||
|
|
||||||
# Display warning
|
|
||||||
warnings.warn_explicit(
|
|
||||||
"\nCSS selector pattern:\n" +
|
|
||||||
" {}\n".format(message) +
|
|
||||||
" This behavior is only allowed temporarily for Beautiful Soup's transition to Soup Sieve.\n" +
|
|
||||||
" In order to confrom to the CSS spec, {}\n".format(recommend) +
|
|
||||||
" It is strongly recommended the selector be altered to conform to the CSS spec " +
|
|
||||||
"as an exception will be raised for this case in the future.\n" +
|
|
||||||
"pattern line {}:\n{}".format(line, context),
|
|
||||||
QuirksWarning,
|
|
||||||
filename,
|
|
||||||
lineno
|
|
||||||
)
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue