Update old included_dependencies to current versions.

This commit is contained in:
Jim Miller 2020-12-22 13:29:20 -06:00
parent d33decd8f5
commit 7b951d7f4d
23 changed files with 33216 additions and 1655 deletions

View file

@ -16,11 +16,14 @@
######################### END LICENSE BLOCK #########################
from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']
def detect(byte_str):
"""
Detect the encoding of the given byte string.
@ -31,9 +34,50 @@ def detect(byte_str):
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
'{}'.format(type(byte_str)))
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()
def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: A list of dicts with ``encoding``, ``confidence`` and
              ``language`` keys, sorted by descending confidence.  If the
              detector's input state is not ``InputState.HIGH_BYTE``, or no
              prober exceeds ``MINIMUM_THRESHOLD``, a one-element list
              holding ``detector.result`` is returned instead.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if not isinstance(byte_str, (bytes, bytearray)):
        raise TypeError('Expected object of type bytes or bytearray, got: '
                        '{}'.format(type(byte_str)))
    if not isinstance(byte_str, bytearray):
        byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            # Query the confidence once and reuse it for both the threshold
            # check and the reported value.
            confidence = prober.get_confidence()
            if confidence <= detector.MINIMUM_THRESHOLD:
                continue
            charset_name = prober.charset_name
            # NOTE(review): a prober presumably may report no charset name;
            # guard so that .lower() cannot fail on None.
            lower_charset_name = (charset_name or '').lower()
            # Use Windows encoding name instead of ISO-8859 if we saw any
            # extra Windows-specific bytes
            if (lower_charset_name.startswith('iso-8859')
                    and detector._has_win_bytes):
                charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
                                                        charset_name)
            results.append({
                'encoding': charset_name,
                'confidence': confidence,
                'language': prober.language,
            })
        if results:
            return sorted(results, key=lambda result: -result['confidence'])

    return [detector.result]

View file

@ -73,6 +73,7 @@ class CharSetGroupProber(CharSetProber):
continue
if state == ProbingState.FOUND_IT:
self._best_guess_prober = prober
self._state = ProbingState.FOUND_IT
return self.state
elif state == ProbingState.NOT_ME:
prober.active = False

View file

@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
Script which takes one or more file paths and reports on their detected
encodings
@ -45,10 +44,10 @@ def description_of(lines, name='stdin'):
if PY2:
name = name.decode(sys.getfilesystemencoding(), 'ignore')
if result['encoding']:
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
return '{}: {} with confidence {}'.format(name, result['encoding'],
result['confidence'])
else:
return '{0}: no result'.format(name)
return '{}: no result'.format(name)
def main(argv=None):
@ -69,7 +68,7 @@ def main(argv=None):
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin if PY2 else sys.stdin.buffer])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
version='%(prog)s {}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:

View file

@ -25,10 +25,12 @@ import sys
if sys.version_info < (3, 0):
PY2 = True
PY3 = False
base_str = (str, unicode)
string_types = (str, unicode)
text_type = unicode
iteritems = dict.iteritems
else:
PY2 = False
PY3 = True
base_str = (bytes, str)
string_types = (bytes, str)
text_type = str
iteritems = dict.items

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,310 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Metadata about languages used by our model training code for our
SingleByteCharSetProbers. Could be used for other things in the future.
This code is based on the language metadata from the uchardet project.
"""
from __future__ import absolute_import, print_function
from string import ascii_letters
# TODO: Add Ukrainian (KOI8-U)
class Language(object):
    """Metadata about a language useful for training models

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored de-duplicated and sorted.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
                 alphabet=None, wiki_start_pages=None):
        """Store the metadata and normalise the alphabet.

        :raises ValueError: If ``use_ascii`` is false and no ``alphabet`` is
                            supplied.
        """
        super(Language, self).__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError('Must supply alphabet if use_ascii is False')
        # At this point alphabet is always non-empty (ASCII letters were
        # added, or an explicit alphabet was required above), so it can be
        # normalised unconditionally.
        self.alphabet = ''.join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        return '{}({})'.format(self.__class__.__name__,
                               ', '.join('{}={!r}'.format(k, v)
                                         for k, v in self.__dict__.items()
                                         if not k.startswith('_')))
LANGUAGES = {'Arabic': Language(name='Arabic',
iso_code='ar',
use_ascii=False,
# We only support encodings that use isolated
# forms, because the current recommendation is
# that the rendering system handles presentation
# forms. This means we purposefully skip IBM864.
charsets=['ISO-8859-6', 'WINDOWS-1256',
'CP720', 'CP864'],
alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
wiki_start_pages=[u'الصفحة_الرئيسية']),
'Belarusian': Language(name='Belarusian',
iso_code='be',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM866', 'MacCyrillic'],
alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
wiki_start_pages=[u'Галоўная_старонка']),
'Bulgarian': Language(name='Bulgarian',
iso_code='bg',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'IBM855'],
alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
u'абвгдежзийклмнопрстуфхцчшщъьюя'),
wiki_start_pages=[u'Начална_страница']),
'Czech': Language(name='Czech',
iso_code='cz',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
wiki_start_pages=[u'Hlavní_strana']),
'Danish': Language(name='Danish',
iso_code='da',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'æøåÆØÅ',
wiki_start_pages=[u'Forside']),
'German': Language(name='German',
iso_code='de',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
alphabet=u'äöüßÄÖÜ',
wiki_start_pages=[u'Wikipedia:Hauptseite']),
'Greek': Language(name='Greek',
iso_code='el',
use_ascii=False,
charsets=['ISO-8859-7', 'WINDOWS-1253'],
alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
wiki_start_pages=[u'Πύλη:Κύρια']),
'English': Language(name='English',
iso_code='en',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Main_Page']),
'Esperanto': Language(name='Esperanto',
iso_code='eo',
# Q, W, X, and Y not used at all
use_ascii=False,
charsets=['ISO-8859-3'],
alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
'Spanish': Language(name='Spanish',
iso_code='es',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
wiki_start_pages=[u'Wikipedia:Portada']),
'Estonian': Language(name='Estonian',
iso_code='et',
use_ascii=False,
charsets=['ISO-8859-4', 'ISO-8859-13',
'WINDOWS-1257'],
# C, F, Š, Q, W, X, Y, Z, Ž are only for
# loanwords
alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
u'abdeghijklmnoprstuvõäöü'),
wiki_start_pages=[u'Esileht']),
'Finnish': Language(name='Finnish',
iso_code='fi',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÅÄÖŠŽåäöšž',
wiki_start_pages=[u'Wikipedia:Etusivu']),
'French': Language(name='French',
iso_code='fr',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
wiki_start_pages=[u'Wikipédia:Accueil_principal',
u'Bœuf (animal)']),
'Hebrew': Language(name='Hebrew',
iso_code='he',
use_ascii=False,
charsets=['ISO-8859-8', 'WINDOWS-1255'],
alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
wiki_start_pages=[u'עמוד_ראשי']),
'Croatian': Language(name='Croatian',
iso_code='hr',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stranica']),
'Hungarian': Language(name='Hungarian',
iso_code='hu',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
wiki_start_pages=[u'Kezdőlap']),
'Italian': Language(name='Italian',
iso_code='it',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
wiki_start_pages=[u'Pagina_principale']),
'Lithuanian': Language(name='Lithuanian',
iso_code='lt',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, and X not used at all
alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
wiki_start_pages=[u'Pagrindinis_puslapis']),
'Latvian': Language(name='Latvian',
iso_code='lv',
use_ascii=False,
charsets=['ISO-8859-13', 'WINDOWS-1257',
'ISO-8859-4'],
# Q, W, X, Y are only for loanwords
alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
wiki_start_pages=[u'Sākumlapa']),
'Macedonian': Language(name='Macedonian',
iso_code='mk',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
wiki_start_pages=[u'Главна_страница']),
'Dutch': Language(name='Dutch',
iso_code='nl',
use_ascii=True,
charsets=['ISO-8859-1', 'WINDOWS-1252'],
wiki_start_pages=[u'Hoofdpagina']),
'Polish': Language(name='Polish',
iso_code='pl',
# Q and X are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
wiki_start_pages=[u'Wikipedia:Strona_główna']),
'Portuguese': Language(name='Portuguese',
iso_code='pt',
use_ascii=True,
charsets=['ISO-8859-1', 'ISO-8859-15',
'WINDOWS-1252'],
alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
wiki_start_pages=[u'Wikipédia:Página_principal']),
'Romanian': Language(name='Romanian',
iso_code='ro',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'ăâîșțĂÂÎȘȚ',
wiki_start_pages=[u'Pagina_principală']),
'Russian': Language(name='Russian',
iso_code='ru',
use_ascii=False,
charsets=['ISO-8859-5', 'WINDOWS-1251',
'KOI8-R', 'MacCyrillic', 'IBM866',
'IBM855'],
alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
wiki_start_pages=[u'Заглавная_страница']),
'Slovak': Language(name='Slovak',
iso_code='sk',
use_ascii=True,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
wiki_start_pages=[u'Hlavná_stránka']),
'Slovene': Language(name='Slovene',
iso_code='sl',
# Q, W, X, Y are only used for foreign words.
use_ascii=False,
charsets=['ISO-8859-2', 'WINDOWS-1250'],
alphabet=(u'abcčdefghijklmnoprsštuvzž'
u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
wiki_start_pages=[u'Glavna_stran']),
# Serbian can be written in both Latin and Cyrillic, but there's no
# simple way to get the Latin alphabet pages from Wikipedia through
# the API, so for now we just support Cyrillic.
'Serbian': Language(name='Serbian',
iso_code='sr',
alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
u'абвгдђежзијклљмнњопрстћуфхцчџш'),
charsets=['ISO-8859-5', 'WINDOWS-1251',
'MacCyrillic', 'IBM855'],
wiki_start_pages=[u'Главна_страна']),
'Thai': Language(name='Thai',
iso_code='th',
use_ascii=False,
charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
wiki_start_pages=[u'หน้าหลัก']),
'Turkish': Language(name='Turkish',
iso_code='tr',
# Q, W, and X are not used by Turkish
use_ascii=False,
charsets=['ISO-8859-3', 'ISO-8859-9',
'WINDOWS-1254'],
alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
wiki_start_pages=[u'Ana_Sayfa']),
'Vietnamese': Language(name='Vietnamese',
iso_code='vi',
use_ascii=False,
# Windows-1258 is the only common 8-bit
# Vietnamese encoding supported by Python.
# From Wikipedia:
# For systems that lack support for Unicode,
# dozens of 8-bit Vietnamese code pages are
# available.[1] The most common are VISCII
# (TCVN 5712:1993), VPS, and Windows-1258.[3]
# Where ASCII is required, such as when
# ensuring readability in plain text e-mail,
# Vietnamese letters are often encoded
# according to Vietnamese Quoted-Readable
# (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
# though usage of either variable-width
# scheme has declined dramatically following
# the adoption of Unicode on the World Wide
# Web.
charsets=['WINDOWS-1258'],
alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}

View file

@ -26,10 +26,22 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
from collections import namedtuple
from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood
SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
['charset_name',
'language',
'char_to_order_map',
'language_model',
'typical_positive_ratio',
'keep_ascii_letters',
'alphabet'])
class SingleByteCharSetProber(CharSetProber):
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024 # 0.25 * SAMPLE_SIZE^2
@ -65,25 +77,25 @@ class SingleByteCharSetProber(CharSetProber):
if self._name_prober:
return self._name_prober.charset_name
else:
return self._model['charset_name']
return self._model.charset_name
@property
def language(self):
if self._name_prober:
return self._name_prober.language
else:
return self._model.get('language')
return self._model.language
def feed(self, byte_str):
if not self._model['keep_english_letter']:
# TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str)
if not byte_str:
return self.state
char_to_order_map = self._model['char_to_order_map']
for i, c in enumerate(byte_str):
# XXX: Order is in range 1-64, so one would think we want 0-63 here,
# but that leads to 27 more test failures than before.
order = char_to_order_map[c]
char_to_order_map = self._model.char_to_order_map
language_model = self._model.language_model
for char in byte_str:
order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
# XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
# CharacterCategory.SYMBOL is actually 253, so we use CONTROL
# to make it closer to the original intent. The only difference
@ -91,20 +103,21 @@ class SingleByteCharSetProber(CharSetProber):
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1
# TODO: Follow uchardet's lead and discount confidence for frequent
# control characters.
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
if order < self.SAMPLE_SIZE:
self._freq_char += 1
if self._last_order < self.SAMPLE_SIZE:
self._total_seqs += 1
if not self._reversed:
i = (self._last_order * self.SAMPLE_SIZE) + order
model = self._model['precedence_matrix'][i]
else: # reverse the order of the letters in the lookup
i = (order * self.SAMPLE_SIZE) + self._last_order
model = self._model['precedence_matrix'][i]
self._seq_counters[model] += 1
lm_cat = language_model[self._last_order][order]
else:
lm_cat = language_model[order][self._last_order]
self._seq_counters[lm_cat] += 1
self._last_order = order
charset_name = self._model['charset_name']
charset_name = self._model.charset_name
if self.state == ProbingState.DETECTING:
if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
confidence = self.get_confidence()
@ -125,7 +138,7 @@ class SingleByteCharSetProber(CharSetProber):
r = 0.01
if self._total_seqs > 0:
r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
self._total_seqs / self._model['typical_positive_ratio'])
self._total_seqs / self._model.typical_positive_ratio)
r = r * self._freq_char / self._total_char
if r >= 1.0:
r = 0.99

View file

@ -27,47 +27,57 @@
######################### END LICENSE BLOCK #########################
from .charsetgroupprober import CharSetGroupProber
from .sbcharsetprober import SingleByteCharSetProber
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
Latin5CyrillicModel, MacCyrillicModel,
Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber
from .langturkishmodel import Latin5TurkishModel
from .langbulgarianmodel import (ISO_8859_5_BULGARIAN_MODEL,
WINDOWS_1251_BULGARIAN_MODEL)
from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
# from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
# WINDOWS_1250_HUNGARIAN_MODEL)
from .langrussianmodel import (IBM855_RUSSIAN_MODEL, IBM866_RUSSIAN_MODEL,
ISO_8859_5_RUSSIAN_MODEL, KOI8_R_RUSSIAN_MODEL,
MACCYRILLIC_RUSSIAN_MODEL,
WINDOWS_1251_RUSSIAN_MODEL)
from .langthaimodel import TIS_620_THAI_MODEL
from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
from .sbcharsetprober import SingleByteCharSetProber
class SBCSGroupProber(CharSetGroupProber):
def __init__(self):
super(SBCSGroupProber, self).__init__()
hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
False, hebrew_prober)
# TODO: See if using ISO-8859-8 Hebrew model works better here, since
# it's actually the visual one
visual_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL,
True, hebrew_prober)
hebrew_prober.set_model_probers(logical_hebrew_prober,
visual_hebrew_prober)
# TODO: ORDER MATTERS HERE. I changed the order vs what was in master
# and several tests failed that did not before. Some thought
# should be put into the ordering, and we should consider making
# order not matter here, because that is very counter-intuitive.
self.probers = [
SingleByteCharSetProber(Win1251CyrillicModel),
SingleByteCharSetProber(Koi8rModel),
SingleByteCharSetProber(Latin5CyrillicModel),
SingleByteCharSetProber(MacCyrillicModel),
SingleByteCharSetProber(Ibm866Model),
SingleByteCharSetProber(Ibm855Model),
SingleByteCharSetProber(Latin7GreekModel),
SingleByteCharSetProber(Win1253GreekModel),
SingleByteCharSetProber(Latin5BulgarianModel),
SingleByteCharSetProber(Win1251BulgarianModel),
SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
# after we retrain model.
# SingleByteCharSetProber(Latin2HungarianModel),
# SingleByteCharSetProber(Win1250HungarianModel),
SingleByteCharSetProber(TIS620ThaiModel),
SingleByteCharSetProber(Latin5TurkishModel),
# SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
# SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
SingleByteCharSetProber(TIS_620_THAI_MODEL),
SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
hebrew_prober,
logical_hebrew_prober,
visual_hebrew_prober,
]
hebrew_prober = HebrewProber()
logical_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel,
False, hebrew_prober)
visual_hebrew_prober = SingleByteCharSetProber(Win1255HebrewModel, True,
hebrew_prober)
hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
self.probers.extend([hebrew_prober, logical_hebrew_prober,
visual_hebrew_prober])
self.reset()

View file

@ -266,7 +266,7 @@ class UniversalDetector(object):
'language': max_prober.language}
# Log all prober confidences if none met MINIMUM_THRESHOLD
if self.logger.getEffectiveLevel() == logging.DEBUG:
if self.logger.getEffectiveLevel() <= logging.DEBUG:
if self.result['encoding'] is None:
self.logger.debug('no probers hit minimum threshold')
for group_prober in self._charset_probers:
@ -280,7 +280,7 @@ class UniversalDetector(object):
prober.get_confidence())
else:
self.logger.debug('%s %s confidence = %s',
prober.charset_name,
prober.language,
prober.get_confidence())
group_prober.charset_name,
group_prober.language,
group_prober.get_confidence())
return self.result

View file

@ -5,5 +5,5 @@ from within setup.py and from chardet subpackages.
:author: Dan Blanchard (dan.blanchard@gmail.com)
"""
__version__ = "3.0.4"
__version__ = "4.0.0"
VERSION = __version__.split('.')

View file

@ -25,17 +25,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
from __future__ import unicode_literals
from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp
from . import css_match as cm
from . import css_types as ct
from .util import DEBUG, _QUIRKS, deprecated, SelectorSyntaxError # noqa: F401
from .util import DEBUG, SelectorSyntaxError # noqa: F401
__all__ = (
'DEBUG', "_QUIRKS", 'SelectorSyntaxError', 'SoupSieve',
'closest', 'comments', 'compile', 'filter', 'icomments',
'iselect', 'match', 'select', 'select_one'
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
'closest', 'compile', 'filter', 'iselect',
'match', 'select', 'select_one'
)
SoupSieve = cm.SoupSieve
@ -87,21 +86,6 @@ def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
return compile(select, namespaces, flags, **kwargs).filter(iterable)
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
def comments(tag, limit=0, flags=0, **kwargs):
"""Get comments only."""
return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)]
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
def icomments(tag, limit=0, flags=0, **kwargs):
"""Iterate comments only."""
for comment in cm.CommentsMatch(tag).get_comments(limit):
yield comment
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
"""Select a single tag."""

View file

@ -1,5 +1,4 @@
"""Meta related things."""
from __future__ import unicode_literals
from collections import namedtuple
import re
@ -186,5 +185,5 @@ def parse_version(ver, pre=False):
return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(1, 9, 1, "final")
__version_info__ = Version(2, 1, 0, "final")
__version__ = __version_info__._get_canonical()

View file

@ -1,11 +1,12 @@
"""CSS matcher."""
from __future__ import unicode_literals
from datetime import datetime
from . import util
import re
from .import css_types as ct
import unicodedata
import bs4
# Empty tag pattern (whitespace okay)
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
@ -43,6 +44,7 @@ RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2}
RE_DATETIME = re.compile(
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
FEB = 2
@ -53,7 +55,7 @@ FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
class FakeParent(object):
class _FakeParent(object):
"""
Fake parent class.
@ -73,7 +75,7 @@ class FakeParent(object):
return len(self.contents)
class Document(object):
class _DocumentNav(object):
"""Navigate a Beautiful Soup document."""
@classmethod
@ -87,58 +89,37 @@ class Document(object):
@staticmethod
def is_doc(obj):
"""Is `BeautifulSoup` object."""
import bs4
return isinstance(obj, bs4.BeautifulSoup)
@staticmethod
def is_tag(obj):
"""Is tag."""
import bs4
return isinstance(obj, bs4.Tag)
@staticmethod
def is_comment(obj):
"""Is comment."""
import bs4
return isinstance(obj, bs4.Comment)
@staticmethod
def is_declaration(obj): # pragma: no cover
"""Is declaration."""
import bs4
return isinstance(obj, bs4.Declaration)
@staticmethod
def is_cdata(obj): # pragma: no cover
def is_cdata(obj):
"""Is CDATA."""
import bs4
return isinstance(obj, bs4.Declaration)
return isinstance(obj, bs4.CData)
@staticmethod
def is_processing_instruction(obj): # pragma: no cover
"""Is processing instruction."""
import bs4
return isinstance(obj, bs4.ProcessingInstruction)
@staticmethod
def is_navigable_string(obj):
"""Is navigable string."""
import bs4
return isinstance(obj, bs4.NavigableString)
@staticmethod
def is_special_string(obj):
"""Is special string."""
import bs4
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction))
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
@classmethod
def is_content_string(cls, obj):
@ -150,7 +131,7 @@ class Document(object):
def create_fake_parent(el):
"""Create fake parent for a given element."""
return FakeParent(el)
return _FakeParent(el)
@staticmethod
def is_xml_tree(el):
@ -217,10 +198,13 @@ class Document(object):
is_tag = self.is_tag(child)
if no_iframe and is_tag and self.is_iframe(child):
last_child = child
while self.is_tag(last_child) and last_child.contents:
last_child = last_child.contents[-1]
next_good = last_child.next_element
if child.next_sibling is not None:
next_good = child.next_sibling
else:
last_child = child
while self.is_tag(last_child) and last_child.contents:
last_child = last_child.contents[-1]
next_good = last_child.next_element
yield child
if next_good is None:
break
@ -250,21 +234,27 @@ class Document(object):
return el.prefix
@staticmethod
def get_uri(el):
"""Get namespace `URI`."""
return el.namespace
@classmethod
def get_next_tag(cls, el):
def get_next(cls, el, tags=True):
"""Get next sibling tag."""
sibling = el.next_sibling
while not cls.is_tag(sibling) and sibling is not None:
while tags and not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.next_sibling
return sibling
@classmethod
def get_previous_tag(cls, el):
def get_previous(cls, el, tags=True):
"""Get previous sibling tag."""
sibling = el.previous_sibling
while not cls.is_tag(sibling) and sibling is not None:
while tags and not cls.is_tag(sibling) and sibling is not None:
sibling = sibling.previous_sibling
return sibling
@ -315,7 +305,7 @@ class Document(object):
"""Get classes."""
classes = cls.get_attribute_by_name(el, 'class', [])
if isinstance(classes, util.ustr):
if isinstance(classes, str):
classes = RE_NOT_WS.findall(classes)
return classes
@ -326,6 +316,11 @@ class Document(object):
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
)
def get_own_text(self, el, no_iframe=False):
    """Return the content-string nodes found among the contents of `el`.

    Nodes come from `get_contents` (with `no_iframe` forwarded) and are kept
    only when `is_content_string` accepts them.
    """

    own_text = []
    for child in self.get_contents(el, no_iframe=no_iframe):
        if self.is_content_string(child):
            own_text.append(child)
    return own_text
class Inputs(object):
"""Class for parsing and validating input items."""
@ -428,7 +423,7 @@ class Inputs(object):
return parsed
class CSSMatch(Document, object):
class _Match(object):
"""Perform CSS matching."""
def __init__(self, selectors, scope, namespaces, flags):
@ -476,7 +471,7 @@ class CSSMatch(Document, object):
if self.supports_namespaces():
namespace = ''
ns = el.namespace
ns = self.get_uri(el)
if ns:
namespace = ns
else:
@ -536,6 +531,57 @@ class CSSMatch(Document, object):
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
return None
def extended_language_filter(self, lang_range, lang_tag):
    """Check whether `lang_tag` satisfies the language range `lang_range`.

    Both values are lower-cased and split on `-`. The primary subtag must be
    equal to the primary range (or the range must be `*`); every remaining
    range must then be found, in order, among the remaining subtags.
    Returns `True` on a match and `False` otherwise.
    """

    # Wildcard runs are collapsed by `RE_WILD_STRIP` before splitting.
    range_parts = RE_WILD_STRIP.sub('-', lang_range).lower().split('-')
    tag_parts = lang_tag.lower().split('-')

    # Primary tag needs to match
    primary = range_parts[0]
    if primary != '*' and primary != tag_parts[0]:
        return False

    range_count = len(range_parts)
    tag_count = len(tag_parts)
    range_pos = 1
    tag_pos = 1

    # Match until we run out of ranges
    while range_pos < range_count:
        # Ran out of subtags, but we still have ranges
        if tag_pos >= tag_count:
            return False

        wanted = range_parts[range_pos]
        current = tag_parts[tag_pos]

        # Empty range
        if not wanted:
            return False

        if current == wanted:
            # Matched range
            range_pos += 1
        elif len(current) == 1:
            # Implicit wildcard cannot match singletons
            return False

        # Matched (explicitly or implicitly), so grab next subtag
        tag_pos += 1

    return True
def match_attribute_name(self, el, attr, prefix):
"""Match attribute name and return value if it exists."""
@ -660,12 +706,12 @@ class CSSMatch(Document, object):
if parent:
found = self.match_selectors(parent, relation)
elif relation[0].rel_type == REL_SIBLING:
sibling = self.get_previous_tag(el)
sibling = self.get_previous(el)
while not found and sibling:
found = self.match_selectors(sibling, relation)
sibling = self.get_previous_tag(sibling)
sibling = self.get_previous(sibling)
elif relation[0].rel_type == REL_CLOSE_SIBLING:
sibling = self.get_previous_tag(el)
sibling = self.get_previous(el)
if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation)
return found
@ -690,12 +736,12 @@ class CSSMatch(Document, object):
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
found = self.match_future_child(el, relation)
elif relation[0].rel_type == REL_HAS_SIBLING:
sibling = self.get_next_tag(el)
sibling = self.get_next(el)
while not found and sibling:
found = self.match_selectors(sibling, relation)
sibling = self.get_next_tag(sibling)
sibling = self.get_next(sibling)
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
sibling = self.get_next_tag(el)
sibling = self.get_next(el)
if sibling and self.is_tag(sibling):
found = self.match_selectors(sibling, relation)
return found
@ -736,7 +782,28 @@ class CSSMatch(Document, object):
def match_root(self, el):
"""Match element as root."""
return self.is_root(el)
is_root = self.is_root(el)
if is_root:
sibling = self.get_previous(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_previous(sibling, tags=False)
if is_root:
sibling = self.get_next(el, tags=False)
while is_root and sibling is not None:
if (
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
self.is_cdata(sibling)
):
is_root = False
else:
sibling = self.get_next(sibling, tags=False)
return is_root
def match_scope(self, el):
"""Match element as scope."""
@ -881,12 +948,23 @@ class CSSMatch(Document, object):
content = None
for contain_list in contains:
if content is None:
content = self.get_text(el, no_iframe=self.is_html)
if contain_list.own:
content = self.get_own_text(el, no_iframe=self.is_html)
else:
content = self.get_text(el, no_iframe=self.is_html)
found = False
for text in contain_list.text:
if text in content:
found = True
break
if contain_list.own:
for c in content:
if text in c:
found = True
break
if found:
break
else:
if text in content:
found = True
break
if not found:
match = False
return match
@ -1070,7 +1148,7 @@ class CSSMatch(Document, object):
for patterns in langs:
match = False
for pattern in patterns:
if pattern.match(found_lang):
if self.extended_language_filter(pattern, found_lang):
match = True
if not match:
break
@ -1152,7 +1230,7 @@ class CSSMatch(Document, object):
out_of_range = False
itype = self.get_attribute_by_name(el, 'type').lower()
itype = util.lower(self.get_attribute_by_name(el, 'type'))
mn = self.get_attribute_by_name(el, 'min', None)
if mn is not None:
mn = Inputs.parse_value(itype, mn)
@ -1207,6 +1285,21 @@ class CSSMatch(Document, object):
self.get_prefix(el) is not None
)
def match_placeholder_shown(self, el):
    """
    Match placeholder shown according to the HTML spec.

    The element matches when the text returned by `self.get_text(el)` is
    empty or is exactly one newline; a single newline does not count as
    content.
    """

    return self.get_text(el) in ('', '\n')
def match_selectors(self, el, selectors):
"""Check if element matches one of the selectors."""
@ -1239,6 +1332,9 @@ class CSSMatch(Document, object):
# Verify element is scope
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
continue
# Verify element has placeholder shown
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
continue
# Verify `nth` matches
if not self.match_nth(el, selector.nth):
continue
@ -1325,28 +1421,8 @@ class CSSMatch(Document, object):
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
class CommentsMatch(Document, object):
    """Matcher that finds comment nodes beneath a tag."""

    def __init__(self, el):
        """Validate `el` with `assert_valid_input` and remember it as the search root."""

        self.assert_valid_input(el)
        self.tag = el

    def get_comments(self, limit=0):
        """
        Yield the comment nodes found among the descendants of `self.tag`.

        A `limit` below 1 means "no limit"; otherwise iteration stops once
        `limit` comments have been yielded.
        """

        remaining = None if limit < 1 else limit

        for node in self.get_descendants(self.tag, tags=False):
            if not self.is_comment(node):
                continue
            yield node
            if remaining is None:
                continue
            remaining -= 1
            if remaining < 1:
                return
class CSSMatch(_DocumentNav, _Match):
"""The Beautiful Soup CSS match class."""
class SoupSieve(ct.Immutable):
@ -1392,19 +1468,6 @@ class SoupSieve(ct.Immutable):
else:
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
@util.deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
def comments(self, tag, limit=0):
    """
    Get comments only.

    Collects everything yielded by `CommentsMatch(tag).get_comments(limit)`
    into a list. Marked deprecated through `util.deprecated`.
    """

    return list(CommentsMatch(tag).get_comments(limit))
@util.deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
def icomments(self, tag, limit=0):
    """
    Iterate comments only.

    Lazily yields each item produced by
    `CommentsMatch(tag).get_comments(limit)`. Marked deprecated through
    `util.deprecated`.
    """

    matcher = CommentsMatch(tag)
    for found in matcher.get_comments(limit):
        yield found
def select_one(self, tag):
"""Select a single tag."""

View file

@ -1,10 +1,11 @@
"""CSS selector parser."""
from __future__ import unicode_literals
import re
from functools import lru_cache
from . import util
from . import css_match as cm
from . import css_types as ct
from .util import SelectorSyntaxError
import warnings
UNICODE_REPLACEMENT_CHAR = 0xFFFD
@ -59,6 +60,8 @@ PSEUDO_SIMPLE_NO_MATCH = {
# Complex pseudo classes that take selector lists
PSEUDO_COMPLEX = {
':contains',
':-soup-contains',
':-soup-contains-own',
':has',
':is',
':matches',
@ -110,11 +113,6 @@ VALUE = r'''
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)
# Definitions for quirks mode
QUIRKS_ATTR_IDENTIFIER = r'(?:(?:{esc}|(?!/\*)[^"\] \t\r\n\f])+?)'.format(esc=CSS_ESCAPES)
QUIRKS_ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=QUIRKS_ATTR_IDENTIFIER)
# Selector patterns
# IDs (`#id`)
@ -122,13 +120,11 @@ PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`)
PAT_TAG = r'(?:(?:{ident}|\*)?\|)?(?:{ident}|\*)'.format(ident=IDENTIFIER)
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
PAT_ATTR = r'\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}'.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Quirks attributes, like real attributes, but unquoted values can contain anything but whitespace and closing `]`.
PAT_QUIRKS_ATTR = r'''
\[{ws}*(?P<ns_attr>(?:(?:{ident}|\*)?\|)?{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR)
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
@ -199,12 +195,13 @@ FLG_INDETERMINATE = 0x20
FLG_OPEN = 0x40
FLG_IN_RANGE = 0x80
FLG_OUT_OF_RANGE = 0x100
FLG_PLACEHOLDER_SHOWN = 0x200
# Maximum cached patterns to store
_MAXCACHE = 500
@util.lru_cache(maxsize=_MAXCACHE)
@lru_cache(maxsize=_MAXCACHE)
def _cached_css_compile(pattern, namespaces, custom, flags):
"""Cached CSS compile."""
@ -253,7 +250,7 @@ def css_unescape(content, string=False):
codepoint = int(m.group(1)[1:], 16)
if codepoint == 0:
codepoint = UNICODE_REPLACEMENT_CHAR
value = util.uchr(codepoint)
value = chr(codepoint)
elif m.group(2):
value = m.group(2)[1:]
elif m.group(3):
@ -277,7 +274,7 @@ def escape(ident):
string.append('\\{}'.format(ident))
else:
for index, c in enumerate(ident):
codepoint = util.uord(c)
codepoint = ord(c)
if codepoint == 0x00:
string.append('\ufffd')
elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
@ -308,12 +305,7 @@ class SelectorPattern(object):
return self.name
def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
def match(self, selector, index, flags):
"""Match the selector."""
return self.re_pattern.match(selector, index)
@ -328,7 +320,7 @@ class SpecialPseudoPattern(SelectorPattern):
self.patterns = {}
for p in patterns:
name = p[0]
pattern = SelectorPattern(name, p[2])
pattern = p[3](name, p[2])
for pseudo in p[1]:
self.patterns[pseudo] = pattern
@ -340,12 +332,7 @@ class SpecialPseudoPattern(SelectorPattern):
return self.matched_name.get_name()
def enabled(self, flags):
"""Enabled."""
return True
def match(self, selector, index):
def match(self, selector, index, flags):
"""Match the selector."""
pseudo = None
@ -354,22 +341,13 @@ class SpecialPseudoPattern(SelectorPattern):
name = util.lower(css_unescape(m.group('name')))
pattern = self.patterns.get(name)
if pattern:
pseudo = pattern.match(selector, index)
pseudo = pattern.match(selector, index, flags)
if pseudo:
self.matched_name = pattern
return pseudo
class QuirkPattern(SelectorPattern):
    """Selector pattern that is only active in quirks mode."""

    def enabled(self, flags):
        """Return the `util._QUIRKS` bit of `flags`; non-zero when the quirks flag is present."""

        quirks_bit = util._QUIRKS
        return flags & quirks_bit
class _Selector(object):
"""
Intermediate selector class.
@ -446,11 +424,16 @@ class CSSParser(object):
SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
SpecialPseudoPattern(
(
("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG),
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR)
(
"pseudo_contains",
(':contains', ':-soup-contains', ':-soup-contains-own'),
PAT_PSEUDO_CONTAINS,
SelectorPattern
),
("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
)
),
SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
@ -461,7 +444,6 @@ class CSSParser(object):
SelectorPattern("class", PAT_CLASS),
SelectorPattern("tag", PAT_TAG),
SelectorPattern("attribute", PAT_ATTR),
QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR),
SelectorPattern("combine", PAT_COMBINE)
)
@ -471,24 +453,19 @@ class CSSParser(object):
self.pattern = selector.replace('\x00', '\ufffd')
self.flags = flags
self.debug = self.flags & util.DEBUG
self.quirks = self.flags & util._QUIRKS
self.custom = {} if custom is None else custom
def parse_attribute_selector(self, sel, m, has_selector, quirks):
def parse_attribute_selector(self, sel, m, has_selector):
"""Create attribute selector from the returned regex match."""
inverse = False
op = m.group('cmp')
case = util.lower(m.group('case')) if m.group('case') else None
parts = [css_unescape(a) for a in m.group('ns_attr').split('|')]
ns = ''
ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
attr = css_unescape(m.group('attr_name'))
is_type = False
pattern2 = None
if len(parts) > 1:
ns = parts[0]
attr = parts[1]
else:
attr = parts[0]
if case:
flags = re.I if case == 'i' else 0
elif util.lower(attr) == 'type':
@ -498,7 +475,7 @@ class CSSParser(object):
flags = 0
if op:
if m.group('value').startswith(('"', "'")) and not quirks:
if m.group('value').startswith(('"', "'")):
value = css_unescape(m.group('value')[1:-1], True)
else:
value = css_unescape(m.group('value'))
@ -525,13 +502,12 @@ class CSSParser(object):
elif op.startswith('|'):
# Value starts with word in dash separated list
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
elif op.startswith('!'):
# Equivalent to `:not([attr=value])`
pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
inverse = True
else:
# Value matches
pattern = re.compile(r'^%s$' % re.escape(value), flags)
if op.startswith('!'):
# Equivalent to `:not([attr=value])`
inverse = True
if is_type and pattern:
pattern2 = re.compile(pattern.pattern)
@ -552,13 +528,8 @@ class CSSParser(object):
def parse_tag_pattern(self, sel, m, has_selector):
    """
    Parse tag pattern from regex match.

    Reads the `tag_ns` and `tag_name` named groups of `m`, stores the
    resulting `ct.SelectorTag` on `sel.tag`, and returns `True` to signal
    that the compound selector now has a selector.
    """

    namespace = m.group('tag_ns')
    # The `tag_ns` group captures the trailing `|` separator, so strip it before unescaping.
    prefix = css_unescape(namespace[:-1]) if namespace else None
    tag = css_unescape(m.group('tag_name'))
    sel.tag = ct.SelectorTag(tag, prefix)
    has_selector = True
    return has_selector
@ -800,21 +771,11 @@ class CSSParser(object):
if not combinator:
combinator = WS_COMBINATOR
if not has_selector:
# The only way we don't fail is if we are at the root level and quirks mode is enabled,
# and we've found no other selectors yet in this compound selector.
if (not self.quirks or is_pseudo or combinator == COMMA_COMBINATOR or relations):
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
util.warn_quirks(
'You have attempted to use a combinator without a selector before it at position {}.'.format(index),
'the :scope pseudo class (or another appropriate selector) should be placed before the combinator.',
raise SelectorSyntaxError(
"The combinator '{}' at postion {}, must have a selector before it".format(combinator, index),
self.pattern,
index
)
sel.flags |= ct.SEL_SCOPE
if combinator == COMMA_COMBINATOR:
if not sel.tag and not is_pseudo:
@ -847,7 +808,14 @@ class CSSParser(object):
def parse_pseudo_contains(self, sel, m, has_selector):
"""Parse contains."""
values = m.group('values')
pseudo = util.lower(css_unescape(m.group('name')))
if pseudo == ":contains":
warnings.warn(
"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
FutureWarning
)
contains_own = pseudo == ":-soup-contains-own"
values = css_unescape(m.group('values'))
patterns = []
for token in RE_VALUES.finditer(values):
if token.group('split'):
@ -858,7 +826,7 @@ class CSSParser(object):
else:
value = css_unescape(value)
patterns.append(value)
sel.contains.append(ct.SelectorContains(tuple(patterns)))
sel.contains.append(ct.SelectorContains(tuple(patterns), contains_own))
has_selector = True
return has_selector
@ -872,20 +840,12 @@ class CSSParser(object):
continue
value = token.group('value')
if value.startswith(('"', "'")):
parts = css_unescape(value[1:-1], True).split('-')
value = css_unescape(value[1:-1], True)
else:
parts = css_unescape(value).split('-')
value = css_unescape(value)
patterns.append(value)
new_parts = []
first = True
for part in parts:
if part == '*' and first:
new_parts.append('(?!x\b)[a-z0-9]+?')
elif part != '*':
new_parts.append(('' if first else '(-(?!x\b)[a-z0-9]+)*?\\-') + re.escape(part))
if first:
first = False
patterns.append(re.compile(r'^{}(?:-.*)?$'.format(''.join(new_parts)), re.I))
sel.lang.append(ct.SelectorLang(patterns))
has_selector = True
@ -917,6 +877,7 @@ class CSSParser(object):
is_indeterminate = bool(flags & FLG_INDETERMINATE)
is_in_range = bool(flags & FLG_IN_RANGE)
is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
if self.debug: # pragma: no cover
if is_pseudo:
@ -937,6 +898,8 @@ class CSSParser(object):
print(' is_in_range: True')
if is_out_of_range:
print(' is_out_of_range: True')
if is_placeholder_shown:
print(' is_placeholder_shown: True')
if is_relative:
selectors.append(_Selector())
@ -953,7 +916,7 @@ class CSSParser(object):
elif key == 'pseudo_class':
has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
elif key == 'pseudo_element':
raise NotImplementedError("Psuedo-element found at position {}".format(m.start(0)))
raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
elif key == 'pseudo_contains':
has_selector = self.parse_pseudo_contains(sel, m, has_selector)
elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
@ -989,18 +952,8 @@ class CSSParser(object):
has_selector, sel = self.parse_combinator(
sel, m, has_selector, selectors, relations, is_pseudo, index
)
elif key in ('attribute', 'quirks_attribute'):
quirks = key == 'quirks_attribute'
if quirks:
temp_index = index + m.group(0).find('=') + 1
util.warn_quirks(
"You have attempted to use an attribute " +
"value that should have been quoted at position {}.".format(temp_index),
"the attribute value should be quoted.",
self.pattern,
temp_index
)
has_selector = self.parse_attribute_selector(sel, m, has_selector, quirks)
elif key == 'attribute':
has_selector = self.parse_attribute_selector(sel, m, has_selector)
elif key == 'tag':
if has_selector:
raise SelectorSyntaxError(
@ -1053,6 +1006,8 @@ class CSSParser(object):
selectors[-1].flags = ct.SEL_IN_RANGE
if is_out_of_range:
selectors[-1].flags = ct.SEL_OUT_OF_RANGE
if is_placeholder_shown:
selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN
return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
@ -1066,15 +1021,11 @@ class CSSParser(object):
end = (m.start(0) - 1) if m else (len(pattern) - 1)
if self.debug: # pragma: no cover
if self.quirks:
print('## QUIRKS MODE: Throwing out the spec!')
print('## PARSING: {!r}'.format(pattern))
while index <= end:
m = None
for v in self.css_tokens:
if not v.enabled(self.flags): # pragma: no cover
continue
m = v.match(pattern, index)
m = v.match(pattern, index, self.flags)
if m:
name = v.get_name()
if self.debug: # pragma: no cover
@ -1102,13 +1053,7 @@ class CSSParser(object):
print('## END PARSING')
def process_selectors(self, index=0, flags=0):
    """
    Process selectors.

    Builds the token iterator for `self.pattern` with `selector_iter` and
    hands it, together with `index` and `flags`, to `parse_selectors`,
    returning whatever `parse_selectors` returns.
    """

    return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
@ -1123,8 +1068,7 @@ CSS_LINK = CSSParser(
# CSS pattern for `:checked`
CSS_CHECKED = CSSParser(
'''
html|*:is(input[type=checkbox], input[type=radio])[checked],
html|select > html|option[selected]
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
@ -1150,23 +1094,23 @@ CSS_INDETERMINATE = CSSParser(
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|input[type="radio"][name][name!='']:not([checked])
html|input[type="radio"][name]:not([name='']):not([checked])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
CSS_DISABLED = CSSParser(
'''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input[type!=hidden], button, select, textarea, fieldset),
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input[type!=hidden], button, select, textarea, fieldset)
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`
CSS_ENABLED = CSSParser(
'''
html|*:is(input[type!=hidden], button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`
@ -1180,22 +1124,20 @@ CSS_OPTIONAL = CSSParser(
# CSS pattern for `:placeholder-shown`
CSS_PLACEHOLDER_SHOWN = CSSParser(
'''
html|*:is(
input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=password],
[type=number]
),
textarea
)[placeholder][placeholder!='']
html|input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=password],
[type=number]
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
html|textarea[placeholder]:not([placeholder=''])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature
CSS_NTH_OF_S_DEFAULT = CSSParser(
'*|*'

View file

@ -1,6 +1,6 @@
"""CSS selector structure items."""
from __future__ import unicode_literals
from . import util
import copyreg
from collections.abc import Hashable, Mapping
__all__ = (
'Selector',
@ -26,6 +26,7 @@ SEL_DIR_RTL = 0x40
SEL_IN_RANGE = 0x80
SEL_OUT_OF_RANGE = 0x100
SEL_DEFINED = 0x200
SEL_PLACEHOLDER_SHOWN = 0x400
class Immutable(object):
@ -85,7 +86,7 @@ class Immutable(object):
__str__ = __repr__
class ImmutableDict(util.Mapping):
class ImmutableDict(Mapping):
"""Hashable, immutable dictionary."""
def __init__(self, *args, **kwargs):
@ -94,8 +95,8 @@ class ImmutableDict(util.Mapping):
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if (
is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
is_dict and not all([isinstance(v, Hashable) for v in arg.values()]) or
not is_dict and not all([isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg])
):
raise TypeError('All values must be hashable')
@ -140,9 +141,9 @@ class Namespaces(ImmutableDict):
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
raise TypeError('Namespace keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('Namespace keys and values must be Unicode strings')
super(Namespaces, self).__init__(*args, **kwargs)
@ -159,9 +160,9 @@ class CustomSelectors(ImmutableDict):
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
if is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg.items()]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
elif not is_dict and not all([isinstance(k, str) and isinstance(v, str) for k, v in arg]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
super(CustomSelectors, self).__init__(*args, **kwargs)
@ -238,13 +239,14 @@ class SelectorAttribute(Immutable):
class SelectorContains(Immutable):
    """Selector contains rule."""

    __slots__ = ("text", "own", "_hash")

    def __init__(self, text, own=False):
        """
        Initialize.

        `text` and `own` are stored as-is through the `Immutable` initializer.
        `own` defaults to `False` so the one-argument form
        `SelectorContains(text)` keeps working.
        """

        super(SelectorContains, self).__init__(
            text=text,
            own=own
        )
@ -331,7 +333,7 @@ def _pickle(p):
def pickle_register(obj):
    """Allow object to be pickled by registering `_pickle` as its reduction function with `copyreg`."""

    # Register once, through the module-level `copyreg` import.
    copyreg.pickle(obj, _pickle)
pickle_register(Selector)

View file

@ -1,47 +1,17 @@
"""Utility."""
from __future__ import unicode_literals
from functools import wraps
from functools import wraps, lru_cache
import warnings
import sys
import struct
import os
import re
MODULE = os.path.dirname(__file__)
PY3 = sys.version_info >= (3, 0)
PY35 = sys.version_info >= (3, 5)
PY37 = sys.version_info >= (3, 7)
if PY3:
from functools import lru_cache # noqa F401
import copyreg # noqa F401
from collections.abc import Hashable, Mapping # noqa F401
ustr = str
bstr = bytes
unichar = chr
string = str
else:
from backports.functools_lru_cache import lru_cache # noqa F401
import copy_reg as copyreg # noqa F401
from collections import Hashable, Mapping # noqa F401
ustr = unicode # noqa: F821
bstr = str
unichar = unichr # noqa: F821
string = basestring # noqa: F821
DEBUG = 0x00001
_QUIRKS = 0x10000
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
LC_A = ord('a')
LC_Z = ord('z')
UC_A = ord('A')
UC_Z = ord('Z')
@lru_cache(maxsize=512)
def lower(string):
"""Lower."""
@ -52,38 +22,7 @@ def lower(string):
return ''.join(new_string)
def upper(string):  # pragma: no cover
    """
    Upper.

    Only the ASCII letters `a`-`z` are converted, by subtracting 32 from
    their code point; every other character is returned unchanged.
    """

    return ''.join(chr(ord(c) - 32) if 'a' <= c <= 'z' else c for c in string)
def uchr(i):
    """
    Allow getting Unicode character on narrow python builds.

    Tries `unichar(i)` first; if that raises `ValueError`, the code point is
    packed as a native int and decoded as UTF-32 instead.
    """

    try:
        char = unichar(i)
    except ValueError:  # pragma: no cover
        char = struct.pack('i', i).decode('utf-32')
    return char
def uord(c):
    """
    Get Unicode ordinal.

    A two-unit string is combined into a single code point only when it is a
    real surrogate pair (high unit in U+D800..U+DBFF, low unit in
    U+DC00..U+DFFF). Anything else is passed to `ord`, which raises
    `TypeError` for strings that are not exactly one character long.
    """

    if len(c) == 2:  # pragma: no cover
        high, low = ord(c[0]), ord(c[1])
        if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
            return (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
    return ord(c)
class SelectorSyntaxError(SyntaxError):
class SelectorSyntaxError(Exception):
"""Syntax error in a CSS selector."""
def __init__(self, msg, pattern=None, index=None):
@ -169,45 +108,3 @@ def get_pattern_context(pattern, index):
last = m.end(0)
return ''.join(text), line, col
class QuirksWarning(UserWarning):  # pragma: no cover
    """Warning category used when reporting quirks-mode selector usage."""
def warn_quirks(message, recommend, pattern, index):
    """
    Warn quirks.

    Emits a `QuirksWarning` through `warnings.warn_explicit`.

    - `message`: description of the offending construct, shown first.
    - `recommend`: text inserted after "In order to conform to the CSS spec, ".
    - `pattern`: the selector pattern being parsed.
    - `index`: position in `pattern` handed to `get_pattern_context`.
    """

    import traceback
    import bs4  # noqa: F401

    # Acquire source code line context
    paths = (MODULE, sys.modules['bs4'].__path__[0])
    previous = None
    filename = None
    lineno = None

    # Stop at the first stack entry whose file lives under this package or
    # `bs4`; the entry just before it is reported as the warning's origin.
    for entry in traceback.extract_stack():
        entry_file = entry.filename if PY35 else entry[0]
        if entry_file.startswith(paths):
            break
        previous = entry

    if previous:
        filename = previous.filename if PY35 else previous[0]
        lineno = previous.lineno if PY35 else previous[1]

    # Format pattern to show line and column position
    context, line = get_pattern_context(pattern, index)[0:2]

    # Display warning
    warnings.warn_explicit(
        "\nCSS selector pattern:\n" +
        " {}\n".format(message) +
        " This behavior is only allowed temporarily for Beautiful Soup's transition to Soup Sieve.\n" +
        " In order to conform to the CSS spec, {}\n".format(recommend) +
        " It is strongly recommended the selector be altered to conform to the CSS spec " +
        "as an exception will be raised for this case in the future.\n" +
        "pattern line {}:\n{}".format(line, context),
        QuirksWarning,
        filename,
        lineno
    )