1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 00:15:22 +01:00

Make the parser used for BeautifulSoup configurable, still default lxml

Refs #98
This commit is contained in:
David Lynch 2025-03-04 22:58:13 -06:00
parent 9ed2d54db7
commit 4d9c31b6ac
3 changed files with 17 additions and 5 deletions

View file

@@ -91,7 +91,8 @@ def chapter_html(
image_options, image_options,
titleprefix=None, titleprefix=None,
normalize=False, normalize=False,
session=None session=None,
parser='lxml'
): ):
already_fetched_images = {} already_fetched_images = {}
chapters = [] chapters = []
@@ -170,7 +171,7 @@ def chapter_html(
return chapters return chapters
def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None): def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
dates = list(story.dates()) dates = list(story.dates())
metadata = { metadata = {
'title': story.title, 'title': story.title,
@@ -230,7 +231,8 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
story, story,
image_options=image_options, image_options=image_options,
normalize=normalize, normalize=normalize,
session=session session=session,
parser=parser
), ),
EpubFile( EpubFile(
path='Styles/base.css', path='Styles/base.css',

View file

@@ -185,7 +185,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
normalize=normalize, normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd()), output_dir=output_dir or options.get('output_dir', os.getcwd()),
allow_spaces=options.get('allow_spaces', False), allow_spaces=options.get('allow_spaces', False),
session=session session=session,
parser=options.get('parser', 'lxml')
) )
logger.info("File created: " + filename) logger.info("File created: " + filename)
else: else:

View file

@@ -132,6 +132,13 @@ class Site:
"callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}), "callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
}, },
), ),
SiteSpecificOption(
'parser',
'--parser',
help="Which HTML parser to use",
choices=('lxml', 'html5lib', 'html.parser', 'lxml-xml'),
default='lxml',
),
] ]
@classmethod @classmethod
@@ -176,7 +183,9 @@ class Site:
def login(self, login_details): def login(self, login_details):
raise NotImplementedError() raise NotImplementedError()
def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw): def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
if not method:
method = self.options.get('parser', 'lxml')
if url.startswith('http://') or url.startswith('https://'): if url.startswith('http://') or url.startswith('https://'):
page = self.session.get(url, **kw) page = self.session.get(url, **kw)
if not page: if not page: