Mirror of https://github.com/kemayo/leech

Make the parser used for BeautifulSoup configurable, still default lxml

Refs #98
David Lynch 2025-03-04 22:58:13 -06:00
parent 9ed2d54db7
commit 4d9c31b6ac
3 changed files with 17 additions and 5 deletions
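
What the new option controls, as a minimal standalone sketch (not code from this commit): BeautifulSoup takes the parser name as its second argument, and the available backends trade speed against leniency toward broken markup, which is why letting a site choose one is useful.

    from bs4 import BeautifulSoup

    markup = "<p>Unclosed <b>tag"

    # 'lxml' is fast but needs the lxml package installed; 'html.parser'
    # ships with Python; 'html5lib' mirrors browser error recovery most
    # closely (and needs the html5lib package).
    for parser in ('lxml', 'html.parser', 'html5lib'):
        soup = BeautifulSoup(markup, parser)
        print(parser, soup.p)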

@@ -91,7 +91,8 @@ def chapter_html(
         image_options,
         titleprefix=None,
         normalize=False,
-        session=None
+        session=None,
+        parser='lxml'
 ):
     already_fetched_images = {}
     chapters = []
@@ -170,7 +171,7 @@ def chapter_html(
     return chapters
 
 
-def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
+def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
     dates = list(story.dates())
     metadata = {
         'title': story.title,
@@ -230,7 +231,8 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=None
             story,
             image_options=image_options,
             normalize=normalize,
-            session=session
+            session=session,
+            parser=parser
         ),
         EpubFile(
             path='Styles/base.css',

@@ -185,7 +185,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
            normalize=normalize,
            output_dir=output_dir or options.get('output_dir', os.getcwd()),
            allow_spaces=options.get('allow_spaces', False),
-           session=session
+           session=session,
+           parser=options.get('parser', 'lxml')
        )
        logger.info("File created: " + filename)
    else:

@@ -132,6 +132,13 @@ class Site:
                "callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
            },
        ),
+        SiteSpecificOption(
+            'parser',
+            '--parser',
+            help="Which HTML parser to use",
+            choices=('lxml', 'html5lib', 'html.parser', 'lxml-xml'),
+            default='lxml',
+        ),
    ]
 
    @classmethod
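
Once registered above, the flag would presumably be passed on the command line like any other site-specific option (e.g. --parser html5lib). A small check, not part of this commit, that a configured parser name is actually usable before starting a long run; bs4 raises FeatureNotFound when the backend for the requested parser is not installed:

    from bs4 import BeautifulSoup, FeatureNotFound

    def parser_available(name):
        try:
            BeautifulSoup("<p>probe</p>", name)
            return True
        except FeatureNotFound:
            return False

    for name in ('lxml', 'html5lib', 'html.parser', 'lxml-xml'):
        print(name, parser_available(name))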
@@ -176,7 +183,9 @@
     def login(self, login_details):
         raise NotImplementedError()
 
-    def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
+    def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
+        if not method:
+            method = self.options.get('parser', 'lxml')
         if url.startswith('http://') or url.startswith('https://'):
             page = self.session.get(url, **kw)
             if not page:
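
The fallback introduced in _soup, isolated as a standalone sketch (a plain dict stands in for self.options):

    def resolve_parser(method, options):
        # An explicit method argument wins; otherwise use the configured
        # 'parser' option, falling back to the old hard-coded default.
        if not method:
            method = options.get('parser', 'lxml')
        return method

    assert resolve_parser(False, {}) == 'lxml'
    assert resolve_parser(False, {'parser': 'html5lib'}) == 'html5lib'
    assert resolve_parser('lxml-xml', {'parser': 'html5lib'}) == 'lxml-xml'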