Mirror of https://github.com/kemayo/leech

Make the parser used for BeautifulSoup configurable, still default lxml

Refs #98
David Lynch 2025-03-04 22:58:13 -06:00
parent 9ed2d54db7
commit 4d9c31b6ac
3 changed files with 17 additions and 5 deletions
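
What the new option controls, as a minimal standalone sketch (not code from this commit): BeautifulSoup takes the parser name as its second argument, and the available backends trade speed against leniency toward broken markup, which is why letting a site choose one is useful.

    from bs4 import BeautifulSoup

    markup = "<p>Unclosed <b>tag"

    # 'lxml' is fast but needs the lxml package installed; 'html.parser'
    # ships with Python; 'html5lib' mirrors browser error recovery most
    # closely (and needs the html5lib package).
    for parser in ('lxml', 'html.parser', 'html5lib'):
        soup = BeautifulSoup(markup, parser)
        print(parser, soup.p)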

@@ -91,7 +91,8 @@ def chapter_html(
         image_options,
         titleprefix=None,
         normalize=False,
-        session=None
+        session=None,
+        parser='lxml'
 ):
     already_fetched_images = {}
     chapters = []
@@ -170,7 +171,7 @@ def chapter_html(
     return chapters
 
 
-def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
+def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
     dates = list(story.dates())
     metadata = {
         'title': story.title,
@@ -230,7 +231,8 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=None
             story,
             image_options=image_options,
             normalize=normalize,
-            session=session
+            session=session,
+            parser=parser
         ),
         EpubFile(
             path='Styles/base.css',

@@ -185,7 +185,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
            normalize=normalize,
            output_dir=output_dir or options.get('output_dir', os.getcwd()),
            allow_spaces=options.get('allow_spaces', False),
-           session=session
+           session=session,
+           parser=options.get('parser', 'lxml')
        )
        logger.info("File created: " + filename)
    else:

@@ -132,6 +132,13 @@ class Site:
                "callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
            },
        ),
+        SiteSpecificOption(
+            'parser',
+            '--parser',
+            help="Which HTML parser to use",
+            choices=('lxml', 'html5lib', 'html.parser', 'lxml-xml'),
+            default='lxml',
+        ),
    ]
 
    @classmethod
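
Once registered above, the flag would presumably be passed on the command line like any other site-specific option (e.g. --parser html5lib). A small check, not part of this commit, that a configured parser name is actually usable before starting a long run; bs4 raises FeatureNotFound when the backend for the requested parser is not installed:

    from bs4 import BeautifulSoup, FeatureNotFound

    def parser_available(name):
        try:
            BeautifulSoup("<p>probe</p>", name)
            return True
        except FeatureNotFound:
            return False

    for name in ('lxml', 'html5lib', 'html.parser', 'lxml-xml'):
        print(name, parser_available(name))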
@@ -176,7 +183,9 @@
     def login(self, login_details):
         raise NotImplementedError()
 
-    def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
+    def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
+        if not method:
+            method = self.options.get('parser', 'lxml')
         if url.startswith('http://') or url.startswith('https://'):
             page = self.session.get(url, **kw)
             if not page:
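
The fallback introduced in _soup, isolated as a standalone sketch (a plain dict stands in for self.options):

    def resolve_parser(method, options):
        # An explicit method argument wins; otherwise use the configured
        # 'parser' option, falling back to the old hard-coded default.
        if not method:
            method = options.get('parser', 'lxml')
        return method

    assert resolve_parser(False, {}) == 'lxml'
    assert resolve_parser(False, {'parser': 'html5lib'}) == 'html5lib'
    assert resolve_parser('lxml-xml', {'parser': 'html5lib'}) == 'lxml-xml'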