diff --git a/ebook/__init__.py b/ebook/__init__.py
index 9bc260a..8a99801 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -91,7 +91,8 @@ def chapter_html(
     image_options,
     titleprefix=None,
     normalize=False,
-    session=None
+    session=None,
+    parser='lxml'
 ):
     already_fetched_images = {}
     chapters = []
@@ -170,7 +171,7 @@ def chapter_html(
     return chapters
 
 
-def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
+def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
     dates = list(story.dates())
     metadata = {
         'title': story.title,
@@ -230,7 +231,8 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
                 story,
                 image_options=image_options,
                 normalize=normalize,
-                session=session
+                session=session,
+                parser=parser
             ),
             EpubFile(
                 path='Styles/base.css',
diff --git a/leech.py b/leech.py
index 35c7e1d..6c2c820 100755
--- a/leech.py
+++ b/leech.py
@@ -185,7 +185,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
             normalize=normalize,
             output_dir=output_dir or options.get('output_dir', os.getcwd()),
             allow_spaces=options.get('allow_spaces', False),
-            session=session
+            session=session,
+            parser=options.get('parser', 'lxml')
         )
         logger.info("File created: " + filename)
     else:
diff --git a/sites/__init__.py b/sites/__init__.py
index 9179730..394bad5 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -132,6 +132,13 @@ class Site:
                 "callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
             },
         ),
+        SiteSpecificOption(
+            'parser',
+            '--parser',
+            help="Which HTML parser to use",
+            choices=('lxml', 'html5lib', 'html.parser', 'lxml-xml'),
+            default='lxml',
+        ),
     ]
 
     @classmethod
@@ -176,7 +183,9 @@ class Site:
     def login(self, login_details):
         raise NotImplementedError()
 
-    def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
+    def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
+        if not method:
+            method = self.options.get('parser', 'lxml')
         if url.startswith('http://') or url.startswith('https://'):
             page = self.session.get(url, **kw)
             if not page: