mirror of
https://github.com/kemayo/leech
synced 2025-12-06 00:15:22 +01:00
Make the parser used for BeautifulSoup configurable, still default lxml
Refs #98
This commit is contained in:
parent
9ed2d54db7
commit
4d9c31b6ac
3 changed files with 17 additions and 5 deletions
|
|
@@ -91,7 +91,8 @@ def chapter_html(
|
||||||
image_options,
|
image_options,
|
||||||
titleprefix=None,
|
titleprefix=None,
|
||||||
normalize=False,
|
normalize=False,
|
||||||
session=None
|
session=None,
|
||||||
|
parser='lxml'
|
||||||
):
|
):
|
||||||
already_fetched_images = {}
|
already_fetched_images = {}
|
||||||
chapters = []
|
chapters = []
|
||||||
|
|
@@ -170,7 +171,7 @@ def chapter_html(
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
|
|
||||||
def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
|
def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
|
||||||
dates = list(story.dates())
|
dates = list(story.dates())
|
||||||
metadata = {
|
metadata = {
|
||||||
'title': story.title,
|
'title': story.title,
|
||||||
|
|
@@ -230,7 +231,8 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
|
||||||
story,
|
story,
|
||||||
image_options=image_options,
|
image_options=image_options,
|
||||||
normalize=normalize,
|
normalize=normalize,
|
||||||
session=session
|
session=session,
|
||||||
|
parser=parser
|
||||||
),
|
),
|
||||||
EpubFile(
|
EpubFile(
|
||||||
path='Styles/base.css',
|
path='Styles/base.css',
|
||||||
|
|
|
||||||
3
leech.py
3
leech.py
|
|
@@ -185,7 +185,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
|
||||||
normalize=normalize,
|
normalize=normalize,
|
||||||
output_dir=output_dir or options.get('output_dir', os.getcwd()),
|
output_dir=output_dir or options.get('output_dir', os.getcwd()),
|
||||||
allow_spaces=options.get('allow_spaces', False),
|
allow_spaces=options.get('allow_spaces', False),
|
||||||
session=session
|
session=session,
|
||||||
|
parser=options.get('parser', 'lxml')
|
||||||
)
|
)
|
||||||
logger.info("File created: " + filename)
|
logger.info("File created: " + filename)
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@@ -132,6 +132,13 @@ class Site:
|
||||||
"callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
|
"callback": lambda ctx, param, value: ctx.params.update({"spoilers": value and "skip" or "include"}),
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
SiteSpecificOption(
|
||||||
|
'parser',
|
||||||
|
'--parser',
|
||||||
|
help="Which HTML parser to use",
|
||||||
|
choices=('lxml', 'html5lib', 'html.parser', 'lxml-xml'),
|
||||||
|
default='lxml',
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@@ -176,7 +183,9 @@ class Site:
|
||||||
def login(self, login_details):
|
def login(self, login_details):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
|
def _soup(self, url, method=False, delay=0, retry=3, retry_delay=10, **kw):
|
||||||
|
if not method:
|
||||||
|
method = self.options.get('parser', 'lxml')
|
||||||
if url.startswith('http://') or url.startswith('https://'):
|
if url.startswith('http://') or url.startswith('https://'):
|
||||||
page = self.session.get(url, **kw)
|
page = self.session.get(url, **kw)
|
||||||
if not page:
|
if not page:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue