From 3fdbae58516f1fa92d0dfe7028eba3631c5a128b Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Tue, 17 Dec 2024 14:41:33 -0600
Subject: [PATCH] Pass through some more headers in the session

---
 ebook/__init__.py | 24 +++++++++++++++++-------
 ebook/image.py    |  7 +++++--
 leech.py          |  8 ++++++--
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/ebook/__init__.py b/ebook/__init__.py
index 077b5b6..f667da1 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -7,7 +7,6 @@ from bs4 import BeautifulSoup
 import html
 import unicodedata
 import datetime
-import requests
 from attrs import define, asdict
 
 html_template = '''
@@ -91,7 +90,8 @@ def chapter_html(
     story,
     image_options,
     titleprefix=None,
-    normalize=False
+    normalize=False,
+    session=None
 ):
     already_fetched_images = {}
     chapters = []
@@ -100,7 +100,7 @@ def chapter_html(
         if hasattr(chapter, '__iter__'):
             # This is a Section
             chapters.extend(chapter_html(
-                chapter, image_options=image_options, titleprefix=title, normalize=normalize
+                chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
             soup = BeautifulSoup(chapter.contents, 'html5lib')
@@ -118,7 +118,8 @@ def chapter_html(
                         image_format=image_options.get('image_format'),
                         compress_images=image_options.get('compress_images'),
                         max_image_size=image_options.get('max_image_size'),
-                        always_convert=image_options.get('always_convert_images')
+                        always_convert=image_options.get('always_convert_images'),
+                        session=session
                     )
                     chapter.images.append(Image(
                         path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
@@ -169,7 +170,7 @@ def chapter_html(
     return chapters
 
 
-def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False):
+def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
     dates = list(story.dates())
     metadata = {
         'title': story.title,
@@ -181,6 +182,14 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
     }
     extra_metadata = {}
 
+    session.headers.update({
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0',
+    })
+    if story.url:
+        session.headers.update({
+            'Referer': story.url,
+        })
+
     if story.summary:
         extra_metadata['Summary'] = story.summary
     if story.tags:
@@ -220,11 +229,12 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
         *chapter_html(
             story,
             image_options=image_options,
-            normalize=normalize
+            normalize=normalize,
+            session=session
         ),
         EpubFile(
             path='Styles/base.css',
-            contents=requests.Session().get(
+            contents=session.get(
                 'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
             filetype='text/css'
         ),
diff --git a/ebook/image.py b/ebook/image.py
index 1cd024a..ee1e1eb 100644
--- a/ebook/image.py
+++ b/ebook/image.py
@@ -86,7 +86,8 @@ def get_image_from_url(
     image_format: str = "JPEG",
     compress_images: bool = False,
     max_image_size: int = 1_000_000,
-    always_convert: bool = False
+    always_convert: bool = False,
+    session: requests.Session = None
 ) -> Tuple[bytes, str, str]:
     """
     Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
@@ -98,6 +99,8 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
+
+    session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
             logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
@@ -123,7 +126,7 @@ def get_image_from_url(
             return imgdata, file_ext, f"image/{file_ext}"
 
     print(url)
-    img = requests.Session().get(url)
+    img = session.get(url)
 
     image = BytesIO(img.content)
     image.seek(0)
diff --git a/leech.py b/leech.py
index 3b11abf..35c7e1d 100755
--- a/leech.py
+++ b/leech.py
@@ -48,7 +48,10 @@ def create_session(cache):
         pass
     session.cookies.update(lwp_cookiejar)
     session.headers.update({
-        'User-agent': USER_AGENT
+        'User-Agent': USER_AGENT,
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Accept': '*/*',  # this is essential for imgur
     })
     return session
 
@@ -181,7 +184,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
                 },
                 normalize=normalize,
                 output_dir=output_dir or options.get('output_dir', os.getcwd()),
-                allow_spaces=options.get('allow_spaces', False)
+                allow_spaces=options.get('allow_spaces', False),
+                session=session
            )
             logger.info("File created: " + filename)
         else:
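
A minimal usage sketch of the new pass-through (not part of the patch; the URL and header values below are illustrative): leech.py builds one requests.Session in create_session(), generate_epub() adds a browser User-Agent plus a Referer taken from story.url, and that same session is then reused for the base.css download and for every get_image_from_url() call instead of a throwaway requests.Session() per fetch.

    import requests

    from ebook.image import get_image_from_url

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 ...',  # placeholder UA string
        'Accept': '*/*',                  # the patch notes this is essential for imgur
    })

    # session= is optional; get_image_from_url() falls back to a fresh
    # requests.Session() when no session is supplied.
    data, ext, mimetype = get_image_from_url(
        'https://i.imgur.com/example.jpg',  # hypothetical image URL
        session=session,
    )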