1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-28 19:23:13 +01:00

Pass through some more headers in the session

This commit is contained in:
David Lynch 2024-12-17 14:41:33 -06:00
parent 204807add6
commit 3fdbae5851
3 changed files with 28 additions and 11 deletions

View file

@@ -7,7 +7,6 @@ from bs4 import BeautifulSoup
import html
import unicodedata
import datetime
import requests
from attrs import define, asdict
html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
@@ -91,7 +90,8 @@ def chapter_html(
story,
image_options,
titleprefix=None,
normalize=False
normalize=False,
session=None
):
already_fetched_images = {}
chapters = []
@@ -100,7 +100,7 @@ def chapter_html(
if hasattr(chapter, '__iter__'):
# This is a Section
chapters.extend(chapter_html(
chapter, image_options=image_options, titleprefix=title, normalize=normalize
chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
))
else:
soup = BeautifulSoup(chapter.contents, 'html5lib')
@@ -118,7 +118,8 @@ def chapter_html(
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
always_convert=image_options.get('always_convert_images')
always_convert=image_options.get('always_convert_images'),
session=session
)
chapter.images.append(Image(
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
@@ -169,7 +170,7 @@ def chapter_html(
return chapters
def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False):
def generate_epub(story, cover_options={}, image_options={}, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None):
dates = list(story.dates())
metadata = {
'title': story.title,
@@ -181,6 +182,14 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
}
extra_metadata = {}
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0',
})
if story.url:
session.headers.update({
'Referer': story.url,
})
if story.summary:
extra_metadata['Summary'] = story.summary
if story.tags:
@@ -220,11 +229,12 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
*chapter_html(
story,
image_options=image_options,
normalize=normalize
normalize=normalize,
session=session
),
EpubFile(
path='Styles/base.css',
contents=requests.Session().get(
contents=session.get(
'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
filetype='text/css'
),

View file

@@ -86,7 +86,8 @@ def get_image_from_url(
image_format: str = "JPEG",
compress_images: bool = False,
max_image_size: int = 1_000_000,
always_convert: bool = False
always_convert: bool = False,
session: requests.Session = None
) -> Tuple[bytes, str, str]:
"""
Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
@@ -98,6 +99,8 @@ def get_image_from_url(
@param max_image_size: The maximum size of the image in bytes
@return: A tuple of the image data, the image format and the image mime type
"""
session = session or requests.Session()
try:
if url.startswith("https://www.filepicker.io/api/"):
logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
@@ -123,7 +126,7 @@ def get_image_from_url(
return imgdata, file_ext, f"image/{file_ext}"
print(url)
img = requests.Session().get(url)
img = session.get(url)
image = BytesIO(img.content)
image.seek(0)

View file

@@ -48,7 +48,10 @@ def create_session(cache):
pass
session.cookies.update(lwp_cookiejar)
session.headers.update({
'User-agent': USER_AGENT
'User-Agent': USER_AGENT,
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*', # this is essential for imgur
})
return session
@@ -181,7 +184,8 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
},
normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd()),
allow_spaces=options.get('allow_spaces', False)
allow_spaces=options.get('allow_spaces', False),
session=session
)
logger.info("File created: " + filename)
else: