mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Compare commits


10 commits

Author SHA1 Message Date
Zomega
82a76cd67b
Merge abd9acb2a7 into 5cb887f767 2025-03-26 21:21:52 +00:00
David Lynch
5cb887f767 Move image processing into sites
The epub-builder still downloads the image, but all the html-mangling
is done in the extraction process now.

Turns footnotes into a chapter-object, for easier processing later on.
2025-03-22 19:39:16 -05:00
David Lynch
81189f4e1d xenforo: minor fixes around images in spoilers 2025-03-22 00:16:11 -05:00
David Lynch
3c5a4bb75a
Merge pull request #100 from kpedro88/multiple-next-items
Handle multiple entries in next_link
2025-03-18 20:07:16 -05:00
Kevin Pedro
de6913a9af simplify algorithm 2025-03-08 09:48:32 -06:00
Kevin Pedro
d4e1214be3 return to loop-based algorithm 2025-03-08 09:40:42 -06:00
Kevin Pedro
b2f15eb76c satisfy linter 2025-03-05 21:03:35 -06:00
Kevin Pedro
280b242a27 stop loop once a new link is found 2025-03-05 20:56:47 -06:00
Kevin Pedro
0066a148bb process all next_link items 2025-03-05 20:56:47 -06:00
Will Oursler
abd9acb2a7 Creates a read subcommand that allows for reading the story in terminal.
Finalize merge, a few things needed switching around.

Use site-specific options post merge...
2018-10-08 15:32:46 -07:00
14 changed files with 211 additions and 112 deletions
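Commit 5cb887f767 above relocates image handling: extraction now records every <img> it encounters as an Image object keyed by its original src on the chapter (and on the new footnotes chapter), rewrites the markup to point at a hashed local path, and the epub builder then downloads each unique URL exactly once while assembling the book. A rough, self-contained sketch of that flow follows; SketchImage, SketchChapter, record_images, build_image_files and fetch_bytes are illustrative stand-ins, not the real leech API.

import hashlib
from dataclasses import dataclass, field


@dataclass
class SketchImage:
    url: str

    def path(self):
        # content-addressed local filename, mirroring the new sites.Image.path()
        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.png"


@dataclass
class SketchChapter:
    title: str
    contents: str
    images: dict = field(default_factory=dict)


def record_images(chapter, srcs):
    # extraction side: remember each <img> URL once, point the markup at the local path
    for src in srcs:
        image = chapter.images.setdefault(src, SketchImage(src))
        chapter.contents = chapter.contents.replace(src, image.path())


def build_image_files(chapters, fetch_bytes):
    # builder side: merge every chapter's image map, then fetch each unique URL once
    images = {}
    for chapter in chapters:
        images.update(chapter.images)
    return {image.path(): fetch_bytes(image.url) for image in images.values()}


chapter = SketchChapter('One', '<p><img src="http://example.com/a.png"></p>')
record_images(chapter, ['http://example.com/a.png'])
files = build_image_files([chapter], fetch_bytes=lambda url: b'...')
print(sorted(files))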


@@ -1,8 +1,6 @@
 from .epub import make_epub, EpubFile
 from .cover import make_cover, make_cover_from_url
 from .image import get_image_from_url
-from sites import Image
-from bs4 import BeautifulSoup

 import html
 import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
         image_options,
         titleprefix=None,
         normalize=False,
-        session=None,
-        parser='lxml'
+        session=None
 ):
-    already_fetched_images = {}
+    images = {}
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
                 chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
-            soup = BeautifulSoup(chapter.contents, 'lxml')
-            if image_options.get('image_fetch'):
-                all_images = soup.find_all('img', src=True)
-                len_of_all_images = len(all_images)
-                # print(f"Found {len_of_all_images} images in chapter {i}")
-                for count, img in enumerate(all_images):
-                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    if img['src'] not in already_fetched_images:
-                        img_contents = get_image_from_url(
-                            img['src'],
-                            image_format=image_options.get('image_format'),
-                            compress_images=image_options.get('compress_images'),
-                            max_image_size=image_options.get('max_image_size'),
-                            always_convert=image_options.get('always_convert_images'),
-                            session=session
-                        )
-                        chapter.images.append(Image(
-                            path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
-                            contents=img_contents[0],
-                            content_type=img_contents[2]
-                        ))
-                        already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
-                    else:
-                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
-                    img['src'] = already_fetched_images.get(img['src'])
-                    if not img.has_attr('alt'):
-                        img['alt'] = f"Image {count} from chapter {i}"
-            else:
-                # Remove all images from the chapter so you don't get that annoying grey background.
-                for img in soup.find_all('img'):
-                    # Note: alt="" will be completely removed here, which is consitent with the semantics
-                    if img.parent.name.lower() == "figure":
-                        # TODO: figcaption?
-                        img.parent.replace_with(img.get('alt', '🖼'))
-                    else:
-                        img.replace_with(img.get('alt', '🖼'))
+            contents = chapter.contents
+            images.update(chapter.images)
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
                 contents=html_template.format(
                     title=html.escape(title), text=contents)
             ))
-            # Add all pictures on this chapter as well.
-            for image in chapter.images:
-                # For/else syntax, check if the image path already exists, if it doesn't add the image.
-                # Duplicates are not allowed in the format.
-                for other_file in chapters:
-                    if other_file.path == image.path:
-                        break
-                else:
-                    chapters.append(EpubFile(
-                        path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))

     if story.footnotes:
         chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
-            title="Footnotes", text='\n\n'.join(story.footnotes))))
+            title="Footnotes", text=story.footnotes.contents)))
+        images.update(story.footnotes.images)
+
+    for image in images.values():
+        img_contents = get_image_from_url(
+            image.url,
+            image_format=image_options.get('image_format'),
+            compress_images=image_options.get('compress_images'),
+            max_image_size=image_options.get('max_image_size'),
+            always_convert=image_options.get('always_convert_images'),
+            session=session
+        )
+        path = f'{story.id}/{image.path()}'
+        for chapterfile in chapters:
+            if chapterfile.path == path:
+                break
+        else:
+            chapters.append(
                EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
+            )

     return chapters
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
                 story,
                 image_options=image_options,
                 normalize=normalize,
-                session=session,
-                parser=parser
+                session=session
             ),
             EpubFile(
                 path='Styles/base.css',


@@ -99,7 +99,7 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
+    logger.info("Downloading image: %s", url)
     session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
            return imgdata, file_ext, f"image/{file_ext}"
-        print(url)
+        # print(url)
         img = session.get(url)
         image = BytesIO(img.content)
         image.seek(0)


@@ -13,6 +13,7 @@ from functools import reduce
 import sites
 import ebook
+import reader

 __version__ = 2
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
@@ -193,5 +194,27 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
         logger.warning("No ebook created")


+@cli.command()
+@click.argument('url')
+@click.option(
+    '--site-options',
+    default='{}',
+    help='JSON object encoding any site specific option.'
+)
+@click.option('--cache/--no-cache', default=True)
+@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
+@site_specific_options  # Includes other click.options specific to sites
+def read(url, site_options, cache, verbose, **other_flags):
+    """Launches an in terminal reader to preview or read a story."""
+    configure_logging(verbose)
+    session = create_session(cache)
+    site, url = sites.get(url)
+    options, login = create_options(site, site_options, other_flags)
+    story = open_story(site, url, session, login, options)
+    reader.launch_reader(story)
+
+
 if __name__ == '__main__':
     cli()
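Given the wiring above, the new command takes the same cache, verbosity and site options as the existing download command, so an invocation along the lines of `leech read https://example.com/story --no-cache -v` should drop straight into the terminal reader. The `leech` entry-point name and the placeholder URL are assumptions here, not something this diff establishes.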

reader/__init__.py (new file)

@@ -0,0 +1,47 @@
+import pypandoc
+import pydoc
+import pick
+import sys
+
+
+def description(description):
+    """Decorator to make it possible to quickly attach a description to a function or class."""
+    def wrapper(action):
+        action.description = description
+        return action
+    return wrapper
+
+
+def launch_reader(story):
+    chapters = story.contents
+    chapter_index = -1
+
+    @description('Next Chapter')
+    def next_chapter_action():
+        nonlocal chapter_index
+        chapter_index += 1
+
+    @description('Start from the Beginning')
+    def start_from_beginning_action():
+        nonlocal chapter_index
+        chapter_index = 0
+
+    @description('Select Chapter')
+    def select_chapter_action():
+        nonlocal chapter_index
+        _, chapter_index = pick.pick(
+            [chapter.title for chapter in chapters],
+            "Which chapter?",
+            default_index=max(0, chapter_index)
+        )
+
+    @description('Quit')
+    def quit_action():
+        sys.exit(0)
+
+    actions = [next_chapter_action, start_from_beginning_action, select_chapter_action, quit_action]
+
+    while True:
+        _, action_index = pick.pick([action.description for action in actions], "What to do?")
+        actions[action_index]()
+        pydoc.pager(pypandoc.convert_text(chapters[chapter_index].contents, 'rst', format='html'))
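A minimal way to poke at the new reader module, as a sketch: it only needs an object with a contents list of chapters exposing title and contents (HTML), which is what open_story() produces. This assumes a checkout of this branch with the pick and pypandoc packages plus the pandoc binary installed, since those are the module's imports.

from types import SimpleNamespace

import reader

story = SimpleNamespace(contents=[
    SimpleNamespace(title='Chapter 1', contents='<p>It was a dark and stormy night.</p>'),
    SimpleNamespace(title='Chapter 2', contents='<p>The night was still dark, and still stormy.</p>'),
])

# Choosing "Next Chapter" advances the index and pages the pandoc-converted HTML through pydoc.pager.
reader.launch_reader(story)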


@@ -9,6 +9,7 @@ import time
 import logging
 import urllib
 import re
+import hashlib

 from attrs import define, field, Factory
 from bs4 import BeautifulSoup
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
 @define
 class Image:
-    path: str
-    contents: str
-    content_type: str
+    url: str
+
+    def path(self):
+        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
+
+    def ext(self):
+        if self.url.startswith("data:image") and 'base64' in self.url:
+            head, base64data = self.url.split(',')
+            return str(head.split(';')[0].split('/')[1])
+        path = urllib.parse.urlparse(self.url).path
+        return os.path.splitext(path)[1]


 @define
@@ -34,7 +43,7 @@ class Chapter:
     title: str
     contents: str
     date: datetime.datetime = False
-    images: list = Factory(list)
+    images: dict = Factory(dict)


 @define
@@ -61,6 +70,13 @@ class Section:
     def __len__(self):
         return len(self.contents)

+    def everychapter(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter
+            else:
+                yield chapter
+
     def add(self, value, index=None):
         if index is not None:
             self.contents.insert(index, value)
@@ -68,11 +84,8 @@ class Section:
             self.contents.append(value)

     def dates(self):
-        for chapter in self.contents:
-            if hasattr(chapter, '__iter__'):
-                yield from chapter.dates()
-            elif chapter.date:
-                yield chapter.date
+        for chapter in self.everychapter():
+            yield chapter.date


 @define
@@ -321,6 +334,41 @@ class Site:

         return contents

+    def _finalize(self, story):
+        # Call this on a story after it's fully extracted to clean up things
+        for chapter in story:
+            if hasattr(chapter, '__iter__'):
+                self._finalize(chapter, story)
+            else:
+                self._process_images(chapter)
+        if self.footnotes:
+            story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
+            self.footnotes = []
+            self._process_images(story.footnotes)
+
+    def _process_images(self, chapter):
+        soup, base = self._soup(chapter.contents)
+        if self.options.get('image_fetch'):
+            for count, img in enumerate(soup.find_all('img', src=True)):
+                # logger.info(f"Image in {chapter.title}: {img['src']}")
+                if img['src'] not in chapter.images:
+                    chapter.images[img['src']] = Image(img['src'])
+                img['src'] = chapter.images.get(img['src']).path()
+        else:
+            # Remove all images from the chapter so you don't get that annoying grey background.
+            for img in soup.find_all('img'):
+                # Note: alt="" will be completely removed here, which is consitent with the semantics
+                if img.parent.name.lower() == "figure":
+                    # TODO: figcaption?
+                    img.parent.replace_with(img.get('alt', '🖼'))
+                else:
+                    img.replace_with(img.get('alt', '🖼'))
+        chapter.contents = str(soup)
+
+
 @define
 class SiteSpecificOption:
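The reworked Image class above is now just a URL plus naming helpers: the extension is derived from the URL (or from the mime type for data: URLs) and the filename from a SHA-1 of the URL, so the same source image always maps to the same path. A quick check, assuming it runs from a checkout of this branch with leech's dependencies installed so that sites imports cleanly:

from sites import Image

img = Image("data:image/png;base64,iVBORw0KGgo=")
print(img.ext())   # "png", parsed out of the data: URL's mime type
print(img.path())  # "images/<40-character sha1 of the url>.png"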


@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
                 date=updated
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, soup, base):


@@ -76,27 +76,38 @@ class Arbitrary(Site):
        else:
            # set of already processed urls. Stored to detect loops.
            found_content_urls = set()
-            content_url = definition.url
-            while content_url and content_url not in found_content_urls:
+            content_urls = [definition.url]
+
+            def process_content_url(content_url):
+                if content_url in found_content_urls:
+                    return None
                found_content_urls.add(content_url)
                for chapter in self._chapter(content_url, definition):
                    story.add(chapter)
-                if definition.next_selector:
+                return content_url
+
+            while content_urls:
+                for temp_url in content_urls:
+                    # stop inner loop once a new link is found
+                    if content_url := process_content_url(temp_url):
+                        break
+                # reset url list
+                content_urls = []
+                if content_url and definition.next_selector:
                    soup, base = self._soup(content_url)
                    next_link = soup.select(definition.next_selector)
                    if next_link:
-                        next_link_url = str(next_link[0].get('href'))
-                        if base:
-                            next_link_url = self._join_url(base, next_link_url)
-                        content_url = self._join_url(content_url, next_link_url)
-                    else:
-                        content_url = False
-                else:
-                    content_url = False
+                        for next_link_item in next_link:
+                            next_link_url = str(next_link_item.get('href'))
+                            if base:
+                                next_link_url = self._join_url(base, next_link_url)
+                            content_urls.append(self._join_url(content_url, next_link_url))

        if not story:
            raise SiteException("No story content found; check the content selectors")

+        self._finalize(story)
+
        return story

    def _chapter(self, url, definition, title=False):
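PR #100 above generalises next_link handling: every element matched by next_selector becomes a candidate URL, and the inner scan stops as soon as one candidate turns out not to have been visited yet. Here is the same traversal pulled out of the Arbitrary class into a self-contained sketch, where crawl, extract and get_next_urls are hypothetical helpers rather than real leech functions:

def crawl(start_url, extract, get_next_urls):
    seen = set()
    pending = [start_url]
    chapters = []
    while pending:
        for url in pending:
            # stop the inner scan once an unvisited URL is found
            if url not in seen:
                seen.add(url)
                chapters.extend(extract(url))
                current = url
                break
        else:
            break  # every candidate was already visited; nothing new to follow
        # every link matched by the next selector on the newly visited page becomes a candidate
        pending = get_next_urls(current)
    return chapters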


@@ -46,4 +46,6 @@ class DeviantArt(Stash):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story


@@ -69,6 +69,8 @@ class FanFictionNet(Site):
         else:
             story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))

+        self._finalize(story)
+
         return story

     def _chapter(self, url):


@@ -93,6 +93,8 @@ class FictionLive(Site):
                 date=datetime.datetime.fromtimestamp(updated / 1000.0)
             ))

+        self._finalize(story)
+
         return story


@@ -68,8 +68,7 @@ class RoyalRoad(Site):
         http.client._MAXHEADERS = original_maxheaders

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story


@@ -40,6 +40,8 @@ class Stash(Site):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story

     def _chapter(self, url):


@@ -39,6 +39,8 @@ class Wattpad(Site):
                 date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, chapterid):


@@ -153,8 +153,7 @@ class XenForo(Site):
                 chapter = Chapter(title=title, contents=contents, date=post_date)
                 story.add(chapter)

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story
@@ -296,6 +295,14 @@ class XenForo(Site):
             del tag['style']
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
+        for tag in post.find_all('noscript'):
+            # TODO: strip the noscript from these?
+            # mostly this will be the lazyload images
+            tag.decompose()
+        for tag in post.select('img.lazyload[data-src]'):
+            tag['src'] = tag['data-url']
+            if tag['src'].startswith('proxy.php'):
+                tag['src'] = f"{self.domain}/{tag['src']}"
         self._clean(post, base)
         self._clean_spoilers(post, chapterid)
         return post.prettify()
@@ -303,36 +310,19 @@ class XenForo(Site):
     def _clean_spoilers(self, post, chapterid):
         # spoilers don't work well, so turn them into epub footnotes
         for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
-            spoilerTarget = spoiler.find(class_='SpoilerTarget')
-            if self.options['skip_spoilers']:
-                # This is a bit of a hack, but it works
-                # This downloads the spoiler image
-                img_exist = list(spoilerTarget.find_all('img'))
-                if len(img_exist) > 0:
-                    for i in img_exist:
-                        # For some weird reason, the images are duplicated, so this should skip some
-                        if img_exist.index(i) % 2 == 0:
-                            i.decompose()
-                        else:
-                            if not i.has_attr('src'):
-                                i['src'] = i['data-url']
-                            if i['src'].startswith('proxy.php'):
-                                i['src'] = f"{self.domain}/{i['src']}"
-                spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
-            else:
-                spoiler_title = spoiler.find(class_='SpoilerTitle')
-                if self.options['skip_spoilers']:
-                    link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
-                    if spoiler_title:
-                        link.string = spoiler_title.get_text()
-                else:
-                    if spoiler_title:
-                        link = f'[SPOILER: {spoiler_title.get_text()}]'
-                    else:
-                        link = '[SPOILER]'
-                new_spoiler = self._new_tag('div', class_="leech-spoiler")
-                new_spoiler.append(link)
-                spoiler.replace_with(new_spoiler)
+            spoiler_title = spoiler.find(class_='SpoilerTitle')
+            if self.options['skip_spoilers']:
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
+                if spoiler_title:
+                    link.string = spoiler_title.get_text()
+            else:
+                if spoiler_title:
+                    link = f'[SPOILER: {spoiler_title.get_text()}]'
+                else:
+                    link = '[SPOILER]'
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
+            new_spoiler.append(link)
+            spoiler.replace_with(new_spoiler)

     def _post_date(self, post):
         maybe_date = post.find(class_='DateTime')