Mirror of https://github.com/kemayo/leech (synced 2025-12-06 08:22:56 +01:00)

Compare commits: 10 commits, cbb8916718...82a76cd67b
| SHA1 |
|---|
| 82a76cd67b |
| 5cb887f767 |
| 81189f4e1d |
| 3c5a4bb75a |
| de6913a9af |
| d4e1214be3 |
| b2f15eb76c |
| 280b242a27 |
| 0066a148bb |
| abd9acb2a7 |
14 changed files with 211 additions and 112 deletions
@@ -1,8 +1,6 @@
 from .epub import make_epub, EpubFile
 from .cover import make_cover, make_cover_from_url
 from .image import get_image_from_url
-from sites import Image
-from bs4 import BeautifulSoup

 import html
 import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
     image_options,
     titleprefix=None,
     normalize=False,
-    session=None,
-    parser='lxml'
+    session=None
 ):
-    already_fetched_images = {}
+    images = {}
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
                 chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
-            soup = BeautifulSoup(chapter.contents, 'lxml')
-            if image_options.get('image_fetch'):
-                all_images = soup.find_all('img', src=True)
-                len_of_all_images = len(all_images)
-                # print(f"Found {len_of_all_images} images in chapter {i}")
-
-                for count, img in enumerate(all_images):
-                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    if img['src'] not in already_fetched_images:
-                        img_contents = get_image_from_url(
-                            img['src'],
-                            image_format=image_options.get('image_format'),
-                            compress_images=image_options.get('compress_images'),
-                            max_image_size=image_options.get('max_image_size'),
-                            always_convert=image_options.get('always_convert_images'),
-                            session=session
-                        )
-                        chapter.images.append(Image(
-                            path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
-                            contents=img_contents[0],
-                            content_type=img_contents[2]
-                        ))
-                        already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
-                    else:
-                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
-
-                    img['src'] = already_fetched_images.get(img['src'])
-                    if not img.has_attr('alt'):
-                        img['alt'] = f"Image {count} from chapter {i}"
-            else:
-                # Remove all images from the chapter so you don't get that annoying grey background.
-                for img in soup.find_all('img'):
-                    # Note: alt="" will be completely removed here, which is consitent with the semantics
-                    if img.parent.name.lower() == "figure":
-                        # TODO: figcaption?
-                        img.parent.replace_with(img.get('alt', '🖼'))
-                    else:
-                        img.replace_with(img.get('alt', '🖼'))
-
+            contents = chapter.contents
+            images.update(chapter.images)
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
             contents=html_template.format(
                 title=html.escape(title), text=contents)
         ))
-        # Add all pictures on this chapter as well.
-        for image in chapter.images:
-            # For/else syntax, check if the image path already exists, if it doesn't add the image.
-            # Duplicates are not allowed in the format.
-            for other_file in chapters:
-                if other_file.path == image.path:
-                    break
-            else:
-                chapters.append(EpubFile(
-                    path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
     if story.footnotes:
         chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
-            title="Footnotes", text='\n\n'.join(story.footnotes))))
+            title="Footnotes", text=story.footnotes.contents)))
+        images.update(story.footnotes.images)
+
+    for image in images.values():
+        img_contents = get_image_from_url(
+            image.url,
+            image_format=image_options.get('image_format'),
+            compress_images=image_options.get('compress_images'),
+            max_image_size=image_options.get('max_image_size'),
+            always_convert=image_options.get('always_convert_images'),
+            session=session
+        )
+        path = f'{story.id}/{image.path()}'
+        for chapterfile in chapters:
+            if chapterfile.path == path:
+                break
+        else:
+            chapters.append(
+                EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
+            )
+
     return chapters
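The loop added above relies on Python's for/else to skip duplicate image paths before appending. A quick standalone illustration of that pattern (the values here are made up, not from the repo):

```python
# for/else: the else branch runs only if the loop never hit `break`,
# which makes it work as an "append only if not already present" check.
existing_paths = ["images/a.png", "images/b.png"]   # illustrative values
new_path = "images/b.png"

for path in existing_paths:
    if path == new_path:
        break                        # duplicate found, do nothing
else:
    existing_paths.append(new_path)  # no duplicate seen, safe to add

print(existing_paths)  # unchanged, because "images/b.png" was already there
```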
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=None
         story,
         image_options=image_options,
         normalize=normalize,
-        session=session,
-        parser=parser
+        session=session
     ),
     EpubFile(
         path='Styles/base.css',
@@ -99,7 +99,7 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
-
+    logger.info("Downloading image: %s", url)
     session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
             return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
         return imgdata, file_ext, f"image/{file_ext}"

-        print(url)
+        # print(url)
         img = session.get(url)
         image = BytesIO(img.content)
         image.seek(0)
leech.py (23 changed lines)
@@ -13,6 +13,7 @@ from functools import reduce

 import sites
 import ebook
+import reader

 __version__ = 2
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
@@ -193,5 +194,27 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
         logger.warning("No ebook created")


+@cli.command()
+@click.argument('url')
+@click.option(
+    '--site-options',
+    default='{}',
+    help='JSON object encoding any site specific option.'
+)
+@click.option('--cache/--no-cache', default=True)
+@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
+@site_specific_options  # Includes other click.options specific to sites
+def read(url, site_options, cache, verbose, **other_flags):
+    """Launches an in terminal reader to preview or read a story."""
+    configure_logging(verbose)
+    session = create_session(cache)
+
+    site, url = sites.get(url)
+    options, login = create_options(site, site_options, other_flags)
+    story = open_story(site, url, session, login, options)
+
+    reader.launch_reader(story)
+
+
 if __name__ == '__main__':
     cli()
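The new read subcommand reuses the same option plumbing as download. A rough smoke-test sketch using click's built-in test runner; the URL is fake and the assumption that cli is importable from leech at module level is just that, an assumption:

```python
# Hedged sketch: exercise the new `read` command without typing it in a shell.
from click.testing import CliRunner
from leech import cli  # assumption: the click group is exposed as leech.cli

runner = CliRunner()
result = runner.invoke(cli, ['read', 'https://example.com/story', '--no-cache', '-v'])
print(result.exit_code, result.output[:200])
```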
reader/__init__.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+import pypandoc
+import pydoc
+import pick
+import sys
+
+
+def description(description):
+    """Decorator to make it possible to quickly attach a description to a function or class."""
+    def wrapper(action):
+        action.description = description
+        return action
+    return wrapper
+
+
+def launch_reader(story):
+    chapters = story.contents
+    chapter_index = -1
+
+    @description('Next Chapter')
+    def next_chapter_action():
+        nonlocal chapter_index
+        chapter_index += 1
+
+    @description('Start from the Beginning')
+    def start_from_beginning_action():
+        nonlocal chapter_index
+        chapter_index = 0
+
+    @description('Select Chapter')
+    def select_chapter_action():
+        nonlocal chapter_index
+        _, chapter_index = pick.pick(
+            [chapter.title for chapter in chapters],
+            "Which chapter?",
+            default_index=max(0, chapter_index)
+        )
+
+    @description('Quit')
+    def quit_action():
+        sys.exit(0)
+
+    actions = [next_chapter_action, start_from_beginning_action, select_chapter_action, quit_action]
+
+    while True:
+        _, action_index = pick.pick([action.description for action in actions], "What to do?")
+        actions[action_index]()
+        pydoc.pager(pypandoc.convert_text(chapters[chapter_index].contents, 'rst', format='html'))
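launch_reader leans on pick.pick, which returns an (option, index) pair. A tiny offline sketch of that dispatch idea with the picker stubbed out so it runs without a terminal; the stub and labels are illustrative:

```python
# pick.pick(options, title) normally returns (selected_option, selected_index);
# here a stub stands in for it so the dispatch logic can run anywhere.
def fake_pick(options, title):
    return options[0], 0  # pretend the user picked the first entry

actions = {
    'Next Chapter': lambda: print('advancing to the next chapter'),
    'Quit': lambda: print('quitting'),
}
label, index = fake_pick(list(actions), "What to do?")
actions[label]()  # -> advancing to the next chapter
```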
@@ -9,6 +9,7 @@ import time
 import logging
 import urllib
 import re
+import hashlib
 from attrs import define, field, Factory
 from bs4 import BeautifulSoup

@@ -24,9 +25,17 @@ def _default_uuid_string(self):

 @define
 class Image:
-    path: str
-    contents: str
-    content_type: str
+    url: str
+
+    def path(self):
+        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
+
+    def ext(self):
+        if self.url.startswith("data:image") and 'base64' in self.url:
+            head, base64data = self.url.split(',')
+            return str(head.split(';')[0].split('/')[1])
+        path = urllib.parse.urlparse(self.url).path
+        return os.path.splitext(path)[1]


 @define
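The reworked Image class derives a stable epub-internal path from the source URL, so the same remote image is only stored once. A minimal sketch of that naming idea; the URL is illustrative and the extension handling is simplified compared to ext() above:

```python
import hashlib

url = "https://example.com/pics/cover.png"   # illustrative
digest = hashlib.sha1(url.encode()).hexdigest()

# One URL always maps to one deterministic filename, so chapters that embed
# the same image end up referencing a single file inside the epub.
print(f"images/{digest}.png")
```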
@@ -34,7 +43,7 @@ class Chapter:
     title: str
     contents: str
     date: datetime.datetime = False
-    images: list = Factory(list)
+    images: dict = Factory(dict)


 @define
@@ -61,6 +70,13 @@ class Section:
     def __len__(self):
         return len(self.contents)

+    def everychapter(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter
+            else:
+                yield chapter
+
     def add(self, value, index=None):
         if index is not None:
             self.contents.insert(index, value)
@@ -68,11 +84,8 @@ class Section:
             self.contents.append(value)

     def dates(self):
-        for chapter in self.contents:
-            if hasattr(chapter, '__iter__'):
-                yield from chapter.dates()
-            elif chapter.date:
-                yield chapter.date
+        for chapter in self.everychapter():
+            yield chapter.date


 @define
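everychapter() flattens one level of nesting so callers like dates() no longer need to special-case sub-Sections. The same pattern on plain lists, which stand in for nested Sections here:

```python
# Nested lists stand in for nested Sections: anything iterable gets unrolled,
# anything else is yielded as a leaf chapter.
def everychapter(contents):
    for item in contents:
        if hasattr(item, '__iter__'):
            yield from item
        else:
            yield item

print(list(everychapter([[1, 2], 3, [4]])))  # [1, 2, 3, 4]
```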
@@ -321,6 +334,41 @@ class Site:

         return contents

+    def _finalize(self, story):
+        # Call this on a story after it's fully extracted to clean up things
+        for chapter in story:
+            if hasattr(chapter, '__iter__'):
+                self._finalize(chapter, story)
+            else:
+                self._process_images(chapter)
+
+        if self.footnotes:
+            story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
+            self.footnotes = []
+            self._process_images(story.footnotes)
+
+    def _process_images(self, chapter):
+        soup, base = self._soup(chapter.contents)
+
+        if self.options.get('image_fetch'):
+            for count, img in enumerate(soup.find_all('img', src=True)):
+                # logger.info(f"Image in {chapter.title}: {img['src']}")
+                if img['src'] not in chapter.images:
+                    chapter.images[img['src']] = Image(img['src'])
+
+                img['src'] = chapter.images.get(img['src']).path()
+        else:
+            # Remove all images from the chapter so you don't get that annoying grey background.
+            for img in soup.find_all('img'):
+                # Note: alt="" will be completely removed here, which is consitent with the semantics
+                if img.parent.name.lower() == "figure":
+                    # TODO: figcaption?
+                    img.parent.replace_with(img.get('alt', '🖼'))
+                else:
+                    img.replace_with(img.get('alt', '🖼'))
+
+        chapter.contents = str(soup)
+

 @define
 class SiteSpecificOption:
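When image fetching is disabled, _process_images collapses img tags to their alt text. A self-contained demonstration of that replace_with behaviour; the HTML snippet is made up:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Results: <img src="chart.png" alt="a bar chart"></p>', 'html.parser')
for img in soup.find_all('img'):
    # replace_with accepts a plain string, so the tag collapses to its alt text
    img.replace_with(img.get('alt', '🖼'))
print(soup)  # <p>Results: a bar chart</p>
```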
@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
                 date=updated
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, soup, base):
@@ -76,27 +76,38 @@ class Arbitrary(Site):
         else:
             # set of already processed urls. Stored to detect loops.
             found_content_urls = set()
-            content_url = definition.url
-            while content_url and content_url not in found_content_urls:
+            content_urls = [definition.url]
+
+            def process_content_url(content_url):
+                if content_url in found_content_urls:
+                    return None
                 found_content_urls.add(content_url)
                 for chapter in self._chapter(content_url, definition):
                     story.add(chapter)
-                if definition.next_selector:
+                return content_url
+
+            while content_urls:
+                for temp_url in content_urls:
+                    # stop inner loop once a new link is found
+                    if content_url := process_content_url(temp_url):
+                        break
+                # reset url list
+                content_urls = []
+                if content_url and definition.next_selector:
                     soup, base = self._soup(content_url)
                     next_link = soup.select(definition.next_selector)
                     if next_link:
-                        next_link_url = str(next_link[0].get('href'))
-                        if base:
-                            next_link_url = self._join_url(base, next_link_url)
-                        content_url = self._join_url(content_url, next_link_url)
-                    else:
-                        content_url = False
-                else:
-                    content_url = False
+                        for next_link_item in next_link:
+                            next_link_url = str(next_link_item.get('href'))
+                            if base:
+                                next_link_url = self._join_url(base, next_link_url)
+                            content_urls.append(self._join_url(content_url, next_link_url))

         if not story:
             raise SiteException("No story content found; check the content selectors")

+        self._finalize(story)
+
         return story

     def _chapter(self, url, definition, title=False):
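The rewritten pagination loop above uses an assignment expression inside the for so the last successfully processed URL survives the break. The bare pattern, with illustrative values:

```python
# The walrus operator binds the result and tests it in one step; after `break`
# the bound value is still available to the code that follows the loop.
def process(url):
    return url if url.startswith("https://") else None  # illustrative filter

content_url = None
for candidate in ["ftp://skip-me", "https://example.com/page-2"]:
    if content_url := process(candidate):
        break
print(content_url)  # https://example.com/page-2
```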
@@ -46,4 +46,6 @@ class DeviantArt(Stash):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story
@@ -69,6 +69,8 @@ class FanFictionNet(Site):
         else:
             story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))

+        self._finalize(story)
+
         return story

     def _chapter(self, url):
@@ -93,6 +93,8 @@ class FictionLive(Site):
                     date=datetime.datetime.fromtimestamp(updated / 1000.0)
                 ))

+        self._finalize(story)
+
         return story

@@ -68,8 +68,7 @@ class RoyalRoad(Site):

         http.client._MAXHEADERS = original_maxheaders

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story
@@ -40,6 +40,8 @@ class Stash(Site):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story

     def _chapter(self, url):
@@ -39,6 +39,8 @@ class Wattpad(Site):
                 date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, chapterid):
@@ -153,8 +153,7 @@ class XenForo(Site):
             chapter = Chapter(title=title, contents=contents, date=post_date)
             story.add(chapter)

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story
@@ -296,6 +295,14 @@ class XenForo(Site):
             del tag['style']
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
+        for tag in post.find_all('noscript'):
+            # TODO: strip the noscript from these?
+            # mostly this will be the lazyload images
+            tag.decompose()
+        for tag in post.select('img.lazyload[data-src]'):
+            tag['src'] = tag['data-url']
+            if tag['src'].startswith('proxy.php'):
+                tag['src'] = f"{self.domain}/{tag['src']}"
         self._clean(post, base)
         self._clean_spoilers(post, chapterid)
         return post.prettify()
@@ -303,36 +310,19 @@ class XenForo(Site):
     def _clean_spoilers(self, post, chapterid):
         # spoilers don't work well, so turn them into epub footnotes
         for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
-            spoilerTarget = spoiler.find(class_='SpoilerTarget')
-
-            # This is a bit of a hack, but it works
-            # This downloads the spoiler image
-            img_exist = list(spoilerTarget.find_all('img'))
-            if len(img_exist) > 0:
-                for i in img_exist:
-                    # For some weird reason, the images are duplicated, so this should skip some
-                    if img_exist.index(i) % 2 == 0:
-                        i.decompose()
-                    else:
-                        if not i.has_attr('src'):
-                            i['src'] = i['data-url']
-                        if i['src'].startswith('proxy.php'):
-                            i['src'] = f"{self.domain}/{i['src']}"
-                spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
-            else:
-                spoiler_title = spoiler.find(class_='SpoilerTitle')
-                if self.options['skip_spoilers']:
-                    link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
-                    if spoiler_title:
-                        link.string = spoiler_title.get_text()
-                else:
-                    if spoiler_title:
-                        link = f'[SPOILER: {spoiler_title.get_text()}]'
-                    else:
-                        link = '[SPOILER]'
-                new_spoiler = self._new_tag('div', class_="leech-spoiler")
-                new_spoiler.append(link)
-                spoiler.replace_with(new_spoiler)
+            spoiler_title = spoiler.find(class_='SpoilerTitle')
+            if self.options['skip_spoilers']:
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
+                if spoiler_title:
+                    link.string = spoiler_title.get_text()
+            else:
+                if spoiler_title:
+                    link = f'[SPOILER: {spoiler_title.get_text()}]'
+                else:
+                    link = '[SPOILER]'
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
+            new_spoiler.append(link)
+            spoiler.replace_with(new_spoiler)

     def _post_date(self, post):
         maybe_date = post.find(class_='DateTime')