1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00
leech/sites/wattpad.py
David Lynch 5cb887f767 Move image processing into sites
The epub-builder still downloads the image, but all the html-mangling
is done in the extraction process now.

Turns footnotes into a chapter-object, for easier processing later on.
2025-03-22 19:39:16 -05:00

49 lines
1.6 KiB
Python

#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, Section, Chapter
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@register
class Wattpad(Site):
    """Wattpad

    Story extractor for wattpad.com. Story metadata and the list of parts are
    read from the ``/api/v3/stories/<id>`` JSON endpoint; the text of each
    part is read from the ``/apiv2/storytext`` endpoint.
    """

    # Group 1 is the story URL up to and including the numeric id (the
    # trailing "-story-title" slug is dropped); group 2 is the numeric id.
    # Both groups are mandatory, so a successful match always carries an id.
    _STORY_URL = re.compile(r'^(https?://(?:www\.)?wattpad\.com/story/(\d+))')

    @classmethod
    def matches(cls, url):
        """Return the story URL without its title slug, or None.

        Only ``/story/<id>`` URLs are recognised. Chapter URLs (see the
        example below) have no ``/story/`` segment and yield None.
        """
        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
        match = cls._STORY_URL.match(url)
        if match:
            # the story-title part is unnecessary
            return match.group(1)
        return None

    def extract(self, url):
        """Fetch the story at *url* and return it as a Section of Chapters.

        Args:
            url: A Wattpad story URL containing ``/story/<numeric id>``.

        Returns:
            A Section holding one Chapter per entry in the API's ``parts``
            list, in the order the API returns them.

        Raises:
            ValueError: If *url* does not contain a Wattpad story id.
        """
        match = self._STORY_URL.match(url)
        if not match:
            # Fail with a clear message instead of an AttributeError on None
            # (or a request for ".../stories/None").
            raise ValueError(f"Not a Wattpad story URL: {url}")
        workid = match.group(2)

        info = self.session.get(f"https://www.wattpad.com/api/v3/stories/{workid}").json()

        story = Section(
            title=info['title'],
            author=info['user']['name'],
            url=url,
            cover_url=info['cover']
        )

        for chapter in info['parts']:
            story.add(Chapter(
                title=chapter['title'],
                contents=self._chapter(chapter['id']),
                # "2020-05-03T22:14:29Z" -- the trailing "Z" is stripped before
                # parsing, so the resulting datetime is naive.
                date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
            ))

        # _finalize is not defined in this class; presumably inherited from Site.
        self._finalize(story)

        return story

    def _chapter(self, chapterid):
        """Return the text of chapter *chapterid* wrapped in a single <div>.

        The response body of the storytext endpoint is used as-is; the HTTP
        status is not checked here.
        """
        logger.info("Extracting chapter @ %s", chapterid)
        api = self.session.get(f"https://www.wattpad.com/apiv2/storytext?id={chapterid}")
        return '<div>' + api.text + '</div>'