Mirror of https://github.com/kemayo/leech, synced 2025-12-06 16:33:16 +01:00
Quick take on wattpad
This commit is contained in:
parent 7c040c08a0
commit 23c7a1496c
1 changed file with 64 additions and 0 deletions
sites/wattpad.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/python

import http.client
import logging
import datetime
import re

from . import register, Site, Section, Chapter


logger = logging.getLogger(__name__)


@register
class Wattpad(Site):
    """Wattpad"""
    @classmethod
    def matches(cls, url):
        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
        match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)?.*', url)
        if match:
            # the story-title part is unnecessary
            return match.group(1)
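        # Hypothetical REPL check (illustration only): since just group(1) is
        # returned, the title slug after the numeric id is dropped, and a URL
        # that never matches the group yields None:
        #   >>> Wattpad.matches('https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess')
        #   'https://www.wattpad.com/story/208753031'
        #   >>> Wattpad.matches('https://example.com/story/1') is None
        #   True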

    def extract(self, url):
        # URL should give us the table of contents page for the story
        soup = self._soup(url)

        story = Section(
            title=soup.find('h1').string.strip(),
            author=soup.find('div', class_='author-info').strong.a.string.strip(),
            url=soup.find('link', rel='canonical')['href'],
            cover_url=soup.find('div', class_='cover').img['src']
        )

        info = soup.find('div', class_='author-info').small
        published = datetime.datetime.strptime(info['title'], 'First published: %b %d, %Y')
        info.find('span').decompose()
        updated = datetime.datetime.strptime(info.get_text().strip(), 'Updated %b %d, %Y')
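        # Illustrative strings the two strptime formats above would accept
        # (invented examples matching the formats, not values scraped live):
        #   'First published: Jun 18, 2019'  -> datetime(2019, 6, 18, 0, 0)
        #   'Updated Aug 2, 2019'            -> datetime(2019, 8, 2, 0, 0)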

        for chapter in soup.select('ul.table-of-contents a'):
            chapter_url = str(self._join_url(story.url, str(chapter['href'])))
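            # Assuming _join_url resolves like urllib.parse.urljoin, a
            # root-relative chapter href replaces the story path, e.g.:
            #   urljoin('https://www.wattpad.com/story/208753031', '/818687865-some-slug')
            #   == 'https://www.wattpad.com/818687865-some-slug'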

            contents = self._chapter(chapter_url)

            story.add(Chapter(title=chapter.string.strip(), contents=contents))

        # fix up the dates
        story[-1].date = updated
        story[0].date = published

        return story

    def _chapter(self, url):
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)

        content = soup.find('article').find('div', class_="page").pre
        content.name = 'div'

        # ad blocks carry the HTML attribute aria-label="Advertisement"
        for ad in content.find_all(attrs={'aria-label': "Advertisement"}):
            ad.decompose()

        content.extract()
        return content.prettify()
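
As a standalone illustration of the cleanup in _chapter, here is a minimal
sketch using bs4 directly (the HTML sample is invented; Wattpad's live markup
may differ):

    from bs4 import BeautifulSoup

    html = '''
    <article><div class="page"><pre>
    Story text.
    <div aria-label="Advertisement">sponsored</div>
    More story text.
    </pre></div></article>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('article').find('div', class_='page').pre
    content.name = 'div'  # re-tag <pre> as <div> so the text re-flows in readers
    for ad in content.find_all(attrs={'aria-label': 'Advertisement'}):
        ad.decompose()  # remove injected ad nodes entirely
    content.extract()  # detach from the document before serializing
    print(content.prettify())  # a clean <div> containing only the story text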