mirror of https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Quick take on wattpad
This commit is contained in:
parent 7c040c08a0
commit 23c7a1496c
1 changed file with 64 additions and 0 deletions
sites/wattpad.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/python

import http.client
import logging
import datetime
import re

from . import register, Site, Section, Chapter

logger = logging.getLogger(__name__)


@register
class Wattpad(Site):
    """Wattpad"""
    @classmethod
    def matches(cls, url):
        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
        match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)', url)
        if match:
            # the story-title part is unnecessary
            return match.group(1)

    def extract(self, url):
        # URL should give us the table of contents page for the story
        soup = self._soup(url)

        story = Section(
            title=soup.find('h1').string.strip(),
            author=soup.find('div', class_='author-info').strong.a.string.strip(),
            url=soup.find('link', rel='canonical')['href'],
            cover_url=soup.find('div', class_='cover').img['src']
        )

        info = soup.find('div', class_='author-info').small
        published = datetime.datetime.strptime(info['title'], 'First published: %b %d, %Y')
        info.find('span').decompose()
        updated = datetime.datetime.strptime(info.get_text().strip(), 'Updated %b %d, %Y')

        for chapter in soup.select('ul.table-of-contents a'):
            chapter_url = str(self._join_url(story.url, str(chapter['href'])))

            contents = self._chapter(chapter_url)

            story.add(Chapter(title=chapter.string.strip(), contents=contents))

        # fix up the dates: the page only exposes first-published and
        # last-updated, so pin those to the first and last chapters
        story[-1].date = updated
        story[0].date = published

        return story

    def _chapter(self, url):
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)

        # the chapter text lives in a <pre>; rename it so it renders as a block
        content = soup.find('article').find('div', class_="page").pre
        content.name = 'div'

        # the HTML attribute is aria-label (hyphenated), not aria_label
        for ad in content.find_all(attrs={'aria-label': "Advertisement"}):
            ad.decompose()

        content.extract()
        return content.prettify()
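
For quick reference, here is a standalone sketch of what matches() returns, using only the regex from the commit and the example URLs from its comments (normalize is a hypothetical name for illustration, not part of leech):

import re

def normalize(url):
    # keep only the scheme/host/story-id prefix, as Wattpad.matches does
    match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)', url)
    if match:
        return match.group(1)

print(normalize('https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess'))
# -> https://www.wattpad.com/story/208753031
print(normalize('https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess'))
# -> None: chapter URLs don't match, so this site won't claim them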
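
Similarly, a minimal check of the two strptime formats extract() relies on; the sample strings below are invented stand-ins shaped like the title attribute and text of Wattpad's author-info block:

import datetime

# hypothetical samples matching the formats parsed in extract()
published = datetime.datetime.strptime('First published: Jan 05, 2020', 'First published: %b %d, %Y')
updated = datetime.datetime.strptime('Updated Mar 14, 2021', 'Updated %b %d, %Y')
print(published.date(), updated.date())  # 2020-01-05 2021-03-14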