mirror of https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Quick take on wattpad
This commit is contained in:
parent 7c040c08a0
commit 23c7a1496c
1 changed file with 64 additions and 0 deletions
sites/wattpad.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/python

import http.client
import logging
import datetime
import re

from . import register, Site, Section, Chapter

logger = logging.getLogger(__name__)


@register
class Wattpad(Site):
    """Wattpad"""
    @classmethod
    def matches(cls, url):
        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
        match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)', url)
        if match:
            # the story-title part is unnecessary
            return match.group(1)

    def extract(self, url):
        # URL should give us the table of contents page for the story
        soup = self._soup(url)

        story = Section(
            title=soup.find('h1').string.strip(),
            author=soup.find('div', class_='author-info').strong.a.string.strip(),
            url=soup.find('link', rel='canonical')['href'],
            cover_url=soup.find('div', class_='cover').img['src']
        )

        info = soup.find('div', class_='author-info').small
        published = datetime.datetime.strptime(info['title'], 'First published: %b %d, %Y')
        info.find('span').decompose()
        updated = datetime.datetime.strptime(info.get_text().strip(), 'Updated %b %d, %Y')

        for chapter in soup.select('ul.table-of-contents a'):
            chapter_url = str(self._join_url(story.url, str(chapter['href'])))

            contents = self._chapter(chapter_url)

            story.add(Chapter(title=chapter.string.strip(), contents=contents))

        # fix up the dates: the page only exposes first-published and
        # last-updated, so pin those to the first and last chapters
        story[-1].date = updated
        story[0].date = published

        return story

    def _chapter(self, url):
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)

        # the chapter text lives in a <pre>; rename it so it renders as a block
        content = soup.find('article').find('div', class_="page").pre
        content.name = 'div'

        # the HTML attribute is aria-label (hyphenated), not aria_label
        for ad in content.find_all(attrs={'aria-label': "Advertisement"}):
            ad.decompose()

        content.extract()
        return content.prettify()
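
For quick reference, here is a standalone sketch of what matches() returns, using only the regex from the commit and the example URLs from its comments (normalize is a hypothetical name for illustration, not part of leech):

import re

def normalize(url):
    # keep only the scheme/host/story-id prefix, as Wattpad.matches does
    match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)', url)
    if match:
        return match.group(1)

print(normalize('https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess'))
# -> https://www.wattpad.com/story/208753031
print(normalize('https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess'))
# -> None: chapter URLs don't match, so this site won't claim them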
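
Similarly, a minimal check of the two strptime formats extract() relies on; the sample strings below are invented stand-ins shaped like the title attribute and text of Wattpad's author-info block:

import datetime

# hypothetical samples matching the formats parsed in extract()
published = datetime.datetime.strptime('First published: Jan 05, 2020', 'First published: %b %d, %Y')
updated = datetime.datetime.strptime('Updated Mar 14, 2021', 'Updated %b %d, %Y')
print(published.date(), updated.date())  # 2020-01-05 2021-03-14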