1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00
leech/sites/royalroad.py
random human 23b76d2aac Fix royalroadl.com chapter dates
Since the timestamp provided with the chapter list is approximate, fetch
the actual chapter in order to get unixtime.
2018-08-30 04:03:29 +05:30

60 lines
2.2 KiB
Python

#!/usr/bin/python
import datetime
import http.client
import logging
import re
import urllib
import urllib.parse

from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class RoyalRoad(Site):
    """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""

    @staticmethod
    def matches(url):
        """Return the canonical fiction URL (with trailing slash) if `url` is a
        royalroadl.com fiction link, else None.

        e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
        """
        match = re.match(r'^(https?://(?:www\.)?royalroadl\.com/fiction/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def extract(self, url):
        """Fetch the fiction page for `url` and return a populated Section.

        Each chapter's page is fetched individually because the chapter list
        only carries an approximate timestamp; the chapter page exposes the
        exact unixtime.
        """
        workid = re.match(r'^https?://(?:www\.)?royalroadl\.com/fiction/(\d+)/?.*', url).group(1)
        soup = self._soup('https://www.royalroadl.com/fiction/{}'.format(workid))
        # should have gotten redirected, for a valid title

        # The site sends more response headers than http.client's default cap
        # allows; raise it temporarily. Restore in a finally so a failed
        # extraction doesn't leak the process-wide change (original code only
        # restored it on the success path).
        original_maxheaders = http.client._MAXHEADERS
        http.client._MAXHEADERS = 1000
        try:
            story = Section(
                title=soup.find('h1', property='name').string.strip(),
                author=soup.find('meta', property='books:author').get('content').strip(),
                url=soup.find('meta', property='og:url').get('content').strip()
            )
            for chapter in soup.select('#chapters tbody tr[data-url]'):
                chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))
                # Have to get exact publishing time from the chapter page
                chapter_soup = self._soup(chapter_url)
                updated = datetime.datetime.fromtimestamp(
                    int(chapter_soup.find(class_="profile-info").find('time').get('unixtime'))
                )
                story.add(Chapter(
                    title=chapter.find('a', href=True).string.strip(),
                    contents=self._chapter(chapter_url),
                    date=updated
                ))
        finally:
            http.client._MAXHEADERS = original_maxheaders
        return story

    def _chapter(self, url):
        """Fetch one chapter page and return its HTML content, with any
        author's note prepended (separated by an <hr/>)."""
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)
        content = soup.find('div', class_='chapter-content')
        # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well.
        author_note = soup.find('div', class_='author-note-portlet')
        return (author_note and (author_note.prettify() + '<hr/>') or '') + content.prettify()