mirror of https://github.com/kemayo/leech, synced 2025-12-15 12:56:41 +01:00
Support: RoyalRoad
This commit is contained in:
parent 7bb6da382c
commit e099f47e66

2 changed files with 61 additions and 0 deletions
README.md

@@ -33,6 +33,8 @@ Supports

* ArchiveOfOurOwn
  * Yes, it has its own built-in EPUB export, but the formatting is horrible
* Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably
* RoyalRoad
* Fiction.live (Anonkun)
* DeviantArt galleries/collections
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)
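With this commit, a RoyalRoad story should be fetchable the same way as the other supported sites. A hypothetical invocation, assuming the usual leech.py entry point (the exact CLI may differ between versions); the URL is the example from the new matcher:

    python3 leech.py https://royalroadl.com/fiction/6752/lament-of-the-fallen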
sites/royalroad.py (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/usr/bin/python

import http.client
import logging
import datetime
import re
import urllib.parse

from . import register, Site, Section, Chapter

logger = logging.getLogger(__name__)


@register
class RoyalRoad(Site):
    """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
    @staticmethod
    def matches(url):
        # e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
        match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def extract(self, url):
        workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1)
        soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid))
        # should have gotten redirected, for a valid title

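        # http.client raises "got more than 100 headers" when a response exceeds
        # its private _MAXHEADERS cap (100 by default); raise it temporarily so
        # the chapter fetches below survive header-heavy responses, then restore
        # it after the loop.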
        original_maxheaders = http.client._MAXHEADERS
        http.client._MAXHEADERS = 1000

        story = Section(
            title=soup.find('h1', property='name').string.strip(),
            author=soup.find('meta', property='books:author').get('content').strip(),
            url=soup.find('meta', property='og:url').get('content').strip()
        )

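        # Each row of the chapter table carries its relative URL in data-url
        # and a unix timestamp on its <time> tag.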
        for chapter in soup.select('#chapters tbody tr[data-url]'):
            chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))

            updated = datetime.datetime.fromtimestamp(
                int(chapter.find('time').get('unixtime')),
            )

            story.add(Chapter(
                title=chapter.find('a', href=True).string.strip(),
                contents=self._chapter(chapter_url),
                date=updated
            ))

        http.client._MAXHEADERS = original_maxheaders

        return story

    def _chapter(self, url):
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)
        content = soup.find('div', class_='chapter-content')

        # TODO: this could be more robust, and I don't know if there are post-chapter notes anywhere as well.
        author_note = soup.find('div', class_='author-note-portlet')

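        # If there's a pre-chapter author's note, prepend it, separated by a rule.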
        return (author_note.prettify() + '<hr/>' if author_note else '') + content.prettify()
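As a sanity check, the matcher normalizes chapter and title-slug URLs down to the bare fiction URL. A sketch of the expected behavior, assuming sites.royalroad is importable from the repo root:

    from sites.royalroad import RoyalRoad
    RoyalRoad.matches('https://royalroadl.com/fiction/6752/lament-of-the-fallen')
    # -> 'https://royalroadl.com/fiction/6752/'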