diff --git a/.gitignore b/.gitignore index fe27ec7..55fa808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.epub *.mobi -*.json +/*.json leech.db leech.sqlite leech.cookies diff --git a/examples/deathworlders.json b/examples/deathworlders.json new file mode 100644 index 0000000..f6ede53 --- /dev/null +++ b/examples/deathworlders.json @@ -0,0 +1,7 @@ +{ + "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience", + "title": "Deathworlders", + "author": "Philip Richard Johnson, AKA Hambone", + "chapter_selector": "#block-book-navigation .menu a", + "content_selector": "article .node-content .field-name-body .field-item" +} diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json new file mode 100644 index 0000000..f266957 --- /dev/null +++ b/examples/heretical-edge.json @@ -0,0 +1,8 @@ +{ + "url": "https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Cerulean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main 
.entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/sites/__init__.py b/sites/__init__.py index b2556a4..5ba4094 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -2,6 +2,7 @@ import glob import os import uuid +import time import attr from bs4 import BeautifulSoup @@ -91,9 +92,16 @@ class Site: def login(self, login_details): raise NotImplementedError() - def _soup(self, url, method='html5lib', **kw): + def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw): page = self.session.get(url, **kw) if not page: + if retry and retry > 0: + delay = retry_delay + if page.headers.get('Retry-After'): + delay = int(page.headers['Retry-After']) + print("Load failed: waiting {}s to retry ({})".format(delay, page)) + time.sleep(delay) + return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) return BeautifulSoup(page.text, method) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1463f14..195faee 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -4,6 +4,7 @@ import attr import datetime import json import os.path +import urllib.parse from . 
import register, Site, Section, Chapter """ @@ -47,19 +48,19 @@ class Arbitrary(Site): story = Section( title=definition.title, - author=definition.author + author=definition.author, + url=url ) if definition.chapter_selector: soup = self._soup(definition.url) for chapter in soup.select(definition.chapter_selector): - chapter_url = str(chapter.get('href')) + chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href'))) story.add(Chapter( title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now(), - url=url + date=datetime.datetime.now() )) else: story.add(Chapter( diff --git a/sites/fictionlive.py b/sites/fictionlive.py new file mode 100644 index 0000000..6d588ba --- /dev/null +++ b/sites/fictionlive.py @@ -0,0 +1,86 @@ +#!/usr/bin/python + +import itertools +import datetime +import re +from . import register, Site, Section, Chapter + + +@register +class FictionLive(Site): + """fiction.live: it's... mostly smut, I think? Terrible smut. But, hey, I had a rec to follow.""" + @staticmethod + def matches(url): + # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT + match = re.match(r'^(https?://fiction\.live/stories/[^\/]+/[0-9a-zA-Z]+)/?.*', url) + if match: + return match.group(1) + + def extract(self, url): + workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1) + + response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json() + + story = Section( + title=response['t'], + author=response['u'][0]['n'], + # Could normalize the URL here from the returns, but I'd have to + # go look up how they handle special characters in titles... + url=url + ) + # There's a summary (or similar) in `d` and `b`, if I want to use that later. + + # TODO: extract these #special ones and send them off to an endnotes section? 
+ chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},) + + for prevc, currc, nextc in contextiterate(chapters): + # `id`, `title`, `ct`, `isFirst` + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/0/1448245168594 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1449266444062/1449615394752 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 + # i.e. format is [current timestamp] / [next timestamp - 1] + chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) + print("Extracting chapter from", chapter_url) + data = self.session.get(chapter_url).json() + html = [] + + updated = currc['ct'] + for segment in (d for d in data if not d.get('t', '').startswith('#special')): + updated = max(updated, segment['ct']) + # TODO: work out if this is actually enough types handled + # There's at least also a reader post type, which mostly seems to be used for die rolls. + if segment['nt'] == 'chapter': + html.extend(('