diff --git a/.gitignore b/.gitignore index fe27ec7..55fa808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.epub *.mobi -*.json +./*.json leech.db leech.sqlite leech.cookies diff --git a/examples/deathworlders.json b/examples/deathworlders.json new file mode 100644 index 0000000..f6ede53 --- /dev/null +++ b/examples/deathworlders.json @@ -0,0 +1,7 @@ +{ + "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience", + "title": "Deathworlders", + "author": "Philip Richard Johnson, AKA Hambone", + "chapter_selector": "#block-book-navigation .menu a", + "content_selector": "article .node-content .field-name-body .field-item" +} diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json new file mode 100644 index 0000000..f266957 --- /dev/null +++ b/examples/heretical-edge.json @@ -0,0 +1,8 @@ +{ + "url": "https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Ceruelean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main 
.entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/sites/__init__.py b/sites/__init__.py index b2556a4..5ba4094 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -2,6 +2,7 @@ import glob import os import uuid +import time import attr from bs4 import BeautifulSoup @@ -91,9 +92,16 @@ class Site: def login(self, login_details): raise NotImplementedError() - def _soup(self, url, method='html5lib', **kw): + def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw): page = self.session.get(url, **kw) if not page: + if retry and retry > 0: + delay = retry_delay + if page.headers['Retry-After']: + delay = int(page.headers['Retry-After']) + print("Load failed: waiting {}s to retry ({})".format(delay, page)) + time.sleep(delay) + return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) return BeautifulSoup(page.text, method) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1463f14..195faee 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -4,6 +4,7 @@ import attr import datetime import json import os.path +import urllib from . 
import register, Site, Section, Chapter """ @@ -47,19 +48,19 @@ class Arbitrary(Site): story = Section( title=definition.title, - author=definition.author + author=definition.author, + url=url ) if definition.chapter_selector: soup = self._soup(definition.url) for chapter in soup.select(definition.chapter_selector): - chapter_url = str(chapter.get('href')) + chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href'))) story.add(Chapter( title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now(), - url=url + date=datetime.datetime.now() )) else: story.add(Chapter( diff --git a/sites/fictionlive.py b/sites/fictionlive.py new file mode 100644 index 0000000..6d588ba --- /dev/null +++ b/sites/fictionlive.py @@ -0,0 +1,86 @@ +#!/usr/bin/python + +import itertools +import datetime +import re +from . import register, Site, Section, Chapter + + +@register +class FictionLive(Site): + """fiction.live: it's... mostly smut, I think? Terrible smut. But, hey, I had a rec to follow.""" + @staticmethod + def matches(url): + # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT + match = re.match(r'^(https?://fiction\.live/stories/[^\/]+/[0-9a-zA-Z]+)/?.*', url) + if match: + return match.group(1) + + def extract(self, url): + workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1) + + response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json() + + story = Section( + title=response['t'], + author=response['u'][0]['n'], + # Could normalize the URL here from the returns, but I'd have to + # go look up how they handle special characters in titles... + url=url + ) + # There's a summary (or similar) in `d` and `b`, if I want to use that later. + + # TODO: extract these #special ones and send them off to an endnotes section? 
+ chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},) + + for prevc, currc, nextc in contextiterate(chapters): + # `id`, `title`, `ct`, `isFirst` + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/0/1448245168594 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1449266444062/1449615394752 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 + # i.e. format is [current timestamp] / [next timestamp - 1] + chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) + print("Extracting chapter from", chapter_url) + data = self.session.get(chapter_url).json() + html = [] + + updated = currc['ct'] + for segment in (d for d in data if not d.get('t', '').startswith('#special')): + updated = max(updated, segment['ct']) + # TODO: work out if this is actually enough types handled + # There's at least also a reader post type, which mostly seems to be used for die rolls. + if segment['nt'] == 'chapter': + html.extend(('
<p>', segment['b'].replace('<br>', '<br/>'), '</p>
')) + elif segment['nt'] == 'choice': + votes = {} + for vote in segment['votes']: + votechoices = segment['votes'][vote] + if type(votechoices) == int: + votechoices = (votechoices,) + for choice in votechoices: + choice = segment['choices'][int(choice)] + votes[choice] = votes.get(choice, 0) + 1 + choices = [(votes[v], v) for v in votes] + choices.sort(reverse=True) + html.append('<h3>Choices</h3><ul>{}</ul>'.format('\n'.join('<li>{}: {}</li>'.format(choice, count) for count, choice in choices))) + + story.add(Chapter( + title=currc['title'], + contents='\n'.join(html), + date=datetime.datetime.fromtimestamp(updated / 1000.0) + )) + + return story + + +# Stolen from the itertools docs +def contextiterate(iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." + a, b, c = itertools.tee(iterable, 3) + next(b, None) + next(c, None) + next(c, None) + return zip(a, b, c) diff --git a/sites/xenforo.py b/sites/xenforo.py index 3b9382b..6f16fb6 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -80,7 +80,10 @@ class XenForo(Site): threadmarks_link = soup.find(class_="threadmarksTrigger", href=True) if not threadmarks_link: - threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + try: + threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + except IndexError: + pass if not threadmarks_link: raise SiteException("No threadmarks")