From 1fe2e72b30e60e39b0bd63a3c802b5f09ff1c809 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Thu, 12 Oct 2017 17:40:26 -0500 Subject: [PATCH 1/7] Site handler for fiction.live --- sites/fictionlive.py | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 sites/fictionlive.py diff --git a/sites/fictionlive.py b/sites/fictionlive.py new file mode 100644 index 0000000..def19aa --- /dev/null +++ b/sites/fictionlive.py @@ -0,0 +1,84 @@ +#!/usr/bin/python + +import itertools +import datetime +import re +from . import register, Site, Section, Chapter + + +@register +class FictionLive(Site): + """Archive of Our Own: it has its own epub export, but the formatting is awful""" + @staticmethod + def matches(url): + # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT + match = re.match(r'^(https?://fiction\.live/stories/[^\/]+/[0-9a-zA-Z]+)/?.*', url) + if match: + return match.group(1) + + def extract(self, url): + workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1) + return self._extract_work(workid) + + def _extract_work(self, workid): + response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json() + + story = Section( + title=response['t'], + author=response['u'][0]['n'], + url='https://fiction.live/stories/{}/{}'.format(response['t'].replace(' ', '-'), workid) + ) + + # There's a summary in `d` and `b`. + + chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},) + + for prevc, currc, nextc in contextiterate(chapters): + # `id`, `title`, `ct`, `isFirst` + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/0/1448245168594 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1449266444062/1449615394752 + # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 + # i.e. 
format is [current timestamp] / [next timestamp - 1] + chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) + print("Extracting chapter from", chapter_url) + data = self.session.get(chapter_url).json() + html = [] + + updated = currc['ct'] + for segment in (d for d in data if not d.get('t', '').startswith('#special')): + updated = max(updated, segment['ct']) + if segment['nt'] == 'chapter': + html.extend(('
', segment['b'].replace('
', '
'), '
')) + elif segment['nt'] == 'choice': + votes = {} + for vote in segment['votes']: + votechoices = segment['votes'][vote] + if type(votechoices) == int: + votechoices = (votechoices,) + for choice in votechoices: + choice = segment['choices'][int(choice)] + votes[choice] = votes.get(choice, 0) + 1 + choices = [(votes[v], v) for v in votes] + choices.sort(reverse=True) + html.append('

') + + story.add(Chapter( + title=currc['title'], + contents='\n'.join(html), + date=datetime.datetime.fromtimestamp(updated / 1000.0) + )) + + return story + + +# Stolen from the itertools docs +def contextiterate(iterable): + "s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ..." + a, b, c = itertools.tee(iterable, 3) + next(b, None) + next(c, None) + next(c, None) + return zip(a, b, c) From 3a24cad8266fec11c7909ff1abb4eafddae9d867 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Thu, 12 Oct 2017 18:38:58 -0500 Subject: [PATCH 2/7] Fill in some TODOs for fiction.live --- sites/fictionlive.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sites/fictionlive.py b/sites/fictionlive.py index def19aa..6d588ba 100644 --- a/sites/fictionlive.py +++ b/sites/fictionlive.py @@ -8,7 +8,7 @@ from . import register, Site, Section, Chapter @register class FictionLive(Site): - """Archive of Our Own: it has its own epub export, but the formatting is awful""" + """fiction.live: it's... mostly smut, I think? Terrible smut. But, hey, I had a rec to follow.""" @staticmethod def matches(url): # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT @@ -18,19 +18,19 @@ class FictionLive(Site): def extract(self, url): workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1) - return self._extract_work(workid) - def _extract_work(self, workid): response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json() story = Section( title=response['t'], author=response['u'][0]['n'], - url='https://fiction.live/stories/{}/{}'.format(response['t'].replace(' ', '-'), workid) + # Could normalize the URL here from the returns, but I'd have to + # go look up how they handle special characters in titles... + url=url ) + # There's a summary (or similar) in `d` and `b`, if I want to use that later. - # There's a summary in `d` and `b`. 
- + # TODO: extract these #special ones and send them off to an endnotes section? chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},) for prevc, currc, nextc in contextiterate(chapters): @@ -47,6 +47,8 @@ class FictionLive(Site): updated = currc['ct'] for segment in (d for d in data if not d.get('t', '').startswith('#special')): updated = max(updated, segment['ct']) + # TODO: work out if this is actually enough types handled + # There's at least also a reader post type, which mostly seems to be used for die rolls. if segment['nt'] == 'chapter': html.extend(('
', segment['b'].replace('
', '
'), '
'))
                 elif segment['nt'] == 'choice':
From dc0d2162fbee2bb26598f93ed7a6e6cea1b8391a Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Sun, 22 Oct 2017 17:06:40 -0500
Subject: [PATCH 3/7] Arbitrary handler had misplaced url arg

---
 sites/arbitrary.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 1463f14..9a56bf6 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -47,7 +47,8 @@ class Arbitrary(Site):
 
         story = Section(
             title=definition.title,
-            author=definition.author
+            author=definition.author,
+            url=url
         )
 
         if definition.chapter_selector:
@@ -58,8 +59,7 @@ class Arbitrary(Site):
                     title=chapter.string,
                     contents=self._chapter(chapter_url, definition),
                     # TODO: better date detection
-                    date=datetime.datetime.now(),
-                    url=url
+                    date=datetime.datetime.now()
                 ))
         else:
             story.add(Chapter(
From 257ab69394939510e6497b1eb139e93ae420b585 Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Sun, 22 Oct 2017 17:31:10 -0500
Subject: [PATCH 4/7] Arbitrary handler: canonicalize URLs

---
Note: the submodule is imported explicitly (`import urllib.parse`). A bare
`import urllib` does not load `urllib.parse`, so `urllib.parse.urljoin`
would only work if some other module had already imported it.

 sites/arbitrary.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 9a56bf6..195faee 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -4,6 +4,7 @@ import attr
 import datetime
 import json
 import os.path
+import urllib.parse
 from . 
import register, Site, Section, Chapter
 
 """
@@ -54,7 +55,7 @@ class Arbitrary(Site):
         if definition.chapter_selector:
             soup = self._soup(definition.url)
             for chapter in soup.select(definition.chapter_selector):
-                chapter_url = str(chapter.get('href'))
+                chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href')))
                 story.add(Chapter(
                     title=chapter.string,
                     contents=self._chapter(chapter_url, definition),
From df8e67d3e102d8f4640bbf757248accd9f95520c Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Sun, 22 Oct 2017 17:33:43 -0500
Subject: [PATCH 5/7] Include some examples for the arbitrary handler

---
Note: the ignore pattern is `/*.json`, not `./*.json`. gitignore gives a
leading `./` no special meaning, so `./*.json` matches nothing. A leading
`/` anchors the pattern to the directory holding the .gitignore, so JSON
files at the root are ignored while examples/*.json stay tracked.

 .gitignore                   | 2 +-
 examples/deathworlders.json  | 7 +++++++
 examples/heretical-edge.json | 8 ++++++++
 examples/practical1.json     | 8 ++++++++
 examples/practical2.json     | 8 ++++++++
 examples/practical3.json     | 8 ++++++++
 6 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 examples/deathworlders.json
 create mode 100644 examples/heretical-edge.json
 create mode 100644 examples/practical1.json
 create mode 100644 examples/practical2.json
 create mode 100644 examples/practical3.json

diff --git a/.gitignore b/.gitignore
index fe27ec7..55fa808 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 *.epub
 *.mobi
-*.json
+/*.json
 leech.db
 leech.sqlite
 leech.cookies
diff --git a/examples/deathworlders.json b/examples/deathworlders.json
new file mode 100644
index 0000000..f6ede53
--- /dev/null
+++ b/examples/deathworlders.json
@@ -0,0 +1,7 @@
+{
+    "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience",
+    "title": "Deathworlders",
+    "author": "Philip Richard Johnson, AKA Hambone",
+    "chapter_selector": "#block-book-navigation .menu a",
+    "content_selector": "article .node-content .field-name-body .field-item"
+}
diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json
new file mode 100644
index 0000000..f266957
--- /dev/null
+++ b/examples/heretical-edge.json
@@ -0,0 +1,8 @@
+{
+    "url": 
"https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Ceruelean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} From 27b677a44421c098731b4af929b8df9de238ec56 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 29 Oct 2017 19:50:19 -0500 Subject: [PATCH 6/7] Fix no-threadmarks autodetect --- sites/xenforo.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-)

diff --git a/sites/xenforo.py b/sites/xenforo.py
index cb0e8a1..9c94401 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -71,7 +71,10 @@ class XenForo(Site):
 
         threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
         if not threadmarks_link:
-            threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            try:
+                threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            except IndexError:
+                pass
 
         if not threadmarks_link:
             raise SiteException("No threadmarks")
From f1ac7c8bdae09403989ffed581df35465fe93197 Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Tue, 31 Oct 2017 00:27:54 -0500
Subject: [PATCH 7/7] Retry failed site-requests

---
Note: Retry-After is looked up with .get() and parsed defensively. Indexing
the headers raised KeyError on any failed response that lacked the header,
so the retry never ran; int() also fails on the HTTP-date form of the header.

 sites/__init__.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/sites/__init__.py b/sites/__init__.py
index 70ab656..24161be 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -3,6 +3,7 @@ import glob
 import os
 import argparse
 import uuid
+import time
 
 import attr
 from bs4 import BeautifulSoup
@@ -96,9 +97,24 @@ class Site:
     def _add_arguments(self, parser):
         pass
 
-    def _soup(self, url, method='html5lib', **kw):
+    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
+        """Fetch `url` and parse it with BeautifulSoup, retrying a failed load up to `retry` times."""
         page = self.session.get(url, **kw)
         if not page:
+            if retry and retry > 0:
+                delay = retry_delay
+                # Retry-After is optional: look it up with .get() so a failed
+                # response without the header is retried, not a KeyError.
+                retry_after = page.headers.get('Retry-After')
+                if retry_after:
+                    try:
+                        delay = int(retry_after)
+                    except ValueError:
+                        # An HTTP-date value is also legal; keep retry_delay.
+                        pass
+                print("Load failed: waiting {}s to retry ({})".format(delay, page))
+                time.sleep(delay)
+                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page.text, method)
 