Merge branch 'master' into clickify
commit 43599aceb5
10 changed files with 144 additions and 7 deletions
.gitignore (vendored): 2 changes

@@ -1,6 +1,6 @@
 *.epub
 *.mobi
-*.json
+./*.json
-leech.db
+leech.sqlite
 leech.cookies
examples/deathworlders.json: new file, 7 lines

@@ -0,0 +1,7 @@
{
    "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience",
    "title": "Deathworlders",
    "author": "Philip Richard Johnson, AKA Hambone",
    "chapter_selector": "#block-book-navigation .menu a",
    "content_selector": "article .node-content .field-name-body .field-item"
}
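These definitions drive the Arbitrary site handler changed later in this commit: `chapter_selector` picks the chapter links out of the table of contents, and `content_selector` picks the body of each chapter page. A minimal standalone sketch of how the selectors get applied (leech's real flow lives in sites/arbitrary.py; session handling and epub assembly are elided here):

    import json
    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    with open('examples/deathworlders.json') as f:
        definition = json.load(f)

    toc = BeautifulSoup(requests.get(definition['url']).text, 'html5lib')
    for link in toc.select(definition['chapter_selector']):
        # Resolve relative hrefs against the ToC page's URL.
        chapter_url = urljoin(definition['url'], str(link.get('href')))
        page = BeautifulSoup(requests.get(chapter_url).text, 'html5lib')
        content = page.select_one(definition['content_selector'])
        print(link.string, chapter_url, len(content.text) if content else 0)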
examples/heretical-edge.json: new file, 8 lines

@@ -0,0 +1,8 @@
{
    "url": "https://ceruleanscrawling.wordpress.com/table-of-contents/",
    "title": "Heretical Edge",
    "author": "Cerulean",
    "chapter_selector": "article .entry-content > p > a",
    "content_selector": "article .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style"
}
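`filter_selector` is new in this definition: its targets (`.sharedaddy` share buttons, `.wpcnt` WordPress stats markup, inline `style` tags) are junk that would otherwise ride along inside whatever `content_selector` matches. A sketch of the presumed stripping step (`content` here stands for the tag matched by `content_selector`; not the committed code):

    # Drop every node the filter matches before the chapter HTML is kept.
    # decompose() removes a tag and its children from the tree in place.
    if definition.get('filter_selector'):
        for junk in content.select(definition['filter_selector']):
            junk.decompose()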
examples/practical1.json: new file, 8 lines

@@ -0,0 +1,8 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
    "title": "A Practical Guide To Evil: Book 1",
    "author": "erraticerrata",
    "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a",
    "content_selector": "#main .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style"
}
examples/practical2.json: new file, 8 lines

@@ -0,0 +1,8 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
    "title": "A Practical Guide To Evil: Book 2",
    "author": "erraticerrata",
    "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a",
    "content_selector": "#main .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style"
}
examples/practical3.json: new file, 8 lines

@@ -0,0 +1,8 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
    "title": "A Practical Guide To Evil: Book 3",
    "author": "erraticerrata",
    "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a",
    "content_selector": "#main .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style"
}
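All three practical*.json definitions point at the same table of contents and differ only in the chapter selector: `ul:nth-of-type(N)` grabs the Nth list on the page, one list per book, and book 2's selector descends one level further because its list apparently nests a sub-list of chapters. A toy reproduction of that shape (the markup below is invented, not scraped):

    from bs4 import BeautifulSoup

    # Invented miniature of the shared ToC page's structure.
    toc = BeautifulSoup("""
    <div id="main"><div class="entry-content">
      <ul><li><a href="/b1-c1">Book 1, Ch 1</a></li></ul>
      <ul><li>Arc 1 <ul><li><a href="/b2-c1">Book 2, Ch 1</a></li></ul></li></ul>
      <ul><li><a href="/b3-c1">Book 3, Ch 1</a></li></ul>
    </div></div>
    """, 'html5lib')

    print([a.string for a in toc.select('#main .entry-content > ul:nth-of-type(1) > li > a')])
    # ['Book 1, Ch 1']
    print([a.string for a in toc.select('#main .entry-content > ul:nth-of-type(2) > li > ul > li > a')])
    # ['Book 2, Ch 1']
    print([a.string for a in toc.select('#main .entry-content > ul:nth-of-type(3) > li > a')])
    # ['Book 3, Ch 1']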
sites/__init__.py

@@ -2,6 +2,7 @@
 import glob
 import os
 import uuid
+import time

 import attr
 from bs4 import BeautifulSoup
@@ -91,9 +92,16 @@ class Site:
     def login(self, login_details):
         raise NotImplementedError()

-    def _soup(self, url, method='html5lib', **kw):
+    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
         page = self.session.get(url, **kw)
         if not page:
+            if retry and retry > 0:
+                delay = retry_delay
+                if page.headers.get('Retry-After'):
+                    delay = int(page.headers['Retry-After'])
+                print("Load failed: waiting {}s to retry ({})".format(delay, page))
+                time.sleep(delay)
+                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page.text, method)
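`_soup` now retries failed fetches: a requests `Response` is falsy for 4xx/5xx statuses, so `if not page` catches errors, waits (preferring the server's `Retry-After` header when one is sent), and recurses with a decremented budget. A standalone sketch of the same pattern outside the `Site` class, not leech's actual code:

    import time

    import requests

    def fetch_with_retry(session, url, retry=3, retry_delay=10):
        page = session.get(url)
        if not page:  # a requests.Response is falsy for 4xx/5xx statuses
            if retry > 0:
                # Assumes Retry-After is the delta-seconds form; servers can
                # also send an HTTP-date, which int() would reject.
                delay = int(page.headers.get('Retry-After', retry_delay))
                time.sleep(delay)
                return fetch_with_retry(session, url, retry - 1, retry_delay)
            raise RuntimeError("Couldn't fetch {} ({})".format(url, page.status_code))
        return page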
sites/arbitrary.py

@@ -4,6 +4,7 @@ import attr
 import datetime
 import json
 import os.path
+import urllib
 from . import register, Site, Section, Chapter

 """
@@ -47,19 +48,19 @@ class Arbitrary(Site):
         story = Section(
             title=definition.title,
-            author=definition.author
+            author=definition.author,
+            url=url
         )

         if definition.chapter_selector:
             soup = self._soup(definition.url)
             for chapter in soup.select(definition.chapter_selector):
-                chapter_url = str(chapter.get('href'))
+                chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href')))
                 story.add(Chapter(
                     title=chapter.string,
                     contents=self._chapter(chapter_url, definition),
                     # TODO: better date detection
-                    date=datetime.datetime.now(),
-                    url=url
+                    date=datetime.datetime.now()
                 ))
         else:
             story.add(Chapter(
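The `urljoin` change makes relative chapter links work: each href is resolved against the definition's ToC URL, while already-absolute hrefs pass through untouched. For instance (the chapter path here is made up):

    from urllib.parse import urljoin

    base = 'https://practicalguidetoevil.wordpress.com/table-of-contents/'
    # A relative href is resolved against the page it appeared on:
    urljoin(base, '/2015/03/25/prologue/')
    # -> 'https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/'
    # An already-absolute href is returned unchanged:
    urljoin(base, 'https://example.com/chapter-1')
    # -> 'https://example.com/chapter-1'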
sites/fictionlive.py: new file, 86 lines

@@ -0,0 +1,86 @@
#!/usr/bin/python

import itertools
import datetime
import re

from . import register, Site, Section, Chapter


@register
class FictionLive(Site):
    """fiction.live: it's... mostly smut, I think? Terrible smut. But, hey, I had a rec to follow."""
    @staticmethod
    def matches(url):
        # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT
        match = re.match(r'^(https?://fiction\.live/stories/[^\/]+/[0-9a-zA-Z]+)/?.*', url)
        if match:
            return match.group(1)

    def extract(self, url):
        workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1)

        response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json()

        story = Section(
            title=response['t'],
            author=response['u'][0]['n'],
            # Could normalize the URL here from the returns, but I'd have to
            # go look up how they handle special characters in titles...
            url=url
        )
        # There's a summary (or similar) in `d` and `b`, if I want to use that later.

        # TODO: extract these #special ones and send them off to an endnotes section?
        chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},)

        for prevc, currc, nextc in contextiterate(chapters):
            # `id`, `title`, `ct`, `isFirst`
            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/0/1448245168594
            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1449266444062/1449615394752
            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
            # i.e. format is [current timestamp] / [next timestamp - 1]
            chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
            print("Extracting chapter from", chapter_url)
            data = self.session.get(chapter_url).json()
            html = []

            updated = currc['ct']
            for segment in (d for d in data if not d.get('t', '').startswith('#special')):
                updated = max(updated, segment['ct'])
                # TODO: work out if this is actually enough types handled
                # There's at least also a reader post type, which mostly seems to be used for die rolls.
                if segment['nt'] == 'chapter':
                    html.extend(('<div>', segment['b'].replace('<br>', '<br/>'), '</div>'))
                elif segment['nt'] == 'choice':
                    votes = {}
                    for vote in segment['votes']:
                        votechoices = segment['votes'][vote]
                        if isinstance(votechoices, int):
                            votechoices = (votechoices,)
                        for choice in votechoices:
                            choice = segment['choices'][int(choice)]
                            votes[choice] = votes.get(choice, 0) + 1
                    choices = [(votes[v], v) for v in votes]
                    choices.sort(reverse=True)
                    html.append('<hr/><ul>')
                    for votecount, choice in choices:
                        html.append('<li>{}: {}</li>'.format(choice, votecount))
                    html.append('</ul><hr/>')

            story.add(Chapter(
                title=currc['title'],
                contents='\n'.join(html),
                date=datetime.datetime.fromtimestamp(updated / 1000.0)
            ))

        return story


# Stolen from the itertools docs
def contextiterate(iterable):
    "s -> (s0, s1, s2), (s1, s2, s3), (s2, s3, s4), ..."
    a, b, c = itertools.tee(iterable, 3)
    next(b, None)
    next(c, None)
    next(c, None)
    return zip(a, b, c)
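The `{'ct': 0}` and `{'ct': 9999999999999999}` sentinels exist so that every real chapter gets a (previous, current, next) window from `contextiterate`, letting the fetch use `nextc['ct'] - 1` as the end of its timestamp range. A quick illustration with fake timestamps:

    chapters = (
        {'ct': 0},                 # leading sentinel
        {'ct': 100, 'title': 'one'},
        {'ct': 200, 'title': 'two'},
        {'ct': 9999999999999999},  # trailing sentinel
    )
    for prevc, currc, nextc in contextiterate(chapters):
        print(currc['title'], currc['ct'], nextc['ct'] - 1)
    # one 100 199
    # two 200 9999999999999998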
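The `choice` segments tally votes by hand because a voter's entry can be either a single choice index or a list of indices. A worked example of the payload shape the tallying loop expects (field values invented for illustration):

    segment = {
        'nt': 'choice',
        'choices': ['Fight', 'Flee'],
        'votes': {
            'voterA': 0,       # single index: one vote for 'Fight'
            'voterB': [0, 1],  # list of indices: one vote for each choice
            'voterC': 1,
        },
    }
    # Running the tallying loop over this segment yields
    #   votes == {'Fight': 2, 'Flee': 2}
    # which renders as a <ul> of "choice: count" lines between <hr/>s.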
sites/xenforo.py

@@ -80,7 +80,10 @@ class XenForo(Site):
         threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
         if not threadmarks_link:
-            threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            try:
+                threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            except IndexError:
+                pass

         if not threadmarks_link:
             raise SiteException("No threadmarks")
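This XenForo fix turns a hard crash into a clean error: `soup.select(...)` returns a list, so indexing `[0]` on an empty result raised `IndexError`; catching it leaves `threadmarks_link` unset and lets the explicit `SiteException("No threadmarks")` fire instead. A bs4 `select_one` call could express the same fallback without the try/except (a sketch, not the committed code):

    # select_one returns the first match or None, so the existing
    # `if not threadmarks_link` check handles the missing case directly.
    threadmarks_link = soup.find(class_="threadmarksTrigger", href=True) \
        or soup.select_one('.threadmarkMenus a.OverlayTrigger')
    if not threadmarks_link:
        raise SiteException("No threadmarks")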