Mirror of https://github.com/kemayo/leech

Merge pull request #2 from kemayo/master

Bring my fork up to date with kemayo's
Commit 9ab8404120 by Will Oursler, 2018-04-13 16:36:02 -04:00 (committed by GitHub)
18 changed files with 214 additions and 31 deletions

.gitignore (2 changed lines)

@ -1,6 +1,6 @@
*.epub
*.mobi
*.json
./*.json
leech.db
leech.sqlite
leech.cookies


@ -33,6 +33,8 @@ Supports
* ArchiveOfOurOwn
* Yes, it has its own built-in EPUB export, but the formatting is horrible
* Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably
* RoyalRoad
* Fiction.live (Anonkun)
* DeviantArt galleries/collections
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)


@ -0,0 +1,7 @@
{
"url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience",
"title": "Deathworlders",
"author": "Philip Richard Johnson, AKA Hambone",
"chapter_selector": "#block-book-navigation .menu a",
"content_selector": "article .node-content .field-name-body .field-item"
}


@ -0,0 +1,8 @@
{
"url": "https://ceruleanscrawling.wordpress.com/table-of-contents/",
"title": "Heretical Edge",
"author": "Ceruelean",
"chapter_selector": "article .entry-content > p > a",
"content_selector": "article .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}

examples/practical1.json (new file, 8 lines)

@ -0,0 +1,8 @@
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 1",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}

examples/practical2.json (new file, 8 lines)

@ -0,0 +1,8 @@
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 2",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}

examples/practical3.json (new file, 8 lines)

@ -0,0 +1,8 @@
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 3",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}

examples/sagaofsoul.json (new file, 8 lines)

@ -0,0 +1,8 @@
{
"url": "http://www.sagaofsoul.com/story.html",
"title": "Saga of Soul",
"author": "Ouri Maler",
"chapter_selector": "#mainbody li a",
"content_selector": "#mainbody",
"filter_selector": "script, noscript"
}
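
All of these example definitions feed the Arbitrary handler (sites/arbitrary.py, further down in this diff): "chapter_selector" picks the links out of the table-of-contents page, "content_selector" picks the chapter body, and "filter_selector" names nodes to strip (share widgets, inline styles). A rough standalone sketch of that flow, assuming requests/BeautifulSoup and that filter_selector means "remove these elements"; the real handler goes through Site._soup and builds Section/Chapter objects instead:

import json

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_definition(path):
    # Load one of the example definition files above.
    with open(path) as f:
        definition = json.load(f)

    session = requests.Session()
    toc = BeautifulSoup(session.get(definition['url']).text, 'html5lib')

    chapters = []
    for link in toc.select(definition['chapter_selector']):
        chapter_url = urljoin(definition['url'], link.get('href'))
        page = BeautifulSoup(session.get(chapter_url).text, 'html5lib')
        content = page.select(definition['content_selector'])[0]

        # Assumption: filter_selector marks nodes to drop from the chapter body.
        if definition.get('filter_selector'):
            for node in content.select(definition['filter_selector']):
                node.decompose()

        chapters.append((link.string, content.prettify()))
    return chapters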


@ -4,6 +4,8 @@ import argparse
import sys
import json
import http.cookiejar
import logging
import sqlite3
import sites
import ebook
@ -14,6 +16,8 @@ import requests_cache
__version__ = 1
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
logger = logging.getLogger(__name__)
def leech(url, session, filename=None, args=None):
    # we have: a page, which could be absolutely any part of a story, or not a story at all
@ -22,7 +26,7 @@ def leech(url, session, filename=None, args=None):
    if not site:
        raise Exception("No site handler found")

    print("Handler", site, url)
    logger.info("Handler: %s (%s)", site, url)

    handler = site(session, args=args)
@ -48,13 +52,27 @@ if __name__ == '__main__':
    parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
    parser.add_argument('--no-cache', dest='cache', action='store_false')
    parser.add_argument('--flush', dest='flush', action='store_true')
    parser.set_defaults(cache=True, flush=False)
    parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose')
    parser.set_defaults(cache=True, flush=False, verbose=False)
    args, extra_args = parser.parse_known_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(
            level=logging.INFO,
            format="[%(name)s] %(message)s"
        )

    if args.flush:
        requests_cache.install_cache('leech')
        requests_cache.clear()
        print("Flushed cache")
        conn = sqlite3.connect('leech.sqlite')
        conn.execute("VACUUM")
        conn.close()
        logger.info("Flushed cache")
        sys.exit()

    if not args.url:
@ -76,4 +94,4 @@ if __name__ == '__main__':
    })
    filename = leech(args.url, filename=args.filename, session=session, args=extra_args)
    print("File created:", filename)
    logger.info("File created: %s", filename)


@ -3,9 +3,13 @@ import glob
import os
import argparse
import uuid
import time
import logging
import attr
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
_sites = []
@ -96,9 +100,16 @@ class Site:
    def _add_arguments(self, parser):
        pass

    def _soup(self, url, method='html5lib', **kw):
    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
        page = self.session.get(url, **kw)
        if not page:
            if retry and retry > 0:
                delay = retry_delay
                if 'Retry-After' in page.headers:
                    delay = int(page.headers['Retry-After'])
                logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
                time.sleep(delay)
                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
            raise SiteException("Couldn't fetch", url)
        return BeautifulSoup(page.text, method)
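
Two details worth calling out in the new retry logic: "if not page:" works because a requests Response is falsy for 4xx/5xx status codes, and a server-supplied Retry-After header (typical with 429/503 responses) overrides the default delay. Roughly the same behaviour written iteratively, outside the Site class and assuming the numeric (seconds) form of Retry-After:

import time

import requests

def fetch_with_retry(session, url, retries=3, retry_delay=10):
    # GET a URL, sleeping and retrying while the response is an error.
    for attempt in range(retries + 1):
        page = session.get(url)
        if page.ok:
            return page
        if attempt == retries:
            break
        delay = int(page.headers.get('Retry-After', retry_delay))
        time.sleep(delay)
    raise RuntimeError("Couldn't fetch {}".format(url))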


@ -1,9 +1,15 @@
#!/usr/bin/python
import logging
import datetime
import re
import urllib
import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class ArchiveOfOurOwn(Site):
@ -15,12 +21,32 @@ class ArchiveOfOurOwn(Site):
if match:
return match.group(1) + '/'
def login(self, login_details):
with requests_cache.disabled():
login = self.session.get('http://archiveofourown.org/login')
soup = BeautifulSoup(login.text, 'html5lib')
form = soup.find(id='new_user_session')
post = {
'user_session[login]': login_details[0],
'user_session[password]': login_details[1],
# standard fields:
'user_session[remember_me]': '1',
'utf8': form.find(attrs={'name': 'utf8'})['value'],
'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
'commit': 'Log In',
}
# I feel the session *should* handle this cookies bit for me. But
# it doesn't. And I don't know why.
self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies)
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
return self._extract_work(workid)
def _extract_work(self, workid):
soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
soup = self._soup(nav_url)
metadata = soup.select('#main h2.heading a')
story = Section(
@ -31,9 +57,7 @@ class ArchiveOfOurOwn(Site):
for chapter in soup.select('#main ol[role="navigation"] li'):
link = chapter.find('a')
chapter_url = str(link.get('href'))
if chapter_url.startswith('/works/'):
chapter_url = 'http://archiveofourown.org' + chapter_url
chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
chapter_url += '?view_adult=true'
updated = datetime.datetime.strptime(
@ -46,7 +70,7 @@ class ArchiveOfOurOwn(Site):
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.find('div', role='article')
@ -79,7 +103,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
story = Section(
title=soup.select('#main h2.heading')[0].string,
author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
author=soup.select('#main dl.series.meta a[rel="author"]')[0].string,
url='http://archiveofourown.org/series/{}'.format(seriesid)
)
for work in soup.select('#main ul.series li.work'):
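
The switch to urllib.parse.urljoin (also made in arbitrary.py, and used in the new royalroad.py below) handles root-relative and absolute hrefs uniformly, which the old startswith('/works/') prefixing did not. For illustration, with made-up ids:

from urllib.parse import urljoin

nav_url = 'http://archiveofourown.org/works/12345/navigate?view_adult=true'

# root-relative href, as the navigation page emits
print(urljoin(nav_url, '/works/12345/chapters/67890'))
# http://archiveofourown.org/works/12345/chapters/67890

# an already-absolute href passes through untouched
print(urljoin(nav_url, 'https://example.com/elsewhere'))
# https://example.com/elsewhere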


@ -1,11 +1,15 @@
#!/usr/bin/python
import logging
import attr
import datetime
import json
import os.path
import urllib
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
"""
Example JSON:
{
@ -47,19 +51,19 @@ class Arbitrary(Site):
story = Section(
title=definition.title,
author=definition.author
author=definition.author,
url=url
)
if definition.chapter_selector:
soup = self._soup(definition.url)
for chapter in soup.select(definition.chapter_selector):
chapter_url = str(chapter.get('href'))
chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href')))
story.add(Chapter(
title=chapter.string,
contents=self._chapter(chapter_url, definition),
# TODO: better date detection
date=datetime.datetime.now(),
url=url
date=datetime.datetime.now()
))
else:
story.add(Chapter(
@ -74,7 +78,7 @@ class Arbitrary(Site):
def _chapter(self, url, definition):
# TODO: refactor so this can meaningfully handle multiple matches on content_selector.
# Probably by changing it so that this returns a Chapter / Section.
print("Extracting chapter from", url)
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.select(definition.content_selector)[0]


@ -1,10 +1,13 @@
#!/usr/bin/python
import logging
import re
from . import register, Section
from .stash import Stash
logger = logging.getLogger(__name__)
@register
class DeviantArt(Stash):
@ -41,6 +44,6 @@ class DeviantArt(Stash):
if thumb['href'] is not '#':
story.add(self._chapter(thumb['href']))
except Exception as e:
print(e)
logger.exception("Couldn't extract chapters from thumbs")
return story


@ -1,9 +1,12 @@
#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
@register
class FanFictionNet(Site):
@ -11,9 +14,9 @@ class FanFictionNet(Site):
@staticmethod
def matches(url):
# e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
if match:
return match.group(1) + '/'
return 'https://www.fanfiction.net/s/' + match.group(1) + '/'
def extract(self, url):
soup = self._soup(url)
@ -59,7 +62,7 @@ class FanFictionNet(Site):
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Fetching chapter @ %s", url)
soup = self._soup(url)
content = soup.find(id="content_wrapper_inner")
@ -74,7 +77,7 @@ class FanFictionNet(Site):
for tag in text.find_all(True):
tag.attrs = None
except Exception as e:
print("Trouble cleaning attributes", e)
logger.exception("Trouble cleaning attributes")
return text.prettify()
@ -84,6 +87,6 @@ class FictionPress(FanFictionNet):
@staticmethod
def matches(url):
# e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning
match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url)
match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url)
if match:
return match.group(1) + '/'
return 'https://www.fictionpress.com/s/' + match.group(1) + '/'
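
The loosened patterns accept the mobile hosts (m.fanfiction.net, m.fictionpress.com) as well as www, and now capture only the story id so matches() can hand back one canonical URL whichever form was pasted. A quick check against the URL from the comment above:

import re

pattern = r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*'

for url in ('https://www.fanfiction.net/s/4109686/3/Taking-Sights',
            'https://m.fanfiction.net/s/4109686/3/Taking-Sights'):
    match = re.match(pattern, url)
    print('https://www.fanfiction.net/s/' + match.group(1) + '/')
    # both print https://www.fanfiction.net/s/4109686/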


@ -1,10 +1,13 @@
#!/usr/bin/python
import logging
import itertools
import datetime
import re
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class FictionLive(Site):
@ -40,7 +43,7 @@ class FictionLive(Site):
# https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
# i.e. format is [current timestamp] / [next timestamp - 1]
chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
print("Extracting chapter from", chapter_url)
logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url)
data = self.session.get(chapter_url).json()
html = []

sites/royalroad.py (new file, 58 lines)

@ -0,0 +1,58 @@
#!/usr/bin/python

import http.client
import logging
import datetime
import re
import urllib

from . import register, Site, Section, Chapter

logger = logging.getLogger(__name__)


@register
class RoyalRoad(Site):
    """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
    @staticmethod
    def matches(url):
        # e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
        match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def extract(self, url):
        workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1)

        soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid))
        # should have gotten redirected, for a valid title

        original_maxheaders = http.client._MAXHEADERS
        http.client._MAXHEADERS = 1000

        story = Section(
            title=soup.find('h1', property='name').string.strip(),
            author=soup.find('meta', property='books:author').get('content').strip(),
            url=soup.find('meta', property='og:url').get('content').strip()
        )

        for chapter in soup.select('#chapters tbody tr[data-url]'):
            chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))

            updated = datetime.datetime.fromtimestamp(
                int(chapter.find('time').get('unixtime')),
            )

            story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated))

        http.client._MAXHEADERS = original_maxheaders

        return story

    def _chapter(self, url):
        logger.info("Extracting chapter @ %s", url)
        soup = self._soup(url)

        content = soup.find('div', class_='chapter-content')

        # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well.
        author_note = soup.find('div', class_='author-note-portlet')

        return (author_note and (author_note.prettify() + '<hr/>') or '') + content.prettify()
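
The _MAXHEADERS fiddling is presumably there because http.client rejects responses carrying more than 100 headers ("got more than 100 headers"), a limit some sites exceed via large numbers of Set-Cookie headers; since it is a private module-level constant, it has to be patched and restored by hand. A slightly safer variant would restore it even if extraction throws, e.g. via a small context manager (a sketch, not what the file above does):

import http.client
from contextlib import contextmanager

@contextmanager
def max_headers(limit):
    # Temporarily raise http.client's private header-count limit.
    original = http.client._MAXHEADERS
    http.client._MAXHEADERS = limit
    try:
        yield
    finally:
        http.client._MAXHEADERS = original

# usage (fiction id is illustrative):
# with max_headers(1000):
#     soup = self._soup('https://royalroadl.com/fiction/12345')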


@ -1,9 +1,12 @@
#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
@register
class Stash(Site):
@ -35,12 +38,12 @@ class Stash(Site):
if thumb['href'] is not '#':
story.add(self._chapter(thumb['href']))
except Exception as e:
print(e)
logger.exception("Couldn't extract chapters from thumbs")
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Fetching chapter @ %s", url)
soup = self._soup(url)
content = soup.find(class_="journal-wrapper")


@ -2,8 +2,11 @@
import datetime
import re
import logging
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
class XenForo(Site):
"""XenForo is forum software that powers a number of fiction-related forums."""
@ -23,7 +26,7 @@ class XenForo(Site):
'password': login_details[1],
}
self.session.post('https://%s/login/login' % self.domain, data=post)
print("Logged in as", login_details[0])
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
soup = self._soup(url)
@ -47,7 +50,7 @@ class XenForo(Site):
if not href.startswith('http'):
href = base + href
title = str(mark.string).strip()
print("Fetching chapter", title, href)
logger.info("Fetching chapter \"%s\" @ %s", title, href)
chapter = Chapter(title=title, contents="")
contents, post_date = self._chapter(href, idx)
chapter.contents = contents
@ -63,7 +66,7 @@ class XenForo(Site):
try:
return self._chapter_list_threadmarks(url)
except SiteException as e:
print("Tried threadmarks", e.args)
logger.debug("Tried threadmarks (%r)", e.args)
return self._chapter_list_index(url)
def _chapter_list_threadmarks(self, url):
@ -71,7 +74,10 @@ class XenForo(Site):
        threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
        if not threadmarks_link:
            threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
            try:
                threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
            except IndexError:
                pass
        if not threadmarks_link:
            raise SiteException("No threadmarks")
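
The new try/except matters because select(...)[0] raises IndexError when the selector matches nothing, which previously escaped instead of falling through to the "No threadmarks" SiteException. The same guard can be written with an explicit emptiness check; this is a sketch of an equivalent alternative, not what the diff does:

        if not threadmarks_link:
            menus = soup.select('.threadmarkMenus a.OverlayTrigger')
            if menus:
                threadmarks_link = menus[0]

        if not threadmarks_link:
            raise SiteException("No threadmarks")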