Mirror of https://github.com/kemayo/leech
Merge branch 'master' into clickify
Commit ecebf1de58: 12 changed files with 210 additions and 91 deletions
README, "Supports" list:
@@ -43,6 +43,8 @@ Supports
 * ArchiveOfOurOwn
   * Yes, it has its own built-in EPUB export, but the formatting is horrible
 * Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably
+* RoyalRoad
+* Fiction.live (Anonkun)
 * DeviantArt galleries/collections
 * Sta.sh
 * Completely arbitrary sites, with a bit more work (see below)
examples/sagaofsoul.json (new file, 8 lines):
@@ -0,0 +1,8 @@
+{
+    "url": "http://www.sagaofsoul.com/story.html",
+    "title": "Saga of Soul",
+    "author": "Ouri Maler",
+    "chapter_selector": "#mainbody li a",
+    "content_selector": "#mainbody",
+    "filter_selector": "script, noscript"
+}
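These selectors feed the Arbitrary site handler touched further down in this diff: chapter_selector picks the links to follow from the index page, content_selector picks the body of each chapter, and filter_selector names nodes to strip. A rough sketch of that flow, not the handler's actual code (it assumes requests and BeautifulSoup with the stdlib html.parser, and resolves relative links itself):

import json
import urllib.parse

import requests
from bs4 import BeautifulSoup

with open('examples/sagaofsoul.json') as f:
    definition = json.load(f)

index = BeautifulSoup(requests.get(definition['url']).text, 'html.parser')

# Every chapter_selector match is treated as a link to one chapter page.
for link in index.select(definition['chapter_selector']):
    chapter_url = urllib.parse.urljoin(definition['url'], link.get('href'))
    page = BeautifulSoup(requests.get(chapter_url).text, 'html.parser')
    content = page.select(definition['content_selector'])[0]
    # filter_selector lists nodes to drop from the captured content.
    if definition.get('filter_selector'):
        for unwanted in content.select(definition['filter_selector']):
            unwanted.decompose()
    print(link.get_text(strip=True), '->', chapter_url)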
leech.py (143 changed lines):
@@ -6,6 +6,7 @@ from click_default_group import DefaultGroup
 import requests
 import requests_cache
 import http.cookiejar
+import logging
 import json

 import sites
@@ -14,77 +15,69 @@ import ebook
 __version__ = 2
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

-def uses_session(command):
-    """Decorator for click commands that need a session."""
-    @click.option('--cache/--no-cache', default=True)
-    def wrapper(cache, **kwargs):
-        if cache:
-            session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
-        else:
-            session = requests.Session()
-
-        lwp_cookiejar = http.cookiejar.LWPCookieJar()
-        try:
-            lwp_cookiejar.load('leech.cookies', ignore_discard=True)
-        except Exception as e:
-            pass
-        session.cookies = lwp_cookiejar
-        session.headers.update({
-            'User-agent': USER_AGENT
-        })
-        return command(session=session, **kwargs)
-    wrapper.__name__ = command.__name__
-    return wrapper
-
-
-def uses_story(command):
-    """Decorator for click commands that need a story."""
-    @click.argument('url')
-    @click.option(
-        '--site-options',
-        default='{}',
-        help='JSON object encoding any site specific option.'
-    )
-    @uses_session
-    def wrapper(url, session, site_options, **kwargs):
-        site, url = sites.get(url)
-        if not site:
-            raise Exception("No site handler found")
-
-        default_site_options = site.get_default_options()
-
-        with open('leech.json') as store_file:
-            store = json.load(store_file)
-            login = store.get('logins', {}).get(site.__name__, False)
-            configured_site_options = store.get('site_options', {}).get(site.__name__, {})
-
-        overridden_site_options = json.loads(site_options)
-
-        # The final options dictionary is computed by layering the default, configured,
-        # and overridden options together in that order.
-        options = dict(
-            list(default_site_options.items()) +
-            list(configured_site_options.items()) +
-            list(overridden_site_options.items())
-        )
-
-        handler = site(
-            session,
-            options=options
-        )
-
-        if login:
-            handler.login(login)
-
-        story = handler.extract(url)
-        if not story:
-            raise Exception("Couldn't extract story")
-
-        command(story=story, **kwargs)
-    wrapper.__name__ = command.__name__
-    return wrapper
+logger = logging.getLogger(__name__)
+
+
+def configure_logging(verbose):
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[%(name)s] %(message)s"
+        )
+
+
+def create_session(cache):
+    if cache:
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
+    else:
+        session = requests.Session()
+
+    lwp_cookiejar = http.cookiejar.LWPCookieJar()
+    try:
+        lwp_cookiejar.load('leech.cookies', ignore_discard=True)
+    except Exception as e:
+        pass
+    session.cookies = lwp_cookiejar
+    session.headers.update({
+        'User-agent': USER_AGENT
+    })
+    return session
+
+
+def open_story(url, session, site_options):
+    site, url = sites.get(url)
+
+    if not site:
+        raise Exception("No site handler found")
+
+    default_site_options = site.get_default_options()
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+        login = store.get('logins', {}).get(site.__name__, False)
+        configured_site_options = store.get('site_options', {}).get(site.__name__, {})
+
+    overridden_site_options = json.loads(site_options)
+
+    # The final options dictionary is computed by layering the default, configured,
+    # and overridden options together in that order.
+    options = dict(
+        list(default_site_options.items()) +
+        list(configured_site_options.items()) +
+        list(overridden_site_options.items())
+    )
+
+    handler = site(
+        session,
+        options=options
+    )
+
+    if login:
+        handler.login(login)
+
+    story = handler.extract(url)
+    if not story:
+        raise Exception("Couldn't extract story")
+
+    return story


 @click.group(cls=DefaultGroup, default='download', default_if_no_args=True)
 def cli():
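The layered options computation above works because later items() entries overwrite earlier keys when the concatenated list is handed to dict(). A quick illustration of that precedence (the option names and values here are made up, not real leech options):

default_site_options = {'offset': 1, 'include_index': False}
configured_site_options = {'include_index': True}
overridden_site_options = {'offset': 3}

options = dict(
    list(default_site_options.items()) +
    list(configured_site_options.items()) +
    list(overridden_site_options.items())
)
print(options)  # {'offset': 3, 'include_index': True}

The same result could be written as {**default_site_options, **configured_site_options, **overridden_site_options} on Python 3.5+; the list-concatenation form shown in the diff behaves identically.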
@@ -93,19 +86,31 @@ def cli():


 @cli.command()
-def flush():
-    """"Flushes the contents of the cache."""
+@click.option('--verbose', '-v', is_flag=True, help="verbose output")
+def flush(verbose):
+    """Flushes the contents of the cache."""
+    configure_logging(verbose)
     requests_cache.install_cache('leech')
     requests_cache.clear()
-    print("Flushed cache")
+    logger.info("Flushed cache")


 @cli.command()
-@uses_story
-def download(story):
+@click.argument('url')
+@click.option(
+    '--site-options',
+    default='{}',
+    help='JSON object encoding any site specific option.'
+)
+@click.option('--cache/--no-cache', default=True)
+@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
+def download(url, site_options, cache, verbose):
     """Downloads a story and saves it on disk as a ebpub ebook."""
+    configure_logging(verbose)
+    session = create_session(cache)
+    story = open_story(url, session, site_options)
     filename = ebook.generate_epub(story)
-    print("File created:", filename)
+    logger.info("File created: " + filename)


 if __name__ == '__main__':
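open_story() reads credentials and per-site options from a leech.json store keyed by the site class name. The diff only shows the lookups, never the file itself, so the shape below is an inferred example rather than a documented format; the site name, credentials, and the write-out step are all placeholders:

import json

# Hypothetical store matching the lookups in open_story():
# store['logins'][site.__name__] is later handed to handler.login(), which
# indexes login_details[0] and login_details[1], so a two-element list fits.
store = {
    'logins': {
        'ArchiveOfOurOwn': ['my_username', 'my_password'],
    },
    'site_options': {
        'ArchiveOfOurOwn': {},
    },
}

with open('leech.json', 'w') as f:
    json.dump(store, f, indent=4)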
Sites module (Site base class and registry):
@@ -3,9 +3,12 @@ import glob
 import os
 import uuid
 import time
+import logging
 import attr
 from bs4 import BeautifulSoup

+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
 _sites = []

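Attaching a NullHandler to the package logger is the standard way to keep library code quiet until the application configures logging, which leech.py's configure_logging() now does for the CLI. A minimal illustration of that interaction (the logger name 'sites' is a stand-in here):

import logging

# Library side: a NullHandler means nothing is printed and nothing complains
# if the application never sets up logging at all.
lib_logger = logging.getLogger('sites')
lib_logger.addHandler(logging.NullHandler())
lib_logger.warning("dropped quietly when no handlers are configured")

# Application side: once the root logger gets a handler via basicConfig,
# library records propagate to it and become visible.
logging.basicConfig(level=logging.INFO, format="[%(name)s] %(message)s")
lib_logger.info("now visible through the root handler")  # prints: [sites] now visible through the root handler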
@@ -97,9 +100,9 @@ class Site:
         if not page:
             if retry and retry > 0:
                 delay = retry_delay
-                if page.headers['Retry-After']:
+                if 'Retry-After' in page.headers:
                     delay = int(page.headers['Retry-After'])
-                print("Load failed: waiting {}s to retry ({})".format(delay, page))
+                logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
                 time.sleep(delay)
                 return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
         raise SiteException("Couldn't fetch", url)
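The membership test matters because requests' header mapping raises KeyError for a missing key, so the old truthiness check would blow up on any failed response that lacked a Retry-After header. A small sketch of the difference, using httpbin.org as a stand-in endpoint:

import requests

page = requests.get('https://httpbin.org/status/503')
retry_delay = 10

delay = retry_delay
# Old form: page.headers['Retry-After'] raises KeyError when the server
# sends no such header, which is the common case for a plain failure.
# New form: only read the header after confirming it is present.
if 'Retry-After' in page.headers:
    delay = int(page.headers['Retry-After'])
print("would wait", delay, "seconds before retrying")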
sites/ao3.py (37 changed lines):
@@ -1,9 +1,15 @@
 #!/usr/bin/python

+import logging
 import datetime
 import re
+import urllib
+import requests_cache
+from bs4 import BeautifulSoup
 from . import register, Site, Section, Chapter

+logger = logging.getLogger(__name__)
+

 @register
 class ArchiveOfOurOwn(Site):
@@ -15,12 +21,32 @@ class ArchiveOfOurOwn(Site):
         if match:
             return match.group(1) + '/'

+    def login(self, login_details):
+        with requests_cache.disabled():
+            login = self.session.get('http://archiveofourown.org/login')
+            soup = BeautifulSoup(login.text, 'html5lib')
+            form = soup.find(id='new_user_session')
+            post = {
+                'user_session[login]': login_details[0],
+                'user_session[password]': login_details[1],
+                # standard fields:
+                'user_session[remember_me]': '1',
+                'utf8': form.find(attrs={'name': 'utf8'})['value'],
+                'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
+                'commit': 'Log In',
+            }
+            # I feel the session *should* handle this cookies bit for me. But
+            # it doesn't. And I don't know why.
+            self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies)
+            logger.info("Logged in as %s", login_details[0])
+
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
         return self._extract_work(workid)

     def _extract_work(self, workid):
-        soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
+        nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
+        soup = self._soup(nav_url)

         metadata = soup.select('#main h2.heading a')
         story = Section(
@@ -31,9 +57,7 @@ class ArchiveOfOurOwn(Site):

         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
-            chapter_url = str(link.get('href'))
-            if chapter_url.startswith('/works/'):
-                chapter_url = 'http://archiveofourown.org' + chapter_url
+            chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
             chapter_url += '?view_adult=true'

             updated = datetime.datetime.strptime(
@@ -46,7 +70,7 @@ class ArchiveOfOurOwn(Site):
         return story

     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
         content = soup.find('div', role='article')

@@ -79,7 +103,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):

         story = Section(
             title=soup.select('#main h2.heading')[0].string,
-            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string,
+            url='http://archiveofourown.org/series/{}'.format(seriesid)
         )

         for work in soup.select('#main ul.series li.work'):
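Switching to urllib.parse.urljoin removes the hand-rolled '/works/' special case: a relative href is resolved against the navigation URL, while an already-absolute href passes through untouched. For instance (the work and chapter ids here are made up):

import urllib.parse

nav_url = 'http://archiveofourown.org/works/11887/navigate?view_adult=true'

# A site-relative href picks up the scheme and host from nav_url.
print(urllib.parse.urljoin(nav_url, '/works/11887/chapters/12345'))
# -> http://archiveofourown.org/works/11887/chapters/12345

# An already-absolute href is returned unchanged.
print(urllib.parse.urljoin(nav_url, 'https://example.org/elsewhere'))
# -> https://example.org/elsewhere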
Arbitrary site handler:
@@ -1,5 +1,6 @@
 #!/usr/bin/python

+import logging
 import attr
 import datetime
 import json
@@ -7,6 +8,8 @@ import os.path
 import urllib
 from . import register, Site, Section, Chapter

+logger = logging.getLogger(__name__)
+
 """
 Example JSON:
 {
@@ -75,7 +78,7 @@ class Arbitrary(Site):
     def _chapter(self, url, definition):
         # TODO: refactor so this can meaningfully handle multiple matches on content_selector.
         # Probably by changing it so that this returns a Chapter / Section.
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
         content = soup.select(definition.content_selector)[0]

DeviantArt site handler:
@@ -1,10 +1,13 @@
 #!/usr/bin/python

+import logging
 import re

 from . import register, Section
 from .stash import Stash

+logger = logging.getLogger(__name__)
+

 @register
 class DeviantArt(Stash):
@@ -41,6 +44,6 @@ class DeviantArt(Stash):
             if thumb['href'] is not '#':
                 story.add(self._chapter(thumb['href']))
         except Exception as e:
-            print(e)
+            logger.exception("Couldn't extract chapters from thumbs")

         return story
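logger.exception is the interesting part of this replacement: unlike print(e), it records the message at ERROR level together with the full traceback, and it is meant to be called from inside an except block. Roughly (the logger name is a stand-in):

import logging

logging.basicConfig(level=logging.INFO, format="[%(name)s] %(message)s")
logger = logging.getLogger('sites.deviantart')

try:
    raise ValueError("bad thumb href")
except Exception:
    # Emits the message at ERROR level plus the traceback,
    # which the old print(e) silently dropped.
    logger.exception("Couldn't extract chapters from thumbs")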
FanFictionNet / FictionPress site handler:
@@ -1,9 +1,12 @@
 #!/usr/bin/python

+import logging
 import datetime
 import re
 from . import register, Site, SiteException, Section, Chapter

+logger = logging.getLogger(__name__)
+

 @register
 class FanFictionNet(Site):
@@ -11,9 +14,9 @@ class FanFictionNet(Site):
     @staticmethod
     def matches(url):
         # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-        match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
+        match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
         if match:
-            return match.group(1) + '/'
+            return 'https://www.fanfiction.net/s/' + match.group(1) + '/'

     def extract(self, url):
         soup = self._soup(url)
@@ -59,7 +62,7 @@ class FanFictionNet(Site):
         return story

     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Fetching chapter @ %s", url)
         soup = self._soup(url)

         content = soup.find(id="content_wrapper_inner")
@@ -74,7 +77,7 @@ class FanFictionNet(Site):
             for tag in text.find_all(True):
                 tag.attrs = None
         except Exception as e:
-            print("Trouble cleaning attributes", e)
+            logger.exception("Trouble cleaning attributes")

         return text.prettify()
@@ -84,6 +87,6 @@ class FictionPress(FanFictionNet):
     @staticmethod
     def matches(url):
         # e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning
-        match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url)
+        match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url)
         if match:
-            return match.group(1) + '/'
+            return 'https://www.fictionpress.com/s/' + match.group(1) + '/'
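The new pattern accepts both the www and mobile hostnames and captures only the story id, so matches() can rebuild one canonical https URL regardless of which form was pasted in. A quick check of that behaviour, reusing the pattern from the hunk above:

import re


def matches(url):
    # Same pattern as the updated FanFictionNet.matches().
    match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
    if match:
        return 'https://www.fanfiction.net/s/' + match.group(1) + '/'


print(matches('https://www.fanfiction.net/s/4109686/3/Taking-Sights'))
print(matches('http://m.fanfiction.net/s/4109686/'))
# Both print: https://www.fanfiction.net/s/4109686/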
Fiction.live (Anonkun) site handler:
@@ -1,10 +1,13 @@
 #!/usr/bin/python

+import logging
 import itertools
 import datetime
 import re
 from . import register, Site, Section, Chapter

+logger = logging.getLogger(__name__)
+

 @register
 class FictionLive(Site):
@@ -40,7 +43,7 @@ class FictionLive(Site):
             # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
             # i.e. format is [current timestamp] / [next timestamp - 1]
             chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
-            print("Extracting chapter from", chapter_url)
+            logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url)
             data = self.session.get(chapter_url).json()
             html = []
sites/royalroad.py (new file, 58 lines):
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+import http.client
+import logging
+import datetime
+import re
+import urllib
+from . import register, Site, Section, Chapter
+
+logger = logging.getLogger(__name__)
+
+
+@register
+class RoyalRoad(Site):
+    """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
+    @staticmethod
+    def matches(url):
+        # e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
+        match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
+
+    def extract(self, url):
+        workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1)
+        soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid))
+        # should have gotten redirected, for a valid title
+
+        original_maxheaders = http.client._MAXHEADERS
+        http.client._MAXHEADERS = 1000
+
+        story = Section(
+            title=soup.find('h1', property='name').string.strip(),
+            author=soup.find('meta', property='books:author').get('content').strip(),
+            url=soup.find('meta', property='og:url').get('content').strip()
+        )
+
+        for chapter in soup.select('#chapters tbody tr[data-url]'):
+            chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))
+
+            updated = datetime.datetime.fromtimestamp(
+                int(chapter.find('time').get('unixtime')),
+            )
+
+            story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated))
+
+        http.client._MAXHEADERS = original_maxheaders
+
+        return story
+
+    def _chapter(self, url):
+        logger.info("Extracting chapter @ %s", url)
+        soup = self._soup(url)
+        content = soup.find('div', class_='chapter-content')
+
+        # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well.
+        author_note = soup.find('div', class_='author-note-portlet')
+
+        return (author_note and (author_note.prettify() + '<hr/>') or '') + content.prettify()
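The http.client._MAXHEADERS bump works around responses that carry more headers than the stdlib's default cap of 100, which would otherwise make the underlying HTTP client raise while parsing the response; extract() raises the limit and restores the saved value afterwards. If the same trick were needed elsewhere, a hedged sketch of the idea as a context manager (this is not something the diff itself does):

import contextlib
import http.client


@contextlib.contextmanager
def relaxed_header_limit(limit=1000):
    # Temporarily raise http.client's private header cap and restore it even
    # if the block raises; extract() above does the same thing inline.
    original = http.client._MAXHEADERS
    http.client._MAXHEADERS = limit
    try:
        yield
    finally:
        http.client._MAXHEADERS = original


with relaxed_header_limit():
    pass  # requests/urllib calls made here tolerate header-heavy responses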
Stash site handler:
@@ -1,9 +1,12 @@
 #!/usr/bin/python

+import logging
 import datetime
 import re
 from . import register, Site, SiteException, Section, Chapter

+logger = logging.getLogger(__name__)
+

 @register
 class Stash(Site):
@@ -35,12 +38,12 @@ class Stash(Site):
             if thumb['href'] is not '#':
                 story.add(self._chapter(thumb['href']))
         except Exception as e:
-            print(e)
+            logger.exception("Couldn't extract chapters from thumbs")

         return story

     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Fetching chapter @ %s", url)
         soup = self._soup(url)

         content = soup.find(class_="journal-wrapper")
XenForo site handler:
@@ -2,8 +2,11 @@

 import datetime
 import re
+import logging
 from . import register, Site, SiteException, Section, Chapter

+logger = logging.getLogger(__name__)
+

 class XenForo(Site):
     """XenForo is forum software that powers a number of fiction-related forums."""
@@ -32,7 +35,7 @@ class XenForo(Site):
             'password': login_details[1],
         }
         self.session.post('https://%s/login/login' % self.domain, data=post)
-        print("Logged in as", login_details[0])
+        logger.info("Logged in as %s", login_details[0])

     def extract(self, url):
         soup = self._soup(url)
@@ -56,7 +59,7 @@ class XenForo(Site):
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
-            print("Fetching chapter", title, href)
+            logger.info("Fetching chapter \"%s\" @ %s", title, href)
             chapter = Chapter(title=title, contents="")
             contents, post_date = self._chapter(href, idx)
             chapter.contents = contents
@@ -72,7 +75,7 @@ class XenForo(Site):
         try:
             return self._chapter_list_threadmarks(url)
         except SiteException as e:
-            print("Tried threadmarks", e.args)
+            logger.debug("Tried threadmarks (%r)", e.args)
         return self._chapter_list_index(url)

     def _chapter_list_threadmarks(self, url):
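One pattern worth noting across these hunks: the logger calls pass %s/%r arguments rather than pre-formatting the string, so formatting only happens when the record is actually emitted (the logger.debug line above, for example, does nothing at the default INFO level). A small sketch, with the logger name and URL as stand-ins:

import logging

logging.basicConfig(level=logging.INFO, format="[%(name)s] %(message)s")
logger = logging.getLogger('sites.xenforo')

title, href = 'Chapter 1', 'https://forums.example.com/posts/1234/'

# Formatted lazily: %s substitution happens only if the record is emitted.
logger.info('Fetching chapter "%s" @ %s', title, href)

# Suppressed entirely at INFO level; the arguments are never formatted.
logger.debug("Tried threadmarks (%r)", ('no threadmarks',))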