diff --git a/README.markdown b/README.markdown
index 3869073..0be34fc 100644
--- a/README.markdown
+++ b/README.markdown
@@ -43,6 +43,8 @@ Supports
  * ArchiveOfOurOwn
    * Yes, it has its own built-in EPUB export, but the formatting is horrible
  * Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably
+ * RoyalRoad
+ * Fiction.live (Anonkun)
  * DeviantArt galleries/collections
  * Sta.sh
  * Completely arbitrary sites, with a bit more work (see below)
diff --git a/examples/sagaofsoul.json b/examples/sagaofsoul.json
new file mode 100644
index 0000000..27bab61
--- /dev/null
+++ b/examples/sagaofsoul.json
@@ -0,0 +1,8 @@
+{
+    "url": "http://www.sagaofsoul.com/story.html",
+    "title": "Saga of Soul",
+    "author": "Ouri Maler",
+    "chapter_selector": "#mainbody li a",
+    "content_selector": "#mainbody",
+    "filter_selector": "script, noscript"
+}
diff --git a/leech.py b/leech.py
index 2f89753..8644c55 100755
--- a/leech.py
+++ b/leech.py
@@ -6,6 +6,7 @@ from click_default_group import DefaultGroup
 import requests
 import requests_cache
 import http.cookiejar
+import logging
 import json
 
 import sites
@@ -14,77 +15,69 @@ import ebook
 
 __version__ = 2
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
+logger = logging.getLogger(__name__)
 
 
-def uses_session(command):
-    """Decorator for click commands that need a session."""
-    @click.option('--cache/--no-cache', default=True)
-    def wrapper(cache, **kwargs):
-        if cache:
-            session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
-        else:
-            session = requests.Session()
+def configure_logging(verbose):
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="[%(name)s] %(message)s"
+        )
 
-        lwp_cookiejar = http.cookiejar.LWPCookieJar()
-        try:
-            lwp_cookiejar.load('leech.cookies', ignore_discard=True)
-        except Exception as e:
-            pass
-        session.cookies = lwp_cookiejar
-        session.headers.update({
-            'User-agent': USER_AGENT
-        })
-        return command(session=session, **kwargs)
-    wrapper.__name__ = command.__name__
-    return wrapper
+def create_session(cache):
+    if cache:
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
+    else:
+        session = requests.Session()
+    lwp_cookiejar = http.cookiejar.LWPCookieJar()
+    try:
+        lwp_cookiejar.load('leech.cookies', ignore_discard=True)
+    except Exception as e:
+        pass
+    session.cookies = lwp_cookiejar
+    session.headers.update({
+        'User-agent': USER_AGENT
+    })
+    return session
 
 
-def uses_story(command):
-    """Decorator for click commands that need a story."""
-    @click.argument('url')
-    @click.option(
-        '--site-options',
-        default='{}',
-        help='JSON object encoding any site specific option.'
+def open_story(url, session, site_options):
+    site, url = sites.get(url)
+
+    if not site:
+        raise Exception("No site handler found")
+
+    default_site_options = site.get_default_options()
+
+    with open('leech.json') as store_file:
+        store = json.load(store_file)
+        login = store.get('logins', {}).get(site.__name__, False)
+        configured_site_options = store.get('site_options', {}).get(site.__name__, {})
+
+    overridden_site_options = json.loads(site_options)
+
+    # The final options dictionary is computed by layering the default, configured,
+    # and overridden options together in that order.
+    options = dict(
+        list(default_site_options.items()) +
+        list(configured_site_options.items()) +
+        list(overridden_site_options.items())
     )
-    @uses_session
-    def wrapper(url, session, site_options, **kwargs):
-        site, url = sites.get(url)
-        if not site:
-            raise Exception("No site handler found")
-        default_site_options = site.get_default_options()
+    handler = site(
+        session,
+        options=options
+    )
 
-        with open('leech.json') as store_file:
-            store = json.load(store_file)
-            login = store.get('logins', {}).get(site.__name__, False)
-            configured_site_options = store.get('site_options', {}).get(site.__name__, {})
-
-        overridden_site_options = json.loads(site_options)
-
-        # The final options dictionary is computed by layering the default, configured,
-        # and overridden options together in that order.
-        options = dict(
-            list(default_site_options.items()) +
-            list(configured_site_options.items()) +
-            list(overridden_site_options.items())
-        )
-
-        handler = site(
-            session,
-            options=options
-        )
-
-        if login:
-            handler.login(login)
-
-        story = handler.extract(url)
-        if not story:
-            raise Exception("Couldn't extract story")
-
-        command(story=story, **kwargs)
-    wrapper.__name__ = command.__name__
-    return wrapper
+    if login:
+        handler.login(login)
+    story = handler.extract(url)
+    if not story:
+        raise Exception("Couldn't extract story")
+    return story
 
 
 @click.group(cls=DefaultGroup, default='download', default_if_no_args=True)
 def cli():
@@ -93,19 +86,31 @@ def cli():
 
 
 @cli.command()
-def flush():
-    """"Flushes the contents of the cache."""
+@click.option('--verbose', '-v', is_flag=True, help="verbose output")
+def flush(verbose):
+    """Flushes the contents of the cache."""
+    configure_logging(verbose)
     requests_cache.install_cache('leech')
     requests_cache.clear()
-    print("Flushed cache")
+    logger.info("Flushed cache")
 
 
 @cli.command()
-@uses_story
-def download(story):
+@click.argument('url')
+@click.option(
+    '--site-options',
+    default='{}',
+    help='JSON object encoding any site specific option.'
+)
+@click.option('--cache/--no-cache', default=True)
+@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
+def download(url, site_options, cache, verbose):
     """Downloads a story and saves it on disk as a ebpub ebook."""
+    configure_logging(verbose)
+    session = create_session(cache)
+    story = open_story(url, session, site_options)
     filename = ebook.generate_epub(story)
-    print("File created:", filename)
+    logger.info("File created: " + filename)
 
 
 if __name__ == '__main__':
diff --git a/sites/__init__.py b/sites/__init__.py
index 5ba4094..8e9b67f 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -3,9 +3,12 @@ import glob
 import os
 import uuid
 import time
+import logging
 import attr
 from bs4 import BeautifulSoup
 
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+
 
 _sites = []
 
@@ -97,9 +100,9 @@ class Site:
         if not page:
             if retry and retry > 0:
                 delay = retry_delay
-                if page.headers['Retry-After']:
+                if 'Retry-After' in page.headers:
                     delay = int(page.headers['Retry-After'])
-                print("Load failed: waiting {}s to retry ({})".format(delay, page))
+                logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
                 time.sleep(delay)
                 return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
             raise SiteException("Couldn't fetch", url)
diff --git a/sites/ao3.py b/sites/ao3.py
index 4523ae6..5303c80 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -1,9 +1,15 @@
 #!/usr/bin/python
 
+import logging
 import datetime
 import re
+import urllib
+import requests_cache
+from bs4 import BeautifulSoup
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class ArchiveOfOurOwn(Site):
@@ -15,12 +21,32 @@ class ArchiveOfOurOwn(Site):
         if match:
             return match.group(1) + '/'
 
+    def login(self, login_details):
+        with requests_cache.disabled():
+            login = self.session.get('http://archiveofourown.org/login')
+        soup = BeautifulSoup(login.text, 'html5lib')
+        form = soup.find(id='new_user_session')
+        post = {
+            'user_session[login]': login_details[0],
+            'user_session[password]': login_details[1],
+            # standard fields:
+            'user_session[remember_me]': '1',
+            'utf8': form.find(attrs={'name': 'utf8'})['value'],
+            'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
+            'commit': 'Log In',
+        }
+        # I feel the session *should* handle this cookies bit for me. But
+        # it doesn't. And I don't know why.
+        self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies)
+        logger.info("Logged in as %s", login_details[0])
+
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
         return self._extract_work(workid)
 
     def _extract_work(self, workid):
-        soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
+        nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
+        soup = self._soup(nav_url)
         metadata = soup.select('#main h2.heading a')
 
         story = Section(
@@ -31,9 +57,7 @@ class ArchiveOfOurOwn(Site):
 
         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
-            chapter_url = str(link.get('href'))
-            if chapter_url.startswith('/works/'):
-                chapter_url = 'http://archiveofourown.org' + chapter_url
+            chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
             chapter_url += '?view_adult=true'
 
             updated = datetime.datetime.strptime(
@@ -46,7 +70,7 @@ class ArchiveOfOurOwn(Site):
         return story
 
     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
 
         content = soup.find('div', role='article')
@@ -79,7 +103,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
 
         story = Section(
             title=soup.select('#main h2.heading')[0].string,
-            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string,
+            url='http://archiveofourown.org/series/{}'.format(seriesid)
         )
 
         for work in soup.select('#main ul.series li.work'):
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 195faee..1989bc6 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import logging
 import attr
 import datetime
 import json
@@ -7,6 +8,8 @@ import os.path
 import urllib
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 """
 Example JSON:
 {
@@ -75,7 +78,7 @@ class Arbitrary(Site):
     def _chapter(self, url, definition):
         # TODO: refactor so this can meaningfully handle multiple matches on content_selector.
         # Probably by changing it so that this returns a Chapter / Section.
-        print("Extracting chapter from", url)
+        logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
 
         content = soup.select(definition.content_selector)[0]
diff --git a/sites/deviantart.py b/sites/deviantart.py
index bb2775a..df30e92 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -1,10 +1,13 @@
 #!/usr/bin/python
 
+import logging
 import re
 
 from . import register, Section
 from .stash import Stash
 
+logger = logging.getLogger(__name__)
+
 
 @register
 class DeviantArt(Stash):
@@ -41,6 +44,6 @@ class DeviantArt(Stash):
                 if thumb['href'] is not '#':
                     story.add(self._chapter(thumb['href']))
         except Exception as e:
-            print(e)
+            logger.exception("Couldn't extract chapters from thumbs")
 
         return story
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index c3a6792..7f86aed 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -1,9 +1,12 @@
 #!/usr/bin/python
 
+import logging
 import datetime
 import re
 
 from . import register, Site, SiteException, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 @register
 class FanFictionNet(Site):
@@ -11,9 +14,9 @@ class FanFictionNet(Site):
     @staticmethod
     def matches(url):
         # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-        match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
+        match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
         if match:
-            return match.group(1) + '/'
+            return 'https://www.fanfiction.net/s/' + match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
@@ -59,7 +62,7 @@ class FanFictionNet(Site):
         return story
 
     def _chapter(self, url):
-        print("Extracting chapter from", url)
+        logger.info("Fetching chapter @ %s", url)
         soup = self._soup(url)
 
         content = soup.find(id="content_wrapper_inner")
@@ -74,7 +77,7 @@ class FanFictionNet(Site):
             for tag in text.find_all(True):
                 tag.attrs = None
         except Exception as e:
-            print("Trouble cleaning attributes", e)
+            logger.exception("Trouble cleaning attributes")
 
         return text.prettify()
 
@@ -84,6 +87,6 @@ class FictionPress(FanFictionNet):
     @staticmethod
     def matches(url):
         # e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning
-        match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url)
+        match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url)
         if match:
-            return match.group(1) + '/'
+            return 'https://www.fictionpress.com/s/' + match.group(1) + '/'
diff --git a/sites/fictionlive.py b/sites/fictionlive.py
index 6d588ba..a7d8fae 100644
--- a/sites/fictionlive.py
+++ b/sites/fictionlive.py
@@ -1,10 +1,13 @@
 #!/usr/bin/python
 
+import logging
 import itertools
 import datetime
 import re
 
 from . import register, Site, Section, Chapter
 
+logger = logging.getLogger(__name__)
+
 @register
 class FictionLive(Site):
@@ -40,7 +43,7 @@ class FictionLive(Site):
             # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
             # i.e. format is [current timestamp] / [next timestamp - 1]
             chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
-            print("Extracting chapter from", chapter_url)
+            logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url)
 
             data = self.session.get(chapter_url).json()
             html = []
diff --git a/sites/royalroad.py b/sites/royalroad.py
new file mode 100644
index 0000000..794fdd2
--- /dev/null
+++ b/sites/royalroad.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python
+
+import http.client
+import logging
+import datetime
+import re
+import urllib
+from . import register, Site, Section, Chapter
+
+logger = logging.getLogger(__name__)
+
+
+@register
+class RoyalRoad(Site):
+    """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
+    @staticmethod
+    def matches(url):
+        # e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
+        match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
+
+    def extract(self, url):
+        workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1)
+        soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid))
+        # should have gotten redirected, for a valid title
+
+        original_maxheaders = http.client._MAXHEADERS
+        http.client._MAXHEADERS = 1000
+
+        story = Section(
+            title=soup.find('h1', property='name').string.strip(),
+            author=soup.find('meta', property='books:author').get('content').strip(),
+            url=soup.find('meta', property='og:url').get('content').strip()
+        )
+
+        for chapter in soup.select('#chapters tbody tr[data-url]'):
+            chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))
+
+            updated = datetime.datetime.fromtimestamp(
+                int(chapter.find('time').get('unixtime')),
+            )
+
+            story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated))
+
+        http.client._MAXHEADERS = original_maxheaders
+
+        return story
+
+    def _chapter(self, url):
+        logger.info("Extracting chapter @ %s", url)
+        soup = self._soup(url)
+        content = soup.find('div', class_='chapter-content')
+
+        # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well.
+        author_note = soup.find('div', class_='author-note-portlet')
+
+        return (author_note and (author_note.prettify() + '