
Merge branch 'master' into clickify

Will Oursler, 2018-04-13 17:52:37 -04:00
commit ecebf1de58
12 changed files with 210 additions and 91 deletions


@@ -43,6 +43,8 @@ Supports
* ArchiveOfOurOwn
* Yes, it has its own built-in EPUB export, but the formatting is horrible
* Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably
* RoyalRoad
* Fiction.live (Anonkun)
* DeviantArt galleries/collections
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)

examples/sagaofsoul.json (new file, 8 lines added)

@@ -0,0 +1,8 @@
{
"url": "http://www.sagaofsoul.com/story.html",
"title": "Saga of Soul",
"author": "Ouri Maler",
"chapter_selector": "#mainbody li a",
"content_selector": "#mainbody",
"filter_selector": "script, noscript"
}
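This definition drives the "completely arbitrary sites" support mentioned in the README diff above: chapter_selector finds the links to individual chapters, content_selector picks each chapter's body, and filter_selector names elements to strip out. A minimal sketch of how such a definition could be applied with requests and BeautifulSoup (fetch_chapters and the dict-style access are illustrative only; the real handler wraps the JSON in a definition object, as the arbitrary-site diff further down shows):

import urllib.parse
import requests
from bs4 import BeautifulSoup

def fetch_chapters(definition):
    # Hypothetical helper, not part of leech: follow chapter_selector links
    # and return cleaned chapter bodies.
    index = BeautifulSoup(requests.get(definition["url"]).text, "html5lib")
    chapters = []
    for link in index.select(definition["chapter_selector"]):
        chapter_url = urllib.parse.urljoin(definition["url"], link.get("href"))
        soup = BeautifulSoup(requests.get(chapter_url).text, "html5lib")
        content = soup.select(definition["content_selector"])[0]
        # filter_selector ("script, noscript" above) strips unwanted nodes
        for node in content.select(definition["filter_selector"]):
            node.decompose()
        chapters.append(content.prettify())
    return chapters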

leech.py (143 changed lines)

@@ -6,6 +6,7 @@ from click_default_group import DefaultGroup
import requests
import requests_cache
import http.cookiejar
import logging
import json
import sites
@@ -14,77 +15,69 @@ import ebook
__version__ = 2
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
logger = logging.getLogger(__name__)
def uses_session(command):
"""Decorator for click commands that need a session."""
@click.option('--cache/--no-cache', default=True)
def wrapper(cache, **kwargs):
if cache:
session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
else:
session = requests.Session()
def configure_logging(verbose):
if verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(
level=logging.INFO,
format="[%(name)s] %(message)s"
)
lwp_cookiejar = http.cookiejar.LWPCookieJar()
try:
lwp_cookiejar.load('leech.cookies', ignore_discard=True)
except Exception as e:
pass
session.cookies = lwp_cookiejar
session.headers.update({
'User-agent': USER_AGENT
})
return command(session=session, **kwargs)
wrapper.__name__ = command.__name__
return wrapper
def create_session(cache):
if cache:
session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
else:
session = requests.Session()
lwp_cookiejar = http.cookiejar.LWPCookieJar()
try:
lwp_cookiejar.load('leech.cookies', ignore_discard=True)
except Exception as e:
pass
session.cookies = lwp_cookiejar
session.headers.update({
'User-agent': USER_AGENT
})
return session
def uses_story(command):
"""Decorator for click commands that need a story."""
@click.argument('url')
@click.option(
'--site-options',
default='{}',
help='JSON object encoding any site specific option.'
def open_story(url, session, site_options):
site, url = sites.get(url)
if not site:
raise Exception("No site handler found")
default_site_options = site.get_default_options()
with open('leech.json') as store_file:
store = json.load(store_file)
login = store.get('logins', {}).get(site.__name__, False)
configured_site_options = store.get('site_options', {}).get(site.__name__, {})
overridden_site_options = json.loads(site_options)
# The final options dictionary is computed by layering the default, configured,
# and overridden options together in that order.
options = dict(
list(default_site_options.items()) +
list(configured_site_options.items()) +
list(overridden_site_options.items())
)
@uses_session
def wrapper(url, session, site_options, **kwargs):
site, url = sites.get(url)
if not site:
raise Exception("No site handler found")
default_site_options = site.get_default_options()
handler = site(
session,
options=options
)
with open('leech.json') as store_file:
store = json.load(store_file)
login = store.get('logins', {}).get(site.__name__, False)
configured_site_options = store.get('site_options', {}).get(site.__name__, {})
overridden_site_options = json.loads(site_options)
# The final options dictionary is computed by layering the default, configured,
# and overridden options together in that order.
options = dict(
list(default_site_options.items()) +
list(configured_site_options.items()) +
list(overridden_site_options.items())
)
handler = site(
session,
options=options
)
if login:
handler.login(login)
story = handler.extract(url)
if not story:
raise Exception("Couldn't extract story")
command(story=story, **kwargs)
wrapper.__name__ = command.__name__
return wrapper
if login:
handler.login(login)
story = handler.extract(url)
if not story:
raise Exception("Couldn't extract story")
return story
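The "layering" comment in open_story above is worth unpacking: building a dict from the concatenated items() lists means later entries win on duplicate keys, so the per-invocation --site-options beat leech.json, which beats the site handler's defaults. A small illustration (the option names here are made up):

default_site_options = {'offline': False, 'spoilers': 'include'}   # handler defaults
configured_site_options = {'spoilers': 'skip'}                      # from leech.json
overridden_site_options = {'offline': True}                         # from --site-options
options = dict(
    list(default_site_options.items()) +
    list(configured_site_options.items()) +
    list(overridden_site_options.items())
)
assert options == {'offline': True, 'spoilers': 'skip'}

On Python 3.5+ the same later-wins merge could also be written as {**default_site_options, **configured_site_options, **overridden_site_options}.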
@click.group(cls=DefaultGroup, default='download', default_if_no_args=True)
def cli():
@@ -93,19 +86,31 @@ def cli():
@cli.command()
def flush():
""""Flushes the contents of the cache."""
@click.option('--verbose', '-v', is_flag=True, help="verbose output")
def flush(verbose):
"""Flushes the contents of the cache."""
configure_logging(verbose)
requests_cache.install_cache('leech')
requests_cache.clear()
print("Flushed cache")
logger.info("Flushed cache")
@cli.command()
@uses_story
def download(story):
@click.argument('url')
@click.option(
'--site-options',
default='{}',
help='JSON object encoding any site specific option.'
)
@click.option('--cache/--no-cache', default=True)
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
def download(url, site_options, cache, verbose):
"""Downloads a story and saves it on disk as a ebpub ebook."""
configure_logging(verbose)
session = create_session(cache)
story = open_story(url, session, site_options)
filename = ebook.generate_epub(story)
print("File created:", filename)
logger.info("File created: " + filename)
if __name__ == '__main__':
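With the uses_session/uses_story decorators replaced by create_session and open_story, download now declares its own url argument plus --site-options, --cache/--no-cache, and --verbose options. One quick way to exercise the new interface in-process is click's test runner; a sketch, assuming leech.py is importable as leech (the URL is just the example that appears in the fanfiction.net handler further down):

from click.testing import CliRunner
from leech import cli   # assumes leech.py is on the import path

runner = CliRunner()
result = runner.invoke(cli, [
    'download',
    'https://www.fanfiction.net/s/4109686/3/Taking-Sights',
    '--no-cache',
    '--verbose',
    '--site-options', '{}',
])
print(result.exit_code)
print(result.output)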


@@ -3,9 +3,12 @@ import glob
import os
import uuid
import time
import logging
import attr
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
_sites = []
@@ -97,9 +100,9 @@ class Site:
if not page:
if retry and retry > 0:
delay = retry_delay
if page.headers['Retry-After']:
if 'Retry-After' in page.headers:
delay = int(page.headers['Retry-After'])
print("Load failed: waiting {}s to retry ({})".format(delay, page))
logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
time.sleep(delay)
return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
raise SiteException("Couldn't fetch", url)
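The change from page.headers['Retry-After'] to 'Retry-After' in page.headers matters because requests exposes headers as a dict-like object: subscripting a header the server never sent raises KeyError, which is presumably how the old check blew up on failed loads that carried no Retry-After header. A small sketch of the safe pattern (retry_delay_for and its default are illustrative, not leech code):

def retry_delay_for(page, default_delay=30):
    # Hypothetical helper: prefer the server's Retry-After when present,
    # otherwise fall back to a caller-supplied delay.
    if 'Retry-After' in page.headers:
        return int(page.headers['Retry-After'])
    return default_delay

page.headers.get('Retry-After') would be the other idiomatic way to write the same check.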


@@ -1,9 +1,15 @@
#!/usr/bin/python
import logging
import datetime
import re
import urllib
import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class ArchiveOfOurOwn(Site):
@@ -15,12 +21,32 @@ class ArchiveOfOurOwn(Site):
if match:
return match.group(1) + '/'
def login(self, login_details):
with requests_cache.disabled():
login = self.session.get('http://archiveofourown.org/login')
soup = BeautifulSoup(login.text, 'html5lib')
form = soup.find(id='new_user_session')
post = {
'user_session[login]': login_details[0],
'user_session[password]': login_details[1],
# standard fields:
'user_session[remember_me]': '1',
'utf8': form.find(attrs={'name': 'utf8'})['value'],
'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
'commit': 'Log In',
}
# I feel the session *should* handle this cookies bit for me. But
# it doesn't. And I don't know why.
self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies)
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
return self._extract_work(workid)
def _extract_work(self, workid):
soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
soup = self._soup(nav_url)
metadata = soup.select('#main h2.heading a')
story = Section(
@@ -31,9 +57,7 @@ class ArchiveOfOurOwn(Site):
for chapter in soup.select('#main ol[role="navigation"] li'):
link = chapter.find('a')
chapter_url = str(link.get('href'))
if chapter_url.startswith('/works/'):
chapter_url = 'http://archiveofourown.org' + chapter_url
chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
chapter_url += '?view_adult=true'
updated = datetime.datetime.strptime(
@@ -46,7 +70,7 @@ class ArchiveOfOurOwn(Site):
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.find('div', role='article')
@@ -79,7 +103,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
story = Section(
title=soup.select('#main h2.heading')[0].string,
author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
author=soup.select('#main dl.series.meta a[rel="author"]')[0].string,
url='http://archiveofourown.org/series/{}'.format(seriesid)
)
for work in soup.select('#main ul.series li.work'):


@@ -1,5 +1,6 @@
#!/usr/bin/python
import logging
import attr
import datetime
import json
@@ -7,6 +8,8 @@ import os.path
import urllib
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
"""
Example JSON:
{
@@ -75,7 +78,7 @@ class Arbitrary(Site):
def _chapter(self, url, definition):
# TODO: refactor so this can meaningfully handle multiple matches on content_selector.
# Probably by changing it so that this returns a Chapter / Section.
print("Extracting chapter from", url)
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.select(definition.content_selector)[0]


@@ -1,10 +1,13 @@
#!/usr/bin/python
import logging
import re
from . import register, Section
from .stash import Stash
logger = logging.getLogger(__name__)
@register
class DeviantArt(Stash):
@@ -41,6 +44,6 @@ class DeviantArt(Stash):
if thumb['href'] is not '#':
story.add(self._chapter(thumb['href']))
except Exception as e:
print(e)
logger.exception("Couldn't extract chapters from thumbs")
return story


@@ -1,9 +1,12 @@
#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
@register
class FanFictionNet(Site):
@@ -11,9 +14,9 @@ class FanFictionNet(Site):
@staticmethod
def matches(url):
# e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
if match:
return match.group(1) + '/'
return 'https://www.fanfiction.net/s/' + match.group(1) + '/'
def extract(self, url):
soup = self._soup(url)
@@ -59,7 +62,7 @@ class FanFictionNet(Site):
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Fetching chapter @ %s", url)
soup = self._soup(url)
content = soup.find(id="content_wrapper_inner")
@@ -74,7 +77,7 @@ class FanFictionNet(Site):
for tag in text.find_all(True):
tag.attrs = None
except Exception as e:
print("Trouble cleaning attributes", e)
logger.exception("Trouble cleaning attributes")
return text.prettify()
@@ -84,6 +87,6 @@ class FictionPress(FanFictionNet):
@staticmethod
def matches(url):
# e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning
match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url)
match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url)
if match:
return match.group(1) + '/'
return 'https://www.fictionpress.com/s/' + match.group(1) + '/'
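The effect of the loosened patterns is easiest to see on a mobile URL: both www. and m. hosts now match, and the handler rebuilds a canonical desktop URL from the captured story id. A quick check using the example URL from the fanfiction.net comment above:

import re

url = 'https://m.fanfiction.net/s/4109686/3/Taking-Sights'
match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url)
if match:
    print('https://www.fanfiction.net/s/' + match.group(1) + '/')
    # -> https://www.fanfiction.net/s/4109686/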


@@ -1,10 +1,13 @@
#!/usr/bin/python
import logging
import itertools
import datetime
import re
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class FictionLive(Site):
@@ -40,7 +43,7 @@ class FictionLive(Site):
# https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
# i.e. format is [current timestamp] / [next timestamp - 1]
chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
print("Extracting chapter from", chapter_url)
logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url)
data = self.session.get(chapter_url).json()
html = []

sites/royalroad.py (new file, 58 lines added)

@@ -0,0 +1,58 @@
#!/usr/bin/python
import http.client
import logging
import datetime
import re
import urllib
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class RoyalRoad(Site):
"""Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
@staticmethod
def matches(url):
# e.g. https://royalroadl.com/fiction/6752/lament-of-the-fallen
match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url)
if match:
return match.group(1) + '/'
def extract(self, url):
workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1)
soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid))
# should have gotten redirected, for a valid title
original_maxheaders = http.client._MAXHEADERS
http.client._MAXHEADERS = 1000
story = Section(
title=soup.find('h1', property='name').string.strip(),
author=soup.find('meta', property='books:author').get('content').strip(),
url=soup.find('meta', property='og:url').get('content').strip()
)
for chapter in soup.select('#chapters tbody tr[data-url]'):
chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))
updated = datetime.datetime.fromtimestamp(
int(chapter.find('time').get('unixtime')),
)
story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated))
http.client._MAXHEADERS = original_maxheaders
return story
def _chapter(self, url):
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.find('div', class_='chapter-content')
# TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well.
author_note = soup.find('div', class_='author-note-portlet')
return (author_note and (author_note.prettify() + '<hr/>') or '') + content.prettify()
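Two small notes on the new RoyalRoad handler. The http.client._MAXHEADERS bump presumably works around responses carrying more than the stdlib's default limit of 100 headers; since extract only restores the original value after a successful parse, one possible hardening (a sketch, not what the commit does; build_story is a placeholder) is to wrap the override in try/finally so the limit is restored even if extraction raises:

import http.client

original_maxheaders = http.client._MAXHEADERS
http.client._MAXHEADERS = 1000
try:
    story = build_story()   # placeholder for the chapter loop above
finally:
    http.client._MAXHEADERS = original_maxheaders

The `author_note and (...) or ''` expression in _chapter is the old-style conditional: it is equivalent to `(author_note.prettify() + '<hr/>') if author_note else ''`, prepended to the chapter content.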


@@ -1,9 +1,12 @@
#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
@register
class Stash(Site):
@@ -35,12 +38,12 @@ class Stash(Site):
if thumb['href'] is not '#':
story.add(self._chapter(thumb['href']))
except Exception as e:
print(e)
logger.exception("Couldn't extract chapters from thumbs")
return story
def _chapter(self, url):
print("Extracting chapter from", url)
logger.info("Fetching chapter @ %s", url)
soup = self._soup(url)
content = soup.find(class_="journal-wrapper")


@@ -2,8 +2,11 @@
import datetime
import re
import logging
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
class XenForo(Site):
"""XenForo is forum software that powers a number of fiction-related forums."""
@@ -32,7 +35,7 @@ class XenForo(Site):
'password': login_details[1],
}
self.session.post('https://%s/login/login' % self.domain, data=post)
print("Logged in as", login_details[0])
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
soup = self._soup(url)
@@ -56,7 +59,7 @@ class XenForo(Site):
if not href.startswith('http'):
href = base + href
title = str(mark.string).strip()
print("Fetching chapter", title, href)
logger.info("Fetching chapter \"%s\" @ %s", title, href)
chapter = Chapter(title=title, contents="")
contents, post_date = self._chapter(href, idx)
chapter.contents = contents
@@ -72,7 +75,7 @@ class XenForo(Site):
try:
return self._chapter_list_threadmarks(url)
except SiteException as e:
print("Tried threadmarks", e.args)
logger.debug("Tried threadmarks (%r)", e.args)
return self._chapter_list_index(url)
def _chapter_list_threadmarks(self, url):