import glob
import os
import argparse
import uuid
import time
import logging

import attr
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

_sites = []


def _default_uuid_string(*args):
    return str(uuid.uuid4())


@attr.s
class Chapter:
    title = attr.ib()
    contents = attr.ib()
    date = attr.ib(default=False)
    id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)


@attr.s
class Section:
    title = attr.ib()
    author = attr.ib()
    url = attr.ib()
    id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))

    def __iter__(self):
        return self.contents.__iter__()

    def __getitem__(self, index):
        return self.contents.__getitem__(index)

    def __setitem__(self, index, value):
        return self.contents.__setitem__(index, value)

    def __len__(self):
        return len(self.contents)

    def add(self, value, index=None):
        if index is not None:
            self.contents.insert(index, value)
        else:
            self.contents.append(value)

    def dates(self):
        # Contents may be Chapters or nested Sections; recurse into anything
        # iterable and yield the dates of leaf chapters that have one.
        for chapter in self.contents:
            if hasattr(chapter, '__iter__'):
                yield from chapter.dates()
            elif chapter.date:
                yield chapter.date

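
# A quick sketch of how these containers nest (hypothetical values, not part
# of the original module): a Section holds Chapters and/or nested Sections,
# and Section.dates() walks the whole tree.
#
#     story = Section(title="An Example Story", author="A. Nonymous",
#                     url="https://example.com/story/1")
#     story.add(Chapter(title="Chapter 1", contents="<p>...</p>",
#                       date=datetime.datetime(2020, 1, 1)))
#     most_recent = max(story.dates())
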
@attr.s
class Site:
    """A Site handles checking whether a URL might represent a site, and then
    extracting the content of a story from said site.
    """
    session = attr.ib()
    args = attr.ib()
    footnotes = attr.ib(default=attr.Factory(list), init=False)

    def __attrs_post_init__(self):
        self.options = self._parse_args(self.args)

    @staticmethod
    def matches(url):
        raise NotImplementedError()

    def extract(self, url):
        """Download a story from a given URL

        Args:
            url (string): A valid URL for this Site
        Returns:
            story (dict) containing keys:
                title (string)
                author (string)
                chapters (list): list of Chapters (attrs class, defined above)
        """
        raise NotImplementedError()
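
    # A sketch of the contract described in the docstring above, with
    # hypothetical selectors; it is not how any particular site module is
    # actually implemented:
    #
    #     def extract(self, url):
    #         soup = self._soup(url)
    #         return {
    #             'title': soup.find('h1').get_text(),
    #             'author': soup.find(class_='author').get_text(),
    #             'chapters': [Chapter(title=..., contents=...)],
    #         }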

    def login(self, login_details):
        raise NotImplementedError()

    def _parse_args(self, args):
        parser = argparse.ArgumentParser()
        self._add_arguments(parser)
        return parser.parse_args(args)

    def _add_arguments(self, parser):
        pass

    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
        page = self.session.get(url, **kw)
        if not page:
            # a falsy response (e.g. an HTTP error status) means the fetch failed
            if retry and retry > 0:
                delay = retry_delay
                if page.headers.get('Retry-After'):
                    delay = int(page.headers['Retry-After'])
                logger.warning("Load failed: waiting %s to retry (%s)", delay, page)
                time.sleep(delay)
                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
            raise SiteException("Couldn't fetch", url)
        return BeautifulSoup(page.text, method)
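
    # Hypothetical usage sketch (URL and selector are assumptions): site code
    # would normally fetch pages through this helper rather than hitting
    # self.session directly, so failed requests get the retry behaviour above.
    #
    #     soup = self._soup("https://example.com/story/1")
    #     title = soup.find('h1').get_text()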

    def _new_tag(self, *args, **kw):
        soup = BeautifulSoup("", 'html5lib')
        return soup.new_tag(*args, **kw)

    def _footnote(self, contents, chapterid):
        """Register a footnote and return a link to that footnote"""

        # TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.

        idx = len(self.footnotes) + 1

        # epub spec footnotes are all about epub:type on the footnote and the link
        # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
        contents.name = 'div'
        contents.attrs['id'] = "footnote{}".format(idx)
        contents.attrs['epub:type'] = 'rearnote'

        # a backlink is essential for Kindle to think of this as a footnote
        # otherwise it doesn't get the inline-popup treatment
        # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
        # section 3.9.10
        backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
        backlink.string = '^'
        contents.insert(0, backlink)

        self.footnotes.append(contents.prettify())

        # now build the link to the footnote to return, with appropriate
        # epub annotations.
        spoiler_link = self._new_tag('a')
        spoiler_link.attrs = {
            'id': 'noteback{}'.format(idx),
            'href': "footnotes.html#footnote{}".format(idx),
            'epub:type': 'noteref',
        }
        spoiler_link.string = str(idx)

        return spoiler_link

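
# Hypothetical usage sketch for Site._footnote above (element names are
# assumptions, not part of this module): a site implementation that finds an
# inline note while cleaning chapter HTML registers its contents and swaps in
# the returned link, e.g.
#
#     note = paragraph.find('span', class_='footnote')
#     note.replace_with(self._footnote(note, chapterid))
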

class SiteException(Exception):
    pass


def register(site_class):
    _sites.append(site_class)
    return site_class


def get(url):
    for site_class in _sites:
        match = site_class.matches(url)
        if match:
            return site_class, match
    raise NotImplementedError("Could not find a handler for " + url)


# And now, a particularly hacky take on a plugin system:
# Make an __all__ out of all the python files in this directory that don't
# start with __. Then import * them.

modules = glob.glob(os.path.join(os.path.dirname(__file__), "*.py"))
# glob returns full paths, so test the basename rather than the path itself
__all__ = [
    os.path.basename(f)[:-3] for f in modules
    if not os.path.basename(f).startswith("__")
]

from . import *  # noqa
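
# For reference, a minimal (hypothetical) site module dropped into this
# directory gets picked up by the glob/import machinery above and registers
# itself.  A sketch only; the module name, URL pattern, and body are made up:
#
#     # example.py, in this directory
#     import re
#     from . import register, Site
#
#     @register
#     class Example(Site):
#         @staticmethod
#         def matches(url):
#             # return something truthy for get() above, e.g. the matched URL
#             match = re.match(r'^https?://example\.com/story/\d+', url)
#             if match:
#                 return match.group(0)
#
#         def extract(self, url):
#             soup = self._soup(url)
#             ...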