#!/usr/bin/python

import logging

import datetime
import json
import os.path
import re
import urllib.parse

import attr

from . import register, Site, Section, Chapter, Image

logger = logging.getLogger(__name__)

"""
Example JSON:
{
    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
    "title": "A Practical Guide To Evil: Book 1",
    "author": "erraticerrata",
    "chapter_selector": "#main .entry-content > ul > li > a",
    "content_selector": "#main .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style",
    "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
"""


@attr.s
class SiteDefinition:
    url = attr.ib()
    title = attr.ib()
    author = attr.ib()
    content_selector = attr.ib()
    # If present, find something within `content` to use as the chapter title;
    # if not found, the text of the link pointing to the chapter is used.
    content_title_selector = attr.ib(default=False)
    # If present, find a specific element within `content` to be the chapter text.
    content_text_selector = attr.ib(default=False)
    # If present, look for chapters linked from `url`; if absent, assume `url`
    # itself points to a chapter.
    chapter_selector = attr.ib(default=False)
    # If present, use to find a link to the next content page (only consulted
    # when chapter_selector is absent).
    next_selector = attr.ib(default=False)
    # If present, filter out any content matching this selector.
    filter_selector = attr.ib(default=False)
    cover_url = attr.ib(default='')

    # If present, also download the images it matches and embed them in the epub.
    image_selector = attr.ib(default=False)
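# Note: extract() below builds this as SiteDefinition(**json.load(f)), so any
# key in the JSON file that isn't an attribute declared above raises a
# TypeError. A quick illustration (placeholder values, not a real definition):
#
#     SiteDefinition(url='x', title='t', author='a', content_selector='#main',
#                    bogus_key=1)  # TypeError: unexpected keyword argument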


@register
class Arbitrary(Site):
    """A way to describe an arbitrary site for a one-off fetch
    """
    @staticmethod
    def matches(url):
        # e.g. practical1.json
        if url.endswith('.json') and os.path.isfile(url):
            return url
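    # For illustration: matches('practical1.json') returns the path when that
    # file exists on disk, while matches('https://example.com/') falls through
    # and returns None, letting the other registered Site classes try the URL.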

    def extract(self, url):
        with open(url) as definition_file:
            definition = SiteDefinition(**json.load(definition_file))

        story = Section(
            title=definition.title,
            author=definition.author,
            url=url,
            cover_url=definition.cover_url
        )

        if definition.chapter_selector:
            soup, base = self._soup(definition.url)
            for chapter_link in soup.select(definition.chapter_selector):
                chapter_url = str(chapter_link.get('href'))
                # Resolve the link against any <base> tag first, then against
                # the page URL; an already-absolute URL passes through unchanged.
                if base:
                    chapter_url = self._join_url(base, chapter_url)
                chapter_url = self._join_url(definition.url, chapter_url)
                for chapter in self._chapter(chapter_url, definition, title=chapter_link.string):
                    story.add(chapter)
        else:
            # Set of already-processed URLs, stored to detect loops.
            found_content_urls = set()
            content_url = definition.url
            while content_url and content_url not in found_content_urls:
                found_content_urls.add(content_url)
                for chapter in self._chapter(content_url, definition):
                    story.add(chapter)
                if definition.next_selector:
                    soup, base = self._soup(content_url)
                    next_link = soup.select(definition.next_selector)
                    if next_link:
                        next_link_url = str(next_link[0].get('href'))
                        if base:
                            next_link_url = self._join_url(base, next_link_url)
                        content_url = self._join_url(content_url, next_link_url)
                    else:
                        content_url = False
                else:
                    content_url = False

        return story
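    # A definition for the chapterless mode might instead look like this
    # (illustrative selectors, not taken from a real site):
    #
    #     {
    #         "url": "https://example.com/story/chapter-1",
    #         "title": "Example Story",
    #         "author": "someone",
    #         "content_selector": ".entry-content",
    #         "next_selector": "a[rel='next']"
    #     }
    #
    # extract() then follows next-page links until they run out or loop back
    # to a page it has already seen.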

    def _chapter(self, url, definition, title=False):
        logger.info("Extracting chapter @ %s", url)
        soup, base = self._soup(url)

        chapters = []

        if not soup.select(definition.content_selector):
            return chapters

        # Clean up a few things which will definitely break epubs:
        # TODO: expand this greatly, or make it configurable
        for namespaced in soup.find_all(re.compile(r'[a-z]+:[a-z]+')):
            # Namespaced elements (e.g. <o:p> from pasted Word markup) are
            # going to cause validation errors
            namespaced.decompose()

        for content in soup.select(definition.content_selector):
            if definition.filter_selector:
                for filtered in content.select(definition.filter_selector):
                    filtered.decompose()

            if definition.content_title_selector:
                title_element = content.select(definition.content_title_selector)
                if title_element:
                    title = title_element[0].get_text().strip()

            if definition.content_text_selector:
                # TODO: multiple text elements?
                content = content.select(definition.content_text_selector)[0]

            # TODO: consider `'\n'.join(map(str, content.contents))`
            content.name = 'div'

            self._clean(content)

            images = []
            if definition.image_selector:
                images = self.load_images(content, definition.image_selector)

            chapters.append(Chapter(
                title=title,
                contents=content.prettify(),
                # TODO: better date detection
                date=datetime.datetime.now(),
                images=images
            ))

        return chapters

    def load_images(self, content, selector):
        images = []
        for image in content.select(selector):
            if not image.has_attr('src'):
                continue

            image_url = image['src']
            url = urllib.parse.urlparse(image_url)
            local_path = 'chapter_images/' + url.path.strip('/')

            image_res = self.session.get(image_url)
            content_type = image_res.headers['Content-Type']
            image_data = image_res.content

            images.append(Image(
                path=local_path,
                contents=image_data,
                content_type=content_type
            ))
            # Point the tag at the local copy, and drop srcset so the epub
            # doesn't keep references to remote files.
            image['src'] = '../' + local_path
            if image.has_attr('srcset'):
                del image['srcset']

        return images
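    # Example of the rewrite load_images performs (URL illustrative): an
    # <img src="https://example.com/images/map.png"> is downloaded, stored as
    # Image(path='chapter_images/images/map.png', ...), and the tag is
    # rewritten to <img src="../chapter_images/images/map.png">.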