leech/sites/stash.py

#!/usr/bin/python

import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter

logger = logging.getLogger(__name__)


@register
class Stash(Site):
    """Site handler that turns a sta.sh stack page into a story.

    Each thumb link found in the stack's folder stream is fetched and
    converted into one chapter of the resulting section.
    """

    @staticmethod
    def matches(url):
        """Return the canonical stack URL for *url*, or None if it is not one.

        Only URLs whose path starts with ``2`` are accepted. The result is
        the scheme, host and first path segment followed by a single ``/``;
        anything after that segment (sub-paths, query string, fragment) is
        dropped.
        """
        # Need a stack page. Capture just the first path segment: a greedy
        # ``.+`` would swallow trailing slashes and query strings, yielding
        # results such as ``.../2abc//``.
        match = re.match(r'^(https?://sta\.sh/2[^/?#]+)', url)
        if match:
            return match.group(1) + '/'
        return None

    def extract(self, url):
        """Fetch the stack at *url* and return it as a ``Section``.

        Returns None when the page has no ``stash-body`` element or when
        the folder stream contains no thumbs. A chapter that fails to load
        is logged and skipped so that one bad entry does not abort the
        whole stack.
        """
        soup, _ = self._soup(url)
        content = soup.find(id="stash-body")
        if not content:
            return

        title = str(soup.find(class_="stash-folder-name").h2.string)

        # The logo text is presumably a possessive such as "Someone's";
        # remove only that suffix. ``rstrip("'s")`` strips a *set* of
        # characters and would mangle names ending in "s" (Chris's -> Chri).
        author = str(soup.find('span', class_="oh-stashlogo-name").string)
        if author.endswith("'s"):
            author = author[:-2]
        elif author.endswith("'"):
            author = author[:-1]

        story = Section(
            title=title,
            author=author,
            url=url
        )

        thumbs = content.select(".stash-folder-stream .thumb")
        if not thumbs:
            return
        for thumb in thumbs:
            # Deliberately broad: any failure for a single thumb (missing
            # href, fetch or parse error) is logged and the loop continues.
            try:
                if thumb['href'] != '#':
                    story.add(self._chapter(thumb['href']))
            except Exception:
                logger.exception("Couldn't extract chapters from thumbs")

        self._finalize(story)

        return story

    def _chapter(self, url):
        """Fetch a single entry at *url* and return it as a ``Chapter``.

        Raises:
            SiteException: if the page has no ``journal-wrapper`` element,
                or if the attributes of the text body cannot be cleared
                (including when the text element itself is missing).
        """
        logger.info("Fetching chapter @ %s", url)
        soup, base = self._soup(url)

        content = soup.find(class_="journal-wrapper")
        if not content:
            raise SiteException("No content")

        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)

        text = content.find(class_="text")

        # clean up some invalid xhtml attributes
        # TODO: be more selective about this somehow
        try:
            for tag in text.find_all(True):
                # Use an empty dict rather than None: attribute lookups
                # such as Tag.get() call methods on ``attrs`` and would
                # fail on None.
                tag.attrs = {}
        except Exception as e:
            raise SiteException("Trouble cleaning attributes", e) from e

        self._clean(text, base)

        return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

    def _date(self, soup):
        """Return the entry's timestamp as a naive local ``datetime``.

        Reads the ``ts`` attribute (parsed as integer epoch seconds) of the
        first ``span`` carrying one inside the ``dev-metainfo-details``
        div. Raises if that div or span is missing; when called through
        ``extract`` the error is logged and the chapter is skipped.
        """
        details = soup.find('div', class_="dev-metainfo-details")
        maybe_date = details.find('span', ts=True)
        return datetime.datetime.fromtimestamp(int(maybe_date['ts']))