# leech/sites/ao3.py

#!/usr/bin/python

import logging
import datetime
import re
import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter, SiteException

logger = logging.getLogger(__name__)


@register
class ArchiveOfOurOwn(Site):
    """Archive of Our Own: it has its own epub export, but the formatting is awful"""

    @staticmethod
    def matches(url):
        """Return the canonical work URL (with a trailing slash) for an AO3 work URL.

        Returns None when `url` is not an AO3 work URL.
        """
        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
        match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/works/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def login(self, login_details):
        """Log the session in to AO3.

        `login_details` is indexed as (username, password). The outcome is
        only logged; nothing is returned or raised on failure.
        """
        # The login round-trip must never be served from the requests cache.
        with requests_cache.disabled():
            login = self.session.get('https://archiveofourown.org/users/login')
            soup = BeautifulSoup(login.text, 'html5lib')
            # Start from the form's own fields and fill in the credentials.
            post, action, _method = self._form_data(soup.find(id='new_user'))
            post['user[login]'] = login_details[0]
            post['user[password]'] = login_details[1]
            # I feel the session *should* handle this cookies bit for me. But
            # it doesn't. And I don't know why.
            result = self.session.post(
                self._join_url(login.url, action),
                data=post, cookies=login.cookies
            )
            # NOTE(review): `result.ok` only reflects the HTTP status; confirm
            # that AO3 answers bad credentials with a non-2xx status.
            if result.ok:
                logger.info("Logged in as %s", login_details[0])
            else:
                logger.error("Failed to log in as %s", login_details[0])

    def extract(self, url):
        """Extract the work that `url` points at and return it as a Section.

        Raises SiteException if `url` is not an AO3 work URL.
        """
        match = re.match(r'^https?://(?:www\.)?archiveofourown\.org/works/(\d+)/?.*', url)
        if not match:
            # Fail with a readable error instead of an AttributeError on None.
            raise SiteException(f"Not an Archive of Our Own work URL: {url}")
        return self._extract_work(match.group(1))

    def _extract_work(self, workid):
        """Fetch the work with id `workid` and return it as a Section of Chapters.

        Chapter text comes from the "full work" page; chapter titles and dates
        come from the work's navigation page. The two are paired by position.

        Raises SiteException if the full-work page has no story text.
        """
        # Fetch the full work
        url = f'http://archiveofourown.org/works/{workid}?view_adult=true&view_full_work=true'
        logger.info("Extracting full work @ %s", url)
        soup, base = self._soup(url)

        if not soup.find(id='workskin'):
            raise SiteException("Can't find the story text; you may need to log in or flush the cache")

        story = Section(
            title=soup.select('#workskin > .preface .title')[0].text.strip(),
            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
            url=f'http://archiveofourown.org/works/{workid}',
            tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')]
        )

        # Fetch the chapter list as well because it contains info that's not in the full work.
        # _soup returns a (soup, base) pair, so unpack it rather than keeping the tuple.
        nav_soup, _nav_base = self._soup(f'https://archiveofourown.org/works/{workid}/navigate')
        chapters = soup.select('#chapters > div')
        if len(chapters) == 1:
            # in a single-chapter story the #chapters div is actually the chapter
            chapters = [soup.find(id='chapters').parent]

        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
            link = chapter.find('a')
            logger.info("Extracting chapter %s", link.string)

            # The navigation page renders the date wrapped in parentheses.
            updated = datetime.datetime.strptime(
                chapter.find('span', class_='datetime').string,
                "(%Y-%m-%d)"
            )

            # The navigation page may list more chapters than the full-work
            # page contains; skip those instead of raising IndexError.
            if index >= len(chapters):
                logger.warning("Couldn't find chapter %s in full work", index + 1)
                continue

            story.add(Chapter(
                title=link.string,
                contents=self._chapter(chapters[index]),
                date=updated
            ))

        return story

    def _chapter(self, soup):
        """Return the HTML for one chapter: its body followed by any end notes.

        `soup` is the chapter's element from the full-work page.

        Raises SiteException if the chapter has no `div[role=article]` body.
        """
        content = soup.find('div', role='article')
        if content is None:
            raise SiteException("Can't find the chapter text in the full work")

        # Drop the screen-reader "landmark" headings from the body.
        for landmark in content.find_all(class_='landmark'):
            landmark.decompose()

        # TODO: Maybe these should be footnotes instead?
        end_notes = soup.select('#chapters .end.notes')
        notes = end_notes[0] if end_notes else None
        if notes is not None:
            for landmark in notes.find_all(class_='landmark'):
                landmark.decompose()

        self._clean(content)

        notes_html = notes.prettify() if notes is not None else ''
        return content.prettify() + notes_html


@register
class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
    """Archive of Our Own series: every work in the series becomes a nested section."""

    # NOTE(review): presumably makes the series handler share the work
    # handler's identity (e.g. for site options) -- confirm against `Site`.
    _key = "ArchiveOfOurOwn"

    @staticmethod
    def matches(url):
        """Return the canonical series URL (with a trailing slash) for an AO3 series URL.

        Returns None when `url` is not an AO3 series URL.
        """
        # e.g. http://archiveofourown.org/series/5683105/
        # Accept the optional "www." prefix, the same as the work URL pattern does.
        match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/series/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def extract(self, url):
        """Extract every work in the series at `url` and return them as one Section.

        Each work is fetched with `_extract_work` and added as a sub-section.

        Raises SiteException if `url` is not an AO3 series URL.
        """
        match = re.match(r'^https?://(?:www\.)?archiveofourown\.org/series/(\d+)/?.*', url)
        if not match:
            # Fail with a readable error instead of an AttributeError on None.
            raise SiteException(f"Not an Archive of Our Own series URL: {url}")
        seriesid = match.group(1)

        soup, base = self._soup(f'http://archiveofourown.org/series/{seriesid}?view_adult=true')

        story = Section(
            title=soup.select('#main h2.heading')[0].text.strip(),
            # .text.strip() matches how the work extractor reads the author and,
            # unlike .string, is not None when the link has nested markup.
            author=soup.select('#main dl.series.meta a[rel="author"]')[0].text.strip(),
            url=f'http://archiveofourown.org/series/{seriesid}'
        )

        for work in soup.select('#main ul.series li.work'):
            # Each list item's id has the form "work_<id>".
            workid = work.get('id').replace('work_', '')
            substory = self._extract_work(workid)

            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
            story.add(substory)

        return story