Mirror of https://github.com/kemayo/leech, synced 2025-12-06 16:33:16 +01:00
Commit 008eb8e63d: Support ArchiveOfOurOwn
Parent: 05c98f28db
2 changed files with 58 additions and 1 deletion
@@ -93,4 +93,4 @@ def get(url):
     return site_class

 # And now, the things that will use this:
-from . import xenforo, fanfictionnet, deviantart, stash
+from . import xenforo, fanfictionnet, deviantart, stash, ao3
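The hunk above shows only the tail of get(url). For orientation, here is a minimal sketch of the registry this commit hooks into, assuming register() appends each decorated class to a module-level list that get() then probes via matches(); everything beyond the names register, get, and site_class visible in the diff is a guess, not part of the commit:

    _sites = []


    def register(site_class):
        # Class decorator: remember each Site subclass so get() can find it.
        _sites.append(site_class)
        return site_class


    def get(url):
        # Probe the registered handlers; the first whose matches() accepts
        # the URL is returned to the caller.
        site_class = None
        for candidate in _sites:
            if candidate.matches(url):
                site_class = candidate
                break
        if site_class is None:
            raise NotImplementedError("no handler for {}".format(url))
        return site_class

If the registry does work this way, adding ao3 to the import list is what makes the new site reachable: importing the module executes its @register decorator.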
sites/ao3.py (new file, 57 lines)

@@ -0,0 +1,57 @@
#!/usr/bin/python

import datetime
import re
from . import register, Site, SiteException


@register
class ArchiveOfOurOwn(Site):
    """Archive of Our Own: it has its own epub export, but the formatting is awful"""
    @staticmethod
    def matches(url):
        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)

    def extract(self, url):
        workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)

        soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))

        metadata = soup.select('#main h2.heading a')
        story = {
            'title': metadata[0].string,
            'author': metadata[1].string,
        }

        chapters = []
        for chapter in soup.select('#main ol[role="navigation"] li'):
            link = chapter.find('a')
            chapter_url = str(link.get('href'))
            if chapter_url.startswith('/works/'):
                chapter_url = 'http://archiveofourown.org' + chapter_url
            chapter_url += '?view_adult=true'

            updated = datetime.datetime.strptime(
                chapter.find('span', class_='datetime').string,
                "(%Y-%m-%d)"
            )

            chapters.append((link.string, self._chapter(chapter_url), updated))

        if not chapters:
            raise SiteException("No content")

        story['chapters'] = chapters

        return story

    def _chapter(self, url):
        print("Extracting chapter from", url)
        soup = self._soup(url)
        content = soup.find('div', role='article')

        for landmark in content.find_all(class_='landmark'):
            landmark.decompose()

        return content.prettify()
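A quick sanity check of the two regexes in matches() and extract(), using the example URL from the file's own comment (illustrative only; not part of the commit):

    import re

    URL = 'http://archiveofourown.org/works/5683105/chapters/13092007'

    # matches(): anchored test that this is an AO3 work URL.
    print(bool(re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', URL)))
    # -> True

    # extract(): the same pattern plus a capture group pulls out the work id.
    print(re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', URL).group(1))
    # -> 5683105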
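The "(%Y-%m-%d)" format string in extract() is easy to misread: strptime() treats the parentheses as literal characters, matching the parenthesised dates the navigate page evidently wraps around each chapter. A standalone check (the date value here is invented):

    import datetime

    # Literal parentheses in the format string consume the literal
    # parentheses around the date text.
    updated = datetime.datetime.strptime('(2016-01-15)', '(%Y-%m-%d)')
    print(updated)
    # -> 2016-01-15 00:00:00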
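_chapter() deletes AO3's "landmark" elements, the invisible headings such as "Chapter Text" that exist for screen readers, before prettifying the article div. A self-contained demo of that cleanup (requires beautifulsoup4; the HTML is a made-up miniature of an AO3 chapter page, and self._soup is assumed to return a comparable BeautifulSoup tree):

    from bs4 import BeautifulSoup

    html = '''
    <div role="article">
      <h3 class="landmark heading">Chapter Text</h3>
      <p>Actual chapter content.</p>
    </div>
    '''

    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', role='article')

    # Drop the screen-reader headings, exactly as _chapter() does.
    for landmark in content.find_all(class_='landmark'):
        landmark.decompose()

    print(content.prettify())
    # -> only the <div> and its <p> remain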