From 008eb8e63d2584136c84d60ba2b05a9574027d0b Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Sun, 3 Apr 2016 21:30:29 -0500
Subject: [PATCH] Support ArchiveOfOurOwn

---
 sites/__init__.py |  2 +-
 sites/ao3.py      | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 sites/ao3.py

diff --git a/sites/__init__.py b/sites/__init__.py
index 5cf4ddb..b9c5c67 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -93,4 +93,4 @@ def get(url):
     return site_class
 
 # And now, the things that will use this:
-from . import xenforo, fanfictionnet, deviantart, stash
+from . import xenforo, fanfictionnet, deviantart, stash, ao3
diff --git a/sites/ao3.py b/sites/ao3.py
new file mode 100644
index 0000000..93d1a2a
--- /dev/null
+++ b/sites/ao3.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+
+import datetime
+import re
+from . import register, Site, SiteException
+
+
+@register
+class ArchiveOfOurOwn(Site):
+    """Archive of Our Own: it has its own epub export, but the formatting is awful"""
+    @staticmethod
+    def matches(url):
+        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
+        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)
+
+    def extract(self, url):
+        workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
+
+        soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
+
+        metadata = soup.select('#main h2.heading a')
+        story = {
+            'title': metadata[0].string,
+            'author': metadata[1].string,
+        }
+
+        chapters = []
+        for chapter in soup.select('#main ol[role="navigation"] li'):
+            link = chapter.find('a')
+            chapter_url = str(link.get('href'))
+            if chapter_url.startswith('/works/'):
+                chapter_url = 'http://archiveofourown.org' + chapter_url
+            chapter_url += '?view_adult=true'
+
+            updated = datetime.datetime.strptime(
+                chapter.find('span', class_='datetime').string,
+                "(%Y-%m-%d)"
+            )
+
+            chapters.append((link.string, self._chapter(chapter_url), updated))
+
+        if not chapters:
+            raise SiteException("No content")
+
+        story['chapters'] = chapters
+
+        return story
+
+    def _chapter(self, url):
+        print("Extracting chapter from", url)
+        soup = self._soup(url)
+        content = soup.find('div', role='article')
+
+        for landmark in content.find_all(class_='landmark'):
+            landmark.decompose()
+
+        return content.prettify()
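
Note (not part of the patch): a minimal sketch of how the new handler is
reached through the get() dispatcher visible in the first hunk, which
returns the registered Site subclass whose matches() accepts the URL.
Constructing the site class with no arguments is an assumption here; the
real Site constructor may take a session or options object.

    import sites

    url = 'http://archiveofourown.org/works/5683105'
    site_class = sites.get(url)        # ArchiveOfOurOwn, chosen via matches()
    story = site_class().extract(url)  # assumed no-arg constructor
    # story is a dict with 'title', 'author', and 'chapters';
    # each chapter is a (title, html, updated-datetime) tuple
    for name, html, updated in story['chapters']:
        print(name, updated.date())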