#!/usr/bin/python

import datetime
import re
from . import register, Site, SiteException, Chapter


@register
class ArchiveOfOurOwn(Site):
    """Archive of Our Own: it has its own epub export, but the formatting is awful"""

    @staticmethod
    def matches(url):
        """Return a truthy match object when `url` points at an AO3 work."""
        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)

    def extract(self, url):
        """Extract the work that `url` points at.

        Assumes `url` has already been accepted by `matches`; the numeric
        work id is pulled out of the URL and handed to `_extract_work`.
        """
        workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
        return self._extract_work(workid)

    def _extract_work(self, workid):
        """Build the story dict for a single work id.

        Reads the work's "navigate" page (the chapter index) via
        `self._soup`, then extracts every chapter listed there.

        Returns a dict with the keys 'title', 'author' and 'chapters'
        (a list of `Chapter` objects, in index order).

        Raises `SiteException` if the title/author links are missing from
        the index page or if the index lists no chapters.
        """
        soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))

        # The heading is expected to hold two links: the title, then the author.
        metadata = soup.select('#main h2.heading a')
        if len(metadata) < 2:
            # Fail with a site-level error rather than a bare IndexError.
            raise SiteException("Couldn't find title and author for work {}".format(workid))
        story = {
            'title': metadata[0].string,
            'author': metadata[1].string,
        }

        chapters = []
        for chapter in soup.select('#main ol[role="navigation"] li'):
            link = chapter.find('a')
            chapter_url = str(link.get('href'))
            if chapter_url.startswith('/works/'):
                # Site-relative link: make it absolute before requesting it.
                chapter_url = 'http://archiveofourown.org' + chapter_url
            chapter_url += '?view_adult=true'

            # The index shows each chapter's date as "(YYYY-MM-DD)".
            updated = datetime.datetime.strptime(
                chapter.find('span', class_='datetime').string,
                "(%Y-%m-%d)"
            )

            chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))

        if not chapters:
            raise SiteException("No content")

        story['chapters'] = chapters

        return story

    def _chapter(self, url):
        """Return the prettified HTML of the chapter body found at `url`.

        Elements carrying the 'landmark' class are removed from the body
        before it is serialised.

        Raises `SiteException` if the page has no `div[role=article]`.
        """
        print("Extracting chapter from", url)
        soup = self._soup(url)
        content = soup.find('div', role='article')
        if content is None:
            # Without this guard the loop below fails with an opaque
            # AttributeError on None.
            raise SiteException("No chapter content found at {}".format(url))

        for landmark in content.find_all(class_='landmark'):
            landmark.decompose()

        return content.prettify()


@register
class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
    """An Archive of Our Own series, flattened into a single story."""

    @staticmethod
    def matches(url):
        """Return a truthy match object when `url` points at an AO3 series."""
        # e.g. http://archiveofourown.org/series/<series id>
        return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)

    def extract(self, url):
        """Extract every work in the series that `url` points at.

        Assumes `url` has already been accepted by `matches`. Each work
        listed on the series page is extracted with `_extract_work`, and
        its chapters are appended to one flat chapter list.

        Returns a dict with the keys 'title', 'author' and 'chapters'.

        Raises `SiteException` if the series title or author cannot be
        found, or if the series yields no chapters at all.
        """
        seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)

        soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))

        titles = soup.select('#main h2.heading')
        authors = soup.select('#main dl.series.meta a[rel="author"]')
        if not titles or not authors:
            # Fail with a site-level error rather than a bare IndexError.
            raise SiteException("Couldn't find title and author for series {}".format(seriesid))
        story = {
            'title': titles[0].string,
            'author': authors[0].string,
        }

        chapters = []
        for work in soup.select('#main ul.series li.work'):
            # Work list items carry ids of the form "work_<id>".
            workid = work.get('id').replace('work_', '')
            substory = self._extract_work(workid)

            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
            # In the meantime, append the story title to the chapter titles.
            chapters.extend((
                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
                for c in substory['chapters']
            ))

        if not chapters:
            # Same behaviour as the single-work path when nothing is found.
            raise SiteException("No content")

        story['chapters'] = chapters

        return story