From 7addf4c3d10a6d936770d3d82c9bf7a93e5577eb Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Wed, 28 Dec 2016 03:06:43 -0600
Subject: [PATCH] AO3: handle series, imperfectly

---
 sites/ao3.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/sites/ao3.py b/sites/ao3.py
index b66f801..5d74e96 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -15,7 +15,9 @@ class ArchiveOfOurOwn(Site):
 
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
+        return self._extract_work(workid)
 
+    def _extract_work(self, workid):
         soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
 
         metadata = soup.select('#main h2.heading a')
@@ -55,3 +57,43 @@ class ArchiveOfOurOwn(Site):
             landmark.decompose()
 
         return content.prettify()
+
+
+@register
+class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
+    """Fetch every work listed on an AO3 series page and flatten them into one story."""
+
+    @staticmethod
+    def matches(url):
+        # e.g. http://archiveofourown.org/series/<series id>
+        return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
+
+    def extract(self, url):
+        """Return a story dict with 'title', 'author' and 'chapters' for the series at url."""
+        seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)
+
+        soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
+
+        story = {
+            # .string keeps surrounding whitespace and is None if the heading
+            # contains nested tags; .text.strip() copes with both.
+            'title': soup.select('#main h2.heading')[0].text.strip(),
+            'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
+        }
+
+        chapters = []
+        for work in soup.select('#main ul.series li.work'):
+            # The list item's id attribute is expected to look like "work_<workid>".
+            workid = work.get('id').replace('work_', '')
+            substory = self._extract_work(workid)
+
+            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
+            # In the meantime, append the story title to the chapter titles.
+            chapters.extend((
+                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
+                for c in substory['chapters']
+            ))
+
+        story['chapters'] = chapters
+
+        return story