From 40b4856a14e10eeec98598d14bac2eb01aaa19eb Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Sat, 25 May 2019 15:31:39 -0500
Subject: [PATCH] Optimize AO3: use full_work URL

---
 sites/__init__.py |  1 +
 sites/ao3.py      | 28 ++++++++++++++++------------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/sites/__init__.py b/sites/__init__.py
index 70c5339..e92adde 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -34,6 +34,7 @@ class Section:
     id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
     contents = attr.ib(default=attr.Factory(list))
     footnotes = attr.ib(default=attr.Factory(list))
+    summary = attr.ib(default='')
 
     def __iter__(self):
         return self.contents.__iter__()
diff --git a/sites/ao3.py b/sites/ao3.py
index 3f65fc8..ec9f06f 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -48,33 +48,37 @@ class ArchiveOfOurOwn(Site):
         return self._extract_work(workid)
 
     def _extract_work(self, workid):
-        nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
-        soup = self._soup(nav_url)
+        # Fetch the full work
+        url = 'http://archiveofourown.org/works/{}?view_adult=true&view_full_work=true'.format(workid)
+        logger.info("Extracting full work @ %s", url)
+        soup = self._soup(url)
 
-        metadata = soup.select('#main h2.heading a')
         story = Section(
-            title=metadata[0].text.strip(),
-            author=metadata[1].text.strip(),
+            title=soup.select('#workskin > .preface .title')[0].text.strip(),
+            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
+            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
             url='http://archiveofourown.org/works/{}'.format(workid)
         )
 
-        for chapter in soup.select('#main ol[role="navigation"] li'):
+        # Fetch the chapter list as well because it contains info that's not in the full work
+        nav_soup = self._soup('https://archiveofourown.org/works/{}/navigate'.format(workid))
+
+        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
             link = chapter.find('a')
-            chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
-            chapter_url += '?view_adult=true'
+            logger.info("Extracting chapter %s", link.string)
 
             updated = datetime.datetime.strptime(
                 chapter.find('span', class_='datetime').string,
                 "(%Y-%m-%d)"
             )
 
-            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
+            contents = self._chapter(soup.find(id='chapter-{}'.format(index+1)))
+
+            story.add(Chapter(title=link.string, contents=contents, date=updated))
 
         return story
 
-    def _chapter(self, url):
-        logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+    def _chapter(self, soup):
         content = soup.find('div', role='article')
 
         for landmark in content.find_all(class_='landmark'):
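
Note (not part of the patch): below is a minimal standalone sketch of the single-request approach the diff switches to, assuming the requests and beautifulsoup4 packages are available. The fetch_full_work helper name is hypothetical; the URL parameters and CSS selectors are copied from the diff above and may need adjusting if AO3's markup changes.

import requests
from bs4 import BeautifulSoup

def fetch_full_work(workid):
    # One request returns every chapter, instead of one request per chapter page.
    url = ('http://archiveofourown.org/works/{}'
           '?view_adult=true&view_full_work=true').format(workid)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    title = soup.select('#workskin .preface .title')[0].text.strip()

    # On the full-work page each chapter is addressable as #chapter-1, #chapter-2, ...
    chapters = []
    index = 1
    while True:
        chapter = soup.find(id='chapter-{}'.format(index))
        if chapter is None:
            break
        chapters.append(chapter.find('div', role='article'))
        index += 1

    return title, chapters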