Optimize AO3: use full_work URL

2025-12-06 16:33:16 +01:00 · 2019-05-25 15:31:39 -05:00 · 2019-05-25 15:31:39 -05:00 · 40b4856a14
commit 40b4856a14
parent 617ee5ebfd
2 changed files with 17 additions and 12 deletions
--- a/sites/init.py
+++ b/sites/init.py
@ -34,6 +34,7 @@ class Section:
    id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))
+    summary = attr.ib(default='')

    def __iter__(self):
        return self.contents.__iter__()
--- a/sites/ao3.py
+++ b/sites/ao3.py
@ -48,33 +48,37 @@ class ArchiveOfOurOwn(Site):
        return self._extract_work(workid)

    def _extract_work(self, workid):
-        nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
-        soup = self._soup(nav_url)
+        # Fetch the full work
+        url = 'http://archiveofourown.org/works/{}?view_adult=true&view_full_work=true'.format(workid)
+        logger.info("Extracting full work @ %s", url)
+        soup = self._soup(url)

-        metadata = soup.select('#main h2.heading a')
        story = Section(
-            title=metadata[0].text.strip(),
-            author=metadata[1].text.strip(),
+            title=soup.select('#workskin > .preface .title')[0].text.strip(),
+            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
+            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
            url='http://archiveofourown.org/works/{}'.format(workid)
        )

-        for chapter in soup.select('#main ol[role="navigation"] li'):
+        # Fetch the chapter list as well because it contains info that's not in the full work
+        nav_soup = self._soup('https://archiveofourown.org/works/{}/navigate'.format(workid))
+
+        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
            link = chapter.find('a')
-            chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
-            chapter_url += '?view_adult=true'
+            logger.info("Extracting chapter %s", link.string)

            updated = datetime.datetime.strptime(
                chapter.find('span', class_='datetime').string,
                "(%Y-%m-%d)"
            )

-            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
+            contents = self._chapter(soup.find(id='chapter-{}'.format(index+1)))
+
+            story.add(Chapter(title=link.string, contents=contents, date=updated))

        return story

-    def _chapter(self, url):
-        logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+    def _chapter(self, soup):
        content = soup.find('div', role='article')

        for landmark in content.find_all(class_='landmark'):