1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 16:33:16 +01:00

Optimize AO3: use full_work URL

This commit is contained in:
David Lynch 2019-05-25 15:31:39 -05:00
parent 617ee5ebfd
commit 40b4856a14
2 changed files with 17 additions and 12 deletions

View file

@@ -34,6 +34,7 @@ class Section:
id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str) id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
contents = attr.ib(default=attr.Factory(list)) contents = attr.ib(default=attr.Factory(list))
footnotes = attr.ib(default=attr.Factory(list)) footnotes = attr.ib(default=attr.Factory(list))
summary = attr.ib(default='')
def __iter__(self): def __iter__(self):
return self.contents.__iter__() return self.contents.__iter__()

View file

@@ -48,33 +48,37 @@ class ArchiveOfOurOwn(Site):
return self._extract_work(workid) return self._extract_work(workid)
def _extract_work(self, workid): def _extract_work(self, workid):
nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid) # Fetch the full work
soup = self._soup(nav_url) url = 'http://archiveofourown.org/works/{}?view_adult=true&view_full_work=true'.format(workid)
logger.info("Extracting full work @ %s", url)
soup = self._soup(url)
metadata = soup.select('#main h2.heading a')
story = Section( story = Section(
title=metadata[0].text.strip(), title=soup.select('#workskin > .preface .title')[0].text.strip(),
author=metadata[1].text.strip(), author=soup.select('#workskin .preface .byline a')[0].text.strip(),
summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
url='http://archiveofourown.org/works/{}'.format(workid) url='http://archiveofourown.org/works/{}'.format(workid)
) )
for chapter in soup.select('#main ol[role="navigation"] li'): # Fetch the chapter list as well because it contains info that's not in the full work
nav_soup = self._soup('https://archiveofourown.org/works/{}/navigate'.format(workid))
for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
link = chapter.find('a') link = chapter.find('a')
chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href'))) logger.info("Extracting chapter %s", link.string)
chapter_url += '?view_adult=true'
updated = datetime.datetime.strptime( updated = datetime.datetime.strptime(
chapter.find('span', class_='datetime').string, chapter.find('span', class_='datetime').string,
"(%Y-%m-%d)" "(%Y-%m-%d)"
) )
story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated)) contents = self._chapter(soup.find(id='chapter-{}'.format(index+1)))
story.add(Chapter(title=link.string, contents=contents, date=updated))
return story return story
def _chapter(self, url): def _chapter(self, soup):
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.find('div', role='article') content = soup.find('div', role='article')
for landmark in content.find_all(class_='landmark'): for landmark in content.find_all(class_='landmark'):