mirror of
https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Optimize AO3: use full_work URL
This commit is contained in:
parent
617ee5ebfd
commit
40b4856a14
2 changed files with 17 additions and 12 deletions
|
|
@ -34,6 +34,7 @@ class Section:
|
|||
id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
|
||||
contents = attr.ib(default=attr.Factory(list))
|
||||
footnotes = attr.ib(default=attr.Factory(list))
|
||||
summary = attr.ib(default='')
|
||||
|
||||
def __iter__(self):
|
||||
return self.contents.__iter__()
|
||||
|
|
|
|||
28
sites/ao3.py
28
sites/ao3.py
|
|
@ -48,33 +48,37 @@ class ArchiveOfOurOwn(Site):
|
|||
return self._extract_work(workid)
|
||||
|
||||
def _extract_work(self, workid):
|
||||
nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
|
||||
soup = self._soup(nav_url)
|
||||
# Fetch the full work
|
||||
url = 'http://archiveofourown.org/works/{}?view_adult=true&view_full_work=true'.format(workid)
|
||||
logger.info("Extracting full work @ %s", url)
|
||||
soup = self._soup(url)
|
||||
|
||||
metadata = soup.select('#main h2.heading a')
|
||||
story = Section(
|
||||
title=metadata[0].text.strip(),
|
||||
author=metadata[1].text.strip(),
|
||||
title=soup.select('#workskin > .preface .title')[0].text.strip(),
|
||||
author=soup.select('#workskin .preface .byline a')[0].text.strip(),
|
||||
summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
|
||||
url='http://archiveofourown.org/works/{}'.format(workid)
|
||||
)
|
||||
|
||||
for chapter in soup.select('#main ol[role="navigation"] li'):
|
||||
# Fetch the chapter list as well because it contains info that's not in the full work
|
||||
nav_soup = self._soup('https://archiveofourown.org/works/{}/navigate'.format(workid))
|
||||
|
||||
for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
|
||||
link = chapter.find('a')
|
||||
chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
|
||||
chapter_url += '?view_adult=true'
|
||||
logger.info("Extracting chapter %s", link.string)
|
||||
|
||||
updated = datetime.datetime.strptime(
|
||||
chapter.find('span', class_='datetime').string,
|
||||
"(%Y-%m-%d)"
|
||||
)
|
||||
|
||||
story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
|
||||
contents = self._chapter(soup.find(id='chapter-{}'.format(index+1)))
|
||||
|
||||
story.add(Chapter(title=link.string, contents=contents, date=updated))
|
||||
|
||||
return story
|
||||
|
||||
def _chapter(self, url):
|
||||
logger.info("Extracting chapter @ %s", url)
|
||||
soup = self._soup(url)
|
||||
def _chapter(self, soup):
|
||||
content = soup.find('div', role='article')
|
||||
|
||||
for landmark in content.find_all(class_='landmark'):
|
||||
|
|
|
|||
Loading…
Reference in a new issue