mirror of https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Optimize AO3: use full_work URL
parent 617ee5ebfd
commit 40b4856a14
2 changed files with 17 additions and 12 deletions
@@ -34,6 +34,7 @@ class Section:
     id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
     contents = attr.ib(default=attr.Factory(list))
     footnotes = attr.ib(default=attr.Factory(list))
+    summary = attr.ib(default='')
 
     def __iter__(self):
         return self.contents.__iter__()
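Note (illustration only, not part of the commit): a minimal, self-contained sketch of how an attrs class with the new summary field behaves. The class below is a stand-in that mirrors only the fields visible in the hunk above, not the project's full Section class, and it uses the current converter= keyword where the diff's older attrs code spells it convert=.

import uuid

import attr


def _default_uuid_string():
    # Stand-in for the project's helper of the same name: a random UUID as a string.
    return str(uuid.uuid4())


@attr.s
class Section:
    # Only the fields shown in the hunk above; `converter` is the current
    # spelling of the older `convert` keyword used in the diff.
    id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))
    summary = attr.ib(default='')  # the attribute this commit adds

    def __iter__(self):
        return self.contents.__iter__()


if __name__ == '__main__':
    story = Section(summary='<blockquote>Example summary.</blockquote>')
    story.contents.append('chapter text')
    print(story.id)        # auto-generated UUID string
    print(story.summary)   # the summary passed in; '' by default
    print(list(story))     # iterating a Section yields its contents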
sites/ao3.py (28 changed lines)
@@ -48,33 +48,37 @@ class ArchiveOfOurOwn(Site):
         return self._extract_work(workid)
 
     def _extract_work(self, workid):
-        nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)
-        soup = self._soup(nav_url)
+        # Fetch the full work
+        url = 'http://archiveofourown.org/works/{}?view_adult=true&view_full_work=true'.format(workid)
+        logger.info("Extracting full work @ %s", url)
+        soup = self._soup(url)
 
-        metadata = soup.select('#main h2.heading a')
         story = Section(
-            title=metadata[0].text.strip(),
-            author=metadata[1].text.strip(),
+            title=soup.select('#workskin > .preface .title')[0].text.strip(),
+            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
+            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
             url='http://archiveofourown.org/works/{}'.format(workid)
         )
 
-        for chapter in soup.select('#main ol[role="navigation"] li'):
+        # Fetch the chapter list as well because it contains info that's not in the full work
+        nav_soup = self._soup('https://archiveofourown.org/works/{}/navigate'.format(workid))
+
+        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
             link = chapter.find('a')
-            chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href')))
-            chapter_url += '?view_adult=true'
+            logger.info("Extracting chapter %s", link.string)
 
             updated = datetime.datetime.strptime(
                 chapter.find('span', class_='datetime').string,
                 "(%Y-%m-%d)"
             )
 
-            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
+            contents = self._chapter(soup.find(id='chapter-{}'.format(index+1)))
+
+            story.add(Chapter(title=link.string, contents=contents, date=updated))
 
         return story
 
-    def _chapter(self, url):
-        logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+    def _chapter(self, soup):
         content = soup.find('div', role='article')
 
         for landmark in content.find_all(class_='landmark'):
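Note (illustration only, not part of the commit): the sketch below restates the strategy this diff adopts, using plain requests and BeautifulSoup in place of the site's own _soup, Section, and Chapter plumbing. The one assumption beyond what the diff shows is probing chapter-1, chapter-2, ... until one is missing; the real code instead walks the /navigate chapter list, which (as the new comment says) carries info the full-work view lacks, such as the per-chapter dates.

import requests
from bs4 import BeautifulSoup


def fetch_full_work(workid):
    """Fetch an AO3 work in a single request and slice out its chapters."""
    url = ('http://archiveofourown.org/works/{}'
           '?view_adult=true&view_full_work=true').format(workid)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    # Work-level metadata now comes from the preface of the full-work page.
    title = soup.select('#workskin > .preface .title')[0].text.strip()
    author = soup.select('#workskin .preface .byline a')[0].text.strip()
    summary = soup.select('#workskin .preface .summary blockquote')[0].prettify()

    # Chapters in the full-work view are anchored as id="chapter-1", "chapter-2", ...
    chapters = []
    index = 1
    while True:
        chapter = soup.find(id='chapter-{}'.format(index))
        if chapter is None:
            break
        chapters.append(chapter)
        index += 1

    return {'title': title, 'author': author, 'summary': summary, 'chapters': chapters}

Compared with the old code, which requested every chapter URL found on the navigation page separately, this is one HTTP round trip for the text plus one for the chapter list, rather than one per chapter.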