leech/sites/ao3.py

#!/usr/bin/python

import logging
import datetime
import re

import requests_cache

from . import register, Site, Section, Chapter, SiteException

logger = logging.getLogger(__name__)


@register
class ArchiveOfOurOwn(Site):
    """Archive of Our Own: it has its own epub export, but the formatting is awful"""

    @staticmethod
    def matches(url):
        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
        match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/works/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'
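
    # Illustrative only: matches() normalizes any deep link into a work down
    # to the canonical work URL, which the regex captures and '/' terminates:
    #   matches('https://archiveofourown.org/works/5683105/chapters/13092007')
    #   -> 'https://archiveofourown.org/works/5683105/'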

    def login(self, login_details):
        # Disable the request cache for the whole exchange: we need a fresh
        # login page (and its cookies), and the POST shouldn't be cached either.
        with requests_cache.disabled():
            # Can't just pass this url to _soup because I need the cookies later
            login = self.session.get('https://archiveofourown.org/users/login')
            soup, nobase = self._soup(login.text)
            # _form_data pulls the form's input fields, action URL, and method
            post, action, method = self._form_data(soup.find(id='new_user'))
            post['user[login]'] = login_details[0]
            post['user[password]'] = login_details[1]
            # I feel the session *should* handle this cookies bit for me. But
            # it doesn't. And I don't know why.
            result = self.session.post(
                self._join_url(login.url, action),
                data=post, cookies=login.cookies
            )
        if result.ok:
            logger.info("Logged in as %s", login_details[0])
        else:
            logger.error("Failed to log in as %s", login_details[0])
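
    # As the indexing above implies, login_details is a (username, password)
    # pair; e.g. (with made-up credentials):
    #   site.login(('my_user', 'hunter2'))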

    def extract(self, url):
        workid = re.match(r'^https?://(?:www\.)?archiveofourown\.org/works/(\d+)/?.*', url).group(1)
        return self._extract_work(workid)

    def _extract_work(self, workid):
        # Fetch the full work
        url = f'https://archiveofourown.org/works/{workid}?view_adult=true&view_full_work=true'
        logger.info("Extracting full work @ %s", url)
        soup, base = self._soup(url)

        if not soup.find(id='workskin'):
            raise SiteException("Can't find the story text; you may need to log in or flush the cache")

        story = Section(
            title=soup.select('#workskin > .preface .title')[0].text.strip(),
            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
            url=f'https://archiveofourown.org/works/{workid}',
            tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')]
        )

        # Fetch the chapter list as well, since it carries the per-chapter
        # dates, which the full-work view doesn't include
        nav_soup, nav_base = self._soup(f'https://archiveofourown.org/works/{workid}/navigate')

        chapters = soup.select('#chapters > div')
        if len(chapters) == 1:
            # in a single-chapter story the #chapters div is actually the chapter
            chapters = [soup.find(id='chapters').parent]

        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
            link = chapter.find('a')
            logger.info("Extracting chapter %s", link.string)

            updated = datetime.datetime.strptime(
                chapter.find('span', class_='datetime').string,
                "(%Y-%m-%d)"
            )

            # Guard against the navigation page listing more chapters than the
            # full-work view actually contains (e.g. a stale cache)
            chapter_soup = chapters[index] if index < len(chapters) else None
            if not chapter_soup:
                logger.warning("Couldn't find chapter %s in full work", index + 1)
                continue

            story.add(Chapter(
                title=link.string,
                contents=self._chapter(chapter_soup, base),
                date=updated
            ))

        self._finalize(story)

        return story
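
    # A sketch of how this fits together (the driver shown here is an
    # assumption about the surrounding leech machinery, not code in this file):
    #   canonical = site.matches('https://archiveofourown.org/works/5683105')
    #   story = site.extract(canonical)  # -> Section containing Chapter objects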

    def _chapter(self, soup, base):
        content = soup.find('div', role='article')

        # Strip AO3's hidden "landmark" headings (e.g. "Chapter Text")
        for landmark in content.find_all(class_='landmark'):
            landmark.decompose()

        # TODO: Maybe these should be footnotes instead?
        notes = soup.select('#chapters .end.notes')
        if notes:
            notes = notes[0]
            for landmark in notes.find_all(class_='landmark'):
                landmark.decompose()

        self._clean(content, base)

        return content.prettify() + (notes.prettify() if notes else '')
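
    # Shape sketch of the return value (exact markup depends on AO3's
    # templates): the prettified article div, with the prettified end-notes
    # div appended when the chapter has one:
    #   '<div role="article">...</div>' + '<div class="end notes">...</div>'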


@register
class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
    _key = "ArchiveOfOurOwn"

    @staticmethod
    def matches(url):
        # e.g. http://archiveofourown.org/series/5683105/
        match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/series/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'

    def extract(self, url):
        seriesid = re.match(r'^https?://(?:www\.)?archiveofourown\.org/series/(\d+)/?.*', url).group(1)
        soup, base = self._soup(f'https://archiveofourown.org/series/{seriesid}?view_adult=true')

        story = Section(
            title=soup.select('#main h2.heading')[0].text.strip(),
            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string,
            url=f'https://archiveofourown.org/series/{seriesid}'
        )

        for work in soup.select('#main ul.series li.work'):
            workid = work.get('id').replace('work_', '')
            substory = self._extract_work(workid)
            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
            story.add(substory)

        return story
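
    # The result nests one Section per work inside the series Section
    # (illustrative):
    #   Section(series title)
    #     Section(work 1 title) -> Chapter, Chapter, ...
    #     Section(work 2 title) -> Chapter, ...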