mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
The epub-builder still downloads the image, but all the html-mangling is done in the extraction process now. Turns footnotes into a chapter-object, for easier processing later on.
73 lines
2.1 KiB
Python
73 lines
2.1 KiB
Python
#!/usr/bin/python
|
|
|
|
import logging
|
|
import datetime
|
|
import re
|
|
from . import register, Site, SiteException, Section, Chapter
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@register
class Stash(Site):
    """Site handler for sta.sh stack pages.

    A stack page supplies the story title, the author name and a list of
    thumbnail links; every linked page is fetched and becomes one Chapter
    of the resulting Section.
    """

    @staticmethod
    def matches(url):
        """Return the canonical stack URL for ``url``, or None if it is not one.

        A stack URL is ``http(s)://sta.sh/2<id>``. Only the first path
        segment is captured, so anything after the stack id (a trailing
        slash, deeper path, query string or fragment) is dropped and the
        result always ends in exactly one ``/``.
        """
        # Need a stack page. The id is matched with a negated character
        # class rather than a greedy ``.+`` so the capture cannot swallow a
        # trailing slash (which previously produced ``...//``).
        match = re.match(r'^(https?://sta\.sh/2[^/?#]+)', url)
        if match:
            return match.group(1) + '/'
        return None

    @staticmethod
    def _strip_possessive(name):
        """Remove a single trailing possessive marker from ``name``.

        Handles both ``"Foo's"`` and a bare trailing apostrophe
        (``"James'"``). Unlike ``str.rstrip("'s")`` this never eats
        letters that belong to the name itself (``"Chris's"`` stays
        ``"Chris"`` rather than becoming ``"Chri"``).
        """
        if name.endswith("'s"):
            return name[:-2]
        if name.endswith("'"):
            return name[:-1]
        return name

    def extract(self, url):
        """Build a Section for the stack at ``url``.

        Returns None when the page has no ``#stash-body`` element or the
        stack contains no thumbnails. A chapter that fails to download or
        parse is logged and skipped so one bad entry does not abort the
        whole story.
        """
        soup, _ = self._soup(url)
        content = soup.find(id="stash-body")
        if not content:
            return None

        # The logo text carries the author in possessive form; keep only
        # the name itself.
        owner = str(soup.find('span', class_="oh-stashlogo-name").string)
        story = Section(
            title=str(soup.find(class_="stash-folder-name").h2.string),
            author=self._strip_possessive(owner),
            url=url
        )

        thumbs = content.select(".stash-folder-stream .thumb")
        if not thumbs:
            return None

        for thumb in thumbs:
            try:
                # '#' marks a thumbnail that does not link to a page.
                if thumb['href'] != '#':
                    story.add(self._chapter(thumb['href']))
            except Exception:
                # Deliberate best-effort: record the failure, keep going.
                logger.exception("Couldn't extract chapters from thumbs")

        self._finalize(story)

        return story

    def _chapter(self, url):
        """Fetch the page at ``url`` and return it as a Chapter.

        Raises:
            SiteException: if the page has no ``journal-wrapper`` element,
                or if stripping attributes from the chapter text fails.
        """
        logger.info("Fetching chapter @ %s", url)
        soup, base = self._soup(url)

        content = soup.find(class_="journal-wrapper")
        if not content:
            raise SiteException("No content")

        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)

        text = content.find(class_="text")

        # clean up some invalid xhtml attributes
        # TODO: be more selective about this somehow
        try:
            for tag in text.find_all(True):
                # Reset to an empty dict (not None) so later attribute
                # lookups on these tags keep working.
                tag.attrs = {}
        except Exception as e:
            raise SiteException("Trouble cleaning attributes", e) from e

        self._clean(text, base)

        return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

    def _date(self, soup):
        """Return the timestamp of a chapter page as a datetime.

        Reads the ``ts`` attribute of the first ``span`` that has one
        inside the ``div.dev-metainfo-details`` element, interprets it as
        an integer POSIX timestamp and converts it with
        ``datetime.fromtimestamp`` (naive, local time).

        Raises AttributeError/TypeError when either element is missing;
        ``extract`` catches that and skips the chapter.
        """
        details = soup.find('div', class_="dev-metainfo-details")
        maybe_date = details.find('span', ts=True)
        return datetime.datetime.fromtimestamp(int(maybe_date['ts']))
|