1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00
leech/sites/stash.py
2017-02-23 15:03:23 -06:00

65 lines
1.9 KiB
Python

#!/usr/bin/python
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
@register
class Stash(Site):
    """Site handler for sta.sh "stack" pages.

    A stack is turned into a single Section; every thumbnail link found in
    the stack's folder stream is fetched and added to it as a Chapter.
    """

    @staticmethod
    def matches(url):
        """Return the canonical stack URL for *url*, or None if it is not one.

        Only URLs whose first path segment starts with ``2`` are accepted
        (the original author's marker for "a stack page").  Anything after
        that first segment -- further path components, query string,
        fragment -- is dropped and exactly one trailing slash is appended.
        """
        # Need a stack page.
        # Capture just the first path segment: a greedy ``.+`` would swallow
        # the rest of the URL (including an existing trailing slash), which
        # produced results such as ``https://sta.sh/2abc//``.
        match = re.match(r'^(https?://sta\.sh/2[^/?#]+)', url)
        if match:
            return match.group(1) + '/'
        return None

    def extract(self, url):
        """Build a Section for the stack at *url*.

        Returns None when the page has no ``#stash-body`` element or the
        stack contains no thumbnails.  Chapters that fail to download or
        parse are reported with ``print`` and skipped (best effort), so one
        broken item does not abort the whole stack.
        """
        # NOTE(review): self._soup is defined on Site; presumably it fetches
        # the URL and returns a parsed BeautifulSoup document -- confirm.
        soup = self._soup(url)
        content = soup.find(id="stash-body")
        if not content:
            return None

        title = str(soup.find(class_="stash-folder-name").h2.string)

        # The logo text is a possessive ("Name's").  Remove only that suffix;
        # str.rstrip("'s") strips a *set of characters* and would also eat
        # legitimate trailing "s" letters (e.g. "James's" -> "Jame").
        author = str(soup.find('span', class_="oh-stashlogo-name").string)
        if author.endswith("'s"):
            author = author[:-2]
        elif author.endswith("'"):
            author = author[:-1]

        story = Section(title=title, author=author)

        thumbs = content.select(".stash-folder-stream .thumb")
        if not thumbs:
            return None

        for thumb in thumbs:
            try:
                href = thumb['href']
                # '#' is a placeholder link, not a real item.  Compare by
                # value: identity ("is not") on a str literal is unreliable.
                if href != '#':
                    story.add(self._chapter(href))
            except Exception as e:
                # Deliberate best effort: report and move on to the next item.
                print(e)

        return story

    def _chapter(self, url):
        """Fetch a single stack item at *url* and return it as a Chapter.

        Raises:
            SiteException: if the page has no ``journal-wrapper`` element,
                or if the chapter text could not be cleaned (for example
                because no ``text`` element was found).
        """
        print("Extracting chapter from", url)
        soup = self._soup(url)

        content = soup.find(class_="journal-wrapper")
        if not content:
            raise SiteException("No content")

        title = str(content.find(class_="gr-top").find(class_='metadata').h2.a.string)
        text = content.find(class_="text")

        # clean up some invalid xhtml attributes
        # TODO: be more selective about this somehow
        try:
            for tag in text.find_all(True):
                # Tag.attrs is a dict; keep it one.  Assigning None breaks
                # later attribute access and serialisation of the tag.
                tag.attrs = {}
        except Exception as e:
            raise SiteException("Trouble cleaning attributes", e) from e

        return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

    def _date(self, soup):
        """Return the item's timestamp as a naive, local-time datetime.

        Reads the ``ts`` attribute (seconds since the epoch) of the first
        ``<span ts=...>`` inside ``div.dev-metainfo-details``.  If either
        element is missing the lookup raises (AttributeError / TypeError);
        extract() treats that like any other per-chapter failure.
        """
        maybe_date = soup.find('div', class_="dev-metainfo-details").find('span', ts=True)
        return datetime.datetime.fromtimestamp(int(maybe_date['ts']))