1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00
leech/sites/xenforo.py
2018-04-13 17:52:37 -04:00

226 lines
7.7 KiB
Python

#!/usr/bin/python
import datetime
import re
import logging
from . import register, Site, SiteException, Section, Chapter
logger = logging.getLogger(__name__)
class XenForo(Site):
"""XenForo is forum software that powers a number of fiction-related forums."""
domain = False
@staticmethod
def get_default_options():
return {
'offset': None,
'limit': None,
'skip_spoilers': True,
'include_index': False,
}
@classmethod
def matches(cls, url):
match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
if match:
return match.group(1) + '/'
def login(self, login_details):
# Todo: handle non-https?
post = {
'login': login_details[0],
'password': login_details[1],
}
self.session.post('https://%s/login/login' % self.domain, data=post)
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
soup = self._soup(url)
base = soup.head.base.get('href')
story = Section(
title=soup.select('div.titleBar > h1')[0].get_text(),
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
url=url
)
marks = [
mark for mark in self._chapter_list(url)
if '/members' not in mark.get('href') and '/threadmarks' not in mark.get('href')
]
marks = marks[self.options['offset']:self.options['limit']]
for idx, mark in enumerate(marks, 1):
href = mark.get('href')
if not href.startswith('http'):
href = base + href
title = str(mark.string).strip()
logger.info("Fetching chapter \"%s\" @ %s", title, href)
chapter = Chapter(title=title, contents="")
contents, post_date = self._chapter(href, idx)
chapter.contents = contents
chapter.date = post_date
story.add(chapter)
story.footnotes = self.footnotes
self.footnotes = []
return story
def _chapter_list(self, url):
try:
return self._chapter_list_threadmarks(url)
except SiteException as e:
logger.debug("Tried threadmarks (%r)", e.args)
return self._chapter_list_index(url)
def _chapter_list_threadmarks(self, url):
soup = self._soup(url)
threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
if not threadmarks_link:
try:
threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
except IndexError:
pass
if not threadmarks_link:
raise SiteException("No threadmarks")
href = threadmarks_link.get('href')
base = soup.head.base.get('href')
soup = self._soup(base + href)
marks = soup.select('.threadmarks li.primaryContent.threadmarkListItem a, .threadmarks li.primaryContent.threadmarkItem a')
if not marks:
raise SiteException("No marks on threadmarks page")
return marks
def _chapter_list_index(self, url):
post = self._post_from_url(url)
if not post:
raise SiteException("Unparseable post URL", url)
links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
if not links:
raise SiteException("No links in index?")
if self.options['include_index']:
fake_link = self._new_tag('a', href=url)
fake_link.string = "Index"
links.insert(0, fake_link)
return links
def _chapter(self, url, chapterid):
post = self._post_from_url(url)
return self._clean_chapter(post, chapterid), self._post_date(post)
def _post_from_url(self, url):
# URLs refer to specific posts, so get just that one
# if no specific post referred to, get the first one
match = re.search(r'posts/(\d+)/?', url)
if not match:
match = re.match(r'.+#post-(\d+)$', url)
# could still be nothing here
postid = match and match.group(1)
if postid:
# create a proper post-url, because threadmarks can sometimes
# mess up page-wise with anchors
url = 'https://%s/posts/%s/' % (self.domain, postid)
soup = self._soup(url, 'html5lib')
if postid:
return soup.find('li', id='post-' + postid)
# just the first one in the thread, then
return soup.find('li', class_='message')
def _clean_chapter(self, post, chapterid):
post = post.find('blockquote', class_='messageText')
post.name = 'div'
# mostly, we want to remove colors because the Kindle is terrible at them
# TODO: find a way to denote colors, because it can be relevant
# TODO: at least invisitext, because outside of silly DC Lantern stuff, it's the most common
for tag in post.find_all(style=True):
if tag['style'] == 'color: transparent' and tag.text == 'TAB':
# Some stories fake paragraph indents like this. The output
# stylesheet will handle this just fine.
tag.decompose()
else:
# There's a few things which xenforo does as styles, despite there being perfectly good tags
# TODO: more robust CSS parsing? This is very whitespace dependent, if nothing else.
if "font-family: 'Courier New'" in tag['style']:
tag.wrap(self._new_tag('code'))
if "text-decoration: strikethrough" in tag['style']:
tag.wrap(self._new_tag('strike'))
tag.unwrap()
for tag in post.find_all(class_='quoteExpand'):
tag.decompose()
# spoilers don't work well, so turn them into epub footnotes
for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
else:
if spoiler_title:
link = '[SPOILER: {}]'.format(spoiler_title.get_text())
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div')
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
return post.prettify()
def _post_date(self, post):
maybe_date = post.find(class_='DateTime')
if 'data-time' in maybe_date.attrs:
return datetime.datetime.fromtimestamp(int(maybe_date['data-time']))
if 'title' in maybe_date.attrs:
# title="Feb 24, 2015 at 1:17 PM"
return datetime.datetime.strptime(maybe_date['title'], "%b %d, %Y at %I:%M %p")
raise SiteException("No date", maybe_date)
class XenForoIndex(XenForo):
@classmethod
def matches(cls, url):
match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
if match:
return match.group(1) + '/'
def _chapter_list(self, url):
return self._chapter_list_index(url)
@register
class SpaceBattles(XenForo):
domain = 'forums.spacebattles.com'
@register
class SpaceBattlesIndex(SpaceBattles, XenForoIndex):
pass
@register
class SufficientVelocity(XenForo):
domain = 'forums.sufficientvelocity.com'
@register
class QuestionableQuesting(XenForo):
domain = 'forum.questionablequesting.com'
@register
class QuestionableQuestingIndex(QuestionableQuesting, XenForoIndex):
pass