mirror of
https://github.com/kemayo/leech
synced 2026-04-11 15:12:56 +02:00
XenForo: handle SV's XenForo2 changes
This commit is contained in:
parent
b1b51bdc8f
commit
3443304ab1
2 changed files with 105 additions and 27 deletions
|
|
@ -63,22 +63,19 @@ class XenForo(Site):
|
|||
def extract(self, url):
|
||||
soup = self._soup(url)
|
||||
|
||||
base = soup.head.base.get('href')
|
||||
base = soup.head.base and soup.head.base.get('href') or url
|
||||
|
||||
title = soup.select('div.titleBar > h1')[0]
|
||||
# clean out informational bits from the title
|
||||
for tag in title.find_all(class_='prefix'):
|
||||
tag.decompose()
|
||||
story = Section(
|
||||
title=title.get_text().strip(),
|
||||
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
|
||||
url=url
|
||||
)
|
||||
story = self._base_story(soup)
|
||||
|
||||
if url.endswith('/reader'):
|
||||
reader_url = url
|
||||
elif soup.find('a', class_='readerToggle'):
|
||||
reader_url = soup.find('a', class_='readerToggle').get('href')
|
||||
elif soup.find('div', class_='threadmarks-reader'):
|
||||
# Technically this is the xenforo2 bit, but :shrug:
|
||||
reader_url = soup.find('div', class_='threadmarks-reader').find('a').get('href')
|
||||
else:
|
||||
reader_url = False
|
||||
|
||||
if reader_url:
|
||||
idx = 0
|
||||
|
|
@ -86,7 +83,7 @@ class XenForo(Site):
|
|||
reader_url = self._join_url(base, reader_url)
|
||||
logger.info("Fetching chapters @ %s", reader_url)
|
||||
reader_soup = self._soup(reader_url)
|
||||
posts = reader_soup.select('#messageList > li.hasThreadmark')
|
||||
posts = self._posts_from_page(reader_soup)
|
||||
|
||||
for post in posts:
|
||||
idx = idx + 1
|
||||
|
|
@ -94,8 +91,7 @@ class XenForo(Site):
|
|||
continue
|
||||
if self.options['limit'] and idx >= self.options['limit']:
|
||||
continue
|
||||
# Get the title, removing "<strong>Threadmark:</strong>" which precedes it
|
||||
title = ''.join(post.select('div.threadmarker > span.label')[0].findAll(text=True, recursive=False)).strip()
|
||||
title = self._threadmark_title(post)
|
||||
logger.info("Extracting chapter \"%s\"", title)
|
||||
|
||||
story.add(Chapter(
|
||||
|
|
@ -105,11 +101,8 @@ class XenForo(Site):
|
|||
))
|
||||
|
||||
reader_url = False
|
||||
page_nav = reader_soup.find('div', class_='PageNav')
|
||||
if page_nav:
|
||||
# e.g. <div class="PageNav" data-page="1" data-range="2" data-start="2" data-end="6" data-last="11" data-sentinel="{{sentinel}}" data-baseurl="threads/the-cycle-of-deicide-quest-post-canon-worm-wot-cross.376535/reader?page=%7B%7Bsentinel%7D%7D">
|
||||
if int(page_nav.get('data-page')) < int(page_nav.get('data-last')):
|
||||
reader_url = urllib.parse.unquote(page_nav.get('data-baseurl')).replace(page_nav.get('data-sentinel'), str(int(page_nav.get('data-page')) + 1))
|
||||
if reader_soup.find('link', rel='next'):
|
||||
reader_url = reader_soup.find('link', rel='next').get('href')
|
||||
else:
|
||||
# TODO: Research whether reader mode is guaranteed to be enabled
|
||||
# when threadmarks are; if so, can delete this branch.
|
||||
|
|
@ -132,6 +125,27 @@ class XenForo(Site):
|
|||
|
||||
return story
|
||||
|
||||
def _base_story(self, soup):
    """Build the top-level story Section from a parsed thread page.

    The URL comes from the ``og:url`` meta tag, the title from the
    ``div.titleBar > h1`` heading (with any ``prefix`` elements removed)
    and the author from the ``username`` link in ``p#pageDescription``.
    """
    canonical_url = soup.find('meta', property='og:url').get('content')
    heading = soup.select('div.titleBar > h1')[0]
    # clean out informational bits from the title
    for prefix in heading.find_all(class_='prefix'):
        prefix.decompose()
    story_title = heading.get_text().strip()
    byline = soup.find('p', id='pageDescription')
    author_name = byline.find('a', class_='username').get_text()
    return Section(title=story_title, author=author_name, url=canonical_url)
|
||||
|
||||
def _posts_from_page(self, soup, postid=False):
|
||||
if postid:
|
||||
return soup.find('li', id='post-' + postid)
|
||||
return soup.select('#messageList > li.hasThreadmark')
|
||||
|
||||
def _threadmark_title(self, post):
|
||||
# Get the title, removing "<strong>Threadmark:</strong>" which precedes it
|
||||
return ''.join(post.select('div.threadmarker > span.label')[0].findAll(text=True, recursive=False)).strip()
|
||||
|
||||
def _chapter_list(self, url):
|
||||
try:
|
||||
return self._chapter_list_threadmarks(url)
|
||||
|
|
@ -217,13 +231,16 @@ class XenForo(Site):
|
|||
soup = self._soup(url, 'html5lib')
|
||||
|
||||
if postid:
|
||||
return soup.find('li', id='post-' + postid)
|
||||
return self._posts_from_page(soup, postid)
|
||||
|
||||
# just the first one in the thread, then
|
||||
return soup.find('li', class_='message')
|
||||
|
||||
def _chapter_contents(self, post):
|
||||
return post.find('blockquote', class_='messageText')
|
||||
|
||||
def _clean_chapter(self, post, chapterid):
|
||||
post = post.find('blockquote', class_='messageText')
|
||||
post = self._chapter_contents(post)
|
||||
post.name = 'div'
|
||||
# mostly, we want to remove colors because the Kindle is terrible at them
|
||||
# TODO: find a way to denote colors, because it can be relevant
|
||||
|
|
@ -243,8 +260,12 @@ class XenForo(Site):
|
|||
tag.unwrap()
|
||||
for tag in post.find_all(class_='quoteExpand'):
|
||||
tag.decompose()
|
||||
self._clean_spoilers(post, chapterid)
|
||||
return post.prettify()
|
||||
|
||||
def _clean_spoilers(self, post, chapterid):
|
||||
# spoilers don't work well, so turn them into epub footnotes
|
||||
for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
|
||||
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
|
||||
spoiler_title = spoiler.find(class_='SpoilerTitle')
|
||||
if self.options['skip_spoilers']:
|
||||
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
|
||||
|
|
@ -258,7 +279,6 @@ class XenForo(Site):
|
|||
new_spoiler = self._new_tag('div')
|
||||
new_spoiler.append(link)
|
||||
spoiler.replace_with(new_spoiler)
|
||||
return post.prettify()
|
||||
|
||||
def _post_date(self, post):
|
||||
maybe_date = post.find(class_='DateTime')
|
||||
|
|
@ -291,11 +311,6 @@ class SpaceBattlesIndex(SpaceBattles, XenForoIndex):
|
|||
_key = "SpaceBattles"
|
||||
|
||||
|
||||
@register
class SufficientVelocity(XenForo):
    """XenForo site definition for forums.sufficientvelocity.com."""

    domain = 'forums.sufficientvelocity.com'
|
||||
|
||||
|
||||
@register
class QuestionableQuesting(XenForo):
    """XenForo site definition for forum.questionablequesting.com."""

    domain = 'forum.questionablequesting.com'
|
||||
|
|
|
|||
63
sites/xenforo2.py
Normal file
63
sites/xenforo2.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import datetime
import logging
import re

from . import register, Section, SiteException
from .xenforo import XenForo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class XenForo2(XenForo):
    """XenForo scraper variant that reads the XenForo 2 page markup.

    Overrides the selector-dependent hooks of ``XenForo``: story header,
    post lookup, threadmark title, post body, spoiler handling and post
    date.
    """

    def _base_story(self, soup):
        """Build the story Section from a parsed thread page.

        The URL comes from the ``og:url`` meta tag, the title from
        ``h1.p-title-value`` and the author from the ``username`` link
        inside ``div.p-description``.
        """
        url = soup.find('meta', property='og:url').get('content')
        title = soup.select('h1.p-title-value')[0]
        # clean out informational bits from the title
        for tag in title.find_all(class_='prefix'):
            tag.decompose()
        return Section(
            title=title.get_text().strip(),
            author=soup.find('div', class_='p-description').find('a', class_='username').get_text(),
            url=url
        )

    def _posts_from_page(self, soup, postid=False):
        """Return one post (when ``postid`` is truthy) or all posts on the page.

        A specific post is looked up as ``<article id="js-post-...">``;
        otherwise every ``article.message--post`` element is selected.
        """
        if postid:
            return soup.find('article', id='js-post-' + postid)
        return soup.select('article.message--post')

    def _threadmark_title(self, post):
        """Return the text of the post's ``span.threadmarkLabel``.

        The text is stripped of surrounding whitespace, matching what
        ``XenForo._threadmark_title`` does for the older markup.
        """
        return post.find('span', class_='threadmarkLabel').get_text().strip()

    def _chapter_contents(self, post):
        """Return the ``div.message-userContent`` element of a post."""
        return post.find('div', class_='message-userContent')

    def _clean_spoilers(self, post, chapterid):
        """Replace every ``bbCodeSpoiler`` element in ``post`` in place.

        Each spoiler becomes a ``div`` holding either a footnote link to
        the extracted spoiler content or a plain "[SPOILER]" marker.
        """
        # spoilers don't work well, so turn them into epub footnotes
        for spoiler in post.find_all(class_='bbCodeSpoiler'):
            spoiler_title = spoiler.find(class_='bbCodeSpoiler-button-title')
            # NOTE(review): the footnote branch runs when skip_spoilers is
            # set, mirroring XenForo._clean_spoilers -- confirm that this is
            # the intended meaning of the option.
            if self.options['skip_spoilers']:
                link = self._footnote(spoiler.find(class_='bbCodeBlock-content').extract(), chapterid)
                if spoiler_title:
                    link.string = spoiler_title.get_text()
            else:
                if spoiler_title:
                    link = '[SPOILER: {}]'.format(spoiler_title.get_text())
                else:
                    link = '[SPOILER]'
            new_spoiler = self._new_tag('div')
            new_spoiler.append(link)
            spoiler.replace_with(new_spoiler)

    def _post_date(self, post):
        """Return the post's date from its ``<time data-time="...">`` tag.

        ``data-time`` is parsed as an integer timestamp and converted with
        ``datetime.datetime.fromtimestamp``.

        Raises:
            SiteException: if the post has no ``<time>`` element or the
                element carries no ``data-time`` attribute.
        """
        # Look the tag up once; the failure path reports what was found
        # instead of referencing an undefined name.
        time_tag = post.find('time')
        timestamp = time_tag.get('data-time') if time_tag else None
        if timestamp is None:
            raise SiteException("No date", time_tag)
        return datetime.datetime.fromtimestamp(int(timestamp))
|
||||
|
||||
|
||||
@register
class SufficientVelocity(XenForo2):
    """XenForo 2 site definition for forums.sufficientvelocity.com."""

    domain = 'forums.sufficientvelocity.com'
|
||||
Loading…
Reference in a new issue