#!/usr/bin/python import datetime import re import logging import urllib from bs4 import BeautifulSoup from . import register, Site, SiteException, SiteSpecificOption, Section, Chapter logger = logging.getLogger(__name__) class XenForo(Site): """XenForo is forum software that powers a number of fiction-related forums.""" domain = False @staticmethod def get_site_specific_option_defs(): return [ SiteSpecificOption( 'include_index', '--include-index/--no-include-index', default=False, help="If true, the post marked as an index will be included as a chapter." ), SiteSpecificOption( 'skip_spoilers', '--skip-spoilers/--include-spoilers', default=True, help="If true, do not transcribe any tags that are marked as a spoiler." ), SiteSpecificOption( 'offset', '--offset', type=int, help="The chapter index to start in the chapter marks." ), SiteSpecificOption( 'limit', '--limit', type=int, help="The chapter to end at at in the chapter marks." ), ] @classmethod def matches(cls, url): match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url) if match: return match.group(1) + '/' def login(self, login_details): # Todo: handle non-https? post = { 'login': login_details[0], 'password': login_details[1], } self.session.post('https://%s/login/login' % self.domain, data=post) logger.info("Logged in as %s", login_details[0]) def extract(self, url): soup = self._soup(url) base = soup.head.base.get('href') story = Section( title=soup.select('div.titleBar > h1')[0].get_text(), author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(), url=url ) if url.endswith('/reader'): reader_url = url elif soup.find('a', class_='readerToggle'): reader_url = soup.find('a', class_='readerToggle').get('href') if reader_url: idx = 0 while reader_url: reader_url = self._join_url(base, reader_url) logger.info("Fetching chapters @ %s", reader_url) reader_soup = self._soup(reader_url) posts = reader_soup.select('#messageList > li.hasThreadmark') for post in posts: idx = idx + 1 if self.options['offset'] and idx < self.options['offset']: continue if self.options['limit'] and idx >= self.options['limit']: continue # Get the title, removing "Threadmark:" which precedes it title = ''.join(post.select('div.threadmarker > span.label')[0].findAll(text=True, recursive=False)).strip() logger.info("Extracting chapter \"%s\"", title) story.add(Chapter( title=title, contents=self._clean_chapter(post, len(story) + 1), date=self._post_date(post) )) reader_url = False page_nav = reader_soup.find('div', class_='PageNav') if page_nav: # e.g.