diff --git a/sites/xenforo.py b/sites/xenforo.py index 39b8bae..4a2b86d 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -3,6 +3,7 @@ import datetime import re import logging +import urllib from bs4 import BeautifulSoup from . import register, Site, SiteException, SiteSpecificOption, Section, Chapter @@ -70,23 +71,57 @@ class XenForo(Site): url=url ) - marks = [ - mark for mark in self._chapter_list(url) - if '/members' not in mark.get('href') and '/threadmarks' not in mark.get('href') - ] - marks = marks[self.options['offset']:self.options['limit']] + if url.endswith('/reader'): + reader_url = url + elif soup.find('a', class_='readerToggle'): + reader_url = soup.find('a', class_='readerToggle').get('href') - for idx, mark in enumerate(marks, 1): - href = mark.get('href') - if not href.startswith('http'): - href = base + href - title = str(mark.string).strip() - logger.info("Fetching chapter \"%s\" @ %s", title, href) - chapter = Chapter(title=title, contents="") - contents, post_date = self._chapter(href, idx) - chapter.contents = contents - chapter.date = post_date - story.add(chapter) + if reader_url: + idx = 0 + while reader_url: + reader_url = self._join_url(base, reader_url) + logger.info("Fetching chapters @ %s", reader_url) + reader_soup = self._soup(reader_url) + posts = reader_soup.select('#messageList > li.hasThreadmark') + + for post in posts: + idx = idx + 1 + if self.options['offset'] and idx < self.options['offset']: + continue + if self.options['limit'] and idx >= self.options['limit']: + continue + # Get the title, removing "Threadmark:" which precedes it + title = ''.join(post.select('div.threadmarker > span.label')[0].findAll(text=True, recursive=False)).strip() + logger.info("Extracting chapter \"%s\"", title) + + story.add(Chapter( + title=title, + contents=self._clean_chapter(post, len(story) + 1), + date=self._post_date(post) + )) + + reader_url = False + page_nav = reader_soup.find('div', class_='PageNav') + if page_nav: + # e.g.