diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py
index ac80450e..74315c59 100644
--- a/fanficfare/adapters/__init__.py
+++ b/fanficfare/adapters/__init__.py
@@ -110,6 +110,7 @@ from . import adapter_tgstorytimecom
 from . import adapter_itcouldhappennet
 from . import adapter_forumsspacebattlescom
 from . import adapter_forumssufficientvelocitycom
+from . import adapter_xf2testsufficientvelocitycom
 from . import adapter_forumquestionablequestingcom
 from . import adapter_ninelivesarchivecom
 from . import adapter_masseffect2in
diff --git a/fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py b/fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py
new file mode 100644
index 00000000..6610843f
--- /dev/null
+++ b/fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import re
+
+from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
+
+def getClass():
+    return XF2TestSufficientVelocityComAdapter
+
+class XF2TestSufficientVelocityComAdapter(BaseXenForo2ForumAdapter):
+
+    def __init__(self, config, url):
+        BaseXenForo2ForumAdapter.__init__(self, config, url)
+
+        # Each adapter needs to have a unique site abbreviation.
+        self.story.setMetadata('siteabbrev','fsv2')
+
+    @staticmethod # must be @staticmethod, don't remove it.
+    def getSiteDomain():
+        # The site domain.  Does have www here, if it uses it.
+        return 'xf2test.sufficientvelocity.com'
+
+    # @classmethod
+    # def getAcceptDomains(cls):
+    #     return [cls.getSiteDomain(),
+    #             cls.getSiteDomain().replace('forums.','forum.'),
+    #             cls.getSiteDomain().replace('forums.','')]
+
+    def getSiteURLPattern(self):
+        ## SV accepts forums.sufficientvelocity.com,
+        ## forum.sufficientvelocity.com and sufficientvelocity.com,
+        ## all of which redirect to forums.
+        ## We will use forums. as canonical for all.
+        return super(XF2TestSufficientVelocityComAdapter, self).getSiteURLPattern().replace(re.escape("forums."),r"(forums?\.)?")
diff --git a/fanficfare/adapters/base_xenforo2forum_adapter.py b/fanficfare/adapters/base_xenforo2forum_adapter.py
new file mode 100644
index 00000000..ee29abec
--- /dev/null
+++ b/fanficfare/adapters/base_xenforo2forum_adapter.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import logging
+from datetime import datetime
+import re
+from xml.dom.minidom import parseString
+
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+
+# py2 vs py3 transition
+from ..six import text_type as unicode
+from ..six.moves.urllib.error import HTTPError
+
+from .base_adapter import makeDate
+from .base_xenforoforum_adapter import BaseXenForoForumAdapter
+
+logger = logging.getLogger(__name__)
+
+class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
+
+    def __init__(self, config, url):
+        logger.info("init url: "+url)
+        BaseXenForoForumAdapter.__init__(self, config, url)
+
+    @classmethod
+    def getConfigSections(cls):
+        "Only needs to be overridden if the adapter has additional ini sections."
+        return ['base_xenforo2forum'] + super(BaseXenForo2ForumAdapter, cls).getConfigSections()
+
+    def performLogin(self):
+        params = {}
+
+        if self.password:
+            params['login'] = self.username
+            params['password'] = self.password
+        else:
+            params['login'] = self.getConfig("username")
+            params['password'] = self.getConfig("password")
+
+        if not params['login']:
+            raise exceptions.FailedToLogin(self.url,"No username given.  Set in personal.ini or enter when prompted.")
+
+        ## need a login token.
+        data = self._fetchUrl(self.getURLPrefix() + '/login',usecache=False)
+        # logger.debug(data)
+        #
+        find_token='
diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@@ ... @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
     def getSiteURLPattern(self):
-        return re.escape(self.getURLPrefix())+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#post-(?P<anchorpost>\d+))?$"
+        return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"
 
     def _fetchUrlOpened(self, url, parameters=None,
@@ -206,7 +206,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         ## https://forum.questionablequesting.com/login/login
         loginUrl = self.getURLPrefix() + '/login/login'
         logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
-                                                            params['login']))
+                                                             params['login']))
 
         d = self._fetchUrl(loginUrl, params)
 
@@ -220,6 +220,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
     def make_soup(self,data):
         soup = super(BaseXenForoForumAdapter, self).make_soup(data)
 
+        ## img class="lazyload"
+        ## include lazy load images.
+        for img in soup.find_all('img',{'class':'lazyload'}):
+            img['src'] = img['data-src']
+
         ## after lazy load images, there are noscript blocks also
         ## containing <img> tags.  The problem comes in when they hit
         ## book readers such as Kindle and Nook and then you see the
@@ -227,94 +232,142 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         for noscript in soup.find_all('noscript'):
             noscript.extract()
 
+        for qdiv in self.get_quote_expand_tag(soup):
+            qdiv.extract() # Remove <div class="...">click to expand</div>
+
+        self.convert_quotes(soup)
+
         self.handle_spoilers(soup)
 
         ## cache posts on page.
         self.cache_posts(soup)
         return soup
 
-    ## Moved over from adapter_forumquestionablequestingcom when SB/SV
-    ## threadmark.rss became 'most recent 10 in reverse order'.
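+    ## The get_*/parse_* methods added below are override hooks: the
+    ## XenForo2 subclass replaces just the markup lookups that changed
+    ## between XenForo 1 and 2, while the control flow stays here.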
+    def get_threadmarks_top(self,souptag):
+        return souptag.find('div',{'class':'threadmarkMenus'})
+
+    def get_threadmarks(self,navdiv):
+        return navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
+
+    def get_threadmark_catnumname(self,threadmarksa):
+        return (threadmarksa['href'].split('category_id=')[1],
+                stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'})))
+
     def extract_threadmarks(self,souptag):
         threadmarks=[]
         # try threadmarks if no '#' in url
-        navdiv = souptag.find('div',{'class':'threadmarkMenus'})
+        navdiv = self.get_threadmarks_top(souptag)
         if not navdiv:
             return threadmarks
-        # was class=threadmarksTrigger.  thread cats are currently
-        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
-        # be surprised if that changed.  Don't want to do use just
-        # href=re because there's more than one copy on the page; plus
-        # could be included in a post.  Would be easier if <noscript>s
-        # weren't being stripped, but that's a different issue.
-        threadmarksas = navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
-
-        ## Loop on threadmark categories.
-        tmcat_num=None
+        threadmarksas = self.get_threadmarks(navdiv)
+        threadmarkgroups = dict() # for ordering threadmarks
+        ## Loop on threadmark categories.
         for threadmarksa in threadmarksas:
-            tmcat_num = threadmarksa['href'].split('category_id=')[1]
-            # get from earlier <a> now.
-            tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
-            prepend = ""
+            (tmcat_num,tmcat_name) = self.get_threadmark_catnumname(threadmarksa)
 
             if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                 continue
 
             if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                 tmcat_name = 'Omake'
 
-            if tmcat_name != "Threadmarks":
-                prepend = tmcat_name+" - "
-
-            threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
-                                                      tmcat_name,
-                                                      tmcat_num))
+            if 'http' not in threadmarksa['href']:
+                href = self.getURLPrefix()+'/'+threadmarksa['href']
+            else:
+                href = threadmarksa['href']
+            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
+                                                                tmcat_name,
+                                                                tmcat_num)
+        ## Order of threadmark groups in new SV is changed and
+        ## possibly unpredictable.  Normalize.  Keep as configurable?
+        ## What about categories not in the list?
+        default_order = ['Threadmarks',
+                         'Sidestory',
+                         'Apocrypha',
+                         'Omake',
+                         'Media',
+                         'Informational',
+                         'Staff Post']
+        # The default order is also appended *after* the configured
+        # threadmark_category_order, so categories left out of the
+        # config (but not in skip_threadmarks_categories) still appear
+        # in the expected order.
+        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
+            if cat_name in threadmarkgroups:
+                threadmarks.extend(threadmarkgroups[cat_name])
+                del threadmarkgroups[cat_name]
+        # more categories left?  New, or at least unknown, categories.
+        if threadmarkgroups:
+            # alphabetize for lack of a better idea, to ensure consistent ordering
+            for cat_name in sorted(threadmarkgroups.keys()):
+                threadmarks.extend(threadmarkgroups[cat_name])
         return threadmarks
 
+    def get_threadmarks_list(self,soupmarks):
+        return soupmarks.find('div',{'class':'threadmarkList'})
+
+    def get_threadmarks_from_list(self,tm_list):
+        return tm_list.find_all('li',{'class':'threadmarkListItem'})
+
+    def get_atag_from_threadmark(self,tm_item):
+        return tm_item.find('a',{'class':'PreviewTooltip'})
+
+    def get_threadmark_range_url(self,tm_item,tmcat_num):
+        load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
+                                                                            tm_item['data-range-max'],
+                                                                            tmcat_num)
+        return self.url+load_range
+
+    def get_threadmark_date(self,tm_item):
+        atag = self.get_atag_from_threadmark(tm_item)
+        return self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
+
+    def get_threadmark_words(self,tm_item):
+        words = kwords = ""
+        atag = self.get_atag_from_threadmark(tm_item)
+        if atag.parent.has_attr('data-words'):
+            words = int(atag.parent['data-words'])
+            if "(" in atag.next_sibling:
+                kwords = atag.next_sibling.strip()
+        return words,kwords
+
     def fetch_threadmarks(self,url,tmcat_name,tmcat_num,
                           passed_tmcat_index=0):
         logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num,
                                                                                         passed_tmcat_index,
                                                                                         url))
         threadmarks=[]
         soupmarks = self.make_soup(self._fetchUrl(url))
-        tm_list = soupmarks.find('div',{'class':'threadmarkList'})
-        if not tm_list: # load-range don't have threadmarkList.
+        tm_list = self.get_threadmarks_list(soupmarks)
+        if not tm_list: # load-range pages don't match get_threadmarks_list().
            tm_list = soupmarks
         # logger.debug(tm_list)
         markas = []
         tmcat_index=passed_tmcat_index
         after = False
-        for tm_item in tm_list.find_all('li',{'class':'threadmarkListItem'}):
-            atag = tm_item.find('a',{'class':'PreviewTooltip'})
+        for tm_item in self.get_threadmarks_from_list(tm_list):
+            atag = self.get_atag_from_threadmark(tm_item)
             if not atag:
-                if tm_item['data-range-min'] and tm_item['data-range-max']:
-                    # logger.debug(tm_item)
-                    load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
-                                                                                        tm_item['data-range-max'],
-                                                                                        tmcat_num)
-                    threadmarks.extend(self.fetch_threadmarks(self.url+load_range,
-                                                              tmcat_name,
-                                                              tmcat_num,
-                                                              tmcat_index))
-                    tmcat_index = len(threadmarks)
-                    after=True
+                threadmarks.extend(self.fetch_threadmarks(self.get_threadmark_range_url(tm_item,tmcat_num),
+                                                          tmcat_name,
+                                                          tmcat_num,
+                                                          tmcat_index))
+                tmcat_index = len(threadmarks)
+                after=True
             else:
                 if after:
                     # logger.debug("AFTER "*10)
                     after=False
                 url,name = atag['href'],stripHTML(atag)
-                date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
-                if atag.parent.has_attr('data-words'):
-                    words = int(atag.parent['data-words'])
-                    if "(" in atag.next_sibling:
-                        kwords = atag.next_sibling.strip()
-                    # logger.debug("%s"%kwords)
-                else:
-                    words = ""
-                    kwords = ""
+                date = self.get_threadmark_date(tm_item)
+                words,kwords = self.get_threadmark_words(tm_item)
+                if 'http' not in url:
+                    url = self.getURLPrefix()+"/"+url
                 # logger.debug("%s. %s"%(tmcat_index,name))
%s"%(tmcat_index,name)) threadmarks.append({"tmcat_name":tmcat_name, "tmcat_num":tmcat_num, "tmcat_index":tmcat_index, "title":name, - "url":self.getURLPrefix()+"/"+url, + "url":url, "date":date, "words":words, "kwords":kwords}) @@ -348,27 +401,26 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. topsoup = souptag = self.make_soup(data) - h1 = souptag.find('div',{'class':'titleBar'}).h1 - ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1. - for tag in h1.find_all('span',{'class':'prefix'}): - ## stick them into genre. - self.story.addToList('genre',stripHTML(tag)) - tag.extract() - self.story.setMetadata('title',stripHTML(h1)) + self.parse_title(topsoup) first_post_title = self.getConfig('first_post_title','First Post') use_threadmark_chaps = False if '#' in useurl: anchorid = useurl.split('#')[1] - souptag = souptag.find('li',id=anchorid) + # souptag = souptag.find('li',id=anchorid) + # cache is now loaded with posts from that reader + # page. looking for it in cache reuses code in + # cache_posts that finds post tags. + souptag = self.get_cache_post(anchorid) + else: ## Also sets datePublished / dateUpdated to oldest / newest post datetimes. threadmarks = self.extract_threadmarks(souptag) if len(threadmarks) >= int(self.getConfig('minimum_threadmarks',2)): # remember if reader link found--only applicable if using threadmarks. - self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader$")) is not None + self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader/?$")) is not None if self.getConfig('always_include_first_post'): self.add_chapter(first_post_title,useurl) @@ -408,22 +460,18 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): if words and self.getConfig('use_threadmark_wordcounts',True): self.story.setMetadata('numWords',words) - souptag = souptag.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above) + souptag = self.get_first_post(topsoup) if use_threadmark_chaps or self.getConfig('always_use_forumtags'): ## only use tags if threadmarks for chapters or always_use_forumtags is on. - for tag in topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'}): + for tag in self.get_forumtags(topsoup): tstr = stripHTML(tag) if self.getConfig('capitalize_forumtags'): tstr = tstr.title() self.story.addToList('forumtags',tstr) # author moved down here to take from post URLs. - a = souptag.find('h3',{'class':'userText'}).find('a') - self.story.addToList('authorId',a['href'].split('/')[1]) - authorUrl = self.getURLPrefix()+'/'+a['href'] - self.story.addToList('authorUrl',authorUrl) - self.story.addToList('author',a.text) + self.parse_author(souptag) if self.getConfig('author_avatar_cover'): authorcard = self.make_soup(self._fetchUrl(authorUrl+"?card=1")) @@ -437,27 +485,28 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): ## ## </div> - # Now go hunting for the 'chapter list'. - bq = souptag.find('blockquote') # assume first posting contains TOC urls. + # Now get first post for description and chapter list if not + # using threadmarks. + first_post = self.get_first_post_body(topsoup) - bq.name='div' - - for iframe in bq.find_all('iframe'): + for iframe in first_post.find_all('iframe'): iframe.extract() # calibre book reader & editor don't like iframes to youtube. 
-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
+        for qdiv in first_post.find_all('div',{'class':'quoteExpand'}):
             qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
 
-        self.setDescription(useurl,bq)
+        self.setDescription(useurl,first_post)
 
         # otherwise, use first post links--include first post since
         # that's often also the first chapter.
         if self.num_chapters() < 1:
             self.add_chapter(first_post_title,useurl)
-            for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
+            # logger.debug(first_post)
+            for (url,name,tag) in [ (x['href'],stripHTML(x),x) for x in first_post.find_all('a') ]:
                 (is_chapter_url,url) = self._is_normalize_chapterurl(url)
-                if is_chapter_url and name != u"\u2191": # skip quote links as indicated by up arrow character.
+                # skip quote links as indicated by up arrow character or data-xf-click=attribution
+                if is_chapter_url and name != u"\u2191" and tag.get("data-xf-click",None)!="attribution":
                     self.add_chapter(name,url)
                     if url == useurl and first_post_title == self.get_chapter(0,'title') \
                             and not self.getConfig('always_include_first_post',False):
@@ -466,14 +515,56 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
         # Didn't use threadmarks, so take created/updated dates
         # from the 'first' posting created and updated.
-        date = self.make_date(souptag.find('a',{'class':'datePermalink'}))
+        date = self.get_post_created_date(souptag)
         if date:
             self.story.setMetadata('datePublished', date)
             self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
 
-        date = self.make_date(souptag.find('div',{'class':'editDate'}))
+        date = self.get_post_updated_date(souptag)
         if date:
             self.story.setMetadata('dateUpdated', date)
+        # logger.debug(self.story.getMetadata('datePublished'))
+        # logger.debug(self.story.getMetadata('dateUpdated'))
+
+    def parse_title(self,souptag):
+        h1 = souptag.find('div',{'class':'titleBar'}).h1
+        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
+        for tag in h1.find_all('span',{'class':'prefix'}):
+            ## stick them into genre.
+            self.story.addToList('genre',stripHTML(tag))
+            tag.extract()
+        self.story.setMetadata('title',stripHTML(h1))
+
+    def get_forumtags(self,topsoup):
+        return topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'})
+
+    def parse_author(self,souptag):
+        a = souptag.find('h3',{'class':'userText'}).find('a')
+        self.story.addToList('authorId',a['href'].split('/')[1])
+        authorUrl = self.getURLPrefix()+'/'+a['href']
+        self.story.addToList('authorUrl',authorUrl)
+        self.story.addToList('author',a.text)
+
+    def get_first_post(self,topsoup):
+        return topsoup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
+
+    def get_first_post_body(self,topsoup):
+        bq = self.get_first_post(topsoup).find('blockquote')
+        bq.name='div'
+        return bq
+
+    def get_post_body(self,souptag):
+        bq = souptag.find('blockquote')
+        if not bq:
+            bq = souptag.find('div',{'class':'messageText'}) # cached posts were already converted if used before.
+        bq.name='div'
+        return bq
+
+    def get_post_created_date(self,souptag):
+        return self.make_date(souptag.find('a',{'class':'datePermalink'}))
+
+    def get_post_updated_date(self,souptag):
+        return self.make_date(souptag.find('div',{'class':'editDate'}))
 
     def make_date(self,parenttag): # forums use a BS thing where dates
                                    # can appear different if recent.
@@ -496,14 +587,19 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
     def cache_posts(self,topsoup):
         for post in topsoup.find_all('li',id=re.compile('post-[0-9]+')):
+            logger.debug("Caching %s"%post['id'])
             self.post_cache[post['id']] = post
 
     def get_cache_post(self,postid):
         ## saved using original 'post-99999' id for key.
+        postid=unicode(postid) # thank you, Py3.
         if '/posts/' in postid:
             ## allows chapter urls to be passed in directly.
             # assumed normalized to /posts/1234/
             postid = "post-"+postid.split('/')[-2]
+        elif '#post-' in postid:
+            postid = postid.split('#')[1]
+        logger.debug("get cache %s %s"%(postid,postid in self.post_cache))
         return self.post_cache.get(postid,None)
 
     # grab the text for an individual chapter.
@@ -533,20 +629,13 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
             (tmcat_num,tmcat_index)=self.threadmarks_for_reader[url]
             reader_page_num = int((tmcat_index+posts_per_page)/posts_per_page) + offset
             logger.debug('Reader page offset:%s tmcat_num:%s tmcat_index:%s'%(offset,tmcat_num,tmcat_index))
-            reader_url=self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
+            reader_url=self.make_reader_url(tmcat_num,reader_page_num)
             logger.debug("Fetch reader URL to: %s"%reader_url)
-            data = self._fetchUrl(reader_url)
-            topsoup = self.make_soup(data)
-
-            # if no posts at all, break out of loop, we're off the end.
-            # don't need to remember this, the page is cached.
-            if not topsoup.find_all('li',id=re.compile(r'post-[0-9]+')):
-                break
-
-            # assumed normalized to /posts/1234/
-            anchorid = "post-"+url.split('/')[-2]
-            # logger.debug("anchorid: %s"%anchorid)
-            souptag = topsoup.find('li',id=anchorid)
+            topsoup = self.make_soup(self._fetchUrl(reader_url))
+            # make_soup() loads the cache with posts from that reader
+            # page.  looking for it in the cache reuses the code in
+            # cache_posts that finds post tags.
+            souptag = self.get_cache_post(url)
         else:
             logger.debug("post found in cache")
         if souptag:
@@ -558,52 +647,57 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
             souptag = self.get_cache_post(url)
             if not souptag:
                 (data,opened) = self._fetchUrlOpened(url)
-                url = opened.geturl()
+                url = unicode(opened.geturl())
                 if '#' in origurl and '#' not in url:
                     url = url + origurl[origurl.index('#'):]
                 logger.debug("chapter URL redirected to: %s"%url)
 
-                topsoup = souptag = self.make_soup(data)
-
-                if '#' in unicode(url):
-                    anchorid = url.split('#')[1]
-                    souptag = topsoup.find('li',id=anchorid)
+                topsoup = self.make_soup(data)
+                # make_soup() loads the cache with posts from the
+                # fetched page.  looking for it in the cache reuses the
+                # code in cache_posts that finds post tags.
+                souptag = self.get_cache_post(url)
+                if not souptag and '/threads/' in url: # first post uses /threads/ URL.
+                    souptag = self.get_first_post(topsoup)
 
         # remove <div class="baseHtml noticeContent"> because it can
         # get confused for post content on first posts.
         for notice in souptag.find_all('div',{'class':'noticeContent'}):
             notice.extract()
 
-        bq = souptag.find('blockquote')
-        if not bq:
-            bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before
+        postbody = self.get_post_body(souptag)
 
-        bq.name='div'
-
-        for iframe in bq.find_all('iframe'):
+        for iframe in postbody.find_all('iframe'):
             iframe.extract() # calibre book reader & editor don't like iframes to youtube.
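+        # (lazyload <img> fixup and quoteExpand removal now happen for
+        # every page in make_soup() above, so they are not needed here.)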
-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
-            qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
-
-        ## img alt="[​IMG]" class="bbCodeImage LbImage lazyload
-        ## include lazy load images.
-        for img in bq.find_all('img',{'class':'lazyload'}):
-            img['src'] = img['data-src']
-
         # XenForo uses <base href="https://forums.spacebattles.com/" />
-        return self.utf8FromSoup(self.getURLPrefix()+'/',bq)
+        return self.utf8FromSoup(self.getURLPrefix()+'/',postbody)
+
+    def make_reader_url(self,tmcat_num,reader_page_num):
+        return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
+
+    def get_quote_expand_tag(self,soup):
+        return soup.find_all('div',{'class':'quoteExpand'})
+
+    def get_spoiler_tags(self,topsoup):
+        return topsoup.find_all('div',class_='bbCodeSpoilerContainer')
+
+    def convert_quotes(self,soup):
+        pass
 
     def handle_spoilers(self,topsoup):
         '''
         Modifies tag given as required to do spoiler changes.
         '''
         if self.getConfig('remove_spoilers'):
-            for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
+            for div in self.get_spoiler_tags(topsoup):
                 div.extract()
         elif self.getConfig('legend_spoilers'):
-            for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
+            for div in self.get_spoiler_tags(topsoup):
                 div.name='fieldset'
+                # add copy of XF1 class name for convenience of
+                # existing output_css when XF2.
+                div['class'].append('bbCodeSpoilerContainer')
                 legend = topsoup.new_tag('legend')
                 legend.string = stripHTML(div.button.span)
                 div.insert(0,legend)
@@ -611,10 +705,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
     def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
         if self.getConfig('replace_failed_smilies_with_alt_text'):
-            for img in soup.find_all('img',src=re.compile(r'(failedtoload|clear.png)$')):
-                #logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
+            for img in soup.find_all('img',src=re.compile(r'(^data:image|(failedtoload|clear.png)$)')):
+                # logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
                 clses = unicode(img['class']) # stringify list.
-                if img.has_attr('alt') and 'mceSmilie' in clses :
+                if img.has_attr('alt') and ('mceSmilie' in clses or 'smilie--sprite' in clses):
                     ## Change the img to a span containing the alt
                     ## text, remove attrs.  This is a one-way change.
                    img.name='span'
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 43436bc9..c9765389 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -1054,6 +1054,12 @@ class Configuration(configparser.SafeConfigParser):
             headers['Content-type']='application/x-www-form-urlencoded'
         if 'Accept' not in headers:
             headers['Accept']="text/html,*/*"
+
+        if "xf2test" in url:
+            import base64
+            base64string = base64.b64encode(b"xf2demo2019:dBfbyHVvRCsYtLg846r3") # b64encode: encodestring is gone in py3.9+
+            headers['Authorization']=b"Basic %s" % base64string
+
         req = Request(url,
                       data=ensure_binary(urlencode(parameters)),
                       headers=headers)
@@ -1122,6 +1128,11 @@ class Configuration(configparser.SafeConfigParser):
                 ## not present at all
                 headers.append(('Referer',referer))
 
+        if "xf2test" in url:
+            import base64
+            base64string = base64.b64encode(b"xf2demo2019:dBfbyHVvRCsYtLg846r3") # b64encode: encodestring is gone in py3.9+
+            headers.append(('Authorization', b"Basic %s" % base64string))
+
         self.opener.addheaders = headers
 
         if parameters != None:
@@ -1182,6 +1193,7 @@ class Configuration(configparser.SafeConfigParser):
         except Exception as e:
             excpt=e
             logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
+            raise
 
         logger.debug("Giving up on %s" %safe_url(url))
         logger.debug(excpt, exc_info=True)
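
A quick standalone check of the relaxed getSiteURLPattern() regex; this is a sketch, not part of the patch, and URL_PREFIX plus the sample URLs below are illustrative assumptions:

    import re

    # Mirrors what getSiteURLPattern() builds after this change:
    # "https" relaxed to "https?" and "#post-" to "#?post-".
    URL_PREFIX = "https://forums.sufficientvelocity.com"
    pattern = (re.escape(URL_PREFIX).replace("https", "https?")
               + r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)"
               + r"/?[^#]*?(#?post-(?P<anchorpost>\d+))?$")

    for url in ("https://forums.sufficientvelocity.com/threads/some-story.12345/",
                "http://forums.sufficientvelocity.com/threads/some-story.12345/page-3#post-678",
                "https://forums.sufficientvelocity.com/posts/678/"):
        m = re.match(pattern, url)
        # prints the captured thread/post id: 12345, 12345, 678
        print(url, "->", m.group('id') if m else "no match")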