Merge branch 'SV-xenforo2'

Jim Miller 2019-05-07 11:07:41 -05:00
commit a8bdc69cea
5 changed files with 466 additions and 115 deletions

View file: fanficfare/adapters/__init__.py

@@ -110,6 +110,7 @@ from . import adapter_tgstorytimecom
from . import adapter_itcouldhappennet
from . import adapter_forumsspacebattlescom
from . import adapter_forumssufficientvelocitycom
from . import adapter_xf2testsufficientvelocitycom
from . import adapter_forumquestionablequestingcom
from . import adapter_ninelivesarchivecom
from . import adapter_masseffect2in

View file: fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
def getClass():
return XF2TestSufficientVelocityComAdapter
class XF2TestSufficientVelocityComAdapter(BaseXenForo2ForumAdapter):
def __init__(self, config, url):
BaseXenForo2ForumAdapter.__init__(self, config, url)
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','fsv2')
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain; includes the 'www.' prefix here if the site uses one.
return 'xf2test.sufficientvelocity.com'
# @classmethod
# def getAcceptDomains(cls):
# return [cls.getSiteDomain(),
# cls.getSiteDomain().replace('forums.','forum.'),
# cls.getSiteDomain().replace('forums.','')]
def getSiteURLPattern(self):
## SV accepts forums.sufficientvelocity.com, forum.sufficientvelocity.com
## and sufficientvelocity.com, all of which redirect to forums.
## We use forums. as the canonical form for all of them.
return super(XF2TestSufficientVelocityComAdapter, self).getSiteURLPattern().replace(re.escape("forums."),r"(forums?\.)?")
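## A minimal sketch (not part of this commit) of what that replace buys,
## assuming the base pattern begins with the escaped URL prefix:
import re
pattern = re.escape("https://forums.sufficientvelocity.com").replace(
    re.escape("forums."), r"(forums?\.)?")
for host in ("forums.", "forum.", ""):
    # forums., forum. and the bare domain all match the broadened pattern.
    assert re.match(pattern, "https://%ssufficientvelocity.com" % host)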

View file: fanficfare/adapters/base_xenforo2forum_adapter.py

@@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-
# Copyright 2019 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
from datetime import datetime
logger = logging.getLogger(__name__)
import re
from xml.dom.minidom import parseString
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
from .base_adapter import makeDate
from .base_xenforoforum_adapter import BaseXenForoForumAdapter
class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
def __init__(self, config, url):
logger.info("init url: "+url)
BaseXenForoForumAdapter.__init__(self, config, url)
@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return ['base_xenforo2forum'] + super(BaseXenForo2ForumAdapter, cls).getConfigSections()
def performLogin(self):
params = {}
if self.password:
params['login'] = self.username
params['password'] = self.password
else:
params['login'] = self.getConfig("username")
params['password'] = self.getConfig("password")
if not params['login']:
raise exceptions.FailedToLogin(self.url,"No username given. Set in personal.ini or enter when prompted.")
## need a login token.
data = self._fetchUrl(self.getURLPrefix() + '/login',usecache=False)
# logger.debug(data)
# <input type="hidden" name="_xfToken" value="1556822458,710e5bf6fc87c67ea04ab56a910ac3ff" />
find_token='<input type="hidden" name="_xfToken" value="'
xftoken = data[data.index(find_token)+len(find_token):]
xftoken = xftoken[:xftoken.index('"')]
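## e.g. for the sample input above, the two slices yield
## '1556822458,710e5bf6fc87c67ea04ab56a910ac3ff'.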
params['remember'] = '1'
params['_xfToken'] = xftoken
params['_xfRedirect'] = self.getURLPrefix() + '/'
## https://forum.questionablequesting.com/login/login
loginUrl = self.getURLPrefix() + '/login/login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['login']))
d = self._postUrl(loginUrl, params)# , headers={ 'referer':self.getURLPrefix() + '/login',
# 'origin':self.getURLPrefix() })
if "Log In" in d:
# logger.debug(d)
logger.info("Failed to login to URL %s as %s" % (self.url,
params['login']))
raise exceptions.FailedToLogin(self.url,params['login'])
return False
else:
return True
def parse_title(self,souptag):
h1 = souptag.find('h1',{'class':'p-title-value'})
# logger.debug(h1)
## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
for tag in h1.find_all('span',{'class':'label'}):
## stick them into genre.
self.story.addToList('genre',stripHTML(tag))
# logger.debug(stripHTML(tag))
tag.extract()
self.story.setMetadata('title',stripHTML(h1))
# logger.debug(stripHTML(h1))
def get_forumtags(self,topsoup):
return topsoup.find('div',{'class':'p-description'}).findAll('a',{'class':'tagItem'})
def parse_author(self,souptag):
a = souptag.find('section',{'class':'message-user'}).find('a',{'class':'username'})
# logger.debug(a)
self.story.addToList('authorId',a['href'].split('/')[-2])
authorUrl = a['href'] # self.getURLPrefix()+'/'+a['href']
self.story.addToList('authorUrl',authorUrl)
self.story.addToList('author',a.text)
def cache_posts(self,topsoup):
for post in topsoup.find_all('article',{'class':'message--post'}):
logger.debug("Caching %s"%post['data-content'])
self.post_cache[post['data-content']] = post
def get_first_post(self,topsoup):
return topsoup.find('article',{'class':'message--post'})
def get_first_post_body(self,topsoup):
return self.get_post_body(self.get_first_post(topsoup))
def get_post_body(self,souptag):
return souptag.find('article',{'class':'message-body'}).find('div',{'class':'bbWrapper'})
def get_post_created_date(self,souptag):
return self.make_date(souptag.find('div', {'class':'message-date'}))
def get_post_updated_date(self,souptag):
return self.make_date(souptag.find('div',{'class':'message-lastEdit'}))
def get_threadmarks_top(self,souptag):
return souptag.find('div',{'class':'block-outer-main--threadmarks'})
def get_threadmarks(self,navdiv):
return navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})
def get_threadmark_catnumname(self,threadmarksa):
if 'threadmark_category=' in threadmarksa['href']:
tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
else:
tmcat_num = '1'
tmcat_name = stripHTML(threadmarksa)
return (tmcat_num,tmcat_name)
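## e.g. an href ending in '?threadmark_category=4' yields tmcat_num '4';
## hrefs without the query string fall back to category '1'.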
def get_threadmarks_list(self,soupmarks):
return soupmarks.find('div',{'class':'structItemContainer'})
def get_threadmarks_from_list(self,tm_list):
return tm_list.find_all('div',{'class':'structItem--threadmark'})
def get_atag_from_threadmark(self,tm_item):
return tm_item.find('a',{'data-tp-primary':'on'})
def get_threadmark_range_url(self,tm_item,tmcat_num):
fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
# logger.debug(fetcher)
return fetcher['data-fetchurl']
def get_threadmark_date(self,tm_item):
return self.make_date(tm_item)
## XF2 doesn't appear to have words, just kwords.
def get_threadmark_words(self,tm_item):
words = kwords = ""
worddd = tm_item.find('dd')
if worddd:
kwords = "("+stripHTML(worddd)+")" # to match XF1
return words,kwords
def make_date(self,parenttag):
datestr=None
try:
datetag = parenttag.find('time')
# not paying any attention to TZ issues.
return datetime.fromtimestamp(float(datetag['data-time']))
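## e.g. <time data-time="1556822458"> gives
## datetime.fromtimestamp(1556822458.0), i.e. 2019-05-02 (the exact
## wall-clock value depends on the local TZ, per the note above).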
except Exception:
logger.warning('No date found in %s'%parenttag,exc_info=True)
return None
def make_reader_url(self,tmcat_num,reader_page_num):
# https://xf2test.sufficientvelocity.com/threads/mauling-snarks-worm.41471/reader/page-4?threadmark_category=4
return self.story.getMetadata('storyUrl')+'reader/page-'+unicode(reader_page_num)+'?threadmark_category='+tmcat_num
def get_quote_expand_tag(self,soup):
return soup.find_all('div',{'class':re.compile(r'bbCodeBlock-(expand|shrink)Link')})
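## (matches both the 'bbCodeBlock-expandLink' and 'bbCodeBlock-shrinkLink' divs.)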
def get_spoiler_tags(self,topsoup):
return topsoup.find_all('div',class_='bbCodeSpoiler')
def convert_quotes(self,soup):
## make XF2 quote divs blockquotes so the spacing is the same
## as XF1.
for tag in soup.find_all('div', class_="bbCodeBlock-expandContent"):
tag.name='blockquote'

View file: fanficfare/adapters/base_xenforoforum_adapter.py

@@ -86,7 +86,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
## need to accept http and https still.
return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#post-(?P<anchorpost>\d+))?$"
return re.escape(self.getURLPrefix()).replace("https","https?")+r"/(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"
def _fetchUrlOpened(self, url,
parameters=None,
@@ -206,7 +206,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## https://forum.questionablequesting.com/login/login
loginUrl = self.getURLPrefix() + '/login/login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['login']))
d = self._fetchUrl(loginUrl, params)
@@ -220,6 +220,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def make_soup(self,data):
soup = super(BaseXenForoForumAdapter, self).make_soup(data)
## img class="lazyload"
## include lazy load images.
for img in soup.find_all('img',{'class':'lazyload'}):
img['src'] = img['data-src']
## after lazy load images, there are noscript blocks also
## containing <img> tags. The problem comes in when they hit
## book readers such as Kindle and Nook and then you see the
## same image twice.
@@ -227,94 +232,142 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
for noscript in soup.find_all('noscript'):
noscript.extract()
for qdiv in self.get_quote_expand_tag(soup):
qdiv.extract() # Remove <div class="...">click to expand</div>
self.convert_quotes(soup)
self.handle_spoilers(soup)
## cache posts on page.
self.cache_posts(soup)
return soup
## Moved over from adapter_forumquestionablequestingcom when SB/SV
## threadmark.rss became 'most recent 10 in reverse order'.
def get_threadmarks_top(self,souptag):
return souptag.find('div',{'class':'threadmarkMenus'})
def get_threadmarks(self,navdiv):
return navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
def get_threadmark_catnumname(self,threadmarksa):
return (threadmarksa['href'].split('category_id=')[1],
stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'})))
def extract_threadmarks(self,souptag):
threadmarks=[]
# try threadmarks if no '#' in url
navdiv = souptag.find('div',{'class':'threadmarkMenus'})
navdiv = self.get_threadmarks_top(souptag)
if not navdiv:
return threadmarks
# was class=threadmarksTrigger. thread cats are currently
# only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
# be surprised if that changed. Don't want to use just
# href=re because there's more than one copy on the page; plus
# could be included in a post. Would be easier if <noscript>s
# weren't being stripped, but that's a different issue.
threadmarksas = navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
## Loop on threadmark categories.
tmcat_num=None
threadmarksas = self.get_threadmarks(navdiv)
threadmarkgroups = dict() # for ordering threadmarks
## Loop on threadmark categories.
for threadmarksa in threadmarksas:
tmcat_num = threadmarksa['href'].split('category_id=')[1]
# get from earlier <a> now.
tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
prepend = ""
(tmcat_num,tmcat_name) = self.get_threadmark_catnumname(threadmarksa)
if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
continue
if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
tmcat_name = 'Omake'
if tmcat_name != "Threadmarks":
prepend = tmcat_name+" - "
threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
tmcat_name,
tmcat_num))
if 'http' not in threadmarksa['href']:
href = self.getURLPrefix()+'/'+threadmarksa['href']
else:
href = threadmarksa['href']
threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
tmcat_name,
tmcat_num)
## Order of threadmark groups in new SV is changed and
## possibly unpredictable. Normalize. Keep as configurable?
## What about categories not in the list?
default_order = ['Threadmarks',
'Sidestory',
'Apocrypha',
'Omake',
'Media',
'Informational',
'Staff Post']
# default order is also appended *after* the configured
# threadmark_category_order, so categories not listed there (and not
# in skip_threadmarks_categories) still appear in the expected order.
for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
if cat_name in threadmarkgroups:
threadmarks.extend(threadmarkgroups[cat_name])
del threadmarkgroups[cat_name]
# more categories left? new or at least unknown
if threadmarkgroups:
# alphabetize for lack of a better idea, to ensure consistent ordering.
# use sorted() rather than .sort(): on py3, dict.keys() is a view with no .sort() method.
cats = sorted(threadmarkgroups.keys())
for cat_name in cats:
threadmarks.extend(threadmarkgroups[cat_name])
return threadmarks
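## e.g. with threadmark_category_order=['Omake'] and fetched groups for
## 'Threadmarks', 'Omake' and a (hypothetical) new 'Recaps' category, the
## ordering above yields Omake (configured), then Threadmarks (default
## order), then Recaps (alphabetized leftover).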
def get_threadmarks_list(self,soupmarks):
return soupmarks.find('div',{'class':'threadmarkList'})
def get_threadmarks_from_list(self,tm_list):
return tm_list.find_all('li',{'class':'threadmarkListItem'})
def get_atag_from_threadmark(self,tm_item):
return tm_item.find('a',{'class':'PreviewTooltip'})
def get_threadmark_range_url(self,tm_item,tmcat_num):
load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
tm_item['data-range-max'],
tmcat_num)
return self.url+load_range
def get_threadmark_date(self,tm_item):
atag = self.get_atag_from_threadmark(tm_item)
return self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
def get_threadmark_words(self,tm_item):
words = kwords = ""
atag = self.get_atag_from_threadmark(tm_item)
if atag.parent.has_attr('data-words'):
words = int(atag.parent['data-words'])
if "(" in atag.next_sibling:
kwords = atag.next_sibling.strip()
return words,kwords
def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
threadmarks=[]
soupmarks = self.make_soup(self._fetchUrl(url))
tm_list = soupmarks.find('div',{'class':'threadmarkList'})
if not tm_list: # load-range pages don't have threadmarkList.
tm_list = self.get_threadmarks_list(soupmarks)
if not tm_list: # load-range pages don't match
tm_list = soupmarks
# logger.debug(tm_list)
markas = []
tmcat_index=passed_tmcat_index
after = False
for tm_item in tm_list.find_all('li',{'class':'threadmarkListItem'}):
atag = tm_item.find('a',{'class':'PreviewTooltip'})
for tm_item in self.get_threadmarks_from_list(tm_list):
atag = self.get_atag_from_threadmark(tm_item)
if not atag:
if tm_item['data-range-min'] and tm_item['data-range-max']:
# logger.debug(tm_item)
load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
tm_item['data-range-max'],
tmcat_num)
threadmarks.extend(self.fetch_threadmarks(self.url+load_range,
tmcat_name,
tmcat_num,
tmcat_index))
tmcat_index = len(threadmarks)
after=True
threadmarks.extend(self.fetch_threadmarks(self.get_threadmark_range_url(tm_item,tmcat_num),
tmcat_name,
tmcat_num,
tmcat_index))
tmcat_index = len(threadmarks)
after=True
else:
if after:
# logger.debug("AFTER "*10)
after=False
url,name = atag['href'],stripHTML(atag)
date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
if atag.parent.has_attr('data-words'):
words = int(atag.parent['data-words'])
if "(" in atag.next_sibling:
kwords = atag.next_sibling.strip()
# logger.debug("%s"%kwords)
else:
words = ""
kwords = ""
date = self.get_threadmark_date(tm_item)
words,kwords = self.get_threadmark_words(tm_item)
if 'http' not in url:
url = self.getURLPrefix()+"/"+url
# logger.debug("%s. %s"%(tmcat_index,name))
threadmarks.append({"tmcat_name":tmcat_name,
"tmcat_num":tmcat_num,
"tmcat_index":tmcat_index,
"title":name,
"url":self.getURLPrefix()+"/"+url,
"url":url,
"date":date,
"words":words,
"kwords":kwords})
@@ -348,27 +401,26 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
topsoup = souptag = self.make_soup(data)
h1 = souptag.find('div',{'class':'titleBar'}).h1
## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
for tag in h1.find_all('span',{'class':'prefix'}):
## stick them into genre.
self.story.addToList('genre',stripHTML(tag))
tag.extract()
self.story.setMetadata('title',stripHTML(h1))
self.parse_title(topsoup)
first_post_title = self.getConfig('first_post_title','First Post')
use_threadmark_chaps = False
if '#' in useurl:
anchorid = useurl.split('#')[1]
souptag = souptag.find('li',id=anchorid)
# souptag = souptag.find('li',id=anchorid)
# cache is now loaded with posts from that reader
# page. looking for it in cache reuses code in
# cache_posts that finds post tags.
souptag = self.get_cache_post(anchorid)
else:
## Also sets datePublished / dateUpdated to oldest / newest post datetimes.
threadmarks = self.extract_threadmarks(souptag)
if len(threadmarks) >= int(self.getConfig('minimum_threadmarks',2)):
# remember if reader link found--only applicable if using threadmarks.
self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader$")) is not None
self.reader = topsoup.find('a',href=re.compile(r'\.'+self.story.getMetadata('storyId')+r"/reader/?$")) is not None
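# the added /? also accepts a trailing slash, as on XF2-style .../reader/ links.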
if self.getConfig('always_include_first_post'):
self.add_chapter(first_post_title,useurl)
@@ -408,22 +460,18 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if words and self.getConfig('use_threadmark_wordcounts',True):
self.story.setMetadata('numWords',words)
souptag = souptag.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
souptag = self.get_first_post(topsoup)
if use_threadmark_chaps or self.getConfig('always_use_forumtags'):
## only use tags if threadmarks for chapters or always_use_forumtags is on.
for tag in topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'}):
for tag in self.get_forumtags(topsoup):
tstr = stripHTML(tag)
if self.getConfig('capitalize_forumtags'):
tstr = tstr.title()
self.story.addToList('forumtags',tstr)
# author moved down here to take from post URLs.
a = souptag.find('h3',{'class':'userText'}).find('a')
self.story.addToList('authorId',a['href'].split('/')[1])
authorUrl = self.getURLPrefix()+'/'+a['href']
self.story.addToList('authorUrl',authorUrl)
self.story.addToList('author',a.text)
self.parse_author(souptag)
if self.getConfig('author_avatar_cover'):
authorcard = self.make_soup(self._fetchUrl(authorUrl+"?card=1"))
@@ -437,27 +485,28 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
##
## </div>
# Now go hunting for the 'chapter list'.
bq = souptag.find('blockquote') # assume first posting contains TOC urls.
# Now get first post for description and chapter list if not
# using threadmarks.
first_post = self.get_first_post_body(topsoup)
bq.name='div'
for iframe in bq.find_all('iframe'):
for iframe in first_post.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
for qdiv in first_post.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
self.setDescription(useurl,bq)
self.setDescription(useurl,first_post)
# otherwise, use first post links--include first post since
# that's often also the first chapter.
if self.num_chapters() < 1:
self.add_chapter(first_post_title,useurl)
for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
# logger.debug(first_post)
for (url,name,tag) in [ (x['href'],stripHTML(x),x) for x in first_post.find_all('a') ]:
(is_chapter_url,url) = self._is_normalize_chapterurl(url)
if is_chapter_url and name != u"\u2191": # skip quote links as indicated by up arrow character.
# skip quote links as indicated by up arrow character or data-xf-click=attribution
if is_chapter_url and name != u"\u2191" and tag.get("data-xf-click",None)!="attribution":
self.add_chapter(name,url)
if url == useurl and first_post_title == self.get_chapter(0,'url') \
and not self.getConfig('always_include_first_post',False):
@@ -466,14 +515,56 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# Didn't use threadmarks, so take created/updated dates
# from the 'first' posting created and updated.
date = self.make_date(souptag.find('a',{'class':'datePermalink'}))
date = self.get_post_created_date(souptag)
if date:
self.story.setMetadata('datePublished', date)
self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
date = self.make_date(souptag.find('div',{'class':'editDate'}))
date = self.get_post_updated_date(souptag)
if date:
self.story.setMetadata('dateUpdated', date)
# logger.debug(self.story.getMetadata('datePublished'))
# logger.debug(self.story.getMetadata('dateUpdated'))
def parse_title(self,souptag):
h1 = souptag.find('div',{'class':'titleBar'}).h1
## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
for tag in h1.find_all('span',{'class':'prefix'}):
## stick them into genre.
self.story.addToList('genre',stripHTML(tag))
tag.extract()
self.story.setMetadata('title',stripHTML(h1))
def get_forumtags(self,topsoup):
return topsoup.findAll('a',{'class':'tag'}) + topsoup.findAll('span',{'class':'prefix'})
def parse_author(self,souptag):
a = souptag.find('h3',{'class':'userText'}).find('a')
self.story.addToList('authorId',a['href'].split('/')[1])
authorUrl = self.getURLPrefix()+'/'+a['href']
self.story.addToList('authorUrl',authorUrl)
self.story.addToList('author',a.text)
def get_first_post(self,topsoup):
return topsoup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
def get_first_post_body(self,topsoup):
bq = self.get_first_post(topsoup).find('blockquote')
bq.name='div'
return bq
def get_post_body(self,souptag):
bq = souptag.find('blockquote')
if not bq:
bq = souptag.find('div',{'class':'messageText'}) # a cached post may already have had its blockquote renamed to div on an earlier pass
bq.name='div'
return bq
def get_post_created_date(self,souptag):
return self.make_date(souptag.find('a',{'class':'datePermalink'}))
def get_post_updated_date(self,souptag):
return self.make_date(souptag.find('div',{'class':'editDate'}))
def make_date(self,parenttag): # forums use a BS thing where dates
# can appear different if recent.
@@ -496,14 +587,19 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def cache_posts(self,topsoup):
for post in topsoup.find_all('li',id=re.compile('post-[0-9]+')):
logger.debug("Caching %s"%post['id'])
self.post_cache[post['id']] = post
def get_cache_post(self,postid):
## saved using original 'post-99999' id for key.
postid=unicode(postid) # thank you, Py3.
if '/posts/' in postid:
## allows chapter urls to be passed in directly.
# assumed normalized to /posts/1234/
postid = "post-"+postid.split('/')[-2]
elif '#post-' in postid:
postid = postid.split('#')[1]
logger.debug("get cache %s %s"%(postid,postid in self.post_cache))
return self.post_cache.get(postid,None)
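## e.g. a normalized chapter URL 'https://site/posts/1234/' maps to key
## 'post-1234', as does an anchor form '...#post-1234' (the key format
## cache_posts stores). 'site' is a placeholder host.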
# grab the text for an individual chapter.
@@ -533,20 +629,13 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
(tmcat_num,tmcat_index)=self.threadmarks_for_reader[url]
reader_page_num = int((tmcat_index+posts_per_page)/posts_per_page) + offset
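# e.g. with posts_per_page=25 and offset=0: tmcat_index 0..24 -> page 1, 25..49 -> page 2.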
logger.debug('Reader page offset:%s tmcat_num:%s tmcat_index:%s'%(offset,tmcat_num,tmcat_index))
reader_url=self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
reader_url=self.make_reader_url(tmcat_num,reader_page_num)
logger.debug("Fetch reader URL to: %s"%reader_url)
data = self._fetchUrl(reader_url)
topsoup = self.make_soup(data)
# if no posts at all, break out of loop, we're off the end.
# don't need to remember this, the page is cached.
if not topsoup.find_all('li',id=re.compile(r'post-[0-9]+')):
break
# assumed normalized to /posts/1234/
anchorid = "post-"+url.split('/')[-2]
# logger.debug("anchorid: %s"%anchorid)
souptag = topsoup.find('li',id=anchorid)
topsoup = self.make_soup(self._fetchUrl(reader_url))
# make_soup() loads cache with posts from that reader
# page. looking for it in cache reuses code in
# cache_posts that finds post tags.
souptag = self.get_cache_post(url)
else:
logger.debug("post found in cache")
if souptag:
@@ -558,52 +647,57 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
souptag = self.get_cache_post(url)
if not souptag:
(data,opened) = self._fetchUrlOpened(url)
url = opened.geturl()
url = unicode(opened.geturl())
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)
topsoup = souptag = self.make_soup(data)
if '#' in unicode(url):
anchorid = url.split('#')[1]
souptag = topsoup.find('li',id=anchorid)
topsoup = self.make_soup(data)
# make_soup() loads cache with posts from that reader
# page. looking for it in cache reuses code in
# cache_posts that finds post tags.
souptag = self.get_cache_post(url)
if not souptag and '/threads/' in url: # first post uses /threads/ URL.
souptag = self.get_first_post(topsoup)
# remove <div class="baseHtml noticeContent"> because it can
# get confused for post content on first posts.
for notice in souptag.find_all('div',{'class':'noticeContent'}):
notice.extract()
bq = souptag.find('blockquote')
if not bq:
bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before
postbody = self.get_post_body(souptag)
bq.name='div'
for iframe in bq.find_all('iframe'):
for iframe in postbody.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
## img alt="[IMG]" class="bbCodeImage LbImage lazyload"
## include lazy load images.
for img in bq.find_all('img',{'class':'lazyload'}):
img['src'] = img['data-src']
# XenForo uses <base href="https://forums.spacebattles.com/" />
return self.utf8FromSoup(self.getURLPrefix()+'/',bq)
return self.utf8FromSoup(self.getURLPrefix()+'/',postbody)
def make_reader_url(self,tmcat_num,reader_page_num):
return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
def get_quote_expand_tag(self,soup):
return soup.find_all('div',{'class':'quoteExpand'})
def get_spoiler_tags(self,topsoup):
return topsoup.find_all('div',class_='bbCodeSpoilerContainer')
def convert_quotes(self,soup):
pass
def handle_spoilers(self,topsoup):
'''
Modifies the given tag as required for spoiler handling.
'''
if self.getConfig('remove_spoilers'):
for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
for div in self.get_spoiler_tags(topsoup):
div.extract()
elif self.getConfig('legend_spoilers'):
for div in topsoup.find_all('div',class_='bbCodeSpoilerContainer'):
for div in self.get_spoiler_tags(topsoup):
div.name='fieldset'
# add a copy of the XF1 class name so existing output_css
# keeps working under XF2.
div['class'].append('bbCodeSpoilerContainer')
legend = topsoup.new_tag('legend')
legend.string = stripHTML(div.button.span)
div.insert(0,legend)
@@ -611,10 +705,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
if self.getConfig('replace_failed_smilies_with_alt_text'):
for img in soup.find_all('img',src=re.compile(r'(failedtoload|clear.png)$')):
#logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
for img in soup.find_all('img',src=re.compile(r'(^data:image|(failedtoload|clear.png)$)')):
# logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
clses = unicode(img['class']) # stringify list.
if img.has_attr('alt') and 'mceSmilie' in clses :
if img.has_attr('alt') and ('mceSmilie' in clses or 'smilie--sprite' in clses):
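## ('mceSmilie' is the XF1 smilie class; 'smilie--sprite' appears to be
## the XF2 equivalent this commit adds support for.)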
## Change the img to a span containing the alt
## text, remove attrs. This is a one-way change.
img.name='span'

View file: fanficfare/configurable.py

@@ -1054,6 +1054,12 @@ class Configuration(configparser.SafeConfigParser):
headers['Content-type']='application/x-www-form-urlencoded'
if 'Accept' not in headers:
headers['Accept']="text/html,*/*"
if "xf2test" in url:
import base64
base64string = base64.encodestring(b"xf2demo2019:dBfbyHVvRCsYtLg846r3").replace(b'\n', b'')
headers['Authorization']=b"Basic %s" % base64string
req = Request(url,
data=ensure_binary(urlencode(parameters)),
headers=headers)
@@ -1122,6 +1128,11 @@ class Configuration(configparser.SafeConfigParser):
## not present at all
headers.append(('Referer',referer))
if "xf2test" in url:
import base64
base64string = base64.encodestring(b"xf2demo2019:dBfbyHVvRCsYtLg846r3").replace(b'\n', b'')
headers.append(('Authorization', b"Basic %s" % base64string))
self.opener.addheaders = headers
if parameters != None:
@@ -1182,6 +1193,7 @@ class Configuration(configparser.SafeConfigParser):
except Exception as e:
excpt=e
logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
raise
logger.debug("Giving up on %s" %safe_url(url))
logger.debug(excpt, exc_info=True)