Incomplete test version for xenforo2 on xf2test.sufficientvelocity.com.

2026-01-05 23:56:08 +01:00 · 2019-04-22 14:57:55 -05:00 · 2019-04-22 14:57:55 -05:00 · e53e2bfbe5
commit e53e2bfbe5
parent ee48decec5
5 changed files with 289 additions and 25 deletions
--- a/fanficfare/adapters/init.py
+++ b/fanficfare/adapters/init.py
@ -110,6 +110,7 @@ from . import adapter_tgstorytimecom
 from . import adapter_itcouldhappennet
 from . import adapter_forumsspacebattlescom
 from . import adapter_forumssufficientvelocitycom
+from . import adapter_xf2testsufficientvelocitycom
 from . import adapter_forumquestionablequestingcom
 from . import adapter_ninelivesarchivecom
 from . import adapter_masseffect2in
--- a/fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py
+++ b/fanficfare/adapters/adapter_xf2testsufficientvelocitycom.py
@ -0,0 +1,49 @@
+#  -*- coding: utf-8 -*-
+
+# Copyright 2019 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import re
+
+from .base_xenforo2forum_adapter import BaseXenForo2ForumAdapter
+
+def getClass():
+    """Return the adapter class defined in this module."""
+    return XF2TestSufficientVelocityComAdapter
+
+class XF2TestSufficientVelocityComAdapter(BaseXenForo2ForumAdapter):
+    """Adapter for the XenForo 2 test site xf2test.sufficientvelocity.com.
+
+    All scraping behaviour comes from BaseXenForo2ForumAdapter; this class
+    only supplies the site abbreviation, the domain and the URL pattern.
+    """
+
+    def __init__(self, config, url):
+        BaseXenForo2ForumAdapter.__init__(self, config, url)
+
+        # Each adapter needs to have a unique site abbreviation.
+        self.story.setMetadata('siteabbrev','fsv2')
+
+    @staticmethod # must be @staticmethod, don't remove it.
+    def getSiteDomain():
+        """Return the domain name this adapter handles."""
+        return 'xf2test.sufficientvelocity.com'
+
+    # Disabled: alternate-domain list carried over from an adapter whose
+    # domain starts with 'forums.'.  This domain contains no 'forums.', so
+    # every replace() below would return the domain unchanged.
+    # @classmethod
+    # def getAcceptDomains(cls):
+    #     return [cls.getSiteDomain(),
+    #             cls.getSiteDomain().replace('forums.','forum.'),
+    #             cls.getSiteDomain().replace('forums.','')]
+
+    def getSiteURLPattern(self):
+        """Return the inherited URL pattern with 'forums.' made optional.
+
+        Any escaped 'forums.' in the inherited pattern is replaced by an
+        optional 'forum.'/'forums.' group.
+        """
+        ## NOTE(review): getSiteDomain() here has no 'forums.' prefix, so this
+        ## replace() presumably finds nothing to substitute and returns the
+        ## inherited pattern unchanged -- confirm against the base class.
+        return super(XF2TestSufficientVelocityComAdapter, self).getSiteURLPattern().replace(re.escape("forums."),r"(forums?\.)?")
--- a/fanficfare/adapters/base_xenforo2forum_adapter.py
+++ b/fanficfare/adapters/base_xenforo2forum_adapter.py
@ -0,0 +1,200 @@
+#  -*- coding: utf-8 -*-
+
+# Copyright 2019 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import logging
+from datetime import datetime
+logger = logging.getLogger(__name__)
+import re
+from xml.dom.minidom import parseString
+
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+
+# py2 vs py3 transition
+from ..six import text_type as unicode
+from ..six.moves.urllib.error import HTTPError
+
+from .base_adapter import makeDate
+from .base_xenforoforum_adapter import BaseXenForoForumAdapter
+
+logger = logging.getLogger(__name__)
+
+class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
+    """XenForo 2 variant of BaseXenForoForumAdapter.
+
+    Provides the page-parsing methods (title, author, first post,
+    threadmarks, dates) written against the markup this class looks for:
+    'p-title-value' headings, 'message--post' articles, 'bbWrapper' bodies
+    and 'structItem--threadmark' list items.
+    """
+
+    def __init__(self, config, url):
+        logger.info("init url: %s", url)
+        BaseXenForoForumAdapter.__init__(self, config, url)
+
+    def parse_title(self,souptag):
+        """Set the story title from the page's h1.
+
+        Every <span class="label"> inside the h1 is added to the 'genre'
+        list and removed from the h1 before the title text is taken.
+        """
+        h1 = souptag.find('h1',{'class':'p-title-value'})
+        logger.debug(h1)
+        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
+        for tag in h1.find_all('span',{'class':'label'}):
+            ## stick them into genre.
+            genre = stripHTML(tag)
+            self.story.addToList('genre',genre)
+            logger.debug(genre)
+            tag.extract()
+        title = stripHTML(h1)
+        self.story.setMetadata('title',title)
+        logger.debug(title)
+
+    def parse_author(self,souptag):
+        """Record author id, URL and name from the first user link.
+
+        The link is the first <a> inside <section class="message-user">.
+        The author id is the second-to-last '/'-separated piece of its href,
+        and the href itself is stored as the author URL without any prefix.
+        """
+        a = souptag.find('section',{'class':'message-user'}).find('a')
+        logger.debug(a)
+        href = a['href']
+        self.story.addToList('authorId',href.split('/')[-2])
+        self.story.addToList('authorUrl',href)
+        self.story.addToList('author',a.text)
+
+    def cache_posts(self,topsoup):
+        """Store every 'message--post' article in self.post_cache.
+
+        Entries are keyed by the article's 'data-content' attribute.
+        """
+        for post in topsoup.find_all('article',{'class':'message--post'}):
+            post_key = post['data-content']
+            logger.debug("Caching %s", post_key)
+            self.post_cache[post_key] = post
+
+    def get_first_post(self,topsoup):
+        """Return the first 'message--post' article, or None if absent."""
+        return topsoup.find('article',{'class':'message--post'})
+
+    def get_first_post_body(self,topsoup):
+        """Return the 'bbWrapper' div inside the first post's message body."""
+        return self.get_first_post(topsoup).find('article',{'class':'message-body'}).find('div',{'class':'bbWrapper'})
+
+    def extract_threadmarks(self,souptag):
+        """Return the threadmarks of all wanted categories, in config order.
+
+        Each category link found in the page's 'buttonGroup' div is fetched
+        with fetch_threadmarks().  Categories named in
+        'skip_threadmarks_categories' are ignored, and 'Apocrypha' is
+        renamed to 'Omake' when 'apocrypha_to_omake' is set.  The result is
+        ordered by 'threadmark_category_order'.  Returns an empty list when
+        the page has no 'buttonGroup' div.
+        """
+        threadmarks=[]
+        navdiv = souptag.find('div',{'class':'buttonGroup'})
+        if not navdiv:
+            return threadmarks
+        # was class=threadmarksTrigger.  thread cats are currently
+        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
+        # be surprised if that changed.  Don't want to do use just
+        # href=re because there's more than one copy on the page; plus
+        # could be included in a post.  Would be easier if <noscript>s
+        # weren't being stripped, but that's a different issue.
+        threadmarksas = navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})
+
+        ## Loop on threadmark categories.
+        threadmarkgroups = {} # category name -> threadmarks, for ordering
+        for threadmarksa in threadmarksas:
+            logger.debug("threadmarksa:%s", threadmarksa)
+            href = threadmarksa['href']
+            if 'threadmark_category=' in href:
+                tmcat_num = href.split('threadmark_category=')[1]
+            else:
+                tmcat_num = '1'
+            # get from earlier <a> now.
+            tmcat_name = stripHTML(threadmarksa)
+            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
+                continue
+
+            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
+                tmcat_name = 'Omake'
+
+            if 'http' not in href:
+                href = self.getURLPrefix()+'/'+href
+            fetched = self.fetch_threadmarks(href,
+                                             tmcat_name,
+                                             tmcat_num)
+            logger.debug(fetched)
+            # Extend instead of assign: with apocrypha_to_omake a thread can
+            # yield two groups named 'Omake', and plain assignment would
+            # silently drop whichever one was fetched first.
+            threadmarkgroups.setdefault(tmcat_name,[]).extend(fetched)
+        # NOTE(review): categories not named in threadmark_category_order are
+        # left out of the result entirely -- confirm that is intended.
+        for cat_name in self.getConfigList('threadmark_category_order',['Threadmarks',
+                                                                        'Sidestory',
+                                                                        'Apocrypha',
+                                                                        'Omake',
+                                                                        'Media',
+                                                                        'Informational',
+                                                                        'Staff Post']):
+            if cat_name in threadmarkgroups:
+                threadmarks.extend(threadmarkgroups[cat_name])
+        return threadmarks
+
+    def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
+        """Fetch one threadmark listing and return its entries as dicts.
+
+        url is the listing (or range-load) page to fetch.  tmcat_name and
+        tmcat_num are copied into every entry.  passed_tmcat_index is the
+        index given to the first entry found; it is non-zero when this call
+        loads a collapsed range on behalf of an outer call.
+
+        Each entry has the keys tmcat_name, tmcat_num, tmcat_index, title,
+        url, date, words (always "") and kwords (text of the item's first
+        <dd>, or "").  Items that hold a 'threadmark-fetcher' div instead of
+        a link are expanded by fetching that div's 'data-fetchurl'.
+        """
+        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)",
+                     tmcat_name,tmcat_num, passed_tmcat_index, url)
+        threadmarks=[]
+        soupmarks = self.make_soup(self._fetchUrl(url))
+        tm_list = soupmarks.find('div',{'class':'structItemContainer'})
+        if not tm_list: # load-range don't have threadmarkList.
+            tm_list = soupmarks
+        tmcat_index=passed_tmcat_index
+        for tm_item in tm_list.find_all('div',{'class':'structItem--threadmark'}):
+            atag = tm_item.find('a',{'data-tp-primary':'on'})
+            if not atag:
+                fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
+                logger.debug(fetcher)
+                range_url = fetcher.get('data-fetchurl') if fetcher else None
+                if not range_url:
+                    # Neither a threadmark link nor a range fetcher: skip the
+                    # item rather than fail on a missing tag or attribute.
+                    logger.debug("Skipping threadmark item with no link and no fetch url")
+                    continue
+                # NOTE(review): range_url is fetched as-is; unlike the hrefs
+                # below it gets no URL prefix -- confirm it is always absolute.
+                threadmarks.extend(self.fetch_threadmarks(range_url,
+                                                          tmcat_name,
+                                                          tmcat_num,
+                                                          tmcat_index))
+                # Continue numbering after the expanded range, keeping the
+                # offset this call itself started from.
+                tmcat_index = passed_tmcat_index + len(threadmarks)
+            else:
+                tm_url,name = atag['href'],stripHTML(atag)
+                date = self.make_date(tm_item)
+                worddd = tm_item.find('dd')
+                if worddd:
+                    kwords = stripHTML(worddd)
+                else:
+                    kwords = ""
+
+                if 'http' not in tm_url:
+                    tm_url = self.getURLPrefix()+"/"+tm_url
+                logger.debug("%s. %s",tmcat_index,name)
+                threadmarks.append({"tmcat_name":tmcat_name,
+                                    "tmcat_num":tmcat_num,
+                                    "tmcat_index":tmcat_index,
+                                    "title":name,
+                                    "url":tm_url,
+                                    "date":date,
+                                    "words":"",
+                                    "kwords":kwords})
+                tmcat_index += 1
+        return threadmarks
+
+    def make_date(self,parenttag): # forums use a BS thing where dates
+                                   # can appear different if recent.
+        """Return the datetime of the first <time> tag under parenttag.
+
+        The tag's 'data-time' attribute is read as a number and converted
+        with datetime.fromtimestamp(), which yields a naive local-time
+        value.  Returns None (after a debug log) when no usable date is
+        found.
+        """
+        try:
+            datetag = parenttag.find('time')
+            return datetime.fromtimestamp(float(datetag['data-time']))
+        except Exception:
+            # Best effort: a missing tag/attribute or a bad value just means
+            # "no date".  Exception (not a bare except) lets
+            # KeyboardInterrupt and SystemExit through.
+            logger.debug('No date found in %s',parenttag,exc_info=True)
+            return None
+
+
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@ -255,16 +255,12 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
            tmcat_num = threadmarksa['href'].split('category_id=')[1]
            # get from earlier <a> now.
            tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
-            prepend = ""
            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                continue

            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                tmcat_name = 'Omake'

-            if tmcat_name != "Threadmarks":
-                prepend = tmcat_name+" - "
-
            threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
                                                      tmcat_name,
                                                      tmcat_num))
@ -348,13 +344,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        # use BeautifulSoup HTML parser to make everything easier to find.
        topsoup = souptag = self.make_soup(data)

-        h1 = souptag.find('div',{'class':'titleBar'}).h1
-        ## SV has started putting 'Crossover', 'Sci-Fi' etc spans in the title h1.
-        for tag in h1.find_all('span',{'class':'prefix'}):
-            ## stick them into genre.
-            self.story.addToList('genre',stripHTML(tag))
-            tag.extract()
-        self.story.setMetadata('title',stripHTML(h1))
+        self.parse_title(topsoup)

        first_post_title = self.getConfig('first_post_title','First Post')

@ -408,7 +398,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

                if words and self.getConfig('use_threadmark_wordcounts',True):
                    self.story.setMetadata('numWords',words)
-            souptag = souptag.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
+            souptag = self.get_first_post(topsoup)

        if use_threadmark_chaps or self.getConfig('always_use_forumtags'):
            ## only use tags if threadmarks for chapters or always_use_forumtags is on.
@ -419,11 +409,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                self.story.addToList('forumtags',tstr)

        # author moved down here to take from post URLs.
-        a = souptag.find('h3',{'class':'userText'}).find('a')
-        self.story.addToList('authorId',a['href'].split('/')[1])
-        authorUrl = self.getURLPrefix()+'/'+a['href']
-        self.story.addToList('authorUrl',authorUrl)
-        self.story.addToList('author',a.text)
+        self.parse_author(souptag)

        if self.getConfig('author_avatar_cover'):
            authorcard = self.make_soup(self._fetchUrl(authorUrl+"?card=1"))
@ -437,25 +423,24 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
            ##
            ## </div>

-        # Now go hunting for the 'chapter list'.
-        bq = souptag.find('blockquote') # assume first posting contains TOC urls.
+        # Now get first post for description and chapter list if not
+        # using threadmarks.
+        first_post = self.get_first_post_body(topsoup)

-        bq.name='div'
-
-        for iframe in bq.find_all('iframe'):
+        for iframe in first_post.find_all('iframe'):
            iframe.extract() # calibre book reader & editor don't like iframes to youtube.

-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
+        for qdiv in first_post.find_all('div',{'class':'quoteExpand'}):
            qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>

-        self.setDescription(useurl,bq)
+        self.setDescription(useurl,first_post)

        # otherwise, use first post links--include first post since
        # that's often also the first chapter.

        if self.num_chapters() < 1:
            self.add_chapter(first_post_title,useurl)
-            for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
+            for (url,name) in [ (x['href'],stripHTML(x)) for x in first_post.find_all('a') ]:
                (is_chapter_url,url) = self._is_normalize_chapterurl(url)
                if is_chapter_url and name != u"\u2191": # skip quote links as indicated by up arrow character.
                    self.add_chapter(name,url)
@ -475,6 +460,30 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
            if date:
                self.story.setMetadata('dateUpdated', date)

+    def parse_title(self,souptag):
+        """Set the story title from the h1 inside the 'titleBar' div.
+
+        Each <span class="prefix"> in that h1 ('Crossover', 'Sci-Fi' etc on
+        SV) is added to the 'genre' list and removed before the title text
+        is read.
+        """
+        title_bar = souptag.find('div',{'class':'titleBar'})
+        heading = title_bar.h1
+        for prefix_span in heading.find_all('span',{'class':'prefix'}):
+            genre = stripHTML(prefix_span)
+            self.story.addToList('genre',genre)
+            prefix_span.extract()
+        title = stripHTML(heading)
+        self.story.setMetadata('title',title)
+
+    def parse_author(self,souptag):
+        """Record author id, URL and name from the first 'userText' link.
+
+        The author id is the second '/'-separated piece of the link's href;
+        the author URL is that href appended to getURLPrefix() + '/'.
+        """
+        # NOTE(review): the calling code shown alongside this method still
+        # reads a local `authorUrl` (for the author_avatar_cover fetch) after
+        # calling parse_author(); that name no longer appears to be assigned
+        # there -- confirm and fix in the caller.
+        author_link = souptag.find('h3',{'class':'userText'}).find('a')
+        href = author_link['href']
+        self.story.addToList('authorId',href.split('/')[1])
+        self.story.addToList('authorUrl',self.getURLPrefix()+'/'+href)
+        self.story.addToList('author',author_link.text)
+
+    def get_first_post(self,topsoup):
+        """Return the first <li class="message"> in topsoup, or None."""
+        # Callers use this to limit later lookups (date, author) to the
+        # first post.
+        return topsoup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
+
+    def get_first_post_body(self,topsoup):
+        """Return the first post's <blockquote>, renamed in place to a div."""
+        first_post = self.get_first_post(topsoup)
+        body = first_post.find('blockquote')
+        # Changing the tag name mutates the parsed tree itself, so the same
+        # element is a <div> wherever it is used afterwards.
+        body.name='div'
+        return body
+
    def make_date(self,parenttag): # forums use a BS thing where dates
                                   # can appear different if recent.
        datestr=None
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@ -1122,6 +1122,11 @@ class Configuration(configparser.SafeConfigParser):
            ## not present at all
            headers.append(('Referer',referer))

+        if "xf2test" in url:
+            # NOTE(review): hard-coded HTTP Basic credentials for the xf2test
+            # site live in source here; consider moving them to configuration.
+            import base64
+            # b64encode() takes bytes on both Python 2 and 3 and never inserts
+            # newlines.  encodestring() rejected text on Python 3 and was
+            # removed in Python 3.9.
+            credentials = ('%s:%s' % ("xf2demo2019", "dBfbyHVvRCsYtLg846r3")).encode('utf-8')
+            base64string = base64.b64encode(credentials)
+            if not isinstance(base64string, str):
+                # Python 3 returns bytes; keep the header value a native str.
+                base64string = base64string.decode('ascii')
+            headers.append(('Authorization', "Basic %s" % base64string))
+
        self.opener.addheaders = headers

        if parameters != None: