Refactor the XF1 and XF2 adapters to consolidate threadmark logic.

2025-12-28 19:56:40 +01:00 · 2019-04-24 13:07:22 -05:00 · 2019-04-24 13:07:22 -05:00 · cf99d82e30
commit cf99d82e30
parent 1c42040885
2 changed files with 124 additions and 162 deletions
--- a/fanficfare/adapters/base_xenforo2forum_adapter.py
+++ b/fanficfare/adapters/base_xenforo2forum_adapter.py
@ -80,128 +80,44 @@ class BaseXenForo2ForumAdapter(BaseXenForoForumAdapter):
    def get_post_updated_date(self,souptag):
        return self.make_date(souptag.find('div',{'class':'message-lastEdit'}))

-    def extract_threadmarks(self,souptag):
-        threadmarks=[]
-        # try threadmarks if no '#' in url
-        navdiv = souptag.find('div',{'class':'buttonGroup'})
-        if not navdiv:
-            return threadmarks
-        # was class=threadmarksTrigger.  thread cats are currently
-        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
-        # be surprised if that changed.  Don't want to do use just
-        # href=re because there's more than one copy on the page; plus
-        # could be included in a post.  Would be easier if <noscript>s
-        # weren't being stripped, but that's a different issue.
-        threadmarksas = navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})
-        ## Loop on threadmark categories.
-        tmcat_num=None
+    def get_threadmarks_top(self,souptag):
+        return souptag.find('div',{'class':'buttonGroup'})

-        threadmarkgroups = dict() # for ordering threadmarks
-        for threadmarksa in threadmarksas:
-            # logger.debug("threadmarksa:%s"%threadmarksa)
-            if 'threadmark_category=' in threadmarksa['href']:
-                tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
-            else:
-                tmcat_num = '1'
-            # get from earlier <a> now.
-            tmcat_name = stripHTML(threadmarksa)
-            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
-                continue
+    def get_threadmarks(self,navdiv):
+        return navdiv.find_all('a',{'class':'menuTrigger','href':re.compile('threadmarks.*(threadmark_category=)?')})

-            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
-                tmcat_name = 'Omake'
+    def get_threadmark_catnumname(self,threadmarksa):
+        if 'threadmark_category=' in threadmarksa['href']:
+            tmcat_num = threadmarksa['href'].split('threadmark_category=')[1]
+        else:
+            tmcat_num = '1'
+        tmcat_name = stripHTML(threadmarksa)
+        return (tmcat_num,tmcat_name)

-            if 'http' not in threadmarksa['href']:
-                href = self.getURLPrefix()+'/'+threadmarksa['href']
-            else:
-                href = threadmarksa['href']
-            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
-                                                                  tmcat_name,
-                                                                  tmcat_num)
-            # logger.debug(threadmarkgroups[tmcat_name])
-        ## Order of threadmark groups in new SV is changed and
-        ## possibly unpredictable.  Normalize.  Keep as configurable?
-        ## What about categories not in the list?
-        default_order = ['Threadmarks',
-                         'Sidestory',
-                         'Apocrypha',
-                         'Omake',
-                         'Media',
-                         'Informational',
-                         'Staff Post']
-        # default order also *after* config'ed
-        # threadmark_category_order so if they are not also in
-        # skip_threadmarks_categories they appear in the expected
-        # order.
-        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
-            if cat_name in threadmarkgroups:
-                threadmarks.extend(threadmarkgroups[cat_name])
-                del threadmarkgroups[cat_name]
-        # more categories left?  new or at least unknown
-        if threadmarkgroups:
-            cats = threadmarkgroups.keys()
-            # alphabetize for lack of a better idea to insure consist ordering
-            cats.sort()
-            for cat_name in cats:
-                threadmarks.extend(threadmarkgroups[cat_name])
-        return threadmarks
+    def get_threadmarks_list(self,soupmarks):
+        return soupmarks.find('div',{'class':'structItemContainer'})

-    def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
-        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
-        threadmarks=[]
-        soupmarks = self.make_soup(self._fetchUrl(url))
-        tm_list = soupmarks.find('div',{'class':'structItemContainer'})
-        if not tm_list: # load-range don't have threadmarkList.
-            tm_list = soupmarks
-        # logger.debug(tm_list)
-        markas = []
-        tmcat_index=passed_tmcat_index
-        after = False
-        for tm_item in tm_list.find_all('div',{'class':'structItem--threadmark'}):
-            atag = tm_item.find('a',{'data-tp-primary':'on'})
-            if not atag:
-                fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
-                # logger.debug(fetcher)
-                range_url = fetcher['data-fetchurl']
-                threadmarks.extend(self.fetch_threadmarks(range_url,
-                                                          tmcat_name,
-                                                          tmcat_num,
-                                                          tmcat_index))
-                tmcat_index = len(threadmarks)
-                after=True
-            else:
-                if after:
-                    # logger.debug("AFTER "*10)
-                    after=False
-                url,name = atag['href'],stripHTML(atag)
-                date = self.make_date(tm_item)
-                worddd = tm_item.find('dd')
-                if worddd:
-                    kwords = stripHTML(worddd)
-                else:
-                    kwords = ""
+    def get_threadmarks_from_list(self,tm_list):
+        return tm_list.find_all('div',{'class':'structItem--threadmark'})

-                # if atag.parent.has_attr('data-words'):
-                #     words = int(atag.parent['data-words'])
-                #     if "(" in atag.next_sibling:
-                #         kwords = atag.next_sibling.strip()
-                #     logger.debug("%s"%kwords)
-                # else:
-                #     words = ""
-                #     kwords = ""
-                if 'http' not in url:
-                    url = self.getURLPrefix()+"/"+url
-                # logger.debug("%s. %s"%(tmcat_index,name))
-                threadmarks.append({"tmcat_name":tmcat_name,
-                                    "tmcat_num":tmcat_num,
-                                    "tmcat_index":tmcat_index,
-                                    "title":name,
-                                    "url":url,
-                                    "date":date,
-                                    "words":"",
-                                    "kwords":kwords})
-                tmcat_index += 1
-        return threadmarks
+    def get_atag_from_threadmark(self,tm_item):
+        return tm_item.find('a',{'data-tp-primary':'on'})
+
+    def get_threadmark_range_url(self,tm_item,tmcat_num):
+        fetcher = tm_item.find('div',{'data-xf-click':'threadmark-fetcher'})
+        # logger.debug(fetcher)
+        return fetcher['data-fetchurl']
+
+    def get_threadmark_date(self,tm_item):
+        return self.make_date(tm_item)
+
+    ## XF2 doesn't appear to have words, just kwords.
+    def get_threadmark_words(self,tm_item):
+        words = kwords = ""
+        worddd = tm_item.find('dd')
+        if worddd:
+            kwords = "("+stripHTML(worddd)+")" # to match XF1
+        return words,kwords

    def make_date(self,parenttag):
        datestr=None
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@ -233,84 +233,131 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        self.cache_posts(soup)
        return soup

-    ## Moved over from adapter_forumquestionablequestingcom when SB/SV
-    ## threadmark.rss became 'most recent 10 in reverse order'.
+    def get_threadmarks_top(self,souptag):
+        return souptag.find('div',{'class':'threadmarkMenus'})
+
+    def get_threadmarks(self,navdiv):
+        return navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
+
+    def get_threadmark_catnumname(self,threadmarksa):
+        return (threadmarksa['href'].split('category_id=')[1],
+                stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'})))
+
    def extract_threadmarks(self,souptag):
        threadmarks=[]
        # try threadmarks if no '#' in url
-        navdiv = souptag.find('div',{'class':'threadmarkMenus'})
+        navdiv = self.get_threadmarks_top(souptag)
        if not navdiv:
            return threadmarks
-        # was class=threadmarksTrigger.  thread cats are currently
-        # only OverlayTrigger <a>s in threadmarkMenus, but I wouldn't
-        # be surprised if that changed.  Don't want to do use just
-        # href=re because there's more than one copy on the page; plus
-        # could be included in a post.  Would be easier if <noscript>s
-        # weren't being stripped, but that's a different issue.
-        threadmarksas = navdiv.find_all('a',{'class':'OverlayTrigger','href':re.compile('threadmarks.*category_id=')})
-        ## Loop on threadmark categories.
-        tmcat_num=None
+        threadmarksas = self.get_threadmarks(navdiv)

+        threadmarkgroups = dict() # for ordering threadmarks
+        ## Loop on threadmark categories.
        for threadmarksa in threadmarksas:
-            tmcat_num = threadmarksa['href'].split('category_id=')[1]
-            # get from earlier <a> now.
-            tmcat_name = stripHTML(threadmarksa.find_previous('a',{'class':'threadmarksTrigger'}))
+            (tmcat_num,tmcat_name) = self.get_threadmark_catnumname(threadmarksa)
            if tmcat_name in self.getConfigList('skip_threadmarks_categories'):
                continue

            if tmcat_name == 'Apocrypha' and self.getConfig('apocrypha_to_omake'):
                tmcat_name = 'Omake'

-            threadmarks.extend(self.fetch_threadmarks(self.getURLPrefix()+'/'+threadmarksa['href'],
-                                                      tmcat_name,
-                                                      tmcat_num))
+            if 'http' not in threadmarksa['href']:
+                href = self.getURLPrefix()+'/'+threadmarksa['href']
+            else:
+                href = threadmarksa['href']
+            threadmarkgroups[tmcat_name]=self.fetch_threadmarks(href,
+                                                                tmcat_name,
+                                                                tmcat_num)
+        ## Order of threadmark groups in new SV is changed and
+        ## possibly unpredictable.  Normalize.  Keep as configurable?
+        ## What about categories not in the list?
+        default_order = ['Threadmarks',
+                         'Sidestory',
+                         'Apocrypha',
+                         'Omake',
+                         'Media',
+                         'Informational',
+                         'Staff Post']
+        # Append the default order *after* the configured
+        # threadmark_category_order so that categories missing from
+        # the configured list (and not skipped) still appear in the
+        # expected order.
+        for cat_name in self.getConfigList('threadmark_category_order',default_order)+default_order:
+            if cat_name in threadmarkgroups:
+                threadmarks.extend(threadmarkgroups[cat_name])
+                del threadmarkgroups[cat_name]
+        # more categories left?  new or at least unknown
+        if threadmarkgroups:
+            cats = threadmarkgroups.keys()
+            # alphabetize, for lack of a better idea, to ensure consistent ordering
+            cats.sort()
+            for cat_name in cats:
+                threadmarks.extend(threadmarkgroups[cat_name])
        return threadmarks

+    def get_threadmarks_list(self,soupmarks):
+        return soupmarks.find('div',{'class':'threadmarkList'})
+
+    def get_threadmarks_from_list(self,tm_list):
+        return tm_list.find_all('li',{'class':'threadmarkListItem'})
+
+    def get_atag_from_threadmark(self,tm_item):
+        return tm_item.find('a',{'class':'PreviewTooltip'})
+
+    def get_threadmark_range_url(self,tm_item,tmcat_num):
+        load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
+                                                                            tm_item['data-range-max'],
+                                                                            tmcat_num)
+        return self.url+load_range
+
+    def get_threadmark_date(self,tm_item):
+        atag = self.get_atag_from_threadmark(tm_item)
+        return self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
+
+    def get_threadmark_words(self,tm_item):
+        words = kwords = ""
+        atag = self.get_atag_from_threadmark(tm_item)
+        if atag.parent.has_attr('data-words'):
+            words = int(atag.parent['data-words'])
+            if "(" in atag.next_sibling:
+                kwords = atag.next_sibling.strip()
+        return words,kwords
+
    def fetch_threadmarks(self,url,tmcat_name,tmcat_num, passed_tmcat_index=0):
        logger.debug("fetch_threadmarks(%s,tmcat_num=%s,passed_tmcat_index:%s,url=%s)"%(tmcat_name,tmcat_num, passed_tmcat_index, url))
        threadmarks=[]
        soupmarks = self.make_soup(self._fetchUrl(url))
-        tm_list = soupmarks.find('div',{'class':'threadmarkList'})
-        if not tm_list: # load-range don't have threadmarkList.
+        tm_list = self.get_threadmarks_list(soupmarks)
+        if not tm_list: # e.g. load-range responses have no list container
            tm_list = soupmarks
        # logger.debug(tm_list)
        markas = []
        tmcat_index=passed_tmcat_index
        after = False
-        for tm_item in tm_list.find_all('li',{'class':'threadmarkListItem'}):
-            atag = tm_item.find('a',{'class':'PreviewTooltip'})
+        for tm_item in self.get_threadmarks_from_list(tm_list):
+            atag = self.get_atag_from_threadmark(tm_item)
            if not atag:
-                if tm_item['data-range-min'] and tm_item['data-range-max']:
-                # logger.debug(tm_item)
-                    load_range = "threadmarks/load-range?min=%s&max=%s&category_id=%s"%(tm_item['data-range-min'],
-                                                                                        tm_item['data-range-max'],
-                                                                                        tmcat_num)
-                    threadmarks.extend(self.fetch_threadmarks(self.url+load_range,
-                                                              tmcat_name,
-                                                              tmcat_num,
-                                                              tmcat_index))
-                    tmcat_index = len(threadmarks)
-                    after=True
+                threadmarks.extend(self.fetch_threadmarks(self.get_threadmark_range_url(tm_item,tmcat_num),
+                                                          tmcat_name,
+                                                          tmcat_num,
+                                                          tmcat_index))
+                tmcat_index = len(threadmarks)
+                after=True
            else:
                if after:
                    # logger.debug("AFTER "*10)
                    after=False
                url,name = atag['href'],stripHTML(atag)
-                date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
-                if atag.parent.has_attr('data-words'):
-                    words = int(atag.parent['data-words'])
-                    if "(" in atag.next_sibling:
-                        kwords = atag.next_sibling.strip()
-                    # logger.debug("%s"%kwords)
-                else:
-                    words = ""
-                    kwords = ""
+                date = self.get_threadmark_date(tm_item)
+                words,kwords = self.get_threadmark_words(tm_item)
+                if 'http' not in url:
+                    url = self.getURLPrefix()+"/"+url
                # logger.debug("%s. %s"%(tmcat_index,name))
                threadmarks.append({"tmcat_name":tmcat_name,
                                    "tmcat_num":tmcat_num,
                                    "tmcat_index":tmcat_index,
                                    "title":name,
-                                    "url":self.getURLPrefix()+"/"+url,
+                                    "url":url,
                                    "date":date,
                                    "words":words,
                                    "kwords":kwords})
@ -495,7 +542,6 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        bq = souptag.find('blockquote')
        if not bq:
            bq = souptag.find('div',{'class':'messageText'}) # cached gets if it was already used before
-
        bq.name='div'
        return bq

@ -623,7 +669,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

    def make_reader_url(self,tmcat_num,reader_page_num):
        return self.getURLPrefix()+'/threads/'+self.story.getMetadata('storyId')+'/'+tmcat_num+'/reader?page='+unicode(reader_page_num)
-    
+
    def handle_spoilers(self,topsoup):
        '''
        Modifies tag given as required to do spoiler changes.