Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2026-04-27 01:11:21 +02:00.

Adding normalize_chapterurl() for xenforoforum and normalize_text_links option.
Parent: c9205dd6bc
Commit: a40383bada
5 changed files with 166 additions and 109 deletions

@@ -720,6 +720,11 @@ remove_transparency: true
## true--replace_br_with_p also fixes the problem.
nook_img_fix:true

## Apply adapter's normalize_chapterurl() to all links in chapter
## texts, if they match chapter URLs. Currently only implemented by
## base_xenforoforum adapters.
#normalize_text_links:false

[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true

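For illustration only (not part of the commit): a user could try the new option by adding it to an ini section of their own, e.g. in personal.ini. A minimal sketch, assuming the [defaults] section is an acceptable place for it (the option appears to be registered for all sites and the epub/html outputs in the get_valid_set_options() change further down):

[defaults]
## rewrite chapter-text links through the adapter's normalize_chapterurl()
normalize_text_links:true
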
@@ -84,7 +84,7 @@ class BaseSiteAdapter(Configurable):

def __init__(self, configuration, url):
Configurable.__init__(self, configuration)

self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False

@@ -113,7 +113,7 @@ class BaseSiteAdapter(Configurable):
self.logfile = None

self.pagecache = self.get_empty_pagecache()

## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of

@@ -135,17 +135,17 @@ class BaseSiteAdapter(Configurable):
saveheaders = self.opener.addheaders
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.opener.addheaders = saveheaders

def load_cookiejar(self,filename):
'''
Needs to be called after adapter create, but before any fetchs
are done. Takes file *name*.
'''
self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)

def get_pagecache(self):
return self.pagecache

def set_pagecache(self,d):
self.pagecache=d

@@ -159,7 +159,7 @@ class BaseSiteAdapter(Configurable):

def _has_cachekey(self,cachekey):
return self.use_pagecache() and cachekey in self.get_pagecache()

def _get_from_pagecache(self,cachekey):
if self.use_pagecache():
return self.get_pagecache().get(cachekey)

@@ -176,18 +176,18 @@ class BaseSiteAdapter(Configurable):
this and change it to True.
'''
return False

# def story_load(self,filename):
# d = pickle.load(self.story.metadata,filename)
# self.story.metadata = d['metadata']
# self.chapterUrls = d['chapterlist']
# self.story.metadataDone = True

def _setURL(self,url):
self.url = url
self.parsedUrl = up.urlparse(url)
self.host = self.parsedUrl.netloc
self.path = self.parsedUrl.path
self.path = self.parsedUrl.path
self.story.setMetadata('storyUrl',self.url,condremoveentities=False)

## website encoding(s)--in theory, each website reports the character

@@ -201,7 +201,7 @@ class BaseSiteAdapter(Configurable):
decode = self.getConfigList('website_encodings')
else:
decode = self.decode

for code in decode:
try:
#print code

@@ -230,7 +230,7 @@ class BaseSiteAdapter(Configurable):
usecache=True):
'''
When should cache be cleared or not used? logins...

extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.

@@ -240,7 +240,7 @@ class BaseSiteAdapter(Configurable):
logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
return data

logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)

@@ -261,19 +261,19 @@ class BaseSiteAdapter(Configurable):
parameters=None,
extrasleep=None,
usecache=True):

return self._fetchUrlRawOpened(url,
parameters,
extrasleep,
usecache)[0]

def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...

extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.

@@ -289,7 +289,7 @@ class BaseSiteAdapter(Configurable):
def geturl(self): return self.url
def read(self): return self.data
return (data,FakeOpened(data,redirecturl))

logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
if parameters != None:

@@ -298,13 +298,13 @@ class BaseSiteAdapter(Configurable):
opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
data = opened.read()
self._set_to_pagecache(cachekey,data,opened.url)

return (data,opened)

def set_sleep(self,val):
logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.override_sleep = val

def do_sleep(self,extrasleep=None):
if extrasleep:
time.sleep(float(extrasleep))

@@ -312,7 +312,7 @@ class BaseSiteAdapter(Configurable):
time.sleep(float(self.override_sleep))
elif self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))

def _fetchUrl(self, url,
parameters=None,
usecache=True,

@@ -330,7 +330,7 @@ class BaseSiteAdapter(Configurable):

excpt=None
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
time.sleep(sleeptime)
try:
(data,opened)=self._fetchUrlRawOpened(url,
parameters=parameters,

@@ -345,7 +345,7 @@ class BaseSiteAdapter(Configurable):
except Exception, e:
excpt=e
logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

logger.error("Giving up on %s" %safe_url(url))
logger.debug(excpt, exc_info=True)
raise(excpt)

@@ -357,12 +357,16 @@ class BaseSiteAdapter(Configurable):
if last:
self.chapterLast=int(last)-1
self.story.set_chapters_range(first,last)

# Does the download the first time it's called.
def getStory(self):
if not self.storyDone:
self.getStoryMetadataOnly(get_cover=True)

## one-off step to normalize old chapter URLs if present.
if self.oldchaptersmap:
self.oldchaptersmap = dict((self.normalize_chapterurl(key), value) for (key, value) in self.oldchaptersmap.items())

for index, (title,url) in enumerate(self.chapterUrls):
newchap = False
if (self.chapterFirst!=None and index < self.chapterFirst) or \

@@ -388,7 +392,7 @@ class BaseSiteAdapter(Configurable):
url in self.oldchaptersdata and (
self.oldchaptersdata[url]['chapterorigtitle'] !=
self.oldchaptersdata[url]['chaptertitle']) )

if not data:
data = self.getChapterText(url)
# if had to fetch and has existing chapters

@@ -400,13 +404,13 @@ class BaseSiteAdapter(Configurable):
# anyway--only if it's replaced during an
# update.
newchap = False

self.story.addChapter(url,
removeEntities(title),
removeEntities(data),
newchap)
self.storyDone = True

# include image, but no cover from story, add default_cover_image cover.
if self.getConfig('include_images') and \
not self.story.cover and \

@@ -423,26 +427,30 @@ class BaseSiteAdapter(Configurable):
if not self.story.cover and self.oldcover:
self.story.oldcover = self.oldcover
self.story.setMetadata('cover_image','old')

# cheesy way to carry calibre bookmark file forward across update.
if self.calibrebookmark:
self.story.calibrebookmark = self.calibrebookmark
if self.logfile:
self.story.logfile = self.logfile

return self.story

def getStoryMetadataOnly(self,get_cover=True):
if not self.metadataDone:
self.doExtractChapterUrlsAndMetadata(get_cover=get_cover)

if not self.story.getMetadataRaw('dateUpdated'):
if self.story.getMetadataRaw('datePublished'):
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))
else:
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

self.metadataDone = True
# normalize chapter urls.
for index, (title,url) in enumerate(self.chapterUrls):
self.chapterUrls[index] = (title,self.normalize_chapterurl(url))

return self.story

def setStoryMetadata(self,metahtml):

@@ -453,36 +461,36 @@ class BaseSiteAdapter(Configurable):
if self.story.getMetadataRaw('datePublished'):
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))
else:
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

def hookForUpdates(self,chaptercount):
"Usually not needed."
return chaptercount

###############################

@staticmethod
def getSiteDomain():
"Needs to be overriden in each adapter class."
return 'no such domain'

@classmethod
def getConfigSection(cls):
"Only needs to be overriden if != site domain."
return cls.getSiteDomain()

@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return [cls.getConfigSection()]

@classmethod
def stripURLParameters(cls,url):
"Only needs to be overriden if URL contains more than one parameter"
## remove any trailing '&' parameters--?sid=999 will be left.
## that's all that any of the current adapters need or want.
return re.sub(r"&.*$","",url)

## URL pattern validation is done *after* picking an adaptor based
## on domain instead of *as* the adaptor selector so we can offer
## the user example(s) for that particular site.

@@ -490,7 +498,7 @@ class BaseSiteAdapter(Configurable):
def getSiteURLPattern(self):
"Used to validate URL. Should be override in each adapter class."
return '^http://'+re.escape(self.getSiteDomain())

@classmethod
def getSiteExampleURLs(cls):
"""

@@ -500,7 +508,7 @@ class BaseSiteAdapter(Configurable):
validateURL method.
"""
return 'no such example'

def doExtractChapterUrlsAndMetadata(self,get_cover=True):
'''
There are a handful of adapters that fetch a cover image while

@@ -509,7 +517,7 @@ class BaseSiteAdapter(Configurable):
this instead of extractChapterUrlsAndMetadata()
'''
return self.extractChapterUrlsAndMetadata()

def extractChapterUrlsAndMetadata(self):
"Needs to be overriden in each adapter class. Populates self.story metadata and self.chapterUrls"
pass

@@ -561,7 +569,7 @@ class BaseSiteAdapter(Configurable):
# bs4
return soup.attrs.keys()
return []

# This gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
# Now also does a bunch of other common processing for us.

@@ -570,12 +578,12 @@ class BaseSiteAdapter(Configurable):
fetch=self._fetchUrlRaw

acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id'])

if self.getConfig("keep_style_attr"):
acceptable_attributes.append('style')
if self.getConfig("keep_title_attr"):
acceptable_attributes.append('title')

#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
acceptable_attributes.extend(('src','alt','longdesc'))

@@ -592,6 +600,19 @@ class BaseSiteAdapter(Configurable):
if attr not in acceptable_attributes:
del soup[attr] ## strip all tag attributes except href and name

## apply adapter's normalize_chapterurls to all links in
## chapter texts, if they match chapter URLs. While this will
## be occasionally helpful by itself, it's really for the next
## feature: internal text links.
if self.getConfig('normalize_text_links'):
for alink in soup.find_all('a'):
# try:
if alink.has_attr('href'):
logger.debug("normalize_text_links %s -> %s"%(alink['href'],self.normalize_chapterurl(alink['href'])))
alink['href'] = self.normalize_chapterurl(alink['href'])
# except AttributeError as ae:
# logger.info("Parsing for normalize_text_links failed...")

try:
# as a generator, each tag will be returned even if there's a
# mismatch at the end.

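A standalone sketch (not part of the commit) of what the normalize_text_links block above does to chapter HTML. The adapter object is not available here, so a hypothetical normalize_chapterurl() stands in for the real adapter hook; the forum domain and thread URL are made up:

# sketch: rewrite <a href> values through a normalizer, as utf8FromSoup() now does
from bs4 import BeautifulSoup

def normalize_chapterurl(url):
    # hypothetical stand-in for an adapter override: turn a page#post-NNN
    # link into a /posts/NNN/ perma-link, leave everything else untouched
    if '#post-' in url:
        return 'https://forums.example.com/posts/' + url.split('#post-')[1] + '/'
    return url

html = '<div>Back to <a href="https://forums.example.com/threads/story.330/page-4#post-39915">chapter 2</a></div>'
soup = BeautifulSoup(html, 'html5lib')

# same shape as the loop added above
for alink in soup.find_all('a'):
    if alink.has_attr('href'):
        alink['href'] = normalize_chapterurl(alink['href'])

print(soup.a['href'])  # https://forums.example.com/posts/39915/
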
@@ -599,8 +620,8 @@ class BaseSiteAdapter(Configurable):
for attr in self.get_attr_keys(t):
if attr not in acceptable_attributes:
del t[attr] ## strip all tag attributes except acceptable_attributes

# these are not acceptable strict XHTML. But we do already have

# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined
if t and hasattr(t,'name') and t.name is not None:
if t.name in self.getConfigList('replace_tags_with_spans',['u']):

@@ -616,11 +637,11 @@ class BaseSiteAdapter(Configurable):
# remove script tags cross the board.
if t.name=='script':
t.extract()

except AttributeError, ae:
if "%s"%ae != "'NoneType' object has no attribute 'next_element'":
logger.error("Error parsing HTML, probably poor input HTML. %s"%ae)

retval = unicode(soup)

if self.getConfig('nook_img_fix') and not self.getConfig('replace_br_with_p'):

@@ -629,16 +650,16 @@ class BaseSiteAdapter(Configurable):
# that under the text for the rest of the chapter.
retval = re.sub(r"(?!<(div|p)>)\s*(?P<imgtag><img[^>]+>)\s*(?!</(div|p)>)",
"<div>\g<imgtag></div>",retval)

# Don't want html, head or body tags in chapter html--writers add them.
# This is primarily for epub updates.
retval = re.sub(r"</?(html|head|body)[^>]*>\r?\n?","",retval)

if self.getConfig("replace_br_with_p") and allow_replace_br_with_p:
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
retval = replace_br_with_p(retval)

if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.

@@ -648,31 +669,35 @@ class BaseSiteAdapter(Configurable):

def make_soup(self,data):
'''
Convenience method for getting a bs4 soup. Older and
non-updated adapters call the included bs3 library themselves.
Convenience method for getting a bs4 soup. bs3 has been removed.
'''

## html5lib handles <noscript> oddly. See:
## https://bugs.launchpad.net/beautifulsoup/+bug/1277464
## This should 'hide' and restore <noscript> tags.
data = data.replace("noscript>","fff_hide_noscript>")

## soup and re-soup because BS4/html5lib is more forgiving of
## incorrectly nested tags that way.
soup = bs4.BeautifulSoup(data,'html5lib')
soup = bs4.BeautifulSoup(unicode(soup),'html5lib')

for ns in soup.find_all('fff_hide_noscript'):
ns.name = 'noscript'

return soup

## For adapters, especially base_xenforoforum to override. Make
## sure to return unchanged URL if it's NOT a chapter URL...
def normalize_chapterurl(self,url):
return url

def cachedfetch(realfetch,cache,url):
if url in cache:
return cache[url]
else:
return realfetch(url)

fullmon = {u"January":u"01", u"February":u"02", u"March":u"03", u"April":u"04", u"May":u"05",
u"June":u"06","July":u"07", u"August":u"08", u"September":u"09", u"October":u"10",
u"November":u"11", u"December":u"12" }

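A small, self-contained illustration (not part of the commit) of the noscript hide-and-restore trick used by make_soup() above; the sample HTML is made up, and str() is used here so the sketch runs on either Python version (the real make_soup() uses unicode(), since the codebase is Python 2):

# sketch: hide <noscript> before html5lib parsing, then restore it
import bs4

data = '<p>text</p><noscript><img src="fallback.png"/></noscript>'
data = data.replace("noscript>", "fff_hide_noscript>")  # hide, as make_soup() does

# soup and re-soup, as make_soup() does, to smooth out badly nested tags
soup = bs4.BeautifulSoup(data, 'html5lib')
soup = bs4.BeautifulSoup(str(soup), 'html5lib')

for ns in soup.find_all('fff_hide_noscript'):
    ns.name = 'noscript'  # restore the real tag name

print(soup.find('noscript'))  # the <noscript> block survives the parse
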
@@ -687,7 +712,7 @@ def makeDate(string,dateform):
# lie. It has to do something even more complicated to get
# Russian month names correct everywhere.
do_abbrev = "%b" in dateform

if u"%B" in dateform or do_abbrev:
dateform = dateform.replace(u"%B",u"%m").replace(u"%b",u"%m")
for (name,num) in fullmon.items():

@@ -708,10 +733,10 @@ def makeDate(string,dateform):
string = string.replace(u"AM",u"").replace(u"PM",u"").replace(u"am",u"").replace(u"pm",u"")

date = datetime.strptime(string.encode('utf-8'),dateform.encode('utf-8'))

if add_hours:
date += timedelta(hours=12)

return date

# .? for AO3's ']' in param names.

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2015 FanFicFare team
# Copyright 2016 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -85,6 +85,62 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?[^#]*?(#post-(?P<anchorpost>\d+))?$"

## For adapters, especially base_xenforoforum to override. Make
## sure to return unchanged URL if it's NOT a chapter URL. This
## is most helpful for xenforoforum because threadmarks use
## thread-name URLs--which can change if the thread name changes.
def normalize_chapterurl(self,url):
(is_chapter_url,normalized_url) = self._is_normalize_chapterurl(url)
if is_chapter_url:
return normalized_url
else:
return url

## returns (is_chapter_url,normalized_url)
def _is_normalize_chapterurl(self,url):
is_chapter_url = False

## moved from extract metadata to share with normalize_chapterurl.
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url

if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and \
( '/posts/' in url or '/threads/' in url or 'showpost.php' in url or 'goto/post' in url):
# brute force way to deal with SB's http->https change when hardcoded http urls.
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())

# http://forums.spacebattles.com/showpost.php?p=4755532&postcount=9
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',r'/posts/\1/',url)

# http://forums.spacebattles.com/goto/post?id=15222406#post-15222406
url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?',r'/posts/\1/',url)

url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
url = re.sub(r'like$','',url) # strip 'like' if incorrect 'like' link instead of proper post URL.

#### moved from getChapterText()
## there's some history of stories with links to the wrong
## page. This changes page#post URLs to perma-link URLs.
## Which will be redirected back to page#posts, but the
## *correct* ones.
# http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'

## Same as above except for for case where author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'

is_chapter_url = True
return (is_chapter_url,url)

def use_pagecache(self):
'''
adapters that will work with the page cache need to implement

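A worked example (not part of the commit) of the rewrites _is_normalize_chapterurl() performs, using the sample URLs already quoted in its comments. This is a simplified standalone rendering: the adapter's getURLPrefix()/getSiteDomain() handling is replaced by an explicit urlprefix argument, and only a subset of the rewriting steps is reproduced:

# sketch: the post-URL normalizations, outside the adapter class
import re

def normalize(url, urlprefix):
    # simplified stand-in for the adapter's getURLPrefix()-based handling
    if not url.startswith('http'):
        url = urlprefix + '/' + url
    # goto/post redirector links -> /posts/NNN/ perma-links
    url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?', r'/posts/\1/', url)
    # page#post-NNN links -> /posts/NNN/ perma-links
    if '#post-' in url:
        url = urlprefix + '/posts/' + url.split('#post-')[1] + '/'
    # reply?quote=NNN links mistakenly used instead of post links
    if 'reply?quote=' in url:
        url = urlprefix + '/posts/' + url.split('reply?quote=')[1] + '/'
    return url

print(normalize('http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915',
                'https://forums.sufficientvelocity.com'))
# -> https://forums.sufficientvelocity.com/posts/39915/

print(normalize('http://forums.spacebattles.com/goto/post?id=15222406#post-15222406',
                'https://forums.spacebattles.com'))
# -> http://forums.spacebattles.com/posts/15222406/

Non-chapter URLs (anything without /posts/, /threads/, showpost.php or goto/post in the real method's guard) are meant to pass through unchanged, which is why normalize_chapterurl() only returns the rewritten URL when is_chapter_url is set.
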
@@ -119,7 +175,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# params[soup.find('input', {'id':'password'})['name']] = params['password']

d = self._fetchUrl(loginUrl, params)

if "Log Out" not in d :
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['login']))

@@ -183,7 +239,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
threadmark_chaps = True
if self.getConfig('always_include_first_post'):
self.chapterUrls.append((first_post_title,useurl))

for (atag,url,name) in [ (x,x['href'],stripHTML(x)) for x in markas ]:
date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
if not self.story.getMetadataRaw('datePublished') or date < self.story.getMetadataRaw('datePublished'):

@@ -202,7 +258,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if self.getConfig('capitalize_forumtags'):
tstr = tstr.title()
self.story.addToList('forumtags',tstr)

# Now go hunting for the 'chapter list'.
bq = soup.find('blockquote') # assume first posting contains TOC urls.

@@ -222,28 +278,9 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if not self.chapterUrls:
self.chapterUrls.append((first_post_title,useurl))
for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
#logger.debug("found chapurl:%s"%url)
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url

if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and \
( '/posts/' in url or '/threads/' in url or 'showpost.php' in url or 'goto/post' in url):

# brute force way to deal with SB's http->https change when hardcoded http urls.
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())

# http://forums.spacebattles.com/showpost.php?p=4755532&postcount=9
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',r'/posts/\1/',url)

# http://forums.spacebattles.com/goto/post?id=15222406#post-15222406
url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?',r'/posts/\1/',url)

url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
url = re.sub(r'like$','',url) # strip 'like' if incorrect 'like' link instead of proper post URL.

logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
(is_chapter_url,url) = self._is_normalize_chapterurl(url)
if is_chapter_url:
self.chapterUrls.append((name,url))
if url == useurl and first_post_title == self.chapterUrls[0][0] \
and not self.getConfig('always_include_first_post',False):

@@ -286,22 +323,6 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)

## there's some history of stories with links to the wrong
## page. This changes page#post URLs to perma-link URLs.
## Which will be redirected back to page#posts, but the
## *correct* ones.
# http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'

## Same as above except for for case where author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'

try:
origurl = url
(data,opened) = self._fetchUrlOpened(url)

@@ -309,20 +330,20 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)

soup = self.make_soup(data)

if '#' in url:
anchorid = url.split('#')[1]
soup = soup.find('li',id=anchorid)

bq = soup.find('blockquote')

bq.name='div'

for iframe in bq.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.

for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>

@@ -330,7 +351,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## include lazy load images.
for img in bq.find_all('img',{'class':'lazyload'}):
img['src'] = img['data-src']

except Exception as e:
if self.getConfig('continue_on_chapter_error'):
bq = self.make_soup("""<div>

@@ -184,6 +184,7 @@ def get_valid_set_options():
'include_images':(None,['epub','html'],boollist),
'grayscale_images':(None,['epub','html'],boollist),
'no_image_processing':(None,['epub','html'],boollist),
'normalize_text_links':(None,['epub','html'],boollist),

'capitalize_forumtags':(base_xenforo_list,None,boollist),
'continue_on_chapter_error':(base_xenforo_list,None,boollist),

@@ -361,7 +362,7 @@ def get_valid_keywords():
'minimum_threadmarks',
'first_post_title',
'always_include_first_post',
'',
'normalize_text_links',
])

# *known* entry keywords -- or rather regexps for them.

@@ -760,6 +760,11 @@ remove_transparency: true
## true--replace_br_with_p also fixes the problem.
nook_img_fix:true

## Apply adapter's normalize_chapterurl() to all links in chapter
## texts, if they match chapter URLs. Currently only implemented by
## base_xenforoforum adapters.
#normalize_text_links:false

[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true