# -*- coding: utf-8 -*-

# Copyright 2013 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
import random

logger = logging.getLogger(__name__)
import re
from datetime import datetime
#
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

# py2 vs py3 transition
from ..six.moves.urllib.parse import urlparse, urlunparse
from ..six import text_type as unicode

from .base_adapter import BaseSiteAdapter, makeDate

def getClass():
    return StoriesOnlineNetAdapter

# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class StoriesOnlineNetAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        # logger.debug("StoriesOnlineNetAdapter.__init__ - url='%s'" % url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('storyId',m.group('id'))
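            # Older ids apparently carried a '-'-separated date suffix
            # (see the debug message below); keep only the numeric part,
            # e.g. a hypothetical '1234-20200101' becomes '1234'.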
            if '-' in self.story.getMetadata('storyId'):
                self.story.setMetadata('storyId',self.story.getMetadata('storyId').split('-')[0])
                logger.debug("storyId date removed:%s\n"%self.story.getMetadata('storyId'))

            # chapter URLs don't have the same embedded title in URL as story.
            title = ""
            if not m.group('chapter') and m.group('title'):
                title = m.group('title')
            path = m.group('path')
            ## library allowed for storyInfo.php but doesn't work in normal story url
            if path == "library":
                path = "s"
            # normalized story URL.
            self._setURL('https://' + self.getSiteDomain() + '/'+path+'/'+self.story.getMetadata('storyId')+title)
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev',self.getSiteAbbrev())

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%m-%d %I:%M:%S %p"
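        # For illustration: this format parses strings like
        # "2021-06-20 05:06:07 PM" (12-hour clock with AM/PM marker).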

    @classmethod
    def getSiteAbbrev(cls):
        return 'strol'

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'storiesonline.net'

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://"+cls.getSiteDomain()+"/s/1234/story-title https://"+cls.getSiteDomain()+"/n/1234/story-title"

    def getSiteURLPattern(self):
        return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<path>s|n|library)/(storyInfo.php\?id=)?(?P<id>\d+)(?P<chapter>:\d+)?(?P<title>/.+)?((;\d+)?$|(:i)?$)?"
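    # For illustration, URL paths this pattern accepts and the named
    # groups they yield (ids hypothetical):
    #   /s/1234/story-title            -> path='s', id='1234', title='/story-title'
    #   /s/1234:3/story-title          -> path='s', id='1234', chapter=':3'
    #   /library/storyInfo.php?id=1234 -> path='library', id='1234'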

    @classmethod
    def getTheme(cls):
        # preferred theme
        return "Classic"

    def needToLoginCheck(self, data):
        return 'Free Registration' in data \
            or "Invalid Password!" in data \
            or "Invalid User Name!" in data \
            or "Log In" in data \
            or "Access to unlinked chapters requires" in data \
            or "Log in to Storiesonline" in data \
            or "WLPC log in System" in data

    def performLogin(self, url):

        if self.password:
            username = self.username
            password = self.password
        else:
            username = self.getConfig("username")
            password = self.getConfig("password")

        loginUrl = 'https://' + self.getSiteDomain() + '/sol-secure/login.php'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             username))

        if not username or not password:
            logger.info("Login Required for URL %s" % loginUrl)
            raise exceptions.FailedToLogin(url,username)

        ## Double POST requirement has been removed as of Oct 2021

        (data,useurl) = self.get_request_redirected(loginUrl,usecache=False)
        # logger.debug(data)
        if not self.needToLoginCheck(data):
            ## hitting login URL reminds system we're logged in?
            logger.debug("don't need to login")
            return
        soup = self.make_soup(data)
        params = {}
        tokenInput = soup.find('input',attrs={"name":'token'})
        if tokenInput != None:
            params['token'] = tokenInput['value']
        params['email'] = username
        params['password'] = password
        params['cmd'] = 'LOGIN'
        postAction = soup.find('form')['action']
        parsedUrl = urlparse(useurl)
        postUrl = urlunparse((parsedUrl.scheme,
                              parsedUrl.netloc,
                              postAction,
                              '','',''))
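        # For illustration (hypothetical action value): if useurl is
        # 'https://storiesonline.net/sol-secure/login.php' and the form's
        # action is '/sol-secure/login.php', postUrl becomes
        # 'https://storiesonline.net/sol-secure/login.php' -- scheme and
        # host from the page we landed on, path from the form, query and
        # fragment dropped.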
        data = self.post_request(postUrl,params,usecache=False)
        # logger.debug(data)
        while '<h2>Enter TOTP Code:</h2>' in data:
            if self.totp:
                logger.debug("Trying to TOTP with %s code."%self.totp)
                params = {}
                params['cmd'] = 'finishTotpVerification'
                # google auth app at least shows "123 123", but site expects
                # "123123". Remove space if user enters it.
                params['totp_code'] = self.totp.replace(' ','')
                params['action'] = "continue"
                data = self.post_request(postUrl,params,usecache=False)
                # logger.debug(data)
                self.totp = None
            else:
                raise exceptions.NeedTimedOneTimePassword(url)

        if self.needToLoginCheck(data):
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             username))
            raise exceptions.FailedToLogin(url,username)

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def doExtractChapterUrlsAndMetadata(self, get_cover=True):
        url = self.url
        logger.debug("URL: "+url)

        ## Some stories give 404 if not logged in now. See #1185
        if self.getConfig("always_login"):
            self.performLogin(self.url)

        ## Hit story URL to check for changed title part -- if the
        ## title has changed or (more likely?) the ID number has
        ## been reassigned to a different title, this will 404
        ## Note that the site ignores extra letters, so if the real
        ## URL is /story-title then /story-titleaaaa will still work.
        try:
            data = self.get_request(url)
        except exceptions.HTTPErrorFFF as e:
            if e.status_code in (401, 403, 410):
                data = 'Log In' # to trip needToLoginCheck
            elif e.status_code == 404:
                raise exceptions.FailedToDownload("Page Not Found - always_login needed? (%s)" % url)
            else:
                raise e
        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self.get_request(url,usecache=False)

        ## SOL adds an intermediate page to remind users to renew at 3-30 days before expiration - this breaks the soup 'a' search below
        if "Your premier membership is going to expire" in data:
            soup = self.make_soup(data)
            expire = soup.find(string=re.compile("Your premier membership is going to expire"))
            remindurl = (soup.find(href=re.compile("later.php"))).get('href')
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: "+expire+"\n"+"Renew or reduce expiration warning time in account settings\n"+remindurl)

        ## Premium account might redirect to a chapter, while regular
        ## account doesn't redirect to the URL with embedded /story-title
        ## So pull url from <a href="/s/000/story-title" rel="bookmark">
        ## regardless.
        soup = self.make_soup(data)
        a = soup.find('a',rel="bookmark")
        if a:
            url = 'https://'+self.host+a['href']
        else:
            # Contest entries do not have bookmark HREF
            logger.info("No Bookmark HREF, using URL="+url)

        ## Premium has "?ind=1" to force index.
        ## May not be needed w/o premium
        ## used to be :i
        if "?ind=1" not in url:
            url = url+"?ind=1"
        logger.info("use url: "+url)
        data = self.get_request(url)
        ## To include /title-in-url, but not ind=1
        self._setURL(url.replace("?ind=1",""))
        # logger.debug(data)

        # ('adminstrators' below is sic -- these strings must match the
        # site's own wording exactly.)
        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
        elif "Error! The story you're trying to access is being filtered by your choice of contents filtering." in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Error! The story you're trying to access is being filtered by your choice of contents filtering.")
        elif "Error! Daily Limit Reached" in data or "Sorry! You have reached your daily limit of" in data:
            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Error! Daily Limit Reached")
        elif "by (Hidden)" in data:
            # Contest entries have author set to "(Hidden)" which breaks author lookups below
            logger.info("Contest entry, setting authorId=(Hidden)")
            self.story.addToList('authorId',"(Hidden)")
            logger.info("Contest entry, setting author=(Hidden)")
            self.story.addToList('author',"(Hidden)")

        soup = self.make_soup(data)
        # logger.debug(data)

        ## Title
        a = soup.find('h1')
        self.story.setMetadata('title',stripHTML(a))

        authfrom = soup.find('footer')
        alist = authfrom.find_all('a', {'rel' : 'author'})
        if alist:
            for a in alist:
                self.story.addToList('authorId',a['href'].split('/')[2])
                self.story.addToList('authorUrl','https://'+self.host+a['href'])
                ## both 's Page and ’s Page
                self.story.addToList('author',re.sub(r".s Page$","",stripHTML(a)))
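                ## e.g. turns "Jane Doe's Page" into "Jane Doe" (name
                ## illustrative); the "." in the pattern matches either
                ## apostrophe character.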
        else:
            logger.info("AuthorList empty. Contest entry?")

        # Find the chapters:
        # If multiple chapters, they are in "index-list" div.
        # <a href="/s/00001/This-is-a-test/1">Chapter 1</a>
        # <a href="/n/00001/This-is-a-test/1">Chapter 1</a>
        chapters = soup.select('div#index-list a[href*="/s/"],div#index-list a[href*="/n/"]')
        # logger.debug(chapters)
        if len(chapters) != 0:
            logger.debug("Number of chapters: {0}".format(len(chapters)))
            for chapter in chapters:
                # just in case there's tags, like <i> in chapter titles.
                self.add_chapter(chapter,'https://'+self.host+chapter['href'])
        else:
            self.add_chapter(self.story.getMetadata('title'),self.story.getMetadata('storyUrl'))

        # The rest of the metadata is within the article tag.
        soup = soup.find('article')

        if self.story.getList('authorUrl'):
            self.getStoryMetadataFromAuthorPage()
        else:
            logger.info("No authorurl found, setting to homepage. Could be contest story...")
            self.story.setMetadata('authorUrl','https://' + self.getSiteDomain() + '/')

        # Some books have a cover in the index page.
        # Samples are:
        # https://storiesonline.net/s/11999
        # https://storiesonline.net/s/10823
        if get_cover:
            # logger.debug("Looking for the cover image...")
            cover_url = ""
            img = soup.find('img')
            if img:
                cover_url=img['src']
            # logger.debug("cover_url: %s"%cover_url)
            if cover_url:
                self.setCoverImage(url,cover_url)

        # Remove all the metadata elements to leave any preamble text. This is
        # usually a notice or a foreword.
        if self.num_chapters() > 1:
            header = soup.find('header')
            header.extract()
        else:
            soup = soup.find('header')
        # Remove some tags based on their class or id
        elements_to_remove = ['#det-link', '#s-details', '#index-list', '#s-title', '#s-auth', '.copy']
        if self.getConfig('include_images') != 'true': # false or coveronly
            elements_to_remove.append('img')
        for element_name in elements_to_remove:
            elements = soup.select(element_name)
            for element in elements:
                element.extract()
        if len(soup.contents) > 0 and (len(soup.text.strip()) > 0 or len(soup.find_all('img')) > 0):
            self.story.setMetadata('notice', self.utf8FromSoup(url, soup))

    def getStoryMetadataFromAuthorPage(self):
        # surprisingly, the detailed page does not give enough details, so go to author's page
        story_row = self.findStoryRow()

        if story_row.name == 'tr':
            # classic theme
            self.has_universes = False

            title_cell = story_row.find('td', {'class' : 'lc2'})
            for cat in title_cell.find_all('div', {'class' : 'typediv'}):
                self.story.addToList('genre',cat.text)

            # in lieu of word count.
            self.story.setMetadata('size', story_row.find('td', {'class' : 'num'}).text)

            score = story_row.findNext('th', {'class' : 'ynum'}).text
            if re.match(r"[\d,\.]+",score):
                self.story.setMetadata('score', score)

            description_element = story_row.findNext('td', {'class' : 'lc4'})
            # logger.debug(description_element)

            self.parseDescriptionField(description_element)

            self.parseOtherAttributes(description_element)
        else:
            # modern theme (or minimalist theme, should also work)
            description_element = story_row.find('div', {'class' : 'sdesc'})

            self.parseDescriptionField(description_element)

            misc_element = story_row.find('div', {'class' : 'misc'})
            self.parseOtherAttributes(misc_element)


    def findStoryRow(self):
        page=0
        story_found = False
        while not story_found:
            page = page + 1
            try:
                data = self.get_request(self.story.getList('authorUrl')[0] + "/" + unicode(page))
            except exceptions.HTTPErrorFFF as e:
                if e.status_code == 404:
                    raise exceptions.FailedToDownload("Story not found in Author's list--Set Access Level to Full Access and change Listings Theme back to "+self.getTheme())
                else:
                    # re-raise other HTTP errors rather than falling
                    # through with no data.
                    raise
            asoup = self.make_soup(data)

            story_row = asoup.find('tr', {'id' : 'sr' + self.story.getMetadata('storyId')})
            if story_row:
                logger.debug("Found story row on page %d" % page)
                story_found = True
                self.has_universes = "/universes" in data
                break

            story_row = asoup.find('div', {'id' : 'sr' + self.story.getMetadata('storyId')})
            if story_row:
                logger.debug("Found story row on page %d" % page)
                story_found = True
                self.has_universes = "/universes" in data
                break

        return story_row


    def parseDescriptionField(self, description_element):
        # Parse the description field for the series or universe and the
        # actual description.

        try:
            a = description_element.find('a', href=re.compile(r"/series/\d+/.*"))
            # logger.debug("Looking for series - a='{0}'".format(a))
            if a:
                # if there's a number after the series name, series_contents is a two element list:
                # [<a href="...">Title</a>, u' (2)']
                series_contents = a.parent.contents
                i = 0 if len(series_contents) == 1 else series_contents[1].strip(' ()')
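                # e.g. u' (2)'.strip(' ()') -> u'2': the story's position
                # in the series; i stays 0 when no count is shown.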
                seriesUrl = 'https://'+self.host+a['href']
                self.story.setMetadata('seriesUrl',seriesUrl)
                series_name = stripHTML(a)
                # logger.debug("Series name= %s" % series_name)
                series_soup = self.make_soup(self.get_request(seriesUrl))
                if series_soup:
                    # logger.debug("Retrieving Series - looking for name")
                    series_name = stripHTML(series_soup.find('h1', {'id' : 'ptitle'}))
                    series_name = re.sub(r' . a (series by|collection from).*$','',series_name)
                    # logger.debug("Series name: '%s'" % series_name)
                self.setSeries(series_name, i)
                # Check if series is in a universe
                if self.has_universes:
                    universe_url = self.story.getList('authorUrl')[0] + "&type=uni"
                    universes_soup = self.make_soup(self.get_request(universe_url))
                    # logger.debug("Universe url='{0}'".format(universe_url))
                    if universes_soup:
                        universes = universes_soup.find_all('div', {'class' : 'ser-box'})
                        # logger.debug("Number of Universes: %d" % len(universes))
                        for universe in universes:
                            # logger.debug("universe.find('a')={0}".format(universe.find('a')))
                            # The universe id is in an "a" tag that has an id but nothing else. It is the first tag.
                            # The id is prefixed with the letter "u".
                            universe_id = universe.find('a')['id'][1:]
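                            # e.g. <a id="u123"></a> yields universe_id '123'
                            # (id value illustrative).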
                            # logger.debug("universe_id='%s'" % universe_id)
                            universe_name = stripHTML(universe.find('div', {'class' : 'ser-name'})).partition(' ')[2]
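                            # partition(' ')[2] drops everything up to and
                            # including the first space -- apparently a label
                            # word that precedes the universe's actual name.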
                            # logger.debug("universe_name='%s'" % universe_name)
                            # If there is a link to the story, we have the right universe
                            story_a = universe.find('a', href=re.compile('/s/'+self.story.getMetadata('storyId')))
                            if story_a:
                                # logger.debug("Story is in a series that is in a universe! The universe is '%s'" % universe_name)
                                self.story.setMetadata("universe", universe_name)
                                self.story.setMetadata('universeUrl','https://'+self.host+ '/library/universe.php?id=' + universe_id)
                                break
                    # else:
                    #     logger.debug("No universe page")
        except:
            raise
        try:
            a = description_element.find('a', href=re.compile(r"/universe/\d+/.*"))
            # logger.debug("Looking for universe - a='{0}'".format(a))
            if a:
                self.story.setMetadata("universe",stripHTML(a))
                # Assumed only one universe, but it does have a URL--use universeHTML
                universe_name = stripHTML(a)
                universeUrl = 'https://'+self.host+a['href']
                # logger.debug("Retrieving Universe - about to get page - universeUrl='{0}".format(universeUrl))
                universe_soup = self.make_soup(self.get_request(universeUrl))
                # logger.debug("Retrieving Universe - have page")
                if universe_soup:
                    # logger.debug("Retrieving Universe - looking for name")
                    universe_name = stripHTML(universe_soup.find('h1', {'id' : 'ptitle'}))
                    universe_name = re.sub(r' .\s+A Universe from the Mind.*$','',universe_name)
                    # logger.debug("Universe name: '{0}'".format(universe_name))

                self.story.setMetadata('universeUrl',universeUrl)
                # logger.debug("Setting universe name: '{0}'".format(universe_name))
                self.story.setMetadata('universe',universe_name)
                if self.getConfig("universe_as_series") and not self.story.getMetadata('seriesUrl'):
                    # logger.debug("universe_as_series")
                    # take position in universe page as number in series.
                    for i, storya in enumerate(universe_soup.find_all('a',href=re.compile(r'^/s/\d+/'))):
                        if storya['href'].split('/')[2] == self.story.getMetadata('storyId'):
                            self.setSeries(universe_name, i+1)
                            self.story.setMetadata('seriesUrl',universeUrl)
                            break
            # else:
            #     logger.debug("Do not have a universe")
        except:
            raise

        # There's nothing around the desc to grab it by, and there's a
        # variable number of links before it.
        for line in description_element.contents:
            content = stripHTML(line)
            line = unicode(line)
            if content == '' or line.strip() == '' or line.startswith("<span") or line.startswith("<br"):
                # skip empty, <span (universe, series or context) and <br>.
                # logger.debug("Discard: %s"%line)
                pass
            else:
                # logger.debug("Use: %s"%line)
                self.setDescription('https://'+self.host+'/s/'+self.story.getMetadata('storyId'),line)
                break

    def parseDate(self,label):
        # date is passed as a timestamp and converted in JS. used to
        # use noscript value instead, but found one story that didn't
        # include it.
        # <script> tag processing not working?
        # logger.debug('parseDate label: "%s"' % label)
        script = label.findNext('script')
        # logger.debug("script:(%s)"%script)
        # logger.debug("script.text:(%s)"%script.text)
        # logger.debug("script:(stripHTML(%s))"%stripHTML(script))
        noscript = label.findNext('noscript').text
        # I honestly have no idea why both script.text and
        # stripHTML(script) return empty string, but they do. BS or
        # html5lib maybe?
        script = "%s"%label.findNext('script')
        try:
            timestamp = script[script.index("Date(")+5:]
            # remove milliseconds that JS likes.
            timestamp = timestamp[:timestamp.index(")")-3]
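            # e.g. with script text containing "new Date(1612345678000)",
            # the two slices yield "1612345678" -- seconds rather than JS
            # milliseconds (sample value illustrative).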
            # logger.debug("timestamp:(%s)"%timestamp)
            value = datetime.fromtimestamp(float(timestamp))
        except:
            value = makeDate(stripHTML(noscript), self.dateformat)
        # logger.debug('Have a date field label: "%s", noscript: "%s", timestamp: "%s", value: "%s"' % (label, noscript, timestamp, value))
        return value

    def parseOtherAttributes(self, other_attribute_element):
        for b in other_attribute_element.find_all('b'):
            #logger.debug('Getting metadata: "%s"' % b)
            label = b.text
            if label in ['Posted:', 'Concluded:', 'Updated:']:
                value = self.parseDate(b)
                #logger.debug('Have a date field label: "%s", value: "%s"' % (label, value))
            else:
                value = b.nextSibling
            #logger.debug('label: "%s", value: "%s"' % (label, value))

            if 'Sex' in label: # storiesonline.net uses '<b>Sex Contents:</b> No Sex'
                self.story.setMetadata('rating', value)
            if 'Age' in label: # finestories.com,scifistories.com use '<b>Age Rating:</b> Older than XX | '
                self.story.setMetadata('rating', value.split('|')[0])
            if 'Score' in label and re.match(r"[\d,\.]+",value):
                self.story.setMetadata('score', value)
            if 'Tags' in label or 'Codes' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('sitetags', code)
            if 'Genre' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
                    self.story.addToList('genre', code)

            if 'Posted' in label:
                self.story.setMetadata('datePublished', value)
                self.story.setMetadata('dateUpdated', value)
            if 'Concluded' in label:
                self.story.setMetadata('dateUpdated', value)
            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', value)

        status = other_attribute_element.find('span', {'class':'ab'})
        if status != None:
            if 'Incomplete and Inactive' in status.text:
                self.story.setMetadata('status', 'Incomplete')
            else:
                self.story.setMetadata('status', 'In-Progress')
            if "Last Activity" in status.text:
                value = self.parseDate(status)
                self.story.setMetadata('dateUpdated', value)
        else:
            self.story.setMetadata('status', 'Completed')

    def getMoreText(self, html):
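        # The site truncates long chapters and loads the rest via its
        # tl.php responder. Mirroring that exchange as best understood:
        # the first POST ('gt') returns a value which the second POST
        # ('gr') sends back to receive the remaining chapter HTML.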
        try:
            story_id = int(re.compile(r'var story_id=(\d+)').findall(html)[0])
            try:
                pid = re.compile(r'var pid=(\d+)').findall(html)[0]
            except:
                pid = 'undefined'
            ci = re.compile("var ci='([^']+)'").findall(html)[0]
            tto = re.compile("var tto='([^']+)'").findall(html)[0]
            url = "https://"+self.getSiteDomain()+"/res/responders/tl.php?r="+unicode(random.randint(1, 100001))
            params = {}
            params['cmd'] = 'gt'
            params['data[]'] = [story_id, pid, ci, story_id + 5, tto]
            ver = self.post_request(url, params)

            url = "https://"+self.getSiteDomain()+"/res/responders/tl.php?r="+unicode(random.randint(1, 100001))
            params = {}
            params['cmd'] = 'gr'
            params['data[]'] = [ver]
            return self.post_request(url, params)

        except Exception as e:
            logger.error(e)
            return None

    # grab the text for an individual chapter.
    def getChapterText(self, url):

        logger.debug('Getting chapter text from: %s' % url)

        html = self.get_request(url)
        soup = self.make_soup(html)

        # The story text is wrapped in article tags. Most of the page header and
        # footer are outside of this.
        chaptertag = soup.find('article')

        # Check up front -- everything below assumes the tag exists.
        if chaptertag == None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        # There might be a div with a "load more" button
        srtag = soup.find('div', id='sr')

        if srtag != None:
            # logger.debug('Getting more chapter text for: %s' % url)
            moretext = self.getMoreText(html)
            if moretext != None:
                moresoup = self.make_soup(moretext)
                srtag.replace_with(moresoup)
            else:
                logger.info("Failed to get more text for %s" % url)

        # some big chapters are split over several pages
        pager = chaptertag.find('div', {'class' : 'pager'})

        self.cleanPage(chaptertag)

        if pager != None:

            urls=pager.find_all('a')
            urls=urls[:len(urls)-1]
            # logger.debug("pager urls:%s"%urls)
            pager.extract()

            for ur in urls:
                soup = self.make_soup(self.get_request("https://"+self.getSiteDomain()+ur['href']))

                pagetag = soup.find('article')

                self.cleanPage(pagetag)

                for tag in pagetag.contents[1:]:
                    chaptertag.append(tag)

        return self.utf8FromSoup(url,chaptertag)

    def cleanPage(self,pagetag):
        "Consolidate 'page' clean up code so it can be called."
        # logger.debug("cleanPage start: {0}".format(pagetag))

        chapter_title = None
        if self.getConfig('inject_chapter_title'):
            h2tag = pagetag.find('h2')
            if h2tag:
                # I'm seeing an h1 now, but it's not logged in?
                # Something's broken...
                chapter_title = h2tag.extract()

        # Strip the header section
        tag = pagetag.find('header')
        if tag:
            #logger.debug("remove before header: {0}".format(tag))
            tag.extract()

        # some big chapters are split over several pages
        # remove FIRST pager and everything before it.
        tag = pagetag.find('div', {'class' : 'pager'})
        while tag != None:
            # logger.debug("remove before pager: {0}".format(tag))
            prev = tag.previousSibling
            tag.extract()
            tag = prev

        # Find the "Continues" marker on the current page and
        # remove everything after that. This is actually
        # affecting the *previous* 'page'. EXCEPT!--they are
        # putting a 'conTag' at the *top* now, too. So this
        # was nuking every page but the first and last. Now
        # only if 'Continues'
        for contag in pagetag.find_all('span', {'class' : 'conTag'}):
            # remove everything after continues...
            if 'Continuation' in contag.text:
                tag = contag
                while tag != None:
                    # logger.debug("remove before Continuation: {0}".format(tag))
                    prev = tag.previousSibling
                    tag.extract()
                    tag = prev
            elif 'Continues' in contag.text:
                tag = contag
                while tag != None:
                    # logger.debug("remove after Continues: {0}".format(tag))
                    nxt = tag.nextSibling
                    tag.extract()
                    tag = nxt

        # some big chapters are split over several pages
        # remove LAST pager and everything after it.
        # Only needed on last page.
        tag = pagetag.find('div', {'class' : 'pager'})
        while tag != None:
            # logger.debug("remove after pager: {0}".format(tag))
            nxt = tag.nextSibling
            tag.extract()
            tag = nxt

        # If it is a chapter, there are dates at the start for when it was posted or modified. These plus
        # everything before them can be discarded.
        postedDates = pagetag.find_all('div', {'class' : 'date'})
        # logger.debug(postedDates)
        if postedDates:
            a = postedDates[0].previousSibling
            while a != None:
                # logger.debug("before dates: {0}".format(a))
                b = a.previousSibling
                a.extract()
                a = b
            for a in pagetag.find_all('div', {'class' : 'date'}):
                a.extract()

        # Kill the vote form and everything after it.
        a = pagetag.find('div', {'class' : 'vform'})
        # logger.debug("Chapter end= '{0}'".format(a))
        while a != None:
            b = a.nextSibling
            a.extract()
            a=b

        # For chapters, remove next chapter link and everything after it
        a = pagetag.find('h3', {'class' : 'end'})
        # logger.debug("Chapter end= '{0}'".format(a))
        while a != None:
            b = a.nextSibling
            a.extract()
            a=b
        # inject_chapter_title
        if chapter_title:
            chapter_title.name='h3'
            pagetag.insert(0,chapter_title)