Add a BBCode lib for summaries on fimfic(needed to mod for unicode),

Default keep_summary_html to true from now on. Changes to the AO3, ficbook.net, ficwad.com adapters for summary HTML. Add options to strip and add chapter numbers to chapter titles.
2026-05-08 21:11:59 +02:00 · 2012-09-12 18:03:30 -05:00 · 2012-09-12 18:03:30 -05:00 · 0ca71a6455
commit 0ca71a6455
parent ab521ac093
24 changed files with 1328 additions and 44 deletions
--- a/calibre-plugin/about.txt
+++ b/calibre-plugin/about.txt
@ -4,7 +4,7 @@
 '<a href="http://www.mobileread.com/forums/showthread.php?t=134856">Reading List</a>',
 '<a href="http://www.mobileread.com/forums/showthread.php?t=126727">Extract ISBN</a>' and
 '<a href="http://www.mobileread.com/forums/showthread.php?t=134000">Count Pages</a>'
-plugins.</p>
+plugins.  bbcodeutils code contributed by Pau Sanchez.</p>

 <p>
 Calibre officially distributes plugins from the mobileread.com forum site.
--- a/calibre-plugin/ffdl_plugin.py
+++ b/calibre-plugin/ffdl_plugin.py
@ -28,6 +28,7 @@ from calibre.gui2 import error_dialog, warning_dialog, question_dialog, info_dia
 from calibre.gui2.dialogs.message_box import ViewLog
 from calibre.gui2.dialogs.confirm_delete import confirm
 from calibre.utils.date import local_tz
+from calibre.library.comments import sanitize_comments_html

 # The class that all interface action plugins must inherit from
 from calibre.gui2.actions import InterfaceAction
@ -36,7 +37,7 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
                                         create_menu_action_unique, get_library_uuid)

 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
-from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
+#from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount, get_story_url_from_html
 from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.geturls import get_urls_from_page

@ -432,13 +433,6 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
        print("url:%s"%url)
        skip_date_update = False
        
-        ## was self.ffdlconfig, but we need to be able to change it
-        ## when doing epub update.
-        ffdlconfig = SafeConfigParser()
-        ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
-        ffdlconfig.readfp(StringIO(prefs['personal.ini']))
-        adapter = adapters.getAdapter(ffdlconfig,url,fileform)
-
        options['personal.ini'] = prefs['personal.ini']
        if prefs['includeimages']:
            # this is a cheat to make it easier for users.
@ -448,6 +442,13 @@ keep_summary_html:true
 make_firstimage_cover:true
 ''' + options['personal.ini']

+        ## was self.ffdlconfig, but we need to be able to change it
+        ## when doing epub update.
+        ffdlconfig = SafeConfigParser()
+        ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
+        ffdlconfig.readfp(StringIO(options['personal.ini']))
+        adapter = adapters.getAdapter(ffdlconfig,url,fileform)
+
        ## three tries, that's enough if both user/pass & is_adult needed,
        ## or a couple tries of one or the other
        for x in range(0,2):
@ -476,7 +477,7 @@ make_firstimage_cover:true
        book['author_sort'] = book['author'] = story.getList("author", removeallentities=True)
        book['publisher'] = story.getMetadata("site")
        book['tags'] = writer.getTags(removeallentities=True) # getTags could be moved up into adapter now.  Adapter didn't used to know the fileform
-        book['comments'] = stripHTML(story.getMetadata("description")) #, removeallentities=True) comments handles entities better.
+        book['comments'] = sanitize_comments_html(story.getMetadata("description"))        
        book['series'] = story.getMetadata("series", removeallentities=True)
        
        # adapter.opener is the element with a threadlock.  But del
--- a/defaults.ini
+++ b/defaults.ini
@ -164,6 +164,45 @@ extratags: FanFiction
 ## doesn't work on some devices either.)
 #replace_hr: false

+## If set false, the summary will have all html stripped.
+## Both this and include_images must be true to get images in the
+## summary.
+keep_summary_html:true
+
+## Don't like the numbers at the start of chapter titles on some
+## sites?  You can use strip_chapter_numbers to strip them off.  Just
+## want to make them all look the same?  Strip them off, then add them
+## back on with add_chapter_numbers.  Don't like the way it strips
+## numbers or adds them back?  See chapter_title_strip_pattern and
+## chapter_title_add_pattern.
+strip_chapter_numbers:false
+add_chapter_numbers:false
+
+## (Two versions of chapter_title_strip_pattern are shown below.  You
+## should only have one uncommented.)
+## This version will remove the leading number from:
+## "1." => ""
+## "1. The Beginning" => "The Beginning"
+## "1: Start" => "Start"
+## "2 - Chapter the second" => "Chapter the second"
+## etc
+chapter_title_strip_pattern:^[0-9]+[\.: -]+
+
+## This version will strip all of the above *plus* remove 'Chapter 1':
+## "Chapter 1" => ""
+## "1. Chapter 1" => ""
+## "1. Chapter 1, Bob's First Clue" => "Bob's First Clue"
+## "Chapter 2 - Pirates Place" => "Pirates Place"
+## etc
+#chapter_title_strip_pattern:^([0-9]+[\.: -]+)?(Chapter *[0-9]+[\.:, -]*)?
+
+## Uses a python template substitution.  The ${index} is the 'chapter'
+## number and ${title} is the chapter title, after applying
+## chapter_title_strip_pattern.  Those are the only variables available.
+## "The Beginning" => "1. The Beginning" 
+chapter_title_add_pattern:${index}. ${title}
+
+
 ## Each output format has a section that overrides [defaults]
 [html]

@ -271,11 +310,6 @@ output_css:
 ## stories.  Images will be converted to jpg for size if possible.
 #include_images:false

-## If not set, the summary will have all html stripped for safety.
-## Both this and include_images must be true to get images in the
-## summary.
-#keep_summary_html:false
-
 ## If set, the first image found will be made the cover image.  If
 ## keep_summary_html is true, any images in summary will be before any
 ## in chapters.
--- a/fanficdownloader/adapters/adapter_archiveofourownorg.py
+++ b/fanficdownloader/adapters/adapter_archiveofourownorg.py
@ -191,7 +191,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):

        a = metasoup.find('blockquote',{'class':'userstuff'})
        if a != None:
-            self.setDescription(url,a.text)
+            self.setDescription(url,a)
            #self.story.setMetadata('description',a.text)
 		
        a = metasoup.find('dd',{'class':"rating tags"})
--- a/fanficdownloader/adapters/adapter_ficbooknet.py
+++ b/fanficdownloader/adapters/adapter_ficbooknet.py
@ -201,7 +201,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
                break
 				
        summary=soup.find('span', {'class' : 'urlize'})
-        self.setDescription(url,summary.text)
+        self.setDescription(url,summary)
        #self.story.setMetadata('description', summary.text)
            
    # grab the text for an individual chapter.
--- a/fanficdownloader/adapters/adapter_ficwadcom.py
+++ b/fanficdownloader/adapters/adapter_ficwadcom.py
@ -124,7 +124,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):

        # description
        storydiv = soup.find("div",{"id":"story"})
-        self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p.string)
+        self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p)
        #self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)

        # most of the meta data is here:
--- a/fanficdownloader/adapters/adapter_fimfictionnet.py
+++ b/fanficdownloader/adapters/adapter_fimfictionnet.py
@ -27,6 +27,8 @@ from .. import BeautifulSoup as bs
 from ..htmlcleanup import stripHTML
 from .. import exceptions as exceptions

+from ..bbcodeutils.bbcodeparser import bbcodeparser
+
 from base_adapter import BaseSiteAdapter,  makeDate

 def getClass():
@ -137,12 +139,19 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
        
        # fimfic is the first site with an explicit cover image.
        if self.getConfig('include_images') and "image" in storyMetadata.keys():
-            coverurl = storyMetadata["image"]
+            if "full_image" in storyMetadata:
+                coverurl = storyMetadata["full_image"]
+            else:
+                coverurl = storyMetadata["image"]
            if coverurl.startswith('//static.fimfiction.net'): # fix for img urls missing 'http:'
                coverurl = "http:"+coverurl
            self.story.addImgUrl(self,self.url,coverurl,self._fetchUrlRaw,cover=True)
-            
-        self.setDescription(self.url, storyMetadata["description"])
+
+
+        # the fimfic API gives bbcode for desc, not html.
+        # btw, bbcode honors newlines, html doesn't.  change newlines to br tags.
+        self.setDescription(self.url,
+                            bbcodeparser().parse(storyMetadata["description"]).html(doDeepCopy=False).replace('\r','').replace('\n','<br />'))
        
        # Dates are in Unix time
        # Take the publish date from the first chapter posted
@ -152,8 +161,18 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
        self.story.setMetadata("dateUpdated", datetime.fromtimestamp(rawDateUpdated))
        
        soup = bs.BeautifulSoup(data).find("div", {"class":"story"})
-        for character in [character_icon["title"] for character_icon in soup.findAll("a", {"class":"character_icon"})]:
-            self.story.addToList("characters", character)
+        # fimfic stopped putting the char name on or around the char
+        # icon now for some reason.  Pull it from the image name with
+        # some heuristics.
+        for character in [character_icon["src"] for character_icon in soup.findAll("img", {"class":"character_icon"})]:
+            # //static.fimfiction.net/images/characters/twilight_sparkle.png
+            # take piece [5] of the '/' split, remove the last four chars ('.png'), replace _ with a space, capitalize every word (title())
+            char = character.split('/')[5][:-4].replace('_',' ').title()
+            if char == 'Oc':
+                char = "OC"
+            if char == 'Cmc':
+                char = "Cutie Mark Crusaders"
+            self.story.addToList("characters", char)
            
            
    def getChapterText(self, url):
--- a/fanficdownloader/adapters/adapter_test1.py
+++ b/fanficdownloader/adapters/adapter_test1.py
@ -73,8 +73,8 @@ class TestSiteAdapter(BaseSiteAdapter):
            self.story.setMetadata(u'title',"Test Story Title "+self.story.getMetadata('storyId'))
            self.story.setMetadata('author','Test Author aa')
        self.story.setMetadata('storyUrl',self.url)
-        self.story.setMetadata('description',u'Description '+self.crazystring+u''' Done
-
+        self.setDescription(self.url,u'Description '+self.crazystring+u''' Done
+<p>
 Some more longer description.  "I suck at summaries!"  "Better than it sounds!"  "My first fic"
 ''')
        self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
--- a/fanficdownloader/adapters/base_adapter.py
+++ b/fanficdownloader/adapters/base_adapter.py
@ -199,7 +199,8 @@ class BaseSiteAdapter(Configurable):
                if (self.chapterFirst!=None and index < self.chapterFirst) or \
                        (self.chapterLast!=None and index > self.chapterLast):
                    self.story.addChapter(removeEntities(title),
-                                          None)
+                                          None,
+                                          self)
                else:
                    if self.oldchapters and index < len(self.oldchapters):
                        data = self.utf8FromSoup(None,
@ -208,7 +209,8 @@ class BaseSiteAdapter(Configurable):
                    else:
                        data = self.getChapterText(url)
                    self.story.addChapter(removeEntities(title),
-                                          removeEntities(data))
+                                          removeEntities(data),
+                                          self)
            self.storyDone = True
            
            # include image, but no cover from story, add default_cover_image cover.
@ -289,7 +291,7 @@ class BaseSiteAdapter(Configurable):
    def setDescription(self,url,svalue):
        #print("\n\nsvalue:\n%s\n"%svalue)
        if self.getConfig('keep_summary_html'):
-            if isinstance(svalue,str) or isinstance(svalue,unicode):
+            if isinstance(svalue,basestring):
                svalue = bs.BeautifulSoup(svalue)
            self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
        else:
--- a/fanficdownloader/bbcodeutils/init.py
+++ b/fanficdownloader/bbcodeutils/init.py
--- a/fanficdownloader/bbcodeutils/bbcode2html.py
+++ b/fanficdownloader/bbcodeutils/bbcode2html.py
@ -0,0 +1,325 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+#
+# Author:        Pau Sanchez (contact@pausanchez.com)
+# Version:       v1.0
+# Last Modified: 2010/09/15
+# 
+# For the latest version check out:
+#   http://www.codigomanso.com/en/projects
+# 
+# My blog:
+#   http://www.codigomanso.com/en/  - English Version
+#   http://www.codigomanso.com/es/  - Spanish Version
+#
+
+import sys
+import os
+import re
+import urllib
+
+class bbcode2html:
+  '''
+  This class gets a parsed BBCode and transforms it to valid HTML
+
+  Useful functions of this class:
+    html
+    convertToHTML
+
+  Example:
+    > parser = bbcodeparser ()
+    > parser.parse ('[b]bold[/b]')
+    > bbcode2html (parser).html()
+    <b>bold</b>
+
+    # This is faster for huge strings but changes the parser object internally
+    > bbcode2html (parser).html(doDeepCopy = False)
+    <b>bold</b>
+  '''
+  def __init__ (self, parser):
+    '''
+    Remember the parser (or plain token list) that html() converts when it
+    is not given one explicitly.
+    '''
+    self._parser = parser
+
+  def html (self, allowClassAttr = False, doDeepCopy = True, parser = None):
+    '''
+    Convert parsed BBCode to HTML
+
+    @allowClassAttr
+      handed to convertToHTML unchanged
+
+    @doDeepCopy
+      handed to convertToHTML unchanged
+
+    @parser
+      either an object with a getTokens() method (a bbcodeparser) or a token
+      list; None means the parser given to the constructor
+
+    Example:
+      parser = bbcodeparser ('[b]bold[/b]')
+      bbcode2html (parser).html() -> '<b>bold</b>'
+    '''
+    if parser is None:
+      parser = self._parser
+
+    # Duck-typed on purpose: bbcodeparser is not imported in this module, and
+    # the 'instanceof' call that used to be here is not a Python builtin, so
+    # every call ended in a NameError.
+    tokens    = parser
+    getTokens = getattr (parser, 'getTokens', None)
+    if getTokens is not None:
+      tokens = getTokens()
+
+    return bbcode2html.convertToHTML (tokens, allowClassAttr = allowClassAttr, doDeepCopy = doDeepCopy)
+
+  @staticmethod
+  def htmlString (string):
+    '''
+    Escape the characters &, <, > and " so the string can be placed in HTML.
+    '''
+    # '&' has to be replaced first: a dict gives no ordering guarantee, and
+    # replacing it after '<' would turn the fresh '&lt;' into '&amp;lt;'.
+    toReplace = (
+      (u'&', "&amp;"),
+      (u'<', '&lt;'),
+      (u'>', '&gt;'),
+      (u'"', "&quot;"),
+    )
+    for (char, entity) in toReplace:
+      string = string.replace (char, entity)
+    return string
+
+  @staticmethod
+  def getValidTags ():
+    '''
+    Build the default tag/argument whitelist used by convertToHTML.
+
+    Returns a new dict on every call:
+      tag name -> { BBCode argument name : attribute name used in the output }
+    An empty mapping whitelists the tag but none of its arguments.
+    '''
+    simpleTags = ['b', 'u', 'i', 'sup', 'sub', 'ul', 'ol', 'li', 'table', 'tr', 'th', 'td', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+    validTags = {
+      'p'         : { 'color' : 'color', 'size' : 'size', 'font' : 'font' },
+      'color'     : { 'color' : 'color' },
+      'size'      : { 'size' : 'size' },
+      'font'      : { 'font' : 'font' },
+      'img'       : { 'alt' : 'alt', 'title' : 'title', 'width' : 'width' , 'height' : 'height', 'img' : 'img'},
+      'url'       : { 'href' : 'href', 'url' : 'href', 'link'  : 'href', 'title' : 'title' },
+      's'         : { },
+      'code'      : { },
+      'quote'     : { },
+      'list'      : { 'list' : 'type' },
+      # '[*]' is converted to <li>; without an entry here its arguments were
+      # never filtered and went into the output tag untouched
+      '*'         : { },
+      'email'     : { 'email': 'href'},
+      'google'    : { 'google':  'google'},
+      'wikipedia' : { 'wikipedia' : 'wikipedia', 'language' : 'language', 'lang' : 'lang'}
+    }
+
+    # every simple tag gets its own dict: the filter may add 'class' to each one
+    for tag in simpleTags:
+      validTags[tag] = { }
+    return validTags
+
+  @staticmethod
+  def convertToHTML (tokens, allowClassAttr = False, validTags = None, doDeepCopy = True):
+    '''
+    Convert internally parsed BBCode to XHTML
+
+    @tokens
+      list mixing plain strings (text) with tag dicts such as
+      { 'tag' : 'b' }, { 'tag' : '/b' } or { 'tag' : 'url', 'args' : {...} }
+
+    @allowClassAttr
+      True lets the whitelisted tags keep a 'class' argument
+
+    @validTags
+      whitelist in the format returned by getValidTags(); None uses getValidTags().
+      When allowClassAttr is True a 'class' entry is added to the dict passed in.
+
+    @doDeepCopy
+      True:  it does a deep copy of tokens so this list will remain unchanged
+      False: tokens will be modified internally, but the output will be produced like 5x faster
+             it's a good idea to use False only when this is the last operation
+
+    Returns the HTML as one string. Text tokens are escaped with htmlString();
+    tags that are neither handled below nor listed in validTags are dropped
+    (the text between them is kept).
+    '''
+    # do a deep copy
+    if doDeepCopy:
+      import copy
+      tokens = copy.deepcopy (tokens)
+
+    # filter invalid tags and attributes
+    if validTags is None:
+      validTags = bbcode2html.getValidTags()
+
+    bbcode2html._filterInvalidTagsAndAttributes (tokens, validTags, allowClassAttr)
+
+    # Start to convert
+    index       = 0
+    tokenLength = len (tokens)
+
+    def textAfter (pos):
+      # the text token right after tokens[pos], or None when the next token is
+      # missing or is another tag (e.g. '[img][/img]')
+      if (pos+1 < tokenLength) and isinstance (tokens [pos+1], basestring):
+        return tokens [pos+1]
+      return None
+
+    def lastConsumed (pos, closingTag):
+      # index of the last token belonging to '[tag]text[/tag]' starting at pos:
+      # always the text, plus the closing tag when it really follows the text
+      after = pos + 2
+      if (after < tokenLength) and isinstance (tokens [after], dict) and (tokens [after]['tag'] == closingTag):
+        return after
+      return pos + 1
+
+    # use a list for the output (an order of magnitude faster than using string concatenation)
+    htmlList = []
+    lastListOpener = []
+
+    while index < tokenLength:
+
+      if isinstance (tokens [index], basestring):
+        htmlList.append (bbcode2html.htmlString (tokens [index]))
+        index += 1
+        continue
+
+      token     = tokens[index]
+      tag       = token['tag']  # opening or closing simple tag. e.g: 'b', '/b', '/u', ...
+      tagName   = (tag[1:] if tag[0] == '/' else tag)
+      tagOpener = (u'/' if tag[0] == '/' else u'')
+      tokenArgs = (token['args'] if 'args' in token else {})
+      nextText  = textAfter (index)
+
+      # opening or closing simple tag COLOR / SIZE
+      if (tagName in ['p', 'color', 'size', 'font']):
+        style  = ''
+        style += ((u' color: ' + tokenArgs['color'] + u';') if ('color' in tokenArgs) else '')
+        style += ((u' font-size: ' + tokenArgs['size'] + u'pt;') if ('size' in tokenArgs) else '')
+        style += ((u' font-family: ' + tokenArgs['font'] + u';') if ('font' in tokenArgs) else '')
+        style  = style.strip()
+
+        pArgs = {}
+        if style != '':
+          pArgs ['style'] = style
+
+        if 'class' in tokenArgs:
+          pArgs ['class'] = tokenArgs['class']
+
+        if ('args' not in token) and (tagName != 'p'):
+          if (tagOpener == '/'): # if closing tag, close it
+            htmlList.append (u'</span>')
+          index += 1
+          continue
+
+        if tagName != 'p':
+          tag = tagOpener + u'span'
+
+        htmlList.append (bbcode2html.xml (tag, pArgs))
+
+      # IMG tag (needs the image address as text right after it)
+      elif tag == 'img' and (index+2 < tokenLength) and (nextText is not None):
+        if 'img' in tokenArgs:
+          # has the form of <width>x<height> ?
+          sizeMatch = re.match (u'^\s*(\d+)[xX](\d+)\s*$', tokenArgs['img'])
+          if sizeMatch is not None:
+            tokenArgs['width']  = sizeMatch.group(1)
+            tokenArgs['height'] = sizeMatch.group(2)
+          # then assume is the alternative text
+          else:
+            tokenArgs['alt'] = tokenArgs['img']
+          del tokenArgs['img']
+
+        # add the source of the image
+        tokenArgs ['src'] = nextText
+
+        # [img]http://www.whatever.com/pic.jpg[/img]
+        htmlList.append (
+          bbcode2html.xml ('img', tokenArgs, soloTag=True)
+        )
+        index += 2 # skip next token and closing tag
+
+      # URL tag
+      elif tag == 'url':
+        if ('args' not in token) and (index+2 < tokenLength) and (nextText is not None):
+          # [url]http://www.google.com[/url]
+          htmlList.append (bbcode2html.xml ('a', { 'href' : nextText }))
+        else:
+          # [url=http://www.google.com]Google[/url]
+          # [url link=http://www.google.com title="This is Google"]Google[/url]
+          htmlList.append (bbcode2html.xml ('a', tokenArgs))
+
+      # URL closing tag (sometimes needed)
+      elif (tag == '/url') or (tag == '/email'):
+        htmlList.append (u'</a>')
+
+      # Email tag
+      elif tag == 'email':
+        if ('args' not in token) and (index+2 < tokenLength) and (nextText is not None):
+          # [email]asdf@asdf.com[/email]
+          htmlList.append (bbcode2html.xml ('a', { 'href' : u'mailto:' + nextText.strip() }))
+        else:
+          # [email=asdf@asfd.com]john smith[/email]
+          if 'href' in tokenArgs:
+            tokenArgs['href'] = u'mailto:' + tokenArgs['href']
+          htmlList.append (bbcode2html.xml ('a', tokenArgs))
+
+      elif tagName == 'list':
+        if tagOpener == '/':
+          # a stray [/list] (possible when the tokens were not fixed) has
+          # nothing to close
+          if lastListOpener:
+            htmlList.append (bbcode2html.xml (u'/' + lastListOpener.pop()))
+        else:
+          if ('type' not in tokenArgs):
+            htmlList.append (bbcode2html.xml (tagOpener + u'ul', tokenArgs))
+            lastListOpener.append ('ul')
+          else:
+            htmlList.append (bbcode2html.xml (tagOpener + u'ol', tokenArgs))
+            lastListOpener.append ('ol')
+
+      elif tagName == '*':
+        htmlList.append (bbcode2html.xml (tagOpener + u'li', tokenArgs))
+
+      elif (tagName == 's'):
+        tokenArgs['style'] =  'text-decoration: line-through;'
+        htmlList.append (bbcode2html.xml (tagOpener + u'span', tokenArgs))
+
+      elif (tagName == 'code'):
+        htmlList.append (bbcode2html.xml (tagOpener + u'pre', tokenArgs))
+
+      elif (tagName == 'quote'):
+        htmlList.append (bbcode2html.xml (tagOpener + u'blockquote', tokenArgs))
+
+      elif (tagName == 'google'):
+        # [google]search this[/google]
+        # An empty tag or a closing tag met on its own produces nothing; it
+        # used to read tokens[index+1] blindly and crash.
+        if (tagOpener == u'') and (nextText is not None):
+          query = nextText
+          if not isinstance (query, str):
+            # quote_plus cannot handle non-ASCII unicode text, give it UTF-8 bytes
+            query = query.encode ('utf-8')
+          htmlList.append (
+            bbcode2html.xml (
+              u'a',
+              {'href' : 'http://www.google.com/search?q=' + urllib.quote_plus (query)},
+              bbcode2html.htmlString (nextText) # xml() does not escape the text
+            )
+          )
+          index = lastConsumed (index, u'/google')
+
+      elif (tagName == 'wikipedia'):
+        # [wikipedia lang=es]Tom Hanks[/wikipedia], same rules as [google]
+        if (tagOpener == u'') and (nextText is not None):
+          subdomain = 'www'
+          for arg  in ['lang', 'language', 'wikipedia']:
+            if arg in tokenArgs:
+              subdomain = tokenArgs[arg]
+
+          htmlList.append (
+            bbcode2html.xml (
+              u'a',
+              {'href' : 'http://' + subdomain + '.wikipedia.org/wiki/' + nextText.replace (' ', '_')},
+              bbcode2html.htmlString (nextText) # xml() does not escape the text
+            )
+          )
+          index = lastConsumed (index, u'/wikipedia')
+
+      elif (tagName in validTags):
+        htmlList.append (
+          bbcode2html.xml (tag, tokenArgs)
+        )
+
+      else:
+        # ignore this tag
+        pass
+
+      index += 1
+
+    return ''.join (htmlList)
+
+  @staticmethod
+  def _filterInvalidTagsAndAttributes (tokens, validTags, allowClassAttr):
+    '''
+    Drop (and rename) tag arguments in place according to validTags.
+
+    Only tag tokens that carry 'args' and whose tag is a key of validTags are
+    touched: their 'args' dict is replaced by a new one that holds just the
+    whitelisted arguments, stored under the output name given by validTags.
+    With allowClassAttr every entry of validTags first gets 'class' added.
+    '''
+    # 'class' maps to itself on every whitelisted tag
+    if allowClassAttr:
+      for allowedArgs in validTags.values():
+        allowedArgs['class'] = 'class'
+
+    for token in tokens:
+      if not isinstance (token, dict):
+        continue # plain text
+      if ('args' not in token) or (token['tag'] not in validTags):
+        continue
+
+      renames = validTags[token['tag']]
+      token['args'] = dict (
+        (renames[name], value)
+        for (name, value) in token['args'].items()
+        if name in renames
+      )
+
+    return
+
+  @staticmethod
+  def xml (tag, attrs = None, text = None, soloTag = False):
+    '''
+    Helper function to produce valid XML output
+
+    @tag
+      tag name, lower-cased in the output; pass '/name' to get a closing tag
+
+    @attrs
+      dict of attribute name -> value. Values are escaped here, names are not.
+
+    @text
+      when given, it is written as is between the opening tag and a closing
+      tag (the caller is responsible for escaping it)
+
+    @soloTag
+      True produces a self-closing '<tag ... />'; only looked at when text is None
+    '''
+    # None instead of a shared {} default, which would be one dict for all calls
+    if attrs is None:
+      attrs = {}
+
+    xml = u'<' + tag.lower()
+
+    # make sure we sort attributes alphabetically (for deterministic output)
+    for key in sorted (attrs.keys()):
+      # Values come straight from the BBCode: without escaping, an unquoted
+      # value holding a '"' closes the attribute and can add new ones.
+      value = attrs[key].replace (u'&', u'&amp;').replace (u'"', u'&quot;')
+      value = value.replace (u'<', u'&lt;').replace (u'>', u'&gt;')
+      xml += u' ' + key + u'="' + value + u'"'
+
+    # close tag
+    if text is None:
+      if soloTag:
+        xml += u' />'
+      else:
+        xml += u'>'
+    else:
+      xml += u'>' + text + u'</' + tag.lower() + '>'
+
+    return xml
+
+
+
--- a/fanficdownloader/bbcodeutils/bbcodebuilder.py
+++ b/fanficdownloader/bbcodeutils/bbcodebuilder.py
@ -0,0 +1,77 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+#
+# Author:        Pau Sanchez (contact@pausanchez.com)
+# Version:       v1.0
+# Last Modified: 2010/09/15
+# 
+# For the latest version check out:
+#   http://www.codigomanso.com/en/projects
+# 
+# My blog:
+#   http://www.codigomanso.com/en/  - English Version
+#   http://www.codigomanso.com/es/  - Spanish Version
+#
+
+import sys
+import os
+import re
+import hashlib
+
+class bbcodebuilder:
+  '''
+  This class helps to build BBCode programmatically.
+
+  The function names are used as the tag name, then the first parameter
+  is the string that goes inside the tags and any extra parameter is 
+  appended as a parameter to the tag
+
+  Examples:
+    > bbcode = bbcodebuilder()  # create a instance!
+
+    > print bbcode.b ('bold')
+    [b]bold[/b]
+
+    > print bbcode.color ('this goes in red', 'red')
+    [color=red]this goes in red[/color]
+
+    > print bbcode.url ('Google', 'http://www.google.com')
+    [url=http://www.google.com]Google[/url]
+    
+    > print bbcode.alist('item 1', 'item 2')
+    [list=a]
+      [*]item 1
+      [*]item 2
+    [/list]
+
+
+  This solution is based on the recipe found on:
+    http://code.activestate.com/recipes/576831-simple-bbcode-support/
+  '''
+
+  def __getattr__(self, name):
+    '''
+    This is a generic getter that returns a function which gets the first parameter
+    as the string that goes between the tags, and extra parameters as tag parameters.
+
+    The name of the attribute is used as the tag name
+
+    Names of the form __xxx__ raise AttributeError instead.
+    '''
+    # Special-method lookups (copy looks for __copy__/__deepcopy__, pickle for
+    # __getstate__, ...) must fail normally; answering them with a tag builder
+    # makes those protocols call it and get a BBCode string back.
+    if name.startswith ('__') and name.endswith ('__'):
+      raise AttributeError (name)
+
+    class bbcodebuilder_helper:
+      def __init__(self, name):
+        self._name = name
+
+      def __call__(self, string, *args):
+        # Each argument is formatted as unicode text: str() raised on unicode
+        # arguments holding non-ASCII characters.
+        argstr = u''
+        if args:
+          argstr = u'=' + u','.join (u'{0}'.format (arg) for arg in args)
+        return u'[{0}{1}]{2}[/{0}]'.format(self._name, argstr, string)
+    
+    return bbcodebuilder_helper (name)
+
+  def _buildList (self, opener, items):
+    ''' opener, then one indented "[*]item" line per item, then [/list] '''
+    return opener + u''.join (u"\n  [*]" + item for item in items) + u"\n[/list]"
+
+  def list(self, *items):
+    ''' Unordered list: [list] ... [/list] '''
+    return self._buildList (u'[list]', items)
+
+  def nlist(self, *items):
+    ''' Numbered list: [list=1] ... [/list] '''
+    return self._buildList (u'[list=1]', items)
+
+  def alist(self, *items):
+    ''' Alphabetic list: [list=a] ... [/list] '''
+    return self._buildList (u'[list=a]', items)
+
+
--- a/fanficdownloader/bbcodeutils/bbcodeparser.py
+++ b/fanficdownloader/bbcodeutils/bbcodeparser.py
@ -0,0 +1,251 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+#
+# Author:        Pau Sanchez (contact@pausanchez.com)
+# Version:       v1.0
+# Last Modified: 2010/09/15
+# 
+# For the latest version check out:
+#   http://www.codigomanso.com/en/projects
+# 
+# My blog:
+#   http://www.codigomanso.com/en/  - English Version
+#   http://www.codigomanso.com/es/  - Spanish Version
+#
+
+import sys
+import os
+import re
+import hashlib
+
+class bbcodeparser:
+  '''
+  This class parses BBCode into a internal structure to allow later processing and
+  conversion to HTML.
+
+  The parser tries to fix invalid code (like unclosed tags)
+
+  Useful URLs:
+    http://en.wikipedia.org/wiki/BBCode
+    http://www.bbcode.org/reference.php
+
+  Example:
+    > bbcode = bbcodeparser ()
+    > bbcode.parse ('[b]text in bold[/b]').html()
+    <b>text in bold</b>
+
+    # dump HTML
+    > bbcode.parse ('[p][color=red]text in red').html()
+    <p><span style="color:red;">text in red</span></p>
+
+    # dump fixed BBCode
+    > bbcode.parse ('[p][color=red]text in red').bbcode()
+    [p][color=red]text in red[/color][/p]
+
+    > bbcode.parse ('This [b][i]code[/b] will be fixed[/invalid]').bbcode()
+    This [b][i]code[/i][/b] will be fixed
+
+    # dump fixed bbcode
+    > str (bbcodeparse ('This [b][i]code[/b] will be fixed[/invalid]'))
+    This [b][i]code[/i][/b] will be fixed
+  '''
+  _bbcode = ''
+  _tokens = []
+
+  def __init__ (self, bbcode = '', fixInvalidCode = True):
+    '''
+    Create the parser and parse bbcode right away (an empty string by default).
+
+    fixInvalidCode is handed to parse() unchanged.
+    '''
+    self.parse (bbcode, fixInvalidCode = fixInvalidCode)
+
+  def __str__ (self):
+    ''' Same as bbcode(): the parsed tokens dumped back as BBCode '''
+    return self.bbcode()
+
+  def parse (self, bbcode = None, fixInvalidCode = True):
+    '''
+    Tokenize bbcode and store the result in this object, passing the tokens
+    through fixWrongTags when fixInvalidCode is True.
+
+    Called with bbcode = None it changes nothing.
+
+    It will return the current object to allow chaining
+
+    Example:
+      code = bbcodeparser()
+      code.parse ('[b]bold[/b]', True)
+      code.parse ('[b]bold[i]italics[/b]', True) -> internally will add the missing '[/i]'
+    '''
+    if bbcode is None:
+      # nothing new to parse, keep what was parsed before
+      return self
+
+    self._bbcode = bbcode
+    self._tokens = self.tokenize (bbcode)
+    if not fixInvalidCode:
+      return self
+
+    self._tokens = self.fixWrongTags (self._tokens)
+    return self
+
+  # return ALL tokens
+  def getTokens (self):
+    ''' Return the internal token list itself (not a copy) '''
+    return self._tokens
+
+  def bbcode (self):
+    '''
+    Dump BBCode again. This is useful for dumping valid BBCode
+
+    Square brackets inside text tokens are written backslash-escaped, closing
+    tags and tags without arguments as [tag]. For a tag with arguments the
+    argument named like the tag becomes the '=value' part; the remaining ones
+    follow as name="value", sorted by name.
+    '''
+    bbcode = []
+    for token in self._tokens:
+      if token is None:
+        continue
+
+      if isinstance (token, basestring):
+        bbcode.append (token.replace (u'[', u'\[').replace (u']', u'\]'))
+        continue
+      
+      tag       = token['tag']  # opening or closing simple tag. e.g: 'b', '/b', '/u', ...
+      tagOpener = (u'/' if tag[0] == u'/' else u'')
+
+      if (tagOpener == '/') or ('args' not in token):
+        bbcode.append (u'[' + tag + u']')
+        continue
+
+      # process args
+      args   = token['args']
+      argstr = ''
+
+      # NOTE(review): a '"' inside a value is written unchanged. The old
+      # replace (u'"', u'\"') was a no-op (both literals are a bare quote) and
+      # the tokenizer's quoted-value pattern cannot read an escaped quote.
+
+      # the arg with the same name as the tag represents the '=whatever'
+      if tag in args:
+        # search, not match: match only looked at the first character, so a
+        # value like 'Alt text here' was dumped unquoted and cut at the first
+        # space when parsed again
+        if re.search (r'\s|"', args[tag]) is None:
+          argstr = u'=' + args[tag]
+        else:
+          argstr = u'="' + args[tag] + u'"'
+
+      # sorted so the same tokens always give the same text
+      for k in sorted (args):
+        if k == tag: # already processed
+          continue
+        argstr += ' ' + k + u'="' + args[k] + u'"'
+
+      bbcode.append (u'[' + tag + argstr + ']')
+
+    return u''.join (bbcode)
+
+  def html (self, allowClassAttr = False, doDeepCopy = True):
+    '''
+    Convert current parsed code to HTML by handing the stored tokens to
+    bbcode2html.convertToHTML
+
+    @allowClassAttr
+      Is something like [b class="asdf"] allowed?
+
+    @doDeepCopy 
+      True:  it does a deep copy of tokens so this list will remain unchanged
+      False: tokens will be modified internally, but the output will be produced like 5x faster
+             it's a good idea to use False when the string parsed is huge and this is the
+             last operation on the string
+
+    Example:
+      code = bbcodeparser ('[b]bold[/b]')
+      code.html() -> '<b>bold</b>'
+    '''
+    # NOTE(review): implicit relative import of the sibling module, done at
+    # call time rather than at the top of the file - confirm it resolves in
+    # every environment this package is loaded from
+    from bbcode2html import bbcode2html
+    return bbcode2html.convertToHTML (self._tokens, allowClassAttr = allowClassAttr, doDeepCopy = doDeepCopy)
+
+
+  @staticmethod
+  def fixWrongTags (inTokenList):
+    '''
+    Return a new token list in which every opened tag gets closed, in the right order
+
+      - a closing tag first closes the tags opened after the one it belongs to
+      - a closing tag that belongs to no open tag is dropped
+      - opening the tag that is already the innermost open one closes it first
+      - whatever is still open at the end is closed
+
+    inTokenList itself is not modified.
+    '''
+    opened       = []
+    outTokenList = []
+    for token in inTokenList:
+      # normal string... do nothing
+      if isinstance(token, basestring):
+        outTokenList.append (token)
+        continue
+
+      tag = token['tag']
+
+      # if starts with '/' is closing a tag
+      if tag[0] == '/':
+        name = tag[1:]
+
+        # Stray closing tag: just drop it. It used to close every tag that
+        # was still open before being dropped, cutting their content short.
+        if name not in opened:
+          continue
+
+        # close what was opened inside the tag being closed
+        while opened[-1] != name:
+          outTokenList.append ({'tag' : '/' + opened[-1] })
+          del opened[-1]
+
+        del opened[-1]
+        outTokenList.append (token)
+
+      # opening tag
+      else:
+        # if I open the same tag I opened before, close it, and open it again
+        if (len(opened) > 0) and (tag == opened[-1]):
+          outTokenList.append ({'tag' : '/' + opened[-1] })
+        else:
+          opened.append (tag)
+        outTokenList.append (token)
+
+    # close all elements that have not been closed
+    while len(opened):
+      outTokenList.append ({'tag' : '/' + opened[-1] })
+      del opened[-1]
+
+    return outTokenList
+
+  @staticmethod
+  def tokenize(code):
+    '''
+    Tokenize BBCode tags and parameters
+
+    Return the token list using a internal format. See the example:
+      [
+        { 'tag' : 'p', 'args' : { 'font' : 'arial' } },
+        'This is ',
+        { 'tag' : 'url', 'args' : {'url' : 'http://www.google.com'} },
+        'a link to google',
+        { 'tag' : '/url' },
+        { 'tag' : '/p' }
+      ]
+
+    Tag and argument names are lower-cased. Backslash-escaped brackets and
+    bracketed text that is not a tag (such as '[ ]') end up in text tokens.
+    '''
+    re_tags    = re.compile (r'(\[[^]]+\])', re.DOTALL | re.UNICODE)
+    re_tagName = re.compile (r'\[([^]=\s]+)([^]]*)\]',  re.DOTALL | re.UNICODE)
+    #re_tagArgs = re.compile (r'\s*([^=]*)=(("([^"]+)")|([^\s]+))',  re.DOTALL | re.UNICODE)
+    re_tagArgs = re.compile (r'\s*([\w]*)=(("([^"]+)")|([^\s]+))',  re.DOTALL | re.UNICODE)
+
+    # get a unique name and replace escaped braces 
+    # md5 works on bytes: hashing unicode text directly fails as soon as it
+    # holds a non-ASCII character
+    digestInput = code
+    if not isinstance (digestInput, bytes):
+      digestInput = digestInput.encode ('utf-8')
+    unique     = hashlib.md5(digestInput).hexdigest()
+    openBrace  = unique+'_OPEN_BRACE'
+    closeBrace = unique+'_CLOSE_BRACE'
+    code   = code.replace ('\[', openBrace)
+    code   = code.replace ('\]', closeBrace)
+
+    outTokenList = []
+    for token in re_tags.split(code):
+      if len(token) == 0:
+        continue
+
+      match = (re_tagName.match (token) if token[0] == '[' else None)
+
+      # Plain text, or brackets that do not hold a tag name ('[ ]', '[=x]'):
+      # re_tagName does not match those and match.group used to crash.
+      if match is None:
+        # restore escaped braces back
+        outTokenList.append (token.replace (openBrace, '[').replace (closeBrace, ']'))
+        continue
+
+      tagName = match.group(1)
+      tagArgs = match.group(2)
+
+      tagToken = { 'tag' : tagName.lower() }
+
+      # parse arguments (if any)
+      if len(tagArgs) > 0:
+        allArgs = re_tagArgs.findall(tagArgs)
+
+        tagArgs = {}
+        for arg in allArgs:
+          # if the argument has no name, use the tagName itself
+          argName  = (arg[0] if arg[0] != '' else tagName)
+          argValue = (arg[3] if (arg[1][0] == '"') else arg[4])
+
+          # NOTE(review): '\"' and '"' are the same literal, so this replace
+          # changes nothing - kept as it was
+          tagArgs[argName.lower()] = argValue.replace ('\"', '"')
+
+        tagToken['args'] = tagArgs
+
+      outTokenList.append (tagToken)
+
+    return outTokenList
+
--- a/fanficdownloader/bbcodeutils/readme.txt
+++ b/fanficdownloader/bbcodeutils/readme.txt
@ -0,0 +1,81 @@
+AUTHOR
+  Pau Sanchez
+  http://www.codigomanso.com/
+
+VERSION:
+  bbcodeutils v1.0
+  
+LICENSE
+  This code is licensed under Creative Commons Attribution 3.0
+  http://creativecommons.org/licenses/by/3.0/
+
+  You can use this python module or any part of the code you want as long as you add
+  my name as a contributor to your project.
+  
+DESCRIPTION
+  This module can be used to produce HTML from BBCode, to generate BBCode or to fix invalid BBCode.
+
+  The classes are:
+    - bbcodeparser
+    - bbcodebuilder
+    - bbcode2html
+  
+  You can use bbcodeparser to parse BBCode and to produce output in any format you want.
+
+  Open each Python file for more information and usage examples for its class.
+  test.py is also a good place to look for examples.
+
+  To run the unit tests:
+    > python test.py
+  
+  To run the performance test:
+    > python test.py BBCodeTests.performanceTest  
+
+
+EXAMPLES OF BBCode:
+
+  [b] -> bold
+  [u] -> underline
+  [i] -> italic
+
+  [center] -> center the text inside
+  [color=XXX] -> change color of text
+  [size=XXX] -> change size of text
+
+  Lists:
+  [ul] -> unordered list
+  [ol] -> ordered list
+  [li] -> list item
+
+  [list] -> start unordered list
+  [*]    -> list item
+  [list=1] -> start a numbered list
+  [list=a] -> start an alphabetically lettered list
+
+  Advanced:
+  [url] -> link to url
+  [url=http://link/url/]text[/url]  
+  [url link=http://link/url/ title="This is the title"]text[/url]  
+
+  [img]http://to/image[/img]
+  [img=230x330]http://to/image[/img]
+  [img="Alt text here"]http://to/image[/img]
+  [img="Alt text here" width=320 height=240]http://to/image[/img]
+
+  [email]asdf@asdf.com[/email]
+  [email=john@asdf.com]John Smith[/email]
+
+  [google]search this[/google]
+  [wikipedia]Tom Hanks[/wikipedia]
+  [wikipedia lang=es]Tom Hanks[/wikipedia]
+
+  Tables:
+  [table]
+  [tr]
+  [th]
+  [td]
+
+  Advanced:
+  [google]
+  [wikipedia]
+
--- a/fanficdownloader/bbcodeutils/test.py
+++ b/fanficdownloader/bbcodeutils/test.py
@ -0,0 +1,420 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+#
+# Author:        Pau Sanchez (contact@pausanchez.com)
+# Version:       v1.0
+# Last Modified: 2010/09/15
+# 
+# For the latest version check out:
+#   http://www.codigomanso.com/en/projects
+# 
+# My blog:
+#   http://www.codigomanso.com/en/  - English Version
+#   http://www.codigomanso.com/es/  - Spanish Version
+#
+
+from bbcodeparser import bbcodeparser
+from bbcodebuilder import bbcodebuilder
+
+import random
+import unittest
+
+class BBCodeTests(unittest.TestCase):
+  def setUp (self):
+    self.bbcode = bbcodeparser()
+    return
+
+  def testConstructor (self):
+    self.assertEqual (bbcodeparser ('whatever').html(), 'whatever')
+    self.assertEqual (bbcodeparser ('[b]bold[/b]').html(), '<b>bold</b>')
+    self.assertEqual (str (bbcodeparser ('[b]bold[/b]')), '[b]bold[/b]')
+    return
+
+  def testBold (self):
+    self.assertEqual (self.bbcode.parse ('whatever').html(), 'whatever')
+    self.assertEqual (self.bbcode.parse ('[b]bold[/b]').html(), '<b>bold</b>')
+    self.assertEqual (self.bbcode.parse ('[B]bold[/b]').html(), '<b>bold</b>')
+    self.assertEqual (self.bbcode.parse ('this is [B]bold[/B]').html(), 'this is <b>bold</b>')
+    return
+
+  def testItalic (self):
+    self.assertEqual (self.bbcode.parse ('[i]italic[/i]').html(), '<i>italic</i>')
+    return
+
+  def testUnderline (self):
+    self.assertEqual (self.bbcode.parse ('[u]italic[/u]').html(), '<u>italic</u>')
+    return
+
+  def testURLs (self):
+    self.assertEqual (
+      self.bbcode.parse ('[url]http://www.google.com[/url]').html(),
+      '<a href="http://www.google.com">http://www.google.com</a>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[url=http://www.google.com]Google[/url]').html(),
+      '<a href="http://www.google.com">Google</a>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[url="http://www.google.com"]Google[/url]').html(),
+      '<a href="http://www.google.com">Google</a>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[url="http://www.google.com" title="Search Engine"]Google[/url]').html(),
+      '<a href="http://www.google.com" title="Search Engine">Google</a>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[url link="http://www.google.com"]Google[/url]').html(),
+      '<a href="http://www.google.com">Google</a>'
+    )
+    return
+
+  def testPTag (self):
+    self.assertEqual (
+      self.bbcode.parse ('[p color=#0000ff]blue[/p]').html(),
+      u'<p style="color: #0000ff;">blue</p>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[p size=12]12pt font[/p]').html(),
+      u'<p style="font-size: 12pt;">12pt font</p>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[p font=arial]arial[/p]').html(),
+      u'<p style="font-family: arial;">arial</p>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[p font=arial color=blue size=14]blue 14pt arial').html(),
+      u'<p style="color: blue; font-size: 14pt; font-family: arial;">blue 14pt arial</p>' 
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[p class=whatever]text[/p]').html(),
+      u'<p>text</p>' 
+    )
+    return
+
+  def testColorTag (self):
+    self.assertEqual (
+      self.bbcode.parse ('[color=#0000ff]blue[/color]').html(),
+      u'<span style="color: #0000ff;">blue</span>'
+    )
+    return
+
+  def testSizeTag (self):
+    self.assertEqual (
+      self.bbcode.parse ('[size=12]12pt font[/size]').html(),
+      u'<span style="font-size: 12pt;">12pt font</span>'
+    )
+    return
+
+  def testEmail(self):
+    self.assertEqual (
+      self.bbcode.parse ('[email]asdf@asdf.com[/email]').html(),
+      u'<a href="mailto:asdf@asdf.com">asdf@asdf.com</a>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[email=john@smith.com]John Smith[/email]').html(),
+      u'<a href="mailto:john@smith.com">John Smith</a>'
+    )
+    return
+
+  def testImgTag (self):
+    self.assertEqual (
+      self.bbcode.parse ('[img]http://www.codigomanso.com/image.jpg[/img]').html(),
+      u'<img src="http://www.codigomanso.com/image.jpg" />'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[img="This is the ALT of the image"]http://www.codigomanso.com/image.jpg[/img]').html(),
+      u'<img alt="This is the ALT of the image" src="http://www.codigomanso.com/image.jpg" />'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[img=320x200]http://www.codigomanso.com/image.jpg[/img]').html(),
+      u'<img height="200" src="http://www.codigomanso.com/image.jpg" width="320" />'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[img=320x200 title="Image Test"]http://www.codigomanso.com/image.jpg[/img]').html(),
+      u'<img height="200" src="http://www.codigomanso.com/image.jpg" title="Image Test" width="320" />'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[img="whatever" width=320 height="212" title="Image Test"]http://www.codigomanso.com/image.jpg[/img]').html(),
+      u'<img alt="whatever" height="212" src="http://www.codigomanso.com/image.jpg" title="Image Test" width="320" />'
+    )
+    return
+
+  def testGoogleURL (self):
+    self.assertEqual (
+      self.bbcode.parse ('[google]asdf[/google]').html(),
+      u'<a href="http://www.google.com/search?q=asdf">asdf</a>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[google]Tom Hanks[/google]').html(),
+      u'<a href="http://www.google.com/search?q=Tom+Hanks">Tom Hanks</a>'
+    )
+    return
+    
+  def testWikipediaURL (self):
+    self.assertEqual (
+      self.bbcode.parse ('[wikipedia]Tom Hanks[/wikipedia]').html(),
+      u'<a href="http://www.wikipedia.org/wiki/Tom_Hanks">Tom Hanks</a>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[wikipedia language=en]Tom Hanks[/wikipedia]').html(),
+      u'<a href="http://en.wikipedia.org/wiki/Tom_Hanks">Tom Hanks</a>'
+    )
+    
+    self.assertEqual (
+      self.bbcode.parse ('[wikipedia lang=es]Tom Hanks[/wikipedia]').html(),
+      u'<a href="http://es.wikipedia.org/wiki/Tom_Hanks">Tom Hanks</a>'
+    )
+    
+    self.assertEqual (
+      self.bbcode.parse ('[wikipedia=es]Tom Hanks[/wikipedia]').html(),
+      u'<a href="http://es.wikipedia.org/wiki/Tom_Hanks">Tom Hanks</a>'
+    )
+    return
+    
+  def testListTags (self):
+    self.assertEqual (
+      self.bbcode.parse ('[ul][li]item 1[/li][li]item 2[/li][/ul]').html(),
+      u'<ul><li>item 1</li><li>item 2</li></ul>'
+    )
+    
+    self.assertEqual (
+      self.bbcode.parse ('[ol][li]item 1[/li][li]item 2[/li][/ol]').html(),
+      u'<ol><li>item 1</li><li>item 2</li></ol>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[list][li]item 1[/li][li]item 2[/li][/list]').html(),
+      u'<ul><li>item 1</li><li>item 2</li></ul>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[list][*]item 1[*]item 2[/list]').html(),
+      u'<ul><li>item 1</li><li>item 2</li></ul>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[list=1][li]item 1[/li][li]item 2[/li][/list]').html(),
+      u'<ol type="1"><li>item 1</li><li>item 2</li></ol>'
+    )
+    return
+
+  def testInvalidCode (self):
+    self.assertEqual (self.bbcode.parse ('[invalid]valid text[/invalid]').html(), 'valid text')
+    self.assertEqual (
+      self.bbcode.parse ('[b]bold and [i]italics[/b]').html(),
+      '<b>bold and <i>italics</i></b>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[/b]invalid[/b][/p]').html(),
+      'invalid'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[p][b]bold').html(),
+      '<p><b>bold</b></p>'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[p][b]a <b>').html(),
+      '<p><b>a &lt;b&gt;</b></p>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[ol][li]item 1[li]item 2[/li][/ol]').html(),
+      u'<ol><li>item 1</li><li>item 2</li></ol>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[b]\[b\] stands for [b]bold[/b]').html(),
+      u'<b>[b] stands for </b><b>bold</b>'
+    )
+    return
+
+  def testEscapedBrackets (self):
+    self.assertEqual (
+      self.bbcode.parse ('\[b\]not bold\[/b\]').html(),
+      u'[b]not bold[/b]'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('[b]\[b\] stands for bold[/b]').html(),
+      u'<b>[b] stands for bold</b>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('\[b\][b]stands for bold[/b]').html(),
+      u'[b]<b>stands for bold</b>'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('\[b\][b]stands for bold[/b] just like <b> in HTML').html(),
+      u'[b]<b>stands for bold</b> just like &lt;b&gt; in HTML'
+    )
+
+  def testBigExample (self):
+    '''Parse one document mixing most supported tags and compare the complete HTML output.'''
+    inputText = """check this out
+
+    [h1 class=circle]heading[/h1]
+
+    [p size=14 color=blue font="verdana, Times New Roman"]This is [b] bold [/b] and this [i]italic[/i] and this is [color=red]red[/color] and this is [color="red"]also red[/color].
+    [/p]
+
+    fix [b][i]bold [font=verdana][size=12]and[/size][/font] italic[/b]
+    [img]http://www.codigomanso.com/b.jpg[/img]
+    [url]http://www.codigomanso.com/[/url]
+    [url=http://www.codigomanso.com/]Codigo Manso[/url]
+    [uRl link=http://www.codigomanso.com title="Codigo Manso Blog"]Codigo Manso[/url]
+
+    [ul]
+     [Li]item 1[/Li]
+     [li]item 2[/LI]
+    [/UL]
+
+    [list=1 ]
+     [*]item 1
+     [*]item 2
+    [/list]
+
+    [table class="big"]
+      [tr]
+        [th]big[/th]
+      [/tr]
+    [/table]
+    [invalid class="extra"]whatever[/invalid]"""
+    out = self.bbcode.parse (inputText).html(allowClassAttr = True)
+    self.assertEqual (out, '''check this out
+
+    <h1 class="circle">heading</h1>
+
+    <p style="color: blue; font-size: 14pt; font-family: verdana, Times New Roman;">This is <b> bold </b> and this <i>italic</i> and this is <span style="color: red;">red</span> and this is <span style="color: red;">also red</span>.
+    </p>
+
+    fix <b><i>bold <span style="font-family: verdana;"><span style="font-size: 12pt;">and</span></span> italic</i></b>
+    <img src="http://www.codigomanso.com/b.jpg" />
+    <a href="http://www.codigomanso.com/">http://www.codigomanso.com/</a>
+    <a href="http://www.codigomanso.com/">Codigo Manso</a>
+    <a href="http://www.codigomanso.com" title="Codigo Manso Blog">Codigo Manso</a>
+
+    <ul>
+     <li>item 1</li>
+     <li>item 2</li>
+    </ul>
+
+    <ol type="1">
+     <li>item 1
+     </li><li>item 2
+    </li></ol>
+
+    <table class="big">
+      <tr>
+        <th>big</th>
+      </tr>
+    </table>
+    whatever''')
+
+
+  def testBBCodeDumper (self):
+    '''bbcode() re-serializes parsed input: unclosed tags get closed, unmatched closers are dropped.'''
+    self.assertEqual (
+      self.bbcode.parse ('[b]bold[/b]').bbcode(),
+      '[b]bold[/b]'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[color=red]text in red[/color]').bbcode(),
+      '[color=red]text in red[/color]'
+    )
+    self.assertEqual (
+      self.bbcode.parse ('[p][color=red]text in red').bbcode(),
+      '[p][color=red]text in red[/color][/p]'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('This [b][i]code[/b] will be fixed[/invalid]').bbcode(),
+      'This [b][i]code[/i][/b] will be fixed'
+    )
+
+    self.assertEqual (
+      self.bbcode.parse ('\[[url]http://www.codigomanso.com/en[/url]\]').bbcode(),
+      "\[[url]http://www.codigomanso.com/en[/url]\]"
+    )
+
+  def performanceTest(self):
+    '''
+    This test checks the performance of parse and html operations
+
+    To run this test type:
+      > python test.py BBCodeTests.performanceTest
+    '''
+    inputText = """check this out
+
+    [h1 class=circle]heading[/h1]
+
+    [p size=14 color=blue font="verdana, Times New Roman"]This is [b] bold [/b] and this [i]italic[/i] and this is [color=red]red[/color] and this is [color="red"]also red[/color].
+    [/p]
+
+    fix [b][i]bold [font=verdana][size=12]and[/size][/font] italic[/b]
+    [img]http://www.codigomanso.com/b.jpg[/img]
+    [url]http://www.codigomanso.com/[/url]
+    [url=http://www.codigomanso.com/]Codigo Manso[/url]
+    [uRl link=http://www.codigomanso.com title="Codigo Manso Blog"]Codigo Manso[/url]
+
+    [ul]
+     [Li]item 1[/Li]
+     [li]item 2[/LI]
+    [/UL]
+
+    [list=1 ]
+     [*]item 1
+     [*]item 2
+    [/list]
+
+    [table class="big"]
+      [tr]
+        [th]big[/th]
+      [/tr]
+    [/table]
+    [invalid class="extra"]whatever[/invalid]"""
+
+    import time
+    start = time.time()
+
+    for i in range(0, 12):
+      inputText += inputText
+
+    print "len(inputText) = %.2f MB  (took %.2f seconds)" % (len(inputText)/(1024.0*1024.0), time.time() - start)
+    
+    bbcode = bbcodeparser()
+    start = time.time()
+    bbcode.parse (inputText)
+    total = (time.time() - start)
+    print "time (bbcode.parse()) = %f" % total
+    print "  >> %.2f chars/second" % (len(inputText) / total)
+
+    start = time.time()
+    bbcode.html(doDeepCopy = False)
+    total = (time.time() - start)
+    print "time (bbcode.html()) = %f" % total
+    print "  >> %.2f chars/second" % (len(inputText) / total)
+    return
+    
+  def testCodeBuilder (self):
+    '''bbcodebuilder helper methods return the BBCode markup for their arguments.'''
+    bbcode = bbcodebuilder ()
+    self.assertEqual (bbcode.b ('bold'), u'[b]bold[/b]')
+    self.assertEqual (bbcode.color ('this goes in red', 'red'), u'[color=red]this goes in red[/color]')
+    self.assertEqual (bbcode.url ('Google', 'http://www.google.com'), u'[url=http://www.google.com]Google[/url]')
+    self.assertEqual (bbcode.alist('item 1', 'item 2'), u"[list=a]\n  [*]item 1\n  [*]item 2\n[/list]")
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+
--- a/fanficdownloader/story.py
+++ b/fanficdownloader/story.py
@ -319,12 +319,25 @@ class Story:
    def isList(self,listname):
        return self.listables.has_key(listname)
    
-    def addChapter(self, title, html):
+    def addChapter(self, title, html, configurable=None):
+        if configurable and \
+                configurable.getConfig('strip_chapter_numbers') and \
+                configurable.getConfig('chapter_title_strip_pattern'):
+            title = re.sub(configurable.getConfig('chapter_title_strip_pattern'),"",title)
        self.chapters.append( (title,html) )

-    def getChapters(self):
+    def getChapters(self, configurable=None):
        "Chapters will be tuples of (title,html)"
-        return self.chapters
+        retval = []
+        if configurable and \
+                configurable.getConfig('add_chapter_numbers') and \
+                configurable.getConfig('chapter_title_add_pattern'):
+            for index, (title,html) in enumerate(self.chapters):
+                retval.append( (string.Template(configurable.getConfig('chapter_title_add_pattern')).substitute({'index':index+1,'title':title}),html) ) 
+        else:
+            retval = self.chapters
+            
+        return retval

    def formatFileName(self,template,allowunsafefilename=True):
        values = origvalues = self.getAllMetadata()
--- a/fanficdownloader/writers/base_writer.py
+++ b/fanficdownloader/writers/base_writer.py
@ -184,10 +184,10 @@ class BaseStoryWriter(Configurable):
        names as Story.metadata, but ENTRY should use index and chapter.
        """
        # Only do TOC if there's more than one chapter and it's configured.
-        if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage") and not self.metaonly :
+        if len(self.story.getChapters(self)) > 1 and self.getConfig("include_tocpage") and not self.metaonly :
            self._write(out,START.substitute(self.story.getAllMetadata()))

-            for index, (title,html) in enumerate(self.story.getChapters()):
+            for index, (title,html) in enumerate(self.story.getChapters(self)):
                if html:
                    self._write(out,ENTRY.substitute({'chapter':title, 'index':"%04d"%(index+1)}))

--- a/fanficdownloader/writers/writer_epub.py
+++ b/fanficdownloader/writers/writer_epub.py
@ -441,7 +441,7 @@ div { margin: 0pt; padding: 0pt; }
        if self.getConfig("include_titlepage"):
            items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
            itemrefs.append("title_page")
-        if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage") and not self.metaonly :
+        if len(self.story.getChapters(self)) > 1 and self.getConfig("include_tocpage") and not self.metaonly :
            items.append(("toc_page","OEBPS/toc_page.xhtml","application/xhtml+xml","Table of Contents"))
            itemrefs.append("toc_page")

@ -449,7 +449,7 @@ div { margin: 0pt; padding: 0pt; }
            items.append(("log_page","OEBPS/log_page.xhtml","application/xhtml+xml","Update Log"))
            itemrefs.append("log_page")
            
-        for index, (title,html) in enumerate(self.story.getChapters()):
+        for index, (title,html) in enumerate(self.story.getChapters(self)):
            if html:
                i=index+1
                items.append(("file%04d"%i,
@ -587,7 +587,7 @@ div { margin: 0pt; padding: 0pt; }
            outputepub.writestr("OEBPS/log_page.xhtml",logpageIO.getvalue())
        logpageIO.close()
        
-        for index, (title,html) in enumerate(self.story.getChapters()):
+        for index, (title,html) in enumerate(self.story.getChapters(self)):
            if html:
                logging.debug('Writing chapter text for: %s' % title)
                fullhtml = self.EPUB_CHAPTER_START.substitute({'chapter':title, 'index':index+1}) + html + self.EPUB_CHAPTER_END.substitute({'chapter':title, 'index':index+1})
--- a/fanficdownloader/writers/writer_html.py
+++ b/fanficdownloader/writers/writer_html.py
@ -94,7 +94,7 @@ ${output_css}
                          self.HTML_TOC_ENTRY,
                          self.HTML_TOC_PAGE_END)

-        for index, (title,html) in enumerate(self.story.getChapters()):
+        for index, (title,html) in enumerate(self.story.getChapters(self)):
            if html:
                logging.debug('Writing chapter text for: %s' % title)
                self._write(out,self.HTML_CHAPTER_START.substitute({'chapter':title, 'index':"%04d"%(index+1)}))
--- a/fanficdownloader/writers/writer_mobi.py
+++ b/fanficdownloader/writers/writer_mobi.py
@ -169,7 +169,7 @@ ${value}<br />
        #     files.append(tocpageIO.getvalue())
        # tocpageIO.close()

-        for index, (title,html) in enumerate(self.story.getChapters()):
+        for index, (title,html) in enumerate(self.story.getChapters(self)):
            if html:
                logging.debug('Writing chapter text for: %s' % title)
                fullhtml = self.MOBI_CHAPTER_START.substitute({'chapter':title, 'index':index+1}) + html + self.MOBI_CHAPTER_END.substitute({'chapter':title, 'index':index+1})
--- a/fanficdownloader/writers/writer_txt.py
+++ b/fanficdownloader/writers/writer_txt.py
@ -133,7 +133,7 @@ End file.
        
        self._write(out,self.lineends(self.wraplines(towrap)))

-        for index, (title,html) in enumerate(self.story.getChapters()):
+        for index, (title,html) in enumerate(self.story.getChapters(self)):
            if html:
                logging.debug('Writing chapter text for: %s' % title)
                self._write(out,self.lineends(self.wraplines(removeAllEntities(self.TEXT_CHAPTER_START.substitute({'chapter':title, 'index':index+1})))))
--- a/plugin-defaults.ini
+++ b/plugin-defaults.ini
@ -152,6 +152,45 @@ extratags: FanFiction
 # ${category} => Buffy:? [tT]he Vampire Slayer => BuffyCover
 # ${category} => Star Trek => StarTrekCover

+## If set to false, the summary will have all HTML stripped.
+## Both this and include_images must be true to get images in the
+## summary.
+keep_summary_html:true
+
+## Don't like the numbers at the start of chapter titles on some
+## sites?  You can use strip_chapter_numbers to strip them off.  Just
+## want to make them all look the same?  Strip them off, then add them
+## back on with add_chapter_numbers.  Don't like the way it strips
+## numbers or adds them back?  See chapter_title_strip_pattern and
+## chapter_title_add_pattern.
+strip_chapter_numbers:false
+add_chapter_numbers:false
+
+## (Two versions of chapter_title_strip_pattern are shown below.  You
+## should only have one uncommented.)
+## This version will remove the leading number from:
+## "1." => ""
+## "1. The Beginning" => "The Beginning"
+## "1: Start" => "Start"
+## "2 - Chapter the second" => "Chapter the second"
+## etc
+chapter_title_strip_pattern:^[0-9]+[\.: -]+
+
+## This version will strip all of the above *plus* remove 'Chapter 1':
+## "Chapter 1" => ""
+## "1. Chapter 1" => ""
+## "1. Chapter 1, Bob's First Clue" => "Bob's First Clue"
+## "Chapter 2 - Pirates Place" => "Pirates Place"
+## etc
+#chapter_title_strip_pattern:^([0-9]+[\.: -]+)?(Chapter *[0-9]+[\.:, -]*)?
+
+## Uses a python template substitution.  The ${index} is the 'chapter'
+## number and ${title} is the chapter title, after applying
+## chapter_title_strip_pattern.  Those are the only variables available.
+## "The Beginning" => "1. The Beginning" 
+chapter_title_add_pattern:${index}. ${title}
+
+
 ## Each output format has a section that overrides [defaults]
 [html]

@ -256,11 +295,6 @@ output_css:
 ## stories.  Images will be converted to jpg for size if possible.
 #include_images:false

-## If not set, the summary will have all html stripped for safety.
-## Both this and include_images must be true to get images in the
-## summary.
-#keep_summary_html:false
-
 ## If set, the first image found will be made the cover image.  If
 ## keep_summary_html is true, any images in summary will be before any
 ## in chapters.
--- a/plugin-example.ini
+++ b/plugin-example.ini
@ -6,6 +6,15 @@
 ## adult content.  Uncomment by removing '#' in front of is_adult.
 #is_adult:true

+## Don't like the numbers at the start of chapter titles on some
+## sites?  You can use strip_chapter_numbers to strip them off.  Just
+## want to make them all look the same?  Strip them off, then add them
+## back on with add_chapter_numbers.  Don't like the way it strips
+## numbers or adds them back?  See chapter_title_strip_pattern and
+## chapter_title_add_pattern.
+#strip_chapter_numbers:true
+#add_chapter_numbers:true
+
 [epub]
 ## include images from img tags in the body and summary of stories.
 ## Images will be converted to jpg for size if possible.  Images work
--- a/readme.txt
+++ b/readme.txt
@ -1,3 +1,21 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2011 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Other code contributed by Pau Sanchez(bbcodeutils).
+
 To use, do:

 python downloader.py [-f (epub|html|txt)] <url>