Fixes to various adapters from first-round testing, and some code cleanup.

This commit is contained in:
Jim Miller 2011-05-13 21:34:58 -05:00
parent a88d4cac50
commit df9e877865
12 changed files with 73 additions and 59 deletions

View file

@ -32,7 +32,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','aaff')
self.decode = "utf8"
self.decode = "ISO-8859-1"
self.story.addToList("category","Star Trek")
self.is_adult=False
@ -48,10 +48,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.adastrafanfic.com'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"

View file

@ -53,10 +53,6 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionalley.org'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/authors/drt/DA.html http://"+self.getSiteDomain()+"/authors/drt/JOTP01a.html"

View file

@ -43,10 +43,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionpress.com'
@classmethod
def getAcceptDomains(cls):
return ['www.fictionpress.com']
def getSiteExampleURLs(self):
return "http://www.fictionpress.com/s/1234/1/ http://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title"

View file

@ -44,10 +44,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.ficwad.com'
@classmethod
def getAcceptDomains(cls):
return ['www.ficwad.com']
def getSiteExampleURLs(self):
return "http://www.ficwad.com/story/137169"
@ -137,8 +133,9 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# warnings
# <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
spanreq = metap.find("span",{"class":"req"})
for a in spanreq.findAll("a"):
self.story.addToList('warnings',a['title'])
if spanreq: # can be no warnings.
for a in spanreq.findAll("a"):
self.story.addToList('warnings',a['title'])
## perhaps not the most efficient way to parse this, using
## regexps for each rather than something more complex, but

View file

@ -32,7 +32,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','hp')
self.decode = "ISO-8859-1"
self.decode = "Windows-1252" # Another site that lies to us. <rolls eyes>
self.story.addToList("category","Harry Potter")
self.is_adult=False
@ -115,7 +115,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
#javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
# just in case there's tags, like <i> in chapter titles.
chpt=re.sub(r'^.*?(\?chapterid=433441).*?',r'\1',chapter['href'])
chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href'])
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -178,12 +178,13 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'fluidtext'})
div = soup.find('div', {'id' : 'fluidtext'})
if None == span:
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
print div
return utf8FromSoup(div)
def getClass():
return HarryPotterFanFictionComSiteAdapter

View file

@ -32,7 +32,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','mm')
self.decode = "utf8"
self.decode = "ISO-8859-1"
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
@ -50,10 +50,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.mediaminer.org'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
@ -61,7 +57,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
## http://www.mediaminer.org/fanfic/view_st.php/76882
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
return re.escape("http://"+self.getSiteDomain())+\
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+#fic_c)?$"
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
def extractChapterUrlsAndMetadata(self):
@ -79,18 +75,26 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
## Title
title = soup.find('title').string
## MediaMiner - Fan Fic: Par Tout Autre Nom
## MediaMiner: Question and Answer ( One-Shot )
## MediaMiner: Moaning to Wake the Dead ( Chapter 1 )
title = re.match(r'^MediaMiner(?: - Fan Fic)?:(.*?)(?: \( .*? \))?$',title).group(1)
# [ A - All Readers ], strip '[ ' ' ]'
# [ A - All Readers ], strip '[' ']'
## Above title because we remove the smtxt font to get title.
rating = soup.find("font",{"class":"smtxt"}).string[1:-1]
self.story.setMetadata('title',title)
self.story.setMetadata('rating',rating)
## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'.
## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
title = soup.find('td',{'class':'ffh'})
for font in title.findAll('font'):
font.extract() # removes 'font' tags from inside the td.
if title.has_key('colspan') or 'src.php/t/ONE_SHOT' in data:
titlet = title.text
else:
titlet = ':'.join(title.text.split(':')[:-1]) # strip trailing 'Chapter X', but only when no colspan and not one-shot
self.story.setMetadata('title',titlet)
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[-1])
@ -103,7 +107,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Find the chapters
select = soup.find('select',{'name':'cid'})
if not select:
self.chapterUrls.append((title,self.url))
self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
else:
for option in select.findAll("option"):
chapter = stripHTML(option.string)
@ -135,7 +139,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Everything else is in <tr bgcolor="#EEEED4">
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
print metastr
# Latest Revision: August 03, 2010
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
if m:
@ -171,21 +174,45 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
data=self._fetchUrl(url)
soup = bs.BeautifulStoneSoup(data,
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
anchor = soup.find('a',{'name':'fic_c'})
if None == anchor:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
for div in anchor.findAllNext('div',{'align':'left'}):
div.name='p' # convert to <p> mediaminer uses div with a
# margin for paragraphs.
anchor.append(div) # cheat! stuff all the content divs
# into anchor just as a holder.
## find divs with align=left, those are paragraphs in newer stories.
divlist = anchor.findAllNext('div',{'align':'left'})
if divlist:
for div in divlist:
div.name='p' # convert to <p> mediaminer uses div with
# a margin for paragraphs.
anchor.append(div) # cheat! stuff all the content
# divs into anchor just as a
# holder.
del div['style']
del div['align']
anchor.name='div'
return utf8FromSoup(anchor)
else:
logging.debug('Using kludgey text find for older mediaminer story.')
## Some older mediaminer stories are unparsable with BeautifulSoup.
## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first.
## Story stuff falls between:
data = "<div id='HERE'>" + data[data.find('<a name="fic_c">'):] +"</div>"
soup = bs.BeautifulStoneSoup(data,
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
for tag in soup.findAll('td',{'class':'ffh'}) + \
soup.findAll('div',{'class':'acl'}) + \
soup.findAll('div',{'class':'footer smtxt'}) + \
soup.findAll('table',{'class':'tbbrdr'}):
tag.extract() # remove tag from soup.
return utf8FromSoup(soup)
return utf8FromSoup(anchor)
def getClass():
return MediaMinerOrgSiteAdapter

View file

@ -32,7 +32,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','pns')
self.decode = "utf8"
self.decode = "ISO-8859-1"
self.story.addToList("category","Harry Potter")
# get storyId from url--url validation guarantees query is only sid=1234
@ -171,6 +171,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'story'})
for p in span.findAll('p'):
if p.has_key('style'):
del p['style']
if p.has_key('class'):
del p['class']
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -49,10 +49,6 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.thewriterscoffeeshop.com'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/library/viewstory.php?sid=1234"

View file

@ -32,7 +32,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','tw')
self.decode = "utf8"
self.decode = "ISO-8859-1" ## tw *lies*. It claims to be UTF8 in the headers, but it isn't. "utf8"
self.story.addToList("category","Twilight")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""

View file

@ -144,7 +144,7 @@ h6 { text-align: center; }
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h2>${chapter}</h2>
<h3>${chapter}</h3>
''')
self.EPUB_CHAPTER_END = string.Template('''

View file

@ -2,7 +2,7 @@
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
<title>Login Needed Fanfiction Downloader</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
<script type="text/javascript">

View file

@ -17,7 +17,7 @@
import logging
## XXX cli option for logging level.
logging.basicConfig(level=logging.WARN,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
import sys, os
from optparse import OptionParser