From df9e8778657a19890f3beee87ea33974a42da837 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Fri, 13 May 2011 21:34:58 -0500 Subject: [PATCH] Fixes to various from first round testing and some code clean up. --- .../adapters/adapter_adastrafanficcom.py | 6 +- .../adapters/adapter_fictionalleyorg.py | 4 - .../adapters/adapter_fictionpresscom.py | 4 - .../adapters/adapter_ficwadcom.py | 9 +-- .../adapter_harrypotterfanfictioncom.py | 13 ++-- .../adapters/adapter_mediaminerorg.py | 77 +++++++++++++------ .../adapters/adapter_potionsandsnitchesnet.py | 7 +- .../adapter_thewriterscoffeeshopcom.py | 4 - .../adapters/adapter_twilightednet.py | 2 +- fanficdownloader/writers/writer_epub.py | 2 +- login.html | 2 +- newdownload.py | 2 +- 12 files changed, 73 insertions(+), 59 deletions(-) diff --git a/fanficdownloader/adapters/adapter_adastrafanficcom.py b/fanficdownloader/adapters/adapter_adastrafanficcom.py index 9cdbe637..5bdc1c9b 100644 --- a/fanficdownloader/adapters/adapter_adastrafanficcom.py +++ b/fanficdownloader/adapters/adapter_adastrafanficcom.py @@ -32,7 +32,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','aaff') - self.decode = "utf8" + self.decode = "ISO-8859-1" self.story.addToList("category","Star Trek") self.is_adult=False @@ -48,10 +48,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.adastrafanfic.com' - @classmethod - def getAcceptDomains(cls): - return [cls.getSiteDomain()] - def getSiteExampleURLs(self): return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234" diff --git a/fanficdownloader/adapters/adapter_fictionalleyorg.py b/fanficdownloader/adapters/adapter_fictionalleyorg.py index f7c06ba1..594de50e 100644 --- a/fanficdownloader/adapters/adapter_fictionalleyorg.py +++ b/fanficdownloader/adapters/adapter_fictionalleyorg.py @@ -53,10 +53,6 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.fictionalley.org' - @classmethod - def getAcceptDomains(cls): - return [cls.getSiteDomain()] - def getSiteExampleURLs(self): return "http://"+self.getSiteDomain()+"/authors/drt/DA.html http://"+self.getSiteDomain()+"/authors/drt/JOTP01a.html" diff --git a/fanficdownloader/adapters/adapter_fictionpresscom.py b/fanficdownloader/adapters/adapter_fictionpresscom.py index 578c4863..bc9c2905 100644 --- a/fanficdownloader/adapters/adapter_fictionpresscom.py +++ b/fanficdownloader/adapters/adapter_fictionpresscom.py @@ -43,10 +43,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.fictionpress.com' - @classmethod - def getAcceptDomains(cls): - return ['www.fictionpress.com'] - def getSiteExampleURLs(self): return "http://www.fictionpress.com/s/1234/1/ http://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title" diff --git a/fanficdownloader/adapters/adapter_ficwadcom.py b/fanficdownloader/adapters/adapter_ficwadcom.py index 791c07c2..27821994 100644 --- a/fanficdownloader/adapters/adapter_ficwadcom.py +++ b/fanficdownloader/adapters/adapter_ficwadcom.py @@ -44,10 +44,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.ficwad.com' - @classmethod - def getAcceptDomains(cls): - return ['www.ficwad.com'] - def getSiteExampleURLs(self): return "http://www.ficwad.com/story/137169" @@ -137,8 +133,9 @@ class FicwadComSiteAdapter(BaseSiteAdapter): # warnings # [!!] [R] [V] [Y] spanreq = metap.find("span",{"class":"req"}) - for a in spanreq.findAll("a"): - self.story.addToList('warnings',a['title']) + if spanreq: # can be no warnings. + for a in spanreq.findAll("a"): + self.story.addToList('warnings',a['title']) ## perhaps not the most efficient way to parse this, using ## regexps for each rather than something more complex, but diff --git a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py index 6f1538d7..a24c63f4 100644 --- a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py +++ b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py @@ -32,7 +32,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','hp') - self.decode = "ISO-8859-1" + self.decode = "Windows-1252" # Another site that lies to us. self.story.addToList("category","Harry Potter") self.is_adult=False @@ -115,7 +115,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter): for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')): #javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1' # just in case there's tags, like in chapter titles. - chpt=re.sub(r'^.*?(\?chapterid=433441).*?',r'\1',chapter['href']) + chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href']) self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt)) self.story.setMetadata('numChapters',len(self.chapterUrls)) @@ -178,12 +178,13 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter): soup = bs.BeautifulStoneSoup(self._fetchUrl(url), selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. - span = soup.find('div', {'id' : 'fluidtext'}) + div = soup.find('div', {'id' : 'fluidtext'}) - if None == span: + if None == div: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - - return utf8FromSoup(span) + + print div + return utf8FromSoup(div) def getClass(): return HarryPotterFanFictionComSiteAdapter diff --git a/fanficdownloader/adapters/adapter_mediaminerorg.py b/fanficdownloader/adapters/adapter_mediaminerorg.py index db30dc2d..273bdf78 100644 --- a/fanficdownloader/adapters/adapter_mediaminerorg.py +++ b/fanficdownloader/adapters/adapter_mediaminerorg.py @@ -32,7 +32,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','mm') - self.decode = "utf8" + self.decode = "ISO-8859-1" # get storyId from url--url validation guarantees query correct m = re.match(self.getSiteURLPattern(),url) @@ -50,10 +50,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.mediaminer.org' - @classmethod - def getAcceptDomains(cls): - return [cls.getSiteDomain()] - def getSiteExampleURLs(self): return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c" @@ -61,7 +57,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): ## http://www.mediaminer.org/fanfic/view_st.php/76882 ## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c return re.escape("http://"+self.getSiteDomain())+\ - "/fanfic/view_(st|ch)\.php/"+r"(?P\d+)(/\d+#fic_c)?$" + "/fanfic/view_(st|ch)\.php/"+r"(?P\d+)(/\d+(#fic_c)?)?$" def extractChapterUrlsAndMetadata(self): @@ -79,18 +75,26 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. soup = bs.BeautifulSoup(data) - ## Title - title = soup.find('title').string - ## MediaMiner - Fan Fic: Par Tout Autre Nom - ## MediaMiner: Question and Answer ( One-Shot ) - ## MediaMiner: Moaning to Wake the Dead ( Chapter 1 ) - title = re.match(r'^MediaMiner(?: - Fan Fic)?:(.*?)(?: \( .*? \))?$',title).group(1) - - # [ A - All Readers ], strip '[ ' ' ]' + # [ A - All Readers ], strip '[' ']' + ## Above title because we remove the smtxt font to get title. rating = soup.find("font",{"class":"smtxt"}).string[1:-1] - self.story.setMetadata('title',title) self.story.setMetadata('rating',rating) + ## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'. + ## Atmosphere: Chapter 1 [ P - Pre-Teen ] + ## Hearts of Ice [ P - Pre-Teen ] + ## Suzaku no Princess [ P - Pre-Teen ] + ## The Kraut, The Bartender, and The Drunkard: Chapter 1 [ P - Pre-Teen ] + ## Betrayal and Justice: A Cold Heart ( Chapter 1 ) [ A - All Readers ] + title = soup.find('td',{'class':'ffh'}) + for font in title.findAll('font'): + font.extract() # removes 'font' tags from inside the td. + if title.has_key('colspan') or 'src.php/t/ONE_SHOT' in data: + titlet = title.text + else: + titlet = ':'.join(title.text.split(':')[:-1]) # strip trailing 'Chapter X', but only when no colspan and not one-shot + self.story.setMetadata('title',titlet) + # Find authorid and URL from... author url. a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+")) self.story.setMetadata('authorId',a['href'].split('/')[-1]) @@ -103,7 +107,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): # Find the chapters select = soup.find('select',{'name':'cid'}) if not select: - self.chapterUrls.append((title,self.url)) + self.chapterUrls.append(( self.story.getMetadata('title'),self.url)) else: for option in select.findAll("option"): chapter = stripHTML(option.string) @@ -135,7 +139,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): # Everything else is in metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ') - print metastr # Latest Revision: August 03, 2010 m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr) if m: @@ -171,21 +174,45 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter): logging.debug('Getting chapter text from: %s' % url) - soup = bs.BeautifulStoneSoup(self._fetchUrl(url), + data=self._fetchUrl(url) + soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. anchor = soup.find('a',{'name':'fic_c'}) if None == anchor: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - - for div in anchor.findAllNext('div',{'align':'left'}): - div.name='p' # convert to

mediaminer uses div with a - # margin for paragraphs. - anchor.append(div) # cheat! stuff all the content divs - # into anchor just as a holder. + + ## find divs with align=left, those are paragraphs in newer stories. + divlist = anchor.findAllNext('div',{'align':'left'}) + if divlist: + for div in divlist: + div.name='p' # convert to

mediaminer uses div with + # a margin for paragraphs. + anchor.append(div) # cheat! stuff all the content + # divs into anchor just as a + # holder. + del div['style'] + del div['align'] + anchor.name='div' + return utf8FromSoup(anchor) + + else: + logging.debug('Using kludgey text find for older mediaminer story.') + ## Some older mediaminer stories are unparsable with BeautifulSoup. + ## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first. + ## Story stuff falls between: + data = "

" + data[data.find(''):] +"
" + soup = bs.BeautifulStoneSoup(data, + selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. + for tag in soup.findAll('td',{'class':'ffh'}) + \ + soup.findAll('div',{'class':'acl'}) + \ + soup.findAll('div',{'class':'footer smtxt'}) + \ + soup.findAll('table',{'class':'tbbrdr'}): + tag.extract() # remove tag from soup. + + return utf8FromSoup(soup) - return utf8FromSoup(anchor) def getClass(): return MediaMinerOrgSiteAdapter diff --git a/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py b/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py index 8becddc1..5d898b34 100644 --- a/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py +++ b/fanficdownloader/adapters/adapter_potionsandsnitchesnet.py @@ -32,7 +32,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','pns') - self.decode = "utf8" + self.decode = "ISO-8859-1" self.story.addToList("category","Harry Potter") # get storyId from url--url validation guarantees query is only sid=1234 @@ -171,6 +171,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter): selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. span = soup.find('div', {'id' : 'story'}) + for p in span.findAll('p'): + if p.has_key('style'): + del p['style'] + if p.has_key('class'): + del p['class'] if None == span: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) diff --git a/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py b/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py index f11d37e6..0c1d857a 100644 --- a/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py +++ b/fanficdownloader/adapters/adapter_thewriterscoffeeshopcom.py @@ -49,10 +49,6 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter): def getSiteDomain(): return 'www.thewriterscoffeeshop.com' - @classmethod - def getAcceptDomains(cls): - return [cls.getSiteDomain()] - def getSiteExampleURLs(self): return "http://"+self.getSiteDomain()+"/library/viewstory.php?sid=1234" diff --git a/fanficdownloader/adapters/adapter_twilightednet.py b/fanficdownloader/adapters/adapter_twilightednet.py index 6b716912..96d838de 100644 --- a/fanficdownloader/adapters/adapter_twilightednet.py +++ b/fanficdownloader/adapters/adapter_twilightednet.py @@ -32,7 +32,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','tw') - self.decode = "utf8" + self.decode = "ISO-8859-1" ## tw *lies*. It claims to be UTF8 in the headers, but it isn't. "utf8" self.story.addToList("category","Twilight") self.username = "NoneGiven" # if left empty, site doesn't return any message at all. self.password = "" diff --git a/fanficdownloader/writers/writer_epub.py b/fanficdownloader/writers/writer_epub.py index 7a23ce97..28bab789 100644 --- a/fanficdownloader/writers/writer_epub.py +++ b/fanficdownloader/writers/writer_epub.py @@ -144,7 +144,7 @@ h6 { text-align: center; } -

${chapter}

+

${chapter}

''') self.EPUB_CHAPTER_END = string.Template(''' diff --git a/login.html b/login.html index bd316b9d..6bbf5f28 100644 --- a/login.html +++ b/login.html @@ -2,7 +2,7 @@ - Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza + Login Needed Fanfiction Downloader