Fixes to various adapters from first-round testing, and some code cleanup.

This commit is contained in:
Jim Miller 2011-05-13 21:34:58 -05:00
parent a88d4cac50
commit df9e877865
12 changed files with 73 additions and 59 deletions

View file

@ -32,7 +32,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','aaff')
self.decode = "utf8"
self.decode = "ISO-8859-1"
self.story.addToList("category","Star Trek")
self.is_adult=False
@ -48,10 +48,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.adastrafanfic.com'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"

View file

@ -53,10 +53,6 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionalley.org'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/authors/drt/DA.html http://"+self.getSiteDomain()+"/authors/drt/JOTP01a.html"

View file

@ -43,10 +43,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionpress.com'
@classmethod
def getAcceptDomains(cls):
return ['www.fictionpress.com']
def getSiteExampleURLs(self):
return "http://www.fictionpress.com/s/1234/1/ http://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title"

View file

@ -44,10 +44,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.ficwad.com'
@classmethod
def getAcceptDomains(cls):
return ['www.ficwad.com']
def getSiteExampleURLs(self):
return "http://www.ficwad.com/story/137169"
@ -137,8 +133,9 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# warnings
# <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
spanreq = metap.find("span",{"class":"req"})
for a in spanreq.findAll("a"):
self.story.addToList('warnings',a['title'])
if spanreq: # can be no warnings.
for a in spanreq.findAll("a"):
self.story.addToList('warnings',a['title'])
## perhaps not the most efficient way to parse this, using
## regexps for each rather than something more complex, but

View file

@ -32,7 +32,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','hp')
self.decode = "ISO-8859-1"
self.decode = "Windows-1252" # Another site that lies to us. <rolls eyes>
self.story.addToList("category","Harry Potter")
self.is_adult=False
@ -115,7 +115,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
#javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
# just in case there's tags, like <i> in chapter titles.
chpt=re.sub(r'^.*?(\?chapterid=433441).*?',r'\1',chapter['href'])
chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href'])
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -178,12 +178,13 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'fluidtext'})
div = soup.find('div', {'id' : 'fluidtext'})
if None == span:
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
print div
return utf8FromSoup(div)
def getClass():
return HarryPotterFanFictionComSiteAdapter

View file

@ -32,7 +32,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','mm')
self.decode = "utf8"
self.decode = "ISO-8859-1"
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
@ -50,10 +50,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.mediaminer.org'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
@ -61,7 +57,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
## http://www.mediaminer.org/fanfic/view_st.php/76882
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
return re.escape("http://"+self.getSiteDomain())+\
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+#fic_c)?$"
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
def extractChapterUrlsAndMetadata(self):
@ -79,18 +75,26 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
## Title
title = soup.find('title').string
## MediaMiner - Fan Fic: Par Tout Autre Nom
## MediaMiner: Question and Answer ( One-Shot )
## MediaMiner: Moaning to Wake the Dead ( Chapter 1 )
title = re.match(r'^MediaMiner(?: - Fan Fic)?:(.*?)(?: \( .*? \))?$',title).group(1)
# [ A - All Readers ], strip '[ ' ' ]'
# [ A - All Readers ], strip '[' ']'
## Above title because we remove the smtxt font to get title.
rating = soup.find("font",{"class":"smtxt"}).string[1:-1]
self.story.setMetadata('title',title)
self.story.setMetadata('rating',rating)
## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'.
## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
title = soup.find('td',{'class':'ffh'})
for font in title.findAll('font'):
font.extract() # removes 'font' tags from inside the td.
if title.has_key('colspan') or 'src.php/t/ONE_SHOT' in data:
titlet = title.text
else:
titlet = ':'.join(title.text.split(':')[:-1]) # strip trailing 'Chapter X', but only when no colspan and not one-shot
self.story.setMetadata('title',titlet)
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[-1])
@ -103,7 +107,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Find the chapters
select = soup.find('select',{'name':'cid'})
if not select:
self.chapterUrls.append((title,self.url))
self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
else:
for option in select.findAll("option"):
chapter = stripHTML(option.string)
@ -135,7 +139,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Everything else is in <tr bgcolor="#EEEED4">
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
print metastr
# Latest Revision: August 03, 2010
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
if m:
@ -171,21 +174,45 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
data=self._fetchUrl(url)
soup = bs.BeautifulStoneSoup(data,
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
anchor = soup.find('a',{'name':'fic_c'})
if None == anchor:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
for div in anchor.findAllNext('div',{'align':'left'}):
div.name='p' # convert to <p> mediaminer uses div with a
# margin for paragraphs.
anchor.append(div) # cheat! stuff all the content divs
# into anchor just as a holder.
## find divs with align=left, those are paragraphs in newer stories.
divlist = anchor.findAllNext('div',{'align':'left'})
if divlist:
for div in divlist:
div.name='p' # convert to <p> mediaminer uses div with
# a margin for paragraphs.
anchor.append(div) # cheat! stuff all the content
# divs into anchor just as a
# holder.
del div['style']
del div['align']
anchor.name='div'
return utf8FromSoup(anchor)
else:
logging.debug('Using kludgey text find for older mediaminer story.')
## Some older mediaminer stories are unparsable with BeautifulSoup.
## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first.
## Story stuff falls between:
data = "<div id='HERE'>" + data[data.find('<a name="fic_c">'):] +"</div>"
soup = bs.BeautifulStoneSoup(data,
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
for tag in soup.findAll('td',{'class':'ffh'}) + \
soup.findAll('div',{'class':'acl'}) + \
soup.findAll('div',{'class':'footer smtxt'}) + \
soup.findAll('table',{'class':'tbbrdr'}):
tag.extract() # remove tag from soup.
return utf8FromSoup(soup)
return utf8FromSoup(anchor)
def getClass():
return MediaMinerOrgSiteAdapter

View file

@ -32,7 +32,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','pns')
self.decode = "utf8"
self.decode = "ISO-8859-1"
self.story.addToList("category","Harry Potter")
# get storyId from url--url validation guarantees query is only sid=1234
@ -171,6 +171,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'story'})
for p in span.findAll('p'):
if p.has_key('style'):
del p['style']
if p.has_key('class'):
del p['class']
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

View file

@ -49,10 +49,6 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.thewriterscoffeeshop.com'
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/library/viewstory.php?sid=1234"

View file

@ -32,7 +32,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','tw')
self.decode = "utf8"
self.decode = "ISO-8859-1" ## tw *lies*. It claims to be UTF8 in the headers, but it isn't. "utf8"
self.story.addToList("category","Twilight")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""

View file

@ -144,7 +144,7 @@ h6 { text-align: center; }
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h2>${chapter}</h2>
<h3>${chapter}</h3>
''')
self.EPUB_CHAPTER_END = string.Template('''

View file

@ -2,7 +2,7 @@
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
<title>Login Needed Fanfiction Downloader</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
<script type="text/javascript">

View file

@ -17,7 +17,7 @@
import logging
## XXX cli option for logging level.
logging.basicConfig(level=logging.WARN,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
import sys, os
from optparse import OptionParser