mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-08 12:36:11 +02:00
Fixes to various issues found in first-round testing, and some code cleanup.
This commit is contained in:
parent
a88d4cac50
commit
df9e877865
12 changed files with 73 additions and 59 deletions
|
|
@ -32,7 +32,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','aaff')
|
||||
self.decode = "utf8"
|
||||
self.decode = "ISO-8859-1"
|
||||
self.story.addToList("category","Star Trek")
|
||||
self.is_adult=False
|
||||
|
||||
|
|
@ -48,10 +48,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.adastrafanfic.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return [cls.getSiteDomain()]
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
||||
|
|
|
|||
|
|
@ -53,10 +53,6 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.fictionalley.org'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return [cls.getSiteDomain()]
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://"+self.getSiteDomain()+"/authors/drt/DA.html http://"+self.getSiteDomain()+"/authors/drt/JOTP01a.html"
|
||||
|
||||
|
|
|
|||
|
|
@ -43,10 +43,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.fictionpress.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.fictionpress.com']
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://www.fictionpress.com/s/1234/1/ http://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title"
|
||||
|
||||
|
|
|
|||
|
|
@ -44,10 +44,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.ficwad.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.ficwad.com']
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://www.ficwad.com/story/137169"
|
||||
|
||||
|
|
@ -137,8 +133,9 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
# warnings
|
||||
# <span class="req"><a href="/help/38" title="Medium Spoilers">[!!] </a> <a href="/help/38" title="Rape/Sexual Violence">[R] </a> <a href="/help/38" title="Violence">[V] </a> <a href="/help/38" title="Child/Underage Sex">[Y] </a></span>
|
||||
spanreq = metap.find("span",{"class":"req"})
|
||||
for a in spanreq.findAll("a"):
|
||||
self.story.addToList('warnings',a['title'])
|
||||
if spanreq: # can be no warnings.
|
||||
for a in spanreq.findAll("a"):
|
||||
self.story.addToList('warnings',a['title'])
|
||||
|
||||
## perhaps not the most efficient way to parse this, using
|
||||
## regexps for each rather than something more complex, but
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','hp')
|
||||
self.decode = "ISO-8859-1"
|
||||
self.decode = "Windows-1252" # Another site that lies to us. <rolls eyes>
|
||||
self.story.addToList("category","Harry Potter")
|
||||
self.is_adult=False
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
|||
for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
|
||||
#javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
chpt=re.sub(r'^.*?(\?chapterid=433441).*?',r'\1',chapter['href'])
|
||||
chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href'])
|
||||
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
|
@ -178,12 +178,13 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
|||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
span = soup.find('div', {'id' : 'fluidtext'})
|
||||
div = soup.find('div', {'id' : 'fluidtext'})
|
||||
|
||||
if None == span:
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
|
||||
print div
|
||||
return utf8FromSoup(div)
|
||||
|
||||
def getClass():
|
||||
return HarryPotterFanFictionComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','mm')
|
||||
self.decode = "utf8"
|
||||
self.decode = "ISO-8859-1"
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
|
|
@ -50,10 +50,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.mediaminer.org'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return [cls.getSiteDomain()]
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
|
||||
|
||||
|
|
@ -61,7 +57,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
## http://www.mediaminer.org/fanfic/view_st.php/76882
|
||||
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
|
||||
return re.escape("http://"+self.getSiteDomain())+\
|
||||
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+#fic_c)?$"
|
||||
"/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
|
|
@ -79,18 +75,26 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
## Title
|
||||
title = soup.find('title').string
|
||||
## MediaMiner - Fan Fic: Par Tout Autre Nom
|
||||
## MediaMiner: Question and Answer ( One-Shot )
|
||||
## MediaMiner: Moaning to Wake the Dead ( Chapter 1 )
|
||||
title = re.match(r'^MediaMiner(?: - Fan Fic)?:(.*?)(?: \( .*? \))?$',title).group(1)
|
||||
|
||||
# [ A - All Readers ], strip '[ ' ' ]'
|
||||
# [ A - All Readers ], strip '[' ']'
|
||||
## Above title because we remove the smtxt font to get title.
|
||||
rating = soup.find("font",{"class":"smtxt"}).string[1:-1]
|
||||
self.story.setMetadata('title',title)
|
||||
self.story.setMetadata('rating',rating)
|
||||
|
||||
## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'.
|
||||
## <td class="ffh">Atmosphere: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td colspan=2 class="ffh">Hearts of Ice <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td colspan=2 class="ffh">Suzaku no Princess <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td class="ffh">The Kraut, The Bartender, and The Drunkard: Chapter 1</b> <font class="smtxt">[ P - Pre-Teen ]</font></td>
|
||||
## <td class="ffh">Betrayal and Justice: A Cold Heart</b> <font size="-1">( Chapter 1 )</font> <font class="smtxt">[ A - All Readers ]</font></td>
|
||||
title = soup.find('td',{'class':'ffh'})
|
||||
for font in title.findAll('font'):
|
||||
font.extract() # removes 'font' tags from inside the td.
|
||||
if title.has_key('colspan') or 'src.php/t/ONE_SHOT' in data:
|
||||
titlet = title.text
|
||||
else:
|
||||
titlet = ':'.join(title.text.split(':')[:-1]) # strip trailing 'Chapter X', but only when no colspan and not one-shot
|
||||
self.story.setMetadata('title',titlet)
|
||||
|
||||
# Find authorid and URL from... author url.
|
||||
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
|
||||
self.story.setMetadata('authorId',a['href'].split('/')[-1])
|
||||
|
|
@ -103,7 +107,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
# Find the chapters
|
||||
select = soup.find('select',{'name':'cid'})
|
||||
if not select:
|
||||
self.chapterUrls.append((title,self.url))
|
||||
self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
|
||||
else:
|
||||
for option in select.findAll("option"):
|
||||
chapter = stripHTML(option.string)
|
||||
|
|
@ -135,7 +139,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
# Everything else is in <tr bgcolor="#EEEED4">
|
||||
|
||||
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
|
||||
print metastr
|
||||
# Latest Revision: August 03, 2010
|
||||
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
|
||||
if m:
|
||||
|
|
@ -171,21 +174,45 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
logging.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
data=self._fetchUrl(url)
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
anchor = soup.find('a',{'name':'fic_c'})
|
||||
|
||||
if None == anchor:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
for div in anchor.findAllNext('div',{'align':'left'}):
|
||||
div.name='p' # convert to <p> mediaminer uses div with a
|
||||
# margin for paragraphs.
|
||||
anchor.append(div) # cheat! stuff all the content divs
|
||||
# into anchor just as a holder.
|
||||
|
||||
## find divs with align=left, those are paragraphs in newer stories.
|
||||
divlist = anchor.findAllNext('div',{'align':'left'})
|
||||
if divlist:
|
||||
for div in divlist:
|
||||
div.name='p' # convert to <p> mediaminer uses div with
|
||||
# a margin for paragraphs.
|
||||
anchor.append(div) # cheat! stuff all the content
|
||||
# divs into anchor just as a
|
||||
# holder.
|
||||
del div['style']
|
||||
del div['align']
|
||||
anchor.name='div'
|
||||
return utf8FromSoup(anchor)
|
||||
|
||||
else:
|
||||
logging.debug('Using kludgey text find for older mediaminer story.')
|
||||
## Some older mediaminer stories are unparsable with BeautifulSoup.
|
||||
## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first.
|
||||
## Story stuff falls between:
|
||||
data = "<div id='HERE'>" + data[data.find('<a name="fic_c">'):] +"</div>"
|
||||
soup = bs.BeautifulStoneSoup(data,
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
for tag in soup.findAll('td',{'class':'ffh'}) + \
|
||||
soup.findAll('div',{'class':'acl'}) + \
|
||||
soup.findAll('div',{'class':'footer smtxt'}) + \
|
||||
soup.findAll('table',{'class':'tbbrdr'}):
|
||||
tag.extract() # remove tag from soup.
|
||||
|
||||
return utf8FromSoup(soup)
|
||||
|
||||
return utf8FromSoup(anchor)
|
||||
|
||||
def getClass():
|
||||
return MediaMinerOrgSiteAdapter
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','pns')
|
||||
self.decode = "utf8"
|
||||
self.decode = "ISO-8859-1"
|
||||
self.story.addToList("category","Harry Potter")
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
|
|
@ -171,6 +171,11 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
|
|||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
span = soup.find('div', {'id' : 'story'})
|
||||
for p in span.findAll('p'):
|
||||
if p.has_key('style'):
|
||||
del p['style']
|
||||
if p.has_key('class'):
|
||||
del p['class']
|
||||
|
||||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
|
|
|||
|
|
@ -49,10 +49,6 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
|
|||
def getSiteDomain():
|
||||
return 'www.thewriterscoffeeshop.com'
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return [cls.getSiteDomain()]
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://"+self.getSiteDomain()+"/library/viewstory.php?sid=1234"
|
||||
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','tw')
|
||||
self.decode = "utf8"
|
||||
self.decode = "ISO-8859-1" ## tw *lies*. It claims to be UTF8 in the headers, but it isn't. "utf8"
|
||||
self.story.addToList("category","Twilight")
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ h6 { text-align: center; }
|
|||
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
|
||||
</head>
|
||||
<body>
|
||||
<h2>${chapter}</h2>
|
||||
<h3>${chapter}</h3>
|
||||
''')
|
||||
|
||||
self.EPUB_CHAPTER_END = string.Template('''
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
<html>
|
||||
<head>
|
||||
<link href="/css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
|
||||
<title>Login Needed Fanfiction Downloader</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
|
||||
<script type="text/javascript">
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
import logging
|
||||
## XXX cli option for logging level.
|
||||
logging.basicConfig(level=logging.WARN,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
|
||||
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
|
||||
|
||||
import sys, os
|
||||
from optparse import OptionParser
|
||||
|
|
|
|||
Loading…
Reference in a new issue