diff --git a/fanficdownloader/adapters/adapter_adastrafanficcom.py b/fanficdownloader/adapters/adapter_adastrafanficcom.py
index 9cdbe637..5bdc1c9b 100644
--- a/fanficdownloader/adapters/adapter_adastrafanficcom.py
+++ b/fanficdownloader/adapters/adapter_adastrafanficcom.py
@@ -32,7 +32,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','aaff')
- self.decode = "utf8"
+ self.decode = "ISO-8859-1"
self.story.addToList("category","Star Trek")
self.is_adult=False
@@ -48,10 +48,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.adastrafanfic.com'
- @classmethod
- def getAcceptDomains(cls):
- return [cls.getSiteDomain()]
-
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
diff --git a/fanficdownloader/adapters/adapter_fictionalleyorg.py b/fanficdownloader/adapters/adapter_fictionalleyorg.py
index f7c06ba1..594de50e 100644
--- a/fanficdownloader/adapters/adapter_fictionalleyorg.py
+++ b/fanficdownloader/adapters/adapter_fictionalleyorg.py
@@ -53,10 +53,6 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionalley.org'
- @classmethod
- def getAcceptDomains(cls):
- return [cls.getSiteDomain()]
-
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/authors/drt/DA.html http://"+self.getSiteDomain()+"/authors/drt/JOTP01a.html"
diff --git a/fanficdownloader/adapters/adapter_fictionpresscom.py b/fanficdownloader/adapters/adapter_fictionpresscom.py
index 578c4863..bc9c2905 100644
--- a/fanficdownloader/adapters/adapter_fictionpresscom.py
+++ b/fanficdownloader/adapters/adapter_fictionpresscom.py
@@ -43,10 +43,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.fictionpress.com'
- @classmethod
- def getAcceptDomains(cls):
- return ['www.fictionpress.com']
-
def getSiteExampleURLs(self):
return "http://www.fictionpress.com/s/1234/1/ http://www.fictionpress.com/s/1234/12/ http://www.fictionpress.com/s/1234/1/Story_Title"
diff --git a/fanficdownloader/adapters/adapter_ficwadcom.py b/fanficdownloader/adapters/adapter_ficwadcom.py
index 791c07c2..27821994 100644
--- a/fanficdownloader/adapters/adapter_ficwadcom.py
+++ b/fanficdownloader/adapters/adapter_ficwadcom.py
@@ -44,10 +44,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.ficwad.com'
- @classmethod
- def getAcceptDomains(cls):
- return ['www.ficwad.com']
-
def getSiteExampleURLs(self):
return "http://www.ficwad.com/story/137169"
@@ -137,8 +133,9 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# warnings
# [!!] [R] [V] [Y]
spanreq = metap.find("span",{"class":"req"})
- for a in spanreq.findAll("a"):
- self.story.addToList('warnings',a['title'])
+ if spanreq: # can be no warnings.
+ for a in spanreq.findAll("a"):
+ self.story.addToList('warnings',a['title'])
## perhaps not the most efficient way to parse this, using
## regexps for each rather than something more complex, but
diff --git a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
index 6f1538d7..a24c63f4 100644
--- a/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
+++ b/fanficdownloader/adapters/adapter_harrypotterfanfictioncom.py
@@ -32,7 +32,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','hp')
- self.decode = "ISO-8859-1"
+ self.decode = "Windows-1252" # Another site that lies to us.
self.story.addToList("category","Harry Potter")
self.is_adult=False
@@ -115,7 +115,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
for chapter in tablelist.findAll('a', href=re.compile(r'\?chapterid=\d+')):
#javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?chapterid=433441&i=1'
# just in case there's tags, like in chapter titles.
- chpt=re.sub(r'^.*?(\?chapterid=433441).*?',r'\1',chapter['href'])
+ chpt=re.sub(r'^.*?(\?chapterid=\d+).*?',r'\1',chapter['href'])
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php'+chpt))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@@ -178,12 +178,12 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
         soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                              selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
-        span = soup.find('div', {'id' : 'fluidtext'})
+        div = soup.find('div', {'id' : 'fluidtext'})
-        if None == span:
+        if None == div:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
-
+
+        return utf8FromSoup(div)
def getClass():
return HarryPotterFanFictionComSiteAdapter
diff --git a/fanficdownloader/adapters/adapter_mediaminerorg.py b/fanficdownloader/adapters/adapter_mediaminerorg.py
index db30dc2d..273bdf78 100644
--- a/fanficdownloader/adapters/adapter_mediaminerorg.py
+++ b/fanficdownloader/adapters/adapter_mediaminerorg.py
@@ -32,7 +32,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','mm')
- self.decode = "utf8"
+ self.decode = "ISO-8859-1"
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
@@ -50,10 +50,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def getSiteDomain():
return 'www.mediaminer.org'
- @classmethod
- def getAcceptDomains(cls):
- return [cls.getSiteDomain()]
-
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfic/view_st.php/123456 http://"+self.getSiteDomain()+"/fanfic/view_ch.php/1234123/123444#fic_c"
@@ -61,7 +57,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
## http://www.mediaminer.org/fanfic/view_st.php/76882
## http://www.mediaminer.org/fanfic/view_ch.php/167618/594087#fic_c
return re.escape("http://"+self.getSiteDomain())+\
-            "/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+#fic_c)?$"
+            "/fanfic/view_(st|ch)\.php/"+r"(?P<id>\d+)(/\d+(#fic_c)?)?$"
def extractChapterUrlsAndMetadata(self):
@@ -79,18 +75,26 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
- ## Title
- title = soup.find('title').string
- ## MediaMiner - Fan Fic: Par Tout Autre Nom
- ## MediaMiner: Question and Answer ( One-Shot )
- ## MediaMiner: Moaning to Wake the Dead ( Chapter 1 )
- title = re.match(r'^MediaMiner(?: - Fan Fic)?:(.*?)(?: \( .*? \))?$',title).group(1)
-
- # [ A - All Readers ], strip '[ ' ' ]'
+ # [ A - All Readers ], strip '[' ']'
+ ## Above title because we remove the smtxt font to get title.
rating = soup.find("font",{"class":"smtxt"}).string[1:-1]
- self.story.setMetadata('title',title)
self.story.setMetadata('rating',rating)
+ ## Title - Good grief. Title varies by chaptered, 1chapter and 'type=one shot'.
+        ## Atmosphere: Chapter 1 [ P - Pre-Teen ]
+        ## Hearts of Ice [ P - Pre-Teen ]
+        ## Suzaku no Princess [ P - Pre-Teen ]
+        ## The Kraut, The Bartender, and The Drunkard: Chapter 1 [ P - Pre-Teen ]
+        ## Betrayal and Justice: A Cold Heart ( Chapter 1 ) [ A - All Readers ]
+ title = soup.find('td',{'class':'ffh'})
+ for font in title.findAll('font'):
+ font.extract() # removes 'font' tags from inside the td.
+ if title.has_key('colspan') or 'src.php/t/ONE_SHOT' in data:
+ titlet = title.text
+ else:
+ titlet = ':'.join(title.text.split(':')[:-1]) # strip trailing 'Chapter X', but only when no colspan and not one-shot
+ self.story.setMetadata('title',titlet)
+
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"/fanfic/src.php/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[-1])
@@ -103,7 +107,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Find the chapters
select = soup.find('select',{'name':'cid'})
if not select:
- self.chapterUrls.append((title,self.url))
+ self.chapterUrls.append(( self.story.getMetadata('title'),self.url))
else:
for option in select.findAll("option"):
chapter = stripHTML(option.string)
@@ -135,7 +139,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
         # Everything else is in <tr bgcolor="#EEEED4">
metastr = stripHTML(soup.find("tr",{"bgcolor":"#EEEED4"})).replace('\n',' ').replace('\r',' ').replace('\t',' ')
- print metastr
# Latest Revision: August 03, 2010
m = re.match(r".*?(?:Latest Revision|Uploaded On): ([a-zA-Z]+ \d\d, \d\d\d\d)",metastr)
if m:
@@ -171,21 +174,45 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
logging.debug('Getting chapter text from: %s' % url)
- soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
+ data=self._fetchUrl(url)
+ soup = bs.BeautifulStoneSoup(data,
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
anchor = soup.find('a',{'name':'fic_c'})
if None == anchor:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
-
- for div in anchor.findAllNext('div',{'align':'left'}):
-            div.name='p' # convert to <p>.  mediaminer uses div with a
- # margin for paragraphs.
- anchor.append(div) # cheat! stuff all the content divs
- # into anchor just as a holder.
+
+ ## find divs with align=left, those are paragraphs in newer stories.
+ divlist = anchor.findAllNext('div',{'align':'left'})
+ if divlist:
+ for div in divlist:
+                div.name='p' # convert to <p>.  mediaminer uses div with
+ # a margin for paragraphs.
+ anchor.append(div) # cheat! stuff all the content
+ # divs into anchor just as a
+ # holder.
+ del div['style']
+ del div['align']
+ anchor.name='div'
+ return utf8FromSoup(anchor)
+
+ else:
+ logging.debug('Using kludgey text find for older mediaminer story.')
+ ## Some older mediaminer stories are unparsable with BeautifulSoup.
+ ## Really nasty formatting. Sooo... Cheat! Parse it ourselves a bit first.
+ ## Story stuff falls between:
+ data = "