Added fimfiction.net to index.html

Added fimfiction.net to defaults.ini
Add mature fic handling to fimfiction.net
More robust handling of story descriptions from fimfiction.net
Added 'characters' list
This commit is contained in:
althaine 2011-09-25 15:23:06 +10:00
parent 88bfd4ec58
commit 070096e9cb
4 changed files with 43 additions and 13 deletions

View file

@ -36,6 +36,7 @@ formatext_label:File Extension
## Sometimes there are multiple categories and/or genres.
category_label:Category
genre_label:Genre
characters_label:Characters
## Completed/In-Progress
status_label:Status
## Dates story first published, last updated, and downloaded(last with time).
@ -65,7 +66,7 @@ version_label:FFD Version
## items to include in the title page
## Empty entries will *not* appear, even if in the list.
## All current formats already include title and author.
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
titlepage_entries: category,genre,characters,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,description
## include title page as first page.
include_titlepage: true
@ -129,7 +130,7 @@ zip_output: false
## entries to make epub subject tags
## lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
include_subject_tags: extratags, genre, category, lastupdate, status
include_subject_tags: extratags, genre, category, characters, lastupdate, status
## epub carries the TOC in metadata.
## mobi generated from epub will have a TOC at the end.

View file

@ -19,7 +19,7 @@ import time
import logging
import re
import urllib2
from datetime import datetime
import cookielib as cl
import fanficdownloader.BeautifulSoup as bs
from fanficdownloader.htmlcleanup import stripHTML
@ -37,23 +37,41 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('siteabbrev','fimficnet')
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
self._setURL("http://"+self.getSiteDomain()+"/story/"+self.story.getMetadata('storyId')+"/")
self.is_adult = False
@staticmethod
def getSiteDomain():
return 'www.fimfiction.net'
@classmethod
def getAcceptDomains(cls):
return ['www.fimfiction.net','mobile.fimfiction.net', 'www.fimfiction.com']
# mobile.fimifction.com isn't actually a valid domain, but we can still get the story id from URLs anyway
return ['www.fimfiction.net','mobile.fimfiction.net', 'www.fimfiction.com', 'mobile.fimfiction.com']
def getSiteExampleURLs(self):
return "http://www.fimfiction.net/story/1234/story-title-here http://www.fimfiction.net/story/1234/ http://www.fimfiction.com/story/1234/ http://mobile.fimfiction.net/story/1234/"
return "http://www.fimfiction.net/story/1234/story-title-here http://www.fimfiction.net/story/1234/ http://www.fimfiction.com/story/1234/1/ http://mobile.fimfiction.net/story/1234/1/story-title-here/chapter-title-here"
def getSiteURLPattern(self):
return r"http://(www|mobile)\.fimfiction\.(net|com)/story/\d+/?.*"
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
cookieproc = urllib2.HTTPCookieProcessor()
cookie = cl.Cookie(version=0, name='view_mature', value='true',
port=None, port_specified=False,
domain=self.getSiteDomain(), domain_specified=False, domain_initial_dot=False,
path='/story', path_specified=True,
secure=False,
expires=time.time()+10000,
discard=False,
comment=None,
comment_url=None,
rest={'HttpOnly': None},
rfc2109=False)
cookieproc.cookiejar.set_cookie(cookie)
self.opener = urllib2.build_opener(cookieproc)
try:
data = self._fetchUrl(self.url)
except urllib2.HTTPError, e:
@ -65,6 +83,9 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
if "Warning: mysql_fetch_array(): supplied argument is not a valid MySQL result resource" in data:
raise exceptions.StoryDoesNotExist(self.url)
if "This story has been marked as having adult content." in data:
raise exceptions.AdultCheckRequired(self.url)
soup = bs.BeautifulSoup(data).find("div", {"class":"content_box post_content_box"})
title, author = [link.text for link in soup.find("h2").findAll("a")]
@ -73,12 +94,12 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
self.story.setMetadata("authorId", author) # The author's name will be unique
self.story.setMetadata("authorUrl", "http://%s/user/%s" % (self.getSiteDomain(),author))
self.chapterUrls = [(chapter.text, "http://"+self.getSiteDomain() + chapter['href']) for chapter in soup.findAll("a", "chapter_link")]
self.chapterUrls = [(chapter.text, "http://"+self.getSiteDomain() + chapter['href']) for chapter in soup.findAll("a", {"class":"chapter_link"})]
self.story.setMetadata('numChapters',len(self.chapterUrls))
for character in [character_icon['title'] for character_icon in soup.findAll("a", {"class":"character_icon"})]:
self.story.addToList("category", character)
self.story.addToList("characters", character)
for category in [category.text for category in soup.find("div", {"class":"categories"}).findAll("a")]:
self.story.addToList("category", category)
self.story.addToList("category", "My Little Pony")
@ -102,15 +123,14 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
# Sometimes the description has an expanding element
# This removes the ellipsis and the expand button
try:
description_soup.span.extract()
description_soup.a.extract()
description_soup.find('span', {"id":re.compile(r"description_more_elipses_\d+")}).extract() # Web designer can't spell 'ellipsis'
description_soup.find('a', {"class":"more"}).extract()
except:
pass
self.story.setMetadata('description', description_soup.text)
# Unfortunately, nowhere on the page is the year mentioned. Because we would much rather update the story needlessly
# than miss an update, we hardcode the year of creation to be 2011 (when the site was created) and the year in which
# the story was updated to the current one. Nevertheless, it may be prudent to always force an update in defaults.ini
# than miss an update, we hardcode the year of creation and update to be 2011.
# Get the date of creation from the first chapter
datePublished_soup = bs.BeautifulSoup(self._fetchUrl(self.chapterUrls[0][1])).find("div", {"class":"calendar"})
@ -119,7 +139,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
self.story.setMetadata("datePublished", datePublished)
dateUpdated_soup = bs.BeautifulSoup(data).find("div", {"class":"calendar"})
dateUpdated_soup.find('span').extract()
dateUpdated = makeDate(str(datetime.now().year)+dateUpdated_soup.text, "%Y%b%d")
dateUpdated = makeDate("2011"+dateUpdated_soup.text, "%Y%b%d")
self.story.setMetadata("dateUpdated", dateUpdated)
def getChapterText(self, url):

View file

@ -49,6 +49,7 @@ class BaseStoryWriter(Configurable):
self.validEntries = [
'category',
'genre',
'characters',
'status',
'datePublished',
'dateUpdated',
@ -76,6 +77,7 @@ class BaseStoryWriter(Configurable):
'category':'Category',
'genre':'Genre',
'status':'Status',
'characters':'Characters',
'datePublished':'Published',
'dateUpdated':'Updated',
'dateCreated':'Packaged',

View file

@ -190,6 +190,13 @@
Use the URL of the story's chapter list, such as
<br /><a href="http://fanfic.castletv.net/viewstory.php?sid=123">http://fanfic.castletv.net/viewstory.php?sid=123</a>.
</dd>
<dt>fimfiction.net</dt>
<dd>
Use the URL of the story's chapter list, such as
<br /><a href="http://www.fimfiction.com/story/123/">http://www.fimfiction.com/story/123/</a>
<br /> or the URL of any chapter, such as
<br /><a href="http://www.fimfiction.com/story/123/1/">http://www.fimfiction.com/story/123/1/</a>.
</dd>
</dl>