# -*- coding: utf-8 -*-
#
# Adapter for downloading stories from www.whofic.com ("A Teaspoon and an
# Open Mind", the Doctor Who fan fiction archive).  Python 2 module;
# reconstructed and cleaned from a whitespace-mangled patch.

import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime

from adapter import *


class Whofic(FanfictionSiteAdapter):
    """Site adapter for www.whofic.com story pages (viewstory.php?sid=NNN).

    Chapter text is scraped from the story pages; story metadata is not
    present there, so it is scraped from the author's story-list page.
    """

    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyId = '0'
        # Placeholder dates until the author page supplies real ones.
        # (The original used leading-zero literals, 01/31 -- py2
        # octal-style and a syntax error in py3; decimal is identical.)
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.subjects.append('A Teaspoon and an Open Mind')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = ''
        self.category = ''
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.outputName = ''
        self.outputStorySep = '-whof_'

        # Extract sid (story id) and note whether a specific chapter was
        # requested.  Use the stdlib query parser instead of hand-splitting
        # on '?', '&' and '='; URLs pasted from HTML may have '&amp;'
        # escapes, so normalize those first (as the original did).
        query = up.parse_qs(parsedUrl.query.replace('&amp;', '&'))
        if 'sid' in query:
            self.storyId = query['sid'][0]
        self.chapurl = 'chapter' in query

        # Canonical story URL: always point at the chapter index.
        self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
        logging.debug('self.url=%s' % self.url)

        logging.debug("Created Whofic: url=%s" % (self.url))

    def requiresLogin(self, url=None):
        # whofic.com stories are publicly readable; no login needed.
        return False

    def extractIndividualUrls(self):
        """Fetch the story's first chapter page and the author page.

        Returns a list of (url, chapter_title) tuples, one per chapter,
        and fills in the story metadata fields as a side effect.
        Raises StoryDoesNotExist / FailedToDownload on fetch or parse
        problems.
        """
        url = self.url + '&chapter=1'

        data = ''
        try:
            data = self.opener.open(url).read()
        except Exception as e:
            data = ''
            logging.error("Caught an exception reading URL " + url +
                          ". Exception " + unicode(e) + ".")
        # BUG FIX: the original tested 'data is None', which can never be
        # true here ('data' is '' after a failure), so a failed download
        # fell through into the parser instead of raising.
        if not data:
            raise StoryDoesNotExist("Problem reading story URL " + url + "!")

        try:
            soup = bs.BeautifulStoneSoup(data)
        except Exception:
            raise FailedToDownload(
                "Error downloading Story: %s! Problem decoding page!" % url)

        # Page <title> looks like "<site> :: <story> by <author>".
        title = soup.find('title').string
        title = title.split('::')[1].strip()
        logging.debug('Title: %s' % title)
        # NOTE(review): a story title containing ' by ' would misparse;
        # kept the original first-occurrence split for compatibility.
        self.storyName = title.split(' by ')[0].strip()
        self.authorName = title.split(' by ')[1].strip()

        for a in soup.findAll('a'):
            if a['href'].startswith('viewuser.php'):
                self.authorId = a['href'].split('=')[1]
                self.authorURL = 'http://' + self.host + '/' + a['href']

        logging.debug('self.storyId=%s, self.storyName=%s'
                      % (self.storyId, self.storyName))
        logging.debug('self.authorId=%s, self.authorName=%s'
                      % (self.authorId, self.authorName))

        select = soup.find('select', {'name': 'chapter'})

        result = []
        if select is None:
            # No chapter dropdown: a one-shot story; the story URL itself
            # is the only chapter.
            result.append((url, self.storyName))
        else:
            for o in select.findAll('option'):
                churl = self.url + "&chapter=%s" % o['value']
                # Strip any markup inside the option (chapter titles can
                # contain tags).
                title = re.sub('<[^>]+>', '', "%s" % o)
                result.append((churl, title))

        ## whofic.com puts none of the metadata in the chapters or even
        ## the story chapter index page.  Scrape the author page for it.
        data = self.opener.open(self.authorURL).read()

        soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br', 'hr'))
        # Find this story in the author's list and parse its metadata.
        # The cell is free-form text separated by <br /> tags (rendered
        # self-closing because of selfClosingTags above), so the parsing
        # below leans on positional assumptions about the chunks.
        for a in soup.findAll('a'):
            if a['href'].find('viewstory.php?sid=' + self.storyId) != -1:
                metadata = a.findParent('td')
                metadatachunks = metadata.__str__('utf8').split('<br />')
                self.storyDescription = metadatachunks[1]

                # Chunk 2: ' - ' separated "category - rating - warnings
                # - genre".  Guard the indexes so a layout surprise logs
                # partial metadata instead of raising IndexError.
                moremeta = re.sub('<[^>]+>', '', metadatachunks[2])
                logging.debug('moremeta: %s' % moremeta)

                moremetaparts = moremeta.split(' - ')

                self.category = moremetaparts[0]
                for cat in self.category.split(', '):
                    self.addSubject(cat.strip())

                if len(moremetaparts) > 1:
                    self.storyRating = moremetaparts[1]

                if len(moremetaparts) > 2:
                    for warn in moremetaparts[2].split(', '):
                        self.addSubject(warn.strip())

                if len(moremetaparts) > 3:
                    self.genre = moremetaparts[3]

                # Chunk 5: ' - ' separated "Name: value" pairs
                # (Published / Updated / Completed / Word Count).
                moremeta = re.sub('<[^>]+>', '', metadatachunks[5])
                logging.debug('moremeta 2: %s' % moremeta)

                for part in moremeta.split(' - '):
                    try:
                        (name, value) = part.split(': ')
                    except ValueError:
                        # Not a "Name: value" pair; skip it.
                        continue
                    name = name.strip()
                    value = value.strip()
                    if name == 'Published':
                        self.storyPublished = datetime.datetime.fromtimestamp(
                            time.mktime(time.strptime(value, '%Y.%m.%d')))
                    elif name == 'Updated':
                        self.storyUpdated = datetime.datetime.fromtimestamp(
                            time.mktime(time.strptime(value, '%Y.%m.%d')))
                    elif name == 'Completed' and value == 'Yes':
                        self.storyStatus = name
                    elif name == 'Word Count':
                        self.numWords = value

                break

        self.numChapters = len(result)

        return result

    def getText(self, url):
        """Fetch one chapter page and return the story text as HTML.

        Raises FailedToDownload if the page cannot be fetched, decoded,
        or does not contain the expected story-text element.
        """
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        logging.debug('Getting data from: %s' % url)

        data = ''
        try:
            data = self.opener.open(url).read()
        except Exception as e:
            data = ''
            logging.error("Caught an exception reading URL " + url +
                          ". Exception " + unicode(e) + ".")
        # BUG FIX: was 'if data is None' -- never true after the
        # exception handler above set data to ''.
        if not data:
            raise FailedToDownload(
                "Error downloading Chapter: %s! Problem getting page!" % url)

        try:
            # selfClosingTags keeps <br>/<hr> from swallowing the rest of
            # the page during parsing.
            soup = bs.BeautifulStoneSoup(
                data,
                convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES,
                selfClosingTags=('br', 'hr'))
        except Exception:
            logging.info("Failed to decode: <%s>" % data)
            raise FailedToDownload(
                "Error downloading Chapter: %s! Problem decoding page!" % url)

        # Hardly a great identifier, but whofic really doesn't give us
        # anything better: the story text lives in a span styled
        # 'font-size: 100%;'.
        span = soup.find('span', {'style': 'font-size: 100%;'})

        if span is None:
            raise FailedToDownload(
                "Error downloading Chapter: %s! Missing required element!"
                % url)

        return span.__str__('utf8')


class Whofic_UnitTests(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testGetUrlsWorks(self):
        # Live-network test against a known six-chapter story.
        url = 'http://www.whofic.com/viewstory.php?sid=37139'
        self.assertEquals(6, len(Whofic(url).extractIndividualUrls()))


if __name__ == '__main__':
    unittest.main()

Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites much easier.

For Amazon Kindle use Mobi output, for Sony Reader, Nook and iPad use ePub

-

To support new features, such as including story summaries, - the URL you need to use for some sites has changed. See below for example URLs for each site.

Or see your personal list of previously downloaded fanfics.

@@ -128,6 +126,9 @@
adastrafanfic.com
Use the URL of the story's chapter list, such as
http://www.adastrafanfic.com/viewstory.php?sid=854. +
whofic.com +
Use the URL of the story's chapter list, such as +
http://www.whofic.com/viewstory.php?sid=16334. @@ -141,28 +142,13 @@ Small post written by me — how to read fiction in Stanza or any other ebook reader. -
  • - Currently we support fanfiction.net, fictionpress.com, ficwad.com, fictionalley.org, harrypotterfanfiction.com, potionsandsnitches.net, mediaminer.org and twilighted.net. - fanficauthors.net and tthfanfic.org offer native ePub functionality. -
  • You can download fanfiction directly from your iPhone, Kindle or (possibly) other ebook reader.
  • -
  • - One-shots, fics with a single chapter, are now supported. -
  • -
  • - You can download fanfics and store them for 'later' by just downloading them and visiting recent - downloads section. -
  • Downloaded stories are deleted after some time (which should give you enough of time to download it and will keep Google happy about the app not going over the storage limit).
  • -
  • - If Downloader simply opens a download file window rather than saves the fanfic and gives you a link, it means it is - too large to save in the database and you need to download it straight away. -
  • If you see some funny characters in downloaded Plain Text file, make sure you choose text file encoding UTF-8 and not something else. diff --git a/main.py b/main.py index 9bddd146..2bcc8fd0 100644 --- a/main.py +++ b/main.py @@ -275,6 +275,8 @@ class FanfictionDownloaderTask(webapp.RequestHandler): adapter = twilighted.Twilighted(url) elif url.find('adastrafanfic.com') != -1: adapter = adastrafanfic.Adastrafanfic(url) + elif url.find('whofic.com') != -1: + adapter = whofic.Whofic(url) elif url.find('potionsandsnitches.net') != -1: adapter = potionsNsnitches.PotionsNSnitches(url) elif url.find('mediaminer.org') != -1: