Commit first version of reorg/rewrite. Currently CLI only.

This commit is contained in:
Jim Miller 2011-05-03 11:27:58 -05:00
commit 150316f460
85 changed files with 20691 additions and 0 deletions

37
app.yaml Normal file
View file

@ -0,0 +1,37 @@
# fanfictionloader
application: fanfictionloader
version: 3-0-2
runtime: python
api_version: 1
handlers:
- url: /r3m0v3r
script: utils/remover.py
login: admin
- url: /r3m0v3r
script: main.py
login: admin
- url: /fdownloadtask
script: main.py
login: admin
- url: /css
static_dir: css
- url: /js
static_dir: js
- url: /static
static_dir: static
- url: /favicon\.ico
static_files: static/favicon.ico
upload: static/favicon\.ico
- url: /.*
script: main.py
builtins:
- datastore_admin: on

4
cron.yaml Normal file
View file

@ -0,0 +1,4 @@
cron:
- description: cleanup job
url: /r3m0v3r
schedule: every 2 hours

71
css/index.css Normal file
View file

@ -0,0 +1,71 @@
body
{
font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
}
#main
{
width: 43%;
margin-left: 23%;
background-color: #dae6ff;
padding: 2em;
}
#greeting
{
margin-bottom: 1em;
border-color: #efefef;
}
#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
{
border: thin solid #fffeff;
}
h1
{
text-decoration: none;
}
#logpasswordtable
{
padding: 1em;
}
#logpassword, #logpasswordtable {
/* display: none; */
}
#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
{
margin: 1em;
padding: 1em;
border: thin dotted #fffeff;
}
div.field
{
margin-bottom: 0.5em;
}
#submitbtn
{
padding: 1em;
}
#typelabel
{
}
#typeoptions
{
margin-top: 0.5em;
}
#error
{
font-size: small;
color: #f00;
}

59
delete_fic.py Normal file
View file

@ -0,0 +1,59 @@
import os
import cgi
import sys
import logging
import traceback
import StringIO
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from google.appengine.ext.webapp import template
from fanficdownloader.downloader import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *
from google.appengine.ext import db
from fanficdownloader.zipdir import *
from ffstorage import *
def create_mac(user, fic_id, fic_url):
return str(abs(hash(user)+hash(fic_id)))+str(abs(hash(fic_url)))
def check_mac(user, fic_id, fic_url, mac):
return (create_mac(user, fic_id, fic_url) == mac)
def create_mac_for_fic(user, fic_id):
key = db.Key(fic_id)
fanfic = db.get(key)
if fanfic.user != user:
return None
else:
return create_mac(user, key, fanfic.url)
class DeleteFicHandler(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if not user:
self.redirect('/login')
fic_id = self.request.get('fic_id')
fic_mac = self.request.get('key_id')
actual_mac = create_mac_for_fic(user, fic_id)
if actual_mac != fic_mac:
self.response.out.write("Ooops")
else:
key = db.Key(fic_id)
fanfic = db.get(key)
fanfic.delete()
self.redirect('/recent')
fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
template_values = dict(fics = fics, nickname = user.nickname())
path = os.path.join(os.path.dirname(__file__), 'recent.html')
self.response.out.write(template.render(path, template_values))

File diff suppressed because it is too large

View file

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

231
fanficdownloader/adapter.py Normal file
View file

@ -0,0 +1,231 @@
# -*- coding: utf-8 -*-
import logging
import datetime
import urlparse as up
from output import makeAcceptableFilename
try:
from google.appengine.api.urlfetch import fetch as googlefetch
appEngineGlob = True
except:
appEngineGlob = False
class LoginRequiredException(Exception):
def __init__(self, url):
self.url = url
def __str__(self):
return repr(self.url + ' requires user to be logged in')
class StoryArchivedAlready(Exception):
pass
class StoryDoesNotExist(Exception):
pass
class FailedToDownload(Exception):
pass
class InvalidStoryURL(Exception):
pass
class FanfictionSiteAdapter:
appEngine = appEngineGlob
login = ''
password = ''
url = ''
host = ''
path = ''
uuid = ''
storyName = ''
storyId = ''
authorName = ''
authorId = ''
authorURL = ''
outputStorySep = '-Ukn_'
outputName = ''
outputFileName = ''
storyDescription = ''
storyCharacters = []
storySeries = ''
storyPublished = datetime.date(1970, 01, 31)
storyCreated = datetime.datetime.now()
storyUpdated = datetime.date(1970, 01, 31)
languageId = 'en-UK'
language = 'English'
subjects = []
publisher = ''
numChapters = '0'
numWords = '0'
genre = ''
category = ''
storyStatus = 'In-Progress'
storyRating = ''
storyUserRating = '0'
def __init__(self, url):
# basic plain url parsing...
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
def hasAppEngine(self):
return self.appEngine
def fetchUrl(self, url):
if not self.appEngine:
return self.opener.open(url).read().decode('utf-8')
else:
return googlefetch(url,deadline=10).content
def requiresLogin(self, url = None):
return False
def performLogin(self, url = None):
return True
def extractIndividualUrls(self):
pass
def getText(self, url):
pass
def setLogin(self, login):
self.login = login
def setPassword(self, password):
self.password = password
def getHost(self):
logging.debug('self.host=%s' % self.host)
return self.host
def getUUID(self):
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
return self.uuid
def getOutputName(self):
self.outputName = makeAcceptableFilename(self.storyName.replace(" ", "_") + self.outputStorySep + self.storyId)
logging.debug('self.outputName=%s' % self.outputName)
return self.outputName
def getOutputFileName(self, booksDirectory, bookExt):
self.getOutputName() # make sure self.outputName is populated
self.outputFileName = booksDirectory + "/" + self.outputName + bookExt
logging.debug('self.outputFileName=%s' % self.outputFileName)
return self.outputFileName
def getAuthorURL(self):
logging.debug('self.authorURL=%s' % self.authorURL)
return self.authorURL
def getAuthorId(self):
logging.debug('self.authorId=%s' % self.authorId)
return self.authorId
def getAuthorName(self):
logging.debug('self.authorName=%s' % self.authorName)
return self.authorName
def getStoryURL(self):
logging.debug('self.url=%s' % self.url)
return self.url
def getStoryId(self):
logging.debug('self.storyId=%s' % self.storyId)
return self.storyId
def getStoryName(self):
logging.debug('self.storyName=%s' % self.storyName)
return self.storyName
def getStoryDescription(self):
## without stripping \n's, appengine treats additional lines from this debug
## output as error messages.
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
return self.storyDescription
def getStoryCreated(self):
self.storyCreated = datetime.datetime.now()
logging.debug('self.storyCreated=%s' % self.storyCreated)
return self.storyCreated
def addCharacter(self, character):
chara = character.upper()
for c in self.storyCharacters:
if c.upper() == chara:
return False
self.storyCharacters.append(character)
return True
def getStoryCharacters(self):
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
return self.storyCharacters
def getStoryPublished(self):
logging.debug('self.storyPublished=%s' % self.storyPublished)
return self.storyPublished
def getStoryUpdated(self):
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
return self.storyUpdated
def getStorySeries(self):
logging.debug('self.storySeries=%s' % self.storySeries)
return self.storySeries
def getLanguage(self):
logging.debug('self.language=%s' % self.language)
return self.language
def getLanguageId(self):
logging.debug('self.languageId=%s' % self.languageId)
return self.languageId
def addSubject(self, subject):
subj = subject.upper()
for s in self.subjects:
if s.upper() == subj:
return False
self.subjects.append(subject)
return True
def getSubjects(self):
logging.debug('self.subjects=%s' % self.subjects)
return self.subjects
def getPublisher(self):
logging.debug('self.publisher=%s' % self.publisher)
return self.publisher
def getNumChapters(self):
logging.debug('self.numChapters=%s' % self.numChapters)
return self.numChapters
def getNumWords(self):
logging.debug('self.numWords=%s' % self.numWords)
return self.numWords
def getCategory(self):
logging.debug('self.category=%s' % self.category)
return self.category
def getGenre(self):
logging.debug('self.genre=%s' % self.genre)
return self.genre
def getStoryStatus(self):
logging.debug('self.storyStatus=%s' % self.storyStatus)
return self.storyStatus
def getStoryRating(self):
logging.debug('self.storyRating=%s' % self.storyRating)
return self.storyRating
def getStoryUserRating(self):
logging.debug('self.storyUserRating=%s' % self.storyUserRating)
return self.storyUserRating
def getPrintableUrl(self, url):
return url

View file

@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
import os, sys, glob
from os.path import dirname, basename, normpath
import logging
import urlparse as up
## A few exceptions for different things for adapters
class FailedToDownload(Exception):
def __init__(self,error):
self.error=error
def __str__(self):
return self.error
class InvalidStoryURL(Exception):
def __init__(self,url,domain,example):
self.url=url
self.domain=domain
self.example=example
def __str__(self):
return "Bad Story URL: %s\nFor site: %s\nExample: %s" % (self.url, self.domain, self.example)
class FailedToLogin(Exception):
def __init__(self,url,username):
self.url=url
self.username=username
def __str__(self):
return "Failed to Login for URL: %s with username: %s" % (self.url, self.username)
class StoryDoesNotExist(Exception):
def __init__(self,url):
self.url=url
def __str__(self):
return "Story Does Not Exist: " + self.url
class UnknownSite(Exception):
def __init__(self,url,supported_sites_list):
self.url=url
self.supported_sites_list=supported_sites_list
def __str__(self):
return "Unknown Site("+self.url+"). Supported sites: "+", ".join(self.supported_sites_list)
## This bit of complexity allows adapters to be added by just adding
## the source file. It eliminates the long if/else clauses we used to
## need to pick out the adapter.
## List of registered site adapters.
__class_list = []
def _register_handler(cls):
__class_list.append(cls)
def getAdapter(config,url):
parsedUrl = up.urlparse(url)
logging.debug("site:"+parsedUrl.netloc)
for cls in __class_list:
if cls.matchesSite(parsedUrl.netloc):
adapter = cls(config,url) # raises InvalidStoryURL
return adapter
# No adapter found.
raise UnknownSite( url, (cls.getSiteDomain() for cls in __class_list) )
## Automatically import each adapter_*.py file.
## Each must call _register_handler() with their class to be
## registered.
filelist = glob.glob(dirname(__file__)+'/adapter_*.py')
sys.path.insert(0,normpath(dirname(__file__)))
for file in filelist:
#print "file: "+basename(file)[:-3]
__import__(basename(file)[:-3])
del sys.path[0]
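Illustrative sketch (not part of the commit): the "just add the source file" registration described in the comments above boils down to globbing for adapter_*.py and importing each module by name, at which point the module's top-level _register_handler() call runs. A self-contained demonstration of that pattern, using a throwaway plugin directory and a made-up module name:

import glob
import os
import sys
import tempfile

# Build a temporary plugin directory holding one fake adapter module.
plugin_dir = tempfile.mkdtemp()
f = open(os.path.join(plugin_dir, 'adapter_demo.py'), 'w')
f.write("print('adapter_demo imported; a real adapter would call _register_handler() here')\n")
f.close()

# Same glob + __import__ dance as above: any adapter_*.py dropped into the
# directory is imported (and thus registered) automatically.
sys.path.insert(0, plugin_dir)
for path in glob.glob(plugin_dir + '/adapter_*.py'):
    __import__(os.path.basename(path)[:-3])  # strip the '.py' suffix
del sys.path[0]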

View file

@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
import time
import datetime
import logging
import re
import urllib2
import BeautifulSoup as bs
import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup
class FanFictionNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','ffnet')
# get storyId from url--url validation guarantees second part is storyId
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
# normalized story URL.
self._setURL("http://"+self.getSiteDomain()\
+"/s/"+self.story.getMetadata('storyId')+"/1/")
@staticmethod
def getSiteDomain():
return 'www.fanfiction.net'
@classmethod
def getAcceptDomains(cls):
return ['www.fanfiction.net','m.fanfiction.net']
def getSiteExampleURLs(self):
return "http://www.fanfiction.net/s/1234/1/ http://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title"
def getSiteURLPattern(self):
return r"http://(www|m)?\.fanfiction\.net/s/\d+/\d+(/|/[a-zA-Z0-9_]+)?$"
def extractChapterUrlsAndMetadata(self):
# fetch the chapter. From that we will get almost all the
# metadata and chapter list
url = self.url
logging.debug("URL: "+url)
# use BeautifulSoup HTML parser to make everything easier to find.
try:
soup = bs.BeautifulSoup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
else:
raise e
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"^/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[2])
self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
self.story.setMetadata('author',a.string)
# start by finding a script towards the bottom that has a
# bunch of useful stuff in it.
# var storyid = 6577076;
# var chapter = 1;
# var chapters = 17;
# var words = 42787;
# var userid = 2645830;
# var title = 'The+Invitation';
# var title_t = 'The Invitation';
# var summary = 'Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter\'s wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story.';
# var categoryid = 224;
# var cat_title = 'Harry Potter';
# var datep = '12-21-10';
# var dateu = '04-06-11';
# var author = 'U n F a b u l o u s M e';
for script in soup.findAll('script', src=None):
if 'var storyid' in script.string:
for line in script.string.split('\n'):
m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line)
if m == None : continue
var,value = m.groups()
# remove javascript escaping from values.
value = re.sub(r'\\(.)',r'\1',value)
#print var,value
if 'words' in var:
self.story.setMetadata('numWords', value)
if 'title_t' in var:
self.story.setMetadata('title', value)
if 'summary' in var:
self.story.setMetadata('description', value)
if 'datep' in var:
self.story.setMetadata('datePublished',
datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%m-%d-%y'))))
if 'dateu' in var:
self.story.setMetadata('dateUpdated',
datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%m-%d-%y'))))
if 'cat_title' in var:
if "Crossover" in value:
value = re.sub(r' Crossover$','',value)
for c in value.split(' and '):
self.story.addToList('category',c)
# Screws up when the category itself
# contains ' and '. But that's rare
# and the only alternative is to find
# the 'Crossover' category URL and
# parse that page to search for <a>
# with href /crossovers/(name)/(num)/
# <a href="/crossovers/Harry_Potter/224/">Harry Potter</a>
# <a href="/crossovers/Naruto/1402/">Naruto</a>
else:
self.story.addToList('category',value)
break # for script in soup.findAll('script', src=None):
# Find the chapter selector
select = soup.find('select', { 'name' : 'chapter' } )
if select is None:
# no selector found, so it's a one-chapter story.
self.chapterUrls.append((self.story.getMetadata('title'),url))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = u'http://%s/s/%s/%s/' % ( self.getSiteDomain(),
self.story.getMetadata('storyId'),
o['value'])
# just in case there's tags, like <i> in chapter titles.
title = u"%s" % o
title = re.sub(r'<[^>]+>','',title)
self.chapterUrls.append((title,url))
self.story.setMetadata('numChapters',len(self.chapterUrls))
## Pull some additional data from html. Find Rating and look around it.
a = soup.find('a', href='http://www.fictionratings.com/')
self.story.setMetadata('rating',a.string)
# after Rating, the same bit of text containing id:123456 contains
# Complete--if completed.
if 'Complete' in a.findNext(text=re.compile(r'id:\d+')):
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
# Parse genre(s) from <meta name="description" content="..."
# <meta name="description" content="Chapter 1 of a Harry Potter - Family/Friendship fanfiction. Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter's wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story..">
# <meta name="description" content="A Gundam Wing/AC and Gundam Seed - Romance/Sci-Fi crossover fanfiction with characters: & Kira Y.. Story summary: One-Shoot dividido en dos partes. Kira va en camino a rescatar a Lacus, pero él no es el unico. Dos personajes de diferentes universos Gundams. SEED vs ZERO.">
# <meta name="description" content="Chapter 1 of a Alvin and the chipmunks and Alpha and Omega crossover fanfiction with characters: Alvin S. & Humphrey. You'll just have to read to find out... No Flames Plesae... and tell me what you want to see by PM'ing me....">
# genre is after first -, but before first 'fanfiction'.
m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?)) (?:crossover )?fanfiction",
soup.find('meta',{'name':'description'})['content'])
if m != None:
genres=m.group('genres')
# Hurt/Comfort is one genre.
genres=re.sub('Hurt/Comfort','Hurt-Comfort',genres)
for g in genres.split('/'):
self.story.addToList('genre',g)
return
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'storytext'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
_register_handler(FanFictionNetSiteAdapter)
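Illustrative sketch (not part of the commit): the metadata loop in extractChapterUrlsAndMetadata() leans on a single regular expression to split fanfiction.net's "var name = value;" script lines. Run standalone against the sample lines quoted in the comments above, it behaves like this:

import re

sample = """
  var words = 42787;
  var title_t = 'The Invitation';
  var datep = '12-21-10';
"""
for line in sample.split('\n'):
    m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$", line)
    if m is None:
        continue
    var, value = m.groups()
    print(var + ' = ' + value)
# words = 42787
# title_t = The Invitation
# datep = 12-21-10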

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
import time
import datetime
import logging
import re
import urllib
import urllib2
import BeautifulSoup as bs
import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup
from htmlcleanup import stripHTML
class TwilightedNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','tw')
self.decode = "utf8"
self.story.addToList("category","Twilight")
self.username = "NoneGiven" # if left empty, twilighted.net doesn't return any message at all.
self.password = ""
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
logging.debug("storyId: (%s)"%self.story.getMetadata('storyId'))
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
@staticmethod
def getSiteDomain():
return 'www.twilighted.net'
@classmethod
def getAcceptDomains(cls):
return ['www.twilighted.net','twilighted.net']
def getSiteExampleURLs(self):
return "http://www.twilighted.net/viewstory.php?sid=1234 http://twilighted.net/viewstory.php?sid=5678"
def getSiteURLPattern(self):
return re.escape("http://")+r"(www\.)?"+re.escape("twilighted.net/viewstory.php?sid=")+r"\d+$"
def needToLoginCheck(self, data):
if 'Registered Users Only.' in data \
or 'There is no such account on our website' in data \
or "That password doesn't match the one in our database." in data:
return True
else:
return False
def performLogin(self, url):
data = {}
if self.password:
data['penname'] = self.username
data['password'] = self.password
else:
data['penname'] = self.getConfig("username")
data['password'] = self.getConfig("password")
data['cookiecheck'] = '1'
data['submit'] = 'Submit'
urlvals = urllib.urlencode(data)
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
logging.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
data['penname']))
d = self._fetchUrl(loginUrl, urlvals)
if self.needToLoginCheck(d) :
logging.info("Failed to login to URL %s as %s" % (loginUrl,
data['penname']))
raise adapters.FailedToLogin(url,data['penname'])
return False
else:
return True
def extractChapterUrlsAndMetadata(self):
url = self.url+'&index=1'
logging.debug("URL: "+url)
try:
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
else:
raise e
if self.needToLoginCheck(data):
# need to log in for this one.
self.performLogin(url)
data = self._fetchUrl(url)
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
## Title
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',a.string)
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
## <meta name='description' content='&lt;p&gt;Description&lt;/p&gt; ...' >
## Summary, strangely, is in the content attr of a <meta name='description'> tag
## which is escaped HTML. Unfortunately, we can't use it because they don't
## escape (') chars in the desc, breaking the tag.
#meta_desc = soup.find('meta',{'name':'description'})
#metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
#self.story.setMetadata('description',stripHTML(metasoup))
def defaultGetattr(d,k):
try:
return d[k]
except:
return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Summary' in label:
## Everything until the next span class='label'
svalue = str(value)
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value.strip())
if 'Word count' in label:
self.story.setMetadata('numWords', value.strip())
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
for cat in catstext:
self.story.addToList('category',cat.string)
## twilighted.net doesn't use genre.
# if 'Genre' in label:
# genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
# genrestext = [genre.string for genre in genres]
# self.genre = ', '.join(genrestext)
# for genre in genrestext:
# self.addSubject(genre.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if 'Published' in label:
self.story.setMetadata('datePublished', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y"))))
if 'Updated' in label:
# there's a stray [ at the end.
#value = value[0:-1]
self.story.setMetadata('dateUpdated', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y"))))
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'story'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
_register_handler(TwilightedNetSiteAdapter)

View file

@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
import time
import datetime
import logging
import re
import urllib2
import BeautifulSoup as bs
import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup
class WhoficComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','whof')
self.decode = "ISO-8859-1"
@staticmethod
def getSiteDomain():
return 'www.whofic.com'
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+"\d+$"
def extractChapterUrlsAndMetadata(self):
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# fetch the first chapter. From that we will:
# - determine title, authorname, authorid
# - get chapter list, if not one-shot.
url = self.url+'&chapter=1'
logging.debug("URL: "+url)
# use BeautifulSoup HTML parser to make everything easier to find.
try:
soup = bs.BeautifulSoup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
else:
raise e
# pull title(title) and author from the HTML title.
title = soup.find('title').string
logging.debug('Title: %s' % title)
title = title.split('::')[1].strip()
self.story.setMetadata('title',title.split(' by ')[0].strip())
self.story.setMetadata('author',title.split(' by ')[1].strip())
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
# Find the chapter selector
select = soup.find('select', { 'name' : 'chapter' } )
if select is None:
# no selector found, so it's a one-chapter story.
self.chapterUrls.append((self.story.getMetadata('title'),url))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
# just in case there's tags, like <i> in chapter titles.
title = "%s" % o
title = re.sub(r'<[^>]+>','',title)
self.chapterUrls.append((title,url))
self.story.setMetadata('numChapters',len(self.chapterUrls))
## Whofic.com puts none of the other meta data in the chapters
## or even the story chapter index page. Need to scrape the
## author page to find it.
# <table width="100%" bordercolor="#333399" border="0" cellspacing="0" cellpadding="2"><tr><td>
# <b><a href="viewstory.php?sid=38220">Accompaniment 2</a></b> by <a href="viewuser.php?uid=12412">clandestinemiscreant</a> [<a href="reviews.php?sid=38220">Reviews</a> - <a href="reviews.php?sid=38220">0</a>] <br>
# This is a series of short stories written as an accompaniment to Season 2, Season 28 for us oldies, and each is unrelated except for that one factor. Each story is canon, in that it does not change established events at time of airing, based on things mentioned and/or implied and missing or deleted scenes that were not seen in the final aired episodes.<br>
# <font size="-1"><b><a href="categories.php?catid=15">Tenth Doctor</a></b> - All Ages - None - Humor, Hurt/Comfort, Romance<br>
# <i>Characters:</i> Rose Tyler<br>
# <i>Series:</i> None<br>
# <i>Published:</i> 2010.08.15 - <i>Updated:</i> 2010.08.16 - <i>Chapters:</i> 4 - <i>Completed:</i> Yes - <i>Word Count:</i> 4890 </font>
# </td></tr></table>
logging.debug("Author URL: "+self.story.getMetadata('authorUrl'))
soup = bs.BeautifulStoneSoup(self._fetchUrl(self.story.getMetadata('authorUrl')),
selfClosingTags=('br')) # normalize <br> tags to <br />
# find this story in the list, parse it's metadata based on
# lots of assumptions about the html, since there's little
# tagging.
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')))
metadata = a.findParent('td')
metadatachunks = utf8FromSoup(metadata).split('<br />')
# process metadata for this story.
self.story.setMetadata('description', metadatachunks[1])
# First line of the stuff with ' - ' separators
moremeta = metadatachunks[2]
moremeta = re.sub(r'<[^>]+>','',moremeta) # strip tags.
moremetaparts = moremeta.split(' - ')
# first part is category--whofic.com has categories
# Doctor One-11, Torchwood, etc. We're going to
# prepend any with 'Doctor' or 'Era' (Multi-Era, Other
# Era) as 'Doctor Who'.
#
# Also push each in as 'extra tags'.
category = moremetaparts[0]
if 'Doctor' in category or 'Era' in category :
self.story.addToList('category','Doctor Who')
for cat in category.split(', '):
self.story.addToList('category',cat)
# next in that line is age rating.
self.story.setMetadata('rating',moremetaparts[1])
# after that is a possible list of specific warnings,
# Explicit Violence, Swearing, etc
if "None" not in moremetaparts[2]:
for warn in moremetaparts[2].split(', '):
self.story.addToList('warnings',warn)
# then genre. It's another comma list. All together
# in genre, plus each in extra tags.
genre=moremetaparts[3]
for g in genre.split(r', '):
self.story.addToList('genre',g)
# the next line is stuff with ' - ' separators *and* names--with tags.
moremeta = metadatachunks[5]
moremeta = re.sub(r'<[^>]+>','',moremeta) # strip tags.
moremetaparts = moremeta.split(' - ')
for part in moremetaparts:
(name,value) = part.split(': ')
name=name.strip()
value=value.strip()
if name == 'Published':
self.story.setMetadata('datePublished', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d'))))
if name == 'Updated':
self.story.setMetadata('dateUpdated', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d'))))
if name == 'Completed':
if value == 'Yes':
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
if name == 'Word Count':
self.story.setMetadata('numWords', value)
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
# hardly a great identifier, I know, but whofic really doesn't
# give us anything better to work with.
span = soup.find('span', {'style' : 'font-size: 100%;'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
_register_handler(WhoficComSiteAdapter)

View file

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
import re
import datetime
import time
import urllib2 as u2
import urlparse as up
from story import Story
from configurable import Configurable
from htmlcleanup import removeEntities, removeAllEntities, stripHTML
from adapters import InvalidStoryURL
class BaseSiteAdapter(Configurable):
@classmethod
def matchesSite(cls,site):
return site in cls.getAcceptDomains()
@classmethod
def getAcceptDomains(cls):
return [cls.getSiteDomain()]
def validateURL(self):
return re.match(self.getSiteURLPattern(), self.url)
def __init__(self, config, url):
Configurable.__init__(self, config)
self.addConfigSection(self.getSiteDomain())
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.storyDone = False
self.story = Story()
self.story.setMetadata('site',self.getSiteDomain())
self.story.setMetadata('dateCreated',datetime.datetime.now())
self.chapterUrls = [] # tuples of (chapter title,chapter url)
self.decode = "utf8"
self._setURL(url)
if not self.validateURL():
raise InvalidStoryURL(url,
self.getSiteDomain(),
self.getSiteExampleURLs())
def _setURL(self,url):
self.url = url
self.parsedUrl = up.urlparse(url)
self.host = self.parsedUrl.netloc
self.path = self.parsedUrl.path
self.story.setMetadata('storyUrl',self.url)
def _fetchUrl(self, url, parameters=None):
if self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))
if parameters:
return self.opener.open(url,parameters).read().decode(self.decode)
else:
return self.opener.open(url).read().decode(self.decode)
# Does the download the first time it's called.
def getStory(self):
if not self.storyDone:
self.extractChapterUrlsAndMetadata()
for (title,url) in self.chapterUrls:
self.story.addChapter(removeEntities(title),
removeEntities(self.getChapterText(url)))
self.storyDone = True
return self.story
###############################
@staticmethod
def getSiteDomain():
"Needs to be overridden in each adapter class."
return 'no such domain'
## URL pattern validation is done *after* picking an adaptor based
## on domain instead of *as* the adaptor selector so we can offer
## the user example(s) for that particular site.
def getSiteURLPattern(self):
"Used to validate the URL. Should be overridden in each adapter class."
return '^http://'+re.escape(self.getSiteDomain())
def getSiteExampleURLs(self):
"""
Needs to be overridden in each adapter class. It's the adapter
writer's responsibility to make sure the example(s) pass the
URL validation.
"""
return 'no such example'
def extractChapterUrlsAndMetadata(self):
"Needs to be overridden in each adapter class. Populates self.story metadata and self.chapterUrls"
pass
def getChapterText(self, url):
"Needs to be overridden in each adapter class."
pass
# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
def utf8FromSoup(soup):
return soup.__str__('utf8').decode('utf-8')
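Illustrative sketch (not part of the commit): pulling the docstrings above together, a minimal adapter built on BaseSiteAdapter only has to override the site identity methods and the two scraping hooks. ExampleSiteAdapter, www.example.com and the hard-coded metadata below are invented for illustration; the sketch assumes the package layout of this commit:

from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter

class ExampleSiteAdapter(BaseSiteAdapter):
    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'ex')

    @staticmethod
    def getSiteDomain():
        return 'www.example.com'

    def getSiteURLPattern(self):
        return r"http://www\.example\.com/story/\d+$"

    def getSiteExampleURLs(self):
        return "http://www.example.com/story/1234"

    def extractChapterUrlsAndMetadata(self):
        # A real adapter scrapes these; hard-coded to show what must be populated.
        self.story.setMetadata('title', 'Example Story')
        self.story.setMetadata('author', 'Example Author')
        self.chapterUrls.append(('Chapter 1', self.url))
        self.story.setMetadata('numChapters', len(self.chapterUrls))

    def getChapterText(self, url):
        return u'<p>Chapter text would be fetched and cleaned up here.</p>'

_register_handler(ExampleSiteAdapter)

With that registered, adapters.getAdapter(config, 'http://www.example.com/story/1234').getStory() exercises the whole flow: URL validation in __init__, then extractChapterUrlsAndMetadata() and one getChapterText() call per chapter the first time getStory() runs.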

View file

@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
class Adastrafanfic(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Ad Astra')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = 'Fanfiction'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-aaff_'
self.chapurl = False
ss=self.url.split('?')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
sss = ss[1].replace('&amp;','&').split('&')
logging.debug('sss=%s' % sss)
if sss is not None and len(sss) > 0:
ssss = sss[0].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
self.storyId = ssss[1]
if len(sss) > 1:
ssss = sss[1].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
self.chapurl = True
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
logging.debug('self.url=%s' % self.url)
logging.debug("Created Adastrafanfic: url=%s" % (self.url))
def requiresLogin(self, url = None):
return False
def extractIndividualUrls(self):
# warning=5 bypasses 'are you old enough' checks.
url = self.url + '&warning=5&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
title = soup.find('title').string
logging.debug('Title: %s' % title)
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
# warning=5 bypasses 'are you old enough' checks.
url = self.url + "&warning=5&chapter=%s" % o['value']
# ad astra can have tags, like <i> in chapter titles.
title = "%s" % o
title = re.sub('<[^>]+>','',title)
result.append((url,title))
# warning=5 bypasses 'are you old enough' checks.
url = self.url + "&warning=5&index=1"
data = self.opener.open(url).read()
soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
# find authorId.
titlediv = soup.find('div', {'id' : 'pagetitle'})
for a in titlediv.findAll('a'):
if a['href'].startswith('viewuser.php'):
self.authorId = a['href'].split('=')[1]
self.authorURL = 'http://'+self.host+'/'+a['href']
# find other metadata
contentdiv = soup.find('div', {'class' : 'content'})
# adastra meta data is not well structured. There's an
# identifiable span class="label" around the *labels*, but
# nothing around the content for each label. And there's
# <a href> around lots of the meta data values.
# characters are given 'ln, fn'. Need to parse out
# separately. Of course, I only realized *after* doing this
# that output.py isn't actually doing anything with the
# characters... <sigh>
for a in contentdiv.findAll('a'):
if a['href'].startswith('browse.php?type=characters'):
name=a.text
if a.text.find(', ') > -1:
names=a.text.split(', ')
names.reverse()
name=' '.join(names)
self.addCharacter(name)
contentdivstring = contentdiv.__str__('utf8')
labeledlines = contentdivstring.strip().split('<span class="label">') # eats the <span class="label"> tags.
metadata = dict()
for labeledline in labeledlines:
labeledline = re.sub(r'<[^>]+>','',labeledline)
(label,sep,value)=labeledline.strip().partition(':') # a bit like split, but splits on first separator.
metadata[label.strip()]=value.strip()
#print label+"->"+value
self.storyDescription = metadata['Summary']
self.genre = metadata['Genre']
for genre in self.genre.split(", "):
self.addSubject(genre)
self.category = metadata['Categories']
for category in self.category.split(", "):
self.addSubject(category)
if metadata['Completed'] == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
self.storyRating = metadata['Rated']
self.storySeries = metadata['Series']
self.numChapters = metadata['Chapters']
self.numWords = metadata['Word count']
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(metadata['Published'], "%m/%d/%Y")))
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(metadata['Updated'], "%m/%d/%Y")))
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
# I really wish I knew why adastra needs the selfClosingTags to make <br /> work, but ficwad doesn't.
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES, selfClosingTags=('br','hr'))
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class Adastrafanfic_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testGetUrlsWorks(self):
url = 'http://www.adastrafanfic.com/viewstory.php?sid=426'
self.assertEquals(32, len(Adastrafanfic(url).extractIndividualUrls()))
if __name__ == '__main__':
unittest.main()
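A small aside on the str.partition() call used in extractIndividualUrls() above, since the inline comment only hints at it: unlike split(), it breaks on the first separator only, so a label whose value itself contains a colon still comes through in one piece (example string invented):

print('Summary: Part 1: The Beginning'.partition(':'))
# -> ('Summary', ':', ' Part 1: The Beginning')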

View file

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
import ConfigParser
# All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc)
# inherit from Configurable. The config file(s) uses ini format:
# [sections] with key:value settings.
#
# There's a [defaults] section which is overridden by the writer's
# section [epub], which is overridden by the adapter's section for each
# site.
#
# [defaults]
# titlepage_entries: category,genre, status
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com]
# titlepage_entries: category,genre, status,dateUpdated,rating
class Configurable(object):
def __init__(self, config):
self.config = config
self.sectionslist = ['defaults']
def addConfigSection(self,section):
self.sectionslist.insert(0,section)
def getConfig(self, key):
val = ""
for section in self.sectionslist:
try:
val = self.config.get(section,key)
if val and val.lower() == "false":
val = False
#print "getConfig(%s)=[%s]%s" % (key,section,val)
return val
except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e:
pass
return val
# split and strip each.
def getConfigList(self, key):
vlist = self.getConfig(key).split(',')
vlist = [ v.strip() for v in vlist ]
#print "vlist("+key+"):"+str(vlist)
return vlist
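Illustrative sketch (not part of the commit): getConfig() above is a thin wrapper over ConfigParser, walking sections from most to least specific. Using the exact example from the comment at the top of this file, the lookup order works like this:

import ConfigParser
import StringIO

ini = """
[defaults]
titlepage_entries: category,genre, status
[epub]
titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
[www.whofic.com]
titlepage_entries: category,genre, status,dateUpdated,rating
"""
config = ConfigParser.SafeConfigParser()
config.readfp(StringIO.StringIO(ini))

# addConfigSection() insert(0, ...)s, so the site section is consulted first,
# then the writer section, then [defaults].
for section in ['www.whofic.com', 'epub', 'defaults']:
    try:
        print(config.get(section, 'titlepage_entries'))  # stops at the site value
        break
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        pass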

View file

@ -0,0 +1,552 @@
# -*- coding: utf-8 -*-
CSS = '''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
pre { font-size: x-small; }
sml { font-size: small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
h7 { text-align: left; font-size: large; font-weight: bold; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.cover {text-align: center;}
.full {width: 100%; }
.quarter {width: 25%; }
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
MIMETYPE = '''application/epub+zip'''
TITLE_HEADER = '''<?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head><body>
<p><h3 id="lnks"><b><a id="StoryLink" href="%s">%s</a></b> by <b><a id="AuthorLink" href="%s">%s</a></b></h3></p>
'''
TITLE_ENTRY = '''<b>%s</b> %s<br />
'''
TITLE_FOOTER = '''
<br /><b>Summary:</b><br />%s<br />
</body></html>
'''
TABLE_TITLE_HEADER = TITLE_HEADER + '''
<table class="full">
'''
TABLE_TITLE_ENTRY = '''<tr><td><b>%s</b></td><td>%s</td></tr>
'''
TABLE_TITLE_FOOTER = '''
</table>
''' + TITLE_FOOTER
CONTAINER = '''<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="fanficdownloader-uuid">
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:opf="http://www.idpf.org/2007/opf"
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
<dc:language>%s</dc:language>
<dc:rights></dc:rights>
<dc:date opf:event="publication">%s</dc:date>
<dc:date opf:event="creation">%s</dc:date>
<dc:date opf:event="modification">%s</dc:date>
<meta name="calibre:timestamp" content="%s"/>
<dc:description>%s</dc:description>
'''
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
<dc:identifier opf:scheme="URL">%s</dc:identifier>
<dc:source>%s</dc:source>
<dc:type>FanFiction</dc:type>
<meta name="calibre:rating" content="%s"/>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
'''
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
'''
CONTENT_END_MANIFEST = ''' </manifest>
<spine toc="ncx">
'''
CONTENT_ITEMREF = ''' <itemref idref="%s" />
'''
CONTENT_END = ''' </spine>
</package>
'''
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''
TOC_END = '''</navMap>
</ncx>
'''
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''
XHTML_END = '''</div>
</body>
</html>
'''
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
'blockquote', 'br', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'dl', 'dt', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
'ins', 'kbd', 'label', 'li', 'ol',
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
'strong', 'sub', 'sup', 'u', 'ul']
acceptable_attributes = ['href']
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
entities = { '&aacute;' : 'á',
'&Aacute;' : 'Á',
'&Aacute' : 'Á',
'&aacute' : 'á',
'&acirc;' : 'â',
'&Acirc;' : 'Â',
'&Acirc' : 'Â',
'&acirc' : 'â',
'&acute;' : '´',
'&acute' : '´',
'&AElig;' : 'Æ',
'&aelig;' : 'æ',
'&AElig' : 'Æ',
'&aelig' : 'æ',
'&agrave;' : 'à',
'&Agrave;' : 'À',
'&Agrave' : 'À',
'&agrave' : 'à',
'&alefsym;' : 'ℵ',
'&alpha;' : 'α',
'&Alpha;' : 'Α',
'&amp;' : '&',
'&AMP;' : '&',
'&AMP' : '&',
'&amp' : '&',
'&and;' : '∧',
'&ang;' : '∠',
'&aring;' : 'å',
'&Aring;' : 'Å',
'&Aring' : 'Å',
'&aring' : 'å',
'&asymp;' : '≈',
'&atilde;' : 'ã',
'&Atilde;' : 'Ã',
'&Atilde' : 'Ã',
'&atilde' : 'ã',
'&auml;' : 'ä',
'&Auml;' : 'Ä',
'&Auml' : 'Ä',
'&auml' : 'ä',
'&bdquo;' : '„',
'&beta;' : 'β',
'&Beta;' : 'Β',
'&brvbar;' : '¦',
'&brvbar' : '¦',
'&bull;' : '•',
'&cap;' : '∩',
'&ccedil;' : 'ç',
'&Ccedil;' : 'Ç',
'&Ccedil' : 'Ç',
'&ccedil' : 'ç',
'&cedil;' : '¸',
'&cedil' : '¸',
'&cent;' : '¢',
'&cent' : '¢',
'&chi;' : 'χ',
'&Chi;' : 'Χ',
'&circ;' : 'ˆ',
'&clubs;' : '♣',
'&cong;' : '≅',
'&copy;' : '©',
'&COPY;' : '©',
'&COPY' : '©',
'&copy' : '©',
'&crarr;' : '↵',
'&cup;' : '∪',
'&curren;' : '¤',
'&curren' : '¤',
'&dagger;' : '†',
'&Dagger;' : '‡',
'&darr;' : '↓',
'&dArr;' : '⇓',
'&deg;' : '°',
'&deg' : '°',
'&delta;' : 'δ',
'&Delta;' : 'Δ',
'&diams;' : '♦',
'&divide;' : '÷',
'&divide' : '÷',
'&eacute;' : 'é',
'&Eacute;' : 'É',
'&Eacute' : 'É',
'&eacute' : 'é',
'&ecirc;' : 'ê',
'&Ecirc;' : 'Ê',
'&Ecirc' : 'Ê',
'&ecirc' : 'ê',
'&egrave;' : 'è',
'&Egrave;' : 'È',
'&Egrave' : 'È',
'&egrave' : 'è',
'&empty;' : '∅',
'&emsp;' : '',
'&ensp;' : '',
'&epsilon;' : 'ε',
'&Epsilon;' : 'Ε',
'&equiv;' : '≡',
'&eta;' : 'η',
'&Eta;' : 'Η',
'&eth;' : 'ð',
'&ETH;' : 'Ð',
'&ETH' : 'Ð',
'&eth' : 'ð',
'&euml;' : 'ë',
'&Euml;' : 'Ë',
'&Euml' : 'Ë',
'&euml' : 'ë',
'&euro;' : '€',
'&exist;' : '∃',
'&fnof;' : 'ƒ',
'&forall;' : '∀',
'&frac12;' : '½',
'&frac12' : '½',
'&frac14;' : '¼',
'&frac14' : '¼',
'&frac34;' : '¾',
'&frac34' : '¾',
'&frasl;' : '⁄',
'&gamma;' : 'γ',
'&Gamma;' : 'Γ',
'&ge;' : '≥',
'&gt;' : '>',
'&GT;' : '>',
'&GT' : '>',
'&gt' : '>',
'&harr;' : '↔',
'&hArr;' : '⇔',
'&hearts;' : '♥',
'&hellip;' : '…',
'&iacute;' : 'í',
'&Iacute;' : 'Í',
'&Iacute' : 'Í',
'&iacute' : 'í',
'&icirc;' : 'î',
'&Icirc;' : 'Î',
'&Icirc' : 'Î',
'&icirc' : 'î',
'&iexcl;' : '¡',
'&iexcl' : '¡',
'&igrave;' : 'ì',
'&Igrave;' : 'Ì',
'&Igrave' : 'Ì',
'&igrave' : 'ì',
'&image;' : 'ℑ',
'&infin;' : '∞',
'&int;' : '∫',
'&iota;' : 'ι',
'&Iota;' : 'Ι',
'&iquest;' : '¿',
'&iquest' : '¿',
'&isin;' : '∈',
'&iuml;' : 'ï',
'&Iuml;' : 'Ï',
'&Iuml' : 'Ï',
'&iuml' : 'ï',
'&kappa;' : 'κ',
'&Kappa;' : 'Κ',
'&lambda;' : 'λ',
'&Lambda;' : 'Λ',
'&laquo;' : '«',
'&laquo' : '«',
'&larr;' : '←',
'&lArr;' : '⇐',
'&lceil;' : '⌈',
'&ldquo;' : '“',
'&le;' : '≤',
'&lfloor;' : '⌊',
'&lowast;' : '∗',
'&loz;' : '◊',
'&lrm;' : '',
'&lsaquo;' : '‹',
'&lsquo;' : '‘',
'&lt;' : '<',
'&LT;' : '<',
'&LT' : '<',
'&lt' : '<',
'&macr;' : '¯',
'&macr' : '¯',
'&mdash;' : '—',
'&micro;' : 'µ',
'&micro' : 'µ',
'&middot;' : '·',
'&middot' : '·',
'&minus;' : '−',
'&mu;' : 'μ',
'&Mu;' : 'Μ',
'&nabla;' : '∇',
'&nbsp;' : ' ',
'&nbsp' : ' ',
'&ndash;' : '–',
'&ne;' : '≠',
'&ni;' : '∋',
'&not;' : '¬',
'&not' : '¬',
'&notin;' : '∉',
'&nsub;' : '⊄',
'&ntilde;' : 'ñ',
'&Ntilde;' : 'Ñ',
'&Ntilde' : 'Ñ',
'&ntilde' : 'ñ',
'&nu;' : 'ν',
'&Nu;' : 'Ν',
'&oacute;' : 'ó',
'&Oacute;' : 'Ó',
'&Oacute' : 'Ó',
'&oacute' : 'ó',
'&ocirc;' : 'ô',
'&Ocirc;' : 'Ô',
'&Ocirc' : 'Ô',
'&ocirc' : 'ô',
'&OElig;' : 'Œ',
'&oelig;' : 'œ',
'&ograve;' : 'ò',
'&Ograve;' : 'Ò',
'&Ograve' : 'Ò',
'&ograve' : 'ò',
'&oline;' : '‾',
'&omega;' : 'ω',
'&Omega;' : 'Ω',
'&omicron;' : 'ο',
'&Omicron;' : 'Ο',
'&oplus;' : '⊕',
'&or;' : '∨',
'&ordf;' : 'ª',
'&ordf' : 'ª',
'&ordm;' : 'º',
'&ordm' : 'º',
'&oslash;' : 'ø',
'&Oslash;' : 'Ø',
'&Oslash' : 'Ø',
'&oslash' : 'ø',
'&otilde;' : 'õ',
'&Otilde;' : 'Õ',
'&Otilde' : 'Õ',
'&otilde' : 'õ',
'&otimes;' : '',
'&ouml;' : 'ö',
'&Ouml;' : 'Ö',
'&Ouml' : 'Ö',
'&ouml' : 'ö',
'&para;' : '¶',
'&para' : '¶',
'&part;' : '∂',
'&permil;' : '‰',
'&perp;' : '⊥',
'&phi;' : 'φ',
'&Phi;' : 'Φ',
'&pi;' : 'π',
'&Pi;' : 'Π',
'&piv;' : 'ϖ',
'&plusmn;' : '±',
'&plusmn' : '±',
'&pound;' : '£',
'&pound' : '£',
'&prime;' : '′',
'&Prime;' : '″',
'&prod;' : '∏',
'&prop;' : '∝',
'&psi;' : 'ψ',
'&Psi;' : 'Ψ',
'&quot;' : '"',
'&QUOT;' : '"',
'&QUOT' : '"',
'&quot' : '"',
'&radic;' : '√',
'&raquo;' : '»',
'&raquo' : '»',
'&rarr;' : '→',
'&rArr;' : '⇒',
'&rceil;' : '⌉',
'&rdquo;' : '”',
'&real;' : 'ℜ',
'&reg;' : '®',
'&REG;' : '®',
'&REG' : '®',
'&reg' : '®',
'&rfloor;' : '⌋',
'&rho;' : 'ρ',
'&Rho;' : 'Ρ',
'&rlm;' : '',
'&rsaquo;' : '›',
'&rsquo;' : '’',
'&sbquo;' : '‚',
'&scaron;' : 'š',
'&Scaron;' : 'Š',
'&sdot;' : '⋅',
'&sect;' : '§',
'&sect' : '§',
'&shy;' : '­', # strange optional hyphenation control character, not just a dash
'&shy' : '­',
'&sigma;' : 'σ',
'&Sigma;' : 'Σ',
'&sigmaf;' : 'ς',
'&sim;' : '∼',
'&spades;' : '♠',
'&sub;' : '⊂',
'&sube;' : '⊆',
'&sum;' : '∑',
'&sup1;' : '¹',
'&sup1' : '¹',
'&sup2;' : '²',
'&sup2' : '²',
'&sup3;' : '³',
'&sup3' : '³',
'&sup;' : '⊃',
'&supe;' : '⊇',
'&szlig;' : 'ß',
'&szlig' : 'ß',
'&tau;' : 'τ',
'&Tau;' : 'Τ',
'&there4;' : '∴',
'&theta;' : 'θ',
'&Theta;' : 'Θ',
'&thetasym;' : 'ϑ',
'&thinsp;' : '',
'&thorn;' : 'þ',
'&THORN;' : 'Þ',
'&THORN' : 'Þ',
'&thorn' : 'þ',
'&tilde;' : '˜',
'&times;' : '×',
'&times' : '×',
'&trade;' : '™',
'&uacute;' : 'ú',
'&Uacute;' : 'Ú',
'&Uacute' : 'Ú',
'&uacute' : 'ú',
'&uarr;' : '↑',
'&uArr;' : '⇑',
'&ucirc;' : 'û',
'&Ucirc;' : 'Û',
'&Ucirc' : 'Û',
'&ucirc' : 'û',
'&ugrave;' : 'ù',
'&Ugrave;' : 'Ù',
'&Ugrave' : 'Ù',
'&ugrave' : 'ù',
'&uml;' : '¨',
'&uml' : '¨',
'&upsih;' : 'ϒ',
'&upsilon;' : 'υ',
'&Upsilon;' : 'Υ',
'&uuml;' : 'ü',
'&Uuml;' : 'Ü',
'&Uuml' : 'Ü',
'&uuml' : 'ü',
'&weierp;' : '℘',
'&xi;' : 'ξ',
'&Xi;' : 'Ξ',
'&yacute;' : 'ý',
'&Yacute;' : 'Ý',
'&Yacute' : 'Ý',
'&yacute' : 'ý',
'&yen;' : '¥',
'&yen' : '¥',
'&yuml;' : 'ÿ',
'&Yuml;' : 'Ÿ',
'&yuml' : 'ÿ',
'&zeta;' : 'ζ',
'&Zeta;' : 'Ζ',
'&zwj;' : '', # strange spacing control character, not just a space
'&zwnj;' : '', # strange spacing control character, not just a space
}
FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''
HTML_ESC_Definitions = 'HTML_Escape.def'

View file

@ -0,0 +1,111 @@
[defaults]
## [defaults] section applies to all formats and sites but may be
## overridden.
# All available titlepage_entries:
# category
# genre
# status
# datePublished
# dateUpdated
# dateCreated
# rating
# warnings
# numChapters
# numWords
# site
# siteabbrev
# author
# authorId
# authorURL
# title
# storyId
# storyUrl
# extratags
# description
# formatname
# formatext
## items to include in title page
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyId,authorId,extratags,description
## include title page as first page.
include_titlepage: true
## include TOC page immediately after title page.
include_tocpage: true
## python string Template, string with ${title}, ${author} etc, same as titlepage_entries
## Can include directories. ${formatext} will be added if not in name somewhere.
output_filename: ${title}-${siteabbrev}_${storyId}${formatext}
## Make directories as needed.
make_directories: true
## put output (with output_filename) in a zip file zip_filename.
zip_output: false
## Can include directories. .zip will be added if not in name somewhere
zip_filename: ${title}-${siteabbrev}_${storyId}${formatext}.zip
## try to make the output file name 'safe'--remove invalid filename chars.
## applies to both output_filename & zip_filename
safe_filename: true
## extra tags (comma separated) to include, primarily for epub.
extratags: FanFiction
## number of seconds to sleep between calls to the story site.
slow_down_sleep_time:0.5
## Each output format has a section that overrides [defaults]
[html]
[txt]
## Add URLs since there aren't links.
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,extratags,storyUrl,authorUrl,description
# use \r\n for line endings, the windows convention. txt output only.
windows_eol: true
[epub]
## epub is already a zip file.
zip_output: false
# possible subject tags: extratags, genre, category, warnings, lastupdate
# lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
include_subject_tags: extratags, genre, category, lastupdate
include_tocpage: false
# epub->mobi conversions typically don't like tables.
titlepage_use_table: true
## When using tables, make these span both columns.
wide_titlepage_entries: description, storyUrl, authorUrl
## Each site has a section that overrides [defaults] *and* the format section
[test1.com]
titlepage_entries: title,description,category,genre, status,dateCreated,rating,numChapters,numWords,extratags,description,storyUrl,extratags
extratags: FanFiction,Testing
## If necessary, you can define [<site>:<format>] sections to customize
## the formats differently for the same site. Overrides defaults, format and site.
[test1.com:txt]
extratags: FanFiction,Testing,Text
[test1.com:html]
extratags: FanFiction,Testing,HTML
[www.twilighted.net]
## Some sites require login (or login for some rated stories)
## The program can prompt you, or you can save it in config.
## This should go in your personal.ini, not defaults.ini.
#username:YourName
#password:yourpassword
[www.whofic.com]
[www.fanfiction.net]
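Illustrative sketch (not part of the commit): output_filename and zip_filename above are Python string.Template patterns, as the comment in [defaults] says, so expansion against the story metadata looks roughly like this (the metadata values here are invented):

import string

pattern = '${title}-${siteabbrev}_${storyId}${formatext}'
metadata = {'title': 'Example_Story', 'siteabbrev': 'ffnet',
            'storyId': '1234', 'formatext': '.epub'}
print(string.Template(pattern).substitute(metadata))
# -> Example_Story-ffnet_1234.epub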

View file

@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import getpass
import logging
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import zipdir
import output
import adapter
from adapter import StoryArchivedAlready
from adapter import StoryDoesNotExist
from adapter import FailedToDownload
from adapter import InvalidStoryURL
from adapter import LoginRequiredException
import ffnet
import fpcom
import ficwad
import fictionalley
import hpfiction
import twilighted
import twiwrite
import adastrafanfic
import whofic
import potionsNsnitches
import mediaminer
import time
class FanficLoader:
'''A controller class which handles the interaction between various specific downloaders and writers'''
booksDirectory = "books"
standAlone = False
def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True, overwrite=False):
self.adapter = adapter
self.writerClass = writerClass
self.quiet = quiet
self.inmemory = inmemory
self.compress = compress
self.badLogin = False
self.overWrite = overwrite
def getBooksDirectory(self):
return self.booksDirectory
def setBooksDirectory(self, bd):
self.booksDirectory = bd
return self.booksDirectory
def getStandAlone(self):
return self.standAlone
def setStandAlone(self, sa):
self.standAlone = sa
return self.standAlone
def getOverWrite(self):
return self.overWrite
def setOverWrite(self, sa):
self.overWrite = sa
return self.overWrite
def getAdapter(self):
return self.adapter
def download(self):
logging.debug("Trying to download the story")
if self.adapter.requiresLogin():
logging.debug("Story requires login")
if not self.adapter.performLogin():
logging.debug("Login/password problem")
self.badLogin = True
raise adapter.LoginRequiredException(self.adapter.url)
urls = self.adapter.extractIndividualUrls()
logging.debug("self.writerClass=%s" % self.writerClass)
if self.standAlone and not self.inmemory:
s = self.adapter.getOutputFileName(self.booksDirectory, self.writerClass.getFormatExt())
logging.debug("Always overwrite? %s" % self.overWrite)
if not self.overWrite:
logging.debug("Checking if current archive of the story exists. Filename=%s" % s)
if not zipdir.checkNewer ( s, self.adapter.getStoryUpdated() ):
raise StoryArchivedAlready("A Current archive file \"" + s + "\" already exists! Skipping!")
else:
logging.debug("Do not check for existance of archive file.")
self.writer = self.writerClass(self.booksDirectory,
self.adapter,
inmemory=self.inmemory,
compress=self.compress)
i = 1
for u,n in urls:
if not self.quiet:
print('Downloading chapter %d/%d' % (i, len(urls)))
text = self.adapter.getText(u)
self.writer.writeChapter(i, n, text)
i = i+1
#time.sleep(2)
self.writer.finalise()
if self.inmemory:
self.name = self.writer.name
return self.writer.output.getvalue()
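## Illustrative sketch (not part of the original source): driving the
## download() flow above in memory, e.g. from a web front end. The story
## URL below is a made-up placeholder.
#   adapter = ffnet.FFNet('http://www.fanfiction.net/s/1234567/1/')
#   loader = FanficLoader(adapter, output.EPubFanficWriter,
#                         quiet=True, inmemory=True, compress=False)
#   epubdata = loader.download()  # with inmemory=True, returns the book contents
#   bookname = loader.name        # set by download() when inmemory=True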
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
argvlen = len(sys.argv)
url = None
bookFormat = 'epub'
if argvlen > 1:
url = sys.argv[1]
if argvlen > 2:
bookFormat = sys.argv[2]
if url is None:
print >> sys.stderr, "Usage: downloader.py URL Type"
sys.exit(-1)
if type(url) is unicode:
print('URL is unicode')
url = url.encode('latin1')
url = url.strip()
adapter = None
writerClass = None
if url.find('fanficauthors') != -1:
print >> sys.stderr, "fanficauthors.net already provides ebooks"
sys.exit(0)
elif url.find('fictionalley') != -1:
adapter = fictionalley.FictionAlley(url)
elif url.find('ficwad') != -1:
adapter = ficwad.FicWad(url)
elif url.find('fanfiction.net') != -1:
adapter = ffnet.FFNet(url)
elif url.find('fictionpress.com') != -1:
adapter = fpcom.FPCom(url)
elif url.find('harrypotterfanfiction.com') != -1:
adapter = hpfiction.HPFiction(url)
elif url.find('twilighted.net') != -1:
adapter = twilighted.Twilighted(url)
elif url.find('twiwrite.net') != -1:
adapter = twiwrite.Twiwrite(url)
elif url.find('adastrafanfic.com') != -1:
adapter = adastrafanfic.Adastrafanfic(url)
elif url.find('whofic.com') != -1:
adapter = whofic.Whofic(url)
elif url.find('potionsandsnitches.net') != -1:
adapter = potionsNsnitches.PotionsNSnitches(url)
elif url.find('mediaminer.org') != -1:
adapter = mediaminer.MediaMiner(url)
else:
print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
sys.exit(1)
if bookFormat == 'epub':
writerClass = output.EPubFanficWriter
elif bookFormat == 'html':
writerClass = output.HTMLWriter
elif bookFormat == 'mobi':
writerClass = output.MobiWriter
elif bookFormat == 'text':
writerClass = output.TextWriter
if adapter.requiresLogin(url):
print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
sys.stdout.write("Can I haz ur login? ")
login = sys.stdin.readline().strip()
password = getpass.getpass(prompt='Can I haz ur password? ')
print("Login: `%s`, Password: `%s`" % (login, password))
adapter.setLogin(login)
adapter.setPassword(password)
loader = FanficLoader(adapter,
writerClass)
loader.setStandAlone(True)
if bookFormat != 'epub':
loader.setOverWrite(True)
try:
loader.download()
except FailedToDownload, ftd:
print >> sys.stderr, str(ftd)
sys.exit(2) # Error Downloading
except InvalidStoryURL, isu:
print >> sys.stderr, str(isu)
sys.exit(3) # Unknown Error
except StoryArchivedAlready, se:
print >> sys.stderr, str(se)
sys.exit(10) # Skipped
except StoryDoesNotExist, sdne:
print >> sys.stderr, str(sdne)
sys.exit(20) # Missing
except LoginRequiredException, lre:
print >> sys.stderr, str(lre)
sys.exit(30) # Missing
except Exception, e:
print >> sys.stderr, str(e)
sys.exit(99) # Unknown Error
sys.exit(0)
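## Example CLI invocation (hypothetical story id); Type is one of
## epub, html, mobi or text:
##   python downloader.py http://www.fanfiction.net/s/1234567/1/ epub
## Exit status is 0 on success; the except clauses above map failures
## to the other codes (2, 3, 10, 20, 30, 99).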

View file

@ -0,0 +1,293 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# epubmerge.py 1.0
# Copyright 2011, Jim Miller
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import getopt
import os
import zlib
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from time import time
from xml.dom.minidom import parse, parseString, getDOMImplementation
def usage():
print "epubmerge 1.0 Merges multiple epub format ebooks together"
print "\nUsage: " + sys.argv[0]+" [options] <input epub> [<input epub> ...]\n"
print " Options:"
print " -h --help"
print " -o <output file> --output=<output file> Default: merge.epub"
print " -t <output title> --title=<output title> Default: '<First Title> Anthology'"
print " -a <author name> --author=<author name> Default: <All authors from epubs>"
print " Multiple authors may be given."
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "t:a:o:h", ["title=","author=", "output=","help"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
if( len(args) < 1 ):
usage()
sys.exit()
outputopt = "merge.epub"
titleopt = None
authoropts = [] # list of strings
for o, a in opts:
if o in ("-h", "--help"):
usage()
sys.exit()
elif o in ("-t", "--title"):
titleopt = a
elif o in ("-a", "--author"):
authoropts.append(a)
elif o in ("-o", "--output"):
outputopt = a
else:
assert False, "unhandled option"
## Add .epub if not already there.
if( not outputopt.lower().endswith(".epub") ):
outputopt=outputopt+".epub"
print "output file: "+outputopt
## Write mimetype file, must be first and uncompressed.
## Older versions of python(2.4/5) don't allow you to specify
## compression by individual file.
## Overwrite if existing output file.
outputepub = ZipFile(outputopt, "w", compression=ZIP_STORED)
outputepub.debug = 3
outputepub.writestr("mimetype", "application/epub+zip")
outputepub.close()
## Re-open file for content.
outputepub = ZipFile(outputopt, "a", compression=ZIP_DEFLATED)
outputepub.debug = 3
## Create META-INF/container.xml file. The only thing it does is
## point to content.opf
containerdom = getDOMImplementation().createDocument(None, "container", None)
containertop = containerdom.documentElement
containertop.setAttribute("version","1.0")
containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
rootfiles = containerdom.createElement("rootfiles")
containertop.appendChild(rootfiles)
rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
"media-type":"application/oebps-package+xml"}))
outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))
## Process input epubs.
items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
## but it needs to be in the items manifest.
itemrefs = [] # list of strings -- idrefs from .opfs' spines
navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
booktitles = [] # list of strings -- Each book's title
allauthors = [] # list of lists of strings -- Each book's list of authors.
booknum=1
for filename in args:
print "input file: "+filename
book = "%d" % booknum
epub = ZipFile(filename, 'r')
## Find the .opf file.
container = epub.read("META-INF/container.xml")
containerdom = parseString(container)
rootfilenodelist = containerdom.getElementsByTagName("rootfile")
rootfilename = rootfilenodelist[0].getAttribute("full-path")
## Save the path to the .opf file--hrefs inside it are relative to it.
relpath = os.path.dirname(rootfilename)
if( len(relpath) > 0 ):
relpath=relpath+"/"
metadom = parseString(epub.read(rootfilename))
## Save indiv book title
booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
## Save authors.
authors=[]
for creator in metadom.getElementsByTagName("dc:creator"):
if( creator.getAttribute("opf:role") == "aut" ):
authors.append(creator.firstChild.data)
allauthors.append(authors)
for item in metadom.getElementsByTagName("item"):
if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
# The TOC file is the only one with this media type--as far as I know.
# grab the whole navmap, deal with it later.
tocdom = parseString(epub.read(relpath+item.getAttribute("href")))
for navpoint in tocdom.getElementsByTagName("navPoint"):
navpoint.setAttribute("id","a"+book+navpoint.getAttribute("id"))
for content in tocdom.getElementsByTagName("content"):
content.setAttribute("src",book+"/"+relpath+content.getAttribute("src"))
navmaps.append(tocdom.getElementsByTagName("navMap")[0])
else:
id="a"+book+item.getAttribute("id")
href=book+"/"+relpath+item.getAttribute("href")
href=href.encode('utf8')
items.append((id,href,item.getAttribute("media-type")))
outputepub.writestr(href,
epub.read(relpath+item.getAttribute("href")))
for itemref in metadom.getElementsByTagName("itemref"):
itemrefs.append("a"+book+itemref.getAttribute("idref"))
booknum=booknum+1;
## create content.opf file.
uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme.
contentdom = getDOMImplementation().createDocument(None, "package", None)
package = contentdom.documentElement
package.setAttribute("version","2.0")
package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
package.setAttribute("unique-identifier","epubmerge-id")
metadata=newTag(contentdom,"metadata",
attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
"xmlns:opf":"http://www.idpf.org/2007/opf"})
package.appendChild(metadata)
metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"}))
if( titleopt is None ):
titleopt = booktitles[0]+" Anthology"
metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))
# If cmdline authors, use those instead of those collected from the epubs
# (allauthors is kept for the TOC & description generation below.)
if( len(authoropts) > 1 ):
useauthors=[authoropts]
else:
useauthors=allauthors
usedauthors=dict()
for authorlist in useauthors:
for author in authorlist:
if( not usedauthors.has_key(author) ):
usedauthors[author]=author
metadata.appendChild(newTag(contentdom,"dc:creator",
attrs={"opf:role":"aut"},
text=author))
metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"}))
metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
# created now, but not filled in until TOC generation to save loops.
description = newTag(contentdom,"dc:description",text="Anthology containing:\n")
metadata.appendChild(description)
manifest = contentdom.createElement("manifest")
package.appendChild(manifest)
for item in items:
(id,href,type)=item
manifest.appendChild(newTag(contentdom,"item",
attrs={'id':id,
'href':href,
'media-type':type}))
spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
package.appendChild(spine)
for itemref in itemrefs:
spine.appendChild(newTag(contentdom,"itemref",
attrs={"idref":itemref,
"linear":"yes"}))
## create toc.ncx file
tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
ncx = tocncxdom.documentElement
ncx.setAttribute("version","2005-1")
ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
head = tocncxdom.createElement("head")
ncx.appendChild(head)
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:uid", "content":uniqueid}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:depth", "content":"1"}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:totalPageCount", "content":"0"}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:maxPageNumber", "content":"0"}))
docTitle = tocncxdom.createElement("docTitle")
docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt))
ncx.appendChild(docTitle)
tocnavMap = tocncxdom.createElement("navMap")
ncx.appendChild(tocnavMap)
## TOC navPoints can be nested, but this flattens them for
## simplicity, plus adds a navPoint for each epub.
booknum=0
for navmap in navmaps:
navpoints = navmap.getElementsByTagName("navPoint")
## Copy first navPoint of each epub, give a different id and
## text: bookname by authorname
newnav = navpoints[0].cloneNode(True)
newnav.setAttribute("id","book"+newnav.getAttribute("id"))
## For purposes of TOC titling & desc, use first book author
newtext = newTag(tocncxdom,"text",text=booktitles[booknum]+" by "+allauthors[booknum][0])
description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n"))
text = newnav.getElementsByTagName("text")[0]
text.parentNode.replaceChild(newtext,text)
tocnavMap.appendChild(newnav)
for navpoint in navpoints:
tocnavMap.appendChild(navpoint)
booknum=booknum+1;
## Force strict ordering of playOrder
playorder=1
for navpoint in tocncxdom.getElementsByTagName("navPoint"):
navpoint.setAttribute("playOrder","%d" % playorder)
if( not navpoint.getAttribute("id").startswith("book") ):
playorder = playorder + 1
## content.opf written now due to description being filled in
## during TOC generation to save loops.
outputepub.writestr("content.opf",contentdom.toxml('utf-8'))
outputepub.writestr("toc.ncx",tocncxdom.toxml('utf-8'))
outputepub.close()
## Utility method for creating new tags.
def newTag(dom,name,attrs=None,text=None):
tag = dom.createElement(name)
if( attrs is not None ):
for attr in attrs.keys():
tag.setAttribute(attr,attrs[attr])
if( text is not None ):
tag.appendChild(dom.createTextNode(text))
return tag
if __name__ == "__main__":
main()

368
fanficdownloader/ffnet.py Normal file
View file

@ -0,0 +1,368 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class FFNet(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.storyName = 'FF.Net story'
self.authorName = 'FF.Net author'
self.storyDescription = 'Fanfiction Story'
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
self.authorURL = self.path
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('FanFiction')
logging.debug('self.subjects=%s' % self.subjects)
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'FF.Net Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.outputName = ''
self.outputStorySep = '-ffnet_'
logging.debug('self.path=%s' % self.path)
if self.path.startswith('/'):
self.path = self.path[1:]
spl = self.path.split('/')
logging.debug('spl=%s' % spl)
if spl is not None:
if len(spl) > 0 and spl[0] != 's':
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
if len(spl) > 1:
self.storyId = spl[1]
if len(spl) > 2:
chapter = spl[1]
else:
chapter = '1'
if len(spl) == 5:
self.path = "/".join(spl[1:-1])
if self.path.endswith('/'):
self.path = self.path[:-1]
logging.debug('self.path=%s' % self.path)
if self.host is not None and self.host == "m.fanfiction.net":
self.host = "www.fanfiction.net"
logging.debug('self.host=%s' % self.host)
self.url = "http://" + self.host + "/" + self.path
logging.debug('self.url=%s' % self.url)
logging.debug('self.storyId=%s' % self.storyId)
if not self.appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
self.opener = None
logging.debug("Created FF.Net: url=%s" % (self.url))
def _getLoginScript(self):
return self.path
def _getVarValue(self, varstr):
#logging.debug('_getVarValue varstr=%s' % varstr)
vals = varstr.split('=')
#logging.debug('vals=%s' % vals)
retstr="".join(vals[+1:])
#logging.debug('retstr=%s' % retstr)
if retstr.startswith(' '):
retstr = retstr[1:]
if retstr.endswith(';'):
retstr = retstr[:-1]
return retstr
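## Illustrative example of the parsing above:
##   _getVarValue("var words = 95673;") -> '95673'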
def _splitCrossover(self, subject):
if "Crossover" in subject:
self.addSubject ("Crossover")
logging.debug('Crossover=%s' % subject)
if subject.find(' and ') != -1:
words = subject.split(' ')
logging.debug('words=%s' % words)
subj = ''
for s in words:
if s in "and Crossover":
if len(subj) > 0:
self.addSubject(subj)
subj = ''
else:
if len(subj) > 0:
subj = subj + ' '
subj = subj + s
if len(subj) > 0:
self.addSubject(subj)
else:
self.addSubject(subject)
else:
self.addSubject(subject)
return True
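## Illustrative example (made-up category):
##   _splitCrossover('Harry Potter and Twilight Crossover') adds the
##   subjects 'Crossover', 'Harry Potter' and 'Twilight'.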
def _splitGenre(self, subject):
if len(subject) > 0:
words = subject.split('/')
logging.debug('words=%s' % words)
for subj in words:
if len(subj) > 0:
self.addSubject(subj)
return True
def extractIndividualUrls(self):
data = ''
try:
data = self.fetchUrl(self.url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
d2 = re.sub('&\#[0-9]+;', ' ', data)
soup = None
try:
soup = bs.BeautifulStoneSoup(d2)
except:
logging.error("Failed to decode: <%s>" % d2)
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
allA = soup.findAll('a')
for a in allA:
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
self.authorName = a.string
(u1, u2, self.authorId, u3) = a['href'].split('/')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
urls = []
lines = data.split('\n')
for l in lines:
if l.find("&#187;") != -1 and l.find('<b>') != -1:
s2 = bs.BeautifulStoneSoup(l)
self.storyName = unicode(s2.find('b').string)
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
elif l.find("<a href='/u/") != -1:
s2 = bs.BeautifulStoneSoup(l)
self.authorName = unicode(s2.a.string)
(u1, u2, self.authorId, u3) = s2.a['href'].split('/')
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
elif l.find("Rated: <a href=") != -1:
if "Complete" in l:
self.storyStatus = 'Completed'
else:
self.storyStatus = 'In-Progress'
s2 = bs.BeautifulStoneSoup(l)
self.storyRating = unicode(s2.a.string).strip()
logging.debug('self.storyRating=%s' % self.storyRating)
logging.debug('s2.a=%s' % s2.a)
s3 = l.split(' - ')
logging.debug('s3=%s' % s3)
if len(s3) > 1:
if s3[1].find("Reviews: <a href=") != -1:
continue
self.language = s3[1].strip()
logging.debug('self.language=%s' % self.language)
if len(s3) > 2:
if s3[2].find("Reviews: <a href=") != -1:
continue
self.genre = s3[2].strip()
if "&" in self.genre:
self.genre = ''
continue
logging.debug('self.genre=%s' % self.genre)
self._splitGenre(self.genre)
logging.debug('self.subjects=%s' % self.subjects)
elif l.find("<SELECT title='chapter navigation'") != -1:
if len(urls) > 0:
continue
try:
u = l.decode('utf-8')
except UnicodeEncodeError, e:
u = l
except:
u = l.encode('ascii', 'xmlcharrefreplace')
u = re.sub('&\#[0-9]+;', ' ', u)
s2 = bs.BeautifulSoup(u)
options = s2.findAll('option')
for o in options:
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value']
title = o.string
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
urls.append((url,title))
elif l.find("var chapters") != -1:
self.numChapters = self._getVarValue (l)
logging.debug('self.numChapters=%s' % self.numChapters)
elif l.find("var words") != -1:
self.numWords = self._getVarValue (l)
logging.debug('self.numWords=%s' % self.numWords)
elif l.find("var categoryid") != -1:
categoryid = self._getVarValue (l)
logging.debug('categoryid=%s' % categoryid)
elif l.find("var cat_title") != -1:
self.category = self._getVarValue (l).strip("'")
logging.debug('self.category=%s' % self.category)
self._splitCrossover(self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif l.find("var summary") != -1:
self.storyDescription = self._getVarValue (l).strip("'")
if '&' in self.storyDescription:
s = self.storyDescription.split('&')
logging.debug('s=%s' % s)
self.storyDescription = ''
for ss in s:
if len(self.storyDescription) > 0:
if len(ss) > 4 and 'amp;' in ss[1:4]:
self.storyDescription = self.storyDescription + '&' + ss
else:
self.storyDescription = self.storyDescription + '&amp;' + ss
else:
self.storyDescription = ss
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace('\n',' ').replace('\r',''))
elif l.find("var datep") != -1:
dateps = self._getVarValue (l)
self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
elif l.find("var dateu") != -1:
dateus = self._getVarValue (l)
self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
if len(urls) <= 0:
# no chapters found, try url by itself.
urls.append((self.url,self.storyName))
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
#logging.debug('urls=%s' % urls)
return urls
def getText(self, url):
data = None
# try up to three times, with longer sleeps first.
for sleeptime in [0.5, 4, 9]:
time.sleep(sleeptime)
try:
logging.debug("Fetching URL: %s sleeptime: %f" % (url, sleeptime))
data = self.fetchUrl(url)
if data is not None:
break
except Exception, e:
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
logging.error("Data downloaded: <%s>" % data)
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
lines = data.split('\n')
textbuf = ''
emit = False
olddata = data
try:
data = data.decode('utf8')
except:
data = olddata
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.debug(data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
if "Story Not Found" in data:
logging.info("Story not Found at %s" % url)
raise FailedToDownload("Story not Found at %s" % url)
logging.debug(data)
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class FFA_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testChaptersAuthStory(self):
f = FFNet('http://www.fanfiction.net/s/5257563/1')
f.extractIndividualUrls()
self.assertEquals('Beka0502', f.getAuthorName())
self.assertEquals("Draco's Redemption", f.getStoryName())
def testChaptersCountNames(self):
f = FFNet('http://www.fanfiction.net/s/5257563/1')
urls = f.extractIndividualUrls()
self.assertEquals(10, len(urls))
def testGetText(self):
url = 'http://www.fanfiction.net/s/5257563/1'
f = FFNet(url)
text = f.getText(url)
self.assertTrue(text.find('He was just about to look at some photos when he heard a crack') != -1)
def testBrokenWands(self):
url = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
f = FFNet(url)
text = f.getText(url)
urls = f.extractIndividualUrls()
def testFictionPress(self):
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
f = FFNet(url)
urls = f.extractIndividualUrls()
self.assertEquals('Behind This Facade', f.getStoryName())
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
text = f.getText(url)
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import logging
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import cookielib as cl
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
import datetime
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
logging.debug('self.host=%s' % self.host)
logging.debug('self.path=%s' % self.path)
cookieproc = u2.HTTPCookieProcessor()
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
cookie = cl.Cookie(version=0, name='fauser', value='wizard',
port=None, port_specified=False,
domain='www.fictionalley.org', domain_specified=False, domain_initial_dot=False,
path='/authors', path_specified=True,
secure=False,
expires=time.time()+10000,
discard=False,
comment=None,
comment_url=None,
rest={'HttpOnly': None},
rfc2109=False)
cookieproc.cookiejar.set_cookie(cookie)
self.opener = u2.build_opener(cookieproc)
ss = self.path.split('/')
self.storyDescription = 'Fanfiction Story'
self.authorId = ''
self.authorURL = ''
self.storyId = ''
if len(ss) > 2 and ss[1] == 'authors':
self.authorId = ss[2]
self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
if len(ss) > 3:
self.storyId = ss[3].replace ('.html','')
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Harry Potter')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = 'Harry Potter'
self.storyStatus = 'Unknown' # fictionalley doesn't give us in-progress/completed anywhere.
self.storyRating = 'K'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.storyName = ''
self.outputName = ''
self.outputStorySep = '-fa_'
def getPasswordLine(self):
return 'opaopapassword'
def getLoginScript(self):
return 'opaopaloginscript'
def getLoginPasswordOthers(self):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def _processChapterHeaders(self, div):
brs = div.findAll ('br')
for br in brs:
keystr=''
valstr=''
if len(br.contents) > 2:
keystr = br.contents[1]
if keystr is not None:
strs = re.split ("<[^>]+>", unicode(keystr))
keystr=''
for s in strs:
keystr = keystr + s
valstr = br.contents[2].strip(' ')
if keystr is not None:
if keystr == 'Rating:':
self.storyRating = valstr
logging.debug('self.storyRating=%s' % self.storyRating)
elif keystr == 'Genre:':
self.genre = valstr
logging.debug('self.genre=%s' % self.genre)
s2 = valstr.split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif keystr == 'Main Character(s):':
s2 = valstr.split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif keystr == 'Summary:':
self.storyDescription = valstr
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
def extractIndividualUrls(self):
data = ''
try:
data = self.opener.open(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
# There is some useful information in the headers of the first chapter page.
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
if breadcrumbs is not None:
# Be aware that this means that the user has entered the {STORY}01.html
# We will not have valid Published and Updated dates. User should enter
# the {STORY}.html instead. We should force that instead of this.
#logging.debug('breadcrumbs=%s' % breadcrumbs )
bcas = breadcrumbs.findAll('a')
#logging.debug('bcas=%s' % bcas )
if bcas is not None and len(bcas) > 1:
bca = bcas[1]
#logging.debug('bca=%s' % bca )
if 'href' in bca._getAttrMap():
#logging.debug('bca.href=%s' % bca['href'] )
url = unicode(bca['href'])
if url is not None and len(url) > 0:
self.url = url
logging.debug('self.url=%s' % self.url )
ss = self.url.split('/')
self.storyId = ss[-1].replace('.html','')
self.storyName = bca.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
data = self.opener.open(self.url).read()
# There is some useful information in the headers of the first chapter page.
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = bs.BeautifulStoneSoup(data)
# If it is decided that we really do care about the number of words: it's only available on the author's page.
#d0 = self.opener.open(self.authorURL).read()
#soupA = bs.BeautifulStoneSoup(d0)
#dls = soupA.findAll('dl')
#logging.debug('dls=%s' % dls)
# Get title from <title>, remove before '-'.
if len(self.storyName) == 0:
title = soup.find('title').string
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
links = soup.findAll('li')
self.numChapters = 0;
result = []
if len(links) == 0:
# Be aware that this means that the user has entered the {STORY}01.html
# We will not have valid Published and Updated dates. User should enter
# the {STORY}.html instead. We should force that instead of this.
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
result.append((self.url,self.storyName))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
self.numChapters = self.numChapters + 1;
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
else:
author = soup.find('h1', {'class' : 'title'})
self.authorName = author.a.string
summary = soup.find('div', {'class' : 'summary'})
ss = summary.contents
if len(ss) > 1:
ss1 = ss[0].split(': ')
if len(ss1) > 1 and ss1[0] == 'Rating':
self.storyRating = ss1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
self.storyDescription = unicode(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
for li in links:
a = li.find('a', {'class' : 'chapterlink'})
s = li.contents
if a is not None:
url = a['href']
title = a.string
result.append((url,title))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
if self.numChapters == 0:
# fictionalley uses full URLs in chapter list.
d1 = self.opener.open(url).read()
# find <!-- headerstart --> & <!-- headerend --> and
# replaced with matching div pair for easier parsing.
# Yes, it's an evil kludge, but what can ya do? Using
# something other than div prevents soup from pairing
# our div with poor html inside the story text.
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
sop = bs.BeautifulStoneSoup(d1)
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
self.numChapters = self.numChapters + 1
if len(s) > 1:
datestr=''
ss2 = s[1].replace('\n','').replace('(','').split(' ')
if len(ss2) > 2 and ss2[0] == 'Posted:':
datestr = ss2[1] + ' ' + ss2[2]
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
if self.numChapters == 1:
self.storyPublished = tmpdate
self.storyUpdated = tmpdate
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
else:
logging.debug('li chapterlink not found! li=%s' % li)
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
return result
def getText(self, url):
# fictionalley uses full URLs in chapter list.
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
# find <!-- headerend --> & <!-- footerstart --> and
# replaced with matching div pair for easier parsing.
# Yes, it's an evil kludge, but what can ya do? Using
# something other than div prevents soup from pairing
# our div with poor html inside the story text.
data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
html = soup.findAll('html')
if len(html) > 1:
return html[1].__str__('utf8')
else:
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
if __name__ == '__main__':
url = 'http://www.fictionalley.org/authors/drt/DA.html'
fw = FictionAlley(url)
urls = fw.extractIndividualUrls()
pp.pprint(urls)
print(fw.getText(urls[0][0]))

257
fanficdownloader/ficwad.py Normal file
View file

@ -0,0 +1,257 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
import time
import datetime
from adapter import *
class FicWad(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
self.host = up.urlparse(url).netloc
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-fw_'
def getPasswordLine(self):
return 'opaopapassword'
def getLoginScript(self):
return 'opaopaloginscript'
def getLoginPasswordOthers(self):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def extractIndividualUrls(self):
oldurl = ''
cururl = self.url
data = ''
try:
data = u2.urlopen(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
story = soup.find('div', {'id' : 'story'})
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# last of crumbtrail
storyinfo = allAhrefs[-1]
(u0, u1, storyid) = storyinfo['href'].split('/')
if u1 == "story":
# This page does not have the correct information on it. Need to get the Story Title Page.
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
oldurl = self.url
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
data = u2.urlopen(self.url).read()
soup = bs.BeautifulStoneSoup(data)
story = soup.find('div', {'id' : 'story'})
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# save chapter name from header in case of one-shot.
storyinfo = story.find('h4').find('a')
(u0, u1, self.storyId) = storyinfo['href'].split('/')
self.storyName = storyinfo.string.strip()
logging.debug('self.storyName=%s, self.storyId=%s' % (self.storyName, self.storyId))
author = soup.find('span', {'class' : 'author'})
self.authorName = unicode(author.a.string)
(u0, u1,self.authorId) = author.a['href'].split('/')
self.authorURL = 'http://' + self.host + author.a['href']
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
description = soup.find('blockquote', {'class' : 'summary'})
if description is not None:
self.storyDescription = unicode(description.p.string)
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace('\n',' ').replace('\r',''))
meta = soup.find('p', {'class' : 'meta'})
if meta is not None:
logging.debug('meta.s pre=%s' % meta.__str__('utf8'))
s = re.sub('<[^>]+>','',unicode(meta)).replace('\n',' ').replace('\t','').split(' - ')
#logging.debug('meta.s post=%s' % s)
for ss in s:
s1 = ss.replace('&nbsp;','').split(':')
#logging.debug('ss=%s' % ss)
if len(s1) > 1:
skey = s1[0].strip()
#logging.debug('Checking = %s' % skey)
if skey == 'Category':
# ficwad doesn't allow multiple categories.
self.category = unicode(s1[1])
logging.debug('self.category=%s' % self.category)
self.addSubject(self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Rating':
self.storyRating = s1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
elif skey == 'Genres':
self.genre = s1[1]
logging.debug('self.genre=%s' % self.genre)
s2 = s1[1].split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Characters':
s2 = s1[1].split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif skey == 'Chapters':
self.numChapters = s1[1]
logging.debug('self.numChapters=%s' % self.numChapters)
elif skey == 'Warnings':
logging.debug('Warnings=%s' % s1[1])
elif skey == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif skey == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
if ss == 'Complete' :
self.storyStatus = 'Completed'
elif ss.endswith('words'):
self.numWords=ss.replace('words','').replace('&nbsp;','')
logging.debug('self.numWords=%s' % self.numWords)
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
result = []
ii = 1
if oldurl is not None and len(oldurl) > 0:
logging.debug('Switching back to %s' % oldurl)
cururl = oldurl
data = u2.urlopen(oldurl).read()
soup = bs.BeautifulStoneSoup(data)
storylist = soup.find('ul', {'id' : 'storylist'})
if storylist is not None:
allBlocked = storylist.findAll('li', {'class' : 'blocked'})
if allBlocked is not None:
#logging.debug('allBlocked=%s' % allBlocked)
raise FailedToDownload("Are you sure %s is a chapter URL(not the chapter list)?"%cururl)
raise LoginRequiredException(cururl)
allH4s = storylist.findAll('h4')
#logging.debug('allH4s=%s' % allH4s)
if allH4s is not None:
for h4 in allH4s:
chapterinfo = h4.find('a')
#logging.debug('Chapter1=%s' % chapterinfo)
url = 'http://' + self.host + chapterinfo['href']
title = chapterinfo.string.strip()
#logging.debug('Chapter=%s, %s' % (url, title))
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
if ii == 1:
select = soup.find('select', { 'name' : 'goto' } )
if select is None:
self.numChapters = '1'
logging.debug('self.numChapters=%s' % self.numChapters)
result.append((self.url,self.storyName))
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = 'http://' + self.host + o['value']
title = o.string
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = ''
try:
data = u2.urlopen(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
if __name__ == '__main__':
url = 'http://www.ficwad.com/story/14536'
fw = FicWad(url)
urls = fw.extractIndividualUrls()
pp.pprint(urls)
print(fw.getText(urls[0][0]))

301
fanficdownloader/fpcom.py Normal file
View file

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class FPCom(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.storyName = ''
self.authorName = ''
self.storyDescription = ''
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
self.authorURL = self.path
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = ''
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.outputName = ''
self.outputStorySep = '-fpcom_'
if self.path.startswith('/'):
self.path = self.path[1:]
spl = self.path.split('/')
if spl is not None:
if len(spl) > 0 and spl[0] != 's':
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
if len(spl) > 1:
self.storyId = spl[1]
if len(spl) > 2:
chapter = spl[1]
else:
chapter = '1'
if len(spl) == 5:
self.path = "/".join(spl[1:-1])
if self.path.endswith('/'):
self.path = self.path[:-1]
logging.debug('self.path=%s' % self.path)
if not self.appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
self.opener = None
logging.debug("Created FP.Com: url=%s" % (self.url))
def _getLoginScript(self):
return self.path
def _processInfoLine(self, line):
have_lang = False
words = line.split(' - ')
if words is not None:
for word in words:
if word.find(':') != -1:
sds = word.split(': ')
if sds is not None and len(sds) > 1:
if sds[0] == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
elif sds[0] == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif sds[0] == 'Reviews':
reviews = sds[1]
logging.debug('reviews=%s' % reviews)
elif word.find('Complete') != -1:
self.storyStatus = 'Completed'
logging.debug('self.storyStatus=%s' % self.storyStatus)
elif not have_lang:
have_lang = True
language = word
logging.debug('language=%s' % language)
else:
self.category = word
logging.debug('self.category=%s' % self.category)
sgs = self.category.split('/')
for sg in sgs:
self.addSubject(sg)
logging.debug('self.subjects=%s' % self.subjects)
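## Illustrative example (made-up info line):
##   'English - Romance/Humor - Updated: 01-15-10 - Published: 12-01-09 - Complete'
##   sets category 'Romance/Humor' (subjects Romance, Humor), parses the
##   Updated/Published dates, and marks storyStatus 'Completed'.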
def extractIndividualUrls(self):
data = ''
try:
data = self.fetchUrl(self.url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
d2 = re.sub('&\#[0-9]+;', ' ', data)
soup = None
try:
soup = bs.BeautifulStoneSoup(d2)
except:
logging.error("Failed to decode: <%s>" % d2)
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
allA = soup.findAll('a')
for a in allA:
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
self.authorName = a.string
(u1, u2, self.authorId, u3) = a['href'].split('/')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
urls = []
metas = soup.findAll ('meta', {'name' : 'description'})
if metas is not None:
for meta in metas:
if 'content' in meta._getAttrMap():
self.storyDescription = unicode(meta['content'])
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
title=meta.find('title')
logging.debug('title=%s' % title.string)
tt = title.string.split(',')
if tt is not None:
if len(tt) > 0:
self.storyName = tt[0]
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
if len(tt) > 1:
tt1 = tt[1].split(' - ')
if tt1 is not None and len(tt1) > 0:
self.category = tt1[0].strip()
logging.debug('self.category=%s' % self.category)
cc = self.category.split(' ')
for cc1 in cc:
if cc1 is not None and cc1 != 'a':
if cc1 == 'fanfic':
self.addSubject('FanFiction')
else:
self.addSubject(cc1)
logging.debug('self.subjects=%s' % self.subjects)
numchapters = 0
urlstory = ''
fidochap = soup.find('form', {'name':'fidochap'})
sl = fidochap.find('select', {'title':'chapter navigation'})
if sl is not None:
logging.debug('sl=%s' % sl )
if 'onchange' in sl._getAttrMap():
ocs = sl['onchange'].split('\'')
logging.debug('ocs=%s' % ocs)
if ocs is not None and len(ocs) > 3:
urlstory = ocs[3]
logging.debug('urlstory=%s' % urlstory)
opts = sl.findAll('option')
for o in opts:
if 'value' in o._getAttrMap():
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value'] + urlstory
logging.debug('URL=%s, Title=%s' % (url, o.string))
urls.append((url, o.string))
numchapters = numchapters + 1
if numchapters == 0:
numchapters = 1
url = 'http://' + self.host + '/s/' + self.storyId + '/1' + urlstory
logging.debug('URL=%s, Title=%s' % (url, self.storyName))
urls.append((url, self.storyName))
self.numChapters = unicode(numchapters)
logging.debug('self.numChapters=%s' % self.numChapters)
logging.debug('urls=%s' % urls)
self.genre = ''
tds = fidochap.findAll('td')
for td in tds:
tdb = td.find('b')
if tdb is not None and tdb.string == self.storyName:
tdas = td.findAll('a')
for tda in tdas:
ss = tda.string
if ss is not None:
if len(self.genre) > 0:
self.genre = self.genre + ', '
self.genre = self.genre + ss
self.addSubject(ss)
logging.debug('self.genre=%s' % self.genre)
logging.debug('self.subjects=%s' % self.subjects)
tda = td.find ('a')
if tda is not None and tda.string.find('Rated:') != -1:
tdas = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
if tdas is not None:
ll = len(tdas)
if ll > 2:
ss = tdas[2].split(': ')
if ss is not None and len(ss) > 1:
self.storyRating = ss[1]
logging.debug('self.storyRating=%s' % self.storyRating)
if ll > 3:
self._processInfoLine (tdas[3])
if ll > 5:
self._processInfoLine (tdas[5])
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
return urls
def getText(self, url):
# time.sleep( 2.0 )
data = ''
try:
data = self.fetchUrl(url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
lines = data.split('\n')
textbuf = ''
emit = False
olddata = data
try:
data = data.decode('utf8')
except:
data = olddata
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class FPC_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testFictionPress(self):
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
f = FPCom(url)
urls = f.extractIndividualUrls()
self.assertEquals('Behind This Facade', f.getStoryName())
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
text = f.getText(url)
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,280 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class HPFiction(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
logging.debug('self.url=%s' % self.url)
logging.debug('self.host=%s' % self.host)
logging.debug('self.path=%s' % self.path)
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.chapurl = False
self.storyId = '0'
sss = self.url.split('?')
logging.debug('sss=%s' % sss)
if sss is not None and len(sss) > 1:
sc = sss[1].split('=')
logging.debug('sc=%s' % sc)
if sc is not None and len(sc) > 1:
if sc[0] == 'chapterid':
self.chapurl = True
elif sc[0] == 'psid' or sc[0] == 'sid':
self.storyId = sc[1]
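## Worked example (made-up ids):
##   'http://www.harrypotterfanfiction.com/viewstory.php?psid=12345' sets storyId '12345';
##   'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=67890' only sets
##   chapurl = True, and the story id is then resolved in extractIndividualUrls().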
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Harry Potter')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-hp_'
logging.debug("Created HPFiction: url=%s" % (self.url))
def _getLoginScript(self):
return self.path
def extractIndividualUrls(self):
data = ''
try:
data = self.opener.open(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
links = soup.findAll('a')
def_chapurl = ''
def_chaptitle = ''
if self.chapurl:
foundid = False
for a in links:
if a['href'].find('psid') != -1:
sp = a['href'].split('?')
if sp is not None and len(sp) > 1:
for sp1 in sp:
if sp1.find('psid') != -1:
ps = sp1.split('=')
if ps is not None and len(ps) > 1:
self.storyId = ps[1].replace('\'','')
foundid = True
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
break
if foundid:
self.url = "http://" + self.host + "/viewstory.php?psid=" + self.storyId
logging.debug('Title Page URL=%s' % self.url)
data1 = self.opener.open(self.url).read()
hdrsoup = bs.BeautifulSoup(data1)
else:
hdrsoup = soup
else:
hdrsoup = soup
for a in links:
if not self.chapurl and a['href'].find('psid') != -1:
sp = a['href'].split('?')
if sp is not None and len(sp) > 1:
for sp1 in sp:
if sp1.find('psid') != -1:
ps = sp1.split('=')
if ps is not None and len(ps) > 1:
self.storyId = ps[1].replace('\'','')
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
self.authorURL = 'http://' + self.host + '/' + a['href']
(u1, self.authorId) = a['href'].split('=')
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
def_chapurl = 'http://' + self.host + '/viewstory.php' + unicode(a['href'])
def_chaptitle = a.string
logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
centers = hdrsoup.findAll('center')
for center in centers:
tds = center.findAll ('td')
if tds is not None and len(tds) > 0:
for td in tds:
s = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
ii = 0
ll = len(s)
sss = ''
while ii < ll - 1:
if s[ii] is not None and len(s[ii]) > 0:
if s[ii] == 'Rating:':
self.storyRating = s[ii+1]
logging.debug('self.storyRating=%s' % self.storyRating)
ii = ii + 2
elif s[ii] == 'Chapters:':
self.numChapters = s[ii+1]
logging.debug('self.numChapters=%s' % self.numChapters)
ii = ii + 2
elif s[ii] == 'Characters:':
s2 = s[ii+1].split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
ii = ii + 2
elif s[ii] == 'Genre(s):':
self.genre = s[ii+1]
logging.debug('self.genre=%s' % self.genre)
s2 = s[ii+1].split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
ii = ii + 2
elif s[ii] == 'Status:':
if s[ii+1].strip(' ') == "Work In Progress":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii + 2
elif s[ii] == 'First Published:':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
ii = ii + 2
elif s[ii] == 'Last Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
ii = ii + 2
elif s[ii] == 'Last Published Chapter:':
ii = ii + 2
elif s[ii] == 'Pairings:':
ii = ii + 2
elif s[ii] == 'Warnings:':
ii = ii + 2
else:
sss = sss + ' ' + s[ii]
ii = ii + 1
else:
ii = ii + 1
self.storyDescription = sss
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
urls = []
select = soup.find('select', {'name' : 'chapterid'})
if select is None:
# no chapters found, try url by itself.
if len(def_chapurl) > 0:
urls.append((def_chapurl, def_chaptitle))
else:
urls.append((self.url,self.storyName))
else:
for o in select.findAll('option'):
if 'value' in o._getAttrMap():
url = 'http://' + self.host + self.path + o['value']
title = o.string
if title != "Story Index":
urls.append((url,title))
return urls
def getText(self, url):
logging.debug('Downloading from URL: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
divtext = soup.find('div', {'id' : 'fluidtext'})
if None == divtext:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return divtext.__str__('utf8')
class FF_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testChaptersAuthStory(self):
f = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
urls = f.extractIndividualUrls()
self.assertEquals(49, len(urls))
self.assertEquals('Elisha', f.getAuthorName())
self.assertEquals('A Secret Thought', f.getStoryName())
def testGetText(self):
url = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'
f = HPFiction(url)
#urls = f.extractIndividualUrls()
text = f.getText(url)
self.assertTrue(text.find('She pulled out of his arms and felt the subtle regret') != -1)
if __name__ == '__main__':
unittest.main()

126
fanficdownloader/html.py Normal file


@ -0,0 +1,126 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
import re
import sys
import StringIO
import urllib
from BeautifulSoup import BeautifulSoup
class HtmlProcessor:
WHITESPACE_RE = re.compile(r'\s')
# Look for </blockquote <p>
BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)
def __init__(self, html, unfill=0):
self.unfill = unfill
html = self._ProcessRawHtml(html)
self._soup = BeautifulSoup(html)
if self._soup.title:
self.title = self._soup.title.contents[0]
else:
self.title = None
def _ProcessRawHtml(self, html):
new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
if count:
print >>sys.stderr, 'Replaced %d bad tags' % count
return new_html
def _StubInternalAnchors(self):
'''Replace each internal anchor with a fixed-size filepos anchor.
Looks for every anchor with <a href="#myanchor"> and replaces that
with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
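# Illustrative sketch of the transformation (example markup is assumed, not
# taken from the original source):
#   <a href="#chapter3">Chapter 3</a>   becomes   <a filepos="0000000000">Chapter 3</a>
# and (0, '#chapter3') is recorded so _ReplaceAnchorStubs() can later overwrite
# the placeholder with the anchor's real byte offset in the assembled document.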
self._anchor_references = []
anchor_num = 0
# anchor links
anchorlist = self._soup.findAll('a', href=re.compile('^#'))
# treat reference tags like a tags for TOCTOP.
anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
for anchor in anchorlist:
self._anchor_references.append((anchor_num, anchor['href']))
del anchor['href']
anchor['filepos'] = '%.10d' % anchor_num
anchor_num += 1
def _ReplaceAnchorStubs(self):
# TODO: Browsers allow extra whitespace in the href names.
# use __str__ instead of prettify--it inserts extra spaces.
assembled_text = self._soup.__str__('utf8')
del self._soup # shouldn't touch this anymore
for anchor_num, original_ref in self._anchor_references:
ref = urllib.unquote(original_ref[1:]) # remove leading '#'
# Find the position of ref in the utf-8 document.
# TODO(chatham): Using regexes and looking for name= would be better.
newpos = assembled_text.rfind(ref.encode('utf-8'))
if newpos == -1:
print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
continue
newpos += len(ref) + 2 # don't point into the middle of the <a name> tag
old_filepos = 'filepos="%.10d"' % anchor_num
new_filepos = 'filepos="%.10d"' % newpos
assert assembled_text.find(old_filepos) != -1
assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
return assembled_text
def _FixPreTags(self):
'''Replace <pre> tags with HTML-ified text.'''
pres = self._soup.findAll('pre')
for pre in pres:
pre.replaceWith(self._FixPreContents(str(pre.contents[0])))
def _FixPreContents(self, text):
if self.unfill:
line_splitter = '\n\n'
line_joiner = '<p>'
else:
line_splitter = '\n'
line_joiner = '<br>'
lines = []
for line in text.split(line_splitter):
lines.append(self.WHITESPACE_RE.subn('&nbsp;', line)[0])
return line_joiner.join(lines)
def _RemoveUnsupported(self):
'''Remove any tags which the kindle cannot handle.'''
# TODO(chatham): <link> tags to script?
unsupported_tags = ('script', 'style')
for tag_type in unsupported_tags:
for element in self._soup.findAll(tag_type):
element.extract()
def RenameAnchors(self, prefix):
'''Rename every internal anchor to have the given prefix, then
return the contents of the body tag.'''
for anchor in self._soup.findAll('a', href=re.compile('^#')):
anchor['href'] = '#' + prefix + anchor['href'][1:]
for a in self._soup.findAll('a'):
if a.get('name'):
a['name'] = prefix + a['name']
# TODO(chatham): figure out how to fix this. sometimes body comes out
# as NoneType.
content = []
if self._soup.body is not None:
content = [unicode(c) for c in self._soup.body.contents]
return '\n'.join(content)
def CleanHtml(self):
# TODO(chatham): fix_html_br, fix_html
self._RemoveUnsupported()
self._StubInternalAnchors()
self._FixPreTags()
return self._ReplaceAnchorStubs()
if __name__ == '__main__':
FILE ='/tmp/documentation.html'
#FILE = '/tmp/multipre.html'
FILE = '/tmp/view.html'
import codecs
d = open(FILE).read()
h = HtmlProcessor(d)
s = h.CleanHtml()
#print s


@ -0,0 +1,452 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.37"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
import urlparse
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
try: from textwrap import wrap
except: pass
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False
### Entity Nonsense ###
def name2cp(k):
if k == 'apos': return ord("'")
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
return htmlentitydefs.name2codepoint[k]
else:
k = htmlentitydefs.entitydefs[k]
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
return ord(codecs.latin_1_decode(k)[0])
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
if name[0] in ['x','X']:
c = int(name[1:], 16)
else:
c = int(name)
if not UNICODE_SNOB and c in unifiable_n.keys():
return unifiable_n[c]
else:
return unichr(c)
def entityref(c):
if not UNICODE_SNOB and c in unifiable.keys():
return unifiable[c]
else:
try: name2cp(c)
except KeyError: return "&" + c
else: return unichr(name2cp(c))
def replaceEntities(s):
s = s.group(1)
if s[0] == "#":
return charref(s[1:])
else: return entityref(s)
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
return r_unescape.sub(replaceEntities, s)
def fixattrs(attrs):
# Fix bug in sgmllib.py
if not attrs: return attrs
newattrs = []
for attr in attrs:
newattrs.append((attr[0], unescape(attr[1])))
return newattrs
### End Entity Nonsense ###
def onlywhite(line):
"""Return true if the line does only consist of whitespace characters."""
for c in line:
if c is not ' ' and c is not ' ':
return c is ' '
return line
def optwrap(text):
"""Wrap all paragraphs in the provided text."""
if not BODY_WIDTH:
return text
assert wrap, "Requires Python 2.3."
result = ''
newlines = 0
for para in text.split("\n"):
if len(para) > 0:
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
for line in wrap(para, BODY_WIDTH):
result += line + "\n"
result += "\n"
newlines = 2
else:
if not onlywhite(para):
result += para + "\n"
newlines = 1
else:
if newlines < 2:
result += "\n"
newlines += 1
return result
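# Illustrative behaviour (assumes BODY_WIDTH is left at its default of 78):
#   optwrap(u"word " * 40) re-wraps the long prose line at 78 columns, while a
#   paragraph whose first character is ' ', '-' or '*' (code blocks, list items)
#   is passed through unchanged by the branch above.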
def hn(tag):
if tag[0] == 'h' and len(tag) == 2:
try:
n = int(tag[1])
if n in range(1, 10): return n
except ValueError: return 0
class _html2text(sgmllib.SGMLParser):
def __init__(self, out=None, baseurl=''):
sgmllib.SGMLParser.__init__(self)
if out is None: self.out = self.outtextf
else: self.out = out
self.outtext = u''
self.quiet = 0
self.p_p = 0
self.outcount = 0
self.start = 1
self.space = 0
self.a = []
self.astack = []
self.acount = 0
self.list = []
self.blockquote = 0
self.pre = 0
self.startpre = 0
self.lastWasNL = 0
self.abbr_title = None # current abbreviation definition
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
def outtextf(self, s):
self.outtext += s
def close(self):
sgmllib.SGMLParser.close(self)
self.pbr()
self.o('', 0, 'end')
return self.outtext
def handle_charref(self, c):
self.o(charref(c))
def handle_entityref(self, c):
self.o(entityref(c))
def unknown_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
def unknown_endtag(self, tag):
self.handle_tag(tag, None, 0)
def previousIndex(self, attrs):
""" returns the index of certain set of attributes (of a link) in the
self.a list
If the set of attributes is not found, returns None
"""
if not attrs.has_key('href'): return None
i = -1
for a in self.a:
i += 1
match = 0
if a.has_key('href') and a['href'] == attrs['href']:
if a.has_key('title') or attrs.has_key('title'):
if (a.has_key('title') and attrs.has_key('title') and
a['title'] == attrs['title']):
match = True
else:
match = True
if match: return i
def handle_tag(self, tag, attrs, start):
attrs = fixattrs(attrs)
if hn(tag):
self.p()
if start: self.o(hn(tag)*"#" + ' ')
if tag in ['p', 'div']: self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
if tag in ["head", "style", 'script']:
if start: self.quiet += 1
else: self.quiet -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
if tag == "abbr":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
if tag == "a":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
self.o("][" + `a['count']` + "]")
if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
if tag in ["ol", "ul"]:
if start:
self.list.append({'name':tag, 'num':0})
else:
if self.list: self.list.pop()
self.p()
if tag == 'li':
if start:
self.pbr()
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.start = 1
else:
self.pbr()
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
if self.p_p == 0: self.p_p = 1
def p(self): self.p_p = 2
def o(self, data, puredata=0, force=0):
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
if self.p_p:
self.out(('\n'+bq)*self.p_p)
self.space = 0
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.lastWasNL = data and data[-1] == '\n'
self.outcount += 1
def handle_data(self, data):
if r'\/script>' in data: self.quiet -= 1
self.o(data, 1)
def unknown_decl(self, data): pass
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
def html2text_file(html, out=wrapwrite, baseurl=''):
h = _html2text(out, baseurl)
h.feed(html)
h.feed("")
return h.close()
def html2text(html, baseurl=''):
return optwrap(html2text_file(html, None, baseurl))
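# Minimal usage sketch (output shown is approximate; wrapping depends on BODY_WIDTH):
#   html2text(u"<h1>Title</h1><p>Some <b>bold</b> text.</p>")
#   # -> u"# Title\n\nSome **bold** text.\n\n"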
if __name__ == "__main__":
baseurl = ''
if sys.argv[1:]:
arg = sys.argv[1]
if arg.startswith('http://'):
baseurl = arg
j = urllib.urlopen(baseurl)
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
text = j.read()
encoding = enc(j.headers, text)[0]
if encoding == 'us-ascii': encoding = 'utf-8'
data = text.decode(encoding)
else:
encoding = 'utf8'
if len(sys.argv) > 2:
encoding = sys.argv[2]
data = open(arg, 'r').read().decode(encoding)
else:
data = sys.stdin.read().decode('utf8')
wrapwrite(html2text(data, baseurl))


@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h1>${title} by ${author}</h1>
${body}
</div></body></html>
'''
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''
XHTML_END = ''''''
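# Usage sketch (assumption: the templates are filled with string.Template, as
# the HTML writer in output.py does; the values below are made up):
#   import string
#   page = string.Template(XHTML_START).substitute(
#       {'title': 'A Story', 'author': 'Someone', 'body': '<p>...</p>'})
#   chap = string.Template(XHTML_CHAPTER_START).substitute({'chapter': 'Chapter 1'})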


@ -0,0 +1,448 @@
# -*- coding: utf-8 -*-
import re
def _unirepl(match):
"Return the unicode string for a decimal number"
if match.group(1)=='x':
radix=16
else:
radix=10
value = int(match.group(2), radix )
return unichr(value)
def _replaceNumberEntities(data):
p = re.compile(r'&#(x?)(\d+);')
return p.sub(_unirepl, data)
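# e.g. (illustrative): _replaceNumberEntities(u'caf&#233; &#x2014; ok')
#   -> u'caf\xe9 \u2014 ok'   (decimal and digit-only hex references both decoded)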
def _replaceNotEntities(data):
# not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
# (or equiv), SGMLParser, entityref
p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
return p.sub(r'&\1', data)
def stripHTML(soup):
return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()
def conditionalRemoveEntities(value):
if isinstance(value,str) or isinstance(value,unicode) :
return removeEntities(value.strip())
else:
return value
def removeAllEntities(text):
# Remove &lt; &gt; and &amp;
return removeEntities(text).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
def removeEntities(text):
# replace numeric versions of [&<>] with named versions,
# then replace named versions with actual characters,
if text is None:
return ""
if not (isinstance(text,str) or isinstance(text,unicode)):
return str(text)
try:
t = text.decode('utf-8')
except UnicodeEncodeError, e:
try:
t = text.encode ('ascii', 'xmlcharrefreplace')
except UnicodeEncodeError, e:
t = text
text = t
text = re.sub(r'&#0*38;','&amp;',text)
text = re.sub(r'&#0*60;','&lt;',text)
text = re.sub(r'&#0*62;','&gt;',text)
# replace remaining &#000; entities with unicode value, such as &#039; -> '
text = _replaceNumberEntities(text)
# replace several named entities with character, such as &mdash; -> -
# see constants.py for the list.
# reverse sort will put entities with ; before the same one without, when valid.
for e in reversed(sorted(entities.keys())):
v = entities[e]
try:
text = text.replace(e, v)
except UnicodeDecodeError, ex:
# for the pound symbol in constants.py
text = text.replace(e, v.decode('utf-8'))
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
# entities terribly well and inserts (;) after something that
# it thinks might be an entity. AT&T becomes AT&T; All of my
# attempts to fix this by changing the input to
# BeautifulStoneSoup break something else instead. But at
# this point, there should be *no* real entities left, so find
# these not-entities and removing them here should be safe.
text = _replaceNotEntities(text)
# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
return text.replace('&', '&amp;').replace('&amp;lt', '&lt;').replace('&amp;gt', '&gt;')
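# Intended round trip, shown for orientation (assumes the entity table below
# maps &mdash; to the em-dash character):
#   removeEntities('Tom &amp; Jerry &mdash; &#8220;cat &amp; mouse&#8221;')
#   -> u'Tom &amp; Jerry \u2014 \u201ccat &amp; mouse\u201d'
# i.e. decorative entities become literal characters while &amp;, &lt; and &gt;
# are re-escaped so the result is still valid XHTML.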
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
entities = { '&aacute;' : 'á',
'&Aacute;' : 'Á',
'&Aacute' : 'Á',
'&aacute' : 'á',
'&acirc;' : 'â',
'&Acirc;' : 'Â',
'&Acirc' : 'Â',
'&acirc' : 'â',
'&acute;' : '´',
'&acute' : '´',
'&AElig;' : 'Æ',
'&aelig;' : 'æ',
'&AElig' : 'Æ',
'&aelig' : 'æ',
'&agrave;' : 'à',
'&Agrave;' : 'À',
'&Agrave' : 'À',
'&agrave' : 'à',
'&alefsym;' : 'ℵ',
'&alpha;' : 'α',
'&Alpha;' : 'Α',
'&amp;' : '&',
'&AMP;' : '&',
'&AMP' : '&',
'&amp' : '&',
'&and;' : '∧',
'&ang;' : '∠',
'&aring;' : 'å',
'&Aring;' : 'Å',
'&Aring' : 'Å',
'&aring' : 'å',
'&asymp;' : '≈',
'&atilde;' : 'ã',
'&Atilde;' : 'Ã',
'&Atilde' : 'Ã',
'&atilde' : 'ã',
'&auml;' : 'ä',
'&Auml;' : 'Ä',
'&Auml' : 'Ä',
'&auml' : 'ä',
'&bdquo;' : '„',
'&beta;' : 'β',
'&Beta;' : 'Β',
'&brvbar;' : '¦',
'&brvbar' : '¦',
'&bull;' : '•',
'&cap;' : '∩',
'&ccedil;' : 'ç',
'&Ccedil;' : 'Ç',
'&Ccedil' : 'Ç',
'&ccedil' : 'ç',
'&cedil;' : '¸',
'&cedil' : '¸',
'&cent;' : '¢',
'&cent' : '¢',
'&chi;' : 'χ',
'&Chi;' : 'Χ',
'&circ;' : 'ˆ',
'&clubs;' : '♣',
'&cong;' : '≅',
'&copy;' : '©',
'&COPY;' : '©',
'&COPY' : '©',
'&copy' : '©',
'&crarr;' : '↵',
'&cup;' : '∪',
'&curren;' : '¤',
'&curren' : '¤',
'&dagger;' : '†',
'&Dagger;' : '‡',
'&darr;' : '↓',
'&dArr;' : '⇓',
'&deg;' : '°',
'&deg' : '°',
'&delta;' : 'δ',
'&Delta;' : 'Δ',
'&diams;' : '♦',
'&divide;' : '÷',
'&divide' : '÷',
'&eacute;' : 'é',
'&Eacute;' : 'É',
'&Eacute' : 'É',
'&eacute' : 'é',
'&ecirc;' : 'ê',
'&Ecirc;' : 'Ê',
'&Ecirc' : 'Ê',
'&ecirc' : 'ê',
'&egrave;' : 'è',
'&Egrave;' : 'È',
'&Egrave' : 'È',
'&egrave' : 'è',
'&empty;' : '∅',
'&emsp;' : '',
'&ensp;' : '',
'&epsilon;' : 'ε',
'&Epsilon;' : 'Ε',
'&equiv;' : '≡',
'&eta;' : 'η',
'&Eta;' : 'Η',
'&eth;' : 'ð',
'&ETH;' : 'Ð',
'&ETH' : 'Ð',
'&eth' : 'ð',
'&euml;' : 'ë',
'&Euml;' : 'Ë',
'&Euml' : 'Ë',
'&euml' : 'ë',
'&euro;' : '€',
'&exist;' : '∃',
'&fnof;' : 'ƒ',
'&forall;' : '∀',
'&frac12;' : '½',
'&frac12' : '½',
'&frac14;' : '¼',
'&frac14' : '¼',
'&frac34;' : '¾',
'&frac34' : '¾',
'&frasl;' : '⁄',
'&gamma;' : 'γ',
'&Gamma;' : 'Γ',
'&ge;' : '≥',
#'&gt;' : '>',
#'&GT;' : '>',
#'&GT' : '>',
#'&gt' : '>',
'&harr;' : '↔',
'&hArr;' : '⇔',
'&hearts;' : '♥',
'&hellip;' : '…',
'&iacute;' : 'í',
'&Iacute;' : 'Í',
'&Iacute' : 'Í',
'&iacute' : 'í',
'&icirc;' : 'î',
'&Icirc;' : 'Î',
'&Icirc' : 'Î',
'&icirc' : 'î',
'&iexcl;' : '¡',
'&iexcl' : '¡',
'&igrave;' : 'ì',
'&Igrave;' : 'Ì',
'&Igrave' : 'Ì',
'&igrave' : 'ì',
'&image;' : 'ℑ',
'&infin;' : '∞',
'&int;' : '∫',
'&iota;' : 'ι',
'&Iota;' : 'Ι',
'&iquest;' : '¿',
'&iquest' : '¿',
'&isin;' : '∈',
'&iuml;' : 'ï',
'&Iuml;' : 'Ï',
'&Iuml' : 'Ï',
'&iuml' : 'ï',
'&kappa;' : 'κ',
'&Kappa;' : 'Κ',
'&lambda;' : 'λ',
'&Lambda;' : 'Λ',
'&laquo;' : '«',
'&laquo' : '«',
'&larr;' : '←',
'&lArr;' : '⇐',
'&lceil;' : '⌈',
'&ldquo;' : '“',
'&le;' : '≤',
'&lfloor;' : '⌊',
'&lowast;' : '∗',
'&loz;' : '◊',
'&lrm;' : '',
'&lsaquo;' : '‹',
'&lsquo;' : '‘',
#'&lt;' : '<',
#'&LT;' : '<',
#'&LT' : '<',
#'&lt' : '<',
'&macr;' : '¯',
'&macr' : '¯',
'&mdash;' : '—',
'&micro;' : 'µ',
'&micro' : 'µ',
'&middot;' : '·',
'&middot' : '·',
'&minus;' : '−',
'&mu;' : 'μ',
'&Mu;' : 'Μ',
'&nabla;' : '∇',
'&nbsp;' : ' ',
'&nbsp' : ' ',
'&ndash;' : '–',
'&ne;' : '≠',
'&ni;' : '∋',
'&not;' : '¬',
'&not' : '¬',
'&notin;' : '∉',
'&nsub;' : '⊄',
'&ntilde;' : 'ñ',
'&Ntilde;' : 'Ñ',
'&Ntilde' : 'Ñ',
'&ntilde' : 'ñ',
'&nu;' : 'ν',
'&Nu;' : 'Ν',
'&oacute;' : 'ó',
'&Oacute;' : 'Ó',
'&Oacute' : 'Ó',
'&oacute' : 'ó',
'&ocirc;' : 'ô',
'&Ocirc;' : 'Ô',
'&Ocirc' : 'Ô',
'&ocirc' : 'ô',
'&OElig;' : 'Œ',
'&oelig;' : 'œ',
'&ograve;' : 'ò',
'&Ograve;' : 'Ò',
'&Ograve' : 'Ò',
'&ograve' : 'ò',
'&oline;' : '‾',
'&omega;' : 'ω',
'&Omega;' : 'Ω',
'&omicron;' : 'ο',
'&Omicron;' : 'Ο',
'&oplus;' : '⊕',
'&or;' : '∨',
'&ordf;' : 'ª',
'&ordf' : 'ª',
'&ordm;' : 'º',
'&ordm' : 'º',
'&oslash;' : 'ø',
'&Oslash;' : 'Ø',
'&Oslash' : 'Ø',
'&oslash' : 'ø',
'&otilde;' : 'õ',
'&Otilde;' : 'Õ',
'&Otilde' : 'Õ',
'&otilde' : 'õ',
'&otimes;' : '⊗',
'&ouml;' : 'ö',
'&Ouml;' : 'Ö',
'&Ouml' : 'Ö',
'&ouml' : 'ö',
'&para;' : '¶',
'&para' : '¶',
'&part;' : '∂',
'&permil;' : '‰',
'&perp;' : '⊥',
'&phi;' : 'φ',
'&Phi;' : 'Φ',
'&pi;' : 'π',
'&Pi;' : 'Π',
'&piv;' : 'ϖ',
'&plusmn;' : '±',
'&plusmn' : '±',
'&pound;' : '£',
'&pound' : '£',
'&prime;' : '′',
'&Prime;' : '″',
'&prod;' : '∏',
'&prop;' : '∝',
'&psi;' : 'ψ',
'&Psi;' : 'Ψ',
'&quot;' : '"',
'&QUOT;' : '"',
'&QUOT' : '"',
'&quot' : '"',
'&radic;' : '√',
'&raquo;' : '»',
'&raquo' : '»',
'&rarr;' : '→',
'&rArr;' : '⇒',
'&rceil;' : '⌉',
'&rdquo;' : '”',
'&real;' : 'ℜ',
'&reg;' : '®',
'&REG;' : '®',
'&REG' : '®',
'&reg' : '®',
'&rfloor;' : '⌋',
'&rho;' : 'ρ',
'&Rho;' : 'Ρ',
'&rlm;' : '',
'&rsaquo;' : '›',
'&rsquo;' : '’',
'&sbquo;' : '‚',
'&scaron;' : 'š',
'&Scaron;' : 'Š',
'&sdot;' : '⋅',
'&sect;' : '§',
'&sect' : '§',
'&shy;' : '­', # strange optional hyphenation control character, not just a dash
'&shy' : '­',
'&sigma;' : 'σ',
'&Sigma;' : 'Σ',
'&sigmaf;' : 'ς',
'&sim;' : '∼',
'&spades;' : '♠',
'&sub;' : '⊂',
'&sube;' : '⊆',
'&sum;' : '∑',
'&sup1;' : '¹',
'&sup1' : '¹',
'&sup2;' : '²',
'&sup2' : '²',
'&sup3;' : '³',
'&sup3' : '³',
'&sup;' : '⊃',
'&supe;' : '⊇',
'&szlig;' : 'ß',
'&szlig' : 'ß',
'&tau;' : 'τ',
'&Tau;' : 'Τ',
'&there4;' : '∴',
'&theta;' : 'θ',
'&Theta;' : 'Θ',
'&thetasym;' : 'ϑ',
'&thinsp;' : '',
'&thorn;' : 'þ',
'&THORN;' : 'Þ',
'&THORN' : 'Þ',
'&thorn' : 'þ',
'&tilde;' : '˜',
'&times;' : '×',
'&times' : '×',
'&trade;' : '™',
'&uacute;' : 'ú',
'&Uacute;' : 'Ú',
'&Uacute' : 'Ú',
'&uacute' : 'ú',
'&uarr;' : '↑',
'&uArr;' : '⇑',
'&ucirc;' : 'û',
'&Ucirc;' : 'Û',
'&Ucirc' : 'Û',
'&ucirc' : 'û',
'&ugrave;' : 'ù',
'&Ugrave;' : 'Ù',
'&Ugrave' : 'Ù',
'&ugrave' : 'ù',
'&uml;' : '¨',
'&uml' : '¨',
'&upsih;' : 'ϒ',
'&upsilon;' : 'υ',
'&Upsilon;' : 'Υ',
'&uuml;' : 'ü',
'&Uuml;' : 'Ü',
'&Uuml' : 'Ü',
'&uuml' : 'ü',
'&weierp;' : '℘',
'&xi;' : 'ξ',
'&Xi;' : 'Ξ',
'&yacute;' : 'ý',
'&Yacute;' : 'Ý',
'&Yacute' : 'Ý',
'&yacute' : 'ý',
'&yen;' : '¥',
'&yen' : '¥',
'&yuml;' : 'ÿ',
'&Yuml;' : 'Ÿ',
'&yuml' : 'ÿ',
'&zeta;' : 'ζ',
'&Zeta;' : 'Ζ',
'&zwj;' : '', # strange spacing control character, not just a space
'&zwnj;' : '', # strange spacing control character, not just a space
}


@ -0,0 +1,366 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class MediaMiner(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.storyName = ''
self.authorName = ''
self.storyDescription = ''
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
self.authorURL = self.path
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = ''
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.outputName = ''
self.outputStorySep = '-mm_'
logging.debug('self.url=%s' % self.url)
if self.url.find('view_st.php') != -1:
ss = self.url.split('view_st.php')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
self.storyId = ss[1].replace('/','').strip()
elif self.url.find('view_ch.php?') != -1:
ss = self.url.split('=')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
self.storyId = ss[-1].replace('/','').strip()
self.path = '/fanfic/view_st.php/' + self.storyId
self.url = 'http://' + self.host + self.path
logging.debug('self.url=%s' % self.url)
elif self.url.find('view_ch.php/') != -1:
ss = self.url.split('/')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 2:
self.storyId = ss[-2].strip()
self.path = '/fanfic/view_st.php/' + self.storyId
self.url = 'http://' + self.host + self.path
logging.debug('self.url=%s' % self.url)
else:
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
logging.debug('self.storyId=%s' % self.storyId)
logging.debug('self.path=%s' % self.path)
if not self.appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
self.opener = None
logging.debug("Created MediaMiner: url=%s" % (self.url))
def _getLoginScript(self):
return self.path
def extractIndividualUrls(self):
data = None
try:
data = self.fetchUrl(self.url)
except Exception, e:
data = None
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
#data.replace('<br />',' ').replace('<br>',' ').replace('</br>',' ')
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
logging.error("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
#logging.debug('soap=%s' % soup)
urls = []
td_ffh = soup.find('td', {'class' : 'ffh'})
#logging.debug('td_ffh=%s' % td_ffh)
if td_ffh is not None:
#logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
self.storyName = unicode(td_ffh.find(text=True)).strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
fft = td_ffh.find('font', {'class' : 'smtxt'})
#logging.debug('fft=%s' % fft)
if fft is not None:
ffts = fft.string.split(' ')
if ffts is not None:
if len(ffts) > 1:
self.storyRating = ffts[1]
logging.debug('self.storyRating=%s' % self.storyRating)
self.genre = ''
td_smtxt = soup.findAll('td')
if td_smtxt is None:
#logging.debug('td_smtxt is NONE!')
pass
else:
ll = len(td_smtxt)
#logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
for ii in range(ll):
td = td_smtxt[ii]
if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
#logging.debug('td has class attribute but is not smtxt')
continue
ss = unicode(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
#logging.debug('ss=%s' % ss)
if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
#logging.debug('ss=%s' % ss)
ssbs = td.findAll('b')
#logging.debug('ssbs=%s' % ssbs)
bb = 0
while bb < len(ssbs):
nvs = bs.NavigableString('')
sst=''
ssb = ssbs[bb]
ssbt = unicode(ssb.text).strip()
#logging.debug('ssb=%s' % ssb)
#logging.debug('ssbt=%s' % ssbt)
ssbn = ssb.nextSibling
while ssbn is not None:
#logging.debug('ssbn=%s' % ssbn)
#logging.debug('ssbn.class=%s' % ssbn.__class__)
if nvs.__class__ == ssbn.__class__:
st = unicode(ssbn)
if st.strip() != '|':
sst = sst + st
else:
#logging.debug('ssbn.name=%s' % ssbn.name)
if ssbn.name == 'b':
break
ssbnts = ssbn.findAll(text=True)
for ssbnt in ssbnts:
sst = sst + ssbnt
ssbn = ssbn.nextSibling
sst = sst.replace('&nbsp;',' ').strip()
#logging.debug('sst=%s' % sst)
if bb == 0:
ssbt = ssbt.replace(':','')
self.addSubject(ssbt)
self.addSubject(sst)
logging.debug('self.subjects=%s' % self.subjects)
else:
if ssbt == 'Genre(s):':
self.genre = sst
logging.debug('self.genre=%s' % self.genre)
sts = sst.split(' / ')
for st in sts:
self.addSubject(st.strip())
logging.debug('self.subjects=%s' % self.subjects)
elif ssbt == 'Type:':
self.category = sst
logging.debug('self.category=%s' % self.category)
self.addSubject(sst)
logging.debug('self.subjects=%s' % self.subjects)
elif ssbt == 'Author:':
pass
elif ssbt == 'Visits:':
pass
elif ssbt == 'Size:':
pass
elif ssbt == 'Pages:':
pass
elif ssbt == 'Status:':
if sst == "Completed":
self.storyStatus = 'Completed'
else:
self.storyStatus = 'In-Progress'
elif ssbt == 'Words:':
self.numWords = sst.replace('|','').strip()
logging.debug('self.numWords=%s' % self.numWords)
pass
elif ssbt == 'Summary:':
self.storyDescription = sst.strip()
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
#logging.debug('sst=%s' % sst)
ssts = sst.split(' ')
if ssts is not None and len(ssts) > 3:
sst = ssts[0] + ' ' + ssts[1] + ' ' + ssts[2]
#logging.debug('sst=%s' % sst)
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
pass
bb = bb+1
smtxt_as = td_smtxt[ii].findAll('a')
#logging.debug('smtxt_as=%s' % smtxt_as)
for smtxt_a in smtxt_as:
if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/') != -1: # find() returns -1 when '/u/' is absent
sta = smtxt_a['href']
#logging.debug('sta=%s' % sta)
stas = sta.split('/u/')
#logging.debug('stas=%s' % stas)
if stas is not None and len(stas) > 1:
self.authorId = stas[1]
self.authorURL = 'http://' + self.host + sta
self.authorName = smtxt_a.string
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
urlstory=''
numchapters = 0
td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
if td_tbbrdr is not None:
#logging.debug('td_tbbrdr=%s' % td_tbbrdr )
sl = td_tbbrdr.find('select', {'name':'cid'})
if sl is not None:
#logging.debug('sl=%s' % sl )
opts = sl.findAll('option')
for o in opts:
#logging.debug('o=%s' % o)
if 'value' in o._getAttrMap():
url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
logging.debug('URL=%s, Title=%s' % (url, o.string))
if numchapters == 0:
ss = o.string.split('[')
if ss is not None and len(ss) > 1:
ssd = ss[-1].replace(']','')
#logging.debug('ssd=%s' % ssd)
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
urls.append((url, o.string))
numchapters = numchapters + 1
if numchapters == 0:
numchapters = 1
url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
self.storyPublished = self.storyUpdated
logging.debug('self.storyPublished=%s' % self.storyPublished)
ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
logging.debug('URL=%s, Title=%s' % (url, ssd))
urls.append((url, ssd))
self.numChapters = unicode(numchapters)
logging.debug('self.numChapters=%s' % self.numChapters)
#logging.debug('urls=%s' % urls)
return urls
def getText(self, url):
# time.sleep( 2.0 )
logging.debug('url=%s' % url)
data = ''
try:
data = self.fetchUrl(url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
# convert div's to p's. mediaminer uses div with a
# margin for paragraphs.
divlist = soup.findAll('div', {'class' : None})
for tag in divlist:
tag.name='p';
nvs = bs.NavigableString('')
sst=''
allAs = soup.findAll ('a', { 'name' : 'fic_c' })
#logging.debug('allAs=%s' % allAs)
for a in allAs:
#logging.debug('a=%s' % a)
foundfirst = False
done = False
nxta = a.nextSibling
while nxta is not None and not done:
#logging.debug('nxta=%s' % nxta)
#logging.debug('nxta.class=%s' % nxta.__class__)
st = unicode(nxta)
if nvs.__class__ != nxta.__class__:
#logging.debug('nxta.name=%s' % nxta.name)
if nxta.name == 'table':
st = ''
if foundfirst:
done = True
if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
st = ''
done = True
if nxta.name == 'br':
if not foundfirst:
st = ''
else:
foundfirst = True
else:
foundfirst = True
sst = sst + st
nxta = nxta.nextSibling
if sst is None:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return sst
class FPC_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testFictionPress(self):
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
f = FPCom(url)
urls = f.extractIndividualUrls()
self.assertEquals('Behind This Facade', f.getStoryName())
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
text = f.getText(url)
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
if __name__ == '__main__':
unittest.main()

384
fanficdownloader/mobi.py Normal file

@ -0,0 +1,384 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
import StringIO
import struct
import time
import random
import logging
from html import HtmlProcessor
# http://wiki.mobileread.com/wiki/MOBI
# http://membres.lycos.fr/microfirst/palm/pdb.html
encoding = {
'UTF-8' : 65001,
'latin-1' : 1252,
}
languages = {"en-us" : 0x0409,
"sv" : 0x041d,
"fi" : 0x000b,
"en" : 0x0009,
"en-gb" : 0x0809}
def ToHex(s):
v = ['%.2x' % ord(c) for c in s]
return ' '.join(v)
class _SubEntry:
def __init__(self, pos, html_data):
self.pos = pos
self.html = HtmlProcessor(html_data)
self.title = self.html.title
self._name = 'mobi_article_%d' % pos
if not self.title:
self.title = 'Article %d' % self.pos
def TocLink(self):
return '<a href="#%s_MOBI_START">%.80s</a>' % (self._name, self.title)
def Anchor(self):
return '<a name="%s_MOBI_START">' % self._name
def Body(self):
return self.html.RenameAnchors(self._name + '_')
class Converter:
def __init__(self, refresh_url='', title='Unknown', author='Unknown', publisher='Unknown'):
self._header = Header()
self._header.SetTitle(title)
self._header.SetAuthor(author)
self._header.SetPublisher(publisher)
self._refresh_url = refresh_url
def ConvertString(self, s):
out = StringIO.StringIO()
self._ConvertStringToFile(s, out)
return out.getvalue()
def ConvertStrings(self, html_strs):
out = StringIO.StringIO()
self._ConvertStringsToFile(html_strs, out)
return out.getvalue()
def ConvertFile(self, html_file, out_file):
self._ConvertStringToFile(open(html_file,'rb').read(),
open(out_file, 'wb'))
def ConvertFiles(self, html_files, out_file):
html_strs = [open(f,'rb').read() for f in html_files]
self._ConvertStringsToFile(html_strs, open(out_file, 'wb'))
def MakeOneHTML(self, html_strs):
"""This takes a list of HTML strings and returns a big HTML file with
all contents consolidated. It constructs a table of contents and adds
anchors within the text
"""
title_html = []
toc_html = []
body_html = []
PAGE_BREAK = '<mbp:pagebreak>'
# pull out the title page, assumed to be the first entry in html_strs.
htmltitle = html_strs[0]
entrytitle = _SubEntry(1, htmltitle)
title_html.append(entrytitle.Body())
title_html.append(PAGE_BREAK)
toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')
for pos, html in enumerate(html_strs[1:]):
entry = _SubEntry(pos+1, html)
toc_html.append('%s<br />' % entry.TocLink())
# give some space between bodies of work.
body_html.append(PAGE_BREAK)
body_html.append(entry.Anchor())
body_html.append(entry.Body())
# TODO: this title can get way too long with RSS feeds. Not sure how to fix
# cheat slightly and use the <a href> code to set filepos in references.
header = '''<html>
<head>
<title>Bibliorize %s GMT</title>
<guide>
<reference href="#TOCTOP" type="toc" title="Table of Contents"/>
</guide>
</head>
<body>
''' % time.ctime(time.time())
footer = '</body></html>'
all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
#print "%s" % all_html.encode('utf8')
return all_html
def _ConvertStringsToFile(self, html_strs, out_file):
try:
tmp = self.MakeOneHTML(html_strs)
self._ConvertStringToFile(tmp, out_file)
except Exception, e:
logging.error('Error %s', e)
logging.debug('Details: %s' % html_strs)
def _ConvertStringToFile(self, html_data, out):
html = HtmlProcessor(html_data)
data = html.CleanHtml()
# collect offsets of '<mbp:pagebreak>' tags, use to make index list.
# indexlist = [] # list of (offset,length) tuples.
# not in current use.
# j=0
# lastj=0
# while True:
# j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
# if j < 0:
# break
# indexlist.append((lastj,j-lastj))
# print "index offset: %d length: %d" % (lastj,j-lastj)
# lastj=j
records = []
# title = html.title
# if title:
# self._header.SetTitle(title)
record_id = 1
for start_pos in range(0, len(data), Record.MAX_SIZE):
end = min(len(data), start_pos + Record.MAX_SIZE)
record_data = data[start_pos:end]
records.append(self._header.AddRecord(record_data, record_id))
#print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
record_id += 1
self._header.SetImageRecordIndex(record_id)
records[0:0] = [self._header.MobiHeader()]
header, rec_offset = self._header.PDBHeader(len(records))
out.write(header)
for record in records:
record.WriteHeader(out, rec_offset)
#print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
rec_offset += (len(record.data)+1) # plus one for trailing null
# Write two nulls for some reason
out.write('\0\0')
for record in records:
record.WriteData(out)
out.write('\0')
# needs a trailing null, I believe it indicates zero length 'overlap'.
# otherwise, the readers eat the last char of each html record.
# Calibre writes another 6-7 bytes of stuff after that, but we seem
# to be getting along without it.
class Record:
MAX_SIZE = 4096
INDEX_LEN = 8
_unique_id_seed = 28 # should be arbitrary, but taken from MobiHeader
# TODO(chatham): Record compression doesn't look that hard.
def __init__(self, data, record_id):
assert len(data) <= self.MAX_SIZE
self.data = data
if record_id != 0:
self._id = record_id
else:
Record._unique_id_seed += 1
self._id = 0
def __repr__(self):
return 'Record: id=%d len=%d' % (self._id, len(self.data))
def _SetUniqueId(self):
Record._unique_id_seed += 1
# TODO(chatham): Wraparound crap
self._id = Record._unique_id_seed
def WriteData(self, out):
out.write(self.data)
def WriteHeader(self, out, rec_offset):
attributes = 64 # dirty?
header = struct.pack('>IbbH',
rec_offset,
attributes,
0, self._id)
assert len(header) == Record.INDEX_LEN
out.write(header)
EXTH_HEADER_FIELDS = {
'author' : 100,
'publisher' : 101,
}
class Header:
EPOCH_1904 = 2082844800
def __init__(self):
self._length = 0
self._record_count = 0
self._title = '2008_2_34'
self._author = 'Unknown author'
self._publisher = 'Unknown publisher'
self._first_image_index = 0
def SetAuthor(self, author):
self._author = author.encode('ascii','ignore')
def SetTitle(self, title):
# TODO(chatham): Reevaluate whether this needs to be ASCII.
# maybe just do sys.setdefaultencoding('utf-8')? Problems
# appending self._title with other things.
self._title = title.encode('ascii','ignore')
def SetPublisher(self, publisher):
self._publisher = publisher.encode('ascii','ignore')
def AddRecord(self, data, record_id):
self.max_record_size = max(Record.MAX_SIZE, len(data))
self._record_count += 1
self._length += len(data)
return Record(data, record_id)
def _ReplaceWord(self, data, pos, word):
return data[:pos] + struct.pack('>I', word) + data[pos+4:]
def PalmDocHeader(self):
compression = 1 # no compression
unused = 0
encryption_type = 0 # no encryption
records = self._record_count + 1 # the header record itself
palmdoc_header = struct.pack('>HHIHHHH',
compression,
unused,
self._length,
records,
Record.MAX_SIZE,
encryption_type,
unused)
assert len(palmdoc_header) == 16
return palmdoc_header
def PDBHeader(self, num_records):
HEADER_LEN = 32+2+2+9*4
RECORD_INDEX_HEADER_LEN = 6
RESOURCE_INDEX_LEN = 10
index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN
rec_offset = HEADER_LEN + index_len + 2
short_title = self._title[0:31]
attributes = 0
version = 0
ctime = self.EPOCH_1904 + int(time.time())
mtime = self.EPOCH_1904 + int(time.time())
backup_time = self.EPOCH_1904 + int(time.time())
modnum = 0
appinfo_offset = 0
sort_offset = 0
type = 'BOOK'
creator = 'MOBI'
id_seed = 36
header = struct.pack('>32sHHII',
short_title, attributes, version,
ctime, mtime)
header += struct.pack('>IIII', backup_time, modnum,
appinfo_offset, sort_offset)
header += struct.pack('>4s4sI',
type, creator, id_seed)
next_record = 0 # not used?
header += struct.pack('>IH', next_record, num_records)
return header, rec_offset
def _GetExthHeader(self):
# They set author, publisher, coveroffset, thumboffset
data = {'author' : self._author,
'publisher' : self._publisher,
}
# Turn string type names into EXTH typeids.
r = []
for key, value in data.items():
typeid = EXTH_HEADER_FIELDS[key]
length_encoding_len = 8
r.append(struct.pack('>LL', typeid, len(value) + length_encoding_len,) + value)
content = ''.join(r)
# Pad to word boundary
while len(content) % 4:
content += '\0'
TODO_mysterious = 12
exth = 'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content
return exth
def SetImageRecordIndex(self, idx):
self._first_image_index = idx
def MobiHeader(self):
exth_header = self._GetExthHeader();
palmdoc_header = self.PalmDocHeader()
fs = 0xffffffff
# Record 0
header_len = 0xE4 # TODO
mobi_type = 2 # BOOK
text_encoding = encoding['UTF-8']
unique_id = random.randint(1, 1<<32)
creator_version = 4
reserved = '%c' % 0xff * 40
nonbook_index = fs
full_name_offset = header_len + len(palmdoc_header) + len(exth_header) # put full name after header
language = languages['en-us']
unused = 0
mobi_header = struct.pack('>4sIIIII40sIIIIII',
'MOBI',
header_len,
mobi_type,
text_encoding,
unique_id,
creator_version,
reserved,
nonbook_index,
full_name_offset,
len(self._title),
language,
fs, fs)
assert len(mobi_header) == 104 - 16
unknown_fields = chr(0) * 32
drm_offset = 0
drm_count = 0
drm_size = 0
drm_flags = 0
exth_flags = 0x50
header_end = chr(0) * 64
mobi_header += struct.pack('>IIIIIII',
creator_version,
self._first_image_index,
fs,
unused,
fs,
unused,
exth_flags)
mobi_header += '\0' * 112 # TODO: Why this much padding?
# Set some magic offsets to be 0xFFFFFFF.
for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc):
mobi_header = self._ReplaceWord(mobi_header, pos, fs)
# 16 bytes?
padding = '\0' * 48 * 4 # why?
total_header = palmdoc_header + mobi_header + exth_header + self._title + padding
return self.AddRecord(total_header, 0)
if __name__ == '__main__':
import sys
m = Converter(title='Testing Mobi', author='Mobi Author', publisher='mobi converter')
m.ConvertFiles(sys.argv[1:], 'test.mobi')
#m.ConvertFile(sys.argv[1], 'test.mobi')


@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
import logging
import sys, os
import adapters
import writers
import ConfigParser
from writers.writer_html import HTMLWriter
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
config = ConfigParser.ConfigParser()
logging.debug('reading defaults.ini config file, if present')
config.read('defaults.ini')
logging.debug('reading personal.ini config file, if present')
config.read('personal.ini')
def writeStory(adapter,writeformat):
writer = writers.getWriter(writeformat,config,adapter.getStory())
writer.writeStory()
del writer
try:
adapter = adapters.getAdapter(config,sys.argv[1])
#try:
print adapter.getStory()
#except adapters.FailedToLogin, ftl:
# print "Login Failed, trying with user/pass"
# adapter.username="BobsClue"
# adapter.password="XXXXXXXXX"
# print adapter.getStory()
writeStory(adapter,"epub")
writeStory(adapter,"html")
writeStory(adapter,"txt")
del adapter
except adapters.InvalidStoryURL, isu:
print isu
except adapters.StoryDoesNotExist, dne:
print dne
except adapters.UnknownSite, us:
print us
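# Example invocation sketch (file name and URL are assumptions; the script takes
# the story URL as sys.argv[1] and writes epub, html and txt output):
#   python downloader.py http://www.fictionpress.com/s/2725180/1/Behind_This_Facade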

643
fanficdownloader/output.py Normal file

@ -0,0 +1,643 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import unicodedata
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import mobi
import zipdir
import html_constants
from constants import *
import html2text
import datetime
class FanficWriter:
def __init__(self):
pass
def writeChapter(self, index, title, text):
pass
def finalise(self):
pass
@staticmethod
def getFormatName():
return 'base'
@staticmethod
def getFormatExt():
return '.bse'
class TextWriter(FanficWriter):
htmlWriter = None
@staticmethod
def getFormatName():
return 'text'
@staticmethod
def getFormatExt():
return '.txt'
def __init__(self, base, adapter, inmemory=False, compress=False):
self.inmemory = inmemory
self.htmlWriter = HTMLWriter(base, adapter, True, False)
def writeChapter(self, index, title, text):
self.htmlWriter.writeChapter(index, title, text)
def finalise(self):
self.htmlWriter.finalise()
self.name=self.htmlWriter.name
self.fileName = self.htmlWriter.fileName.replace(".html",".txt")
if self.inmemory:
self.output = StringIO.StringIO()
else:
self.output = open(self.fileName, 'w')
self.output.write(html2text.html2text(self.htmlWriter.output.getvalue().decode('utf-8')).encode('utf-8'))
if not self.inmemory:
self.output.close()
class MobiWriter(FanficWriter):
chapters = []
files = {}
@staticmethod
def getFormatName():
return 'mobi'
@staticmethod
def getFormatExt():
return '.mobi'
def __init__(self, base, adapter, inmemory=False, compress=False):
self.basePath = base
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
self.authorName = removeEntities(adapter.getAuthorName())
self.publisher = adapter.getPublisher()
self.adapter = adapter
self.mobi = mobi
self.inmemory = inmemory
self.files = {}
self.chapters = []
if not self.inmemory and os.path.exists(self.fileName):
os.remove(self.fileName)
if self.inmemory:
self.output = StringIO.StringIO()
else:
self.output = open(self.fileName, 'wb')
self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
def _printableVersion(self, text):
try:
d = text.decode('utf-8')
return d
except:
return text
def _writeFile(self, fileName, data):
#logging.debug('_writeFile(`%s`, data)' % fileName)
if fileName in self.files:
try:
d = data.decode('utf-8')
except UnicodeEncodeError, e:
d = data
self.files[fileName].write(d)
else:
self.files[fileName] = StringIO.StringIO()
self._writeFile(fileName, data)
def _getFilesStrings(self):
strings = []
if "title_page.xhtml" in self.files:
strings.append(self.files["title_page.xhtml"].getvalue())
del(self.files["title_page.xhtml"])
keys = self.files.keys()
keys.sort()
# Assumed all other files are chapter0000.xhtml.
for fn in keys:
strings.append(self.files[fn].getvalue())
return strings
def writeChapter(self, index, title, text):
title = removeEntities(title)
logging.debug("Writing chapter: %s" % title)
#title = self._printableVersion(title) #title.decode('utf-8')
text = removeEntities(text)
#text = self._printableVersion(text) #text.decode('utf-8')
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
# hr & br need to be self-closing if they're going to work.
# Some stories do use multiple br tags as their section breaks...
self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))
allTags = self.soup.findAll(recursive=True)
for t in allTags:
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr]
# these are not acceptable in strict XHTML, but we already have
# CSS classes of the same names defined in constants.py
if t.name in ('u'):
t['class']=t.name
t.name='span'
if t.name in ('center'):
t['class']=t.name
t.name='div'
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
text = self.soup.__str__('utf8')
# ffnet(& maybe others) gives the whole chapter text
# as one line. This causes problems for nook(at
# least) when the chapter size starts getting big
# (200k+) Using Soup's prettify() messes up italics
# and such. Done after soup extract so <p> and <br>
# tags are normalized. Doing it here seems less evil
# than hacking BeautifulSoup, but it's debatable.
text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')
filename="chapter%04d.xhtml" % index
self._writeFile(filename, XHTML_START % (title, title))
self._writeFile(filename, text)
self._writeFile(filename, XHTML_END)
#self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
#self.body = self.body + '\n' + text
def finalise(self):
logging.debug("Finalising...")
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
updateyy = self.adapter.getStoryUpdated().strftime("%Y")
updatemm = self.adapter.getStoryUpdated().strftime("%m")
updatedd = self.adapter.getStoryUpdated().strftime("%d")
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
description = self.adapter.getStoryDescription()
if hasattr(description, "text"):
description = description.text
prevalue=description
try:
description = unicode(description)
except:
description=prevalue
if description is not None and len(description) > 0:
description = description.replace ('\\\'', '\'').replace('\\\"', '\"')
description = removeEntities(description)
else:
description = ' '
### writing content -- title page
titleFilePath = "title_page.xhtml"
self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
tmpstr = unicode(self.adapter.getNumChapters()) + " / " + commaGroups(unicode(self.adapter.getNumWords()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))
self._writeFile(titleFilePath, TITLE_FOOTER % description )
c = mobi.Converter(title=self.storyTitle,
author=self.authorName,
publisher=self.publisher)
mobidata = c.ConvertStrings(self._getFilesStrings())
self.output.write(mobidata)
if not self.inmemory:
self.output.close()
# zipdir.toZip(filename, self.directory)
class HTMLWriter(FanficWriter):
body = ''
@staticmethod
def getFormatName():
return 'html'
@staticmethod
def getFormatExt():
return '.html'
def __init__(self, base, adapter, inmemory=False, compress=False, mobi = False):
self.basePath = base
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
self.authorName = removeEntities(adapter.getAuthorName())
self.adapter = adapter
self.mobi = mobi
self.inmemory = inmemory
if not self.inmemory and os.path.exists(self.fileName):
os.remove(self.fileName)
if self.inmemory:
self.output = StringIO.StringIO()
else:
self.output = open(self.fileName, 'w')
self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
def _printableVersion(self, text):
try:
d = text.decode('utf-8')
return d
except:
return text
def writeChapter(self, index, title, text):
title = self._printableVersion(title) #title.decode('utf-8')
text = self._printableVersion(text) #text.decode('utf-8')
self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
self.body = self.body + '\n' + text
def finalise(self):
html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
soup = bs.BeautifulSoup(html)
result = soup.__str__('utf8')
# f = open(self.fileName, 'w')
# f.write(result)
# f.close()
self.output.write(result)
if not self.inmemory:
self.output.close()
class EPubFanficWriter(FanficWriter):
chapters = []
files = {}
@staticmethod
def getFormatName():
return 'epub'
@staticmethod
def getFormatExt():
return '.epub'
def __init__(self, base, adapter, inmemory=False, compress=True):
self.basePath = base
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.directory = self.basePath + '/' + self.name
self.authorName = removeEntities(adapter.getAuthorName())
self.inmemory = inmemory
self.adapter = adapter
self.files = {}
self.chapters = []
if not self.inmemory:
self.inmemory = True
self.writeToFile = True
else:
self.writeToFile = False
if not self.inmemory:
if os.path.exists(self.directory):
shutil.rmtree(self.directory)
os.mkdir(self.directory)
os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS')
self._writeFile('mimetype', MIMETYPE)
self._writeFile('META-INF/container.xml', CONTAINER)
self._writeFile('OEBPS/stylesheet.css', CSS)
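# Resulting skeleton after __init__ (standard EPUB layout; the actual
# MIMETYPE/CONTAINER/CSS strings live in constants.py, not shown here):
#   mimetype
#   META-INF/container.xml
#   OEBPS/stylesheet.css
# finalise() later adds OEBPS/toc.ncx, OEBPS/title_page.xhtml and the
# chapterNNNN.xhtml files written by writeChapter().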
def _writeFile(self, fileName, data):
#logging.debug('_writeFile(`%s`, data)' % fileName)
if fileName in self.files:
try:
d = data.decode('utf-8')
except UnicodeEncodeError, e:
d = data
self.files[fileName].write(d)
else:
if self.inmemory:
self.files[fileName] = StringIO.StringIO()
else:
self.files[fileName] = codecs.open(self.directory + '/' + fileName, encoding='utf-8', mode='w') # Python 2's builtin open() has no encoding argument
self._writeFile(fileName, data)
def _closeFiles(self):
if not self.inmemory:
for f in self.files:
self.files[f].close()
def writeChapter(self, index, title, text):
title = removeEntities(title)
logging.debug("Writing chapter: %s" % title)
fileName="chapter%04d.xhtml" % index
filePath = self.directory + "/OEBPS/" + fileName
fn = 'OEBPS/' + fileName
# f = open(filePath, 'w')
text = removeEntities(text)
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
# hr & br needs to be if they're going to work.
# Some stories do use multiple br tags as their section breaks...
self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))
allTags = self.soup.findAll(recursive=True)
for t in allTags:
for attr in t._getAttrMap().keys():
if attr not in acceptable_attributes:
del t[attr]
# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined in constants.py
if t.name in ('u',):
t['class']=t.name
t.name='span'
if t.name in ('center',):
t['class']=t.name
t.name='div'
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
text = self.soup.__str__('utf8')
# ffnet(& maybe others) gives the whole chapter text
# as one line. This causes problems for nook(at
# least) when the chapter size starts getting big
# (200k+) Using Soup's prettify() messes up italics
# and such. Done after soup extract so <p> and <br>
# tags are normalized. Doing it here seems less evil
# than hacking BeautifulSoup, but it's debatable.
text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')
self._writeFile(fn, XHTML_START % (title, title))
self._writeFile(fn, text)
self._writeFile(fn, XHTML_END)
# print >> f, XHTML_START % (title, title)
# f.write(text)
# print >> f, XHTML_END
self.chapters.append((title, fileName))
def finalise(self):
logging.debug("Finalising...")
### writing table of contents -- ncx file
tocFilePath = "OEBPS/toc.ncx"
# toc = open(tocFilePath, 'w')
# print >> toc, TOC_START % self.storyTitle
self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
updateyy = self.adapter.getStoryUpdated().strftime("%Y")
updatemm = self.adapter.getStoryUpdated().strftime("%m")
updatedd = self.adapter.getStoryUpdated().strftime("%d")
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
description = self.adapter.getStoryDescription()
if hasattr(description, "text"):
description = description.text
prevalue=description
try:
description = unicode(description)
except:
description=prevalue
if description is not None and len(description) > 0:
description = description.replace ('\\\'', '\'').replace('\\\"', '\"')
description = removeEntities(description)
else:
description = ' '
### writing content -- title page
titleFilePath = "OEBPS/title_page.xhtml"
self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
tmpstr = unicode(self.adapter.getNumChapters()) + " / " + commaGroups(unicode(self.adapter.getNumWords()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))
self._writeFile(titleFilePath, TITLE_FOOTER % description )
### writing content -- opf file
opfFilePath = "OEBPS/content.opf"
# opf = open(opfFilePath, 'w')
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))
if self.adapter.storyStatus != 'Unknown':
self.adapter.addSubject(self.adapter.storyStatus)
i = 0
subjs = []
subjs = self.adapter.getSubjects()
for subj in subjs:
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
i = i + 1
if (i <= 0):
self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
subj = "Last Update Year/Month: " + updateyy + "/" + updatemm
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
subj = "Last Update: " + updateyy + "/" + updatemm + "/" + updatedd
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
ids = []
i = 0
t = "Title Page"
f = "title_page.xhtml"
chapterId = "title_page"
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
ids.append(chapterId)
i = i + 1
for t,f in self.chapters:
chapterId = "chapter%04d" % i
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
ids.append(chapterId)
i = i + 1
# logging.debug('Toc and refs printed, proceeding to ref-ids....')
self._writeFile(tocFilePath, TOC_END)
self._writeFile(opfFilePath, CONTENT_END_MANIFEST)
for chapterId in ids:
self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)
self._writeFile(opfFilePath, CONTENT_END)
self._closeFiles()
filename = self.directory + self.getFormatExt()
zipdata = zipdir.inMemoryZip(self.files)
if self.writeToFile:
f = open(filename, 'wb')
f.write(zipdata.getvalue())
f.close()
else:
self.output = zipdata
# zipdir.toZip(filename, self.directory)
def unirepl(match):
"Return the unicode string for a decimal number"
if match.group(1)=='x':
radix=16
else:
radix=10
value = int(match.group(2), radix )
return unichr(value)
def replaceNumberEntities(data):
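# Illustration: replaceNumberEntities('&#039;') returns "'".  Note that the
# \d+ in the pattern means hex references (e.g. '&#xA9;') are only converted
# when their digits all happen to be 0-9; otherwise they pass through.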
p = re.compile(r'&#(x?)(\d+);')
return p.sub(unirepl, data)
def replaceNotEntities(data):
# not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
# (or equiv), SGMLParser, entityref
p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
return p.sub(r'&\1', data)
def removeEntities(text):
# replace numeric versions of [&<>] with named versions.
if text is None:
return text
try:
t = text.decode('utf-8')
except UnicodeEncodeError, e:
try:
t = text.encode ('ascii', 'xmlcharrefreplace')
except UnicodeEncodeError, e:
t = text
text = t
text = re.sub(r'&#0*38;','&amp;',text)
text = re.sub(r'&#0*60;','&lt;',text)
text = re.sub(r'&#0*62;','&gt;',text)
# replace remaining &#000; entities with unicode value, such as &#039; -> '
text = replaceNumberEntities(text)
# replace several named entities with character, such as &mdash; -> -
# see constants.py for the list.
# reverse sort will put entities with ; before the same one without, when valid.
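# e.g. '&amp;' is tried before '&amp', so the shorter form can't clobber
# the leading part of the longer entity.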
for e in reversed(sorted(entities.keys())):
v = entities[e]
try:
text = text.replace(e, v)
except UnicodeDecodeError, ex:
# for the pound symbol in constants.py
text = text.replace(e, v.decode('utf-8'))
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
# entities terribly well and inserts (;) after something that
# it thinks might be an entity. AT&T becomes AT&T; All of my
# attempts to fix this by changing the input to
# BeautifulStoneSoup break something else instead. But at
# this point, there should be *no* real entities left, so find
# these not-entities and removing them here should be safe.
text = replaceNotEntities(text)
# &lt; &lt; and &amp; are the only html entities allowed in xhtml, put those back.
text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
return text
def makeAcceptableFilename(text):
return re.sub('[^a-zA-Z0-9_-]+','',removeEntities(text).replace(" ", "_").replace(":","_"))
def commaGroups(s):
groups = []
while s and s[-1].isdigit():
groups.append(s[-3:])
s = s[:-3]
return s + ','.join(reversed(groups))

View file

@@ -0,0 +1,367 @@
# -*- coding: utf-8 -*-
# Copied from the twilighted.py because site is almost the same..
# of course, now that we're trying to scrape more detail about the
# story, there were differences in how headers are displayed
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
class PotionsNSnitches(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.password = ''
self.login='sigizmund'
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Harry Potter')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-pns_'
self.chapurl = False
ss=self.url.split('?')
if ss is not None and len(ss) > 1:
sss = ss[1].replace('&amp;','&').split('&')
if sss is not None and len(sss) > 0:
ssss = sss[0].split('=')
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
self.storyId = ssss[1]
if len(sss) > 1:
ssss = sss[1].split('=')
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
self.chapurl = True
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
logging.debug('self.url=%s' % self.url)
logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
def _getLoginScript(self):
return '/user.php?action=login'
def reqLoginData(self, data):
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
return True
else:
return False
def _fillCharacters(self, strlist, idx, maxlen):
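# Collect character names from strlist until the next 'Label:' token;
# return the index just before that token so the caller's loop (which
# adds one afterwards) re-reads the label.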
ii = idx
while ii < maxlen:
chara = strlist[ii].strip()
if len(chara) > 0:
if chara.find(':') != -1:
return (ii-1)
elif chara.find(',') == -1:
self.addCharacter (chara)
ii = ii + 1
return (ii)
def _buildGenre(self, strlist, idx, maxlen):
self.genre = ''
ii = idx
while ii < maxlen:
genre = strlist[ii].strip()
if len(genre) > 0:
if genre.find(':') != -1:
return (ii-1)
elif genre.find(',') != -1:
genre = ', '
else:
self.addSubject (genre)
self.genre = self.genre + genre
ii = ii + 1
return (ii)
def _buildCategory(self, strlist, idx, maxlen):
self.category = ''
ii = idx
while ii < maxlen:
cat = strlist[ii].strip()
if len(cat) > 0:
if cat.find(':') != -1:
return (ii-1)
elif cat.find(',') != -1:
cat = ', '
else:
self.addSubject (cat)
self.category = self.category + cat
ii = ii + 1
return (ii)
def extractIndividualUrls(self):
url = self.url + '&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
self.performLogin()
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
self.storyName = ''
self.authorName = ''
self.storyId = '0'
title = soup.find('title').string
if title is not None and len(title) > 0:
logging.debug('Title: %s' % title)
ss = title.split(' by ')
if ss is not None and len(ss) > 1:
self.storyName = ss[0].strip()
self.authorName = ss[1].strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
result.append((url,chaptitle.string))
else:
result.append((url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
url = self.url + "&index=1"
data = self.opener.open(url).read()
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
pgt = soup.find('div', {'id' : 'pagetitle'})
#logging.debug('pagetitle: %s' % pgt)
pgtAs = pgt.findAll('a')
#logging.debug('pgtAs: %s' % pgtAs)
for a in pgtAs:
if a['href'].find('viewstory.php') != -1:
(u1, self.storyId) = a['href'].split('=')
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
self.authorURL = 'http://' + self.host + '/' + a['href']
(u1, self.authorId) = a['href'].split('=')
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
output = soup.find('div', {'id' : 'output'})
#logging.debug('output: %s' % unicode(output))
if output is not None and len(unicode(output)) > 1:
s2 = re.split ('<[^>]+>', unicode(output))
#logging.debug('s2=%s' % s2)
ii = 0
ll = len(s2)
while ii < ll:
if s2[ii] == 'Summary:' and ii+1 < ll:
self.storyDescription = s2[ii+1].strip()
logging.debug('self.storyDescription: %s' % self.storyDescription)
break
ii = ii+1
cnt = soup.find('div', {'class' : 'content'})
#logging.debug('content: %s' % cnt)
cnttd = cnt.findAll('td')
#logging.debug('cnttd: %s' % cnttd)
for td in cnttd:
#logging.debug('td: %s' % unicode(td))
ss = unicode(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
if len(ss) > 1:
s2 = re.split ('<[^>]+>', ss)
#logging.debug('s2=%s' % s2)
ii = 0
ll = len(s2)
while ii < ll-1:
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
skey = s2[ii].strip()
ii = ii+1
if skey == 'Rated:':
self.storyRating = s2[ii].strip()
logging.debug('self.storyRating=%s' % self.storyRating)
ii = ii + 1
elif skey == 'Chapters:':
self.numChapters = s2[ii].strip()
logging.debug('self.numChapters=%s' % self.numChapters)
ii = ii + 1
elif skey == 'Characters:':
ii = self._fillCharacters(s2, ii, ll)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
ii = ii + 1
elif skey == 'Genres:':
ii = self._buildGenre(s2, ii, ll)
logging.debug('self.genre=%s' % self.genre)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Categories:':
ii = self._buildCategory(s2, ii, ll)
logging.debug('self.category=%s' % self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Completed:':
if s2[ii].strip(' ') == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii + 1
elif skey == 'Word count:':
self.numWords = s2[ii].strip()
if self.numWords is None or len(self.numWords) == 0:
self.numWords = '0'
logging.debug('self.numWords=%s' % self.numWords)
ii = ii + 1
elif skey == 'Takes Place:':
ii = ii + 1
elif skey == 'Awards:':
ii = ii + 1
elif skey == 'Series:':
ii = ii + 1
elif skey == 'Read:':
ii = ii + 1
elif skey == 'Warnings:':
ii = ii + 1
else:
ii = ii + 1
tls = soup.findAll('div', {'style' : 'text-align: center;'})
for tl in tls:
#logging.debug('tl: %s' % tl)
ss = unicode(tl).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
if ss.find('Published:') != -1:
s2 = re.split ('<[^>]+>', ss)
#logging.debug('s2: %s' % s2)
ii = 0
ll = len(s2)
while ii < ll-1:
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
skey = s2[ii].strip()
#logging.debug('skey: %s' % skey)
ii = ii+1
if skey == 'Published:':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
ii = ii + 1
elif skey == 'Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
ii = ii + 1
else:
ii = ii + 1
if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
logging.error('self.storyName is empty!! Exiting!')
exit(1)
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
# need to do this, because for some reason the <br /> tag in the story causes problems
data = data.replace('<br />', ' SOMETHING_BR ')
soup = None
try:
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# put the <br /> tags back in..
text = div.__str__('utf8').replace(' SOMETHING_BR ','<br />')
return text
class PotionsNSnitches_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testLoginWorks(self):
pass
def testGetUrlsWorks(self):
url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
self.assertEquals(32, len(PotionsNSnitches(url).extractIndividualUrls()))
if __name__ == '__main__':
unittest.main()

View file

@@ -0,0 +1,10 @@
To use, do:
python downloader.py <url> (epub|html|text|mobi)
Eg:
python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo epub
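Or, to produce a single HTML file from the same story:
python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo html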
This tool is written for Python 2.5.2, but should also work with newer versions of Python 2.

64
fanficdownloader/story.py Normal file
View file

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from htmlcleanup import conditionalRemoveEntities
class Story:
def __init__(self):
self.metadata = {}
self.chapters = [] # chapters will be tuples of (title,html)
self.listables = {} # some items (extratags, category, warnings & genres) are also kept as lists.
def setMetadata(self, key, value):
self.metadata[key]=conditionalRemoveEntities(value)
def getMetadataRaw(self,key):
if self.metadata.has_key(key):
return self.metadata[key]
def getMetadata(self, key):
if self.getLists().has_key(key):
return ', '.join(self.getList(key))
if self.metadata.has_key(key):
value = self.metadata[key]
if value:
if key == "numWords":
value = commaGroups(value)
if key == "dateCreated":
value = value.strftime("%Y-%m-%d %H:%M:%S")
if key == "datePublished" or key == "dateUpdated":
value = value.strftime("%Y-%m-%d")
return value
def addToList(self,listname,value):
if not self.listables.has_key(listname):
self.listables[listname]=[]
# prevent duplicates.
if not value in self.listables[listname]:
self.listables[listname].append(conditionalRemoveEntities(value))
def getList(self,listname):
if not self.listables.has_key(listname):
return []
return self.listables[listname]
def getLists(self):
return self.listables
def addChapter(self, title, html):
self.chapters.append( (title,html) )
def getChapters(self):
"Chapters will be tuples of (title,html)"
return self.chapters
def __str__(self):
return "Metadata: " +str(self.metadata) + "\nListables: " +str(self.listables) #+ "\nChapters: "+str(self.chapters)
def commaGroups(s):
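# e.g. commaGroups("1234567") -> "1,234,567"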
groups = []
while s and s[-1].isdigit():
groups.append(s[-3:])
s = s[:-3]
return s + ','.join(reversed(groups))

View file

@@ -0,0 +1,316 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
import twipassword
class Twilighted(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.password=twipassword.password
self.login='sigizmund'
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Twilight')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = 'Fanfiction'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-tw_'
self.chapurl = False
ss=self.url.split('?')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
sss = ss[1].replace('&amp;','&').split('&')
logging.debug('sss=%s' % sss)
if sss is not None and len(sss) > 0:
ssss = sss[0].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
self.storyId = ssss[1]
if len(sss) > 1:
ssss = sss[1].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
self.chapurl = True
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
logging.debug('self.url=%s' % self.url)
logging.debug("Created Twilighted: url=%s" % (self.url))
def _getLoginScript(self):
return '/user.php?action=login'
def reqLoginData(self, data):
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
return True
else:
return False
def requiresLogin(self, url = None):
return True
def performLogin(self, url = None):
data = {}
data['penname'] = self.login
data['password'] = self.password
data['cookiecheck'] = '1'
data['submit'] = 'Submit'
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
d = req.read().decode('utf-8')
if self.reqLoginData(d) :
return False
else:
return True
def extractIndividualUrls(self):
url = self.url + '&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
self.performLogin()
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
title = soup.find('title').string
logging.debug('Title: %s' % title)
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
url = self.url + "&index=1"
data = self.opener.open(url).read()
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
metas = soup.findAll('meta')
for meta in metas:
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
#logging.debug('Meta: %s' % meta)
if 'content' in meta._getAttrMap():
s1 = bs.BeautifulStoneSoup(meta['content'])
ps = s1.findAll('p')
if len(ps) > 0:
self.storyDescription = ps[0]
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
else:
divs = meta.findAll('div')
#logging.debug('Divs: %s' % divs)
for div in divs:
#logging.debug('Div: %s' % div)
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
#logging.debug('Div PAGETITLE: %s' % div)
allA = div.findAll('a')
for a in allA:
if 'href' in a._getAttrMap():
if a['href'].find('viewstory.php?sid=') != -1:
str1 = a.string
(vs, self.storyId) = a['href'].split('=')
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
if a['href'].find('viewuser.php?uid=') != -1:
str1 = a.string
(vs, self.authorId) = a['href'].split('=')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
logging.debug('self.authorURL=%s' % self.authorURL)
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
#logging.debug('Div CONTENT: %s' % div)
brs = div.findAll('br')
for br in brs:
buf = unicode(br).encode('utf-8')
strs = re.split ('<[^>]+>', buf)
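# BeautifulStoneSoup doesn't treat <br> as self-closing here, so each 'br'
# element swallows the text that follows it; stripping the tags leaves a
# list that the loop below walks as label ('Rated:', 'Completed:', ...)
# and value entries.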
#logging.debug('BUF: %s' % strs)
ii = 2
stlen = len(strs)
while stlen > ii+1:
if len(strs[ii]) == 0:
ii = ii+1
continue
if strs[ii] == 'Categories:':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
if len(self.genre) > 0:
self.genre = self.genre + ', '
self.genre = self.genre + strs[ii].strip(' ')
if len(self.category) == 0:
self.category = strs[ii].strip(' ')
self.addSubject(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.subjects=%s' % self.subjects)
if strs[ii] == 'Characters: ':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
self.addCharacter(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif strs[ii] == 'Completed:':
if strs[ii+1].strip(' ') == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii+2
logging.debug('self.storyStatus=%s' % self.storyStatus)
elif strs[ii] == 'Rated:':
self.storyRating = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.storyRating=%s' % self.storyRating)
elif strs[ii] == 'Series:':
self.storySeries = strs[ii+1].strip(' ')
if self.storySeries == 'None':
self.storySeries = ''
ii = ii+2
logging.debug('self.storySeries=%s' % self.storySeries)
elif strs[ii] == 'Chapters: ':
self.numChapters = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numChapters=%s' % self.numChapters)
elif strs[ii] == 'Word count:':
self.numWords = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numWords=%s' % self.numWords)
elif strs[ii] == ' Published: ':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif strs[ii] == 'Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
ii = ii+2
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class Twilighted_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testLoginWorks(self):
url = 'http://www.twilighted.net/viewstory.php?sid=10004'
self.assertTrue(Twilighted(url).performLogin())
def testGetUrlsWorks(self):
url = 'http://www.twilighted.net/viewstory.php?sid=10004'
self.assertEquals(32, len(Twilighted(url).extractIndividualUrls()))
if __name__ == '__main__':
unittest.main()

View file

@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
# This is really for the web version. download.py will ask.
password='somepass'
twiwritepassword='otherpass'

View file

@@ -0,0 +1,280 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
import twipassword
class Twiwrite(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.password=twipassword.twiwritepassword
self.login='BobsClue'
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Twiwrite')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = 'Fanfiction'
self.storyStatus = 'Unknown'
self.storyRating = 'Unknown'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-twrt_'
self.chapurl = False
ss=self.url.split('?')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
sss = ss[1].replace('&amp;','&').split('&')
logging.debug('sss=%s' % sss)
if sss is not None and len(sss) > 0:
ssss = sss[0].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
self.storyId = ssss[1]
if len(sss) > 1:
ssss = sss[1].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
self.chapurl = True
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
logging.debug('self.url=%s' % self.url)
logging.debug("Created Twiwrite: url=%s" % (self.url))
def _getLoginScript(self):
return '/user.php?action=login'
def reqLoginData(self, data):
if data.find('Registered Users Only') != -1 or data.find('There is no such account on our website') != -1:
return True
else:
return False
def requiresLogin(self, url = None):
return False
def performLogin(self, url = None):
data = {}
data['penname'] = self.login
data['password'] = self.password
data['cookiecheck'] = '1'
data['submit'] = 'Submit'
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
d = req.read().decode('utf-8')
if self.reqLoginData(d) :
return False
else:
return True
def extractIndividualUrls(self):
url = self.url + '&chapter=1&ageconsent=ok&warning=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
self.performLogin()
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
#<div id="pagetitle"><a href="viewstory.php?sid=280">Twilight for Dummies</a> by <a href="viewuser.php?uid=61">The Chick Norris</a> </div>
div = soup.find('div',{'id':'pagetitle'})
titlea = div.find('a', href=re.compile(r"viewstory.php"))
self.storyName = titlea.string
authora = div.find('a', href=re.compile(r"viewuser.php"))
self.authorName = authora.string
self.authorId= authora['href'].split('=')[1]
self.authorURL = 'http://'+self.host+'/'+authora['href']
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s&ageconsent=ok&warning=1" % o['value']
title = o.string
result.append((url,title))
url = self.url + "&index=1&ageconsent=ok&warning=1"
data = self.opener.open(url).read()
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
value = labelspan.nextSibling
label = labelspan.string
if 'Rated' in label:
self.storyRating = value.strip()
if 'Chapters' in label:
self.numChapters = value.strip()
if 'Word count' in label:
self.numWords = value.strip()
if 'Categories' in label:
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
catstext = [cat.string for cat in cats]
self.category = ', '.join(catstext)
for cat in catstext:
self.addSubject(cat)
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
self.addSubject(genre)
if 'Completed' in label:
if 'Yes' in value:
self.storyStatus = 'Completed'
else:
self.storyStatus = 'In-Progress'
if 'Published' in label:
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y")))
if 'Updated' in label:
# there's a stray [ at the end.
value = value[0:-1]
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y")))
# the only things in <p> tags in <div class='content'> are the parts of the summary.
divcontent = soup.find('div',{'class':'content'})
# metadesc = soup.find('meta',{'name':'description'})
# contentsoup = bs.BeautifulStoneSoup(metadesc['content'])
ps = divcontent.findAll('p')
pstext=[]
for p in ps:
if p.string:
s = p.string.replace('&nbsp;',' ').strip()
if s:
pstext.append(p.string)
self.storyDescription = ' '.join(pstext)
print "self.storyDescription: %s"%self.storyDescription
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class Twiwrite_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testLoginWorks(self):
url = 'http://www.twiwrite.net/viewstory.php?sid=117'
self.assertTrue(Twiwrite(url).performLogin())
def testGetUrlsWorks(self):
url = 'http://www.twiwrite.net/viewstory.php?sid=117'
self.assertEquals(36, len(Twiwrite(url).extractIndividualUrls()))
if __name__ == '__main__':
unittest.main()

225
fanficdownloader/whofic.py Normal file
View file

@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
class Whofic(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('Fanfiction')
self.subjects.append ('Doctor Who')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = ''
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.outputName = ''
self.outputStorySep = '-whof_'
self.chapurl = False
ss=self.url.split('?')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
sss = ss[1].replace('&amp;','&').split('&')
logging.debug('sss=%s' % sss)
if sss is not None and len(sss) > 0:
ssss = sss[0].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
self.storyId = ssss[1]
if len(sss) > 1:
ssss = sss[1].split('=')
logging.debug('ssss=%s' % ssss)
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
self.chapurl = True
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
logging.debug('self.url=%s' % self.url)
logging.debug("Created Whofic: url=%s" % (self.url))
def requiresLogin(self, url = None):
return False
def extractIndividualUrls(self):
url = self.url + '&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
title = soup.find('title').string
title = title.split('::')[1].strip()
logging.debug('Title: %s' % title)
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
for a in soup.findAll('a'):
if a['href'].startswith('viewuser.php'):
self.authorId = a['href'].split('=')[1]
self.authorURL = 'http://'+self.host+'/'+a['href']
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
# just in case there's tags, like <i> in chapter titles.
title = "%s" % o
title = re.sub('<[^>]+>','',title)
result.append((url,title))
## Whofic.com puts none of the meta data in the chapters or
## even the story chapter index page. Need to scrape the
## author page to find it.
data = self.opener.open(self.authorURL).read()
soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
# find this story in the list, parse it's metadata based on
# lots of assumptions, since there's little tagging.
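# Assumed layout of that <td> once split on '<br />' below: chunk [1] is
# the summary, chunk [2] is 'categories - rating - warnings - genres', and
# chunk [5] holds 'Published: ... - Updated: ... - Completed: ... -
# Word Count: ...' style name/value pairs.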
for a in soup.findAll('a'):
if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
metadata = a.findParent('td')
metadatachunks = metadata.__str__('utf8').split('<br />')
# process metadata for this story.
self.storyDescription = metadatachunks[1].strip()
# the stuff with ' - ' separators
moremeta = metadatachunks[2]
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
moremetaparts = moremeta.split(' - ')
self.category = moremetaparts[0]
for cat in self.category.split(', '):
self.addSubject(cat.strip())
self.storyRating = moremetaparts[1]
for warn in moremetaparts[2].split(', '):
self.addSubject(warn.strip())
self.genre = moremetaparts[3]
# the stuff with ' - ' separators *and* names
moremeta = metadatachunks[5]
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
moremetaparts = moremeta.split(' - ')
for part in moremetaparts:
(name,value) = part.split(': ')
name=name.strip()
value=value.strip()
if name == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
if name == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
if name == 'Completed' and value == 'Yes':
self.storyStatus = name
if name == 'Word Count':
self.numWords = value
break
self.numChapters = len(result)
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
# I really wish I knew why adastra needs the selfClosingTags to make <br /> work, but ficwad doesn't.
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES, selfClosingTags=('br','hr'))
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
# hardly a great identifier, I know, but whofic really doesn't
# give us anything better to work with.
span = soup.find('span', {'style' : 'font-size: 100%;'})
if None == span:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return span.__str__('utf8')
class Whofic_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
def testGetUrlsWorks(self):
url = 'http://www.whofic.com/viewstory.php?sid=37139'
self.assertEquals(6, len(Whofic(url).extractIndividualUrls()))
if __name__ == '__main__':
unittest.main()

View file

@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
## This could (should?) use a dynamic loader like adapters, but for
## now, it's static, since there's so few of them.
from writers.writer_html import HTMLWriter
from writers.writer_txt import TextWriter
from writers.writer_epub import EpubWriter
def getWriter(type,config,story):
if type == "html":
return HTMLWriter(config,story)
if type == "txt":
return TextWriter(config,story)
if type == "epub":
return EpubWriter(config,story)
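# Rough usage sketch (illustrative only; assumes `config` is the
# ConfigParser-style object Configurable expects and `story` is a Story
# already populated by an adapter):
#   writer = getWriter("epub", config, story)
#   writer.writeStory()  # writes the output file named by the ini settings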

View file

@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
import re
import os.path
import string
import StringIO
import zipfile
from zipfile import ZipFile, ZIP_DEFLATED
from story import Story
from configurable import Configurable
from htmlcleanup import removeEntities, removeAllEntities, stripHTML
from adapters.base_adapter import *
class BaseStoryWriter(Configurable):
@staticmethod
def getFormatName():
return 'base'
@staticmethod
def getFormatExt():
return '.bse'
def __init__(self, config, story):
Configurable.__init__(self, config)
self.addConfigSection(self.getFormatName())
self.story = story
self.titleLabels = {
'category':'Category',
'genre':'Genre',
'status':'Status',
'datePublished':'Published',
'dateUpdated':'Updated',
'dateCreated':'Packaged',
'rating':'Rating',
'warnings':'Warnings',
'numChapters':'Chapters',
'numWords':'Words',
'site':'Publisher',
'storyId':'Story ID',
'authorId':'Author ID',
'extratags':'Extra Tags',
'title':'Title',
'storyUrl':'Story URL',
'description':'Summary',
'author':'Author',
'authorUrl':'Author URL',
'formatname':'File Format',
'formatext':'File Extension',
}
self.story.setMetadata('formatname',self.getFormatName())
self.story.setMetadata('formatext',self.getFormatExt())
def getOutputFileName(self):
return self.getFileName(self.getConfig('output_filename'))
def getZipFileName(self):
return self.getFileName(self.getConfig('zip_filename'),extension=".zip")
def getFileName(self,template,extension="${formatext}"):
values = self.story.metadata
fallback=False
# fall back default:
if not template:
template="${title}-${siteabbrev}_${storyId}${formatext}"
fallback=True
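# Illustrative only: with a title of 'A Fox in Tokyo', a (hypothetical)
# siteabbrev of 'ffnet' and a storyId of '5192986', the fallback template
# above yields 'A Fox in Tokyo-ffnet_5192986.epub' for the epub writer.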
# Add extension if not already included.
if extension not in template:
template+=extension
if fallback or self.getConfig('safe_filename'):
values={}
pattern = re.compile(r"[^a-zA-Z0-9_\. \[\]\(\)&'-]+")
for k in self.story.metadata.keys():
values[k]=re.sub(pattern,'_', removeAllEntities(self.story.getMetadata(k)))
return string.Template(template).substitute(values).encode('utf8')
def _write(self, out, text):
out.write(text.encode('utf8'))
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None):
"""
Write the title page, but only include entries for which there is
metadata. START, ENTRY and END are expected to already be
string.Template() instances. START and END are expected to use the same
names as Story.metadata, but ENTRY should use label and value.
"""
if self.getConfig("include_titlepage"):
self._write(out,START.substitute(self.story.metadata))
if WIDE_ENTRY==None:
WIDE_ENTRY=ENTRY
titleEntriesList = self.getConfigList("titlepage_entries")
wideTitleEntriesList = self.getConfigList("wide_titlepage_entries")
for entry in titleEntriesList:
if entry in self.titleLabels:
if self.story.getMetadata(entry):
if entry in wideTitleEntriesList:
TEMPLATE=WIDE_ENTRY
else:
TEMPLATE=ENTRY
self._write(out,TEMPLATE.substitute({'label':self.titleLabels[entry],
'value':self.story.getMetadata(entry)}))
self._write(out,END.substitute(self.story.metadata))
def writeTOCPage(self, out, START, ENTRY, END):
"""
Write the Table of Contents page. START, ENTRY and END are expected to
already be string.Template() instances. START and END are expected to use
the same names as Story.metadata, but ENTRY should use index and chapter.
"""
# Only do TOC if there's more than one chapter and it's configured.
if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage"):
self._write(out,START.substitute(self.story.metadata))
for index, (title,html) in enumerate(self.story.getChapters()):
self._write(out,ENTRY.substitute({'chapter':title, 'index':"%04d"%(index+1)}))
self._write(out,END.substitute(self.story.metadata))
# if no outstream is given, write to file.
def writeStory(self,outstream=None):
self.addConfigSection(self.story.getMetadata('site'))
self.addConfigSection(self.story.getMetadata('site')+":"+self.getFormatName())
for tag in self.getConfigList("extratags"):
self.story.addToList("extratags",tag)
zipfilename=self.getZipFileName()
filename=self.getOutputFileName()
if self.getConfig('zip_output'):
outfilename=zipfilename
else:
outfilename=filename
if not outstream:
if self.getConfig('make_directories'):
path=""
dirs = os.path.dirname(outfilename).split('/')
for dir in dirs:
path+=dir+"/"
if not os.path.exists(path):
os.mkdir(path) ## os.makedirs() doesn't work in 2.5.2?
outstream = open(outfilename,"wb")
if self.getConfig('zip_output'):
out = StringIO.StringIO()
self.writeStoryImpl(out)
zipout = ZipFile(outstream, 'w', compression=ZIP_DEFLATED)
zipout.writestr(filename,out.getvalue())
zipout.close()
out.close()
else:
self.writeStoryImpl(outstream)
outstream.close()
def writeStoryImpl(self, out):
"Must be overriden by sub classes."
pass

View file

@@ -0,0 +1,404 @@
# -*- coding: utf-8 -*-
import logging
import string
import StringIO
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
## XML isn't as forgiving as HTML, so rather than generate as strings,
## use DOM to generate the XML files.
from xml.dom.minidom import parse, parseString, getDOMImplementation
from writers.base_writer import *
class EpubWriter(BaseStoryWriter):
@staticmethod
def getFormatName():
return 'epub'
@staticmethod
def getFormatExt():
return '.epub'
def __init__(self, config, story):
BaseStoryWriter.__init__(self, config, story)
self.EPUB_CSS='''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
pre { font-size: x-small; }
sml { font-size: small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.cover {text-align: center;}
.full {width: 100%; }
.quarter {width: 25%; }
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
self.EPUB_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
<div>
''')
self.EPUB_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')
self.EPUB_TITLE_PAGE_END = string.Template('''
</div>
</body>
</html>
''')
self.EPUB_TABLE_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
<table class="full">
''')
self.EPUB_TABLE_TITLE_ENTRY = string.Template('''
<tr><td><b>${label}:</b></td><td>${value}</td></tr>
''')
self.EPUB_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')
self.EPUB_TABLE_TITLE_PAGE_END = string.Template('''
</table>
</body>
</html>
''')
self.EPUB_TOC_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<div>
<h3>Table of Contents</h3>
''')
self.EPUB_TOC_ENTRY = string.Template('''
<a href="file${index}.xhtml">${chapter}</a><br />
''')
self.EPUB_TOC_PAGE_END = string.Template('''
</div>
</body>
</html>
''')
self.EPUB_CHAPTER_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${chapter}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h2>${chapter}</h2>
''')
self.EPUB_CHAPTER_END = string.Template('''
</body>
</html>
''')
def getMetadata(self,key):
return removeAllEntities(self.story.getMetadata(key))
def writeStoryImpl(self, out):
## Python 2.5 ZipFile is rather more primitive than later
## versions. It can operate on a file, or on a StringIO, but
## not on an open stream. OTOH, I suspect we would have had
## problems with closing and opening again to change the
## compression type anyway.
zipio = StringIO.StringIO()
## mimetype must be first file and uncompressed. Python 2.5
## ZipFile can't change compression type file-by-file, so we
## have to close and re-open
outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
outputepub.writestr('mimetype','application/epub+zip')
outputepub.close()
## Re-open file for content.
outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
## Create META-INF/container.xml file. The only thing it does is
## point to content.opf
containerdom = getDOMImplementation().createDocument(None, "container", None)
containertop = containerdom.documentElement
containertop.setAttribute("version","1.0")
containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
rootfiles = containerdom.createElement("rootfiles")
containertop.appendChild(rootfiles)
rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
"media-type":"application/oebps-package+xml"}))
outputepub.writestr("META-INF/container.xml",containerdom.toxml(encoding='utf-8'))
del containerdom
## Epub has two metadata files with real data. We're putting
## them in content.opf (pointed to by META-INF/container.xml)
## and toc.ncx (pointed to by content.opf)
## content.opf contains metadata, a 'manifest' list of all
## other included files, and another 'spine' list of the items in the
## file
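## Roughly, the generated content.opf is shaped like this (illustrative
## skeleton only):
##   <package unique-identifier="fanficdownloader-uid">
##     <metadata>  dc:identifier, dc:title, dc:creator, dates, dc:subject... </metadata>
##     <manifest>  an <item> for every file included in the epub </manifest>
##     <spine>     an <itemref> for each item in reading order </spine>
##   </package>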
uniqueid= 'fanficdownloader-uid:%s-u%s-s%s' % (
self.getMetadata('site'),
self.getMetadata('authorId'),
self.getMetadata('storyId'))
contentdom = getDOMImplementation().createDocument(None, "package", None)
package = contentdom.documentElement
package.setAttribute("version","2.0")
package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
package.setAttribute("unique-identifier","fanficdownloader-uid")
metadata=newTag(contentdom,"metadata",
attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
"xmlns:opf":"http://www.idpf.org/2007/opf"})
package.appendChild(metadata)
metadata.appendChild(newTag(contentdom,"dc:identifier",
text=uniqueid,
attrs={"id":"fanficdownloader-uid"}))
if self.getMetadata('title'):
metadata.appendChild(newTag(contentdom,"dc:title",text=self.getMetadata('title')))
if self.getMetadata('author'):
metadata.appendChild(newTag(contentdom,"dc:creator",
attrs={"opf:role":"aut"},
text=self.getMetadata('author')))
metadata.appendChild(newTag(contentdom,"dc:contributor",text="fanficdownloader [http://fanficdownloader.googlecode.com]",attrs={"opf:role":"bkp"}))
metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
# published, created, updated, calibre
# Leave calling self.story.getMetadataRaw directly in case date format changes.
if self.story.getMetadataRaw('datePublished'):
metadata.appendChild(newTag(contentdom,"dc:date",
attrs={"opf:event":"publication"},
text=self.story.getMetadataRaw('datePublished').strftime("%Y-%m-%d")))
if self.story.getMetadataRaw('dateCreated'):
metadata.appendChild(newTag(contentdom,"dc:date",
attrs={"opf:event":"creation"},
text=self.story.getMetadataRaw('dateCreated').strftime("%Y-%m-%d")))
if self.story.getMetadataRaw('dateUpdated'):
metadata.appendChild(newTag(contentdom,"dc:date",
attrs={"opf:event":"modification"},
text=self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%d")))
metadata.appendChild(newTag(contentdom,"meta",
attrs={"name":"calibre:timestamp",
"content":self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%dT%H:%M:%S")}))
# Last Update tags for Bill.
self.story.addToList('lastupdate',self.story.getMetadataRaw('dateUpdated').strftime("Last Update Year/Month: %Y/%m"))
self.story.addToList('lastupdate',self.story.getMetadataRaw('dateUpdated').strftime("Last Update: %Y/%m/%d"))
if self.getMetadata('description'):
metadata.appendChild(newTag(contentdom,"dc:description",text=
self.getMetadata('description')))
# listables all go into dc:subject tags, but only if they are configured.
for (name,lst) in self.story.getLists().iteritems():
if name in self.getConfigList("include_subject_tags"):
for tag in lst:
metadata.appendChild(newTag(contentdom,"dc:subject",text=
tag))
if self.getMetadata('site'):
metadata.appendChild(newTag(contentdom,"dc:publisher",
text=self.getMetadata('site')))
if self.getMetadata('storyUrl'):
metadata.appendChild(newTag(contentdom,"dc:identifier",
attrs={"opf:scheme":"URL"},
text=self.getMetadata('storyUrl')))
metadata.appendChild(newTag(contentdom,"dc:source",
text=self.getMetadata('storyUrl')))
## end of metadata, create manifest.
items = [] # list of (id, href, type, title) tuples(all strings)
itemrefs = [] # list of strings -- idrefs from .opfs' spines
items.append(("ncx","toc.ncx","application/x-dtbncx+xml",None)) ## we'll generate the toc.ncx file,
## but it needs to be in the items manifest.
items.append(("style","OEBPS/stylesheet.css","text/css",None))
if self.getConfig("include_titlepage"):
items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
itemrefs.append("title_page")
if self.getConfig("include_tocpage"):
items.append(("toc_page","OEBPS/toc_page.xhtml","application/xhtml+xml","Table of Contents"))
itemrefs.append("toc_page")
for index, (title,html) in enumerate(self.story.getChapters()):
i=index+1
items.append(("file%04d"%i,
"OEBPS/file%04d.xhtml"%i,
"application/xhtml+xml",
title))
itemrefs.append("file%04d"%i)
manifest = contentdom.createElement("manifest")
package.appendChild(manifest)
for item in items:
(id,href,type,title)=item
manifest.appendChild(newTag(contentdom,"item",
attrs={'id':id,
'href':href,
'media-type':type}))
spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
package.appendChild(spine)
for itemref in itemrefs:
spine.appendChild(newTag(contentdom,"itemref",
attrs={"idref":itemref,
"linear":"yes"}))
# write content.opf to zip.
outputepub.writestr("content.opf",contentdom.toxml(encoding='utf-8'))
del contentdom
## create toc.ncx file
tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
ncx = tocncxdom.documentElement
ncx.setAttribute("version","2005-1")
ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
head = tocncxdom.createElement("head")
ncx.appendChild(head)
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:uid", "content":uniqueid}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:depth", "content":"1"}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:totalPageCount", "content":"0"}))
head.appendChild(newTag(tocncxdom,"meta",
attrs={"name":"dtb:maxPageNumber", "content":"0"}))
docTitle = tocncxdom.createElement("docTitle")
docTitle.appendChild(newTag(tocncxdom,"text",text=self.getMetadata('title')))
ncx.appendChild(docTitle)
tocnavMap = tocncxdom.createElement("navMap")
ncx.appendChild(tocnavMap)
# <navPoint id="<id>" playOrder="<risingnumberfrom0>">
# <navLabel>
# <text><chapter title></text>
# </navLabel>
# <content src="<chapterfile>"/>
# </navPoint>
index=0
for item in items:
(id,href,type,title)=item
# only items to be skipped (toc.ncx, stylesheet.css) should have no title.
if title :
navPoint = newTag(tocncxdom,"navPoint",
attrs={'id':id,
'playOrder':str(index)})
tocnavMap.appendChild(navPoint)
navLabel = newTag(tocncxdom,"navLabel")
navPoint.appendChild(navLabel)
navLabel.appendChild(newTag(tocncxdom,"text",text=title))
navPoint.appendChild(newTag(tocncxdom,"content",attrs={"src":href}))
index=index+1
# write toc.ncx to zip file
outputepub.writestr("toc.ncx",tocncxdom.toxml(encoding='utf-8'))
del tocncxdom
# write stylesheet.css file.
outputepub.writestr("OEBPS/stylesheet.css",self.EPUB_CSS)
# write title page.
if self.getConfig("titlepage_use_table"):
TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
else:
TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY # same, only wide in tables.
TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END
titlepageIO = StringIO.StringIO()
self.writeTitlePage(out=titlepageIO,
START=TITLE_PAGE_START,
ENTRY=TITLE_ENTRY,
WIDE_ENTRY=WIDE_TITLE_ENTRY,
END=TITLE_PAGE_END)
if titlepageIO.getvalue(): # will be false if no title page.
outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
titlepageIO.close()
# write toc page.
tocpageIO = StringIO.StringIO()
self.writeTOCPage(tocpageIO,
self.EPUB_TOC_PAGE_START,
self.EPUB_TOC_ENTRY,
self.EPUB_TOC_PAGE_END)
if tocpageIO.getvalue(): # will be false if no toc page.
outputepub.writestr("OEBPS/toc_page.xhtml",tocpageIO.getvalue())
tocpageIO.close()
for index, (title,html) in enumerate(self.story.getChapters()):
logging.debug('Writing chapter text for: %s' % title)
fullhtml = self.EPUB_CHAPTER_START.substitute({'chapter':title, 'index':index+1}) + html + self.EPUB_CHAPTER_END.substitute({'chapter':title, 'index':index+1})
# ffnet(& maybe others) gives the whole chapter text as
# one line. This causes problems for nook(at least) when
# the chapter size starts getting big (200k+)
fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
del fullhtml
outputepub.close()
out.write(zipio.getvalue())
zipio.close()
## Utility method for creating new tags.
def newTag(dom,name,attrs=None,text=None):
tag = dom.createElement(name)
if( attrs is not None ):
for attr in attrs.keys():
tag.setAttribute(attr,attrs[attr])
if( text is not None ):
tag.appendChild(dom.createTextNode(text))
return tag
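## Illustrative usage (example values, not from the original source):
##   newTag(dom, "dc:creator", attrs={"opf:role":"aut"}, text="Jane Doe")
## produces <dc:creator opf:role="aut">Jane Doe</dc:creator> in the given DOM.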

View file

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
import logging
import string
from writers.base_writer import *
class HTMLWriter(BaseStoryWriter):
@staticmethod
def getFormatName():
return 'html'
@staticmethod
def getFormatExt():
return '.html'
def __init__(self, config, story):
BaseStoryWriter.__init__(self, config, story)
self.HTML_FILE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
</head>
<body>
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
''')
self.HTML_TITLE_PAGE_START = string.Template('''
<table class="full">
''')
self.HTML_TITLE_ENTRY = string.Template('''
<tr><td><b>${label}:</b></td><td>${value}</td></tr>
''')
self.HTML_TITLE_PAGE_END = string.Template('''
</table>
''')
self.HTML_TOC_PAGE_START = string.Template('''
<a name="TOCTOP"><h3>Table of Contents</h3>
<p>
''')
self.HTML_TOC_ENTRY = string.Template('''
<a href="#section${index}">${chapter}</a><br />
''')
self.HTML_TOC_PAGE_END = string.Template('''
</p>
''')
self.HTML_CHAPTER_START = string.Template('''
<a name="section${index}"><h2>${chapter}</h2></a>
''')
self.HTML_FILE_END = string.Template('''
</body>
</html>''')
def writeStoryImpl(self, out):
self._write(out,self.HTML_FILE_START.substitute(self.story.metadata))
self.writeTitlePage(out,
self.HTML_TITLE_PAGE_START,
self.HTML_TITLE_ENTRY,
self.HTML_TITLE_PAGE_END)
self.writeTOCPage(out,
self.HTML_TOC_PAGE_START,
self.HTML_TOC_ENTRY,
self.HTML_TOC_PAGE_END)
for index, (title,html) in enumerate(self.story.getChapters()):
logging.debug('Writing chapter text for: %s' % title)
self._write(out,self.HTML_CHAPTER_START.substitute({'chapter':title, 'index':"%04d"%(index+1)}))
self._write(out,html)
self._write(out,self.HTML_FILE_END.substitute(self.story.metadata))

View file

@ -0,0 +1,142 @@
# -*- coding: utf-8 -*-
import logging
import string
from textwrap import wrap
from writers.base_writer import *
from html2text import html2text, BODY_WIDTH
## In BaseStoryWriter, we define _write to encode <unicode> objects
## back into <string> for true output. But txt needs to write the
## title page and TOC to a buffer first to wordwrap. And StringIO
## gets pissy about unicode bytes in its buflist. This decodes the
## unicode containing <string> object passed in back to a <unicode>
## object so they join up properly. Could override _write to not
## encode and do out.write(whatever.encode('utf8')) instead. Honestly
## not sure which is uglier.
class KludgeStringIO():
def __init__(self, buf = ''):
self.buflist=[]
def write(self,s):
try:
s=s.decode('utf-8')
except:
pass
self.buflist.append(s)
def getvalue(self):
return u''.join(self.buflist)
def close(self):
pass
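## Rough behaviour sketch (assumption, not part of the original code):
##   buf = KludgeStringIO()
##   buf.write('caf\xc3\xa9')   # utf-8 encoded <str> chunk
##   buf.write(u' au lait')     # <unicode> chunk
##   buf.getvalue() == u'caf\xe9 au lait'
## Both chunks are stored as <unicode>, so the final join cannot blow up the
## way a plain StringIO buflist can with mixed str/unicode contents.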
class TextWriter(BaseStoryWriter):
@staticmethod
def getFormatName():
return 'txt'
@staticmethod
def getFormatExt():
return '.txt'
def __init__(self, config, story):
BaseStoryWriter.__init__(self, config, story)
self.TEXT_FILE_START = string.Template(u'''
${title}
by ${author}
''')
self.TEXT_TITLE_PAGE_START = string.Template(u'''
''')
self.TEXT_TITLE_ENTRY = string.Template(u'''${label}: ${value}
''')
self.TEXT_TITLE_PAGE_END = string.Template(u'''
''')
self.TEXT_TOC_PAGE_START = string.Template(u'''
TABLE OF CONTENTS
''')
self.TEXT_TOC_ENTRY = string.Template(u'''
${chapter}
''')
self.TEXT_TOC_PAGE_END = string.Template(u'''
''')
self.TEXT_CHAPTER_START = string.Template(u'''
\t${chapter}
''')
self.TEXT_FILE_END = string.Template(u'''
End file.
''')
def writeStoryImpl(self, out):
wrapout = KludgeStringIO()
wrapout.write(self.TEXT_FILE_START.substitute(self.story.metadata))
self.writeTitlePage(wrapout,
self.TEXT_TITLE_PAGE_START,
self.TEXT_TITLE_ENTRY,
self.TEXT_TITLE_PAGE_END)
towrap = wrapout.getvalue()
self.writeTOCPage(wrapout,
self.TEXT_TOC_PAGE_START,
self.TEXT_TOC_ENTRY,
self.TEXT_TOC_PAGE_END)
towrap = wrapout.getvalue()
wrapout.close()
towrap = removeAllEntities(towrap)
self._write(out,self.lineends(self.wraplines(towrap)))
for index, (title,html) in enumerate(self.story.getChapters()):
logging.debug('Writing chapter text for: %s' % title)
self._write(out,self.lineends(self.wraplines(removeAllEntities(self.TEXT_CHAPTER_START.substitute({'chapter':title, 'index':index+1})))))
self._write(out,self.lineends(html2text(html)))
self._write(out,self.lineends(self.wraplines(self.TEXT_FILE_END.substitute(self.story.metadata))))
def wraplines(self, text):
result=''
for para in text.split("\n"):
first=True
for line in wrap(para, BODY_WIDTH):
if first:
first=False
else:
result += u"\n"
result += line
result += u"\n"
return result
## The appengine will return unix line endings.
def lineends(self, txt):
txt = txt.replace('\r','')
if self.getConfig("windows_eol"):
txt = txt.replace('\n',u'\r\n')
return txt

177
fanficdownloader/zipdir.py Normal file
View file

@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
import sys
import os
import zlib
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from contextlib import closing
import logging
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from datetime import timedelta
import StringIO
class InvalidEPub(Exception):
pass
def checkNewer(filename, curdte):
ret = True
if not os.path.isfile(filename):
logging.debug('File %s does not already exist.' % filename)
return ret
#logging.debug('filename=%s, curdte=%s' % (filename, curdte))
lastdate = None
with closing(ZipFile(open(filename, 'rb'))) as epub:
titleFilePath = "OEBPS/title_page.xhtml"
contentFilePath = "OEBPS/content.opf"
namelist = set(epub.namelist())
#logging.debug('namelist=%s' % namelist)
if 'mimetype' not in namelist or \
'META-INF/container.xml' not in namelist:
#raise InvalidEPub('%s: not a valid EPUB' % filename)
logging.debug('File %s is not a valid EPub format file.' % filename)
return ret
if contentFilePath not in namelist:
return ret # file is not newer
data = epub.read(contentFilePath)
soup = bs.BeautifulStoneSoup(data)
lstdte = soup.find ('dc:date', {'opf:event' : 'modification'})
#logging.debug('lstdte=%s' % lstdte.string)
if lstdte is None and titleFilePath in namelist:
data = epub.read(titleFilePath)
soup = bs.BeautifulStoneSoup(data)
fld = ''
allTDs = soup.findAll ('td')
for td in allTDs:
b = td.find ('b')
if b is not None:
fld = b.string
if td.string is not None and fld == "Updated:":
lastdate = td.string
#logging.debug('title lastdate=%s' % lastdate)
else:
lastdate = lstdte.string.strip(' ')
#logging.debug('contents lastdate=%s' % lastdate)
if lastdate is not None:
currUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(curdte.strftime('%Y-%m-%d'), "%Y-%m-%d")))
storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(lastdate, "%Y-%m-%d")))
logging.debug('File %s last update date is %s, comparing to %s' % (filename, storyUpdated, currUpdated))
if currUpdated <= storyUpdated :
ret = False
logging.debug("Does %s need to be updated? %s" % (filename, ret))
return ret
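# Illustrative call (assumed caller, not in this file): 'curdte' is expected
# to be a datetime such as story.getMetadataRaw('dateUpdated'):
#   if checkNewer('My_Story.epub', story.getMetadataRaw('dateUpdated')):
#       ...re-download and re-write the epub...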
def toZip(filename, directory):
zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
lst = os.listdir(directory)
for entity in lst:
if entity.startswith('.'):
continue
each = os.path.join(directory,entity)
print(each)
if os.path.isfile(each):
print(each)
# epub standard requires mimetype to be uncompressed and first file.
if entity == 'mimetype':
zippedHelp.write(each, arcname=entity, compress_type=zipfile.ZIP_STORED)
else:
zippedHelp.write(each, arcname=entity)
else:
addFolderToZip(zippedHelp,entity, each)
zippedHelp.close()
def addFolderToZip(zippedHelp,folder,fpath):
#print('addFolderToZip(%s)' % folder)
if folder == '.' or folder == '..':
return
folderFiles = os.listdir(fpath)
for f in folderFiles:
if os.path.isfile(fpath + '/' + f):
#print('basename=%s' % os.path.basename(fpath + '/' + f))
zippedHelp.write(fpath + '/' + f, folder + '/' + f, zipfile.ZIP_DEFLATED)
elif os.path.isdir(fpath + '/' + f):
addFolderToZip(zippedHelp, folder + '/' + f, fpath + '/' + f)
def inMemoryZip(files):
# files have a structure of {'path/to/file' => content} dictionary
io = StringIO.StringIO()
if 'mimetype' in files:
# This fixes the uncompressed mimetype-first issue by opening
# the in memory file as STORE, putting in the mimetype, then
# closing and re-opening with DEFLATED. while it is often
# true that mimetype is the first file, we can't assume it,
# because the dict object is defined as unordered.
path='mimetype'
memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_STORED)
memzip.debug = 3
if type(files[path]) != type('str'):
data = files[path].getvalue()
else:
data = files[path]
logging.debug("Writing ZIP path %s" % path)
try:
memzip.writestr(path, data.encode('utf-8'))
except UnicodeDecodeError, e:
memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
memzip.close()
# remove it from the files dict.
del(files['mimetype'])
# open in 'a' append mode.
memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_DEFLATED)
memzip.debug = 3
for path in files:
if type(files[path]) != type('str'):
data = files[path].getvalue()
else:
data = files[path]
# logging.debug(data)
logging.debug("Writing ZIP path %s" % path)
try:
memzip.writestr(path, data.encode('utf-8'))
except UnicodeDecodeError, e:
memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
# declares all the files created by Windows.
for zf in memzip.filelist:
zf.create_system = 0
memzip.close()
return io
if __name__ == '__main__':
# toZip('sample.epub', "books/A_Time_To_Reflect")
# z = zipfile.ZipFile('sample.epub', 'r')
files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
data = inMemoryZip(files)
f = open('res.zip', 'wb')
f.write(data.getvalue())
f.close()

19
ffstorage.py Normal file
View file

@ -0,0 +1,19 @@
from google.appengine.ext import db
class DownloadMeta(db.Model):
user = db.UserProperty()
url = db.StringProperty()
name = db.StringProperty()
title = db.StringProperty()
author = db.StringProperty()
format = db.StringProperty()
failure = db.StringProperty()
completed = db.BooleanProperty(default=False)
date = db.DateTimeProperty(auto_now_add=True)
# data_chunks is implicit from DownloadData def.
class DownloadData(db.Model):
download = db.ReferenceProperty(DownloadMeta,
collection_name='data_chunks')
blob = db.BlobProperty()
index = db.IntegerProperty()
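# Illustrative sketch of how these models are used (see main.py; the variable
# names here are examples only): story bytes are stored as ordered chunks.
#   chunks = DownloadData.all().filter("download =", meta).order("index")
#   data = ''.join(chunk.blob for chunk in chunks)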

109
index-ajax.html Normal file
View file

@ -0,0 +1,109 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="css/index.css" rel="stylesheet" type="text/css">
<link type="text/css" href="http://jqueryui.com/latest/themes/base/ui.all.css" rel="stylesheet" />
<title>Fanfiction Downloader (fanfiction.net, fictionalley, ficwad to epub and HTML)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<script src="/js/jquery-1.3.2.js"></script>
<script src="/js/fdownloader.js"></script>
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.core.js"></script>
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.progressbar.js"></script>
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<!-- <form action="/fdown" method="post"> -->
<div id='urlbox'>
<div id='greeting'>
Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites much easier. Please paste a URL of the <em>first chapter</em> in the box to start. Alternatively, see your personal list of <a href="/recent">previously downloaded fanfics</a>.
</div>
<input type="text" id='url' name="url" size="50" value='{{ url }}'>
<div style="margin-top: 0.5em;">
Ebook format &nbsp;<select name="format" id="format">
<option value='epub'>ePub</option>
<option value='html'>HTML</option>
</select>
</div>
<div id='error' style='color: red'>
</div>
</div>
<div id='yourfile' style='display:none'>
</div>
<div id='typebox'>
</div>
<h3>
Login and Password
</h3>
<div id='logpassword'>
If the story requires a login and password (e.g. it is marked as Mature on FFA), you may need to provide your credentials to download it; otherwise just leave these fields empty.
</div>
<div id='logpasswordtable'>
<div class='fieldandlabel'>
<div class='label'>Login</div>
<div class='field'><input type='text' name='login' id='login' size='50'></div>
</div>
<div class='fieldandlabel'>
<div class='label'>Password</div>
<div class='field'><input type='password' id='password' name='password' size='50'></div>
</div>
</div>
<div id='submitbtn'>
<span id='submit_button'><button onclick='downloadFanfic();'>Download</button></span>
<span id='ajax_loader' style='display:none'><img src="/static/ajax-loader.gif"></span>
</div>
<div id="progressbar">
</div>
<div id='helpbox'>
A few things to know that will make your life substantially easier:
<ol>
<li>Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a> &mdash; how to read fiction in Stanza or any other ebook reader.</li>
<li>Currently we support fanfiction.net, fictionpress.com, fanficauthors.net and ficwad.com</li>
<li>Paste a URL of the first chapter of the fanfic, not the index page</li>
<li>Fics with a single chapter are not supported (you can just copy and paste it)</li>
<li>Stories which are too long may not download correctly and the application will report a time-out error &mdash; this is a limitation currently imposed by Google AppEngine on long-running activities</li>
<li>FicWad support is somewhat flaky &mdash; if you feel it doesn't work for you, send all the details to me</li>
<li>You can download fanfics and store them for 'later' by visiting the <a href="/recent">recent downloads</a> section, but in future they will be deleted after 5 days to save space</li>
<li>If the Downloader simply opens a file download window rather than saving the fanfic and giving you a link, the story is too large to save in the database and you need to download it straight away</li>
<li>If you think that something that should work in fact doesn't, drop me a mail to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a></li>
</ol>
Otherwise, just have fun, and if you want to say thank you &mdash; use the email above.
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
Copyright &copy; <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
</div>
<!-- </form> -->
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12136939-1");
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>

219
index.html Normal file
View file

@ -0,0 +1,219 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<div style="text-align: center">
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<!-- <div id='yourfile'> -->
{{yourfile}}
<!-- </div> -->
{% if authorized %}
<form action="/fdown" method="post">
<div id='urlbox'>
<div id='greeting'>
<p>Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites
much easier. </p>
<p>For Amazon Kindle use Mobi output (see notice below); for Sony Reader, Nook and iPad use ePub.</p>
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
</div>
<h3>Experimental New Feature</h3>
<p>
If you select the EPub format, you will also be given a 'Convert' link when the download finishes.
</p>
<p>
That link will take you to <a href="http://convertfiles.com">convertfiles.com</a> where you can
directly convert your new story to FictionBook (fb2), Mobipocket (mobi), MS Reader (lit) or Adobe Portable
Document Format (pdf).
There's also a 'Convert' link for EPubs on your <a href="/recent">recent downloads</a>
page. We'd really like to hear from users about this in our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>.
</p>
<p>
We'd especially like Kindle and other Mobi users to try it. The <a href="http://convertfiles.com">convertfiles.com</a> Mobi file
appears to be more correct than our Mobi output.
</p>
<div id='error'>
{{ error_message }}
</div>
<input type="text" name="url" size="50" value='{{ url }}'>
</div>
<div id='typebox'>
<div id='typelabel'>Ebook format</div>
<div id='typeoptions'>
<input type='radio' name='format' value='epub' checked>EPub</input>
<input type='radio' name='format' value='html'>HTML</input>
<input type='radio' name='format' value='text'>Plain Text</input>
<input type='radio' name='format' value='mobi'>Mobi (Kindle)</input>
</div>
</div>
<div id='logpasswordtable'>
<h3>Login and Password</h3>
<div id='logpassword'>
If the story requires a login and
password, you may need to provide
your credentials to download it;
otherwise just leave these fields
empty. Currently only needed
by twilighted.net and twiwrite.net.
</div>
<div class='fieldandlabel'>
<div class='label'>Login</div>
<div class='field'><input type='text' name='login' size='50'></div>
</div>
<div class='fieldandlabel'>
<div class='label'>Password</div>
<div class='field'><input type='password' name='password' size='50'></div>
</div>
</div>
<div id='submitbtn'>
<input type="submit" value="Download">
</div>
</form>
{% else %}
<div id='urlbox'>
<div id='greeting'>
<p>
This is a fan fiction downloader, which makes reading stories from various websites much easier. Before you
can start downloading fanfics, you need to log in, so the downloader can remember your fanfics and store them.
</p>
<p><a href="{{ login_url }}">Login using Google account</a></p>
</div>
</div>
{% endif %}
<div id='helpbox'>
<dl>
<dt>fictionalley.org
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.fictionalley.org/authors/drt/DA.html">http://www.fictionalley.org/authors/drt/DA.html</a>. Or the story text URL for
fictionalley.org one-shots, such as
<br /><a href="http://www.fictionalley.org/authors/drt/JOTP01a.html">http://www.fictionalley.org/authors/drt/JOTP01a.html</a>.
<dt>fanfiction.net
<dd>Use the URL of any story chapter, with or without story title such as
<br /><a href="http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo">http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo</a> or
<br /><a href="http://www.fanfiction.net/s/2345466/3/">http://www.fanfiction.net/s/2345466/3/</a>.
<dt>fictionpress.com
<dd>Use the URL of any story chapter, such as
<br /><a href="http://www.fictionpress.com/s/2851771/1/Untouchable_Love">http://www.fictionpress.com/s/2851771/1/Untouchable_Love</a> or
<br /><a href="http://www.fictionpress.com/s/2847338/6/">http://www.fictionpress.com/s/2847338/6/</a>.
<dt>twilighted.net
<dd>Use the URL of the start of the story, such as
<br /><a href="http://twilighted.net/viewstory.php?sid=8422">http://twilighted.net/viewstory.php?sid=8422</a>.
<dt>twiwrite.net
<dd>Use the URL of the start of the story, such as
<br /><a href="http://twiwrite.net/viewstory.php?sid=427">http://twiwrite.net/viewstory.php?sid=427</a>.
<dt>ficwad.com
<dd>Use the URL of any story chapter, such as
<br /><a href="http://www.ficwad.com/story/75246">http://www.ficwad.com/story/75246</a>.
<dt>harrypotterfanfiction.com
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.harrypotterfanfiction.com/viewstory.php?psid=289208">http://www.harrypotterfanfiction.com/viewstory.php?psid=289208</a>.
<dt>potionsandsnitches.net
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332">http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332</a>.
<dt>mediaminer.org
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/156934">http://www.mediaminer.org/fanfic/view_st.php/166653</a>.
Or the story URL for one-shots, such as
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/167618">http://www.mediaminer.org/fanfic/view_st.php/167618</a>.
<dt>adastrafanfic.com
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.adastrafanfic.com/viewstory.php?sid=854">http://www.adastrafanfic.com/viewstory.php?sid=854</a>.
<dt>whofic.com
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.whofic.com/viewstory.php?sid=16334">http://www.whofic.com/viewstory.php?sid=16334</a>.
</dl>
A few additional things to know, which will make your life substantially easier:
<ol>
<li>
First thing to know: I do not use your login and password. In fact, all I know is your ID &ndash; the password
is verified by Google and is absolutely, totally unknown to anyone but you.
</li>
<li>
Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a>
&mdash; how to read fiction in Stanza or any other ebook reader.
</li>
<li>
You can download fanfiction directly from your iPhone, Kindle or (possibly) other ebook reader.
</li>
<li>
Downloaded stories are deleted after some time (which should give you enough time to download them and will keep
Google happy about the app not going over the storage limit).
</li>
<li>
If you see funny characters in a downloaded Plain Text file, make sure you choose the UTF-8 text file encoding and
not something else.
</li>
<li>
If you think that something that should work in fact doesn't, drop me a mail
to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a>, or, even better, write an email to
our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>. I also encourage you to join it so
you will find out about the latest updates and fixes as soon as possible.
</li>
</ol>
Otherwise, just have fun, and if you want to say thank you &mdash; use the contacts above.
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
Copyright &copy; <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
</div>
<div style="margin-top: 1em; text-align: center'">
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* FFD */
google_ad_slot = "7330682770";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</div>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</body>
</html>

33
index.yaml Normal file
View file

@ -0,0 +1,33 @@
indexes:
# AUTOGENERATED
# This index.yaml is automatically updated whenever the dev_appserver
# detects that a new type of query is run. If you want to manage the
# index.yaml file manually, remove the above marker line (the line
# saying "# AUTOGENERATED"). If you want to manage some indexes
# manually, move them above the marker line. The index.yaml file is
# automatically uploaded to the admin console when you next deploy
# your application using appcfg.py.
- kind: DownloadData
properties:
- name: download
- name: index
- kind: DownloadMeta
properties:
- name: user
- name: date
direction: desc
- kind: DownloadedFanfic
properties:
- name: cleared
- name: date
- kind: DownloadedFanfic
properties:
- name: user
- name: date
direction: desc

116
js/fdownloader.js Normal file
View file

@ -0,0 +1,116 @@
var g_CurrentKey = null;
var g_Counter = 0;
var COUNTER_MAX = 50;
function setErrorState(error)
{
olderr = error;
error = error + "<br/><a href='mailto:sigizmund@gmail.com?subject=Problem with the fanfiction downloader'>" + "Complain about this error</a>";
$('#error').html(error);
}
function clearErrorState()
{
$('#error').html('');
}
function showFile(data)
{
$('#yourfile').html('<a href="/file?id=' + data.key + '">' + data.name + " by " + data.author + "</a>");
$('#yourfile').show();
}
function hideFile()
{
$('#yourfile').hide();
}
function checkResults()
{
if ( g_Counter >= COUNTER_MAX )
{
return;
}
g_Counter+=1;
$.getJSON('/progress', { 'key' : g_CurrentKey }, function(data)
{
if ( data.result != "Nope")
{
if ( data.result != "OK" )
{
leaveLoadingState();
setErrorState(data.result);
}
else
{
showFile(data);
leaveLoadingState();
// result = data.split("|");
// showFile(result[1], result[2], result[3]);
}
$("#progressbar").progressbar('destroy');
g_Counter = 101;
}
});
if ( g_Counter < COUNTER_MAX )
setTimeout("checkResults()", 1000);
else
{
leaveLoadingState();
setErrorState("Operation takes too long - terminating by timeout (story too long?)");
}
}
function enterLoadingState()
{
$('#submit_button').hide();
$('#ajax_loader').show();
}
function leaveLoadingState()
{
$('#submit_button').show();
$('#ajax_loader').hide();
}
function downloadFanfic()
{
clearErrorState();
hideFile();
format = $("#format").val();
alert(format);
return;
var url = $('#url').val();
var login = $('#login').val();
var password = $('#password').val();
if ( url == '' )
{
setErrorState('URL shouldn\'t be empty');
return;
}
if ( (url.indexOf('fanfiction.net') == -1 && url.indexOf('fanficauthors') == -1 && url.indexOf('ficwad') == -1 && url.indexOf('fictionpress') == -1) || (url.indexOf('adultfanfiction.net') != -1) )
{
setErrorState("This source is not yet supported. Ping me if you want it!");
return;
}
$.post('/submitDownload', {'url' : url, 'login' : login, 'password' : password, 'format' : format}, function(data)
{
g_CurrentKey = data;
g_Counter = 0;
setTimeout("checkResults()", 1000);
enterLoadingState();
})
}
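// Illustrative shape of the /progress JSON that checkResults()/showFile()
// expect (field names taken from the code above; values are examples only):
//   { "result": "OK", "key": "<datastore key>", "name": "Story Title", "author": "Author Name" }
// "result" stays "Nope" while the task is still running, or carries an error
// message on failure.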

4376
js/jquery-1.3.2.js vendored Normal file

File diff suppressed because it is too large Load diff

433
main.py Normal file
View file

@ -0,0 +1,433 @@
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import zlib
import logging
import traceback
import StringIO
from google.appengine.runtime import DeadlineExceededError
from google.appengine.api import taskqueue
from google.appengine.ext.webapp import template
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from fanficdownloader.downloader import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *
from fanficdownloader import twilighted
from fanficdownloader import adastrafanfic
from google.appengine.ext import db
from fanficdownloader.zipdir import *
from ffstorage import *
class LoginRequired(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if user:
self.redirect('/')
return
else:
logging.debug(users.create_login_url('/'))
url = users.create_login_url(self.request.uri)
template_values = {'login_url' : url}
path = os.path.join(os.path.dirname(__file__), 'index-nonlogin.html')
self.response.out.write(template.render(path, template_values))
class MainHandler(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if user:
error = self.request.get('error')
template_values = {'nickname' : user.nickname(), 'authorized': True}
url = self.request.get('url')
template_values['url'] = url
if error != None and len(error) > 1:
if error == 'login_required':
template_values['error_message'] = 'This story (or one of the chapters) requires you to be logged in.'
elif error == 'bad_url':
template_values['error_message'] = 'Unsupported URL: ' + url
elif error == 'custom':
template_values['error_message'] = 'Error happened: ' + self.request.get('errtext')
filename = self.request.get('file')
if len(filename) > 1:
template_values['yourfile'] = '''<div id='yourfile'><a href='/file?id=%s'>"%s" by %s</a></div>''' % (filename, self.request.get('name'), self.request.get('author'))
self.response.headers['Content-Type'] = 'text/html'
path = os.path.join(os.path.dirname(__file__), 'index.html')
self.response.out.write(template.render(path, template_values))
else:
logging.debug(users.create_login_url('/'))
url = users.create_login_url(self.request.uri)
template_values = {'login_url' : url, 'authorized': False}
path = os.path.join(os.path.dirname(__file__), 'index.html')
self.response.out.write(template.render(path, template_values))
class FileServer(webapp.RequestHandler):
def get(self):
fileId = self.request.get('id')
if fileId == None or len(fileId) < 3:
self.redirect('/')
return
key = db.Key(fileId)
fanfic = db.get(key)
# check for completed & failure.
name = fanfic.name.encode('utf-8')
name = makeAcceptableFilename(name)
logging.info("Serving file: %s" % name)
if fanfic.format == 'epub':
self.response.headers['Content-Type'] = 'application/epub+zip'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
elif fanfic.format == 'html':
self.response.headers['Content-Type'] = 'text/html'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
elif fanfic.format == 'text':
self.response.headers['Content-Type'] = 'text/plain'
self.response.headers['Content-disposition'] = 'attachment; filename=' +name + '.txt.zip'
elif fanfic.format == 'mobi':
self.response.headers['Content-Type'] = 'application/x-mobipocket-ebook'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.mobi'
data = DownloadData.all().filter("download =", fanfic).order("index")
# epub, txt and html are all already compressed.
# Each chunk is compressed individually to avoid having
# to hold the whole in memory just for the
# compress/uncompress
if fanfic.format == 'mobi':
def dc(data):
try:
return zlib.decompress(data)
# if error, assume it's a chunk from before we started compressing.
except zlib.error:
return data
else:
def dc(data):
return data
for datum in data:
self.response.out.write(dc(datum.blob))
class FileStatusServer(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if not user:
self.redirect(users.create_login_url(self.request.uri))
return
fileId = self.request.get('id')
if fileId == None or len(fileId) < 3:
self.redirect('/')
return
key = db.Key(fileId)
fic = db.get(key)
logging.info("Status url: %s" % fic.url)
if fic.completed and fic.format=='epub':
escaped_url = urlEscape(self.request.host_url+"/file/"+fic.name+"."+fic.format+"?id="+fileId+"&fake=file."+fic.format)
else:
escaped_url=False
template_values = dict(fic = fic,
nickname = user.nickname(),
escaped_url = escaped_url
)
path = os.path.join(os.path.dirname(__file__), 'status.html')
self.response.out.write(template.render(path, template_values))
class RecentFilesServer(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if not user:
self.redirect(users.create_login_url(self.request.uri))
return
q = DownloadMeta.all()
q.filter('user =', user).order('-date')
fics = q.fetch(100)
for fic in fics:
if fic.completed and fic.format == 'epub':
fic.escaped_url = urlEscape(self.request.host_url+"/file/"+fic.name+"."+fic.format+"?id="+str(fic.key())+"&fake=file."+fic.format)
template_values = dict(fics = fics, nickname = user.nickname())
path = os.path.join(os.path.dirname(__file__), 'recent.html')
self.response.out.write(template.render(path, template_values))
class RecentAllFilesServer(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
if user.nickname() != 'sigizmund':
return
fics = db.GqlQuery("Select * From DownloadedFanfic")
template_values = dict(fics = fics, nickname = user.nickname())
path = os.path.join(os.path.dirname(__file__), 'recent.html')
self.response.out.write(template.render(path, template_values))
class FanfictionDownloader(webapp.RequestHandler):
def get(self):
self.post()
def post(self):
logging.getLogger().setLevel(logging.DEBUG)
user = users.get_current_user()
if not user:
self.redirect(users.create_login_url(self.request.uri))
return
format = self.request.get('format')
url = self.request.get('url')
login = self.request.get('login')
password = self.request.get('password')
logging.info("Queuing Download: " + url)
# use existing record if available.
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
if( q is None or len(q) < 1 ):
download = DownloadMeta()
else:
download = q[0]
download.completed=False
download.failure=None
for c in download.data_chunks:
c.delete()
download.user = user
download.url = url
download.format = format
download.put()
taskqueue.add(url='/fdowntask',
queue_name="download",
params={'format':format,
'url':url,
'login':login,
'password':password,
'user':user.email()})
logging.info("enqueued download key: " + str(download.key()))
self.redirect('/status?id='+str(download.key()))
return
class FanfictionDownloaderTask(webapp.RequestHandler):
def _printableVersion(self, text):
text = removeEntities(text)
try:
d = text.decode('utf-8')
except:
d = text
return d
def post(self):
logging.getLogger().setLevel(logging.DEBUG)
format = self.request.get('format')
url = self.request.get('url')
login = self.request.get('login')
password = self.request.get('password')
# User object can't be passed through the task queue, just the email address
user = users.User(self.request.get('user'))
logging.info("Downloading: " + url + " for user: "+user.nickname())
adapter = None
writerClass = None
# use existing record if available.
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
if( q is None or len(q) < 1 ):
download = DownloadMeta()
else:
download = q[0]
download.completed=False
for c in download.data_chunks:
c.delete()
download.user = user
download.url = url
download.format = format
download.put()
logging.info('Creating adapter...')
try:
if url.find('fictionalley') != -1:
adapter = fictionalley.FictionAlley(url)
elif url.find('ficwad') != -1:
adapter = ficwad.FicWad(url)
elif url.find('fanfiction.net') != -1:
adapter = ffnet.FFNet(url)
elif url.find('fictionpress.com') != -1:
adapter = fpcom.FPCom(url)
elif url.find('harrypotterfanfiction.com') != -1:
adapter = hpfiction.HPFiction(url)
elif url.find('twilighted.net') != -1:
adapter = twilighted.Twilighted(url)
elif url.find('twiwrite.net') != -1:
adapter = twiwrite.Twiwrite(url)
elif url.find('adastrafanfic.com') != -1:
adapter = adastrafanfic.Adastrafanfic(url)
elif url.find('whofic.com') != -1:
adapter = whofic.Whofic(url)
elif url.find('potionsandsnitches.net') != -1:
adapter = potionsNsnitches.PotionsNSnitches(url)
elif url.find('mediaminer.org') != -1:
adapter = mediaminer.MediaMiner(url)
else:
logging.debug("Bad URL detected")
download.failure = url +" is not a valid story URL."
download.put()
return
except Exception, e:
logging.exception(e)
download.failure = "Adapter was not created: " + str(e)
download.put()
return
logging.info('Created an adapter: %s' % adapter)
if len(login) > 1:
adapter.setLogin(login)
adapter.setPassword(password)
if format == 'epub':
writerClass = output.EPubFanficWriter
elif format == 'html':
writerClass = output.HTMLWriter
elif format == 'mobi':
writerClass = output.MobiWriter
else:
writerClass = output.TextWriter
loader = FanficLoader(adapter,
writerClass,
quiet = True,
inmemory=True,
compress=False)
try:
data = loader.download()
if format == 'html' or format == 'text':
# data is uncompressed hence huge
ext = '.html'
if format == 'text':
ext = '.txt'
logging.debug(data)
files = {makeAcceptableFilename(str(adapter.getOutputName())) + ext : StringIO.StringIO(data.decode('utf-8')) }
d = inMemoryZip(files)
data = d.getvalue()
except LoginRequiredException, e:
logging.exception(e)
download.failure = 'Login problem detected'
download.put()
return
except Exception, e:
logging.exception(e)
download.failure = 'Some exception happened in downloader: ' + str(e)
download.put()
return
if data == None:
if loader.badLogin:
logging.debug("Bad login detected")
download.failure = 'Login failed'
download.put()
return
download.failure = 'No data returned by adapter'
download.put()
else:
download.name = self._printableVersion(adapter.getOutputName())
download.title = self._printableVersion(adapter.getStoryName())
download.author = self._printableVersion(adapter.getAuthorName())
download.put()
index=0
# epub, txt and html are all already compressed.
# Each chunk is compressed individually to avoid having
# to hold the whole in memory just for the
# compress/uncompress.
if format == 'mobi':
def c(data):
return zlib.compress(data)
else:
def c(data):
return data
while( len(data) > 0 ):
DownloadData(download=download,
index=index,
blob=c(data[:1000000])).put()
index += 1
data = data[1000000:]
download.completed=True
download.put()
logging.info("Download finished OK")
return
def toPercentDecimal(match):
"Return the %decimal number for the character for url escaping"
s = match.group(1)
return "%%%02x" % ord(s)
def urlEscape(data):
"Escape text, including unicode, for use in URLs"
p = re.compile(r'([^\w])')
return p.sub(toPercentDecimal, data.encode("utf-8"))
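# Illustrative example (assumption based on the regex above): every non-word
# character is replaced with %xx of its utf-8 byte value, e.g.
#   urlEscape(u'A Fox in Tokyo') == 'A%20Fox%20in%20Tokyo'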
def main():
application = webapp.WSGIApplication([('/', MainHandler),
('/fdowntask', FanfictionDownloaderTask),
('/fdown', FanfictionDownloader),
(r'/file.*', FileServer),
('/status', FileStatusServer),
('/recent', RecentFilesServer),
('/r2d2', RecentAllFilesServer),
('/login', LoginRequired)],
debug=False)
util.run_wsgi_app(application)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.DEBUG)
main()

7
queue.yaml Normal file
View file

@ -0,0 +1,7 @@
queue:
- name: default
rate: 1/s
- name: download
rate: 10/s
retry_parameters:
task_retry_limit: 2
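# Descriptive note (not in the original file): the "download" queue above is
# the queue_name used by taskqueue.add() in main.py's FanfictionDownloader.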

80
recent.html Normal file
View file

@ -0,0 +1,80 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
<!-- <div id='yourfile'> -->
{{yourfile}}
<!-- </div> -->
<div id='urlbox'>
<div id='greeting'>
Hi, {{ nickname }}! These are the fanfics you've recently requested.
</div>
</div>
<div id='helpbox'>
{% for fic in fics %}
<p>
{% if fic.completed %}
<a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
by {{ fic.author }} ({{ fic.format }})<br/>
{% if fic.escaped_url %}
<a href="http://www.convertfiles.com/index.php?url={{ fic.escaped_url }}">Convert {{ fic.title }} to other formats</a><br />
{% endif %}
{% endif %}
{% if fic.failure %}
<div id='error'>{{ fic.failure }}</div>
{% endif %}
{% if not fic.completed and not fic.failure %}
Request Processing...<br />
{% endif %}
<small><a href="{{ fic.url }}">{{ fic.url }}</a></small>
</p>
{% endfor %}
</div>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12136939-1");
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>

318
simplejson/__init__.py Normal file
View file

@ -0,0 +1,318 @@
r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
interchange format.
:mod:`simplejson` exposes an API familiar to users of the standard library
:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
version of the :mod:`json` library contained in Python 2.6, but maintains
compatibility with Python 2.4 and Python 2.5 and (currently) has
significant performance advantages, even without using the optional C
extension for speedups.
Encoding basic Python object hierarchies::
>>> import simplejson as json
>>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
'["foo", {"bar": ["baz", null, 1.0, 2]}]'
>>> print json.dumps("\"foo\bar")
"\"foo\bar"
>>> print json.dumps(u'\u1234')
"\u1234"
>>> print json.dumps('\\')
"\\"
>>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
{"a": 0, "b": 0, "c": 0}
>>> from StringIO import StringIO
>>> io = StringIO()
>>> json.dump(['streaming API'], io)
>>> io.getvalue()
'["streaming API"]'
Compact encoding::
>>> import simplejson as json
>>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
'[1,2,3,{"4":5,"6":7}]'
Pretty printing::
>>> import simplejson as json
>>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4)
>>> print '\n'.join([l.rstrip() for l in s.splitlines()])
{
"4": 5,
"6": 7
}
Decoding JSON::
>>> import simplejson as json
>>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
>>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
True
>>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
True
>>> from StringIO import StringIO
>>> io = StringIO('["streaming API"]')
>>> json.load(io)[0] == 'streaming API'
True
Specializing JSON object decoding::
>>> import simplejson as json
>>> def as_complex(dct):
... if '__complex__' in dct:
... return complex(dct['real'], dct['imag'])
... return dct
...
>>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
... object_hook=as_complex)
(1+2j)
>>> import decimal
>>> json.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')
True
Specializing JSON object encoding::
>>> import simplejson as json
>>> def encode_complex(obj):
... if isinstance(obj, complex):
... return [obj.real, obj.imag]
... raise TypeError(repr(obj) + " is not JSON serializable")
...
>>> json.dumps(2 + 1j, default=encode_complex)
'[2.0, 1.0]'
>>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
'[2.0, 1.0]'
>>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
'[2.0, 1.0]'
Using simplejson.tool from the shell to validate and pretty-print::
$ echo '{"json":"obj"}' | python -m simplejson.tool
{
"json": "obj"
}
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
Expecting property name: line 1 column 2 (char 2)
"""
__version__ = '2.0.9'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONEncoder',
]
__author__ = 'Bob Ippolito <bob@redivi.com>'
from decoder import JSONDecoder
from encoder import JSONEncoder
_default_encoder = JSONEncoder(
skipkeys=False,
ensure_ascii=True,
check_circular=True,
allow_nan=True,
indent=None,
separators=None,
encoding='utf-8',
default=None,
)
def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
allow_nan=True, cls=None, indent=None, separators=None,
encoding='utf-8', default=None, **kw):
"""Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
``.write()``-supporting file-like object).
If ``skipkeys`` is true then ``dict`` keys that are not basic types
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
will be skipped instead of raising a ``TypeError``.
If ``ensure_ascii`` is false, then the some chunks written to ``fp``
may be ``unicode`` instances, subject to normal Python ``str`` to
``unicode`` coercion rules. Unless ``fp.write()`` explicitly
understands ``unicode`` (as in ``codecs.getwriter()``) this is likely
to cause an error.
If ``check_circular`` is false, then the circular reference check
for container types will be skipped and a circular reference will
result in an ``OverflowError`` (or worse).
If ``allow_nan`` is false, then it will be a ``ValueError`` to
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``)
in strict compliance of the JSON specification, instead of using the
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
If ``indent`` is a non-negative integer, then JSON array elements and object
members will be pretty-printed with that indent level. An indent level
of 0 will only insert newlines. ``None`` is the most compact representation.
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
then it will be used instead of the default ``(', ', ': ')`` separators.
``(',', ':')`` is the most compact JSON representation.
``encoding`` is the character encoding for str instances, default is UTF-8.
``default(obj)`` is a function that should return a serializable version
of obj or raise TypeError. The default simply raises TypeError.
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
``.default()`` method to serialize additional types), specify it with
the ``cls`` kwarg.
"""
# cached encoder
if (not skipkeys and ensure_ascii and
check_circular and allow_nan and
cls is None and indent is None and separators is None and
encoding == 'utf-8' and default is None and not kw):
iterable = _default_encoder.iterencode(obj)
else:
if cls is None:
cls = JSONEncoder
iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
separators=separators, encoding=encoding,
default=default, **kw).iterencode(obj)
# could accelerate with writelines in some versions of Python, at
# a debuggability cost
for chunk in iterable:
fp.write(chunk)
def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
allow_nan=True, cls=None, indent=None, separators=None,
encoding='utf-8', default=None, **kw):
"""Serialize ``obj`` to a JSON formatted ``str``.
If ``skipkeys`` is true then ``dict`` keys that are not basic types
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
will be skipped instead of raising a ``TypeError``.
If ``ensure_ascii`` is false, then the return value will be a
``unicode`` instance subject to normal Python ``str`` to ``unicode``
coercion rules instead of being escaped to an ASCII ``str``.
If ``check_circular`` is false, then the circular reference check
for container types will be skipped and a circular reference will
result in an ``OverflowError`` (or worse).
If ``allow_nan`` is false, then it will be a ``ValueError`` to
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
strict compliance of the JSON specification, instead of using the
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
If ``indent`` is a non-negative integer, then JSON array elements and
object members will be pretty-printed with that indent level. An indent
level of 0 will only insert newlines. ``None`` is the most compact
representation.
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
then it will be used instead of the default ``(', ', ': ')`` separators.
``(',', ':')`` is the most compact JSON representation.
``encoding`` is the character encoding for str instances, default is UTF-8.
``default(obj)`` is a function that should return a serializable version
of obj or raise TypeError. The default simply raises TypeError.
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
``.default()`` method to serialize additional types), specify it with
the ``cls`` kwarg.
"""
# cached encoder
if (not skipkeys and ensure_ascii and
check_circular and allow_nan and
cls is None and indent is None and separators is None and
encoding == 'utf-8' and default is None and not kw):
return _default_encoder.encode(obj)
if cls is None:
cls = JSONEncoder
return cls(
skipkeys=skipkeys, ensure_ascii=ensure_ascii,
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
separators=separators, encoding=encoding, default=default,
**kw).encode(obj)
_default_decoder = JSONDecoder(encoding=None, object_hook=None)
def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, **kw):
"""Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
a JSON document) to a Python object.
If the contents of ``fp`` is encoded with an ASCII based encoding other
than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must
be specified. Encodings that are not ASCII based (such as UCS-2) are
not allowed, and should be wrapped with
``codecs.getreader(fp)(encoding)``, or simply decoded to a ``unicode``
object and passed to ``loads()``
``object_hook`` is an optional function that will be called with the
result of any object literal decode (a ``dict``). The return value of
``object_hook`` will be used instead of the ``dict``. This feature
can be used to implement custom decoders (e.g. JSON-RPC class hinting).
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
kwarg.
"""
return loads(fp.read(),
encoding=encoding, cls=cls, object_hook=object_hook,
parse_float=parse_float, parse_int=parse_int,
parse_constant=parse_constant, **kw)
def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, **kw):
"""Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON
document) to a Python object.
If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding
other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name
must be specified. Encodings that are not ASCII based (such as UCS-2)
are not allowed and should be decoded to ``unicode`` first.
``object_hook`` is an optional function that will be called with the
result of any object literal decode (a ``dict``). The return value of
``object_hook`` will be used instead of the ``dict``. This feature
can be used to implement custom decoders (e.g. JSON-RPC class hinting).
``parse_float``, if specified, will be called with the string
of every JSON float to be decoded. By default this is equivalent to
float(num_str). This can be used to use another datatype or parser
for JSON floats (e.g. decimal.Decimal).
``parse_int``, if specified, will be called with the string
of every JSON int to be decoded. By default this is equivalent to
int(num_str). This can be used to use another datatype or parser
for JSON integers (e.g. float).
``parse_constant``, if specified, will be called with one of the
following strings: -Infinity, Infinity, NaN.
This can be used to raise an exception if invalid JSON numbers
are encountered.
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
kwarg.
"""
if (cls is None and encoding is None and object_hook is None and
parse_int is None and parse_float is None and
parse_constant is None and not kw):
return _default_decoder.decode(s)
if cls is None:
cls = JSONDecoder
if object_hook is not None:
kw['object_hook'] = object_hook
if parse_float is not None:
kw['parse_float'] = parse_float
if parse_int is not None:
kw['parse_int'] = parse_int
if parse_constant is not None:
kw['parse_constant'] = parse_constant
return cls(encoding=encoding, **kw).decode(s)
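The two hooks are easiest to see in a short sketch (the sample documents are assumed examples): ``parse_float`` controls how numbers are materialized, and ``object_hook`` replaces each decoded object:

>>> import simplejson as json
>>> from decimal import Decimal
>>> json.loads('1.1', parse_float=Decimal) == Decimal('1.1')
True
>>> json.loads('{"x": 1, "y": 2}', object_hook=lambda d: sorted(d.keys()))  # assumed sample object
[u'x', u'y']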

BIN
simplejson/__init__.pyc Normal file

Binary file not shown.

2329
simplejson/_speedups.c Normal file

File diff suppressed because it is too large Load diff

354
simplejson/decoder.py Normal file
View file

@ -0,0 +1,354 @@
"""Implementation of JSONDecoder
"""
import re
import sys
import struct
from simplejson.scanner import make_scanner
try:
from simplejson._speedups import scanstring as c_scanstring
except ImportError:
c_scanstring = None
__all__ = ['JSONDecoder']
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
def _floatconstants():
_BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
if sys.byteorder != 'big':
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
nan, inf = struct.unpack('dd', _BYTES)
return nan, inf, -inf
NaN, PosInf, NegInf = _floatconstants()
def linecol(doc, pos):
lineno = doc.count('\n', 0, pos) + 1
if lineno == 1:
colno = pos
else:
colno = pos - doc.rindex('\n', 0, pos)
return lineno, colno
def errmsg(msg, doc, pos, end=None):
# Note that this function is called from _speedups
lineno, colno = linecol(doc, pos)
if end is None:
#fmt = '{0}: line {1} column {2} (char {3})'
#return fmt.format(msg, lineno, colno, pos)
fmt = '%s: line %d column %d (char %d)'
return fmt % (msg, lineno, colno, pos)
endlineno, endcolno = linecol(doc, end)
#fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
#return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
_CONSTANTS = {
'-Infinity': NegInf,
'Infinity': PosInf,
'NaN': NaN,
}
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
BACKSLASH = {
'"': u'"', '\\': u'\\', '/': u'/',
'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}
DEFAULT_ENCODING = "utf-8"
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
"""Scan the string s for a JSON string. End is the index of the
character in s after the quote that started the JSON string.
Unescapes all valid JSON string escape sequences and raises ValueError
on attempt to decode an invalid string. If strict is False then literal
control characters are allowed in the string.
Returns a tuple of the decoded string and the index of the character in s
after the end quote."""
if encoding is None:
encoding = DEFAULT_ENCODING
chunks = []
_append = chunks.append
begin = end - 1
while 1:
chunk = _m(s, end)
if chunk is None:
raise ValueError(
errmsg("Unterminated string starting at", s, begin))
end = chunk.end()
content, terminator = chunk.groups()
# Content contains zero or more unescaped string characters
if content:
if not isinstance(content, unicode):
content = unicode(content, encoding)
_append(content)
# Terminator is the end of string, a literal control character,
# or a backslash denoting that an escape sequence follows
if terminator == '"':
break
elif terminator != '\\':
if strict:
msg = "Invalid control character %r at" % (terminator,)
#msg = "Invalid control character {0!r} at".format(terminator)
raise ValueError(errmsg(msg, s, end))
else:
_append(terminator)
continue
try:
esc = s[end]
except IndexError:
raise ValueError(
errmsg("Unterminated string starting at", s, begin))
# If not a unicode escape sequence, must be in the lookup table
if esc != 'u':
try:
char = _b[esc]
except KeyError:
msg = "Invalid \\escape: " + repr(esc)
raise ValueError(errmsg(msg, s, end))
end += 1
else:
# Unicode escape sequence
esc = s[end + 1:end + 5]
next_end = end + 5
if len(esc) != 4:
msg = "Invalid \\uXXXX escape"
raise ValueError(errmsg(msg, s, end))
uni = int(esc, 16)
# Check for surrogate pair on UCS-4 systems
if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
if not s[end + 5:end + 7] == '\\u':
raise ValueError(errmsg(msg, s, end))
esc2 = s[end + 7:end + 11]
if len(esc2) != 4:
raise ValueError(errmsg(msg, s, end))
uni2 = int(esc2, 16)
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
next_end += 6
char = unichr(uni)
end = next_end
# Append the unescaped character
_append(char)
return u''.join(chunks), end
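A quick sketch of that contract, using an input literal borrowed from the test suite in this commit: ``end`` points just past the opening quote, and the returned index points just past the closing quote:

>>> from simplejson.decoder import py_scanstring
>>> py_scanstring('"\\u007b"', 1, None, True)
(u'{', 8)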
# Use speedup if available
scanstring = c_scanstring or py_scanstring
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
pairs = {}
# Use a slice to prevent IndexError from being raised; the following
# check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1]
# Normally we expect nextchar == '"'
if nextchar != '"':
if nextchar in _ws:
end = _w(s, end).end()
nextchar = s[end:end + 1]
# Trivial empty object
if nextchar == '}':
return pairs, end + 1
elif nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end))
end += 1
while True:
key, end = scanstring(s, end, encoding, strict)
# To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':':
end = _w(s, end).end()
if s[end:end + 1] != ':':
raise ValueError(errmsg("Expecting : delimiter", s, end))
end += 1
try:
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
try:
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
pairs[key] = value
try:
nextchar = s[end]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end]
except IndexError:
nextchar = ''
end += 1
if nextchar == '}':
break
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
try:
nextchar = s[end]
if nextchar in _ws:
end += 1
nextchar = s[end]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end]
except IndexError:
nextchar = ''
end += 1
if nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end - 1))
if object_hook is not None:
pairs = object_hook(pairs)
return pairs, end
def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
values = []
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
# Look-ahead for trivial empty array
if nextchar == ']':
return values, end + 1
_append = values.append
while True:
try:
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
_append(value)
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
end += 1
if nextchar == ']':
break
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end))
try:
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
return values, end
class JSONDecoder(object):
"""Simple JSON <http://json.org> decoder
Performs the following translations in decoding by default:
+---------------+-------------------+
| JSON | Python |
+===============+===================+
| object | dict |
+---------------+-------------------+
| array | list |
+---------------+-------------------+
| string | unicode |
+---------------+-------------------+
| number (int) | int, long |
+---------------+-------------------+
| number (real) | float |
+---------------+-------------------+
| true | True |
+---------------+-------------------+
| false | False |
+---------------+-------------------+
| null | None |
+---------------+-------------------+
It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
their corresponding ``float`` values, which is outside the JSON spec.
"""
def __init__(self, encoding=None, object_hook=None, parse_float=None,
parse_int=None, parse_constant=None, strict=True):
"""``encoding`` determines the encoding used to interpret any ``str``
objects decoded by this instance (utf-8 by default). It has no
effect when decoding ``unicode`` objects.
Note that currently only encodings that are a superset of ASCII work;
strings of other encodings should be passed in as ``unicode``.
``object_hook``, if specified, will be called with the result
of every JSON object decoded and its return value will be used in
place of the given ``dict``. This can be used to provide custom
deserializations (e.g. to support JSON-RPC class hinting).
``parse_float``, if specified, will be called with the string
of every JSON float to be decoded. By default this is equivalent to
float(num_str). This can be used to use another datatype or parser
for JSON floats (e.g. decimal.Decimal).
``parse_int``, if specified, will be called with the string
of every JSON int to be decoded. By default this is equivalent to
int(num_str). This can be used to use another datatype or parser
for JSON integers (e.g. float).
``parse_constant``, if specified, will be called with one of the
following strings: -Infinity, Infinity, NaN.
This can be used to raise an exception if invalid JSON numbers
are encountered.
"""
self.encoding = encoding
self.object_hook = object_hook
self.parse_float = parse_float or float
self.parse_int = parse_int or int
self.parse_constant = parse_constant or _CONSTANTS.__getitem__
self.strict = strict
self.parse_object = JSONObject
self.parse_array = JSONArray
self.parse_string = scanstring
self.scan_once = make_scanner(self)
def decode(self, s, _w=WHITESPACE.match):
"""Return the Python representation of ``s`` (a ``str`` or ``unicode``
instance containing a JSON document)
"""
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
end = _w(s, end).end()
if end != len(s):
raise ValueError(errmsg("Extra data", s, end, len(s)))
return obj
def raw_decode(self, s, idx=0):
"""Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
with a JSON document) and return a 2-tuple of the Python
representation and the index in ``s`` where the document ended.
This can be used to decode a JSON document from a string that may
have extraneous data at the end.
"""
try:
obj, end = self.scan_once(s, idx)
except StopIteration:
raise ValueError("No JSON object could be decoded")
return obj, end
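A brief sketch of the difference (the trailing text is an assumed example): ``decode()`` requires the whole string to be a single document and raises ``ValueError`` ("Extra data") otherwise, while ``raw_decode()`` reports where the document ended so the caller can deal with the remainder:

>>> from simplejson.decoder import JSONDecoder
>>> JSONDecoder().raw_decode('{"a": 1} trailing garbage')  # assumed sample input
({u'a': 1}, 8)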

BIN
simplejson/decoder.pyc Normal file

Binary file not shown.

440
simplejson/encoder.py Normal file
View file

@ -0,0 +1,440 @@
"""Implementation of JSONEncoder
"""
import re
try:
from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii
except ImportError:
c_encode_basestring_ascii = None
try:
from simplejson._speedups import make_encoder as c_make_encoder
except ImportError:
c_make_encoder = None
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
HAS_UTF8 = re.compile(r'[\x80-\xff]')
ESCAPE_DCT = {
'\\': '\\\\',
'"': '\\"',
'\b': '\\b',
'\f': '\\f',
'\n': '\\n',
'\r': '\\r',
'\t': '\\t',
}
for i in range(0x20):
#ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
# Assume this produces an infinity on all machines (probably not guaranteed)
INFINITY = float('1e66666')
FLOAT_REPR = repr
def encode_basestring(s):
"""Return a JSON representation of a Python string
"""
def replace(match):
return ESCAPE_DCT[match.group(0)]
return '"' + ESCAPE.sub(replace, s) + '"'
def py_encode_basestring_ascii(s):
"""Return an ASCII-only JSON representation of a Python string
"""
if isinstance(s, str) and HAS_UTF8.search(s) is not None:
s = s.decode('utf-8')
def replace(match):
s = match.group(0)
try:
return ESCAPE_DCT[s]
except KeyError:
n = ord(s)
if n < 0x10000:
#return '\\u{0:04x}'.format(n)
return '\\u%04x' % (n,)
else:
# surrogate pair
n -= 0x10000
s1 = 0xd800 | ((n >> 10) & 0x3ff)
s2 = 0xdc00 | (n & 0x3ff)
#return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
return '\\u%04x\\u%04x' % (s1, s2)
return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii
class JSONEncoder(object):
"""Extensible JSON <http://json.org> encoder for Python data structures.
Supports the following objects and types by default:
+-------------------+---------------+
| Python | JSON |
+===================+===============+
| dict | object |
+-------------------+---------------+
| list, tuple | array |
+-------------------+---------------+
| str, unicode | string |
+-------------------+---------------+
| int, long, float | number |
+-------------------+---------------+
| True | true |
+-------------------+---------------+
| False | false |
+-------------------+---------------+
| None | null |
+-------------------+---------------+
To extend this to recognize other objects, subclass and implement a
``.default()`` method that returns a serializable object for ``o`` if
possible; otherwise it should call the superclass implementation (to
raise ``TypeError``).
"""
item_separator = ', '
key_separator = ': '
def __init__(self, skipkeys=False, ensure_ascii=True,
check_circular=True, allow_nan=True, sort_keys=False,
indent=None, separators=None, encoding='utf-8', default=None):
"""Constructor for JSONEncoder, with sensible defaults.
If skipkeys is false, then it is a TypeError to attempt
encoding of keys that are not str, int, long, float or None. If
skipkeys is True, such items are simply skipped.
If ensure_ascii is true, the output is guaranteed to be str
objects with all incoming unicode characters escaped. If
ensure_ascii is false, the output will be a unicode object.
If check_circular is true, then lists, dicts, and custom encoded
objects will be checked for circular references during encoding to
prevent an infinite recursion (which would cause an OverflowError).
Otherwise, no such check takes place.
If allow_nan is true, then NaN, Infinity, and -Infinity will be
encoded as such. This behavior is not JSON specification compliant,
but is consistent with most JavaScript based encoders and decoders.
Otherwise, it will be a ValueError to encode such floats.
If sort_keys is true, then the output of dictionaries will be
sorted by key; this is useful for regression tests to ensure
that JSON serializations can be compared on a day-to-day basis.
If indent is a non-negative integer, then JSON array
elements and object members will be pretty-printed with that
indent level. An indent level of 0 will only insert newlines.
None is the most compact representation.
If specified, separators should be an (item_separator, key_separator)
tuple. The default is (', ', ': '). To get the most compact JSON
representation you should specify (',', ':') to eliminate whitespace.
If specified, default is a function that gets called for objects
that can't otherwise be serialized. It should return a JSON encodable
version of the object or raise a ``TypeError``.
If encoding is not None, then all input strings will be
transformed into unicode using that encoding prior to JSON-encoding.
The default is UTF-8.
"""
self.skipkeys = skipkeys
self.ensure_ascii = ensure_ascii
self.check_circular = check_circular
self.allow_nan = allow_nan
self.sort_keys = sort_keys
self.indent = indent
if separators is not None:
self.item_separator, self.key_separator = separators
if default is not None:
self.default = default
self.encoding = encoding
def default(self, o):
"""Implement this method in a subclass such that it returns
a serializable object for ``o``, or calls the base implementation
(to raise a ``TypeError``).
For example, to support arbitrary iterators, you could
implement default like this::
def default(self, o):
try:
iterable = iter(o)
except TypeError:
pass
else:
return list(iterable)
return JSONEncoder.default(self, o)
"""
raise TypeError(repr(o) + " is not JSON serializable")
def encode(self, o):
"""Return a JSON string representation of a Python data structure.
>>> JSONEncoder().encode({"foo": ["bar", "baz"]})
'{"foo": ["bar", "baz"]}'
"""
# This is for extremely simple cases and benchmarks.
if isinstance(o, basestring):
if isinstance(o, str):
_encoding = self.encoding
if (_encoding is not None
and not (_encoding == 'utf-8')):
o = o.decode(_encoding)
if self.ensure_ascii:
return encode_basestring_ascii(o)
else:
return encode_basestring(o)
# This doesn't pass the iterator directly to ''.join() because the
# exceptions aren't as detailed. The list call should be roughly
# equivalent to the PySequence_Fast that ''.join() would do.
chunks = self.iterencode(o, _one_shot=True)
if not isinstance(chunks, (list, tuple)):
chunks = list(chunks)
return ''.join(chunks)
def iterencode(self, o, _one_shot=False):
"""Encode the given object and yield each string
representation as available.
For example::
for chunk in JSONEncoder().iterencode(bigobject):
mysocket.write(chunk)
"""
if self.check_circular:
markers = {}
else:
markers = None
if self.ensure_ascii:
_encoder = encode_basestring_ascii
else:
_encoder = encode_basestring
if self.encoding != 'utf-8':
def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
if isinstance(o, str):
o = o.decode(_encoding)
return _orig_encoder(o)
def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
# Check for specials. Note that this type of test is processor- and/or
# platform-specific, so do tests which don't depend on the internals.
if o != o:
text = 'NaN'
elif o == _inf:
text = 'Infinity'
elif o == _neginf:
text = '-Infinity'
else:
return _repr(o)
if not allow_nan:
raise ValueError(
"Out of range float values are not JSON compliant: " +
repr(o))
return text
if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys:
_iterencode = c_make_encoder(
markers, self.default, _encoder, self.indent,
self.key_separator, self.item_separator, self.sort_keys,
self.skipkeys, self.allow_nan)
else:
_iterencode = _make_iterencode(
markers, self.default, _encoder, self.indent, floatstr,
self.key_separator, self.item_separator, self.sort_keys,
self.skipkeys, _one_shot)
return _iterencode(o, 0)
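For example (the object below is an assumed sample), the chunks can be written incrementally or simply joined:

>>> import simplejson as json
>>> ''.join(json.JSONEncoder().iterencode({'key': [1, 2, 3]}))  # assumed sample object
'{"key": [1, 2, 3]}'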
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
## HACK: hand-optimized bytecode; turn globals into locals
False=False,
True=True,
ValueError=ValueError,
basestring=basestring,
dict=dict,
float=float,
id=id,
int=int,
isinstance=isinstance,
list=list,
long=long,
str=str,
tuple=tuple,
):
def _iterencode_list(lst, _current_indent_level):
if not lst:
yield '[]'
return
if markers is not None:
markerid = id(lst)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = lst
buf = '['
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
separator = _item_separator + newline_indent
buf += newline_indent
else:
newline_indent = None
separator = _item_separator
first = True
for value in lst:
if first:
first = False
else:
buf = separator
if isinstance(value, basestring):
yield buf + _encoder(value)
elif value is None:
yield buf + 'null'
elif value is True:
yield buf + 'true'
elif value is False:
yield buf + 'false'
elif isinstance(value, (int, long)):
yield buf + str(value)
elif isinstance(value, float):
yield buf + _floatstr(value)
else:
yield buf
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
yield ']'
if markers is not None:
del markers[markerid]
def _iterencode_dict(dct, _current_indent_level):
if not dct:
yield '{}'
return
if markers is not None:
markerid = id(dct)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = dct
yield '{'
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
item_separator = _item_separator + newline_indent
yield newline_indent
else:
newline_indent = None
item_separator = _item_separator
first = True
if _sort_keys:
items = dct.items()
items.sort(key=lambda kv: kv[0])
else:
items = dct.iteritems()
for key, value in items:
if isinstance(key, basestring):
pass
# JavaScript is weakly typed for these, so it makes sense to
# also allow them. Many encoders seem to do something like this.
elif isinstance(key, float):
key = _floatstr(key)
elif key is True:
key = 'true'
elif key is False:
key = 'false'
elif key is None:
key = 'null'
elif isinstance(key, (int, long)):
key = str(key)
elif _skipkeys:
continue
else:
raise TypeError("key " + repr(key) + " is not a string")
if first:
first = False
else:
yield item_separator
yield _encoder(key)
yield _key_separator
if isinstance(value, basestring):
yield _encoder(value)
elif value is None:
yield 'null'
elif value is True:
yield 'true'
elif value is False:
yield 'false'
elif isinstance(value, (int, long)):
yield str(value)
elif isinstance(value, float):
yield _floatstr(value)
else:
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
yield '}'
if markers is not None:
del markers[markerid]
def _iterencode(o, _current_indent_level):
if isinstance(o, basestring):
yield _encoder(o)
elif o is None:
yield 'null'
elif o is True:
yield 'true'
elif o is False:
yield 'false'
elif isinstance(o, (int, long)):
yield str(o)
elif isinstance(o, float):
yield _floatstr(o)
elif isinstance(o, (list, tuple)):
for chunk in _iterencode_list(o, _current_indent_level):
yield chunk
elif isinstance(o, dict):
for chunk in _iterencode_dict(o, _current_indent_level):
yield chunk
else:
if markers is not None:
markerid = id(o)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = o
o = _default(o)
for chunk in _iterencode(o, _current_indent_level):
yield chunk
if markers is not None:
del markers[markerid]
return _iterencode
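As a sketch of the extension hook described in the ``JSONEncoder`` docstring (the ``SetEncoder`` name and the sample data are assumed examples), a subclass only needs to override ``default()``:

>>> import simplejson as json
>>> class SetEncoder(json.JSONEncoder):  # assumed illustrative subclass
...     def default(self, o):
...         if isinstance(o, set):
...             return sorted(o)  # turn sets into sorted lists
...         return json.JSONEncoder.default(self, o)
...
>>> SetEncoder().encode({'tags': set(['b', 'a'])})
'{"tags": ["a", "b"]}'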

BIN
simplejson/encoder.pyc Normal file

Binary file not shown.

65
simplejson/scanner.py Normal file
View file

@ -0,0 +1,65 @@
"""JSON token scanner
"""
import re
try:
from simplejson._speedups import make_scanner as c_make_scanner
except ImportError:
c_make_scanner = None
__all__ = ['make_scanner']
NUMBER_RE = re.compile(
r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
(re.VERBOSE | re.MULTILINE | re.DOTALL))
def py_make_scanner(context):
parse_object = context.parse_object
parse_array = context.parse_array
parse_string = context.parse_string
match_number = NUMBER_RE.match
encoding = context.encoding
strict = context.strict
parse_float = context.parse_float
parse_int = context.parse_int
parse_constant = context.parse_constant
object_hook = context.object_hook
def _scan_once(string, idx):
try:
nextchar = string[idx]
except IndexError:
raise StopIteration
if nextchar == '"':
return parse_string(string, idx + 1, encoding, strict)
elif nextchar == '{':
return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook)
elif nextchar == '[':
return parse_array((string, idx + 1), _scan_once)
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
return None, idx + 4
elif nextchar == 't' and string[idx:idx + 4] == 'true':
return True, idx + 4
elif nextchar == 'f' and string[idx:idx + 5] == 'false':
return False, idx + 5
m = match_number(string, idx)
if m is not None:
integer, frac, exp = m.groups()
if frac or exp:
res = parse_float(integer + (frac or '') + (exp or ''))
else:
res = parse_int(integer)
return res, m.end()
elif nextchar == 'N' and string[idx:idx + 3] == 'NaN':
return parse_constant('NaN'), idx + 3
elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
return parse_constant('Infinity'), idx + 8
elif nextchar == '-' and string[idx:idx + 9] == '-Infinity':
return parse_constant('-Infinity'), idx + 9
else:
raise StopIteration
return _scan_once
make_scanner = c_make_scanner or py_make_scanner
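A short sketch of what the generated scanner does (the input string is an assumed example): the callable built by ``make_scanner`` takes a string and a start index and returns the decoded value plus the index one past it. ``JSONDecoder`` wires this up in its constructor as ``scan_once``:

>>> from simplejson.decoder import JSONDecoder
>>> JSONDecoder().scan_once('[1, 2.5, null]', 0)  # assumed sample input
([1, 2.5, None], 14)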

BIN
simplejson/scanner.pyc Normal file

Binary file not shown.

View file

@ -0,0 +1,23 @@
import unittest
import doctest
def additional_tests():
import simplejson
import simplejson.encoder
import simplejson.decoder
suite = unittest.TestSuite()
for mod in (simplejson, simplejson.encoder, simplejson.decoder):
suite.addTest(doctest.DocTestSuite(mod))
suite.addTest(doctest.DocFileSuite('../../index.rst'))
return suite
def main():
suite = additional_tests()
runner = unittest.TextTestRunner()
runner.run(suite)
if __name__ == '__main__':
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
main()

View file

@ -0,0 +1,30 @@
from unittest import TestCase
import simplejson as json
def default_iterable(obj):
return list(obj)
class TestCheckCircular(TestCase):
def test_circular_dict(self):
dct = {}
dct['a'] = dct
self.assertRaises(ValueError, json.dumps, dct)
def test_circular_list(self):
lst = []
lst.append(lst)
self.assertRaises(ValueError, json.dumps, lst)
def test_circular_composite(self):
dct2 = {}
dct2['a'] = []
dct2['a'].append(dct2)
self.assertRaises(ValueError, json.dumps, dct2)
def test_circular_default(self):
json.dumps([set()], default=default_iterable)
self.assertRaises(TypeError, json.dumps, [set()])
def test_circular_off_default(self):
json.dumps([set()], default=default_iterable, check_circular=False)
self.assertRaises(TypeError, json.dumps, [set()], check_circular=False)

View file

@ -0,0 +1,22 @@
import decimal
from unittest import TestCase
import simplejson as json
class TestDecode(TestCase):
def test_decimal(self):
rval = json.loads('1.1', parse_float=decimal.Decimal)
self.assert_(isinstance(rval, decimal.Decimal))
self.assertEquals(rval, decimal.Decimal('1.1'))
def test_float(self):
rval = json.loads('1', parse_int=float)
self.assert_(isinstance(rval, float))
self.assertEquals(rval, 1.0)
def test_decoder_optimizations(self):
# Several optimizations were made that skip over calls to
# the whitespace regex, so this test is designed to try and
# exercise the uncommon cases. The array cases are already covered.
rval = json.loads('{ "key" : "value" , "k":"v" }')
self.assertEquals(rval, {"key":"value", "k":"v"})

View file

@ -0,0 +1,9 @@
from unittest import TestCase
import simplejson as json
class TestDefault(TestCase):
def test_default(self):
self.assertEquals(
json.dumps(type, default=repr),
json.dumps(repr(type)))

View file

@ -0,0 +1,21 @@
from unittest import TestCase
from cStringIO import StringIO
import simplejson as json
class TestDump(TestCase):
def test_dump(self):
sio = StringIO()
json.dump({}, sio)
self.assertEquals(sio.getvalue(), '{}')
def test_dumps(self):
self.assertEquals(json.dumps({}), '{}')
def test_encode_truefalse(self):
self.assertEquals(json.dumps(
{True: False, False: True}, sort_keys=True),
'{"false": true, "true": false}')
self.assertEquals(json.dumps(
{2: 3.0, 4.0: 5L, False: 1, 6L: True, "7": 0}, sort_keys=True),
'{"false": 1, "2": 3.0, "4.0": 5, "6": true, "7": 0}')

View file

@ -0,0 +1,38 @@
from unittest import TestCase
import simplejson.encoder
CASES = [
(u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'),
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
(u'controls', '"controls"'),
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
(u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'),
(u' s p a c e d ', '" s p a c e d "'),
(u'\U0001d120', '"\\ud834\\udd20"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
(u"`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
]
class TestEncodeBaseStringAscii(TestCase):
def test_py_encode_basestring_ascii(self):
self._test_encode_basestring_ascii(simplejson.encoder.py_encode_basestring_ascii)
def test_c_encode_basestring_ascii(self):
if not simplejson.encoder.c_encode_basestring_ascii:
return
self._test_encode_basestring_ascii(simplejson.encoder.c_encode_basestring_ascii)
def _test_encode_basestring_ascii(self, encode_basestring_ascii):
fname = encode_basestring_ascii.__name__
for input_string, expect in CASES:
result = encode_basestring_ascii(input_string)
self.assertEquals(result, expect,
'%r != %r for %s(%r)' % (result, expect, fname, input_string))

View file

@ -0,0 +1,76 @@
from unittest import TestCase
import simplejson as json
# Fri Dec 30 18:57:26 2005
JSONDOCS = [
# http://json.org/JSON_checker/test/fail1.json
'"A JSON payload should be an object or array, not a string."',
# http://json.org/JSON_checker/test/fail2.json
'["Unclosed array"',
# http://json.org/JSON_checker/test/fail3.json
'{unquoted_key: "keys must be quoted}',
# http://json.org/JSON_checker/test/fail4.json
'["extra comma",]',
# http://json.org/JSON_checker/test/fail5.json
'["double extra comma",,]',
# http://json.org/JSON_checker/test/fail6.json
'[ , "<-- missing value"]',
# http://json.org/JSON_checker/test/fail7.json
'["Comma after the close"],',
# http://json.org/JSON_checker/test/fail8.json
'["Extra close"]]',
# http://json.org/JSON_checker/test/fail9.json
'{"Extra comma": true,}',
# http://json.org/JSON_checker/test/fail10.json
'{"Extra value after close": true} "misplaced quoted value"',
# http://json.org/JSON_checker/test/fail11.json
'{"Illegal expression": 1 + 2}',
# http://json.org/JSON_checker/test/fail12.json
'{"Illegal invocation": alert()}',
# http://json.org/JSON_checker/test/fail13.json
'{"Numbers cannot have leading zeroes": 013}',
# http://json.org/JSON_checker/test/fail14.json
'{"Numbers cannot be hex": 0x14}',
# http://json.org/JSON_checker/test/fail15.json
'["Illegal backslash escape: \\x15"]',
# http://json.org/JSON_checker/test/fail16.json
'["Illegal backslash escape: \\\'"]',
# http://json.org/JSON_checker/test/fail17.json
'["Illegal backslash escape: \\017"]',
# http://json.org/JSON_checker/test/fail18.json
'[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]',
# http://json.org/JSON_checker/test/fail19.json
'{"Missing colon" null}',
# http://json.org/JSON_checker/test/fail20.json
'{"Double colon":: null}',
# http://json.org/JSON_checker/test/fail21.json
'{"Comma instead of colon", null}',
# http://json.org/JSON_checker/test/fail22.json
'["Colon instead of comma": false]',
# http://json.org/JSON_checker/test/fail23.json
'["Bad value", truth]',
# http://json.org/JSON_checker/test/fail24.json
"['single quote']",
# http://code.google.com/p/simplejson/issues/detail?id=3
u'["A\u001FZ control characters in string"]',
]
SKIPS = {
1: "why not have a string payload?",
18: "spec doesn't specify any nesting limitations",
}
class TestFail(TestCase):
def test_failures(self):
for idx, doc in enumerate(JSONDOCS):
idx = idx + 1
if idx in SKIPS:
json.loads(doc)
continue
try:
json.loads(doc)
except ValueError:
pass
else:
self.fail("Expected failure for fail%d.json: %r" % (idx, doc))

View file

@ -0,0 +1,15 @@
import math
from unittest import TestCase
import simplejson as json
class TestFloat(TestCase):
def test_floats(self):
for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100, 3.1]:
self.assertEquals(float(json.dumps(num)), num)
self.assertEquals(json.loads(json.dumps(num)), num)
def test_ints(self):
for num in [1, 1L, 1<<32, 1<<64]:
self.assertEquals(json.dumps(num), str(num))
self.assertEquals(int(json.dumps(num)), num)

View file

@ -0,0 +1,41 @@
from unittest import TestCase
import simplejson as json
import textwrap
class TestIndent(TestCase):
def test_indent(self):
h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
{'nifty': 87}, {'field': 'yes', 'morefield': False} ]
expect = textwrap.dedent("""\
[
[
"blorpie"
],
[
"whoops"
],
[],
"d-shtaeou",
"d-nthiouh",
"i-vhbjkhnth",
{
"nifty": 87
},
{
"field": "yes",
"morefield": false
}
]""")
d1 = json.dumps(h)
d2 = json.dumps(h, indent=2, sort_keys=True, separators=(',', ': '))
h1 = json.loads(d1)
h2 = json.loads(d2)
self.assertEquals(h1, h)
self.assertEquals(h2, h)
self.assertEquals(d2, expect)

View file

@ -0,0 +1,76 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass1.json
JSON = r'''
[
"JSON Test Pattern pass1",
{"object with 1 member":["array with 1 element"]},
{},
[],
-42,
true,
false,
null,
{
"integer": 1234567890,
"real": -9876.543210,
"e": 0.123456789e-12,
"E": 1.234567890E+34,
"": 23456789012E666,
"zero": 0,
"one": 1,
"space": " ",
"quote": "\"",
"backslash": "\\",
"controls": "\b\f\n\r\t",
"slash": "/ & \/",
"alpha": "abcdefghijklmnopqrstuvwyz",
"ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ",
"digit": "0123456789",
"special": "`1~!@#$%^&*()_+-={':[,]}|;.</>?",
"hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A",
"true": true,
"false": false,
"null": null,
"array":[ ],
"object":{ },
"address": "50 St. James Street",
"url": "http://www.JSON.org/",
"comment": "// /* <!-- --",
"# -- --> */": " ",
" s p a c e d " :[1,2 , 3
,
4 , 5 , 6 ,7 ],
"compact": [1,2,3,4,5,6,7],
"jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}",
"quotes": "&#34; \u0022 %22 0x22 034 &#x22;",
"\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?"
: "A key can be any string"
},
0.5 ,98.6
,
99.44
,
1066
,"rosebud"]
'''
class TestPass1(TestCase):
def test_parse(self):
# test in/out equivalence and parsing
res = json.loads(JSON)
out = json.dumps(res)
self.assertEquals(res, json.loads(out))
try:
json.dumps(res, allow_nan=False)
except ValueError:
pass
else:
self.fail("23456789012E666 should be out of range")

View file

@ -0,0 +1,14 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass2.json
JSON = r'''
[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]]
'''
class TestPass2(TestCase):
def test_parse(self):
# test in/out equivalence and parsing
res = json.loads(JSON)
out = json.dumps(res)
self.assertEquals(res, json.loads(out))

View file

@ -0,0 +1,20 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass3.json
JSON = r'''
{
"JSON Test Pattern pass3": {
"The outermost value": "must be an object or array.",
"In this test": "It is an object."
}
}
'''
class TestPass3(TestCase):
def test_parse(self):
# test in/out equivalence and parsing
res = json.loads(JSON)
out = json.dumps(res)
self.assertEquals(res, json.loads(out))

View file

@ -0,0 +1,67 @@
from unittest import TestCase
import simplejson as json
class JSONTestObject:
pass
class RecursiveJSONEncoder(json.JSONEncoder):
recurse = False
def default(self, o):
if o is JSONTestObject:
if self.recurse:
return [JSONTestObject]
else:
return 'JSONTestObject'
return json.JSONEncoder.default(o)
class TestRecursion(TestCase):
def test_listrecursion(self):
x = []
x.append(x)
try:
json.dumps(x)
except ValueError:
pass
else:
self.fail("didn't raise ValueError on list recursion")
x = []
y = [x]
x.append(y)
try:
json.dumps(x)
except ValueError:
pass
else:
self.fail("didn't raise ValueError on alternating list recursion")
y = []
x = [y, y]
# ensure that the marker is cleared
json.dumps(x)
def test_dictrecursion(self):
x = {}
x["test"] = x
try:
json.dumps(x)
except ValueError:
pass
else:
self.fail("didn't raise ValueError on dict recursion")
x = {}
y = {"a": x, "b": x}
# ensure that the marker is cleared
json.dumps(x)
def test_defaultrecursion(self):
enc = RecursiveJSONEncoder()
self.assertEquals(enc.encode(JSONTestObject), '"JSONTestObject"')
enc.recurse = True
try:
enc.encode(JSONTestObject)
except ValueError:
pass
else:
self.fail("didn't raise ValueError on default recursion")

View file

@ -0,0 +1,111 @@
import sys
import decimal
from unittest import TestCase
import simplejson as json
import simplejson.decoder
class TestScanString(TestCase):
def test_py_scanstring(self):
self._test_scanstring(simplejson.decoder.py_scanstring)
def test_c_scanstring(self):
if not simplejson.decoder.c_scanstring:
return
self._test_scanstring(simplejson.decoder.c_scanstring)
def _test_scanstring(self, scanstring):
self.assertEquals(
scanstring('"z\\ud834\\udd20x"', 1, None, True),
(u'z\U0001d120x', 16))
if sys.maxunicode == 65535:
self.assertEquals(
scanstring(u'"z\U0001d120x"', 1, None, True),
(u'z\U0001d120x', 6))
else:
self.assertEquals(
scanstring(u'"z\U0001d120x"', 1, None, True),
(u'z\U0001d120x', 5))
self.assertEquals(
scanstring('"\\u007b"', 1, None, True),
(u'{', 8))
self.assertEquals(
scanstring('"A JSON payload should be an object or array, not a string."', 1, None, True),
(u'A JSON payload should be an object or array, not a string.', 60))
self.assertEquals(
scanstring('["Unclosed array"', 2, None, True),
(u'Unclosed array', 17))
self.assertEquals(
scanstring('["extra comma",]', 2, None, True),
(u'extra comma', 14))
self.assertEquals(
scanstring('["double extra comma",,]', 2, None, True),
(u'double extra comma', 21))
self.assertEquals(
scanstring('["Comma after the close"],', 2, None, True),
(u'Comma after the close', 24))
self.assertEquals(
scanstring('["Extra close"]]', 2, None, True),
(u'Extra close', 14))
self.assertEquals(
scanstring('{"Extra comma": true,}', 2, None, True),
(u'Extra comma', 14))
self.assertEquals(
scanstring('{"Extra value after close": true} "misplaced quoted value"', 2, None, True),
(u'Extra value after close', 26))
self.assertEquals(
scanstring('{"Illegal expression": 1 + 2}', 2, None, True),
(u'Illegal expression', 21))
self.assertEquals(
scanstring('{"Illegal invocation": alert()}', 2, None, True),
(u'Illegal invocation', 21))
self.assertEquals(
scanstring('{"Numbers cannot have leading zeroes": 013}', 2, None, True),
(u'Numbers cannot have leading zeroes', 37))
self.assertEquals(
scanstring('{"Numbers cannot be hex": 0x14}', 2, None, True),
(u'Numbers cannot be hex', 24))
self.assertEquals(
scanstring('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21, None, True),
(u'Too deep', 30))
self.assertEquals(
scanstring('{"Missing colon" null}', 2, None, True),
(u'Missing colon', 16))
self.assertEquals(
scanstring('{"Double colon":: null}', 2, None, True),
(u'Double colon', 15))
self.assertEquals(
scanstring('{"Comma instead of colon", null}', 2, None, True),
(u'Comma instead of colon', 25))
self.assertEquals(
scanstring('["Colon instead of comma": false]', 2, None, True),
(u'Colon instead of comma', 25))
self.assertEquals(
scanstring('["Bad value", truth]', 2, None, True),
(u'Bad value', 12))
def test_issue3623(self):
self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
"xxx")
self.assertRaises(UnicodeDecodeError,
json.encoder.encode_basestring_ascii, "xx\xff")

View file

@ -0,0 +1,42 @@
import textwrap
from unittest import TestCase
import simplejson as json
class TestSeparators(TestCase):
def test_separators(self):
h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
{'nifty': 87}, {'field': 'yes', 'morefield': False} ]
expect = textwrap.dedent("""\
[
[
"blorpie"
] ,
[
"whoops"
] ,
[] ,
"d-shtaeou" ,
"d-nthiouh" ,
"i-vhbjkhnth" ,
{
"nifty" : 87
} ,
{
"field" : "yes" ,
"morefield" : false
}
]""")
d1 = json.dumps(h)
d2 = json.dumps(h, indent=2, sort_keys=True, separators=(' ,', ' : '))
h1 = json.loads(d1)
h2 = json.loads(d2)
self.assertEquals(h1, h)
self.assertEquals(h2, h)
self.assertEquals(d2, expect)

View file

@ -0,0 +1,64 @@
from unittest import TestCase
import simplejson as json
class TestUnicode(TestCase):
def test_encoding1(self):
encoder = json.JSONEncoder(encoding='utf-8')
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
s = u.encode('utf-8')
ju = encoder.encode(u)
js = encoder.encode(s)
self.assertEquals(ju, js)
def test_encoding2(self):
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
s = u.encode('utf-8')
ju = json.dumps(u, encoding='utf-8')
js = json.dumps(s, encoding='utf-8')
self.assertEquals(ju, js)
def test_encoding3(self):
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
j = json.dumps(u)
self.assertEquals(j, '"\\u03b1\\u03a9"')
def test_encoding4(self):
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
j = json.dumps([u])
self.assertEquals(j, '["\\u03b1\\u03a9"]')
def test_encoding5(self):
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
j = json.dumps(u, ensure_ascii=False)
self.assertEquals(j, u'"%s"' % (u,))
def test_encoding6(self):
u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
j = json.dumps([u], ensure_ascii=False)
self.assertEquals(j, u'["%s"]' % (u,))
def test_big_unicode_encode(self):
u = u'\U0001d120'
self.assertEquals(json.dumps(u), '"\\ud834\\udd20"')
self.assertEquals(json.dumps(u, ensure_ascii=False), u'"\U0001d120"')
def test_big_unicode_decode(self):
u = u'z\U0001d120x'
self.assertEquals(json.loads('"' + u + '"'), u)
self.assertEquals(json.loads('"z\\ud834\\udd20x"'), u)
def test_unicode_decode(self):
for i in range(0, 0xd7ff):
u = unichr(i)
s = '"\\u%04x"' % (i,)
self.assertEquals(json.loads(s), u)
def test_default_encoding(self):
self.assertEquals(json.loads(u'{"a": "\xe9"}'.encode('utf-8')),
{'a': u'\xe9'})
def test_unicode_preservation(self):
self.assertEquals(type(json.loads(u'""')), unicode)
self.assertEquals(type(json.loads(u'"a"')), unicode)
self.assertEquals(type(json.loads(u'["a"]')[0]), unicode)

37
simplejson/tool.py Normal file
View file

@ -0,0 +1,37 @@
r"""Command-line tool to validate and pretty-print JSON
Usage::
$ echo '{"json":"obj"}' | python -m simplejson.tool
{
"json": "obj"
}
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
Expecting property name: line 1 column 2 (char 2)
"""
import sys
import simplejson
def main():
if len(sys.argv) == 1:
infile = sys.stdin
outfile = sys.stdout
elif len(sys.argv) == 2:
infile = open(sys.argv[1], 'rb')
outfile = sys.stdout
elif len(sys.argv) == 3:
infile = open(sys.argv[1], 'rb')
outfile = open(sys.argv[2], 'wb')
else:
raise SystemExit(sys.argv[0] + " [infile [outfile]]")
try:
obj = simplejson.load(infile)
except ValueError, e:
raise SystemExit(e)
simplejson.dump(obj, outfile, sort_keys=True, indent=4)
outfile.write('\n')
if __name__ == '__main__':
main()

BIN
static/ajax-loader.gif Normal file

Binary file not shown.

BIN
static/favicon.ico Normal file

Binary file not shown.

79
status.html Normal file
View file

@ -0,0 +1,79 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>{% if fic.completed %} Finished {% else %} {% if fic.failure %} Failed {% else %} Working... {% endif %} {% endif %} - Fanfiction Downloader</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
{% if not fic.completed and not fic.failure %}
<meta http-equiv="refresh" content="7">
{% endif %}
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<div style="text-align: center">
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<div id='urlbox'>
<div id='greeting'>
<p><a href='{{ fic.url }}'>{{ fic.url }}</a></p>
</div>
<div>
{% if fic.completed %}
<p>Your fic has finished processing and you can download it now:</p>
<p><a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
by {{ fic.author }} ({{ fic.format }})</p>
{% if escaped_url %}
<p><a href="http://www.convertfiles.com/index.php?url={{ escaped_url }}">Convert {{ fic.title }} to other formats</a></p>
{% endif %}
{% else %}
{% if fic.failure %}
Your fic failed to process. Please check the URL and the error message below.<br />
<div id='error'>
{{ fic.failure }}
</div>
{% else %}
<p>Not done yet. This page will periodically poll to see if your story has finished.</p>
{% endif %}
{% endif %}
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
</div>
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
FanfictionLoader is a web front-end to <a href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
Copyright &copy; <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
</div>
<div style="margin-top: 1em; text-align: center'">
<script type="text/javascript"><!--
google_ad_client = "ca-pub-0320924304307555";
/* Standard */
google_ad_slot = "8974025478";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</div>
</body>
</html>

52
utils/remover.py Normal file
View file

@ -0,0 +1,52 @@
#!/usr/bin/env python
# encoding: utf-8
"""
remover.py
Created by Roman on 2010-06-20.
Copyright (c) 2010 __MyCompanyName__. All rights reserved.
"""
import datetime
import logging
from google.appengine.ext.webapp import util
from google.appengine.ext import webapp
from google.appengine.api import users
from ffstorage import *
class Remover(webapp.RequestHandler):
def get(self):
logging.debug("Starting r3m0v3r")
user = users.get_current_user()
logging.debug("Working as user %s" % user)
theDate = datetime.date.today() - datetime.timedelta(days=5)
logging.debug("Will delete stuff older than %s" % theDate)
fics = DownloadMeta.all()
fics.filter("date <",theDate).order("date")
results = fics.fetch(100)
logging.debug([x.name for x in results])
num = 0
for d in results:
# delete the data chunks before their parent record
for c in d.data_chunks:
c.delete()
d.delete()
num = num + 1
logging.debug('Deleted ' + d.url)
logging.info('Deleted instances: %d' % num)
self.response.out.write('Deleted instances: %d' % num)
def main():
application = webapp.WSGIApplication([('/r3m0v3r', Remover)],
debug=False)
util.run_wsgi_app(application)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.DEBUG)
main()