Change a couple of the example story URLs.

This commit is contained in:
retiefjimm 2010-11-23 12:42:33 -06:00
commit f3571959df
62 changed files with 16384 additions and 0 deletions

31
app.yaml Normal file
View file

@ -0,0 +1,31 @@
# App Engine deployment descriptor for the fanfictionloader application.
application: fanfictionloader
version: 2-5-5
runtime: python
api_version: 1
# Request routing: admin-only maintenance endpoints, static assets, then a catch-all.
handlers:
# Admin-only endpoint served by the mock-data generator script.
- url: /generate_mock_data
script: mocks/generate_mock_data.py
login: admin
# Admin-only cleanup endpoint (cron.yaml schedules a request to this URL).
- url: /r3m0v3r
script: utils/remover.py
login: admin
# NOTE(review): this entry repeats the URL pattern of the handler directly above;
# presumably only the first match is used, which would make this one dead - confirm.
- url: /r3m0v3r
script: main.py
login: admin
# Static asset directories.
- url: /css
static_dir: css
- url: /js
static_dir: js
- url: /static
static_dir: static
# Everything else is handled by the main application script.
- url: /.*
script: main.py

4
cron.yaml Normal file
View file

@ -0,0 +1,4 @@
# Scheduled jobs: request the /r3m0v3r cleanup URL every three hours.
cron:
- description: cleanup job
url: /r3m0v3r
schedule: every 3 hours

71
css/index.css Normal file
View file

@ -0,0 +1,71 @@
/* Stylesheet for the downloader's index page. */

/* Base typography for the whole page. */
body
{
font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
}
/* Main content column, offset from the left edge. */
#main
{
width: 43%;
margin-left: 23%;
background-color: #dae6ff;
padding: 2em;
}
#greeting
{
margin-bottom: 1em;
border-color: #efefef;
}
/* Hovering a form section switches its border from dotted (see the shared rule below) to solid. */
#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
{
border: thin solid #fffeff;
}
h1
{
text-decoration: none;
}
#logpasswordtable
{
padding: 1em;
}
/* The login/password section is hidden by default. */
#logpassword, #logpasswordtable {
display: none;
}
/* Shared box styling for every form section. */
#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
{
margin: 1em;
padding: 1em;
border: thin dotted #fffeff;
}
div.field
{
margin-bottom: 0.5em;
}
#submitbtn
{
padding: 1em;
}
/* Intentionally empty rule (no declarations). */
#typelabel
{
}
#typeoptions
{
margin-top: 0.5em;
}
/* Error messages: small red text. */
#error
{
font-size: small;
color: #f00;
}

59
delete_fic.py Normal file
View file

@ -0,0 +1,59 @@
import os
import cgi
import sys
import logging
import traceback
import StringIO
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from fanficdownloader.downaloder import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *
from google.appengine.ext import db
from fanficdownloader.zipdir import *
from ffstorage import *
def create_mac(user, fic_id, fic_url):
    """Return the token string derived from the user, fic id and fic URL.

    The token is the absolute value of ``hash(user) + hash(fic_id)`` followed
    by the absolute value of ``hash(fic_url)``, both rendered as decimal text.
    """
    owner_part = abs(hash(user) + hash(fic_id))
    url_part = abs(hash(fic_url))
    return str(owner_part) + str(url_part)
def check_mac(user, fic_id, fic_url, mac):
    """Return True when ``mac`` equals the token create_mac builds for these values."""
    expected = create_mac(user, fic_id, fic_url)
    return expected == mac
def create_mac_for_fic(user, fic_id):
    """Return the deletion token for a stored fic, or None if it is not the user's.

    Args:
        user: the user the token is computed for.
        fic_id: encoded datastore key of the stored fic.

    Returns:
        The token from create_mac(user, key, fanfic.url), or None when the
        entity does not exist or belongs to a different user.
    """
    key = db.Key(fic_id)
    fanfic = db.get(key)
    # A key with no entity behind it yields None; treat it like "not the
    # owner" instead of failing on the attribute access below.
    if fanfic is None or fanfic.user != user:
        return None
    return create_mac(user, key, fanfic.url)
class DeleteFicHandler(webapp.RequestHandler):
    """Request handler that deletes one of the current user's stored fics."""

    def get(self):
        """Delete the fic named by ``fic_id`` when ``key_id`` carries its token.

        Anonymous visitors are redirected to /login. When the token does not
        match, "Ooops" is written and the user's fic list is rendered from
        recent.html. When it matches, the entity is deleted and the client is
        redirected to /recent.
        """
        user = users.get_current_user()
        if not user:
            self.redirect('/login')
            # Everything below needs a logged-in user, so stop here.
            return
        fic_id = self.request.get('fic_id')
        fic_mac = self.request.get('key_id')
        actual_mac = create_mac_for_fic(user, fic_id)
        if actual_mac != fic_mac:
            self.response.out.write("Ooops")
        else:
            key = db.Key(fic_id)
            fanfic = db.get(key)
            fanfic.delete()
            self.redirect('/recent')
            # The client is being redirected; do not also render a page body.
            return
        fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        # NOTE(review): `template` is not imported by name in this file;
        # presumably one of the star imports provides it - confirm.
        self.response.out.write(template.render(path, template_values))

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

229
fanficdownloader/adapter.py Normal file
View file

@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
import logging
import datetime
from output import makeAcceptableFilename
# Detect whether the App Engine urlfetch API can be imported; adapters use
# this flag to choose between urlfetch and a regular URL opener.
try:
    from google.appengine.api.urlfetch import fetch as googlefetch
    appEngineGlob = True
except ImportError:
    # Only a failed import means "not on App Engine"; anything else
    # (KeyboardInterrupt, SystemExit, ...) should not be swallowed here.
    appEngineGlob = False
class LoginRequiredException(Exception):
    """Raised when a story URL cannot be fetched without logging in."""

    def __init__(self, url):
        # Remember which URL demanded the login.
        self.url = url

    def __str__(self):
        # The message is returned in repr() form, i.e. including quotes.
        message = self.url + ' requires user to be logged in'
        return repr(message)
class StoryArchivedAlready(Exception):
    """Signals that an archive of the story already exists."""
class StoryDoesNotExist(Exception):
    """Signals that the requested story could not be found or read."""
class FailedToDownload(Exception):
    """Signals that a story or chapter page could not be downloaded or decoded."""
class InvalidStoryURL(Exception):
    """Signals that a URL does not point at a story."""
class FanfictionSiteAdapter:
    """Base class for site-specific story adapters.

    Holds the story metadata fields with their default values and provides
    logging accessors for them. Subclasses are expected to override
    extractIndividualUrls() and getText(), which do nothing here.
    """

    # True when the App Engine urlfetch API was importable.
    appEngine = appEngineGlob
    login = ''
    password = ''
    url = ''
    host = ''
    path = ''
    uuid = ''
    storyName = ''
    storyId = ''
    authorName = ''
    authorId = ''
    authorURL = ''
    # Separator placed between story name and story id in output names.
    outputStorySep = '-Ukn_'
    outputName = ''
    outputFileName = ''
    storyDescription = ''
    # Class-level default only; __init__ gives every instance its own list.
    storyCharacters = []
    storySeries = ''
    storyPublished = datetime.date(1970, 1, 31)
    # Evaluated once, when the class body is executed.
    storyCreated = datetime.datetime.now()
    storyUpdated = datetime.date(1970, 1, 31)
    languageId = 'en-UK'
    language = 'English'
    # Class-level default only; __init__ gives every instance its own list.
    subjects = []
    publisher = ''
    numChapters = '0'
    numWords = '0'
    genre = ''
    category = ''
    storyStatus = 'In-Progress'
    storyRating = ''
    storyUserRating = '0'

    def __init__(self, url):
        """Store ``url`` and split it into host and path."""
        # Imported locally: this module has no top-level urlparse import.
        import urlparse as up
        # basic plain url parsing...
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        # Per-instance lists, so addCharacter()/addSubject() never append to
        # the lists shared through the class attributes.
        self.storyCharacters = []
        self.subjects = []

    def hasAppEngine(self):
        """Return the appEngine flag."""
        return self.appEngine

    def fetchUrl(self, url):
        """Fetch ``url`` and return the response body.

        Without App Engine the body is read through self.opener (not set by
        this class; a subclass must provide it) and decoded as UTF-8. With
        App Engine the raw urlfetch content is returned.
        """
        if not self.appEngine:
            return self.opener.open(url).read().decode('utf-8')
        else:
            return googlefetch(url).content

    def requiresLogin(self, url = None):
        """Return False; the base adapter never requires a login."""
        return False

    def performLogin(self, url = None):
        """Return True; the base adapter has nothing to log in to."""
        return True

    def extractIndividualUrls(self):
        """Placeholder; does nothing in the base class."""
        pass

    def getText(self, url):
        """Placeholder; does nothing in the base class."""
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def getHost(self):
        logging.debug('self.host=%s' % self.host)
        return self.host

    def getUUID(self):
        """Build, store and return a urn:uuid string from host, author id and story id."""
        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getOutputName(self):
        """Build, store and return the output base name (story name, separator, story id)."""
        self.outputName = makeAcceptableFilename(self.storyName.replace(" ", "_") + self.outputStorySep + self.storyId)
        logging.debug('self.outputName=%s' % self.outputName)
        return self.outputName

    def getOutputFileName(self, booksDirectory, bookExt):
        """Build, store and return ``booksDirectory/<output name><bookExt>``."""
        self.getOutputName() # make sure self.outputName is populated
        self.outputFileName = booksDirectory + "/" + self.outputName + bookExt
        logging.debug('self.outputFileName=%s' % self.outputFileName)
        return self.outputFileName

    def getAuthorURL(self):
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getAuthorId(self):
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getAuthorName(self):
        logging.debug('self.authorName=%s' % self.authorName)
        return self.authorName

    def getStoryURL(self):
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getStoryId(self):
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getStoryName(self):
        logging.debug('self.storyName=%s' % self.storyName)
        return self.storyName

    def getStoryDescription(self):
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryCreated(self):
        """Reset storyCreated to the current time and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def addCharacter(self, character):
        """Append ``character`` unless it is already present (case-insensitive).

        Returns True when it was added, False when it was a duplicate.
        """
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def getStoryCharacters(self):
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStoryPublished(self):
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryUpdated(self):
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getStorySeries(self):
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries

    def getLanguage(self):
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def addSubject(self, subject):
        """Append ``subject`` unless it is already present (case-insensitive).

        Returns True when it was added, False when it was a duplicate.
        """
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def getSubjects(self):
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getCategory(self):
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return ``url`` unchanged."""
        return url

View file

View file

@ -0,0 +1,542 @@
# -*- coding: utf-8 -*-
# Stylesheet text; the XHTML templates in this module link to it as "stylesheet.css".
CSS = '''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
pre { font-size: x-small; }
sml { font-size: small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
h7 { text-align: left; font-size: large; font-weight: bold; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.cover {text-align: center;}
.full {width: 100%; }
.quarter {width: 25%; }
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
# EPUB mimetype string.
MIMETYPE = '''application/epub+zip'''
# Title page templates. TITLE_HEADER takes six %s values (two in <title>, then
# story link href/text and author link href/text); TITLE_ENTRY is one table row
# (label, value); TITLE_FOOTER closes the table and takes the summary text.
TITLE_HEADER = '''<?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head><body>
<p><h7 id="lnks"><b><a id="StoryLink" href="%s">%s</a></b> by <b><a id="AuthorLink" href="%s">%s</a></b></h7></p>
<table class="full">
'''
TITLE_ENTRY = '''<tr><td><b>%s</b></td><td>%s</td></tr>
'''
TITLE_FOOTER = '''</table>
<p><b>Summary:</b><br />%s</p>
</body></html>
'''
# Container XML that names OEBPS/content.opf as the package root file.
CONTAINER = '''<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
# Pieces of the OPF package document, apparently meant to be concatenated in
# this order: CONTENT_START (metadata head, nine %s values), CONTENT_SUBJECT
# (one per subject), CONTENT_END_METADATA (five %s values, opens <manifest>),
# CONTENT_ITEM (one per manifest item: id, href), CONTENT_END_MANIFEST (opens
# <spine>), CONTENT_ITEMREF (one per spine entry) and CONTENT_END.
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="fanficdownloader-uuid">
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:opf="http://www.idpf.org/2007/opf"
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
<dc:language>%s</dc:language>
<dc:rights></dc:rights>
<dc:date opf:event="publication">%s</dc:date>
<dc:date opf:event="creation">%s</dc:date>
<dc:date opf:event="modification">%s</dc:date>
<meta name="calibre:timestamp" content="%s"/>
<dc:description>%s</dc:description>
'''
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
<dc:identifier opf:scheme="URL">%s</dc:identifier>
<dc:source>%s</dc:source>
<dc:type>FanFiction</dc:type>
<meta name="calibre:rating" content="%s"/>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
'''
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
'''
CONTENT_END_MANIFEST = ''' </manifest>
<spine toc="ncx">
'''
CONTENT_ITEMREF = ''' <itemref idref="%s" />
'''
CONTENT_END = ''' </spine>
</package>
'''
# NCX table-of-contents templates: TOC_START takes the uid and the document
# title, TOC_ITEM takes (id, play order as an integer, label, content src),
# TOC_END closes the navMap.
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''
TOC_END = '''</navMap>
</ncx>
'''
# Wrapper for an XHTML page: XHTML_START takes the page title and the <h3>
# heading text; XHTML_END closes the elements XHTML_START opened.
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''
XHTML_END = '''</div>
</body>
</html>
'''
# Whitelists of HTML element and attribute names (presumably consulted when
# story markup is sanitised - the consumer is not in this module).
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
'blockquote', 'br', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'dl', 'dt', 'em',
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
'ins', 'kbd', 'label', 'li', 'ol',
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
'strong', 'sub', 'sup', 'u', 'ul']
acceptable_attributes = ['href']
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
#
# Maps HTML named character references (with and, for the legacy names, also
# without the trailing semicolon) to their replacement text. Entities that
# stand for a visible symbol map to that symbol rather than to an empty
# string, so replacing them never silently deletes characters from the text.
# The wide/thin space entities map to a plain space, like &nbsp;. Only the
# invisible control entities (&lrm; &rlm; &zwj; &zwnj;) map to ''.
entities = { '&aacute;' : 'á',
             '&Aacute;' : 'Á',
             '&Aacute' : 'Á',
             '&aacute' : 'á',
             '&acirc;' : 'â',
             '&Acirc;' : 'Â',
             '&Acirc' : 'Â',
             '&acirc' : 'â',
             '&acute;' : '´',
             '&acute' : '´',
             '&AElig;' : 'Æ',
             '&aelig;' : 'æ',
             '&AElig' : 'Æ',
             '&aelig' : 'æ',
             '&agrave;' : 'à',
             '&Agrave;' : 'À',
             '&Agrave' : 'À',
             '&agrave' : 'à',
             '&alefsym;' : 'ℵ',
             '&alpha;' : 'α',
             '&Alpha;' : 'Α',
             '&amp;' : '&',
             '&AMP;' : '&',
             '&AMP' : '&',
             '&amp' : '&',
             '&and;' : '∧',
             '&ang;' : '∠',
             '&aring;' : 'å',
             '&Aring;' : 'Å',
             '&Aring' : 'Å',
             '&aring' : 'å',
             '&asymp;' : '≈',
             '&atilde;' : 'ã',
             '&Atilde;' : 'Ã',
             '&Atilde' : 'Ã',
             '&atilde' : 'ã',
             '&auml;' : 'ä',
             '&Auml;' : 'Ä',
             '&Auml' : 'Ä',
             '&auml' : 'ä',
             '&bdquo;' : '„',
             '&beta;' : 'β',
             '&Beta;' : 'Β',
             '&brvbar;' : '¦',
             '&brvbar' : '¦',
             '&bull;' : '•',
             '&cap;' : '∩',
             '&ccedil;' : 'ç',
             '&Ccedil;' : 'Ç',
             '&Ccedil' : 'Ç',
             '&ccedil' : 'ç',
             '&cedil;' : '¸',
             '&cedil' : '¸',
             '&cent;' : '¢',
             '&cent' : '¢',
             '&chi;' : 'χ',
             '&Chi;' : 'Χ',
             '&circ;' : 'ˆ',
             '&clubs;' : '♣',
             '&cong;' : '≅',
             '&copy;' : '©',
             '&COPY;' : '©',
             '&COPY' : '©',
             '&copy' : '©',
             '&crarr;' : '↵',
             '&cup;' : '∪',
             '&curren;' : '¤',
             '&curren' : '¤',
             '&dagger;' : '†',
             '&Dagger;' : '‡',
             '&darr;' : '↓',
             '&dArr;' : '⇓',
             '&deg;' : '°',
             '&deg' : '°',
             '&delta;' : 'δ',
             '&Delta;' : 'Δ',
             '&diams;' : '♦',
             '&divide;' : '÷',
             '&divide' : '÷',
             '&eacute;' : 'é',
             '&Eacute;' : 'É',
             '&Eacute' : 'É',
             '&eacute' : 'é',
             '&ecirc;' : 'ê',
             '&Ecirc;' : 'Ê',
             '&Ecirc' : 'Ê',
             '&ecirc' : 'ê',
             '&egrave;' : 'è',
             '&Egrave;' : 'È',
             '&Egrave' : 'È',
             '&egrave' : 'è',
             '&empty;' : '∅',
             '&emsp;' : ' ', # wide space, rendered as a plain space
             '&ensp;' : ' ', # wide space, rendered as a plain space
             '&epsilon;' : 'ε',
             '&Epsilon;' : 'Ε',
             '&equiv;' : '≡',
             '&eta;' : 'η',
             '&Eta;' : 'Η',
             '&eth;' : 'ð',
             '&ETH;' : 'Ð',
             '&ETH' : 'Ð',
             '&eth' : 'ð',
             '&euml;' : 'ë',
             '&Euml;' : 'Ë',
             '&Euml' : 'Ë',
             '&euml' : 'ë',
             '&euro;' : '€',
             '&exist;' : '∃',
             '&fnof;' : 'ƒ',
             '&forall;' : '∀',
             '&frac12;' : '½',
             '&frac12' : '½',
             '&frac14;' : '¼',
             '&frac14' : '¼',
             '&frac34;' : '¾',
             '&frac34' : '¾',
             '&frasl;' : '⁄',
             '&gamma;' : 'γ',
             '&Gamma;' : 'Γ',
             '&ge;' : '≥',
             '&gt;' : '>',
             '&GT;' : '>',
             '&GT' : '>',
             '&gt' : '>',
             '&harr;' : '↔',
             '&hArr;' : '⇔',
             '&hearts;' : '♥',
             '&hellip;' : '…',
             '&iacute;' : 'í',
             '&Iacute;' : 'Í',
             '&Iacute' : 'Í',
             '&iacute' : 'í',
             '&icirc;' : 'î',
             '&Icirc;' : 'Î',
             '&Icirc' : 'Î',
             '&icirc' : 'î',
             '&iexcl;' : '¡',
             '&iexcl' : '¡',
             '&igrave;' : 'ì',
             '&Igrave;' : 'Ì',
             '&Igrave' : 'Ì',
             '&igrave' : 'ì',
             '&image;' : 'ℑ',
             '&infin;' : '∞',
             '&int;' : '∫',
             '&iota;' : 'ι',
             '&Iota;' : 'Ι',
             '&iquest;' : '¿',
             '&iquest' : '¿',
             '&isin;' : '∈',
             '&iuml;' : 'ï',
             '&Iuml;' : 'Ï',
             '&Iuml' : 'Ï',
             '&iuml' : 'ï',
             '&kappa;' : 'κ',
             '&Kappa;' : 'Κ',
             '&lambda;' : 'λ',
             '&Lambda;' : 'Λ',
             '&laquo;' : '«',
             '&laquo' : '«',
             '&larr;' : '←',
             '&lArr;' : '⇐',
             '&lceil;' : '⌈',
             '&ldquo;' : '“',
             '&le;' : '≤',
             '&lfloor;' : '⌊',
             '&lowast;' : '∗',
             '&loz;' : '◊',
             '&lrm;' : '', # invisible directional mark, deliberately dropped
             '&lsaquo;' : '‹',
             '&lsquo;' : '‘',
             '&lt;' : '<',
             '&LT;' : '<',
             '&LT' : '<',
             '&lt' : '<',
             '&macr;' : '¯',
             '&macr' : '¯',
             '&mdash;' : '—',
             '&micro;' : 'µ',
             '&micro' : 'µ',
             '&middot;' : '·',
             '&middot' : '·',
             '&minus;' : '−',
             '&mu;' : 'μ',
             '&Mu;' : 'Μ',
             '&nabla;' : '∇',
             '&nbsp;' : ' ',
             '&nbsp' : ' ',
             '&ndash;' : '–',
             '&ne;' : '≠',
             '&ni;' : '∋',
             '&not;' : '¬',
             '&not' : '¬',
             '&notin;' : '∉',
             '&nsub;' : '⊄',
             '&ntilde;' : 'ñ',
             '&Ntilde;' : 'Ñ',
             '&Ntilde' : 'Ñ',
             '&ntilde' : 'ñ',
             '&nu;' : 'ν',
             '&Nu;' : 'Ν',
             '&oacute;' : 'ó',
             '&Oacute;' : 'Ó',
             '&Oacute' : 'Ó',
             '&oacute' : 'ó',
             '&ocirc;' : 'ô',
             '&Ocirc;' : 'Ô',
             '&Ocirc' : 'Ô',
             '&ocirc' : 'ô',
             '&OElig;' : 'Œ',
             '&oelig;' : 'œ',
             '&ograve;' : 'ò',
             '&Ograve;' : 'Ò',
             '&Ograve' : 'Ò',
             '&ograve' : 'ò',
             '&oline;' : '‾',
             '&omega;' : 'ω',
             '&Omega;' : 'Ω',
             '&omicron;' : 'ο',
             '&Omicron;' : 'Ο',
             '&oplus;' : '⊕',
             '&or;' : '∨',
             '&ordf;' : 'ª',
             '&ordf' : 'ª',
             '&ordm;' : 'º',
             '&ordm' : 'º',
             '&oslash;' : 'ø',
             '&Oslash;' : 'Ø',
             '&Oslash' : 'Ø',
             '&oslash' : 'ø',
             '&otilde;' : 'õ',
             '&Otilde;' : 'Õ',
             '&Otilde' : 'Õ',
             '&otilde' : 'õ',
             '&otimes;' : '⊗',
             '&ouml;' : 'ö',
             '&Ouml;' : 'Ö',
             '&Ouml' : 'Ö',
             '&ouml' : 'ö',
             '&para;' : '¶',
             '&para' : '¶',
             '&part;' : '∂',
             '&permil;' : '‰',
             '&perp;' : '⊥',
             '&phi;' : 'φ',
             '&Phi;' : 'Φ',
             '&pi;' : 'π',
             '&Pi;' : 'Π',
             '&piv;' : 'ϖ',
             '&plusmn;' : '±',
             '&plusmn' : '±',
             '&pound;' : '£',
             '&pound' : '£',
             '&prime;' : '′',
             '&Prime;' : '″',
             '&prod;' : '∏',
             '&prop;' : '∝',
             '&psi;' : 'ψ',
             '&Psi;' : 'Ψ',
             '&quot;' : '"',
             '&QUOT;' : '"',
             '&QUOT' : '"',
             '&quot' : '"',
             '&radic;' : '√',
             '&raquo;' : '»',
             '&raquo' : '»',
             '&rarr;' : '→',
             '&rArr;' : '⇒',
             '&rceil;' : '⌉',
             '&rdquo;' : '”',
             '&real;' : 'ℜ',
             '&reg;' : '®',
             '&REG;' : '®',
             '&REG' : '®',
             '&reg' : '®',
             '&rfloor;' : '⌋',
             '&rho;' : 'ρ',
             '&Rho;' : 'Ρ',
             '&rlm;' : '', # invisible directional mark, deliberately dropped
             '&rsaquo;' : '›',
             '&rsquo;' : '’',
             '&sbquo;' : '‚',
             '&scaron;' : 'š',
             '&Scaron;' : 'Š',
             '&sdot;' : '⋅',
             '&sect;' : '§',
             '&sect' : '§',
             '&shy;' : '­', # strange optional hyphenation control character, not just a dash
             '&shy' : '­',
             '&sigma;' : 'σ',
             '&Sigma;' : 'Σ',
             '&sigmaf;' : 'ς',
             '&sim;' : '∼',
             '&spades;' : '♠',
             '&sub;' : '⊂',
             '&sube;' : '⊆',
             '&sum;' : '∑',
             '&sup1;' : '¹',
             '&sup1' : '¹',
             '&sup2;' : '²',
             '&sup2' : '²',
             '&sup3;' : '³',
             '&sup3' : '³',
             '&sup;' : '⊃',
             '&supe;' : '⊇',
             '&szlig;' : 'ß',
             '&szlig' : 'ß',
             '&tau;' : 'τ',
             '&Tau;' : 'Τ',
             '&there4;' : '∴',
             '&theta;' : 'θ',
             '&Theta;' : 'Θ',
             '&thetasym;' : 'ϑ',
             '&thinsp;' : ' ', # thin space, rendered as a plain space
             '&thorn;' : 'þ',
             '&THORN;' : 'Þ',
             '&THORN' : 'Þ',
             '&thorn' : 'þ',
             '&tilde;' : '˜',
             '&times;' : '×',
             '&times' : '×',
             '&trade;' : '™',
             '&uacute;' : 'ú',
             '&Uacute;' : 'Ú',
             '&Uacute' : 'Ú',
             '&uacute' : 'ú',
             '&uarr;' : '↑',
             '&uArr;' : '⇑',
             '&ucirc;' : 'û',
             '&Ucirc;' : 'Û',
             '&Ucirc' : 'Û',
             '&ucirc' : 'û',
             '&ugrave;' : 'ù',
             '&Ugrave;' : 'Ù',
             '&Ugrave' : 'Ù',
             '&ugrave' : 'ù',
             '&uml;' : '¨',
             '&uml' : '¨',
             '&upsih;' : 'ϒ',
             '&upsilon;' : 'υ',
             '&Upsilon;' : 'Υ',
             '&uuml;' : 'ü',
             '&Uuml;' : 'Ü',
             '&Uuml' : 'Ü',
             '&uuml' : 'ü',
             '&weierp;' : '℘',
             '&xi;' : 'ξ',
             '&Xi;' : 'Ξ',
             '&yacute;' : 'ý',
             '&Yacute;' : 'Ý',
             '&Yacute' : 'Ý',
             '&yacute' : 'ý',
             '&yen;' : '¥',
             '&yen' : '¥',
             '&yuml;' : 'ÿ',
             '&Yuml;' : 'Ÿ',
             '&yuml' : 'ÿ',
             '&zeta;' : 'ζ',
             '&Zeta;' : 'Ζ',
             '&zwj;' : '', # strange spacing control character, not just a space
             '&zwnj;' : '', # strange spacing control character, not just a space
             }
# FictionBook (FB2) pieces: the opening root tag and the <description>
# template, which takes five %s values (author last name, book title, date
# value, date text, id suffix).
FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''
# File name of the HTML escape definitions (consumer not shown in this module).
HTML_ESC_Definitions = 'HTML_Escape.def'

View file

@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import getpass
import logging
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import zipdir
import output
import adapter
from adapter import StoryArchivedAlready
from adapter import StoryDoesNotExist
from adapter import FailedToDownload
from adapter import InvalidStoryURL
from adapter import LoginRequiredException
import ffnet
import fpcom
import ficwad
import fictionalley
import hpfiction
import twilighted
import potionsNsnitches
import mediaminer
import time
class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers'''

    # Directory handed to the adapter and writer for output files.
    booksDirectory = "books"
    # True when running as a stand-alone script; enables the existing-archive check.
    standAlone = False

    def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True, overwrite=False):
        """Remember the adapter, the writer class and the download options.

        Args:
            adapter: site adapter used to list and fetch chapters.
            writerClass: class instantiated by download() to write the book.
            quiet: suppress the per-chapter progress line when True.
            inmemory: passed to the writer; when True download() returns the
                writer's in-memory output.
            compress: passed to the writer.
            overwrite: skip the "archive already exists" check when True.
        """
        self.adapter = adapter
        self.writerClass = writerClass
        self.quiet = quiet
        self.inmemory = inmemory
        self.compress = compress
        self.badLogin = False
        self.overWrite = overwrite

    def getBooksDirectory(self):
        return self.booksDirectory

    def setBooksDirectory(self, bd):
        self.booksDirectory = bd
        return self.booksDirectory

    def getStandAlone(self):
        return self.standAlone

    def setStandAlone(self, sa):
        self.standAlone = sa
        return self.standAlone

    def getOverWrite(self):
        return self.overWrite

    def setOverWrite(self, sa):
        self.overWrite = sa
        return self.overWrite

    def getAdapter(self):
        """Return the adapter given to the constructor."""
        return self.adapter

    def download(self):
        """Download every chapter through the adapter and feed it to a writer.

        Returns:
            The writer's in-memory output when ``inmemory`` is set, otherwise None.

        Raises:
            LoginRequiredException: the adapter needs a login and performLogin() failed.
            StoryArchivedAlready: stand-alone mode without overwrite, and
                zipdir.checkNewer() reported False for the output file.
        """
        logging.debug("Trying to download the story")
        if self.adapter.requiresLogin():
            logging.debug("Story requires login")
            if not self.adapter.performLogin():
                logging.debug("Login/password problem")
                self.badLogin = True
                # Use the directly imported class: when this module runs as a
                # script, the global name `adapter` is rebound to an adapter
                # instance and no longer refers to the adapter module.
                raise LoginRequiredException(self.adapter.url)
        urls = self.adapter.extractIndividualUrls()
        logging.debug("self.writerClass=%s" % self.writerClass)
        if self.standAlone and not self.inmemory:
            s = self.adapter.getOutputFileName(self.booksDirectory, self.writerClass.getFormatExt())
            logging.debug("Always overwrite? %s" % self.overWrite)
            if not self.overWrite:
                logging.debug("Checking if current archive of the story exists. Filename=%s" % s)
                if not zipdir.checkNewer ( s, self.adapter.getStoryUpdated() ):
                    raise StoryArchivedAlready("A Current archive file \"" + s + "\" already exists! Skipping!")
            else:
                logging.debug("Do not check for existance of archive file.")
        self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress)
        total = len(urls)
        # Chapters are numbered from 1 in the order the adapter listed them.
        for i, (chapterUrl, chapterName) in enumerate(urls):
            number = i + 1
            if not self.quiet:
                print('Downloading chapter %d/%d' % (number, total))
            text = self.adapter.getText(chapterUrl)
            self.writer.writeChapter(number, chapterName, text)
        self.writer.finalise()
        if self.inmemory:
            self.name = self.writer.name
            return self.writer.output.getvalue()
# Command-line entry point: downloader.py URL [Type]
#
# Exit codes: 0 success (or site already provides ebooks), -1 usage error,
# 1 unsupported URL or book format, 2 download failed, 3 invalid story URL,
# 10 archive already exists, 20 story missing, 30 login required, 99 other error.
if __name__ == '__main__':
    def _exitWithCurrentError(exitCode):
        # Print the exception being handled to stderr and exit with exitCode.
        # sys.exc_info() is used so no interpreter-specific except syntax is needed.
        sys.stderr.write(str(sys.exc_info()[1]) + "\n")
        sys.exit(exitCode)

    logging.basicConfig(level=logging.DEBUG)
    argvlen = len(sys.argv)
    url = None
    bookFormat = 'epub'
    if argvlen > 1:
        url = sys.argv[1]
    if argvlen > 2:
        bookFormat = sys.argv[2]
    if url is None:
        sys.stderr.write("Usage: downloader.py URL Type\n")
        sys.exit(-1)
    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')
    url = url.strip()

    # Named storyAdapter (not `adapter`) so the imported adapter module stays
    # reachable under its own name.
    storyAdapter = None
    writerClass = None
    if url.find('fanficauthors') != -1:
        sys.stderr.write("fanficauthors.net already provides ebooks\n")
        sys.exit(0)
    elif url.find('fictionalley') != -1:
        storyAdapter = fictionalley.FictionAlley(url)
    elif url.find('ficwad') != -1:
        storyAdapter = ficwad.FicWad(url)
    elif url.find('fanfiction.net') != -1:
        storyAdapter = ffnet.FFNet(url)
    elif url.find('fictionpress.com') != -1:
        storyAdapter = fpcom.FPCom(url)
    elif url.find('harrypotterfanfiction.com') != -1:
        storyAdapter = hpfiction.HPFiction(url)
    elif url.find('twilighted.net') != -1:
        storyAdapter = twilighted.Twilighted(url)
    elif url.find('potionsandsnitches.net') != -1:
        storyAdapter = potionsNsnitches.PotionsNSnitches(url)
    elif url.find('mediaminer.org') != -1:
        storyAdapter = mediaminer.MediaMiner(url)
    else:
        sys.stderr.write("Oi! I can haz not appropriate adapter for URL %s!\n" % url)
        sys.exit(1)

    if bookFormat == 'epub':
        writerClass = output.EPubFanficWriter
    elif bookFormat == 'html':
        writerClass = output.HTMLWriter
    elif bookFormat == 'text':
        writerClass = output.TextWriter
    else:
        # Fail before any network work instead of crashing later on a
        # missing writer class.
        sys.stderr.write("Unknown book format %s! Use epub, html or text.\n" % bookFormat)
        sys.exit(1)

    if storyAdapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        # Never echo the password that getpass just read without echo.
        print("Login: `%s`" % login)
        storyAdapter.setLogin(login)
        storyAdapter.setPassword(password)

    loader = FanficLoader(storyAdapter, writerClass)
    loader.setStandAlone(True)
    if bookFormat != 'epub':
        loader.setOverWrite(True)
    try:
        loader.download()
    except FailedToDownload:
        _exitWithCurrentError(2) # Error Downloading
    except InvalidStoryURL:
        _exitWithCurrentError(3) # Invalid story URL
    except StoryArchivedAlready:
        _exitWithCurrentError(10) # Skipped
    except StoryDoesNotExist:
        _exitWithCurrentError(20) # Missing
    except LoginRequiredException:
        _exitWithCurrentError(30) # Login required
    except Exception:
        _exitWithCurrentError(99) # Unknown Error
    sys.exit(0)

358
fanficdownloader/ffnet.py Normal file
View file

@ -0,0 +1,358 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class FFNet(FanfictionSiteAdapter):
    """Adapter for stories addressed by fanfiction.net-style ``/s/<id>/...`` URLs."""

    def __init__(self, url):
        """Parse ``url``, set the metadata defaults and normalise the story URL.

        Raises:
            InvalidStoryURL: the first path segment is not ``s``.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.storyName = 'FF.Net story'
        self.authorName = 'FF.Net author'
        self.storyDescription = 'Fanfiction Story'
        self.storyCharacters = []
        self.storySeries = ''
        self.authorId = '0'
        self.authorURL = self.path
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append ('FanFiction')
        logging.debug('self.subjects=%s' % self.subjects)
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'FF.Net Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.outputName = ''
        self.outputStorySep = '-ffnet_'
        logging.debug('self.path=%s' % self.path)
        if self.path.startswith('/'):
            self.path = self.path[1:]
        spl = self.path.split('/')
        logging.debug('spl=%s' % spl)
        if spl is not None:
            if len(spl) > 0 and spl[0] != 's':
                raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
            if len(spl) > 1:
                self.storyId = spl[1]
            # NOTE(review): for a five-segment path this drops the leading 's'
            # segment as well as the last one - looks unintended, confirm.
            if len(spl) == 5:
                self.path = "/".join(spl[1:-1])
        if self.path.endswith('/'):
            self.path = self.path[:-1]
        logging.debug('self.path=%s' % self.path)
        # The mobile host is replaced by the www host.
        if self.host is not None and self.host == "m.fanfiction.net":
            self.host = "www.fanfiction.net"
        logging.debug('self.host=%s' % self.host)
        self.url = "http://" + self.host + "/" + self.path
        logging.debug('self.url=%s' % self.url)
        logging.debug('self.storyId=%s' % self.storyId)
        if not self.appEngine:
            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        else:
            self.opener = None
        logging.debug("Created FF.Net: url=%s" % (self.url))

    def _getLoginScript(self):
        return self.path

    def _getVarValue(self, varstr):
        """Return the right-hand side of a ``var name = value;`` line.

        Only the first ``=`` separates name and value, so ``=`` characters
        inside the value are kept. One leading space and one trailing ``;``
        are removed. Returns '' when the line contains no ``=``.
        """
        parts = varstr.split('=', 1)
        if len(parts) > 1:
            retstr = parts[1]
        else:
            retstr = ''
        if retstr.startswith(' '):
            retstr = retstr[1:]
        if retstr.endswith(';'):
            retstr = retstr[:-1]
        return retstr

    def _extractAuthorId(self, href):
        """Return the path segment following ``u`` in an author link such as /u/<id>/<name>.

        Callers only pass hrefs that contain ``/u/``.
        """
        parts = href.split('/')
        return parts[parts.index('u') + 1]

    def _escapeBareAmpersands(self, text):
        """Turn every ``&`` that does not start ``&amp;`` into ``&amp;``."""
        pieces = text.split('&')
        escaped = pieces[0]
        for piece in pieces[1:]:
            if piece.startswith('amp;'):
                escaped = escaped + '&' + piece
            else:
                escaped = escaped + '&amp;' + piece
        return escaped

    def _splitCrossover(self, subject):
        """Add ``subject`` to the subjects, splitting "A and B Crossover" titles.

        For a crossover title containing " and ", the subject "Crossover" and
        each title between the words "and"/"Crossover" are added separately.
        Always returns True.
        """
        if "Crossover" in subject:
            self.addSubject ("Crossover")
            logging.debug('Crossover=%s' % subject)
            if subject.find(' and ') != -1:
                words = subject.split(' ')
                logging.debug('words=%s' % words)
                subj = ''
                for s in words:
                    # Whole-word comparison: a substring test would also treat
                    # words such as "a", "an" or "over" as separators.
                    if s in ('and', 'Crossover'):
                        if len(subj) > 0:
                            self.addSubject(subj)
                        subj = ''
                    elif len(s) > 0:
                        if len(subj) > 0:
                            subj = subj + ' '
                        subj = subj + s
                if len(subj) > 0:
                    self.addSubject(subj)
            else:
                self.addSubject(subject)
        else:
            self.addSubject(subject)
        return True

    def _splitGenre(self, subject):
        """Add every non-empty ``/``-separated part of ``subject`` as a subject."""
        if len(subject) > 0:
            words = subject.split('/')
            logging.debug('words=%s' % words)
            for subj in words:
                if len(subj) > 0:
                    self.addSubject(subj)
        return True

    def extractIndividualUrls(self):
        """Fetch the story page, fill in the metadata and return the chapter list.

        Returns:
            A list of (url, title) tuples; when no chapter selector is found,
            a single entry for the story URL itself.

        Raises:
            StoryDoesNotExist: the page could not be fetched or was empty.
            FailedToDownload: the page could not be parsed.
        """
        data = ''
        try:
            data = self.fetchUrl(self.url)
        except Exception:
            e = sys.exc_info()[1]
            data = ''
            logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
        # A failed fetch leaves data empty, so test for emptiness, not only None.
        if not data:
            raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
        d2 = re.sub(r'&\#[0-9]+;', ' ', data)
        soup = None
        try:
            soup = bs.BeautifulStoneSoup(d2)
        except Exception:
            logging.error("Failed to decode: <%s>" % d2)
            raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
        allA = soup.findAll('a')
        for a in allA:
            if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
                self.authorName = a.string
                self.authorId = self._extractAuthorId(a['href'])
                logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
        urls = []
        lines = data.split('\n')
        for l in lines:
            if l.find("&#187;") != -1 and l.find('<b>') != -1:
                s2 = bs.BeautifulStoneSoup(l)
                self.storyName = unicode(s2.find('b').string)
                logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
            elif l.find("<a href='/u/") != -1:
                s2 = bs.BeautifulStoneSoup(l)
                self.authorName = unicode(s2.a.string)
                self.authorId = self._extractAuthorId(s2.a['href'])
                logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
            elif l.find("Rated: <a href=") != -1:
                s2 = bs.BeautifulStoneSoup(l)
                self.storyRating = unicode(s2.a.string).strip()
                logging.debug('self.storyRating=%s' % self.storyRating)
                logging.debug('s2.a=%s' % s2.a)
                # Dash-separated fields: index 1 is read as the language and
                # index 2 as the genre, unless the field is the reviews link.
                s3 = l.split('-')
                logging.debug('s3=%s' % s3)
                if len(s3) > 1:
                    if s3[1].find("Reviews: <a href=") != -1:
                        continue
                    self.language = s3[1].strip()
                    logging.debug('self.language=%s' % self.language)
                    if len(s3) > 2:
                        if s3[2].find("Reviews: <a href=") != -1:
                            continue
                        self.genre = s3[2].strip()
                        if "&" in self.genre:
                            self.genre = ''
                            continue
                        logging.debug('self.genre=%s' % self.genre)
                        self._splitGenre(self.genre)
                        logging.debug('self.subjects=%s' % self.subjects)
                if "Complete" in l:
                    self.storyStatus = 'Completed'
                else:
                    self.storyStatus = 'In-Progress'
            elif l.find("<SELECT title='chapter navigation'") != -1:
                # Only the first chapter selector on the page is used.
                if len(urls) > 0:
                    continue
                try:
                    line = l.decode('utf-8')
                except UnicodeEncodeError:
                    line = l
                except Exception:
                    line = l.encode('ascii', 'xmlcharrefreplace')
                line = re.sub(r'&\#[0-9]+;', ' ', line)
                s2 = bs.BeautifulSoup(line)
                options = s2.findAll('option')
                for o in options:
                    url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value']
                    title = o.string
                    logging.debug('URL = `%s`, Title = `%s`' % (url, title))
                    urls.append((url,title))
            elif l.find("var chapters") != -1:
                self.numChapters = self._getVarValue (l)
                logging.debug('self.numChapters=%s' % self.numChapters)
            elif l.find("var words") != -1:
                self.numWords = self._getVarValue (l)
                logging.debug('self.numWords=%s' % self.numWords)
            elif l.find("var categoryid") != -1:
                categoryid = self._getVarValue (l)
                logging.debug('categoryid=%s' % categoryid)
            elif l.find("var cat_title") != -1:
                self.category = self._getVarValue (l).strip("'")
                logging.debug('self.category=%s' % self.category)
                self._splitCrossover(self.category)
                logging.debug('self.subjects=%s' % self.subjects)
            elif l.find("var summary") != -1:
                self.storyDescription = self._getVarValue (l).strip("'")
                if '&' in self.storyDescription:
                    self.storyDescription = self._escapeBareAmpersands(self.storyDescription)
                logging.debug('self.storyDescription=%s' % self.storyDescription)
            elif l.find("var datep") != -1:
                dateps = self._getVarValue (l)
                self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
                logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
            elif l.find("var dateu") != -1:
                dateus = self._getVarValue (l)
                self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
                logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
        if len(urls) <= 0:
            # no chapters found, try url by itself.
            urls.append((self.url,self.storyName))
        self.authorURL = 'http://' + self.host + '/u/' + self.authorId
        #logging.debug('urls=%s' % urls)
        return urls

    def getText(self, url):
        """Fetch a chapter page and return its ``<div id="storytext">`` markup.

        Raises:
            FailedToDownload: the page could not be fetched, was empty, could
                not be parsed, or has no ``storytext`` div.
        """
        data = ''
        try:
            logging.debug("Fetching URL: %s" % url)
            data = self.fetchUrl(url)
        except Exception:
            e = sys.exc_info()[1]
            data = ''
            logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
        # A failed fetch leaves data empty, so test for emptiness, not only None.
        if not data:
            raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
        olddata = data
        try:
            data = data.decode('utf8')
        except Exception:
            # Keep the text as fetched when it cannot be decoded.
            data = olddata
        soup = None
        try:
            soup = bs.BeautifulStoneSoup(data)
        except Exception:
            logging.debug(data)
            raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            logging.debug(data)
            raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
        return div.__str__('utf8')
class FFA_UnitTests(unittest.TestCase):
    # Smoke tests for the FFNet adapter.  Every case downloads live pages
    # from fanfiction.net / fictionpress.com, so network access is required.

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        # Author and title are populated by extractIndividualUrls().
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        adapter.extractIndividualUrls()
        self.assertEquals('Beka0502', adapter.getAuthorName())
        self.assertEquals("Draco's Redemption", adapter.getStoryName())

    def testChaptersCountNames(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        chapters = adapter.extractIndividualUrls()
        self.assertEquals(10, len(chapters))

    def testGetText(self):
        chapter_url = 'http://www.fanfiction.net/s/5257563/1'
        adapter = FFNet(chapter_url)
        body = adapter.getText(chapter_url)
        expected = 'He was just about to look at some photos when he heard a crack'
        self.assertTrue(expected in body)

    def testBrokenWands(self):
        # No assertions: this only checks that neither call raises.
        chapter_url = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
        adapter = FFNet(chapter_url)
        adapter.getText(chapter_url)
        adapter.extractIndividualUrls()

    def testFictionPress(self):
        chapter_url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        adapter = FFNet(chapter_url)
        adapter.extractIndividualUrls()
        self.assertEquals('Behind This Facade', adapter.getStoryName())
        self.assertEquals('IntoxicatingMelody', adapter.getAuthorName())
        body = adapter.getText(chapter_url)
        expected = 'Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to'
        self.assertTrue(expected in body)
# Run the unit tests defined above when this module is executed as a script.
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import logging
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import cookielib as cl
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
import datetime
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
def __init__(self, url):
# Set up an adapter for the given FictionAlley URL: split the URL into host
# and path, build a cookie-carrying opener, and initialise the story
# metadata attributes to placeholder defaults.
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
logging.debug('self.host=%s' % self.host)
logging.debug('self.path=%s' % self.path)
cookieproc = u2.HTTPCookieProcessor()
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
# The cookie domain is hard-coded to 'www.fictionalley.org' (not self.host)
# and it expires 10000 seconds after construction.
cookie = cl.Cookie(version=0, name='fauser', value='wizard',
port=None, port_specified=False,
domain='www.fictionalley.org', domain_specified=False, domain_initial_dot=False,
path='/authors', path_specified=True,
secure=False,
expires=time.time()+10000,
discard=False,
comment=None,
comment_url=None,
rest={'HttpOnly': None},
rfc2109=False)
cookieproc.cookiejar.set_cookie(cookie)
# All page fetches in this class go through this opener so the cookie is sent.
self.opener = u2.build_opener(cookieproc)
# Path segments; for '/authors/<author>/<story>.html' the author id is
# ss[2] and the story id is ss[3] without the '.html' suffix.
ss = self.path.split('/')
self.storyDescription = 'Fanfiction Story'
self.authorId = ''
self.authorURL = ''
self.storyId = ''
# NOTE(review): indentation is missing in this copy of the file; presumably
# the len(ss) > 3 check is nested under the 'authors' check -- confirm.
if len(ss) > 2 and ss[1] == 'authors':
self.authorId = ss[2]
self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
if len(ss) > 3:
self.storyId = ss[3].replace ('.html','')
# Placeholder dates.  These are datetime.date objects, whereas
# extractIndividualUrls() later stores datetime.datetime values.
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Harry Potter')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = 'Harry Potter'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.storyName = ''
self.outputName = ''
# Separator string stored for output naming; its use is outside this block.
self.outputStorySep = '-fa_'
def getPasswordLine(self):
# Returns the fixed marker string 'opaopapassword'.
# NOTE(review): looks like a placeholder value; how callers use it is not
# visible in this file section -- confirm before relying on it.
return 'opaopapassword'
def getLoginScript(self):
# Returns the fixed marker string 'opaopaloginscript'.
# NOTE(review): looks like a placeholder rather than a real login script
# path -- confirm against the callers.
return 'opaopaloginscript'
def getLoginPasswordOthers(self):
    """Return the login form description as a (login, other) pair of dicts.

    The first dict holds the 'login' and 'password' entries, the second
    holds the remaining fixed entries ('submit' and 'remember').
    """
    credentials = {'login': 'name', 'password': 'pass'}
    extras = {'submit': 'Log In', 'remember': 'yes'}
    return (credentials, extras)
def _processChapterHeaders(self, div):
# Pull story metadata out of the chapter header element `div`.
# For every <br> element found, contents[1] (with markup stripped) is used
# as the key and contents[2] as the value; the keys 'Rating:', 'Genre:',
# 'Main Character(s):' and 'Summary:' update the matching attributes.
# NOTE(review): indentation is missing in this copy of the file, so the
# exact nesting of the statements below is inferred -- confirm.
# NOTE(review): this relies on the parser nesting the header text inside
# the <br> elements -- presumably a side effect of BeautifulStoneSoup.
brs = div.findAll ('br')
for br in brs:
keystr=''
valstr=''
if len(br.contents) > 2:
keystr = br.contents[1]
if keystr is not None:
# Drop any tags around the key text and keep only the text pieces.
strs = re.split ("<[^>]+>", unicode(keystr))
keystr=''
for s in strs:
keystr = keystr + s
valstr = br.contents[2].strip(' ')
if keystr is not None:
if keystr == 'Rating:':
self.storyRating = valstr
logging.debug('self.storyRating=%s' % self.storyRating)
elif keystr == 'Genre:':
self.genre = valstr
logging.debug('self.genre=%s' % self.genre)
# Each comma-separated genre is also recorded as a subject.
s2 = valstr.split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif keystr == 'Main Character(s):':
s2 = valstr.split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif keystr == 'Summary:':
self.storyDescription = valstr
logging.debug('self.storyDescription=%s' % self.storyDescription)
def extractIndividualUrls(self):
# Download the story page at self.url, fill in the story metadata
# attributes, and return a list of (chapter_url, chapter_title) tuples.
# Raises FailedToDownload if the first page cannot be parsed.
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below is inferred -- confirm.
data = ''
try:
data = self.opener.open(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
# NOTE(review): after a fetch exception data is '' rather than None, so
# this check does not catch that case.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
# There is some useful information in the headers of the first chapter page.
# The header comment markers are swapped for a made-up element so the
# header can be located with soup.find() below.
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
if breadcrumbs is not None:
# Be aware that this means that the user has entered the {STORY}01.html
# We will not have valid Published and Updated dates. User should enter
# the {STORY}.html instead. We should force that instead of this.
#logging.debug('breadcrumbs=%s' % breadcrumbs )
bcas = breadcrumbs.findAll('a')
#logging.debug('bcas=%s' % bcas )
if bcas is not None and len(bcas) > 1:
# The second breadcrumb link is taken as the story's index page.
bca = bcas[1]
#logging.debug('bca=%s' % bca )
if 'href' in bca._getAttrMap():
#logging.debug('bca.href=%s' % bca['href'] )
url = unicode(bca['href'])
if url is not None and len(url) > 0:
# Switch to the index page and re-download/re-parse it.
self.url = url
logging.debug('self.url=%s' % self.url )
ss = self.url.split('/')
self.storyId = ss[-1].replace('.html','')
self.storyName = bca.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
# NOTE(review): this second fetch is not wrapped in try/except, so
# network errors propagate to the caller from here.
data = self.opener.open(self.url).read()
# There is some useful information in the headers of the first chapter page.
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = bs.BeautifulStoneSoup(data)
# If it is decided that we really do care about number of words.. It's only available on the author's page..
#d0 = self.opener.open(self.authorURL).read()
#soupA = bs.BeautifulStoneSoup(d0)
#dls = soupA.findAll('dl')
#logging.debug('dls=%s' % dls)
# Get title from <title>, remove before '-'.
if len(self.storyName) == 0:
title = soup.find('title').string
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
# Chapter entries are looked up as <li> elements.
links = soup.findAll('li')
self.numChapters = 0;
result = []
if len(links) == 0:
# Be aware that this means that the user has entered the {STORY}01.html
# We will not have valid Published and Updated dates. User should enter
# the {STORY}.html instead. We should force that instead of this.
# No <li> elements: the page itself is returned as the only chapter.
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
result.append((self.url,self.storyName))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
self.numChapters = self.numChapters + 1;
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
else:
# Index page: author comes from the title heading, rating and
# description from the summary div.
author = soup.find('h1', {'class' : 'title'})
self.authorName = author.a.string
summary = soup.find('div', {'class' : 'summary'})
ss = summary.contents
if len(ss) > 1:
ss1 = ss[0].split(': ')
if len(ss1) > 1 and ss1[0] == 'Rating':
self.storyRating = ss1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
self.storyDescription = unicode(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
logging.debug('self.storyDescription=%s' % self.storyDescription)
for li in links:
a = li.find('a', {'class' : 'chapterlink'})
s = li.contents
if a is not None:
url = a['href']
title = a.string
result.append((url,title))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
if self.numChapters == 0:
# First chapter only: fetch it to read its header metadata.
# fictionalley uses full URLs in chapter list.
d1 = self.opener.open(url).read()
# find <!-- headerstart --> & <!-- headerend --> and
# replaced with matching div pair for easier parsing.
# Yes, it's an evil kludge, but what can ya do? Using
# something other than div prevents soup from pairing
# our div with poor html inside the story text.
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
sop = bs.BeautifulStoneSoup(d1)
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
self.numChapters = self.numChapters + 1
if len(s) > 1:
# The text after the link is scanned for 'Posted: <date> <time>'.
datestr=''
ss2 = s[1].replace('\n','').replace('(','').split(' ')
if len(ss2) > 2 and ss2[0] == 'Posted:':
datestr = ss2[1] + ' ' + ss2[2]
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
# The first chapter's date is the published date; the most
# recently processed chapter's date becomes the updated date.
if self.numChapters == 1:
self.storyPublished = tmpdate
self.storyUpdated = tmpdate
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
else:
logging.debug('li chapterlink not found! li=%s' % li)
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
return result
def getText(self, url):
# fictionalley uses full URLs in chapter list.
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
# find <!-- headerend --> & <!-- footerstart --> and
# replaced with matching div pair for easier parsing.
# Yes, it's an evil kludge, but what can ya do? Using
# something other than div prevents soup from pairing
# our div with poor html inside the story text.
data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
html = soup.findAll('html')
if len(html) > 1:
return html[1].__str__('utf8')
else:
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story and print the
    # text of the first one.  The previous version referenced `self` at
    # module level, passed arguments extractIndividualUrls() does not
    # accept, and handed page HTML (not a URL) to getText().
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        # Each entry is a (chapter_url, chapter_title) tuple.
        print(fw.getText(urls[0][0]))

267
fanficdownloader/ficwad.py Normal file
View file

@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
import time
import datetime
from adapter import *
class FicWad(FanfictionSiteAdapter):
def __init__(self, url):
    """Create a FicWad adapter for `url` with placeholder story metadata.

    Only the URL and its host are derived from the argument; every other
    attribute is a default value.
    """
    # Placeholder date shared by the published/updated fields.
    placeholder_date = datetime.date(1970, 1, 31)

    # Location of the story.
    self.url = url
    self.host = up.urlparse(url).netloc
    self.publisher = self.host

    # Identity and description.
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.storyId = '0'
    self.storySeries = ''
    self.storyCharacters = []
    self.subjects = ['fanfiction']

    # Dates.
    self.storyPublished = placeholder_date
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = placeholder_date

    # Language.
    self.languageId = 'en-UK'
    self.language = 'English'

    # Size, classification and rating.
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'

    # Output naming.
    self.outputName = ''
    self.outputStorySep = '-fw_'
def getPasswordLine(self):
# Returns the fixed marker string 'opaopapassword'.
# NOTE(review): looks like a placeholder value; its use by callers is not
# visible in this file section -- confirm.
return 'opaopapassword'
def getLoginScript(self):
# Returns the fixed marker string 'opaopaloginscript'.
# NOTE(review): looks like a placeholder rather than a real script path --
# confirm against the callers.
return 'opaopaloginscript'
def getLoginPasswordOthers(self):
    """Return two dicts describing the login form, as a (login, other) tuple.

    `login` carries the 'login' and 'password' entries; `other` carries the
    fixed 'submit' and 'remember' entries.
    """
    login_fields = {'login': 'name', 'password': 'pass'}
    other_fields = {'submit': 'Log In', 'remember': 'yes'}
    return (login_fields, other_fields)
def extractIndividualUrls(self):
# Download the page at self.url, fill in the story metadata attributes and
# return a list of (chapter_url, chapter_title) tuples.
# Raises FailedToDownload if the first page cannot be parsed and
# LoginRequiredException from the blocked-entry check further down.
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below is inferred -- confirm.
oldurl = ''
cururl = self.url
data = ''
try:
data = u2.urlopen(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
# NOTE(review): after a fetch exception data is '' rather than None, so
# this check does not catch that case.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
story = soup.find('div', {'id' : 'story'})
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# last of crumbtrail
storyinfo = allAhrefs[-1]
# The three-way unpack requires an href with exactly two slashes.
(u0, u1, storyid) = storyinfo['href'].split('/')
if u1 == "story":
# This page does not have the correct information on it.. Need to get the Story Title Page
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
# Remember the chapter URL; it is fetched again below for the chapter list.
oldurl = self.url
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
data = u2.urlopen(self.url).read()
soup = bs.BeautifulStoneSoup(data)
story = soup.find('div', {'id' : 'story'})
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# save chapter name from header in case of one-shot.
storyinfo = story.find('h4').find('a')
(u0, u1, self.storyId) = storyinfo['href'].split('/')
self.storyName = storyinfo.string.strip()
logging.debug('self.storyName=%s, self.storyId=%s' % (self.storyName, self.storyId))
author = soup.find('span', {'class' : 'author'})
self.authorName = unicode(author.a.string)
(u0, u1,self.authorId) = author.a['href'].split('/')
self.authorURL = 'http://' + self.host + author.a['href']
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
description = soup.find('blockquote', {'class' : 'summary'})
if description is not None:
self.storyDescription = unicode(description.p.string)
logging.debug('self.storyDescription=%s' % self.storyDescription)
# The 'meta' paragraph is split on ' - ' into entries; entries containing
# ':' are treated as 'Key: value', the others are checked for a word count.
meta = soup.find('p', {'class' : 'meta'})
if meta is not None:
s = unicode(meta).replace('\n',' ').replace('\t','').split(' - ')
#logging.debug('meta.s=%s' % s)
for ss in s:
s1 = ss.replace('&nbsp;','').split(':')
#logging.debug('meta.s.s1=%s' % s1)
if len(s1) > 1:
# Strip a leading tag from the key, if there is one.
s2 = re.split ('<[^>]+>', s1[0])
#logging.debug('meta.s.s1.s2=%s' % s2)
if len(s2) > 1:
s1[0] = s2[1]
skey = s1[0].strip()
#logging.debug('Checking = %s' % skey)
if skey == 'Category':
soup1 = bs.BeautifulStoneSoup(s1[1])
allAs = soup1.findAll('a')
for a in allAs:
# Only the first link replaces the 'Category' placeholder.
if self.category == 'Category':
self.category = unicode(a.string)
logging.debug('self.category=%s' % self.category)
self.addSubject(self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Rating':
self.storyRating = s1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
elif skey == 'Genres':
self.genre = s1[1]
logging.debug('self.genre=%s' % self.genre)
s2 = s1[1].split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Characters':
s2 = s1[1].split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif skey == 'Chapters':
# Stored as the raw string from the page, not as an int.
self.numChapters = s1[1]
logging.debug('self.numChapters=%s' % self.numChapters)
elif skey == 'Warnings':
logging.debug('Warnings=%s' % s1[1])
elif skey == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif skey == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
# Entry without ':'; an entry such as '<n>words' (after the &nbsp;
# removal above) yields the word count.
s3 = re.split ('<[^>]+>', s1[0])
#logging.debug('meta.s.s1.s3=%s' % s3)
if len(s3) > 1:
s1[0] = s3[0]
s4 = s1[0].split('w')
#logging.debug('meta.s.s1.s4=%s' % s4)
if len(s4) > 1 and s4[1] == 'ords':
self.numWords = s4[0]
logging.debug('self.numWords=%s' % self.numWords)
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
result = []
# ii is the 1-based number of the next chapter to be recorded.
ii = 1
if oldurl is not None and len(oldurl) > 0:
logging.debug('Switching back to %s' % oldurl)
cururl = oldurl
data = u2.urlopen(oldurl).read()
soup = bs.BeautifulStoneSoup(data)
storylist = soup.find('ul', {'id' : 'storylist'})
if storylist is not None:
allBlocked = storylist.findAll('li', {'class' : 'blocked'})
# NOTE(review): if findAll() returns a (possibly empty) list and never
# None -- as in BeautifulSoup 3 -- this raises whenever a story list is
# present.  Checking len(allBlocked) > 0 looks like the intent; confirm.
if allBlocked is not None:
#logging.debug('allBlocked=%s' % allBlocked)
raise LoginRequiredException(cururl)
allH4s = storylist.findAll('h4')
#logging.debug('allH4s=%s' % allH4s)
if allH4s is not None:
for h4 in allH4s:
chapterinfo = h4.find('a')
#logging.debug('Chapter1=%s' % chapterinfo)
url = 'http://' + self.host + chapterinfo['href']
title = chapterinfo.string.strip()
#logging.debug('Chapter=%s, %s' % (url, title))
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
# Nothing found via the story list: fall back to the 'goto' dropdown, or
# treat the page as a single-chapter story.
if ii == 1:
select = soup.find('select', { 'name' : 'goto' } )
if select is None:
self.numChapters = '1'
logging.debug('self.numChapters=%s' % self.numChapters)
result.append((self.url,self.storyName))
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = 'http://' + self.host + o['value']
title = o.string
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = ''
try:
data = u2.urlopen(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story and print the
    # text of the first one.  The previous version passed the downloaded
    # page HTML to getText(), which expects a chapter URL.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        # Each entry is a (chapter_url, chapter_title) tuple.
        print(fw.getText(urls[0][0]))

344
fanficdownloader/fpcom.py Normal file
View file

@ -0,0 +1,344 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class FPCom(FanfictionSiteAdapter):
def __init__(self, url):
# Set up an adapter for a story URL of the form '/s/<storyId>/...':
# initialise metadata defaults, validate and normalise the path, and create
# the URL opener.  Raises InvalidStoryURL when the first path segment is
# not 's'.
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.storyName = ''
self.authorName = ''
self.storyDescription = ''
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
# Initialised to the URL path here; extractIndividualUrls() replaces it
# with the author's page URL.
self.authorURL = self.path
self.storyId = '0'
# Placeholder dates (datetime.date objects; extractIndividualUrls() later
# stores datetime.datetime values).
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = ''
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.outputName = ''
self.outputStorySep = '-fpcom_'
# Drop the leading '/' so that spl[0] is the first real path segment.
if self.path.startswith('/'):
self.path = self.path[1:]
spl = self.path.split('/')
# NOTE(review): indentation is missing in this copy of the file; the
# nesting of the checks below is inferred -- confirm.
if spl is not None:
if len(spl) > 0 and spl[0] != 's':
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
if len(spl) > 1:
self.storyId = spl[1]
# NOTE(review): `chapter` is never read in this method, and spl[1] is the
# story id assigned just above -- spl[2] may have been intended.
if len(spl) > 2:
chapter = spl[1]
else:
chapter = '1'
# With exactly five segments, keep only the middle ones (drops the
# leading 's' and the final segment).
if len(spl) == 5:
self.path = "/".join(spl[1:-1])
if self.path.endswith('/'):
self.path = self.path[:-1]
logging.debug('self.path=%s' % self.path)
# self.appEngine is not set in this method -- presumably supplied by the
# base class; verify.
if not self.appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
self.opener = None
logging.debug("Created FP.Com: url=%s" % (self.url))
def _getLoginScript(self):
# Returns self.path (the normalised URL path computed in __init__).
return self.path
def _getVarValue(self, varstr):
#logging.debug('_getVarValue varstr=%s' % varstr)
vals = varstr.split('=')
#logging.debug('vals=%s' % vals)
retstr="".join(vals[+1:])
#logging.debug('retstr=%s' % retstr)
if retstr.startswith(' '):
retstr = retstr[1:]
if retstr.endswith(';'):
retstr = retstr[:-1]
return retstr
def _splitCrossover(self, subject):
if "Crossover" in subject:
self.addSubject ("Crossover")
logging.debug('Crossover=%s' % subject)
if subject.find(' and ') != -1:
words = subject.split(' ')
logging.debug('words=%s' % words)
subj = ''
for s in words:
if s in "and Crossover":
if len(subj) > 0:
self.addSubject(subj)
subj = ''
else:
if len(subj) > 0:
subj = subj + ' '
subj = subj + s
if len(subj) > 0:
self.addSubject(subj)
else:
self.addSubject(subject)
else:
self.addSubject(subject)
return True
def _splitGenre(self, subject):
if len(subject) > 0:
words = subject.split('/')
logging.debug('words=%s' % words)
for subj in words:
if len(subj) > 0:
self.addSubject(subj)
return True
def extractIndividualUrls(self):
# Download the story page at self.url, fill in the story metadata
# attributes and return a list of (chapter_url, chapter_title) tuples.
# Raises FailedToDownload if the page cannot be parsed.
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below is inferred -- confirm.
data = ''
try:
data = self.fetchUrl(self.url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
# NOTE(review): after a fetch exception data is '' rather than None, so
# this check only triggers if fetchUrl() itself returns None.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
# Numeric character references are replaced by spaces before parsing.
d2 = re.sub('&\#[0-9]+;', ' ', data)
soup = None
try:
soup = bs.BeautifulStoneSoup(d2)
except:
logging.error("Failed to decode: <%s>" % d2)
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
allA = soup.findAll('a')
for a in allA:
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
self.authorName = a.string
# The four-way unpack requires an href with exactly three slashes.
# `u2` here is a local name; within this method it shadows the
# module-level urllib2 alias, which is not used in this method.
(u1, u2, self.authorId, u3) = a['href'].split('/')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
urls = []
metas = soup.findAll ('meta', {'name' : 'description'})
if metas is not None:
for meta in metas:
if 'content' in meta._getAttrMap():
self.storyDescription = unicode(meta['content'])
logging.debug('self.storyDescription=%s' % self.storyDescription)
# NOTE(review): <title> is searched for inside the <meta> element;
# presumably this works because the parser nests later content under
# the unclosed <meta>.  If it returns None, the next line raises
# AttributeError -- confirm.
title=meta.find('title')
logging.debug('title=%s' % title.string)
# The title text is split as '<story name>, <category> - ...'.
tt = title.string.split(',')
if tt is not None:
if len(tt) > 0:
self.storyName = tt[0]
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
if len(tt) > 1:
tt1 = tt[1].split(' - ')
if tt1 is not None and len(tt1) > 0:
self.category = tt1[0].strip()
logging.debug('self.category=%s' % self.category)
# Every word of the category except 'a' becomes a subject;
# 'fanfic' is recorded as 'FanFiction'.
cc = self.category.split(' ')
for cc1 in cc:
if cc1 is not None and cc1 != 'a':
if cc1 == 'fanfic':
self.addSubject('FanFiction')
else:
self.addSubject(cc1)
logging.debug('self.subjects=%s' % self.subjects)
numchapters = 0
urlstory = ''
# NOTE(review): if the 'fidochap' form is missing, fidochap is None and
# the following .find() raises AttributeError.
fidochap = soup.find('form', {'name':'fidochap'})
sl = fidochap.find('select', {'title':'chapter navigation'})
if sl is not None:
logging.debug('sl=%s' % sl )
if 'onchange' in sl._getAttrMap():
# The fourth piece of the quote-split onchange handler is used as the
# URL suffix appended after the chapter number.
ocs = sl['onchange'].split('\'')
logging.debug('ocs=%s' % ocs)
if ocs is not None and len(ocs) > 3:
urlstory = ocs[3]
logging.debug('urlstory=%s' % urlstory)
opts = sl.findAll('option')
for o in opts:
if 'value' in o._getAttrMap():
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value'] + urlstory
logging.debug('URL=%s, Title=%s' % (url, o.string))
urls.append((url, o.string))
numchapters = numchapters + 1
# No chapter dropdown entries: treat the story as a single chapter.
if numchapters == 0:
numchapters = 1
url = 'http://' + self.host + '/s/' + self.storyId + '/1' + urlstory
logging.debug('URL=%s, Title=%s' % (url, self.storyName))
urls.append((url, self.storyName))
# Stored as a unicode string, not an int.
self.numChapters = unicode(numchapters)
logging.debug('self.numChapters=%s' % self.numChapters)
logging.debug('urls=%s' % urls)
self.genre = ''
tds = fidochap.findAll('td')
for td in tds:
tdb = td.find('b')
# The cell whose bold text equals the story name: its link texts are
# collected as genres and subjects.
if tdb is not None and tdb.string == self.storyName:
tdas = td.findAll('a')
for tda in tdas:
ss = tda.string
if ss is not None:
if len(self.genre) > 0:
self.genre = self.genre + ', '
self.genre = self.genre + ss
self.addSubject(ss)
logging.debug('self.genre=%s' % self.genre)
logging.debug('self.subjects=%s' % self.subjects)
tda = td.find ('a')
# The cell whose first link mentions 'Rated:' is split on tags into
# text pieces (tdas) that are read by position below.
if tda is not None and tda.string.find('Rated:') != -1:
tdas = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
if tdas is not None:
ll = len(tdas)
if ll > 2:
ss = tdas[2].split(': ')
if ss is not None and len(ss) > 1:
self.storyRating = ss[1]
logging.debug('self.storyRating=%s' % self.storyRating)
if ll > 3:
ss = tdas[3].split(' - ')
if ss is not None:
lls = len(ss)
if lls > 1:
# Only logged; self.language is not updated from the page.
language = ss[1]
logging.debug('language=%s' % language)
if lls > 2:
self.category = ss[2]
logging.debug('self.category=%s' % self.category)
sgs = self.category.split('/')
for sg in sgs:
self.addSubject(sg)
logging.debug('self.subjects=%s' % self.subjects)
if lls > 3 and ss[3].strip() == 'Reviews:' and ll > 4:
# Only logged; the review count is not stored.
reviews = tdas[4]
logging.debug('reviews=%s' % reviews)
if ll > 5:
ss = tdas[5].split(' - ')
if ss is not None:
lls = len(ss)
if lls > 1:
sds = ss[1].split(': ')
if sds is not None and len(sds) > 1 and sds[0] == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
lls = len(ss)
if lls > 2:
sds = ss[2].split(': ')
if sds is not None and len(sds) > 1 and sds[0] == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
return urls
def getText(self, url):
time.sleep( 2.0 )
data = ''
try:
data = self.fetchUrl(url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
lines = data.split('\n')
textbuf = ''
emit = False
olddata = data
try:
data = data.decode('utf8')
except:
data = olddata
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class FPC_UnitTests(unittest.TestCase):
    # Smoke test for the FPCom adapter.  It downloads a live story from
    # fictionpress.com, so network access is required.

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testFictionPress(self):
        chapter_url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        adapter = FPCom(chapter_url)
        adapter.extractIndividualUrls()
        self.assertEquals('Behind This Facade', adapter.getStoryName())
        self.assertEquals('IntoxicatingMelody', adapter.getAuthorName())
        body = adapter.getText(chapter_url)
        expected = 'Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to'
        self.assertTrue(expected in body)
# Run the unit tests defined above when this module is executed as a script.
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,280 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class HPFiction(FanfictionSiteAdapter):
def __init__(self, url):
    """Create an adapter for `url` and initialise story metadata defaults.

    The query string is inspected once: a 'chapterid' key marks the URL as
    a chapter URL (self.chapurl), while a 'psid' or 'sid' key supplies the
    story id.
    """
    self.url = url
    parsed = up.urlparse(url)
    self.host = parsed.netloc
    self.path = parsed.path
    logging.debug('self.url=%s' % self.url)
    logging.debug('self.host=%s' % self.host)
    logging.debug('self.path=%s' % self.path)
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    # Work out what kind of URL this is from the text after the '?'.
    self.chapurl = False
    self.storyId = '0'
    url_pieces = self.url.split('?')
    logging.debug('sss=%s' % url_pieces)
    if len(url_pieces) > 1:
        key_value = url_pieces[1].split('=')
        logging.debug('sc=%s' % key_value)
        if len(key_value) > 1:
            key = key_value[0]
            if key == 'chapterid':
                self.chapurl = True
            elif key in ('psid', 'sid'):
                self.storyId = key_value[1]

    # Placeholder date shared by the published/updated fields.
    placeholder_date = datetime.date(1970, 1, 31)

    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.authorURL = ''
    self.storyPublished = placeholder_date
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = placeholder_date
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction', 'Harry Potter']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'K'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-hp_'
    logging.debug("Created HPFiction: url=%s" % (self.url))
def _getLoginScript(self):
# Returns self.path (the path component of the URL parsed in __init__).
return self.path
def extractIndividualUrls(self):
# Fetch the story page, fill in the story metadata attributes and return a
# list of (chapter URL, chapter title) tuples.
# Raises FailedToDownload when the page cannot be parsed.
data = ''
try:
data = self.opener.open(self.url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
# NOTE(review): a failed fetch leaves data == '' (never None), so the
# StoryDoesNotExist below cannot be raised from this path.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
links = soup.findAll('a')
# Fallback chapter (first link containing "chapterid=") used when the page
# has no chapter drop-down.
def_chapurl = ''
def_chaptitle = ''
# Started from a chapter URL: find a link carrying "psid" to learn the story
# id, then load the story's title page, which is parsed for the metadata.
# NOTE(review): a['href'] presumably raises KeyError for anchors that have
# no href attribute -- confirm against the BeautifulSoup version in use.
if self.chapurl:
foundid = False
for a in links:
if a['href'].find('psid') != -1:
sp = a['href'].split('?')
if sp is not None and len(sp) > 1:
for sp1 in sp:
if sp1.find('psid') != -1:
ps = sp1.split('=')
if ps is not None and len(ps) > 1:
self.storyId = ps[1].replace('\'','')
foundid = True
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
break
if foundid:
self.url = "http://" + self.host + "/viewstory.php?psid=" + self.storyId
logging.debug('Title Page URL=%s' % self.url)
data1 = self.opener.open(self.url).read()
hdrsoup = bs.BeautifulSoup(data1)
else:
hdrsoup = soup
else:
hdrsoup = soup
# Scan every anchor of the originally fetched page for the story id/title
# (story URLs only), the author link and the first chapter link.
for a in links:
if not self.chapurl and a['href'].find('psid') != -1:
sp = a['href'].split('?')
if sp is not None and len(sp) > 1:
for sp1 in sp:
if sp1.find('psid') != -1:
ps = sp1.split('=')
if ps is not None and len(ps) > 1:
self.storyId = ps[1].replace('\'','')
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
self.authorURL = 'http://' + self.host + '/' + a['href']
# The two-element unpacking requires exactly one '=' in the author href.
(u1, self.authorId) = a['href'].split('=')
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
def_chapurl = 'http://' + self.host + '/viewstory.php' + unicode(a['href'])
def_chaptitle = a.string
logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
# Metadata is read from <td> cells inside <center> elements.  Each cell is
# split on its tags into text fragments; a "Label:" fragment is paired with
# the fragment that follows it, and unlabelled fragments are collected in
# sss, which becomes the story description.
centers = hdrsoup.findAll('center')
for center in centers:
tds = center.findAll ('td')
if tds is not None and len(tds) > 0:
for td in tds:
s = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
ii = 0
ll = len(s)
sss = ''
while ii < ll - 1:
if s[ii] is not None and len(s[ii]) > 0:
if s[ii] == 'Rating:':
self.storyRating = s[ii+1]
logging.debug('self.storyRating=%s' % self.storyRating)
ii = ii + 2
elif s[ii] == 'Chapters:':
# Stored as the text found on the page, not converted to an int.
self.numChapters = s[ii+1]
logging.debug('self.numChapters=%s' % self.numChapters)
ii = ii + 2
elif s[ii] == 'Characters:':
s2 = s[ii+1].split(', ')
for ss2 in s2:
self.addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
ii = ii + 2
elif s[ii] == 'Genre(s):':
self.genre = s[ii+1]
logging.debug('self.genre=%s' % self.genre)
s2 = s[ii+1].split(', ')
for ss2 in s2:
self.addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
ii = ii + 2
elif s[ii] == 'Status:':
if s[ii+1].strip(' ') == "Work In Progress":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii + 2
elif s[ii] == 'First Published:':
# Dates are parsed with the format "%Y.%m.%d".
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
ii = ii + 2
elif s[ii] == 'Last Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
ii = ii + 2
# The next three labels are recognised only so that their values are
# skipped instead of ending up in the description.
elif s[ii] == 'Last Published Chapter:':
ii = ii + 2
elif s[ii] == 'Pairings:':
ii = ii + 2
elif s[ii] == 'Warnings:':
ii = ii + 2
else:
sss = sss + ' ' + s[ii]
ii = ii + 1
else:
ii = ii + 1
self.storyDescription = sss
logging.debug('self.storyDescription=%s' % self.storyDescription)
# Build the chapter list from the "chapterid" drop-down of the originally
# fetched page, leaving out the "Story Index" entry.
urls = []
select = soup.find('select', {'name' : 'chapterid'})
if select is None:
# no chapters found, try url by itself.
if len(def_chapurl) > 0:
urls.append((def_chapurl, def_chaptitle))
else:
urls.append((self.url,self.storyName))
else:
for o in select.findAll('option'):
if 'value' in o._getAttrMap():
url = 'http://' + self.host + self.path + o['value']
title = o.string
if title != "Story Index":
urls.append((url,title))
return urls
def getText(self, url):
logging.debug('Downloading from URL: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
divtext = soup.find('div', {'id' : 'fluidtext'})
if None == divtext:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return divtext.__str__('utf8')
class FF_UnitTests(unittest.TestCase):
    """Checks of HPFiction against a live story; they need network access."""

    def setUp(self):
        """Enable debug logging so the adapter's trace output is visible."""
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        """A chapter URL yields the chapter list, the author and the title."""
        adapter = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
        chapter_urls = adapter.extractIndividualUrls()

        self.assertEquals(49, len(chapter_urls))
        self.assertEquals('Elisha', adapter.getAuthorName())
        self.assertEquals('A Secret Thought', adapter.getStoryName())

    def testGetText(self):
        """The downloaded chapter contains a known sentence."""
        chapter_url = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'
        adapter = HPFiction(chapter_url)
        chapter_text = adapter.getText(chapter_url)
        self.assertTrue(chapter_text.find('She pulled out of his arms and felt the subtle regret') != -1)
# Run this module's unit tests when the file is executed as a script.
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,452 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "2.37"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
import urlparse
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
try: from textwrap import wrap
except: pass
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False
### Entity Nonsense ###
def name2cp(k):
    """Return the code point of the HTML entity named *k*.

    ``apos`` is answered directly; every other name is looked up in
    ``htmlentitydefs``.  An unknown name raises KeyError.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback for interpreters without name2codepoint (it requires
        # Python 2.3): entitydefs holds either a latin-1 character or a
        # numeric "&#NNN;" reference.
        definition = htmlentitydefs.entitydefs[k]
        if definition.startswith("&#") and definition.endswith(";"):
            return int(definition[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
# Entity names mapped to plain-ASCII stand-ins.  entityref() uses this table
# whenever UNICODE_SNOB is off.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
# The same stand-ins keyed by code point, so that charref() can apply them to
# numeric character references as well.
unifiable_n = {}
for k in unifiable.keys():
unifiable_n[name2cp(k)] = unifiable[k]
def charref(name):
    """Turn the body of a numeric character reference into text.

    *name* is the reference without its leading ``&#``: decimal digits,
    or ``x``/``X`` followed by hexadecimal digits.  Unless UNICODE_SNOB
    is set, code points listed in ``unifiable_n`` are replaced by their
    ASCII stand-in; everything else becomes the Unicode character.
    """
    is_hex = name[0] in ['x','X']
    if is_hex:
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)

    if UNICODE_SNOB or codepoint not in unifiable_n:
        return unichr(codepoint)
    return unifiable_n[codepoint]
def entityref(c):
    """Turn a named entity reference into text.

    *c* is the entity name without ``&`` and ``;``.  Unless UNICODE_SNOB
    is set, names listed in ``unifiable`` are replaced by their ASCII
    stand-in.  Other known names become the Unicode character; an unknown
    name is returned as the literal reference ``&name;``.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]
    try:
        codepoint = name2cp(c)
    except KeyError:
        # Unknown entity: put the reference back together, including the
        # terminating semicolon, so the text is not silently altered.
        return "&" + c + ";"
    return unichr(codepoint)
def replaceEntities(s):
    """Substitution callback: expand the reference captured in group 1.

    *s* is a regular-expression match object.  A capture starting with
    ``#`` is handed to charref(), any other to entityref().
    """
    reference = s.group(1)
    if reference[0] != "#":
        return entityref(reference)
    return charref(reference[1:])
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
    """Return *s* with every reference matched by r_unescape expanded."""
    expanded = r_unescape.sub(replaceEntities, s)
    return expanded
def fixattrs(attrs):
    """Return *attrs* with entity references in every value expanded.

    *attrs* is a sequence of (name, value) pairs; a new list of pairs is
    returned.  A false value (None or an empty list) is returned as is.
    """
    # Fix bug in sgmllib.py
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
### End Entity Nonsense ###
def onlywhite(line):
    """Return a true value if the line consists only of whitespace.

    Spaces and tabs count as whitespace.  The result is False as soon as
    any other character is found; otherwise *line* itself is returned, so
    an empty line gives a false value.
    """
    for c in line:
        # Compare by value: an identity test ("is") on strings only works
        # by accident of interning.
        if c not in ' \t':
            return False
    return line
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Each line of *text* is treated as one paragraph and wrapped at
    BODY_WIDTH columns.  Lines starting with a space, ``-`` or ``*`` are
    copied unwrapped (and dropped when they hold nothing but whitespace).
    Runs of empty lines are reduced so that at most two newlines follow
    each other.  With BODY_WIDTH set to 0 the text is returned unchanged.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # Value comparison; identity tests on strings are unreliable.
            if para[0] not in ' -*':
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            else:
                if not onlywhite(para):
                    pieces.append(para + "\n")
                    newlines = 1
        else:
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
    return ''.join(pieces)
def hn(tag):
    """Return the heading level of an ``h1`` .. ``h9`` tag name.

    Returns the level (1-9) for a heading tag, 0 for a two-character name
    starting with ``h`` whose second character is not a number, and None
    in every other case.
    """
    if tag[0] != 'h' or len(tag) != 2:
        return None
    try:
        level = int(tag[1])
    except ValueError:
        return 0
    if 1 <= level <= 9:
        return level
    return None
class _html2text(sgmllib.SGMLParser):
def __init__(self, out=None, baseurl=''):
    """Create a parser with empty conversion state.

    *out* is a callable that receives every piece of output text; when it
    is None the text is collected in ``self.outtext``.  *baseurl* is kept
    for resolving link targets when the reference list is written.
    """
    sgmllib.SGMLParser.__init__(self)

    self.out = out
    if self.out is None:
        self.out = self.outtextf
    self.outtext = u''

    # Output state
    self.quiet = 0
    self.p_p = 0
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.lastWasNL = 0

    # Links
    self.a = []
    self.astack = []
    self.acount = 0
    self.baseurl = baseurl

    # Block-level context
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0

    # Abbreviations
    self.abbr_title = None # current abbreviation definition
    self.abbr_data = None # last inner HTML (for abbr being defined)
    self.abbr_list = {} # stack of abbreviations to write later
def outtextf(self, s):
    """Default output sink: append *s* to the ``self.outtext`` buffer."""
    self.outtext = self.outtext + s
def close(self):
    """Finish parsing, emit the end-of-document output and return the
    text collected in ``self.outtext``."""
    sgmllib.SGMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    collected = self.outtext
    return collected
def handle_charref(self, c):
    """Parser hook: output the text for numeric character reference *c*."""
    text = charref(c)
    self.o(text)
def handle_entityref(self, c):
    """Parser hook: output the text for named entity reference *c*."""
    text = entityref(c)
    self.o(text)
def unknown_starttag(self, tag, attrs):
    """Parser hook: pass an opening tag to handle_tag() with start=1."""
    is_start = 1
    self.handle_tag(tag, attrs, is_start)
def unknown_endtag(self, tag):
    """Parser hook: pass a closing tag to handle_tag() with start=0 and
    no attributes."""
    is_start = 0
    self.handle_tag(tag, None, is_start)
def previousIndex(self, attrs):
    """Return the index in ``self.a`` of a link equal to *attrs*.

    Two links are equal when their ``href`` values match and their
    ``title`` values match as well (or neither of them has a title).
    Returns None when *attrs* has no ``href`` or no equal link has been
    recorded yet.
    """
    if 'href' not in attrs:
        return None

    for i, a in enumerate(self.a):
        if 'href' not in a or a['href'] != attrs['href']:
            continue
        if 'title' in a or 'title' in attrs:
            # A title on either side must be present, and equal, on both.
            if 'title' in a and 'title' in attrs and a['title'] == attrs['title']:
                return i
        else:
            return i
    return None
def handle_tag(self, tag, attrs, start):
# Translate one HTML tag into its Markdown counterpart.  `start` is 1 for an
# opening tag and 0 for a closing tag; `attrs` is the attribute list of an
# opening tag (None for a closing tag, see unknown_endtag).
attrs = fixattrs(attrs)
# Headings: paragraph break, then one '#' per level on the opening tag.
if hn(tag):
self.p()
if start: self.o(hn(tag)*"#" + ' ')
if tag in ['p', 'div']: self.p()
if tag == "br" and start: self.o(" \n")
if tag == "hr" and start:
self.p()
self.o("* * *")
self.p()
# Content of head/style/script is suppressed: o() emits nothing while
# self.quiet is non-zero.
if tag in ["head", "style", 'script']:
if start: self.quiet += 1
else: self.quiet -= 1
if tag in ["body"]:
self.quiet = 0 # sites like 9rules.com never close <head>
if tag == "blockquote":
if start:
self.p(); self.o('> ', 0, 1); self.start = 1
self.blockquote += 1
else:
self.blockquote -= 1
self.p()
# Emphasis markers are identical for the opening and the closing tag.
if tag in ['em', 'i', 'u']: self.o("_")
if tag in ['strong', 'b']: self.o("**")
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
# <abbr title=...>: o() collects the inner text in abbr_data; on the closing
# tag the text/title pair is stored for the list written at the end.
if tag == "abbr":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
self.abbr_title = None
self.abbr_data = ''
if attrs.has_key('title'):
self.abbr_title = attrs['title']
else:
if self.abbr_title != None:
self.abbr_list[self.abbr_data] = self.abbr_title
self.abbr_title = None
self.abbr_data = ''
# Links become "[text][n]".  astack pairs each closing </a> with its opening
# tag; None is pushed for anchors that produce no link.
if tag == "a":
if start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
self.astack.append(attrs)
self.o("[")
else:
self.astack.append(None)
else:
if self.astack:
a = self.astack.pop()
if a:
# Reuse the number of an identical earlier link, otherwise assign the next.
i = self.previousIndex(a)
if i is not None:
a = self.a[i]
else:
self.acount += 1
a['count'] = self.acount
a['outcount'] = self.outcount
self.a.append(a)
# Backticks are the Python 2 spelling of repr().
self.o("][" + `a['count']` + "]")
# Images become "![alt][n]" and share the numbered reference list with links.
if tag == "img" and start:
attrsD = {}
for (x, y) in attrs: attrsD[x] = y
attrs = attrsD
if attrs.has_key('src'):
attrs['href'] = attrs['src']
alt = attrs.get('alt', '')
i = self.previousIndex(attrs)
if i is not None:
attrs = self.a[i]
else:
self.acount += 1
attrs['count'] = self.acount
attrs['outcount'] = self.outcount
self.a.append(attrs)
self.o("![")
self.o(alt)
self.o("]["+`attrs['count']`+"]")
# Definition lists.
if tag == 'dl' and start: self.p()
if tag == 'dt' and not start: self.pbr()
if tag == 'dd' and start: self.o(' ')
if tag == 'dd' and not start: self.pbr()
# self.list is a stack of the open lists; 'num' counts the items of an <ol>.
if tag in ["ol", "ul"]:
if start:
self.list.append({'name':tag, 'num':0})
else:
if self.list: self.list.pop()
self.p()
if tag == 'li':
if start:
self.pbr()
# An <li> outside any list is rendered like an unordered item.
if self.list: li = self.list[-1]
else: li = {'name':'ul', 'num':0}
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
if li['name'] == "ul": self.o("* ")
elif li['name'] == "ol":
li['num'] += 1
self.o(`li['num']`+". ")
self.start = 1
else:
self.pbr()
if tag in ["table", "tr"] and start: self.p()
if tag == 'td': self.pbr()
if tag == "pre":
if start:
self.startpre = 1
self.pre = 1
else:
self.pre = 0
self.p()
def pbr(self):
    """Request a single line break, unless a break is already pending."""
    if self.p_p != 0:
        return
    self.p_p = 1
def p(self):
    """Request a paragraph break (two newlines) before the next output."""
    self.p_p = 2
def o(self, data, puredata=0, force=0):
# Central output routine: writes `data` through self.out, first emitting any
# pending line breaks, blockquote prefixes and spacing.
# puredata -- true for document text (whitespace is collapsed outside <pre>).
# force    -- true to run even with empty data; the value 'end' additionally
#             writes the collected link references and abbreviations.
# While an <abbr> is open its inner text is also collected in abbr_data.
if self.abbr_data is not None: self.abbr_data += data
if not self.quiet:
if puredata and not self.pre:
data = re.sub('\s+', ' ', data)
# A leading space is remembered in self.space and written later, unless a
# break intervenes.
if data and data[0] == ' ':
self.space = 1
data = data[1:]
if not data and not force: return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
self.startpre = 0
# bq is the prefix repeated after every newline: one '>' per blockquote level.
bq = (">" * self.blockquote)
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
if self.pre:
bq += " "
data = data.replace("\n", "\n"+bq)
# At the start of a block, pending breaks and spaces are dropped.
if self.start:
self.space = 0
self.p_p = 0
self.start = 0
if force == 'end':
# It's the end.
self.p_p = 0
self.out("\n")
self.space = 0
# p_p holds the number of pending newlines (set by pbr() and p()).
if self.p_p:
self.out(('\n'+bq)*self.p_p)
self.space = 0
if self.space:
if not self.lastWasNL: self.out(' ')
self.space = 0
# Write the link references, either after each paragraph
# (LINKS_EACH_PARAGRAPH) or at the end of the document.  Links recorded at
# the current output position are kept in newa for a later call.
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
if force == "end": self.out("\n")
newa = []
for link in self.a:
if self.outcount > link['outcount']:
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
if link.has_key('title'): self.out(" ("+link['title']+")")
self.out("\n")
else:
newa.append(link)
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
self.a = newa
if self.abbr_list and force == "end":
for abbr, definition in self.abbr_list.items():
self.out(" *[" + abbr + "]: " + definition + "\n")
self.p_p = 0
self.out(data)
self.lastWasNL = data and data[-1] == '\n'
# outcount counts the pieces written so far; links store it to record where
# in the output they occurred.
self.outcount += 1
def handle_data(self, data):
    """Parser hook for text content.

    Text containing an escaped script end tag (``\\/script>``) lowers the
    ``quiet`` counter; the text is then passed on to o() as pure data.
    """
    has_escaped_script_end = r'\/script>' in data
    if has_escaped_script_end:
        self.quiet -= 1
    self.o(data, 1)
def unknown_decl(self, data):
    """Parser hook: ignore declarations the parser does not understand."""
    return None
def wrapwrite(text):
    """Write *text* to standard output, encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
def html2text_file(html, out=wrapwrite, baseurl=''):
    """Convert *html*, sending the result to the callable *out*.

    With the default *out* the text goes to standard output.  When *out*
    is None the converted text is collected and returned instead.
    *baseurl* is used to resolve the link targets.
    """
    parser = _html2text(out, baseurl)
    for chunk in (html, ""):
        parser.feed(chunk)
    return parser.close()
def html2text(html, baseurl=''):
    """Return *html* converted to Markdown-structured text, wrapped by
    optwrap().  *baseurl* is used to resolve the link targets."""
    converted = html2text_file(html, None, baseurl)
    return optwrap(converted)
if __name__ == "__main__":
    # Command line use: html2text.py [URL-or-file [encoding]]
    # Without arguments the HTML is read from standard input as UTF-8.
    baseurl = ''
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            baseurl = arg
            j = urllib.urlopen(baseurl)
            try:
                # feedparser, when installed, detects the page's encoding.
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                text = j.read()
                encoding = enc(j.headers, text)[0]
            finally:
                j.close()
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)
        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            # try/finally so the input file is closed even if decoding fails
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data, baseurl))

View file

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# Page skeleton for the single-file HTML output.  It is filled in with
# string.Template, using the ${title}, ${author} and ${body} placeholders.
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h1>${title} by ${author}</h1>
${body}
</div>
</body></html>
'''
# Heading placed in front of each chapter; ${chapter} is the chapter title.
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''
# Empty: XHTML_START already contains the closing tags of the document.
XHTML_END = ''''''

View file

@ -0,0 +1,406 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
try:
import login_password
except:
# tough luck
pass
class MediaMiner(FanfictionSiteAdapter):
def __init__(self, url):
# Set up an adapter for one story URL: store the URL parts, give every
# metadata attribute a placeholder default and work out the story id.
# Raises InvalidStoryURL when the URL matches none of the known page types.
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.storyName = ''
self.authorName = ''
self.storyDescription = ''
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
self.authorURL = self.path
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = ''
self.category = ''
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.outputName = ''
self.outputStorySep = '-mm_'
logging.debug('self.url=%s' % self.url)
# Three URL forms are accepted.  For the two chapter forms the URL and path
# are rewritten to the story page (view_st.php/<id>).
# Story page: the id is whatever follows "view_st.php", slashes removed.
if self.url.find('view_st.php') != -1:
ss = self.url.split('view_st.php')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
self.storyId = ss[1].replace('/','').strip()
# Chapter page with a query string: the id is the text after the last '='.
elif self.url.find('view_ch.php?') != -1:
ss = self.url.split('=')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 1:
self.storyId = ss[-1].replace('/','').strip()
self.path = '/fanfic/view_st.php/' + self.storyId
self.url = 'http://' + self.host + self.path
logging.debug('self.url=%s' % self.url)
# Chapter page in path form: the id is the second-to-last path segment.
elif self.url.find('view_ch.php/') != -1:
ss = self.url.split('/')
logging.debug('ss=%s' % ss)
if ss is not None and len(ss) > 2:
self.storyId = ss[-2].strip()
self.path = '/fanfic/view_st.php/' + self.storyId
self.url = 'http://' + self.host + self.path
logging.debug('self.url=%s' % self.url)
else:
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
logging.debug('self.storyId=%s' % self.storyId)
logging.debug('self.path=%s' % self.path)
# self.appEngine is not set in this method -- presumably provided by the
# FanfictionSiteAdapter base class; confirm there.
if not self.appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
self.opener = None
logging.debug("Created MediaMiner: url=%s" % (self.url))
def _getLoginScript(self):
return self.path
def _getVarValue(self, varstr):
#logging.debug('_getVarValue varstr=%s' % varstr)
vals = varstr.split('=')
#logging.debug('vals=%s' % vals)
retstr="".join(vals[+1:])
#logging.debug('retstr=%s' % retstr)
if retstr.startswith(' '):
retstr = retstr[1:]
if retstr.endswith(';'):
retstr = retstr[:-1]
return retstr
def _splitCrossover(self, subject):
if "Crossover" in subject:
self.addSubject ("Crossover")
logging.debug('Crossover=%s' % subject)
if subject.find(' and ') != -1:
words = subject.split(' ')
logging.debug('words=%s' % words)
subj = ''
for s in words:
if s in "and Crossover":
if len(subj) > 0:
self.addSubject(subj)
subj = ''
else:
if len(subj) > 0:
subj = subj + ' '
subj = subj + s
if len(subj) > 0:
self.addSubject(subj)
else:
self.addSubject(subject)
else:
self.addSubject(subject)
return True
def _splitGenre(self, subject):
if len(subject) > 0:
words = subject.split('/')
logging.debug('words=%s' % words)
for subj in words:
if len(subj) > 0:
self.addSubject(subj)
return True
def extractIndividualUrls(self):
# Fetch the story page, fill in the story metadata attributes and return a
# list of (chapter URL, chapter title) tuples.
# Raises StoryDoesNotExist when the page cannot be fetched and
# FailedToDownload when it cannot be parsed.
data = None
try:
data = self.fetchUrl(self.url)
except Exception, e:
data = None
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
#data.replace('<br />',' ').replace('<br>',' ').replace('</br>',' ')
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
logging.error("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
#logging.debug('soap=%s' % soup)
urls = []
# The cell with class "ffh" holds the story title as its first text node and
# a <font class="smtxt"> whose second word is taken as the rating.
td_ffh = soup.find('td', {'class' : 'ffh'})
#logging.debug('td_ffh=%s' % td_ffh)
if td_ffh is not None:
#logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
self.storyName = unicode(td_ffh.find(text=True)).strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
fft = td_ffh.find('font', {'class' : 'smtxt'})
#logging.debug('fft=%s' % fft)
if fft is not None:
ffts = fft.string.split(' ')
if ffts is not None:
if len(ffts) > 1:
self.storyRating = ffts[1]
logging.debug('self.storyRating=%s' % self.storyRating)
self.genre = ''
# Walk every <td> that has no class or the class "smtxt", looking for the
# cell that contains the "Genre(s):" / "Type:" labels.
td_smtxt = soup.findAll('td')
if td_smtxt is None:
#logging.debug('td_smtxt is NONE!')
pass
else:
ll = len(td_smtxt)
#logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
for ii in range(ll):
td = td_smtxt[ii]
if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
#logging.debug('td has class attribute but is not smtxt')
continue
ss = unicode(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
#logging.debug('ss=%s' % ss)
if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
#logging.debug('ss=%s' % ss)
# Each <b> element is a label (ssbt); its value (sst) is the text of the
# siblings that follow, up to the next <b>, ignoring '|' separators.
ssbs = td.findAll('b')
#logging.debug('ssbs=%s' % ssbs)
bb = 0
while bb < len(ssbs):
# nvs only serves as a NavigableString instance for the class comparison below.
nvs = bs.NavigableString('')
sst=''
ssb = ssbs[bb]
ssbt = unicode(ssb.text).strip()
#logging.debug('ssb=%s' % ssb)
#logging.debug('ssbt=%s' % ssbt)
ssbn = ssb.nextSibling
while ssbn is not None:
#logging.debug('ssbn=%s' % ssbn)
#logging.debug('ssbn.class=%s' % ssbn.__class__)
if nvs.__class__ == ssbn.__class__:
st = unicode(ssbn)
if st.strip() != '|':
sst = sst + st
else:
#logging.debug('ssbn.name=%s' % ssbn.name)
if ssbn.name == 'b':
break
ssbnts = ssbn.findAll(text=True)
for ssbnt in ssbnts:
sst = sst + ssbnt
ssbn = ssbn.nextSibling
sst = sst.replace('&nbsp;',' ').strip()
#logging.debug('sst=%s' % sst)
# The first label/value pair is not matched against a label name: both the
# label (without ':') and its value are added as subjects.
if bb == 0:
ssbt = ssbt.replace(':','')
self.addSubject(ssbt)
self.addSubject(sst)
logging.debug('self.subjects=%s' % self.subjects)
else:
if ssbt == 'Genre(s):':
self.genre = sst
logging.debug('self.genre=%s' % self.genre)
sts = sst.split(' / ')
for st in sts:
self.addSubject(st.strip())
logging.debug('self.subjects=%s' % self.subjects)
elif ssbt == 'Type:':
self.category = sst
logging.debug('self.category=%s' % self.category)
self.addSubject(sst)
logging.debug('self.subjects=%s' % self.subjects)
elif ssbt == 'Author:':
pass
elif ssbt == 'Visits:':
pass
elif ssbt == 'Size:':
pass
elif ssbt == 'Pages:':
pass
elif ssbt == 'Status:':
if sst == "Completed":
self.storyStatus = 'Completed'
else:
self.storyStatus = 'In-Progress'
elif ssbt == 'Words:':
self.numWords = sst.replace('|','').strip()
logging.debug('self.numWords=%s' % self.numWords)
pass
elif ssbt == 'Summary:':
self.storyDescription = sst.strip()
logging.debug('self.storyDescription=%s' % self.storyDescription)
elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
#logging.debug('sst=%s' % sst)
# Only the first three words are kept, to match the format "%B %d, %Y".
ssts = sst.split(' ')
if ssts is not None and len(ssts) > 3:
sst = ssts[0] + ' ' + ssts[1] + ' ' + ssts[2]
#logging.debug('sst=%s' % sst)
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
pass
bb = bb+1
# The author is taken from a link in the same cell whose href contains "/u/".
smtxt_as = td_smtxt[ii].findAll('a')
#logging.debug('smtxt_as=%s' % smtxt_as)
for smtxt_a in smtxt_as:
# NOTE(review): str.find() returns -1 (a true value) when '/u/' is absent
# and 0 (false) when the href starts with it, so this condition is not a
# "contains" test; the length check after the split below is what actually
# filters the links.
if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/'):
sta = smtxt_a['href']
#logging.debug('sta=%s' % sta)
stas = sta.split('/u/')
#logging.debug('stas=%s' % stas)
if stas is not None and len(stas) > 1:
self.authorId = stas[1]
self.authorURL = 'http://' + self.host + sta
self.authorName = smtxt_a.string
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
urlstory=''
numchapters = 0
# Chapters come from the "cid" drop-down inside the cell with class
# "tbbrdr"; the date in brackets at the end of the first option's text is
# used as the publication date.
td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
if td_tbbrdr is not None:
#logging.debug('td_tbbrdr=%s' % td_tbbrdr )
sl = td_tbbrdr.find('select', {'name':'cid'})
if sl is not None:
#logging.debug('sl=%s' % sl )
opts = sl.findAll('option')
for o in opts:
#logging.debug('o=%s' % o)
if 'value' in o._getAttrMap():
url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
logging.debug('URL=%s, Title=%s' % (url, o.string))
if numchapters == 0:
ss = o.string.split('[')
if ss is not None and len(ss) > 1:
ssd = ss[-1].replace(']','')
#logging.debug('ssd=%s' % ssd)
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
urls.append((url, o.string))
numchapters = numchapters + 1
# No drop-down: treat the story page itself as the only chapter and use the
# update date as the publication date.
if numchapters == 0:
numchapters = 1
url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
self.storyPublished = self.storyUpdated
logging.debug('self.storyPublished=%s' % self.storyPublished)
ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
logging.debug('URL=%s, Title=%s' % (url, ssd))
urls.append((url, ssd))
self.numChapters = unicode(numchapters)
logging.debug('self.numChapters=%s' % self.numChapters)
#logging.debug('urls=%s' % urls)
return urls
def getText(self, url):
# Download one chapter page and return the story text as a string of HTML.
# Raises FailedToDownload when the page cannot be parsed.
# Fixed two-second pause before every download.
time.sleep( 2.0 )
logging.debug('url=%s' % url)
data = ''
try:
data = self.fetchUrl(url)
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
# NOTE(review): a failed fetch leaves data == '' (never None), so this check
# only fires if fetchUrl itself returns None.
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulSoup(data)
except:
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
# nvs only serves as a NavigableString instance for the class comparison below.
nvs = bs.NavigableString('')
sst=''
# The text is collected from the siblings that follow each <a name="fic_c">
# anchor.  Leading <br> elements are dropped; once content has been seen
# (foundfirst), a <table> or a <div class="acl"> ends the collection.
allAs = soup.findAll ('a', { 'name' : 'fic_c' })
#logging.debug('allAs=%s' % allAs)
for a in allAs:
#logging.debug('a=%s' % a)
foundfirst = False
done = False
nxta = a.nextSibling
while nxta is not None and not done:
#logging.debug('nxta=%s' % nxta)
#logging.debug('nxta.class=%s' % nxta.__class__)
st = unicode(nxta)
if nvs.__class__ != nxta.__class__:
#logging.debug('nxta.name=%s' % nxta.name)
if nxta.name == 'table':
st = ''
if foundfirst:
done = True
if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
st = ''
done = True
if nxta.name == 'br':
if not foundfirst:
st = ''
else:
foundfirst = True
else:
foundfirst = True
sst = sst + st
nxta = nxta.nextSibling
# NOTE(review): sst starts as '' and is only ever extended, so it is never
# None; a page without the anchor returns an empty string instead of raising.
if sst is None:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return sst
class FPC_UnitTests(unittest.TestCase):
    """Live-site test; it needs network access.

    NOTE(review): the test drives ``FPCom`` with a fictionpress.com URL
    and never touches MediaMiner -- it looks copied from the FictionPress
    adapter.  Confirm that FPCom is importable from this module.
    """

    def setUp(self):
        """Enable debug logging so the adapter's trace output is visible."""
        logging.basicConfig(level=logging.DEBUG)

    def testFictionPress(self):
        """Title, author and a known sentence are extracted from the story."""
        story_url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        adapter = FPCom(story_url)
        chapter_urls = adapter.extractIndividualUrls()

        self.assertEquals('Behind This Facade', adapter.getStoryName())
        self.assertEquals('IntoxicatingMelody', adapter.getAuthorName())

        chapter_text = adapter.getText(story_url)
        self.assertTrue(chapter_text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
# Run this module's unit tests when the file is executed as a script.
if __name__ == '__main__':
unittest.main()

424
fanficdownloader/output.py Normal file
View file

@ -0,0 +1,424 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import unicodedata
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import zipdir
import html_constants
from constants import *
import html2text
import datetime
class FanficWriter:
    """Base class of the output writers; every hook here does nothing."""

    def __init__(self):
        """Nothing to set up in the base writer."""

    def writeChapter(self, index, title, text):
        """Hook for adding one chapter; the base writer ignores it."""

    def finalise(self):
        """Hook for completing the output; the base writer ignores it."""

    @staticmethod
    def getFormatName():
        """Return the name of the output format."""
        format_name = 'base'
        return format_name

    @staticmethod
    def getFormatExt():
        """Return the file extension of the output format."""
        format_ext = '.bse'
        return format_ext
class TextWriter(FanficWriter):
    """Plain-text writer.

    Chapters are collected by an in-memory HTMLWriter; finalise() converts
    the resulting HTML with html2text and writes it either to a StringIO
    (in-memory mode) or to a ``.txt`` file.
    """
    htmlWriter = None

    @staticmethod
    def getFormatName():
        """Return the name of the output format."""
        return 'text'

    @staticmethod
    def getFormatExt():
        """Return the file extension of the output format."""
        return '.txt'

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Create the writer; *compress* is accepted but not used here."""
        self.inmemory = inmemory
        # The HTML is always built in memory and never compressed.
        self.htmlWriter = HTMLWriter(base, adapter, True, False)

    def writeChapter(self, index, title, text):
        """Hand the chapter to the underlying HTML writer."""
        self.htmlWriter.writeChapter(index, title, text)

    def finalise(self):
        """Render the HTML, convert it to text and write the result."""
        self.htmlWriter.finalise()
        self.name=self.htmlWriter.name
        self.fileName = self.htmlWriter.fileName.replace(".html",".txt")

        if not self.inmemory:
            self.output = open(self.fileName, 'w')
        else:
            self.output = StringIO.StringIO()

        html = self.htmlWriter.output.getvalue().decode('utf-8')
        plain = html2text.html2text(html)
        self.output.write(plain.encode('utf-8'))

        if not self.inmemory:
            self.output.close()
class HTMLWriter(FanficWriter):
    """Single-file HTML writer.

    Chapters are appended to ``self.body``; finalise() fills the page
    template, runs the page through BeautifulSoup and writes it as UTF-8
    to a StringIO (in-memory mode) or to ``<base>/<name>.html``.
    """
    body = ''

    @staticmethod
    def getFormatName():
        """Return the name of the output format."""
        return 'html'

    @staticmethod
    def getFormatExt():
        """Return the file extension of the output format."""
        return '.html'

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Read title, name and author from *adapter* and open the output.

        *compress* is accepted but not used here.
        """
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
        self.authorName = removeEntities(adapter.getAuthorName())
        self.adapter = adapter
        self.inmemory = inmemory

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            # Replace an existing file of the same name.
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            self.output = open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def _printableVersion(self, text):
        """Return *text* decoded from UTF-8, or unchanged if that fails."""
        try:
            return text.decode('utf-8')
        except:
            return text

    def writeChapter(self, index, title, text):
        """Append the chapter heading and text to the page body."""
        title = self._printableVersion(title) #title.decode('utf-8')
        text = self._printableVersion(text) #text.decode('utf-8')
        heading = self.chapterStartTemplate.substitute({'chapter' : title})
        self.body = self.body + '\n' + heading
        self.body = self.body + '\n' + text

    def finalise(self):
        """Fill the page template and write the finished document."""
        values = {'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body}
        html = self.xhtmlTemplate.substitute(values)
        soup = bs.BeautifulSoup(html)
        result = soup.__str__('utf8')

        self.output.write(result)
        if not self.inmemory:
            self.output.close()
class EPubFanficWriter(FanficWriter):
chapters = []
files = {}
@staticmethod
def getFormatName():
return 'epub'
@staticmethod
def getFormatExt():
return '.epub'
def __init__(self, base, adapter, inmemory=False, compress=True):
# Read title, name and author from the adapter and start the epub by writing
# its three fixed members (mimetype, container.xml, stylesheet).
# `compress` is not used in this method.
self.basePath = base
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.directory = self.basePath + '/' + self.name
self.authorName = removeEntities(adapter.getAuthorName())
self.inmemory = inmemory
self.adapter = adapter
# Per-instance containers (the class-level ones would be shared).
self.files = {}
self.chapters = []
# The members are always built in memory; writeToFile records whether the
# caller originally asked for output on disk.
if not self.inmemory:
self.inmemory = True
self.writeToFile = True
else:
self.writeToFile = False
# NOTE(review): self.inmemory is always True at this point, so the directory
# set-up guarded by the next condition can never run.
if not self.inmemory:
if os.path.exists(self.directory):
shutil.rmtree(self.directory)
os.mkdir(self.directory)
os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS')
self._writeFile('mimetype', MIMETYPE)
self._writeFile('META-INF/container.xml', CONTAINER)
self._writeFile('OEBPS/stylesheet.css', CSS)
def _writeFile(self, fileName, data):
    """Append *data* to the epub member *fileName*, creating it on first use.

    Members are kept in ``self.files``: StringIO buffers in in-memory
    mode, UTF-8 text files below ``self.directory`` otherwise.  *data*
    is decoded from UTF-8 before it is written; if decoding raises
    UnicodeEncodeError the data is written unchanged.
    """
    #logging.debug('_writeFile(`%s`, data)' % fileName)
    if fileName not in self.files:
        if self.inmemory:
            self.files[fileName] = StringIO.StringIO()
        else:
            # The built-in open() of Python 2 takes no "encoding" argument;
            # codecs.open() returns a file that encodes unicode as UTF-8.
            self.files[fileName] = codecs.open(self.directory + '/' + fileName, mode='w', encoding='utf-8')

    try:
        d = data.decode('utf-8')
    except UnicodeEncodeError:
        # presumably data is already unicode with non-ASCII characters
        d = data

    self.files[fileName].write(d)
def _closeFiles(self):
if not self.inmemory:
for f in self.files:
self.files[f].close()
def writeChapter(self, index, title, text):
    """Clean up one chapter and add it to the epub.

    The chapter is stored as ``OEBPS/chapter<index>.xhtml`` (index padded
    to four digits).  Attributes that are not in acceptable_attributes
    are removed, ``<u>`` and ``<center>`` are turned into ``<span>`` and
    ``<div>`` with a class of the old tag name, and tags whose text is
    only whitespace are dropped.  The (title, file name) pair is recorded
    in ``self.chapters``.
    """
    title = removeEntities(title)
    logging.debug("Writing chapter: %s" % title)
    fileName = "chapter%04d.xhtml" % index
    fn = 'OEBPS/' + fileName

    text = removeEntities(text)

    # BeautifulStoneSoup doesn't have any selfClosingTags by default.
    # hr & br needs to be if they're going to work.
    # Some stories do use multiple br tags as their section breaks...
    self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))

    allTags = self.soup.findAll(recursive=True)
    for t in allTags:
        for attr in t._getAttrMap().keys():
            if attr not in acceptable_attributes:
                del t[attr]
        # these are not acceptable strict XHTML. But we do already have
        # CSS classes of the same names defined in constants.py
        # (compared with ==; "in ('u')" would be a substring test)
        if t.name == 'u':
            t['class'] = t.name
            t.name = 'span'
        if t.name == 'center':
            t['class'] = t.name
            t.name = 'div'
        # removes paired, but empty tags.
        if t.string is not None and len(t.string.strip()) == 0:
            t.extract()

    text = self.soup.__str__('utf8')

    # ffnet(& maybe others) gives the whole chapter text
    # as one line. This causes problems for nook(at
    # least) when the chapter size starts getting big
    # (200k+) Using Soup's prettify() messes up italics
    # and such. Done after soup extract so <p> and <br>
    # tags are normalized. Doing it here seems less evil
    # than hacking BeautifulSoup, but it's debatable.
    text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

    self._writeFile(fn, XHTML_START % (title, title))
    self._writeFile(fn, text)
    self._writeFile(fn, XHTML_END)

    self.chapters.append((title, fileName))
def finalise(self):
    """Write the TOC, title page and package file, then zip the book.

    Appends to ``OEBPS/toc.ncx``, ``OEBPS/title_page.xhtml`` and
    ``OEBPS/content.opf`` via _writeFile(), builds the archive with
    zipdir.inMemoryZip() and either writes it to
    ``self.directory + '.epub'`` (``self.writeToFile``) or leaves the
    buffer in ``self.output``.
    """
    logging.debug("Finalising...")
    adapter = self.adapter

    ### writing table of contents -- ncx file
    tocFilePath = "OEBPS/toc.ncx"
    self._writeFile(tocFilePath, TOC_START % (adapter.getUUID(), self.storyTitle))

    published = adapter.getStoryPublished().strftime("%Y-%m-%d")
    createda = adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
    created = adapter.getStoryCreated().strftime("%Y-%m-%d")
    updated = adapter.getStoryUpdated().strftime("%Y-%m-%d")
    calibre = adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")

    description = adapter.getStoryDescription()
    if description is None:
        # unicode(None) would yield the literal text u'None'; treat a
        # missing description as empty so the blank fallback applies.
        description = u''
    elif hasattr(description, "text"):
        description = unicode(description.text)
    else:
        description = unicode(description)
    if len(description) > 0:
        # Undo backslash-escaped quotes before normalising entities.
        description = description.replace('\\\'', '\'').replace('\\\"', '\"')
        description = removeEntities(description)
    else:
        description = ' '

    ### writing content -- title page
    titleFilePath = "OEBPS/title_page.xhtml"
    self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, adapter.getStoryURL(), self.storyTitle, adapter.getAuthorURL(), self.authorName))
    titleEntries = (
        ('Category:', adapter.getCategory()),
        ('Genre:', adapter.getGenre()),
        ('Status:', adapter.getStoryStatus()),
        ('Published:', published),
        ('Updated:', updated),
        ('Packaged:', createda),
        ('Rating Age/User:', adapter.getStoryRating() + " / " + adapter.getStoryUserRating()),
        ('Chapters/Words:', unicode(adapter.getNumChapters()) + " / " + unicode(adapter.getNumWords())),
        ('Publisher:', adapter.getHost()),
        ('Story ID:', adapter.getStoryId()),
        ('Author ID:', adapter.getAuthorId()),
    )
    for label, value in titleEntries:
        self._writeFile(titleFilePath, TITLE_ENTRY % (label, value))
    self._writeFile(titleFilePath, TITLE_FOOTER % description)

    ### writing content -- opf file
    opfFilePath = "OEBPS/content.opf"
    self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, adapter.getLanguageId(), published, created, updated, calibre, description))

    wroteSubject = False
    for subj in adapter.getSubjects():
        self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
        wroteSubject = True
    if not wroteSubject:
        # Always emit at least one subject.
        self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
    self._writeFile(opfFilePath, CONTENT_END_METADATA % (adapter.getPublisher(), adapter.getUUID(), adapter.getStoryURL(), adapter.getStoryURL(), adapter.getStoryUserRating()))

    # (id, title, file) for the title page followed by every chapter;
    # the position in this list is the number passed to TOC_ITEM.
    # NOTE(review): the id "Title Page" contains a space; if the
    # templates use it as an XML id attribute that is not a valid
    # value -- confirm against constants.py.
    items = [("Title Page", "Title Page", "title_page.xhtml")]
    for number, (chapTitle, chapFile) in enumerate(self.chapters):
        items.append(("chapter%04d" % (number + 1), chapTitle, chapFile))

    for position, (itemId, itemTitle, itemFile) in enumerate(items):
        self._writeFile(tocFilePath, TOC_ITEM % (itemId, position, itemTitle, itemFile))
        self._writeFile(opfFilePath, CONTENT_ITEM % (itemId, itemFile))

    self._writeFile(tocFilePath, TOC_END)
    self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

    for itemId, _itemTitle, _itemFile in items:
        self._writeFile(opfFilePath, CONTENT_ITEMREF % itemId)

    self._writeFile(opfFilePath, CONTENT_END)

    self._closeFiles()

    filename = self.directory + self.getFormatExt()
    zipdata = zipdir.inMemoryZip(self.files)

    if self.writeToFile:
        f = open(filename, 'wb')
        try:
            f.write(zipdata.getvalue())
        finally:
            # Release the handle even when the write fails.
            f.close()
    else:
        self.output = zipdata
# Numeric character references: decimal (&#233;) or hexadecimal (&#xe9;).
# Group 1 is the optional 'x' marker, group 2 the digits.  The original
# pattern only allowed \d after the marker, so hexadecimal references
# containing a-f were never replaced.
_NUMBER_ENTITY_RE = re.compile(r'&#(x?)([0-9a-fA-F]+);')


def unirepl(match):
    """Return the unicode character for a numeric entity match.

    ``match`` exposes the optional ``x`` marker as group 1 and the
    digits as group 2.  A reference that cannot be converted (hex
    letters in a decimal reference, or a code point unichr() rejects)
    is returned unchanged instead of raising.
    """
    if match.group(1) == 'x':
        radix = 16
    else:
        radix = 10
    try:
        return unichr(int(match.group(2), radix))
    except (ValueError, OverflowError):
        return match.group(0)


def replaceNumberEntities(data):
    """Replace every numeric character reference in ``data``."""
    return _NUMBER_ENTITY_RE.sub(unirepl, data)
def removeEntities(text):
    """Normalise the entities in ``text`` for use in XHTML.

    Numeric and named entities are replaced by their characters; only
    ``&amp;``, ``&lt;`` and ``&gt;`` remain as entities in the result.
    """
    try:
        normalised = text.decode('utf-8')
    except UnicodeEncodeError:
        try:
            normalised = text.encode('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            normalised = text
    text = normalised

    # replace numeric versions of [&<>] with named versions.
    for codepoint, named in (('38', '&amp;'), ('60', '&lt;'), ('62', '&gt;')):
        text = re.sub(r'&#0*' + codepoint + ';', named, text)

    # replace remaining &#000; entities with unicode value, such as &#039; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    # reverse sort will put entities with ; before the same one without, when valid.
    for name in sorted(entities.keys(), reverse=True):
        replacement = entities[name]
        try:
            text = text.replace(name, replacement)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py
            text = text.replace(name, replacement.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
    text = text.replace('&', '&amp;')
    text = text.replace('&amp;lt;', '&lt;')
    text = text.replace('&amp;gt;', '&gt;')
    return text
def makeAcceptableFilename(text):
    """Return ``text`` reduced to characters that are safe in a file name.

    Entities are normalised, spaces and colons become underscores and
    anything outside ``[a-zA-Z0-9_-]`` is dropped.
    """
    cleaned = removeEntities(text)
    cleaned = cleaned.replace(" ", "_").replace(":", "_")
    return re.sub('[^a-zA-Z0-9_-]+', '', cleaned)

View file

@ -0,0 +1,367 @@
# -*- coding: utf-8 -*-
# Copied from the twilighted.py because site is almost the same..
# of course, now that we're trying to scrape more detail about the
# story, there were differences in how headers are displayed
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
class PotionsNSnitches(FanfictionSiteAdapter):
def __init__(self, url):
    """Set adapter defaults and normalise ``url`` to ``...?sid=<id>``.

    The story id is taken from a leading ``sid=`` query parameter
    ('0' when absent).  ``self.chapurl`` records whether the second
    query parameter was ``chapter=``.
    """
    parsedUrl = up.urlparse(url)
    self.url = url
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    # Credentials.
    self.password = ''
    self.login = 'sigizmund'

    # Story metadata defaults, overwritten while scraping.
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.authorURL = ''
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction', 'Harry Potter']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-pns_'

    self.chapurl = False
    pieces = self.url.split('?')
    if len(pieces) > 1:
        params = pieces[1].replace('&amp;', '&').split('&')
        first = params[0].split('=')
        if len(first) > 1 and first[0] == 'sid':
            self.storyId = first[1]
        if len(params) > 1:
            second = params[1].split('=')
            if len(second) > 1 and second[0] == 'chapter':
                self.chapurl = True

    self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
    logging.debug('self.url=%s' % self.url)
    logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
def _getLoginScript(self):
    """Return the site-relative path of the login form handler."""
    return '/user.php?action=login'
def reqLoginData(self, data):
    """Return True when ``data`` contains a login-required message.

    Two messages are recognised: the "registered users only" prompt and
    the "no such account" error.
    """
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for marker in markers:
        if marker in data:
            return True
    return False
def _fillCharacters(self, strlist, idx, maxlen):
    """Register character names found in ``strlist`` from ``idx`` on.

    Blank fragments and fragments containing a comma are skipped.
    Scanning stops at the first fragment containing ':' (the next
    label) and returns the index just before it; otherwise the index
    reached (``maxlen`` or beyond the start) is returned.
    """
    pos = idx
    while pos < maxlen:
        name = strlist[pos].strip()
        if name:
            if ':' in name:
                return pos - 1
            if ',' not in name:
                self.addCharacter(name)
        pos += 1
    return pos
def _buildGenre(self, strlist, idx, maxlen):
    """Rebuild ``self.genre`` from the fragments starting at ``idx``.

    Each non-blank fragment is added as a subject and appended to the
    genre string; fragments containing a comma are appended as ', '.
    Scanning stops at the first fragment containing ':' and returns the
    index just before it; otherwise the index reached is returned.
    """
    self.genre = ''
    pos = idx
    while pos < maxlen:
        piece = strlist[pos].strip()
        if piece:
            if ':' in piece:
                return pos - 1
            if ',' in piece:
                piece = ', '
            else:
                self.addSubject(piece)
            self.genre = self.genre + piece
        pos += 1
    return pos
def _buildCategory(self, strlist, idx, maxlen):
    """Rebuild ``self.category`` from the fragments starting at ``idx``.

    Each non-blank fragment is added as a subject and appended to the
    category string; fragments containing a comma are appended as ', '.
    Scanning stops at the first fragment containing ':' and returns the
    index just before it; otherwise the index reached is returned.
    """
    self.category = ''
    pos = idx
    while pos < maxlen:
        piece = strlist[pos].strip()
        if piece:
            if ':' in piece:
                return pos - 1
            if ',' in piece:
                piece = ', '
            else:
                self.addSubject(piece)
            self.category = self.category + piece
        pos += 1
    return pos
def extractIndividualUrls(self):
# Fetch the story's first chapter (logging in when the site asks for it),
# build the list of (chapter url, chapter title) tuples, then scrape the
# story metadata from the "&index=1" page into attributes on self.
# Returns the list of (url, title) tuples.
# Raises FailedToDownload when the login fails or the page cannot be
# parsed, and StoryDoesNotExist from the "data is None" guards.
# NOTE(review): indentation is not visible in this listing, so the
# comments below describe the apparent flow only.
url = self.url + '&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
# NOTE(review): data is '' (not None) after a failed read, so this guard
# looks unreachable -- confirm.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
# The page shows a login / unknown-account message: log in and fetch again.
if self.reqLoginData(data):
self.performLogin()
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
# Story and author names come from the page <title>, split on ' by '.
self.storyName = ''
self.authorName = ''
# NOTE(review): this resets the story id that __init__ parsed from the
# URL; it is read again from the index page further down.
self.storyId = '0'
title = soup.find('title').string
if title is not None and len(title) > 0:
logging.debug('Title: %s' % title)
ss = title.split(' by ')
if ss is not None and len(ss) > 1:
self.storyName = ss[0].strip()
self.authorName = ss[1].strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
# The <select name="chapter"> element lists one <option> per chapter.
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
result.append((url,chaptitle.string))
else:
result.append((url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
# Second request: the index page is the source of the metadata below.
url = self.url + "&index=1"
data = self.opener.open(url).read()
# NOTE(review): 'lines' is not used again in this method.
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
# Links inside div#pagetitle give the story id/name and the author
# name/URL/id.
pgt = soup.find('div', {'id' : 'pagetitle'})
#logging.debug('pagetitle: %s' % pgt)
pgtAs = pgt.findAll('a')
#logging.debug('pgtAs: %s' % pgtAs)
for a in pgtAs:
if a['href'].find('viewstory.php') != -1:
(u1, self.storyId) = a['href'].split('=')
self.storyName = a.string
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
self.authorURL = 'http://' + self.host + '/' + a['href']
(u1, self.authorId) = a['href'].split('=')
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
# The summary is the text fragment that follows 'Summary:' in div#output
# once the markup has been split on tags.
output = soup.find('div', {'id' : 'output'})
#logging.debug('output: %s' % unicode(output))
if output is not None and len(unicode(output)) > 1:
s2 = re.split ('<[^>]+>', unicode(output))
#logging.debug('s2=%s' % s2)
ii = 0
ll = len(s2)
while ii < ll:
if s2[ii] == 'Summary:' and ii+1 < ll:
self.storyDescription = s2[ii+1].strip()
logging.debug('self.storyDescription: %s' % self.storyDescription)
break;
ii = ii+1
# Each table cell of div.content is split on tags into fragments; a
# fragment containing ':' is treated as a label and the following
# fragment(s) as its value.
cnt = soup.find('div', {'class' : 'content'})
#logging.debug('content: %s' % cnt)
cnttd = cnt.findAll('td')
#logging.debug('cnttd: %s' % cnttd)
for td in cnttd:
#logging.debug('td: %s' % unicode(td))
ss = unicode(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
if len(ss) > 1:
s2 = re.split ('<[^>]+>', ss)
#logging.debug('s2=%s' % s2)
ii = 0
ll = len(s2)
while ii < ll-1:
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
skey = s2[ii].strip()
ii = ii+1
if skey == 'Rated:':
self.storyRating = s2[ii].strip()
logging.debug('self.storyRating=%s' % self.storyRating)
ii = ii + 1
elif skey == 'Chapters:':
self.numChapters = s2[ii].strip()
logging.debug('self.numChapters=%s' % self.numChapters)
ii = ii + 1
elif skey == 'Characters:':
ii = self._fillCharacters(s2, ii, ll)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
ii = ii + 1
elif skey == 'Genres:':
ii = self._buildGenre(s2, ii, ll)
logging.debug('self.genre=%s' % self.genre)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Categories:':
ii = self._buildCategory(s2, ii, ll)
logging.debug('self.category=%s' % self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Completed:':
if s2[ii].strip(' ') == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii + 1
elif skey == 'Word count:':
self.numWords = s2[ii].strip()
if self.numWords is None or len(self.numWords) == 0:
self.numWords = '0'
logging.debug('self.numWords=%s' % self.numWords)
ii = ii + 1
# The remaining labels are recognised but their values are skipped.
elif skey == 'Takes Place:':
ii = ii + 1
elif skey == 'Awards:':
ii = ii + 1
elif skey == 'Series:':
ii = ii + 1
elif skey == 'Read:':
ii = ii + 1
elif skey == 'Warnings:':
ii = ii + 1
else:
ii = ii + 1
# Published / Updated dates are read from the centred div(s) and parsed
# with the "%b %d %Y" format.
tls = soup.findAll('div', {'style' : 'text-align: center;'})
for tl in tls:
#logging.debug('tl: %s' % tl)
ss = unicode(tl).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
if ss.find('Published:') != -1:
s2 = re.split ('<[^>]+>', ss)
#logging.debug('s2: %s' % s2)
ii = 0
ll = len(s2)
while ii < ll-1:
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
skey = s2[ii].strip()
#logging.debug('skey: %s' % skey)
ii = ii+1
if skey == 'Published:':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
ii = ii + 1
elif skey == 'Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
ii = ii + 1
else:
ii = ii + 1
# NOTE(review): exit(1) ends the whole process from inside library code;
# raising an exception may be more appropriate -- confirm with callers.
if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
logging.error('self.storyName is empty!! Exitting!')
exit(1)
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
# need to do this, because for some reason the <br /> tag in the story causes problems
data = data.replace('<br />', ' SOMETHING_BR ')
soup = None
try:
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# put the <br /> tags back in..
text = div.__str__('utf8').replace(' SOMETHING_BR ','<br />')
return text
class PotionsNSnitches_UnitTests(unittest.TestCase):
    """Smoke tests for the PotionsNSnitches adapter (they use the live site)."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        # Placeholder: no login test is implemented for this site.
        pass

    def testGetUrlsWorks(self):
        url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
        # The original instantiated Twilighted here, which is a different
        # adapter and is not defined in this module.
        self.assertEquals(32, len(PotionsNSnitches(url).extractIndividualUrls()))


if __name__ == '__main__':
    unittest.main()

View file

@ -0,0 +1,10 @@
To use, do:
python downloader.py <url> (epub|html)
Eg:
python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo epub
This tool uses Python 2.5.2, but should work with newer versions.

View file

@ -0,0 +1,316 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
import twipassword
class Twilighted(FanfictionSiteAdapter):
def __init__(self, url):
    """Set adapter defaults and normalise ``url`` to ``...?sid=<id>``.

    The story id is taken from a leading ``sid=`` query parameter
    ('0' when absent).  ``self.chapurl`` records whether the second
    query parameter was ``chapter=``.
    """
    self.url = url
    parsedUrl = up.urlparse(url)
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    # Credentials; the password comes from the twipassword module.
    self.password = twipassword.password
    self.login = 'sigizmund'

    # Story metadata defaults, overwritten while scraping.
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.authorURL = ''
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = []
    self.subjects.append('fanfiction')
    self.subjects.append('Twilight')
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = ''
    self.category = 'Fanfiction'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-tw_'

    self.chapurl = False
    ss = self.url.split('?')
    logging.debug('ss=%s' % ss)
    if len(ss) > 1:
        sss = ss[1].replace('&amp;', '&').split('&')
        logging.debug('sss=%s' % sss)
        ssss = sss[0].split('=')
        logging.debug('ssss=%s' % ssss)
        if len(ssss) > 1 and ssss[0] == 'sid':
            self.storyId = ssss[1]
        if len(sss) > 1:
            ssss = sss[1].split('=')
            logging.debug('ssss=%s' % ssss)
            if len(ssss) > 1 and ssss[0] == 'chapter':
                self.chapurl = True

    # urlparse() paths already start with '/', so adding another
    # separator (as the original did) produced 'http://host//path'.
    # Only add one when the path does not supply it.
    path = self.path
    if not path.startswith('/'):
        path = '/' + path
    self.url = 'http://' + self.host + path + '?sid=' + self.storyId
    logging.debug('self.url=%s' % self.url)

    logging.debug("Created Twilighted: url=%s" % (self.url))
def _getLoginScript(self):
    """Return the site-relative path of the login form handler."""
    return '/user.php?action=login'
def reqLoginData(self, data):
    """Return True when ``data`` contains a login-required message.

    Two messages are recognised: the "registered users only" prompt and
    the "no such account" error.
    """
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for marker in markers:
        if marker in data:
            return True
    return False
def requiresLogin(self, url = None):
    """Return True: this adapter always reports that a login is needed."""
    return True
def performLogin(self, url = None):
    """Post the stored credentials to the login script.

    Returns True when the response does not contain one of the
    login-required messages recognised by reqLoginData(), else False.
    ``url`` is accepted but not used.
    """
    form = {}
    form['penname'] = self.login
    form['password'] = self.password
    form['cookiecheck'] = '1'
    form['submit'] = 'Submit'
    payload = u.urlencode(form)

    loginUrl = 'http://' + self.host + self._getLoginScript()
    logging.debug("Will now login to URL %s" % loginUrl)

    response = self.opener.open(loginUrl, payload)
    page = response.read().decode('utf-8')
    return not self.reqLoginData(page)
def extractIndividualUrls(self):
# Fetch the story's first chapter (logging in when the site asks for it),
# build the list of (chapter url, chapter title) tuples, then scrape the
# story metadata from the "&index=1" page into attributes on self.
# Returns the list of (url, title) tuples.
# Raises FailedToDownload when the login fails or the page cannot be
# parsed, and StoryDoesNotExist from the "data is None" guards.
# NOTE(review): indentation is not visible in this listing, so the
# comments below describe the apparent flow only.
url = self.url + '&chapter=1'
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
# NOTE(review): data is '' (not None) after a failed read, so this guard
# looks unreachable -- confirm.
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
# The page shows a login / unknown-account message: log in and fetch again.
if self.reqLoginData(data):
self.performLogin()
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
if self.reqLoginData(data):
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data)
except:
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
# Story and author names come from the page <title>, split on ' by '.
# NOTE(review): a title without ' by ' makes the [1] lookup raise
# IndexError.
title = soup.find('title').string
logging.debug('Title: %s' % title)
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
# The <select name="chapter"> element lists one <option> per chapter.
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
# Second request: the index page is the source of the metadata below.
url = self.url + "&index=1"
data = self.opener.open(url).read()
# NOTE(review): 'lines' is not used again in this method.
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
# The description is the first <p> found in the content attribute of a
# <meta> tag whose name contains 'description'.
metas = soup.findAll('meta')
for meta in metas:
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
#logging.debug('Meta: %s' % meta)
if 'content' in meta._getAttrMap():
s1 = bs.BeautifulStoneSoup(meta['content'])
ps = s1.findAll('p')
if len(ps) > 0:
self.storyDescription = ps[0]
logging.debug('self.storyDescription=%s' % (self.storyDescription))
else:
# The <div> elements searched here are those found below the <meta> tag.
divs = meta.findAll('div')
#logging.debug('Divs: %s' % divs)
for div in divs:
#logging.debug('Div: %s' % div)
# div#pagetitle: links give the story id and the author id / URL.
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
#logging.debug('Div PAGETITLE: %s' % div)
allA = div.findAll('a')
for a in allA:
if 'href' in a._getAttrMap():
if a['href'].find('viewstory.php?sid=') != -1:
str1 = a.string
(vs, self.storyId) = a['href'].split('=')
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
if a['href'].find('viewuser.php?uid=') != -1:
str1 = a.string
(vs, self.authorId) = a['href'].split('=')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
logging.debug('self.authorURL=%s' % self.authorURL)
# div.content: the markup of each <br> element is split on tags into
# label / value fragments, scanned from index 2.
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
#logging.debug('Div CONTENT: %s' % div)
brs = div.findAll('br')
for br in brs:
buf = unicode(br).encode('utf-8')
strs = re.split ('<[^>]+>', buf)
#logging.debug('BUF: %s' % strs)
ii = 2
stlen = len(strs)
while stlen > ii+1:
if len(strs[ii]) == 0:
ii = ii+1
continue
if strs[ii] == 'Categories:':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
if len(self.genre) > 0:
self.genre = self.genre + ', '
# NOTE(review): this assignment replaces the genre string (including
# the ', ' just appended) instead of extending it -- confirm intent.
self.genre = strs[ii].strip(' ')
if len(self.category) == 0:
self.category = strs[ii].strip(' ')
self.addSubject(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.subjects=%s' % self.subjects)
if strs[ii] == 'Characters: ':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
self.addCharacter(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif strs[ii] == 'Completed:':
if strs[ii+1].strip(' ') == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii+2
logging.debug('self.storyStatus=%s' % self.storyStatus)
elif strs[ii] == 'Rated:':
self.storyRating = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.storyRating=%s' % self.storyRating)
elif strs[ii] == 'Series:':
self.storySeries = strs[ii+1].strip(' ')
if self.storySeries == 'None':
self.storySeries = ''
ii = ii+2
logging.debug('self.storySeries=%s' % self.storySeries)
elif strs[ii] == 'Chapters: ':
self.numChapters = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numChapters=%s' % self.numChapters)
elif strs[ii] == 'Word count:':
self.numWords = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numWords=%s' % self.numWords)
# Dates on this site are parsed with the "%B %d, %Y" format.
elif strs[ii] == ' Published: ':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif strs[ii] == 'Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
ii = ii+2
return result
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = ''
try:
data = self.opener.open(url).read()
except Exception, e:
data = ''
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
if data is None:
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
soup = None
try:
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
except:
logging.info("Failed to decode: <%s>" % data)
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
div = soup.find('div', {'id' : 'story'})
if None == div:
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return div.__str__('utf8')
class Twilighted_UnitTests(unittest.TestCase):
    """Smoke tests for the Twilighted adapter (they use the live site)."""

    STORY_URL = 'http://www.twilighted.net/viewstory.php?sid=10004'

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        adapter = Twilighted(self.STORY_URL)
        self.assertTrue(adapter.performLogin())

    def testGetUrlsWorks(self):
        adapter = Twilighted(self.STORY_URL)
        chapters = adapter.extractIndividualUrls()
        self.assertEquals(32, len(chapters))


if __name__ == '__main__':
    unittest.main()

View file

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# This is really for the web version. downalod.py will ask.
# NOTE(review): this looks like a placeholder credential kept in source
# control; presumably the real site password is substituted at
# deployment -- confirm, and avoid committing the real value.
password='somepass'

177
fanficdownloader/zipdir.py Normal file
View file

@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
import sys
import os
import zlib
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from contextlib import closing
import logging
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from datetime import timedelta
import StringIO
class InvalidEPub(Exception):
    """Error type for files that are not valid EPUB archives."""
def checkNewer(filename, curdte):
    """Report whether the EPUB at ``filename`` needs to be regenerated.

    filename -- path of a previously written EPUB.
    curdte   -- current "updated" date of the story (anything with
                strftime()).

    Returns False only when the EPUB records a last-update date that is
    on or after ``curdte``.  Returns True when the file is missing, is
    not an EPUB, has no ``OEBPS/content.opf``, or records no date.
    """
    if not os.path.isfile(filename):
        logging.debug('File %s does not already exist.' % filename)
        return True

    titleFilePath = "OEBPS/title_page.xhtml"
    contentFilePath = "OEBPS/content.opf"
    lastdate = None

    # Hand ZipFile the path so that it owns the file handle and closes
    # it; a file object passed in by the caller is left open by close().
    with closing(ZipFile(filename, 'r')) as epub:
        namelist = set(epub.namelist())
        if 'mimetype' not in namelist or \
           'META-INF/container.xml' not in namelist:
            logging.debug('File %s is not a valid EPub format file.' % filename)
            return True

        if contentFilePath not in namelist:
            # No package file to read a date from: treat as outdated.
            return True

        soup = bs.BeautifulStoneSoup(epub.read(contentFilePath))
        lstdte = soup.find('dc:date', {'opf:event' : 'modification'})
        if lstdte is not None:
            if lstdte.string is not None:
                lastdate = lstdte.string.strip(' ')
        elif titleFilePath in namelist:
            # Fall back to the "Updated:" row of the title page: the
            # label sits in a <b> and the value in the following cell.
            soup = bs.BeautifulStoneSoup(epub.read(titleFilePath))
            fld = ''
            for td in soup.findAll('td'):
                b = td.find('b')
                if b is not None:
                    fld = b.string
                if td.string is not None and fld == "Updated:":
                    lastdate = td.string
        # Otherwise neither source has a date; the original code
        # dereferenced ``lstdte.string`` here and raised AttributeError.

    ret = True
    if lastdate is not None:
        # Compare at day granularity.
        currUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(curdte.strftime('%Y-%m-%d'), "%Y-%m-%d")))
        storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(lastdate, "%Y-%m-%d")))
        logging.debug('File %s last update date is %s, comparing to %s' % (filename, storyUpdated, currUpdated))
        if currUpdated <= storyUpdated:
            ret = False

    logging.debug("Does %s need to be updated? %s" % (filename, ret))
    return ret
def toZip(filename, directory):
    """Zip the contents of ``directory`` into the archive ``filename``.

    Entries whose name starts with '.' are skipped.  Files are stored
    under their own name; sub-directories are added through
    addFolderToZip().  A ``mimetype`` file is written first and
    uncompressed, as the epub standard requires.
    """
    names = [n for n in sorted(os.listdir(directory)) if not n.startswith('.')]
    # os.listdir() order is arbitrary, so move mimetype to the front
    # explicitly instead of relying on where it happens to appear.
    if 'mimetype' in names:
        names.remove('mimetype')
        names.insert(0, 'mimetype')

    zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
    try:
        for entity in names:
            each = os.path.join(directory, entity)
            logging.debug('Adding %s to %s' % (each, filename))
            if os.path.isfile(each):
                if entity == 'mimetype':
                    zippedHelp.write(each, arcname=entity, compress_type=zipfile.ZIP_STORED)
                else:
                    zippedHelp.write(each, arcname=entity)
            else:
                addFolderToZip(zippedHelp, entity, each)
    finally:
        # Always finish the archive, even when adding an entry fails.
        zippedHelp.close()
def addFolderToZip(zippedHelp,folder,fpath):
    """Add the files below ``fpath`` to the open archive ``zippedHelp``.

    zippedHelp -- open zipfile.ZipFile to write into.
    folder     -- archive path under which the entries are stored.
    fpath      -- directory on disk to read from.

    Sub-directories are added recursively.  The original tested
    ``os.path.isdir(f)`` against the bare entry name (i.e. relative to
    the current working directory) and recursed with only two of the
    three arguments, so nested directories were skipped or raised
    TypeError.
    """
    if folder == '.' or folder == '..':
        return
    for f in os.listdir(fpath):
        child = fpath + '/' + f
        arcname = folder + '/' + f
        if os.path.isfile(child):
            zippedHelp.write(child, arcname, zipfile.ZIP_DEFLATED)
        elif os.path.isdir(child):
            addFolderToZip(zippedHelp, arcname, child)
def inMemoryZip(files):
    """Build a ZIP archive in memory and return the buffer holding it.

    files -- dict mapping archive path to content; the content is
             either a string or an object with getvalue() (such as a
             StringIO buffer).

    A ``mimetype`` entry, when present, is written first and
    uncompressed; every other entry is deflated.  ``files`` itself is
    left untouched (the original deleted its ``mimetype`` key).
    """
    def _content(value):
        # Buffers give up their text through getvalue(); plain strings
        # are used directly.
        if hasattr(value, 'getvalue'):
            value = value.getvalue()
        # Only unicode text needs encoding.  Byte strings are stored as
        # they are: calling encode() on non-ASCII bytes raised
        # UnicodeDecodeError, and the original fallback simply repeated
        # the same failing call.
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        return value

    buf = StringIO.StringIO()

    if 'mimetype' in files:
        # This fixes the uncompressed mimetype-first issue by opening
        # the in memory file as STORE, putting in the mimetype, then
        # closing and re-opening with DEFLATED.  While it is often
        # true that mimetype is the first file, we can't assume it,
        # because the dict object is defined as unordered.
        memzip = zipfile.ZipFile(buf, 'a', compression=zipfile.ZIP_STORED)
        memzip.debug = 3
        logging.debug("Writing ZIP path %s" % 'mimetype')
        memzip.writestr('mimetype', _content(files['mimetype']))
        memzip.close()

    # open in 'a' append mode.
    memzip = zipfile.ZipFile(buf, 'a', compression=zipfile.ZIP_DEFLATED)
    memzip.debug = 3

    for path in files:
        if path == 'mimetype':
            # Already written above.
            continue
        logging.debug("Writing ZIP path %s" % path)
        memzip.writestr(path, _content(files[path]))

    # declares all the files created by Windows.
    for zf in memzip.filelist:
        zf.create_system = 0

    memzip.close()

    return buf
if __name__ == '__main__':
    # Manual smoke test: build a small archive in memory and save it as
    # res.zip in the current directory.
    files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
    data = inMemoryZip(files)
    # inMemoryZip() returns a buffer, so its bytes have to be extracted
    # with getvalue(); the original passed the buffer object itself to
    # write().  ZIP data is binary, hence 'wb'.
    f = open('res.zip', 'wb')
    try:
        f.write(data.getvalue())
    finally:
        f.close()

21
ffstorage.py Normal file
View file

@ -0,0 +1,21 @@
from google.appengine.ext import db
class OneDownload(db.Model):
    """Datastore model; presumably one entity per requested download."""

    user = db.UserProperty()
    url = db.StringProperty()
    format = db.StringProperty()
    login = db.StringProperty()
    # NOTE(review): declared as a plain string property -- confirm
    # whether site passwords really need to be persisted.
    password = db.StringProperty()
    failure = db.StringProperty()
    # Set automatically when the entity is first stored.
    date = db.DateTimeProperty(auto_now_add=True)
class DownloadedFanfic(db.Model):
    """Datastore record holding one generated story file and its metadata."""

    # Owner and origin of the story.
    user = db.UserProperty()
    url = db.StringProperty()

    # Display metadata.
    name = db.StringProperty()
    author = db.StringProperty()

    # Output format identifier chosen for the download.
    format = db.StringProperty()

    # Filled in automatically when the entity is first stored.
    date = db.DateTimeProperty(auto_now_add=True)

    # The generated file contents.
    blob = db.BlobProperty()

    mac = db.StringProperty()

    # False by default for newly stored entities.
    cleared = db.BooleanProperty(default=False)

109
index-ajax.html Normal file
View file

@ -0,0 +1,109 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="css/index.css" rel="stylesheet" type="text/css">
<link type="text/css" href="http://jqueryui.com/latest/themes/base/ui.all.css" rel="stylesheet" />
<title>Fanfiction Downloader (fanfiction.net, fictionalley, ficwad to epub and HTML)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<script src="/js/jquery-1.3.2.js"></script>
<script src="/js/fdownloader.js"></script>
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.core.js"></script>
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.progressbar.js"></script>
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<!-- <form action="/fdown" method="post"> -->
<div id='urlbox'>
<div id='greeting'>
Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites much easier. Please paste a URL of the <em>first chapter</em> in the box to start. Alternatively, see your personal list of <a href="/recent">previously downloaded fanfics</a>.
</div>
<input type="text" id='url' name="url" size="50" value='{{ url }}'>
<div style="margin-top: 0.5em;">
Ebook format &nbsp;<select name="format" id="format">
<option value='epub'>ePub</option>
<option value='html'>HTML</option>
</select>
</div>
<div id='error' style='color: red'>
</div>
</div>
<div id='yourfile' style='display:none'>
</div>
<div id='typebox'>
</div>
<h3>
Login and Password
</h3>
<div id='logpassword'>
If the story requires a login and password to download (e.g. marked as Mature on FFA), you may need to provide your credentials to download it, otherwise just leave it empty
</div>
<div id='logpasswordtable'>
<div class='fieldandlabel'>
<div class='label'>Login</div>
<div class='field'><input type='text' name='login' id='login' size='50'></div>
</div>
<div class='fieldandlabel'>
<div class='label'>Password</div>
<div class='field'><input type='password' id='password' name='password' size='50'></div>
</div>
</div>
<div id='submitbtn'>
<span id='submit_button'><button onclick='downloadFanfic();'>Download</button></span>
<span id='ajax_loader' style='display:none'><img src="/static/ajax-loader.gif"></span>
</div>
<div id="progressbar">
</div>
<div id='helpbox'>
A few things to know, which will make your life substantially easier:
<ol>
<li>Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a> &mdash; how to read fiction in Stanza or any other ebook reader.</li>
<li>Currently we support fanfiction.net, fictionpress.com, fanficauthors.net and ficwad.com</li>
<li>Paste a URL of the first chapter of the fanfic, not the index page</li>
<li>Fics with a single chapter are not supported (you can just copy and paste it)</li>
<li>Stories which are too long may not be downloaded correctly and the application will report a time-out error &mdash; this is a limitation which is currently imposed by Google AppEngine on long-running activities</li>
<li>FicWad support is somewhat flaky &mdash; if you feel it doesn't work for you, send all the details to me</li>
<li>You can download fanfics and store them for 'later' by just downloading them and visiting <a href="/recent">recent downloads</a> section, but in future they will be deleted after 5 days to save the space</li>
<li>If Downloader simply opens a download file window rather than saves the fanfic and gives you a link, it means it is too large to save in the database and you need to download it straight away</li>
<li>If you think that something that should work in fact doesn't, drop me a mail to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a></li>
</ol>
Otherwise, just have fun, and if you want to say thank you &mdash; use the email above.
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
Copyright &copy; <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
</div>
<!-- </form> -->
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12136939-1");
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>

204
index.html Normal file
View file

@ -0,0 +1,204 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader &mdash; twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org to epub and HTML to Stanza, Kindle, Nook, Sony Reader</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<div style="text-align: center">
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* FFD */
google_ad_slot = "7330682770";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<!-- <div id='yourfile'> -->
{{yourfile}}
<!-- </div> -->
{% if authorized %}
<form action="/fdown" method="post">
<div id='urlbox'>
<div id='greeting'>
<p>Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites
much easier. </p>
<p>To support new features, such as including story summaries,
the URL you need to use for some sites has changed. See below for example URLs for each site. </p>
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
</div>
<div id='error'>
{{ error_message }}
</div>
<input type="text" name="url" size="50" value='{{ url }}'>
</div>
<div id='typebox'>
<div id='typelabel'>Ebook format</div>
<div id='typeoptions'>
<input type='radio' name='format' value='epub' checked>EPub</input>
<input type='radio' name='format' value='html'>HTML</input>
<input type='radio' name='format' value='text'>Plain Text</input>
</div>
</div>
<div id='logpasswordtable'>
<h3>Login and Password</h3>
<div id='logpassword'>
If the story requires a login and password to download (e.g. marked as Mature on FFA), you may need to provide
your credentials to download it, otherwise just leave it empty
</div>
<div class='fieldandlabel'>
<div class='label'>Login</div>
<div class='field'><input type='text' name='login' size='50'></div>
</div>
<div class='fieldandlabel'>
<div class='label'>Password</div>
<div class='field'><input type='password' name='password' size='50'></div>
</div>
</div>
<div id='submitbtn'>
<input type="submit" value="Download">
</div>
</form>
{% else %}
<div id='urlbox'>
<div id='greeting'>
<p>
This is a fan fiction downloader, which makes reading stories from various websites much easier. Before you
can start downloading fanfics, you need to login, so downloader can remember your fanfics and store them.
</p>
<p><a href="{{ login_url }}">Login using Google account</a></p>
</div>
</div>
{% endif %}
<div id='helpbox'>
<dl>
<dt>fictionalley.org
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.fictionalley.org/authors/drt/DA.html">http://www.fictionalley.org/authors/drt/DA.html</a>. Or the story text URL for
fictionalley.org one-shots, such as
<br /><a href="http://www.fictionalley.org/authors/drt/JOTP01a.html">http://www.fictionalley.org/authors/drt/JOTP01a.html</a>.
<dt>fanfiction.net
<dd>Use the URL of any story chapter, with or without story title such as
<br /><a href="http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo">http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo</a> or
<br /><a href="http://www.fanfiction.net/s/2345466/3/">http://www.fanfiction.net/s/2345466/3/</a>.
<dt>fictionpress.com
<dd>Use the URL of any story chapter, such as
<br /><a href="http://www.fictionpress.com/s/2851771/1/Untouchable_Love">http://www.fictionpress.com/s/2851771/1/Untouchable_Love</a> or
<br /><a href="http://www.fictionpress.com/s/2847338/6/">http://www.fictionpress.com/s/2847338/6/</a>.
<dt>twilighted.net
<dd>Use the URL of the start of the story, such as
<br /><a href="http://twilighted.net/viewstory.php?sid=8422">http://twilighted.net/viewstory.php?sid=8422</a>.
<dt>ficwad.com
<dd>Use the URL of any story chapter, such as
<br /><a href="http://www.ficwad.com/story/75246">http://www.ficwad.com/story/75246</a>.
<dt>harrypotterfanfiction.com
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.harrypotterfanfiction.com/viewstory.php?psid=289208">http://www.harrypotterfanfiction.com/viewstory.php?psid=289208</a>.
<dt>potionsandsnitches.net
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332">http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332</a>.
<dt>mediaminer.org
<dd>Use the URL of the story's chapter list, such as
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/156934">http://www.mediaminer.org/fanfic/view_st.php/156934</a>.
Or the story URL for one-shots, such as
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/167618">http://www.mediaminer.org/fanfic/view_st.php/167618</a>.
</dl>
A few additional things to know, which will make your life substantially easier:
<ol>
<li>
First thing to know: I do not use your login and password. In fact, all I know about it is your ID &ndash; password
is being verified by Google and is absolutely, totally unknown to anyone but you.
</li>
<li>
Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a>
&mdash; how to read fiction in Stanza or any other ebook reader.
</li>
<li>
Currently we support fanfiction.net, fictionpress.com, ficwad.com, fictionalley.org, harrypotterfanfiction.com, potionsandsnitches.net, mediaminer.org and twilighted.net.
fanficauthors.net and tthfanfic.org offer native ePub functionality.
</li>
<li>
You can download fanfiction directly from your iPhone, Kindle or (possibly) other ebook reader.
</li>
<li>
One-shots, fics with a single chapter, <em>are</em> now supported.
</li>
<li>
You can download fanfics and store them for 'later' by just downloading them and visiting <a href="/recent">recent
downloads</a> section.
</li>
<li>
Downloaded stories are deleted after some time (which should give you enough of time to download it and will keep
Google happy about the app not going over the storage limit).
</li>
<li>
If Downloader simply opens a download file window rather than saves the fanfic and gives you a link, it means it is
too large to save in the database and you need to download it straight away.
</li>
<li>
If you see some funny characters in downloaded Plain Text file, make sure you choose text file encoding UTF-8 and
not something else.
</li>
<li>
If you think that something that should work in fact doesn't, drop me a mail
to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a>, or, even better, write an email to
our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>. I also encourage you to join it so
you will find out about latest updates and fixes as soon as possible
</li>
</ol>
Otherwise, just have fun, and if you want to say thank you &mdash; use the contacts above.
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
alt="Powered by Google App Engine" />
<br/><br/>
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
Copyright &copy; <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
</div>
<div style="margin-top: 1em; text-align: center">
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* FFD */
google_ad_slot = "7330682770";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12136939-1");
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>

22
index.yaml Normal file
View file

@ -0,0 +1,22 @@
indexes:
# AUTOGENERATED
# This index.yaml is automatically updated whenever the dev_appserver
# detects that a new type of query is run. If you want to manage the
# index.yaml file manually, remove the above marker line (the line
# saying "# AUTOGENERATED"). If you want to manage some indexes
# manually, move them above the marker line. The index.yaml file is
# automatically uploaded to the admin console when you next deploy
# your application using appcfg.py.
- kind: DownloadedFanfic
properties:
- name: cleared
- name: date
- kind: DownloadedFanfic
properties:
- name: user
- name: date
direction: desc

116
js/fdownloader.js Normal file
View file

@ -0,0 +1,116 @@
// Value returned by the /submitDownload POST; sent as 'key' with every
// /progress poll.
var g_CurrentKey = null;
// Number of /progress polls issued for the current download; also set to
// a value above COUNTER_MAX to stop polling once a final result arrives.
var g_Counter = 0;
// Polling gives up once g_Counter reaches this value.
var COUNTER_MAX = 50;
// Show an error message in the #error box, followed by a mail link for
// reporting the problem.
function setErrorState(error)
{
    // Not declared with var, so this assigns a global holding the raw message.
    olderr = error;
    var markup = error + "<br/><a href='mailto:sigizmund@gmail.com?subject=Problem with the fanfiction downloader'>" + "Complain about this error</a>";
    $('#error').html(markup);
}
// Empty the #error box.
function clearErrorState()
{
    var errorBox = $('#error');
    errorBox.html('');
}
// Show a link to the finished file in the #yourfile box.
// data: object with the fields key, name and author.
function showFile(data)
{
    // Build the link through the DOM API instead of string concatenation so
    // that markup characters in the story name or author are shown as text
    // rather than interpreted as HTML.
    var link = $('<a/>')
        .attr('href', '/file?id=' + encodeURIComponent(data.key))
        .text(data.name + " by " + data.author);
    $('#yourfile').empty().append(link);
    $('#yourfile').show();
}
// Hide the #yourfile box.
function hideFile()
{
    var fileBox = $('#yourfile');
    fileBox.hide();
}
// Poll /progress once for the current download and schedule the next poll.
// Any result other than "Nope" ends polling: "OK" shows the file link,
// everything else is displayed as an error message.
function checkResults()
{
    if ( g_Counter >= COUNTER_MAX )
        return;

    g_Counter += 1;

    var onProgress = function(reply)
    {
        if ( reply.result == "Nope" )
            return;

        if ( reply.result == "OK" )
        {
            showFile(reply);
            leaveLoadingState();
        }
        else
        {
            leaveLoadingState();
            setErrorState(reply.result);
        }

        $("#progressbar").progressbar('destroy');
        // Above COUNTER_MAX, so an already scheduled poll returns at once.
        g_Counter = 101;
    };

    $.getJSON('/progress', { 'key' : g_CurrentKey }, onProgress);

    if ( g_Counter >= COUNTER_MAX )
    {
        leaveLoadingState();
        setErrorState("Operation takes too long - terminating by timeout (story too long?)");
        return;
    }

    setTimeout("checkResults()", 1000);
}
// Replace the submit button with the loading indicator.
function enterLoadingState()
{
    var button = $('#submit_button');
    var spinner = $('#ajax_loader');
    button.hide();
    spinner.show();
}
// Bring the submit button back and hide the loading indicator.
function leaveLoadingState()
{
    var button = $('#submit_button');
    var spinner = $('#ajax_loader');
    button.show();
    spinner.hide();
}
// Validate the form, submit the download request and start polling for
// its result.
function downloadFanfic()
{
    clearErrorState();
    hideFile();

    // The leftover debugging alert(format) followed by an unconditional
    // return was removed here: it made everything below unreachable.
    var format = $("#format").val();
    var url = $('#url').val();
    var login = $('#login').val();
    var password = $('#password').val();

    if ( url == '' )
    {
        setErrorState('URL shouldn\'t be empty');
        return;
    }

    // Accept only the known sites; adultfanfiction.net contains
    // 'fanfiction.net' as a substring and is excluded explicitly.
    if ( (url.indexOf('fanfiction.net') == -1 && url.indexOf('fanficauthors') == -1 && url.indexOf('ficwad') == -1 && url.indexOf('fictionpress') == -1) || (url.indexOf('adultfanfiction.net') != -1) )
    {
        setErrorState("This source is not yet supported. Ping me if you want it!");
        return;
    }

    $.post('/submitDownload', {'url' : url, 'login' : login, 'password' : password, 'format' : format}, function(data)
    {
        // The response body is the key used by the /progress polls.
        g_CurrentKey = data;
        g_Counter = 0;
        setTimeout("checkResults()", 1000);
        enterLoadingState();
    });
}

4376
js/jquery-1.3.2.js vendored Normal file

File diff suppressed because it is too large Load diff

316
main.py Normal file
View file

@ -0,0 +1,316 @@
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import logging
import traceback
import StringIO
from google.appengine.runtime import DeadlineExceededError
from google.appengine.ext.webapp import template
from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from fanficdownloader.downloader import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *
from fanficdownloader import twilighted
from google.appengine.ext import db
from fanficdownloader.zipdir import *
from ffstorage import *
class LoginRequired(webapp.RequestHandler):
    """Send signed-in users to the front page; show a login page otherwise."""

    def get(self):
        if users.get_current_user():
            self.redirect('/')
            return

        logging.debug(users.create_login_url('/'))
        # After signing in the user comes back to the URL that was requested.
        login_url = users.create_login_url(self.request.uri)
        page = os.path.join(os.path.dirname(__file__), 'index-nonlogin.html')
        self.response.out.write(template.render(page, {'login_url' : login_url}))
class MainHandler(webapp.RequestHandler):
    """Render index.html: the download form for signed-in users, or a login
    prompt for everyone else.

    Query parameters read for signed-in users: ``url`` (refills the form),
    ``error`` with optional ``errtext`` (error banner), and ``file``,
    ``name``, ``author`` (link to a freshly stored download).
    """

    def get(self):
        # Imported here so the handler does not depend on the file-level
        # import list.
        import cgi

        def esc(text):
            # The values below come straight from the query string and end
            # up inside HTML (the 'yourfile' value is itself markup built
            # here), so they are escaped before use.  cgi.escape handles
            # & < > and "; single quotes are added because the markup uses
            # single-quoted attributes.
            return cgi.escape(text, True).replace("'", '&#39;')

        path = os.path.join(os.path.dirname(__file__), 'index.html')
        user = users.get_current_user()
        if not user:
            logging.debug(users.create_login_url('/'))
            template_values = {
                'login_url' : users.create_login_url(self.request.uri),
                'authorized': False,
            }
            self.response.out.write(template.render(path, template_values))
            return

        url = self.request.get('url')
        template_values = {
            'nickname' : user.nickname(),
            'authorized': True,
            'url' : esc(url),
        }

        error = self.request.get('error')
        if error == 'login_required':
            template_values['error_message'] = 'This story (or one of the chapters) requires you to be logged in.'
        elif error == 'bad_url':
            template_values['error_message'] = 'Unsupported URL: ' + esc(url)
        elif error == 'custom':
            template_values['error_message'] = 'Error happened: ' + esc(self.request.get('errtext'))

        filename = self.request.get('file')
        if len(filename) > 1:
            template_values['yourfile'] = '''<div id='yourfile'><a href='/file?id=%s'>"%s" by %s</a></div>''' % (
                esc(filename),
                esc(self.request.get('name')),
                esc(self.request.get('author')))

        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write(template.render(path, template_values))
class FileServer(webapp.RequestHandler):
    """Serve a stored download, looked up by the datastore key in ``id``."""

    def get(self):
        # user = users.get_current_user()
        fileId = self.request.get('id')
        if fileId == None or len(fileId) < 3:
            self.redirect('/')
            # Without this return the lookup below still ran with the
            # unusable id.
            return

        fanfic = db.get(db.Key(fileId))
        if fanfic is None:
            # No entity for this key; answer 404 instead of failing on the
            # attribute access below.
            self.error(404)
            return

        name = makeAcceptableFilename(fanfic.name.encode('utf-8'))
        logging.info("Serving file: %s" % name)

        if fanfic.format == 'epub':
            self.response.headers['Content-Type'] = 'application/epub+zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
        elif fanfic.format == 'html':
            # html and text downloads are stored zipped, so they are
            # announced as zip archives (as FanfictionDownloader does when
            # it streams them directly).
            self.response.headers['Content-Type'] = 'application/zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
        elif fanfic.format == 'text':
            self.response.headers['Content-Type'] = 'application/zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.txt.zip'

        self.response.out.write(fanfic.blob)
class RecentFilesServer(webapp.RequestHandler):
    """List the signed-in user's stored downloads that are not cleared."""

    def get(self):
        user = users.get_current_user()
        if not user:
            self.redirect('/login')
            # Stop here: the code below needs a user object.
            return

        q = DownloadedFanfic.all()
        q.filter('user =', user)
        q.filter('cleared =', False)
        # At most 100 entries are shown.
        fics = q.fetch(100)

        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))
class RecentAllFilesServer(webapp.RequestHandler):
    """List every stored download; only rendered for the 'sigizmund' account."""

    def get(self):
        user = users.get_current_user()
        # Anonymous visitors have no user object; treat them like any other
        # non-matching account and send an empty response.
        if not user or user.nickname() != 'sigizmund':
            return

        fics = db.GqlQuery("Select * From DownloadedFanfic")
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))
class FanfictionDownloader(webapp.RequestHandler):
    """Handle the download form: fetch a story, package it, then store it
    and redirect to its link, or stream it directly when storing fails.

    Every request is recorded as a ``OneDownload`` entity; failures are
    reported back to the front page through ``error`` query parameters.
    """

    def _printableVersion(self, text):
        """Return ``text`` with entities removed, decoded from UTF-8 when
        that works and unchanged otherwise."""
        text = removeEntities(text)
        try:
            return text.decode('utf-8')
        except Exception:
            return text

    def post(self):
        logging.getLogger().setLevel(logging.DEBUG)

        user = users.get_current_user()
        if not user:
            self.redirect(users.create_login_url('/'))
            # Without this return the download continued with no user.
            return

        format = self.request.get('format')
        url = self.request.get('url')
        login = self.request.get('login')
        password = self.request.get('password')

        logging.info("Downloading: " + url)

        adapter = None
        writerClass = None

        # NOTE(review): the story-site password is stored as entered.
        download = OneDownload()
        download.user = user
        download.url = url
        download.login = login
        download.password = password
        download.format = format

        logging.info('Creating adapter...')
        try:
            # The first site name found in the URL selects the adapter.
            if url.find('fictionalley') != -1:
                adapter = fictionalley.FictionAlley(url)
            elif url.find('ficwad') != -1:
                adapter = ficwad.FicWad(url)
            elif url.find('fanfiction.net') != -1:
                adapter = ffnet.FFNet(url)
            elif url.find('fictionpress.com') != -1:
                adapter = fpcom.FPCom(url)
            elif url.find('harrypotterfanfiction.com') != -1:
                adapter = hpfiction.HPFiction(url)
            elif url.find('twilighted.net') != -1:
                adapter = twilighted.Twilighted(url)
            elif url.find('potionsandsnitches.net') != -1:
                adapter = potionsNsnitches.PotionsNSnitches(url)
            elif url.find('mediaminer.org') != -1:
                adapter = mediaminer.MediaMiner(url)
            else:
                logging.debug("Bad URL detected")
                self.redirect('/?error=bad_url&url=' + urlEscape(url) )
                return
        except Exception:
            e = sys.exc_info()[1]
            logging.exception(e)
            download.failure = "Adapter was not created: " + str(e)
            download.put()
            self.redirect('/?error=custom&url=' + urlEscape(url) + '&errtext=' + urlEscape(str(traceback.format_exc())) )
            return

        logging.info('Created an adaper: %s' % adapter)

        if len(login) > 1:
            adapter.setLogin(login)
            adapter.setPassword(password)

        if format == 'epub':
            writerClass = output.EPubFanficWriter
        elif format == 'html':
            writerClass = output.HTMLWriter
        else:
            writerClass = output.TextWriter

        loader = FanficLoader(adapter, writerClass, quiet = True, inmemory=True, compress=False)
        try:
            data = loader.download()

            if format == 'html' or format == 'text':
                # data is uncompressed hence huge
                ext = '.html'
                if format == 'text':
                    ext = '.txt'
                logging.debug(data)
                files = {makeAcceptableFilename(str(adapter.getOutputName())) + ext : StringIO.StringIO(data.decode('utf-8')) }
                d = inMemoryZip(files)
                data = d.getvalue()
        except LoginRequiredException:
            logging.exception(sys.exc_info()[1])
            download.failure = 'Login problem detected'
            download.put()
            self.redirect('/?error=login_required&url=' + urlEscape(url))
            return
        except:
            # Deliberately broad: whatever goes wrong while downloading is
            # recorded and reported on the front page instead of surfacing
            # as a server error.
            e = sys.exc_info()[0]
            logging.exception(e)
            download.failure = 'Some exception happened in downloader: ' + str(e)
            download.put()
            self.redirect('/?error=custom&url=' + urlEscape(url) + '&errtext=' + urlEscape(str(traceback.format_exc())) )
            return

        if data == None:
            if loader.badLogin:
                logging.debug("Bad login detected")
                download.failure = 'Login problem detected'
                download.put()
                self.redirect('/?error=login_required&url=' + urlEscape(url))
            else:
                # Previously this case produced no response at all.
                logging.debug("Downloader returned no data")
                download.failure = 'Downloader returned no data'
                download.put()
                self.redirect('/?error=custom&url=' + urlEscape(url) + '&errtext=' + urlEscape('The downloader returned no data for this story.'))
            return

        fic = DownloadedFanfic()
        fic.user = user
        fic.url = url
        fic.format = format
        fic.name = self._printableVersion(adapter.getOutputName())
        fic.author = self._printableVersion(adapter.getAuthorName())
        fic.blob = data

        try:
            fic.put()
            key = fic.key()
            download.put()
            self.redirect('/?file='+str(key)+'&name=' + urlEscape(fic.name) + '&author=' + urlEscape(fic.author))
            logging.info("Download finished OK")
        except Exception:
            logging.exception(sys.exc_info()[1])
            # it was too large, won't save it
            name = str(makeAcceptableFilename(adapter.getStoryName()))
            if format == 'epub':
                self.response.headers['Content-Type'] = 'application/epub+zip'
                self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
            elif format == 'html':
                self.response.headers['Content-Type'] = 'application/zip'
                self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
            elif format == 'text':
                self.response.headers['Content-Type'] = 'application/zip'
                self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.txt.zip'

            self.response.out.write(data)
def toPercentDecimal(match):
    "Return the URL escape ('%' plus two lowercase hex digits) for the character captured by the match"
    return "%%%02x" % ord(match.group(1))
def urlEscape(data):
    "Escape text, including unicode, for use in URLs"
    # Only unicode objects are encoded.  Calling encode() on a byte string
    # makes Python 2 decode it as ASCII first, which fails on any non-ASCII
    # byte (e.g. in traceback text); such strings are escaped byte by byte.
    if isinstance(data, unicode):
        data = data.encode("utf-8")
    p = re.compile(r'([^\w])')
    return p.sub(toPercentDecimal, data)
def main():
    "Build the WSGI application from the URL routes and run it"
    routes = [
        ('/', MainHandler),
        ('/fdown', FanfictionDownloader),
        ('/file', FileServer),
        ('/recent', RecentFilesServer),
        ('/r2d2', RecentAllFilesServer),
        ('/login', LoginRequired),
    ]
    application = webapp.WSGIApplication(routes, debug=False)
    util.run_wsgi_app(application)


if __name__ == '__main__':
    # Log at DEBUG level when run as the entry script.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    main()

5
queue.yaml Normal file
View file

@ -0,0 +1,5 @@
queue:
- name: default
rate: 1/s
- name: download
rate: 10/s

69
recent.html Normal file
View file

@ -0,0 +1,69 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="css/index.css" rel="stylesheet" type="text/css">
<title>Fanfiction Downloader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
<div id='main'>
<h1>
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
</h1>
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* 468x60, created 6/9/10 */
google_ad_slot = "8817097473";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
<!-- <div id='yourfile'> -->
{{yourfile}}
<!-- </div> -->
<div id='urlbox'>
<div id='greeting'>
Hi, {{ nickname }}! These are the fanfics you've downloaded previously.
</div>
</div>
<div id='helpbox'>
{% for fic in fics %}
<p> <a href="/file?id={{ fic.key }}">{{ fic.name }}</a> by {{ fic.author }} ({{ fic.format }})<br/><small><a href="{{ fic.url }}">{{ fic.url }}</a></small></p>
{% endfor %}
</div>
<script type="text/javascript"><!--
google_ad_client = "pub-2027714004231956";
/* 468x60, created 6/9/10 */
google_ad_slot = "2009456648";
google_ad_width = 468;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
try {
var pageTracker = _gat._getTracker("UA-12136939-1");
pageTracker._trackPageview();
} catch(err) {}</script>
</body>
</html>

318
simplejson/__init__.py Normal file
View file

@ -0,0 +1,318 @@
r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
interchange format.
:mod:`simplejson` exposes an API familiar to users of the standard library
:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
version of the :mod:`json` library contained in Python 2.6, but maintains
compatibility with Python 2.4 and Python 2.5 and (currently) has
significant performance advantages, even without using the optional C
extension for speedups.
Encoding basic Python object hierarchies::
>>> import simplejson as json
>>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
'["foo", {"bar": ["baz", null, 1.0, 2]}]'
>>> print json.dumps("\"foo\bar")
"\"foo\bar"
>>> print json.dumps(u'\u1234')
"\u1234"
>>> print json.dumps('\\')
"\\"
>>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
{"a": 0, "b": 0, "c": 0}
>>> from StringIO import StringIO
>>> io = StringIO()
>>> json.dump(['streaming API'], io)
>>> io.getvalue()
'["streaming API"]'
Compact encoding::
>>> import simplejson as json
>>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
'[1,2,3,{"4":5,"6":7}]'
Pretty printing::
>>> import simplejson as json
>>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4)
>>> print '\n'.join([l.rstrip() for l in s.splitlines()])
{
"4": 5,
"6": 7
}
Decoding JSON::
>>> import simplejson as json
>>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
>>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
True
>>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
True
>>> from StringIO import StringIO
>>> io = StringIO('["streaming API"]')
>>> json.load(io)[0] == 'streaming API'
True
Specializing JSON object decoding::
>>> import simplejson as json
>>> def as_complex(dct):
... if '__complex__' in dct:
... return complex(dct['real'], dct['imag'])
... return dct
...
>>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
... object_hook=as_complex)
(1+2j)
>>> import decimal
>>> json.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')
True
Specializing JSON object encoding::
>>> import simplejson as json
>>> def encode_complex(obj):
... if isinstance(obj, complex):
... return [obj.real, obj.imag]
... raise TypeError(repr(obj) + " is not JSON serializable")
...
>>> json.dumps(2 + 1j, default=encode_complex)
'[2.0, 1.0]'
>>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
'[2.0, 1.0]'
>>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
'[2.0, 1.0]'
Using simplejson.tool from the shell to validate and pretty-print::
$ echo '{"json":"obj"}' | python -m simplejson.tool
{
"json": "obj"
}
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
Expecting property name: line 1 column 2 (char 2)
"""
__version__ = '2.0.9'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONEncoder',
]
__author__ = 'Bob Ippolito <bob@redivi.com>'
from decoder import JSONDecoder
from encoder import JSONEncoder
# Module-level encoder instance built with the same values that dump()
# declares as its keyword defaults.
_default_encoder = JSONEncoder(
skipkeys=False,
ensure_ascii=True,
check_circular=True,
allow_nan=True,
indent=None,
separators=None,
encoding='utf-8',
default=None,
)
def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
        allow_nan=True, cls=None, indent=None, separators=None,
        encoding='utf-8', default=None, **kw):
    """Serialize ``obj`` as a JSON formatted stream written to ``fp``.

    ``fp`` must be a file-like object providing ``.write()``; the encoded
    document is handed to it one chunk at a time.

    ``skipkeys``: when true, ``dict`` keys that are not basic types
    (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
    are skipped instead of raising a ``TypeError``.

    ``ensure_ascii``: when false, some chunks written to ``fp`` may be
    ``unicode`` instances, subject to the normal Python ``str`` to
    ``unicode`` coercion rules. Unless ``fp.write()`` explicitly understands
    ``unicode`` (as in ``codecs.getwriter()``) this is likely to cause an
    error.

    ``check_circular``: when false, the circular reference check for
    container types is skipped and a circular reference will result in an
    ``OverflowError`` (or worse).

    ``allow_nan``: when false, serializing an out of range ``float``
    (``nan``, ``inf``, ``-inf``) raises ``ValueError`` in strict compliance
    with the JSON specification, instead of emitting the JavaScript
    equivalents (``NaN``, ``Infinity``, ``-Infinity``).

    ``indent``: a non-negative integer pretty-prints array elements and
    object members with that indent level; 0 only inserts newlines and
    ``None`` gives the most compact representation.

    ``separators``: an ``(item_separator, dict_separator)`` tuple used
    instead of the default ``(', ', ': ')``; ``(',', ':')`` is the most
    compact form.

    ``encoding``: character encoding of ``str`` instances (UTF-8 by default).

    ``default``: a function ``default(obj)`` returning a serializable
    version of ``obj`` or raising ``TypeError``; the built-in default simply
    raises ``TypeError``.

    ``cls``: a ``JSONEncoder`` subclass to use instead of ``JSONEncoder``
    (for example one overriding ``.default()``). Any extra keyword arguments
    are passed to its constructor.
    """
    # With every option at its default the shared module-level encoder can be
    # reused; anything else needs an encoder built for this call.
    plain_call = (
        not skipkeys and ensure_ascii and
        check_circular and allow_nan and
        cls is None and indent is None and separators is None and
        encoding == 'utf-8' and default is None and not kw)
    if plain_call:
        pieces = _default_encoder.iterencode(obj)
    else:
        encoder_class = cls
        if encoder_class is None:
            encoder_class = JSONEncoder
        encoder = encoder_class(
            skipkeys=skipkeys, ensure_ascii=ensure_ascii,
            check_circular=check_circular, allow_nan=allow_nan,
            indent=indent, separators=separators, encoding=encoding,
            default=default, **kw)
        pieces = encoder.iterencode(obj)
    # could accelerate with writelines in some versions of Python, at
    # a debuggability cost
    for piece in pieces:
        fp.write(piece)
def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
        allow_nan=True, cls=None, indent=None, separators=None,
        encoding='utf-8', default=None, **kw):
    """Serialize ``obj`` to a JSON formatted ``str`` and return it.

    ``skipkeys``: when true, ``dict`` keys that are not basic types
    (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
    are skipped instead of raising a ``TypeError``.

    ``ensure_ascii``: when false, the return value is a ``unicode``
    instance, subject to the normal Python ``str`` to ``unicode`` coercion
    rules, instead of being escaped to an ASCII ``str``.

    ``check_circular``: when false, the circular reference check for
    container types is skipped and a circular reference will result in an
    ``OverflowError`` (or worse).

    ``allow_nan``: when false, serializing an out of range ``float``
    (``nan``, ``inf``, ``-inf``) raises ``ValueError`` in strict compliance
    with the JSON specification, instead of emitting the JavaScript
    equivalents (``NaN``, ``Infinity``, ``-Infinity``).

    ``indent``: a non-negative integer pretty-prints array elements and
    object members with that indent level; 0 only inserts newlines and
    ``None`` gives the most compact representation.

    ``separators``: an ``(item_separator, dict_separator)`` tuple used
    instead of the default ``(', ', ': ')``; ``(',', ':')`` is the most
    compact form.

    ``encoding``: character encoding of ``str`` instances (UTF-8 by default).

    ``default``: a function ``default(obj)`` returning a serializable
    version of ``obj`` or raising ``TypeError``; the built-in default simply
    raises ``TypeError``.

    ``cls``: a ``JSONEncoder`` subclass to use instead of ``JSONEncoder``
    (for example one overriding ``.default()``). Any extra keyword arguments
    are passed to its constructor.
    """
    # With every option at its default the shared module-level encoder can be
    # reused; anything else needs an encoder built for this call.
    plain_call = (
        not skipkeys and ensure_ascii and
        check_circular and allow_nan and
        cls is None and indent is None and separators is None and
        encoding == 'utf-8' and default is None and not kw)
    if plain_call:
        return _default_encoder.encode(obj)
    encoder_class = cls
    if encoder_class is None:
        encoder_class = JSONEncoder
    encoder = encoder_class(
        skipkeys=skipkeys, ensure_ascii=ensure_ascii,
        check_circular=check_circular, allow_nan=allow_nan, indent=indent,
        separators=separators, encoding=encoding, default=default,
        **kw)
    return encoder.encode(obj)
# Shared decoder with default settings; loads() reuses it whenever it is
# called without any customisation arguments.
_default_decoder = JSONDecoder(encoding=None, object_hook=None)
def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
        parse_int=None, parse_constant=None, **kw):
    """Deserialize the JSON document read from ``fp`` to a Python object.

    ``fp`` must be a file-like object providing ``.read()``. Its whole
    content is read and passed to ``loads()`` together with every other
    argument.

    ``encoding``: required when the content of ``fp`` uses an ASCII based
    encoding other than utf-8 (e.g. latin-1). Encodings that are not ASCII
    based (such as UCS-2) are not allowed; wrap the stream with
    ``codecs.getreader(encoding)(fp)`` or decode the data to ``unicode``
    and pass it to ``loads()`` instead.

    ``object_hook``: optional function called with the result of any object
    literal decode (a ``dict``); its return value is used instead of the
    ``dict``. This can be used to implement custom decoders (e.g. JSON-RPC
    class hinting).

    ``cls``: a ``JSONDecoder`` subclass to use instead of ``JSONDecoder``.
    """
    document = fp.read()
    return loads(
        document,
        encoding=encoding, cls=cls, object_hook=object_hook,
        parse_float=parse_float, parse_int=parse_int,
        parse_constant=parse_constant, **kw)
def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
        parse_int=None, parse_constant=None, **kw):
    """Deserialize ``s`` (a ``str`` or ``unicode`` JSON document) to a
    Python object.

    ``encoding``: required when ``s`` is a ``str`` in an ASCII based
    encoding other than utf-8 (e.g. latin-1). Encodings that are not ASCII
    based (such as UCS-2) are not allowed and should be decoded to
    ``unicode`` first.

    ``object_hook``: optional function called with the result of any object
    literal decode (a ``dict``); its return value is used instead of the
    ``dict``. This can be used to implement custom decoders (e.g. JSON-RPC
    class hinting).

    ``parse_float``: if specified, called with the string of every JSON
    float to be decoded. By default this is equivalent to
    ``float(num_str)``. Can be used to plug in another datatype or parser
    (e.g. ``decimal.Decimal``).

    ``parse_int``: if specified, called with the string of every JSON int
    to be decoded. By default this is equivalent to ``int(num_str)``. Can
    be used to plug in another datatype or parser (e.g. ``float``).

    ``parse_constant``: if specified, called with one of the strings
    ``-Infinity``, ``Infinity`` or ``NaN``. Can be used to raise an
    exception when such non-standard JSON numbers are encountered.

    ``cls``: a ``JSONDecoder`` subclass to use instead of ``JSONDecoder``.
    Any extra keyword arguments are passed to its constructor.
    """
    # Nothing customised: the shared module-level decoder will do.
    nothing_customised = (
        cls is None and encoding is None and object_hook is None and
        parse_int is None and parse_float is None and
        parse_constant is None and not kw)
    if nothing_customised:
        return _default_decoder.decode(s)
    decoder_class = cls
    if decoder_class is None:
        decoder_class = JSONDecoder
    # Forward only the hooks the caller actually supplied so that the
    # decoder class keeps its own defaults for the others.
    supplied_hooks = (
        ('object_hook', object_hook),
        ('parse_float', parse_float),
        ('parse_int', parse_int),
        ('parse_constant', parse_constant),
    )
    for option, hook in supplied_hooks:
        if hook is not None:
            kw[option] = hook
    return decoder_class(encoding=encoding, **kw).decode(s)

BIN
simplejson/__init__.pyc Normal file

Binary file not shown.

2329
simplejson/_speedups.c Normal file

File diff suppressed because it is too large Load diff

354
simplejson/decoder.py Normal file
View file

@ -0,0 +1,354 @@
"""Implementation of JSONDecoder
"""
import re
import sys
import struct
from simplejson.scanner import make_scanner
try:
from simplejson._speedups import scanstring as c_scanstring
except ImportError:
c_scanstring = None
__all__ = ['JSONDecoder']
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
def _floatconstants():
_BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
if sys.byteorder != 'big':
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
nan, inf = struct.unpack('dd', _BYTES)
return nan, inf, -inf
NaN, PosInf, NegInf = _floatconstants()
def linecol(doc, pos):
    """Translate character offset ``pos`` in ``doc`` to ``(lineno, colno)``.

    ``lineno`` is 1-based. On the first line ``colno`` is simply ``pos``;
    on later lines it is the distance from the preceding newline.
    """
    newlines_before = doc.count('\n', 0, pos)
    if newlines_before:
        column = pos - doc.rindex('\n', 0, pos)
    else:
        column = pos
    return newlines_before + 1, column
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location of an error in ``doc``.

    ``pos`` is the character offset of the error. When ``end`` is given the
    message describes the range from ``pos`` to ``end``; otherwise only the
    single position is reported. Line and column come from ``linecol``.
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '%s: line %d column %d - line %d column %d (char %d - %d)'
        return template % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '%s: line %d column %d (char %d)'
    return template % (msg, start_line, start_col, pos)
# Non-standard number literals mapped to the float constants defined above.
# JSONDecoder uses this mapping's __getitem__ as its default parse_constant.
_CONSTANTS = {
'-Infinity': NegInf,
'Infinity': PosInf,
'NaN': NaN,
}
# Matches a (possibly empty) run of ordinary characters (group 1) followed by
# the next double quote, backslash or control character (group 2).
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escape codes and the characters they decode to.
BACKSLASH = {
'"': u'"', '\\': u'\\', '/': u'/',
'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}
# Encoding py_scanstring() falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
# Characters allowed in the XXXX part of a \uXXXX escape.
_HEX_DIGITS = '0123456789abcdefABCDEF'


def _parse_hex4(esc):
    """Return the value of ``esc`` if it is exactly four hex digits, else None."""
    # strip() leaves a non-empty remainder as soon as one character is not a
    # hex digit. int(esc, 16) on its own is too lenient: it also accepts
    # forms such as '0x41', '+041' or ' 41 '.
    if len(esc) != 4 or esc.strip(_HEX_DIGITS):
        return None
    return int(esc, 16)


def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.

    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode ``str`` content to ``unicode`` and
    defaults to ``DEFAULT_ENCODING``. ``_b`` and ``_m`` are the escape table
    and chunk matcher, bound as defaults to avoid global lookups.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    On builds where ``sys.maxunicode > 65535`` a high surrogate must be
    followed by a ``\\uXXXX`` low surrogate (0xdc00-0xdfff); the pair is
    combined into a single character.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting unterminated strings.
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence
            uni = _parse_hex4(s[end + 1:end + 5])
            next_end = end + 5
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s[end + 7:end + 11])
                # The second half must be a real low surrogate; anything
                # else would be combined into a meaningless code point.
                if uni2 is None or not (0xdc00 <= uni2 <= 0xdfff):
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
# Use speedup if available
scanstring = c_scanstring or py_scanstring
# Matches a (possibly empty) run of JSON whitespace; the parsers below call
# ``.match(s, i).end()`` on it to skip ahead to the next significant character.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same whitespace characters as a plain string, for cheap ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
pairs = {}
# Use a slice to prevent IndexError from being raised, the following
# check will raise a more specific ValueError if the string is empty
nextchar = s[end:end + 1]
# Normally we expect nextchar == '"'
if nextchar != '"':
if nextchar in _ws:
end = _w(s, end).end()
nextchar = s[end:end + 1]
# Trivial empty object
if nextchar == '}':
return pairs, end + 1
elif nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end))
end += 1
while True:
key, end = scanstring(s, end, encoding, strict)
# To skip some function call overhead we optimize the fast paths where
# the JSON key separator is ": " or just ":".
if s[end:end + 1] != ':':
end = _w(s, end).end()
if s[end:end + 1] != ':':
raise ValueError(errmsg("Expecting : delimiter", s, end))
end += 1
try:
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
try:
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
pairs[key] = value
try:
nextchar = s[end]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end]
except IndexError:
nextchar = ''
end += 1
if nextchar == '}':
break
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
try:
nextchar = s[end]
if nextchar in _ws:
end += 1
nextchar = s[end]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end]
except IndexError:
nextchar = ''
end += 1
if nextchar != '"':
raise ValueError(errmsg("Expecting property name", s, end - 1))
if object_hook is not None:
pairs = object_hook(pairs)
return pairs, end
def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
values = []
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
# Look-ahead for trivial empty array
if nextchar == ']':
return values, end + 1
_append = values.append
while True:
try:
value, end = scan_once(s, end)
except StopIteration:
raise ValueError(errmsg("Expecting object", s, end))
_append(value)
nextchar = s[end:end + 1]
if nextchar in _ws:
end = _w(s, end + 1).end()
nextchar = s[end:end + 1]
end += 1
if nextchar == ']':
break
elif nextchar != ',':
raise ValueError(errmsg("Expecting , delimiter", s, end))
try:
if s[end] in _ws:
end += 1
if s[end] in _ws:
end = _w(s, end + 1).end()
except IndexError:
pass
return values, end
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are translated to Python as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also understood and decoded
    to the corresponding ``float`` values, although they are outside the
    JSON spec.
    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret any ``str`` objects
        decoded by this instance (utf-8 by default). It has no effect when
        decoding ``unicode`` objects. Only encodings that are a superset of
        ASCII currently work; strings in other encodings should be passed
        in as ``unicode``.

        ``object_hook``, if specified, is called with the result of every
        JSON object decoded and its return value is used in place of the
        given ``dict`` (e.g. to support JSON-RPC class hinting).

        ``parse_float``, if specified, is called with the string of every
        JSON float to be decoded; the default is equivalent to
        ``float(num_str)``. ``decimal.Decimal`` is a typical alternative.

        ``parse_int``, if specified, is called with the string of every
        JSON int to be decoded; the default is equivalent to
        ``int(num_str)``. ``float`` is a typical alternative.

        ``parse_constant``, if specified, is called with one of the strings
        ``-Infinity``, ``Infinity`` or ``NaN``. It can be used to raise an
        exception when such non-standard numbers are encountered.

        ``strict`` is stored on the instance and handed to the string
        parser by the scanner.
        """
        # Fall back to the built-in converters for hooks left unset.
        if not parse_float:
            parse_float = float
        if not parse_int:
            parse_int = int
        if not parse_constant:
            parse_constant = _CONSTANTS.__getitem__
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.parse_float = parse_float
        self.parse_int = parse_int
        self.parse_constant = parse_constant
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last: the scanner is created from this instance after all
        # of the attributes above have been set.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Leading and trailing whitespace is ignored; anything else after the
        document raises ValueError ("Extra data").
        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end != len(s):
            raise ValueError(errmsg("Extra data", s, end, len(s)))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str`` or ``unicode``) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Useful for decoding a JSON document from a string that may have
        extraneous data at the end. Raises ValueError when no document can
        be decoded at ``idx``.
        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end

BIN
simplejson/decoder.pyc Normal file

Binary file not shown.

440
simplejson/encoder.py Normal file
View file

@ -0,0 +1,440 @@
"""Implementation of JSONEncoder
"""
import re
try:
from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii
except ImportError:
c_encode_basestring_ascii = None
try:
from simplejson._speedups import make_encoder as c_make_encoder
except ImportError:
c_make_encoder = None
# Characters encode_basestring() replaces: control characters, backslash
# and double quote.
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
# Characters py_encode_basestring_ascii() replaces: backslash, double quote
# and everything outside the range from space to tilde.
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
# Detects bytes with the high bit set in a str.
HAS_UTF8 = re.compile(r'[\x80-\xff]')
# Replacement text for each character that has to be escaped.
ESCAPE_DCT = {
    '\\': '\\\\',
    '"': '\\"',
    '\b': '\\b',
    '\f': '\\f',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
}
# Control characters without a short escape above get a \u00XX escape.
for i in range(0x20):
    if chr(i) not in ESCAPE_DCT:
        ESCAPE_DCT[chr(i)] = '\\u%04x' % (i,)
# Assume this produces an infinity on all machines (probably not guaranteed)
INFINITY = float('1e66666')
# Function used to render ordinary floats.
FLOAT_REPR = repr
def encode_basestring(s):
    """Return a JSON representation of a Python string.

    Every character matched by ``ESCAPE`` is replaced by its entry in
    ``ESCAPE_DCT`` and the result is wrapped in double quotes.
    """
    escaped = ESCAPE.sub(lambda match: ESCAPE_DCT[match.group(0)], s)
    return '"' + escaped + '"'
def py_encode_basestring_ascii(s):
    """Return an ASCII-only JSON representation of a Python string.

    A ``str`` containing bytes with the high bit set is decoded as UTF-8
    first. Characters matched by ``ESCAPE_ASCII`` are replaced by their
    ``ESCAPE_DCT`` entry when there is one, otherwise by a ``\\uXXXX``
    escape (a surrogate pair of two escapes for code points of 0x10000 and
    above).
    """
    if isinstance(s, str) and HAS_UTF8.search(s) is not None:
        s = s.decode('utf-8')

    def _escape(match):
        char = match.group(0)
        if char in ESCAPE_DCT:
            return ESCAPE_DCT[char]
        code = ord(char)
        if code < 0x10000:
            return '\\u%04x' % (code,)
        # surrogate pair
        code -= 0x10000
        high = 0xd800 | ((code >> 10) & 0x3ff)
        low = 0xdc00 | (code & 0x3ff)
        return '\\u%04x\\u%04x' % (high, low)

    return '"' + str(ESCAPE_ASCII.sub(_escape, s)) + '"'


# Use the C implementation when the speedups module provided one.
encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii
class JSONEncoder(object):
    """Extensible JSON <http://json.org> encoder for Python data structures.

    Supports the following objects and types by default:

    +-------------------+---------------+
    | Python            | JSON          |
    +===================+===============+
    | dict              | object        |
    +-------------------+---------------+
    | list, tuple       | array         |
    +-------------------+---------------+
    | str, unicode      | string        |
    +-------------------+---------------+
    | int, long, float  | number        |
    +-------------------+---------------+
    | True              | true          |
    +-------------------+---------------+
    | False             | false         |
    +-------------------+---------------+
    | None              | null          |
    +-------------------+---------------+

    To extend this to recognize other objects, subclass and implement a
    ``.default()`` method with another method that returns a serializable
    object for ``o`` if possible, otherwise it should call the superclass
    implementation (to raise ``TypeError``).
    """
    # Class-level defaults; __init__ overrides them per instance when a
    # ``separators`` tuple is supplied.
    item_separator = ', '
    key_separator = ': '

    def __init__(self, skipkeys=False, ensure_ascii=True,
            check_circular=True, allow_nan=True, sort_keys=False,
            indent=None, separators=None, encoding='utf-8', default=None):
        """Constructor for JSONEncoder, with sensible defaults.

        If skipkeys is false, then it is a TypeError to attempt
        encoding of keys that are not str, int, long, float or None.  If
        skipkeys is True, such items are simply skipped.

        If ensure_ascii is true, the output is guaranteed to be str
        objects with all incoming unicode characters escaped.  If
        ensure_ascii is false, the output will be unicode object.

        If check_circular is true, then lists, dicts, and custom encoded
        objects will be checked for circular references during encoding to
        prevent an infinite recursion (which would cause an OverflowError).
        Otherwise, no such check takes place.

        If allow_nan is true, then NaN, Infinity, and -Infinity will be
        encoded as such.  This behavior is not JSON specification compliant,
        but is consistent with most JavaScript based encoders and decoders.
        Otherwise, it will be a ValueError to encode such floats.

        If sort_keys is true, then the output of dictionaries will be
        sorted by key; this is useful for regression tests to ensure
        that JSON serializations can be compared on a day-to-day basis.

        If indent is a non-negative integer, then JSON array
        elements and object members will be pretty-printed with that
        indent level.  An indent level of 0 will only insert newlines.
        None is the most compact representation.

        If specified, separators should be a (item_separator, key_separator)
        tuple.  The default is (', ', ': ').  To get the most compact JSON
        representation you should specify (',', ':') to eliminate whitespace.

        If specified, default is a function that gets called for objects
        that can't otherwise be serialized.  It should return a JSON encodable
        version of the object or raise a ``TypeError``.

        If encoding is not None, then all input strings will be
        transformed into unicode using that encoding prior to JSON-encoding.
        The default is UTF-8.
        """
        self.skipkeys = skipkeys
        self.ensure_ascii = ensure_ascii
        self.check_circular = check_circular
        self.allow_nan = allow_nan
        self.sort_keys = sort_keys
        self.indent = indent
        if separators is not None:
            self.item_separator, self.key_separator = separators
        if default is not None:
            # Shadows the default() method on this instance only.
            self.default = default
        self.encoding = encoding

    def default(self, o):
        """Implement this method in a subclass such that it returns
        a serializable object for ``o``, or calls the base implementation
        (to raise a ``TypeError``).

        For example, to support arbitrary iterators, you could
        implement default like this::

            def default(self, o):
                try:
                    iterable = iter(o)
                except TypeError:
                    pass
                else:
                    return list(iterable)
                return JSONEncoder.default(self, o)

        """
        raise TypeError(repr(o) + " is not JSON serializable")

    def encode(self, o):
        """Return a JSON string representation of a Python data structure.

        >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
        '{"foo": ["bar", "baz"]}'

        """
        # This is for extremely simple cases and benchmarks.
        if isinstance(o, basestring):
            if isinstance(o, str):
                _encoding = self.encoding
                if (_encoding is not None
                        and not (_encoding == 'utf-8')):
                    o = o.decode(_encoding)
            if self.ensure_ascii:
                return encode_basestring_ascii(o)
            else:
                return encode_basestring(o)
        # This doesn't pass the iterator directly to ''.join() because the
        # exceptions aren't as detailed.  The list call should be roughly
        # equivalent to the PySequence_Fast that ''.join() would do.
        chunks = self.iterencode(o, _one_shot=True)
        if not isinstance(chunks, (list, tuple)):
            chunks = list(chunks)
        return ''.join(chunks)

    def iterencode(self, o, _one_shot=False):
        """Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in JSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)

        ``_one_shot`` is set by ``encode()``; it allows the C encoder from
        the speedups module to be used when one is available and neither
        indentation nor key sorting was requested.
        """
        if self.check_circular:
            markers = {}
        else:
            markers = None
        if self.ensure_ascii:
            _encoder = encode_basestring_ascii
        else:
            _encoder = encode_basestring
        if self.encoding != 'utf-8':
            # Wrap the string encoder so str input is first decoded with the
            # configured encoding.
            def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
                if isinstance(o, str):
                    o = o.decode(_encoding)
                return _orig_encoder(o)

        def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
            # Check for specials.  Note that this type of test is processor- and/or
            # platform-specific, so do tests which don't depend on the internals.
            if o != o:
                text = 'NaN'
            elif o == _inf:
                text = 'Infinity'
            elif o == _neginf:
                text = '-Infinity'
            else:
                return _repr(o)
            if not allow_nan:
                raise ValueError(
                    "Out of range float values are not JSON compliant: " +
                    repr(o))
            return text

        # The fast path is reserved for output without indentation. The test
        # has to be ``indent is None`` rather than ``not indent``: indent=0
        # is falsy but still requests newline-separated output, so it must
        # take the same pure-Python path as every other indent level.
        if (_one_shot and c_make_encoder is not None
                and self.indent is None and not self.sort_keys):
            _iterencode = c_make_encoder(
                markers, self.default, _encoder, self.indent,
                self.key_separator, self.item_separator, self.sort_keys,
                self.skipkeys, self.allow_nan)
        else:
            _iterencode = _make_iterencode(
                markers, self.default, _encoder, self.indent, floatstr,
                self.key_separator, self.item_separator, self.sort_keys,
                self.skipkeys, _one_shot)
        return _iterencode(o, 0)
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
## HACK: hand-optimized bytecode; turn globals into locals
False=False,
True=True,
ValueError=ValueError,
basestring=basestring,
dict=dict,
float=float,
id=id,
int=int,
isinstance=isinstance,
list=list,
long=long,
str=str,
tuple=tuple,
):
def _iterencode_list(lst, _current_indent_level):
if not lst:
yield '[]'
return
if markers is not None:
markerid = id(lst)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = lst
buf = '['
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
separator = _item_separator + newline_indent
buf += newline_indent
else:
newline_indent = None
separator = _item_separator
first = True
for value in lst:
if first:
first = False
else:
buf = separator
if isinstance(value, basestring):
yield buf + _encoder(value)
elif value is None:
yield buf + 'null'
elif value is True:
yield buf + 'true'
elif value is False:
yield buf + 'false'
elif isinstance(value, (int, long)):
yield buf + str(value)
elif isinstance(value, float):
yield buf + _floatstr(value)
else:
yield buf
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
yield ']'
if markers is not None:
del markers[markerid]
def _iterencode_dict(dct, _current_indent_level):
if not dct:
yield '{}'
return
if markers is not None:
markerid = id(dct)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = dct
yield '{'
if _indent is not None:
_current_indent_level += 1
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
item_separator = _item_separator + newline_indent
yield newline_indent
else:
newline_indent = None
item_separator = _item_separator
first = True
if _sort_keys:
items = dct.items()
items.sort(key=lambda kv: kv[0])
else:
items = dct.iteritems()
for key, value in items:
if isinstance(key, basestring):
pass
# JavaScript is weakly typed for these, so it makes sense to
# also allow them. Many encoders seem to do something like this.
elif isinstance(key, float):
key = _floatstr(key)
elif key is True:
key = 'true'
elif key is False:
key = 'false'
elif key is None:
key = 'null'
elif isinstance(key, (int, long)):
key = str(key)
elif _skipkeys:
continue
else:
raise TypeError("key " + repr(key) + " is not a string")
if first:
first = False
else:
yield item_separator
yield _encoder(key)
yield _key_separator
if isinstance(value, basestring):
yield _encoder(value)
elif value is None:
yield 'null'
elif value is True:
yield 'true'
elif value is False:
yield 'false'
elif isinstance(value, (int, long)):
yield str(value)
elif isinstance(value, float):
yield _floatstr(value)
else:
if isinstance(value, (list, tuple)):
chunks = _iterencode_list(value, _current_indent_level)
elif isinstance(value, dict):
chunks = _iterencode_dict(value, _current_indent_level)
else:
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
yield '}'
if markers is not None:
del markers[markerid]
def _iterencode(o, _current_indent_level):
if isinstance(o, basestring):
yield _encoder(o)
elif o is None:
yield 'null'
elif o is True:
yield 'true'
elif o is False:
yield 'false'
elif isinstance(o, (int, long)):
yield str(o)
elif isinstance(o, float):
yield _floatstr(o)
elif isinstance(o, (list, tuple)):
for chunk in _iterencode_list(o, _current_indent_level):
yield chunk
elif isinstance(o, dict):
for chunk in _iterencode_dict(o, _current_indent_level):
yield chunk
else:
if markers is not None:
markerid = id(o)
if markerid in markers:
raise ValueError("Circular reference detected")
markers[markerid] = o
o = _default(o)
for chunk in _iterencode(o, _current_indent_level):
yield chunk
if markers is not None:
del markers[markerid]
return _iterencode

BIN
simplejson/encoder.pyc Normal file

Binary file not shown.

65
simplejson/scanner.py Normal file
View file

@ -0,0 +1,65 @@
"""JSON token scanner
"""
import re
try:
from simplejson._speedups import make_scanner as c_make_scanner
except ImportError:
c_make_scanner = None
__all__ = ['make_scanner']
# Matches a JSON number and captures three groups: the integer part (with
# an optional leading minus), the optional fraction and the optional exponent.
NUMBER_RE = re.compile(
r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
(re.VERBOSE | re.MULTILINE | re.DOTALL))
def py_make_scanner(context):
    """Return a ``_scan_once(string, idx)`` function configured by ``context``.

    ``context`` supplies the parsers and settings as attributes:
    ``parse_object``, ``parse_array``, ``parse_string``, ``parse_float``,
    ``parse_int``, ``parse_constant``, ``object_hook``, ``encoding`` and
    ``strict``.

    ``_scan_once`` decodes the single JSON value starting at ``idx`` and
    returns ``(value, index after the value)``. It raises StopIteration when
    ``idx`` is past the end of ``string`` or no value starts there.
    """
    # Bind everything to locals once so the closure does no attribute
    # lookups on the context.
    parse_string = context.parse_string
    parse_object = context.parse_object
    parse_array = context.parse_array
    parse_float = context.parse_float
    parse_int = context.parse_int
    parse_constant = context.parse_constant
    object_hook = context.object_hook
    encoding = context.encoding
    strict = context.strict
    match_number = NUMBER_RE.match

    def _scan_once(string, idx):
        try:
            first = string[idx]
        except IndexError:
            raise StopIteration

        if first == '"':
            return parse_string(string, idx + 1, encoding, strict)
        if first == '{':
            return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook)
        if first == '[':
            return parse_array((string, idx + 1), _scan_once)
        if first == 'n' and string[idx:idx + 4] == 'null':
            return None, idx + 4
        if first == 't' and string[idx:idx + 4] == 'true':
            return True, idx + 4
        if first == 'f' and string[idx:idx + 5] == 'false':
            return False, idx + 5

        number = match_number(string, idx)
        if number is not None:
            integer, frac, exp = number.groups()
            if frac or exp:
                value = parse_float(integer + (frac or '') + (exp or ''))
            else:
                value = parse_int(integer)
            return value, number.end()

        # Non-standard constants are tried only after the number match failed.
        if first == 'N' and string[idx:idx + 3] == 'NaN':
            return parse_constant('NaN'), idx + 3
        if first == 'I' and string[idx:idx + 8] == 'Infinity':
            return parse_constant('Infinity'), idx + 8
        if first == '-' and string[idx:idx + 9] == '-Infinity':
            return parse_constant('-Infinity'), idx + 9
        raise StopIteration

    return _scan_once


# Use the C implementation when the speedups module provided one.
make_scanner = c_make_scanner or py_make_scanner

BIN
simplejson/scanner.pyc Normal file

Binary file not shown.

View file

@ -0,0 +1,23 @@
import unittest
import doctest
def additional_tests():
    """Build a suite of the package doctests plus the ``index.rst`` doctests.

    Returns a ``unittest.TestSuite`` holding one ``DocTestSuite`` for each of
    ``simplejson``, ``simplejson.encoder`` and ``simplejson.decoder``,
    followed by a ``DocFileSuite`` for ``../../index.rst``.
    """
    import simplejson
    import simplejson.encoder
    import simplejson.decoder

    modules = (simplejson, simplejson.encoder, simplejson.decoder)
    suite = unittest.TestSuite()
    suite.addTests([doctest.DocTestSuite(module) for module in modules])
    suite.addTest(doctest.DocFileSuite('../../index.rst'))
    return suite
def main():
    """Run the suite built by ``additional_tests`` with a text test runner."""
    tests = additional_tests()
    unittest.TextTestRunner().run(tests)
if __name__ == '__main__':
    import os
    import sys

    # Put the directory three levels above this file first on sys.path
    # before running the suite.
    this_file = os.path.abspath(__file__)
    root = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
    sys.path.insert(0, root)
    main()

View file

@ -0,0 +1,30 @@
from unittest import TestCase
import simplejson as json
def default_iterable(obj):
    """``default`` hook for ``json.dumps``: turn any iterable into a list."""
    as_list = list(obj)
    return as_list
class TestCheckCircular(TestCase):
    """Circular-reference detection and its interaction with ``default``."""

    def test_circular_dict(self):
        # A dict stored under one of its own keys.
        loop = {}
        loop['a'] = loop
        self.assertRaises(ValueError, json.dumps, loop)

    def test_circular_list(self):
        # A list that is its own only element.
        loop = []
        loop.append(loop)
        self.assertRaises(ValueError, json.dumps, loop)

    def test_circular_composite(self):
        # dict -> list -> the same dict again.
        outer = {}
        outer['a'] = []
        outer['a'].append(outer)
        self.assertRaises(ValueError, json.dumps, outer)

    def test_circular_default(self):
        # A set encodes via the default hook and is rejected without it.
        json.dumps([set()], default=default_iterable)
        self.assertRaises(TypeError, json.dumps, [set()])

    def test_circular_off_default(self):
        # Same expectations with the circular check switched off.
        json.dumps([set()], default=default_iterable, check_circular=False)
        self.assertRaises(TypeError, json.dumps, [set()],
                          check_circular=False)

View file

@ -0,0 +1,22 @@
import decimal
from unittest import TestCase
import simplejson as json
class TestDecode(TestCase):
    """Decoder checks: custom number hooks and unusual whitespace."""

    def test_decimal(self):
        # parse_float replaces the float constructor for '1.1'.
        value = json.loads('1.1', parse_float=decimal.Decimal)
        self.assertTrue(isinstance(value, decimal.Decimal))
        self.assertEqual(value, decimal.Decimal('1.1'))

    def test_float(self):
        # parse_int replaces the int constructor for '1'.
        value = json.loads('1', parse_int=float)
        self.assertTrue(isinstance(value, float))
        self.assertEqual(value, 1.0)

    def test_decoder_optimizations(self):
        # Several optimizations were made that skip over calls to
        # the whitespace regex, so this test is designed to try and
        # exercise the uncommon cases. The array cases are already covered.
        decoded = json.loads('{ "key" : "value" , "k":"v" }')
        self.assertEqual(decoded, {"key": "value", "k": "v"})

View file

@ -0,0 +1,9 @@
from unittest import TestCase
import simplejson as json
class TestDefault(TestCase):
    """The ``default`` hook is applied to objects the encoder cannot handle."""

    def test_default(self):
        # Encoding `type` through default=repr must equal encoding repr(type).
        via_hook = json.dumps(type, default=repr)
        direct = json.dumps(repr(type))
        self.assertEqual(via_hook, direct)

View file

@ -0,0 +1,21 @@
from unittest import TestCase
from cStringIO import StringIO
import simplejson as json
class TestDump(TestCase):
    """Basic ``dump``/``dumps`` output and non-string dict keys."""

    def test_dump(self):
        buf = StringIO()
        json.dump({}, buf)
        self.assertEqual(buf.getvalue(), '{}')

    def test_dumps(self):
        self.assertEqual(json.dumps({}), '{}')

    def test_encode_truefalse(self):
        # Boolean keys are written as the JSON words "true"/"false".
        bool_keys = {True: False, False: True}
        self.assertEqual(
            json.dumps(bool_keys, sort_keys=True),
            '{"false": true, "true": false}')

        # int, float, long and bool keys next to a plain string key;
        # long(n) builds the same value as the L-suffixed literal.
        mixed_keys = {2: 3.0, 4.0: long(5), False: 1, long(6): True, "7": 0}
        self.assertEqual(
            json.dumps(mixed_keys, sort_keys=True),
            '{"false": 1, "2": 3.0, "4.0": 5, "6": true, "7": 0}')

View file

@ -0,0 +1,38 @@
from unittest import TestCase
import simplejson.encoder
# (input string, expected output) pairs fed to encode_basestring_ascii by
# TestEncodeBaseStringAscii; each expected value is a quoted JSON string
# literal written with backslash escapes.
CASES = [
(u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'),
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
(u'controls', '"controls"'),
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
(u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'),
(u' s p a c e d ', '" s p a c e d "'),
(u'\U0001d120', '"\\ud834\\udd20"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
(u"`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
]
class TestEncodeBaseStringAscii(TestCase):
    """Run every CASES pair through the Python and, if built, C encoders."""

    def test_py_encode_basestring_ascii(self):
        encoder = simplejson.encoder.py_encode_basestring_ascii
        self._test_encode_basestring_ascii(encoder)

    def test_c_encode_basestring_ascii(self):
        # Only exercised when the C implementation is available.
        if simplejson.encoder.c_encode_basestring_ascii:
            self._test_encode_basestring_ascii(
                simplejson.encoder.c_encode_basestring_ascii)

    def _test_encode_basestring_ascii(self, encode_basestring_ascii):
        """Assert that *encode_basestring_ascii* maps each input to its expectation."""
        fname = encode_basestring_ascii.__name__
        for source, wanted in CASES:
            got = encode_basestring_ascii(source)
            message = '%r != %r for %s(%r)' % (got, wanted, fname, source)
            self.assertEqual(got, wanted, message)

View file

@ -0,0 +1,76 @@
from unittest import TestCase
import simplejson as json
# Fri Dec 30 18:57:26 2005
JSONDOCS = [
# http://json.org/JSON_checker/test/fail1.json
'"A JSON payload should be an object or array, not a string."',
# http://json.org/JSON_checker/test/fail2.json
'["Unclosed array"',
# http://json.org/JSON_checker/test/fail3.json
'{unquoted_key: "keys must be quoted}',
# http://json.org/JSON_checker/test/fail4.json
'["extra comma",]',
# http://json.org/JSON_checker/test/fail5.json
'["double extra comma",,]',
# http://json.org/JSON_checker/test/fail6.json
'[ , "<-- missing value"]',
# http://json.org/JSON_checker/test/fail7.json
'["Comma after the close"],',
# http://json.org/JSON_checker/test/fail8.json
'["Extra close"]]',
# http://json.org/JSON_checker/test/fail9.json
'{"Extra comma": true,}',
# http://json.org/JSON_checker/test/fail10.json
'{"Extra value after close": true} "misplaced quoted value"',
# http://json.org/JSON_checker/test/fail11.json
'{"Illegal expression": 1 + 2}',
# http://json.org/JSON_checker/test/fail12.json
'{"Illegal invocation": alert()}',
# http://json.org/JSON_checker/test/fail13.json
'{"Numbers cannot have leading zeroes": 013}',
# http://json.org/JSON_checker/test/fail14.json
'{"Numbers cannot be hex": 0x14}',
# http://json.org/JSON_checker/test/fail15.json
'["Illegal backslash escape: \\x15"]',
# http://json.org/JSON_checker/test/fail16.json
'["Illegal backslash escape: \\\'"]',
# http://json.org/JSON_checker/test/fail17.json
'["Illegal backslash escape: \\017"]',
# http://json.org/JSON_checker/test/fail18.json
'[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]',
# http://json.org/JSON_checker/test/fail19.json
'{"Missing colon" null}',
# http://json.org/JSON_checker/test/fail20.json
'{"Double colon":: null}',
# http://json.org/JSON_checker/test/fail21.json
'{"Comma instead of colon", null}',
# http://json.org/JSON_checker/test/fail22.json
'["Colon instead of comma": false]',
# http://json.org/JSON_checker/test/fail23.json
'["Bad value", truth]',
# http://json.org/JSON_checker/test/fail24.json
"['single quote']",
# http://code.google.com/p/simplejson/issues/detail?id=3
u'["A\u001FZ control characters in string"]',
]
# Maps the 1-based position of a JSONDOCS entry to the reason it is not
# expected to fail; test_failures loads these documents without expecting
# a ValueError.
SKIPS = {
1: "why not have a string payload?",
18: "spec doesn't specify any nesting limitations",
}
class TestFail(TestCase):
    """Every JSONDOCS entry not listed in SKIPS must be rejected by loads()."""

    def test_failures(self):
        for position, doc in enumerate(JSONDOCS):
            number = position + 1  # fail<N>.json files are numbered from 1
            if number in SKIPS:
                # Accepted on purpose: it must simply load.
                json.loads(doc)
                continue

            rejected = False
            try:
                json.loads(doc)
            except ValueError:
                rejected = True
            if not rejected:
                self.fail("Expected failure for fail%d.json: %r" % (number, doc))

View file

@ -0,0 +1,15 @@
import math
from unittest import TestCase
import simplejson as json
class TestFloat(TestCase):
    """Numbers must survive a dumps()/loads() round trip unchanged."""

    def test_floats(self):
        samples = [1617161771.7650001, math.pi, math.pi**100,
                   math.pi**-100, 3.1]
        for value in samples:
            self.assertEqual(float(json.dumps(value)), value)
            self.assertEqual(json.loads(json.dumps(value)), value)

    def test_ints(self):
        # long(1) builds the same value as the L-suffixed literal.
        samples = [1, long(1), 1 << 32, 1 << 64]
        for value in samples:
            self.assertEqual(json.dumps(value), str(value))
            self.assertEqual(int(json.dumps(value)), value)

View file

@ -0,0 +1,41 @@
from unittest import TestCase
import simplejson as json
import textwrap
# Checks that dumps() with indent=2, sorted keys and (',', ': ') separators
# produces the expected text, and that both the compact and the indented
# output load back to the original structure.
class TestIndent(TestCase):
def test_indent(self):
# Sample structure: nested lists (one empty), plain strings and two dicts.
h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
{'nifty': 87}, {'field': 'yes', 'morefield': False} ]
# Expected indented output; textwrap.dedent() removes the common leading
# whitespace of the literal.
expect = textwrap.dedent("""\
[
[
"blorpie"
],
[
"whoops"
],
[],
"d-shtaeou",
"d-nthiouh",
"i-vhbjkhnth",
{
"nifty": 87
},
{
"field": "yes",
"morefield": false
}
]""")
# d1: default compact form; d2: indented form compared against `expect`.
d1 = json.dumps(h)
d2 = json.dumps(h, indent=2, sort_keys=True, separators=(',', ': '))
h1 = json.loads(d1)
h2 = json.loads(d2)
# Both encodings must decode back to the original structure.
self.assertEquals(h1, h)
self.assertEquals(h2, h)
self.assertEquals(d2, expect)

View file

@ -0,0 +1,76 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass1.json
JSON = r'''
[
"JSON Test Pattern pass1",
{"object with 1 member":["array with 1 element"]},
{},
[],
-42,
true,
false,
null,
{
"integer": 1234567890,
"real": -9876.543210,
"e": 0.123456789e-12,
"E": 1.234567890E+34,
"": 23456789012E666,
"zero": 0,
"one": 1,
"space": " ",
"quote": "\"",
"backslash": "\\",
"controls": "\b\f\n\r\t",
"slash": "/ & \/",
"alpha": "abcdefghijklmnopqrstuvwyz",
"ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ",
"digit": "0123456789",
"special": "`1~!@#$%^&*()_+-={':[,]}|;.</>?",
"hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A",
"true": true,
"false": false,
"null": null,
"array":[ ],
"object":{ },
"address": "50 St. James Street",
"url": "http://www.JSON.org/",
"comment": "// /* <!-- --",
"# -- --> */": " ",
" s p a c e d " :[1,2 , 3
,
4 , 5 , 6 ,7 ],
"compact": [1,2,3,4,5,6,7],
"jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}",
"quotes": "&#34; \u0022 %22 0x22 034 &#x22;",
"\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?"
: "A key can be any string"
},
0.5 ,98.6
,
99.44
,
1066
,"rosebud"]
'''
class TestPass1(TestCase):
    """The pass1 document must parse, round-trip and contain a non-finite float."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))

        # With allow_nan=False the out-of-range number must be refused.
        refused = False
        try:
            json.dumps(parsed, allow_nan=False)
        except ValueError:
            refused = True
        if not refused:
            self.fail("23456789012E666 should be out of range")

View file

@ -0,0 +1,14 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass2.json
JSON = r'''
[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]]
'''
class TestPass2(TestCase):
    """The deeply nested pass2 document must parse and round-trip."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))

View file

@ -0,0 +1,20 @@
from unittest import TestCase
import simplejson as json
# from http://json.org/JSON_checker/test/pass3.json
JSON = r'''
{
"JSON Test Pattern pass3": {
"The outermost value": "must be an object or array.",
"In this test": "It is an object."
}
}
'''
class TestPass3(TestCase):
    """The object-rooted pass3 document must parse and round-trip."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))

View file

@ -0,0 +1,67 @@
from unittest import TestCase
import simplejson as json
class JSONTestObject:
    """Marker type: the class object itself is what the tests encode."""
    pass


class RecursiveJSONEncoder(json.JSONEncoder):
    """Encoder whose ``default`` hook can be switched into a self-referencing mode."""

    # When True, default() answers with a list that contains the marker
    # again, so encoding it re-enters default() for the same object.
    recurse = False

    def default(self, o):
        """Encode the ``JSONTestObject`` marker; defer everything else.

        Returns ``'JSONTestObject'`` (or ``[JSONTestObject]`` when
        ``recurse`` is set) for the marker class.  Any other object is
        passed to the base implementation, which raises ``TypeError``.
        """
        if o is JSONTestObject:
            if self.recurse:
                return [JSONTestObject]
            return 'JSONTestObject'
        # Pass self explicitly: the method is looked up on the class, so
        # leaving it out fails with an argument error instead of the base
        # class's own "not JSON serializable" TypeError.
        return json.JSONEncoder.default(self, o)
class TestRecursion(TestCase):
    """The encoder must reject reference cycles but accept shared references."""

    def test_listrecursion(self):
        # A list that directly contains itself.
        x = []
        x.append(x)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on list recursion")

        # Two lists that contain each other.
        x = []
        y = [x]
        x.append(y)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on alternating list recursion")

        # The same list referenced twice is not a cycle.
        y = []
        x = [y, y]
        # ensure that the marker is cleared
        json.dumps(x)

    def test_dictrecursion(self):
        # A dict that directly contains itself.
        x = {}
        x["test"] = x
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on dict recursion")

        # The same dict referenced twice is not a cycle.
        x = {}
        y = {"a": x, "b": x}
        # ensure that the marker is cleared; dump the outer dict, since
        # dumping the empty inner one would never revisit an object
        json.dumps(y)

    def test_defaultrecursion(self):
        enc = RecursiveJSONEncoder()
        self.assertEqual(enc.encode(JSONTestObject), '"JSONTestObject"')
        # With recurse set, default() keeps returning the marker inside a
        # list, which must be reported as a cycle.
        enc.recurse = True
        try:
            enc.encode(JSONTestObject)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on default recursion")

View file

@ -0,0 +1,111 @@
import sys
import decimal
from unittest import TestCase
import simplejson as json
import simplejson.decoder
class TestScanString(TestCase):
    """Exercise the Python and, when built, C ``scanstring`` implementations."""

    def test_py_scanstring(self):
        self._test_scanstring(simplejson.decoder.py_scanstring)

    def test_c_scanstring(self):
        # Only exercised when the C implementation is available.
        if simplejson.decoder.c_scanstring:
            self._test_scanstring(simplejson.decoder.c_scanstring)

    def _test_scanstring(self, scanstring):
        """Check ``(text, end_index)`` results of *scanstring* on fixed inputs."""
        # A surrogate pair written as escapes decodes to one code point.
        self.assertEqual(
            scanstring('"z\\ud834\\udd20x"', 1, None, True),
            (u'z\U0001d120x', 16))

        # A literal astral character takes two code units on narrow builds,
        # so the end index depends on sys.maxunicode.
        if sys.maxunicode == 65535:
            astral_end = 6
        else:
            astral_end = 5
        self.assertEqual(
            scanstring(u'"z\U0001d120x"', 1, None, True),
            (u'z\U0001d120x', astral_end))

        # (document, start index, expected (text, end index))
        table = [
            ('"\\u007b"', 1, (u'{', 8)),
            ('"A JSON payload should be an object or array, not a string."', 1,
             (u'A JSON payload should be an object or array, not a string.', 60)),
            ('["Unclosed array"', 2, (u'Unclosed array', 17)),
            ('["extra comma",]', 2, (u'extra comma', 14)),
            ('["double extra comma",,]', 2, (u'double extra comma', 21)),
            ('["Comma after the close"],', 2, (u'Comma after the close', 24)),
            ('["Extra close"]]', 2, (u'Extra close', 14)),
            ('{"Extra comma": true,}', 2, (u'Extra comma', 14)),
            ('{"Extra value after close": true} "misplaced quoted value"', 2,
             (u'Extra value after close', 26)),
            ('{"Illegal expression": 1 + 2}', 2, (u'Illegal expression', 21)),
            ('{"Illegal invocation": alert()}', 2, (u'Illegal invocation', 21)),
            ('{"Numbers cannot have leading zeroes": 013}', 2,
             (u'Numbers cannot have leading zeroes', 37)),
            ('{"Numbers cannot be hex": 0x14}', 2, (u'Numbers cannot be hex', 24)),
            ('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21,
             (u'Too deep', 30)),
            ('{"Missing colon" null}', 2, (u'Missing colon', 16)),
            ('{"Double colon":: null}', 2, (u'Double colon', 15)),
            ('{"Comma instead of colon", null}', 2,
             (u'Comma instead of colon', 25)),
            ('["Colon instead of comma": false]', 2,
             (u'Colon instead of comma', 25)),
            ('["Bad value", truth]', 2, (u'Bad value', 12)),
        ]
        for document, start, wanted in table:
            self.assertEqual(scanstring(document, start, None, True), wanted)

    def test_issue3623(self):
        # An unknown encoding name and undecodable bytes must both raise.
        self.assertRaises(ValueError, json.decoder.scanstring,
                          "xxx", 1, "xxx")
        self.assertRaises(UnicodeDecodeError,
                          json.encoder.encode_basestring_ascii, "xx\xff")

View file

@ -0,0 +1,42 @@
import textwrap
from unittest import TestCase
import simplejson as json
# Checks that custom (item, key) separators are honoured by dumps() when
# indenting, and that the result still loads back to the original structure.
class TestSeparators(TestCase):
def test_separators(self):
# Sample structure: nested lists (one empty), plain strings and two dicts.
h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
{'nifty': 87}, {'field': 'yes', 'morefield': False} ]
# Expected output for separators (' ,', ' : '): a space before each comma
# and a space on both sides of each colon.
expect = textwrap.dedent("""\
[
[
"blorpie"
] ,
[
"whoops"
] ,
[] ,
"d-shtaeou" ,
"d-nthiouh" ,
"i-vhbjkhnth" ,
{
"nifty" : 87
} ,
{
"field" : "yes" ,
"morefield" : false
}
]""")
# d1: default compact form; d2: indented form with the custom separators.
d1 = json.dumps(h)
d2 = json.dumps(h, indent=2, sort_keys=True, separators=(' ,', ' : '))
h1 = json.loads(d1)
h2 = json.loads(d2)
# Both encodings must decode back to the original structure.
self.assertEquals(h1, h)
self.assertEquals(h2, h)
self.assertEquals(d2, expect)

View file

@ -0,0 +1,64 @@
from unittest import TestCase
import simplejson as json
class TestUnicode(TestCase):
    """Encoding and decoding of non-ASCII text."""

    def test_encoding1(self):
        # unicode and its UTF-8 bytes must encode identically (encoder object).
        encoder = json.JSONEncoder(encoding='utf-8')
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        raw = text.encode('utf-8')
        from_text = encoder.encode(text)
        from_raw = encoder.encode(raw)
        self.assertEqual(from_text, from_raw)

    def test_encoding2(self):
        # Same check through the module-level dumps().
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        raw = text.encode('utf-8')
        from_text = json.dumps(text, encoding='utf-8')
        from_raw = json.dumps(raw, encoding='utf-8')
        self.assertEqual(from_text, from_raw)

    def test_encoding3(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps(text)
        self.assertEqual(encoded, '"\\u03b1\\u03a9"')

    def test_encoding4(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps([text])
        self.assertEqual(encoded, '["\\u03b1\\u03a9"]')

    def test_encoding5(self):
        # ensure_ascii=False keeps the characters unescaped.
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps(text, ensure_ascii=False)
        self.assertEqual(encoded, u'"%s"' % (text,))

    def test_encoding6(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps([text], ensure_ascii=False)
        self.assertEqual(encoded, u'["%s"]' % (text,))

    def test_big_unicode_encode(self):
        astral = u'\U0001d120'
        self.assertEqual(json.dumps(astral), '"\\ud834\\udd20"')
        self.assertEqual(json.dumps(astral, ensure_ascii=False),
                         u'"\U0001d120"')

    def test_big_unicode_decode(self):
        astral = u'z\U0001d120x'
        self.assertEqual(json.loads('"' + astral + '"'), astral)
        self.assertEqual(json.loads('"z\\ud834\\udd20x"'), astral)

    def test_unicode_decode(self):
        # Every \uXXXX escape below 0xd7ff decodes to that code point.
        for code in range(0xd7ff):
            wanted = unichr(code)
            escaped = '"\\u%04x"' % (code,)
            self.assertEqual(json.loads(escaped), wanted)

    def test_default_encoding(self):
        self.assertEqual(json.loads(u'{"a": "\xe9"}'.encode('utf-8')),
                         {'a': u'\xe9'})

    def test_unicode_preservation(self):
        # Unicode input must yield unicode strings, even for ASCII content.
        self.assertEqual(type(json.loads(u'""')), unicode)
        self.assertEqual(type(json.loads(u'"a"')), unicode)
        self.assertEqual(type(json.loads(u'["a"]')[0]), unicode)

37
simplejson/tool.py Normal file
View file

@ -0,0 +1,37 @@
r"""Command-line tool to validate and pretty-print JSON
Usage::
$ echo '{"json":"obj"}' | python -m simplejson.tool
{
"json": "obj"
}
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
Expecting property name: line 1 column 2 (char 2)
"""
import sys
import simplejson
def main():
    """Validate JSON input and write it back pretty-printed.

    Command line: ``[infile [outfile]]``.  With no arguments the tool reads
    stdin and writes stdout; a first argument names the input file and a
    second the output file.  The document is re-emitted with sorted keys,
    a four-space indent and a trailing newline.

    Raises:
        SystemExit: with a usage string when the argument count is wrong,
            or with the decoder's ``ValueError`` when the input is not
            valid JSON.
    """
    argc = len(sys.argv)
    if argc not in (1, 2, 3):
        raise SystemExit(sys.argv[0] + " [infile [outfile]]")

    infile = sys.stdin
    outfile = sys.stdout
    # Only files opened here are closed here; the standard streams are
    # left alone.
    opened = []
    try:
        if argc >= 2:
            infile = open(sys.argv[1], 'rb')
            opened.append(infile)
        if argc == 3:
            outfile = open(sys.argv[2], 'wb')
            opened.append(outfile)

        try:
            obj = simplejson.load(infile)
        except ValueError:
            # sys.exc_info() fetches the active exception without
            # version-specific ``except`` binding syntax.
            raise SystemExit(sys.exc_info()[1])
        simplejson.dump(obj, outfile, sort_keys=True, indent=4)
        outfile.write('\n')
    finally:
        # Close, and thereby flush, whatever was opened, even on failure.
        for handle in opened:
            handle.close()
# Run the tool only when this file is executed as a script.
if __name__ == '__main__':
main()

BIN
static/ajax-loader.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
static/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

53
utils/remover.py Normal file
View file

@ -0,0 +1,53 @@
#!/usr/bin/env python
# encoding: utf-8
"""
remover.py
Created by Roman on 2010-06-20.
Copyright (c) 2010 __MyCompanyName__. All rights reserved.
"""
import datetime
import logging
from google.appengine.ext.webapp import util
from google.appengine.ext import webapp
from google.appengine.api import users
from ffstorage import *
class Remover(webapp.RequestHandler):
    """Handler for ``/r3m0v3r`` that purges old ``DownloadedFanfic`` records."""

    def get(self):
        """Delete up to 50 of the oldest records that are older than two days.

        Writes ``Deleted instances: <n>`` to the response and logs the same
        line at INFO level.
        """
        logging.debug("Starting r3m0v3r")
        user = users.get_current_user()
        logging.debug("Working as user %s" % user)

        # Cutoff as a full datetime so it can be used as a query filter value.
        # NOTE(review): assumes DownloadedFanfic.date is a date/time property
        # (it is already the sort key below) -- confirm in ffstorage.
        theDate = datetime.datetime.now() - datetime.timedelta(days=2)
        logging.debug("Will delete stuff older than %s" % theDate)

        fics = DownloadedFanfic.all()
        # Apply the cutoff announced in the log line above; without this
        # filter every run removes the 50 oldest records regardless of age.
        fics.filter("date <", theDate)
        fics.order("date")
        results = fics.fetch(50)
        logging.debug([x.name for x in results])

        num = 0
        for d in results:
            d.delete()
            num = num + 1

        logging.info('Deleted instances: %d' % num)
        self.response.out.write('Deleted instances: %d' % num)
def main():
    """Serve the ``/r3m0v3r`` route with the ``Remover`` handler."""
    routes = [('/r3m0v3r', Remover)]
    application = webapp.WSGIApplication(routes, debug=False)
    util.run_wsgi_app(application)
# Script entry point: lower the root logger to DEBUG so the handler's
# logging.debug() messages are emitted, then start the application.
if __name__ == '__main__':
logging.getLogger().setLevel(logging.DEBUG)
main()