mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-04-24 07:52:59 +02:00
Change a couple of the example story URLs.
This commit is contained in:
commit
f3571959df
62 changed files with 16384 additions and 0 deletions
31
app.yaml
Normal file
31
app.yaml
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# Google App Engine application descriptor for the fanfictionloader app.
application: fanfictionloader
version: 2-5-5
runtime: python
api_version: 1

# NOTE(review): handlers are presumably evaluated top to bottom with the
# first matching pattern winning -- confirm against the App Engine docs.
handlers:
# Admin-only endpoint that runs the mock-data generator script.
- url: /generate_mock_data
  script: mocks/generate_mock_data.py
  login: admin

# Admin-only endpoint served by utils/remover.py (cron.yaml requests this
# URL as its "cleanup job").
- url: /r3m0v3r
  script: utils/remover.py
  login: admin

# NOTE(review): same URL pattern as the entry above, so this handler looks
# unreachable if the first match wins -- confirm which script is intended.
- url: /r3m0v3r
  script: main.py
  login: admin

# Static asset directories.
- url: /css
  static_dir: css

- url: /js
  static_dir: js

- url: /static
  static_dir: static

# Every other URL is handled by main.py.
- url: /.*
  script: main.py
|
||||
|
||||
4
cron.yaml
Normal file
4
cron.yaml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# Scheduled jobs: request /r3m0v3r (the "cleanup job") every 3 hours.
cron:
- description: cleanup job
  url: /r3m0v3r
  schedule: every 3 hours
|
||||
71
css/index.css
Normal file
71
css/index.css
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
/* Base font for the whole page. */
body
{
    font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
}

/* Main content column: 43% wide, offset from the left edge. */
#main
{
    width: 43%;
    margin-left: 23%;
    background-color: #dae6ff;
    padding: 2em;
}

#greeting
{
    margin-bottom: 1em;
    border-color: #efefef;
}

/* Hovering one of the form boxes shows a solid border instead of the
   dotted one assigned to the same ids further down. */
#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
{
    border: thin solid #fffeff;
}

h1
{
    text-decoration: none;
}

#logpasswordtable
{
    padding: 1em;
}

/* The login/password elements start out hidden. */
#logpassword, #logpasswordtable {
    display: none;
}

/* Shared spacing and dotted border for every form box. */
#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
{
    margin: 1em;
    padding: 1em;
    border: thin dotted #fffeff;
}

div.field
{
    margin-bottom: 0.5em;
}

#submitbtn
{
    padding: 1em;
}

/* Empty rule: no declarations. */
#typelabel
{
}

#typeoptions
{
    margin-top: 0.5em;
}

/* Error text: small and red. */
#error
{
    font-size: small;
    color: #f00;
}
|
||||
59
delete_fic.py
Normal file
59
delete_fic.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import os
|
||||
import cgi
|
||||
import sys
|
||||
import logging
|
||||
import traceback
|
||||
import StringIO
|
||||
|
||||
from google.appengine.api import users
|
||||
from google.appengine.ext import webapp
|
||||
from google.appengine.ext.webapp import util
|
||||
|
||||
# Wildcard import of the package's downloader module
# (fanficdownloader/downloader.py).
from fanficdownloader.downloader import *
|
||||
from fanficdownloader.ffnet import *
|
||||
from fanficdownloader.output import *
|
||||
|
||||
from google.appengine.ext import db
|
||||
|
||||
from fanficdownloader.zipdir import *
|
||||
|
||||
from ffstorage import *
|
||||
|
||||
def create_mac(user, fic_id, fic_url):
    """Build the check string that guards deletion of a stored fic.

    The result is the decimal text of ``abs(hash(user) + hash(fic_id))``
    immediately followed by the decimal text of ``abs(hash(fic_url))``.

    NOTE(review): this relies on the built-in ``hash()``, not on a keyed
    digest, so it carries no secret -- confirm that is acceptable.
    """
    owner_part = abs(hash(user) + hash(fic_id))
    url_part = abs(hash(fic_url))
    return str(owner_part) + str(url_part)
|
||||
|
||||
def check_mac(user, fic_id, fic_url, mac):
    """Tell whether ``mac`` equals the value create_mac() yields for the
    given user, fic id and fic URL.

    Returns:
        True when the two strings are equal, False otherwise.
    """
    expected = create_mac(user, fic_id, fic_url)
    return expected == mac
|
||||
|
||||
def create_mac_for_fic(user, fic_id):
    """Return the MAC for the stored fic ``fic_id`` if ``user`` owns it.

    Args:
        user: the user the fic has to belong to.
        fic_id: encoded datastore key of the fic.

    Returns:
        The string produced by create_mac(), or None when nothing is stored
        under that key or the stored fic belongs to a different user.
    """
    key = db.Key(fic_id)
    fanfic = db.get(key)
    # db.get() gives None for a key without an entity; answer None in that
    # case instead of failing on ``fanfic.user``.
    if fanfic is None or fanfic.user != user:
        return None
    # The Key object (not the raw fic_id string) is what gets hashed.
    return create_mac(user, key, fanfic.url)
|
||||
|
||||
class DeleteFicHandler(webapp.RequestHandler):
    """Request handler that deletes one of the current user's stored fics."""

    def get(self):
        """Delete the fic named by ``fic_id`` when ``key_id`` carries its MAC.

        Anonymous visitors are redirected to ``/login``.  When the MAC does
        not match, "Ooops" is written to the response; when it matches, the
        entity is deleted and a redirect to ``/recent`` is issued.  In both
        of those cases the user's fic list is then rendered with
        ``recent.html``.
        """
        user = users.get_current_user()
        if not user:
            self.redirect('/login')
            # Stop here: everything below needs a signed-in user
            # (user.nickname() cannot be called on None).
            return

        fic_id = self.request.get('fic_id')
        fic_mac = self.request.get('key_id')

        actual_mac = create_mac_for_fic(user, fic_id)
        if actual_mac != fic_mac:
            self.response.out.write("Ooops")
        else:
            key = db.Key(fic_id)
            fanfic = db.get(key)
            fanfic.delete()
            self.redirect('/recent')

        fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        # NOTE(review): ``template`` is not imported by name in this file;
        # presumably one of the wildcard imports provides it -- confirm.
        self.response.out.write(template.render(path, template_values))
|
||||
|
||||
2014
fanficdownloader/BeautifulSoup.py
Normal file
2014
fanficdownloader/BeautifulSoup.py
Normal file
File diff suppressed because it is too large
Load diff
1
fanficdownloader/__init__.py
Normal file
1
fanficdownloader/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
229
fanficdownloader/adapter.py
Normal file
229
fanficdownloader/adapter.py
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import datetime
|
||||
from output import makeAcceptableFilename
|
||||
|
||||
# Probe for the Google App Engine urlfetch API.  appEngineGlob records
# whether the import worked; FanfictionSiteAdapter copies it into its
# ``appEngine`` attribute.
try:
    from google.appengine.api.urlfetch import fetch as googlefetch
    appEngineGlob = True
except ImportError:
    # Only a missing App Engine SDK means "not on App Engine"; any other
    # failure is left to surface.
    appEngineGlob = False
|
||||
|
||||
class LoginRequiredException(Exception):
    """Raised for a story URL that needs a logged-in user.

    The offending URL is kept on the ``url`` attribute.
    """

    def __init__(self, url):
        self.url = url

    def __str__(self):
        # The text is the repr() of the message, so it comes out quoted.
        message = self.url + ' requires user to be logged in'
        return repr(message)
|
||||
|
||||
class StoryArchivedAlready(Exception):
    """An archive file for the story is already present."""


class StoryDoesNotExist(Exception):
    """The requested story does not exist."""


class FailedToDownload(Exception):
    """The story could not be downloaded."""


class InvalidStoryURL(Exception):
    """The given URL is not a story URL."""
|
||||
|
||||
class FanfictionSiteAdapter:
    """Base class for site adapters: story metadata plus its accessors.

    The class-level attributes below are defaults.  The hooks
    (requiresLogin, performLogin, extractIndividualUrls, getText) are
    placeholders here; most getters log the value at DEBUG level and
    return it.
    """
    appEngine = appEngineGlob
    login = ''
    password = ''
    url = ''
    host = ''
    path = ''
    uuid = ''
    storyName = ''
    storyId = ''
    authorName = ''
    authorId = ''
    authorURL = ''
    outputStorySep = '-Ukn_'
    outputName = ''
    outputFileName = ''
    storyDescription = ''
    # Default only: instances receive their own list before it is mutated
    # (see __init__ and _addUnique).
    storyCharacters = []
    storySeries = ''
    storyPublished = datetime.date(1970, 1, 31)
    storyCreated = datetime.datetime.now()
    storyUpdated = datetime.date(1970, 1, 31)
    languageId = 'en-UK'
    language = 'English'
    # Default only: instances receive their own list before it is mutated
    # (see __init__ and _addUnique).
    subjects = []
    publisher = ''
    numChapters = '0'
    numWords = '0'
    genre = ''
    category = ''
    storyStatus = 'In-Progress'
    storyRating = ''
    storyUserRating = '0'

    def __init__(self, url):
        """Remember ``url`` and split it into ``host`` and ``path``."""
        # Imported locally: this module has no file-level URL-parsing import.
        try:
            from urlparse import urlparse  # Python 2
        except ImportError:
            from urllib.parse import urlparse  # Python 3
        # basic plain url parsing...
        self.url = url
        parsedUrl = urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        # Per-instance lists so that adapters never share characters or
        # subjects through the class-level defaults.
        self.storyCharacters = []
        self.subjects = []

    def _addUnique(self, attrName, value):
        """Append ``value`` to the list attribute ``attrName`` unless an
        entry that matches case-insensitively is already there.

        Returns True when the value was appended, False otherwise.
        """
        current = getattr(self, attrName)
        wanted = value.upper()
        for existing in current:
            if existing.upper() == wanted:
                return False
        if attrName not in self.__dict__:
            # Still looking at the class-level default (the instance never
            # assigned its own list): copy it before mutating.
            current = list(current)
            setattr(self, attrName, current)
        current.append(value)
        return True

    def hasAppEngine(self):
        """Return the ``appEngine`` flag."""
        return self.appEngine

    def fetchUrl(self, url):
        """Fetch ``url`` and return the response body.

        Without App Engine the body is read through ``self.opener`` and
        decoded as UTF-8; with App Engine the ``content`` of the urlfetch
        result is returned as is.
        """
        if not self.appEngine:
            # NOTE(review): ``self.opener`` is not set anywhere in this
            # class; presumably subclasses provide it -- confirm.
            return self.opener.open(url).read().decode('utf-8')
        else:
            return googlefetch(url).content

    def requiresLogin(self, url = None):
        """Placeholder hook: always False in the base class."""
        return False

    def performLogin(self, url = None):
        """Placeholder hook: always True in the base class."""
        return True

    def extractIndividualUrls(self):
        """Placeholder hook: does nothing in the base class."""
        pass

    def getText(self, url):
        """Placeholder hook: does nothing in the base class."""
        pass

    def setLogin(self, login):
        """Store the login name."""
        self.login = login

    def setPassword(self, password):
        """Store the password."""
        self.password = password

    def getHost(self):
        """Return the host part of the story URL."""
        logging.debug('self.host=%s', self.host)
        return self.host

    def getUUID(self):
        """Rebuild ``self.uuid`` from host, author id and story id and
        return it."""
        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s', self.uuid)
        return self.uuid

    def getOutputName(self):
        """Rebuild ``self.outputName`` and return it.

        The name is the story name with spaces turned into underscores,
        followed by ``outputStorySep`` and the story id, passed through
        makeAcceptableFilename().
        """
        self.outputName = makeAcceptableFilename(self.storyName.replace(" ", "_") + self.outputStorySep + self.storyId)
        logging.debug('self.outputName=%s', self.outputName)
        return self.outputName

    def getOutputFileName(self, booksDirectory, bookExt):
        """Rebuild ``self.outputFileName`` as
        ``booksDirectory + "/" + outputName + bookExt`` and return it."""
        self.getOutputName() # make sure self.outputName is populated
        self.outputFileName = booksDirectory + "/" + self.outputName + bookExt
        logging.debug('self.outputFileName=%s', self.outputFileName)
        return self.outputFileName

    def getAuthorURL(self):
        """Return the author URL."""
        logging.debug('self.authorURL=%s', self.authorURL)
        return self.authorURL

    def getAuthorId(self):
        """Return the author id."""
        logging.debug('self.authorId=%s', self.authorId)
        return self.authorId

    def getAuthorName(self):
        """Return the author name."""
        logging.debug('self.authorName=%s', self.authorName)
        return self.authorName

    def getStoryURL(self):
        """Return the story URL."""
        logging.debug('self.url=%s', self.url)
        return self.url

    def getStoryId(self):
        """Return the story id."""
        logging.debug('self.storyId=%s', self.storyId)
        return self.storyId

    def getStoryName(self):
        """Return the story name."""
        logging.debug('self.storyName=%s', self.storyName)
        return self.storyName

    def getStoryDescription(self):
        """Return the story description."""
        logging.debug('self.storyDescription=%s', self.storyDescription)
        return self.storyDescription

    def getStoryCreated(self):
        """Reset ``self.storyCreated`` to the current time and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s', self.storyCreated)
        return self.storyCreated

    def addCharacter(self, character):
        """Add ``character`` unless it is already listed (case-insensitive).

        Returns True when it was added, False when it was a duplicate.
        """
        return self._addUnique('storyCharacters', character)

    def getStoryCharacters(self):
        """Return the list of characters."""
        logging.debug('self.storyCharacters=%s', self.storyCharacters)
        return self.storyCharacters

    def getStoryPublished(self):
        """Return the publication date."""
        logging.debug('self.storyPublished=%s', self.storyPublished)
        return self.storyPublished

    def getStoryUpdated(self):
        """Return the last-updated date."""
        logging.debug('self.storyUpdated=%s', self.storyUpdated)
        return self.storyUpdated

    def getStorySeries(self):
        """Return the series name."""
        logging.debug('self.storySeries=%s', self.storySeries)
        return self.storySeries

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s', self.language)
        return self.language

    def getLanguageId(self):
        """Return the language id."""
        logging.debug('self.languageId=%s', self.languageId)
        return self.languageId

    def addSubject(self, subject):
        """Add ``subject`` unless it is already listed (case-insensitive).

        Returns True when it was added, False when it was a duplicate.
        """
        return self._addUnique('subjects', subject)

    def getSubjects(self):
        """Return the list of subjects."""
        logging.debug('self.subjects=%s', self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher."""
        logging.debug('self.publisher=%s', self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the chapter count."""
        logging.debug('self.numChapters=%s', self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count."""
        logging.debug('self.numWords=%s', self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the category."""
        logging.debug('self.category=%s', self.category)
        return self.category

    def getGenre(self):
        """Return the genre."""
        logging.debug('self.genre=%s', self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status."""
        logging.debug('self.storyStatus=%s', self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating."""
        logging.debug('self.storyRating=%s', self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s', self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return ``url`` unchanged."""
        return url
|
||||
0
fanficdownloader/books/place holder.txt
Normal file
0
fanficdownloader/books/place holder.txt
Normal file
542
fanficdownloader/constants.py
Normal file
542
fanficdownloader/constants.py
Normal file
|
|
@ -0,0 +1,542 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
CSS = '''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
|
||||
pre { font-size: x-small; }
|
||||
sml { font-size: small; }
|
||||
h1 { text-align: center; }
|
||||
h2 { text-align: center; }
|
||||
h3 { text-align: center; }
|
||||
h4 { text-align: center; }
|
||||
h5 { text-align: center; }
|
||||
h6 { text-align: center; }
|
||||
h7 { text-align: left; font-size: large; font-weight: bold; }
|
||||
.CI {
|
||||
text-align:center;
|
||||
margin-top:0px;
|
||||
margin-bottom:0px;
|
||||
padding:0px;
|
||||
}
|
||||
.center {text-align: center;}
|
||||
.cover {text-align: center;}
|
||||
.full {width: 100%; }
|
||||
.quarter {width: 25%; }
|
||||
.smcap {font-variant: small-caps;}
|
||||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
'''
|
||||
|
||||
MIMETYPE = '''application/epub+zip'''
|
||||
|
||||
TITLE_HEADER = '''<?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
||||
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head><body>
|
||||
<p><h7 id="lnks"><b><a id="StoryLink" href="%s">%s</a></b> by <b><a id="AuthorLink" href="%s">%s</a></b></h7></p>
|
||||
<table class="full">
|
||||
'''
|
||||
|
||||
TITLE_ENTRY = '''<tr><td><b>%s</b></td><td>%s</td></tr>
|
||||
'''
|
||||
|
||||
TITLE_FOOTER = '''</table>
|
||||
<p><b>Summary:</b><br />%s</p>
|
||||
</body></html>
|
||||
'''
|
||||
|
||||
CONTAINER = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||
<rootfiles>
|
||||
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||
</rootfiles>
|
||||
</container>
|
||||
'''
|
||||
|
||||
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
|
||||
unique-identifier="fanficdownloader-uuid">
|
||||
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dcterms="http://purl.org/dc/terms/"
|
||||
xmlns:opf="http://www.idpf.org/2007/opf"
|
||||
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
|
||||
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
|
||||
<dc:title>%s</dc:title>
|
||||
<dc:creator opf:role="aut">%s</dc:creator>
|
||||
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
|
||||
<dc:language>%s</dc:language>
|
||||
<dc:rights></dc:rights>
|
||||
<dc:date opf:event="publication">%s</dc:date>
|
||||
<dc:date opf:event="creation">%s</dc:date>
|
||||
<dc:date opf:event="modification">%s</dc:date>
|
||||
<meta name="calibre:timestamp" content="%s"/>
|
||||
<dc:description>%s</dc:description>
|
||||
'''
|
||||
|
||||
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
|
||||
<dc:identifier id="BookId">%s</dc:identifier>
|
||||
<dc:identifier opf:scheme="URL">%s</dc:identifier>
|
||||
<dc:source>%s</dc:source>
|
||||
<dc:type>FanFiction</dc:type>
|
||||
<meta name="calibre:rating" content="%s"/>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
|
||||
<item id="style" href="stylesheet.css" media-type="text/css" />
|
||||
'''
|
||||
|
||||
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
|
||||
'''
|
||||
|
||||
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
|
||||
'''
|
||||
|
||||
CONTENT_END_MANIFEST = ''' </manifest>
|
||||
<spine toc="ncx">
|
||||
'''
|
||||
|
||||
CONTENT_ITEMREF = ''' <itemref idref="%s" />
|
||||
'''
|
||||
|
||||
CONTENT_END = ''' </spine>
|
||||
</package>
|
||||
'''
|
||||
|
||||
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
|
||||
<head>
|
||||
<meta name="dtb:uid" content="%s"/>
|
||||
<meta name="dtb:depth" content="1"/>
|
||||
<meta name="dtb:totalPageCount" content="0"/>
|
||||
<meta name="dtb:maxPageNumber" content="0"/>
|
||||
</head>
|
||||
<docTitle>
|
||||
<text>%s</text>
|
||||
</docTitle>
|
||||
<navMap>
|
||||
'''
|
||||
|
||||
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
|
||||
<navLabel>
|
||||
<text>%s</text>
|
||||
</navLabel>
|
||||
<content src="%s"/>
|
||||
</navPoint>
|
||||
'''
|
||||
|
||||
TOC_END = '''</navMap>
|
||||
</ncx>
|
||||
'''
|
||||
|
||||
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>%s</title>
|
||||
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<h3>%s</h3>
|
||||
'''
|
||||
|
||||
XHTML_END = '''</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
|
||||
'blockquote', 'br', 'center', 'cite', 'code', 'col',
|
||||
'colgroup', 'dd', 'del', 'dfn', 'dir', 'dl', 'dt', 'em',
|
||||
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
|
||||
'ins', 'kbd', 'label', 'li', 'ol',
|
||||
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
|
||||
'strong', 'sub', 'sup', 'u', 'ul']
|
||||
|
||||
acceptable_attributes = ['href']
|
||||
|
||||
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
|
||||
entities = { 'á' : 'á',
|
||||
'Á' : 'Á',
|
||||
'Á' : 'Á',
|
||||
'á' : 'á',
|
||||
'â' : 'â',
|
||||
'Â' : 'Â',
|
||||
'Â' : 'Â',
|
||||
'â' : 'â',
|
||||
'´' : '´',
|
||||
'´' : '´',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'à' : 'à',
|
||||
'À' : 'À',
|
||||
'À' : 'À',
|
||||
'à' : 'à',
|
||||
'ℵ' : 'ℵ',
|
||||
'α' : 'α',
|
||||
'Α' : 'Α',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'∧' : '∧',
|
||||
'∠' : '∠',
|
||||
'å' : 'å',
|
||||
'Å' : 'Å',
|
||||
'Å' : 'Å',
|
||||
'å' : 'å',
|
||||
'≈' : '≈',
|
||||
'ã' : 'ã',
|
||||
'Ã' : 'Ã',
|
||||
'Ã' : 'Ã',
|
||||
'ã' : 'ã',
|
||||
'ä' : 'ä',
|
||||
'Ä' : 'Ä',
|
||||
'Ä' : 'Ä',
|
||||
'ä' : 'ä',
|
||||
'„' : '„',
|
||||
'β' : 'β',
|
||||
'Β' : 'Β',
|
||||
'¦' : '¦',
|
||||
'¦' : '¦',
|
||||
'•' : '•',
|
||||
'∩' : '∩',
|
||||
'ç' : 'ç',
|
||||
'Ç' : 'Ç',
|
||||
'Ç' : 'Ç',
|
||||
'ç' : 'ç',
|
||||
'¸' : '¸',
|
||||
'¸' : '¸',
|
||||
'¢' : '¢',
|
||||
'¢' : '¢',
|
||||
'χ' : 'χ',
|
||||
'Χ' : 'Χ',
|
||||
'ˆ' : 'ˆ',
|
||||
'♣' : '♣',
|
||||
'≅' : '≅',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'↵' : '↵',
|
||||
'∪' : '∪',
|
||||
'¤' : '¤',
|
||||
'¤' : '¤',
|
||||
'†' : '†',
|
||||
'‡' : '‡',
|
||||
'↓' : '↓',
|
||||
'⇓' : '⇓',
|
||||
'°' : '°',
|
||||
'°' : '°',
|
||||
'δ' : 'δ',
|
||||
'Δ' : 'Δ',
|
||||
'♦' : '♦',
|
||||
'÷' : '÷',
|
||||
'÷' : '÷',
|
||||
'é' : 'é',
|
||||
'É' : 'É',
|
||||
'É' : 'É',
|
||||
'é' : 'é',
|
||||
'ê' : 'ê',
|
||||
'Ê' : 'Ê',
|
||||
'Ê' : 'Ê',
|
||||
'ê' : 'ê',
|
||||
'è' : 'è',
|
||||
'È' : 'È',
|
||||
'È' : 'È',
|
||||
'è' : 'è',
|
||||
'∅' : '∅',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'ε' : 'ε',
|
||||
'Ε' : 'Ε',
|
||||
'≡' : '≡',
|
||||
'η' : 'η',
|
||||
'Η' : 'Η',
|
||||
'ð' : 'ð',
|
||||
'Ð' : 'Ð',
|
||||
'Ð' : 'Ð',
|
||||
'ð' : 'ð',
|
||||
'ë' : 'ë',
|
||||
'Ë' : 'Ë',
|
||||
'Ë' : 'Ë',
|
||||
'ë' : 'ë',
|
||||
'€' : '€',
|
||||
'∃' : '∃',
|
||||
'ƒ' : 'ƒ',
|
||||
'∀' : '∀',
|
||||
'½' : '½',
|
||||
'½' : '½',
|
||||
'¼' : '¼',
|
||||
'¼' : '¼',
|
||||
'¾' : '¾',
|
||||
'¾' : '¾',
|
||||
'⁄' : '⁄',
|
||||
'γ' : 'γ',
|
||||
'Γ' : 'Γ',
|
||||
'≥' : '≥',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'↔' : '↔',
|
||||
'⇔' : '⇔',
|
||||
'♥' : '♥',
|
||||
'…' : '…',
|
||||
'í' : 'í',
|
||||
'Í' : 'Í',
|
||||
'Í' : 'Í',
|
||||
'í' : 'í',
|
||||
'î' : 'î',
|
||||
'Î' : 'Î',
|
||||
'Î' : 'Î',
|
||||
'î' : 'î',
|
||||
'¡' : '¡',
|
||||
'¡' : '¡',
|
||||
'ì' : 'ì',
|
||||
'Ì' : 'Ì',
|
||||
'Ì' : 'Ì',
|
||||
'ì' : 'ì',
|
||||
'ℑ' : 'ℑ',
|
||||
'∞' : '∞',
|
||||
'∫' : '∫',
|
||||
'ι' : 'ι',
|
||||
'Ι' : 'Ι',
|
||||
'¿' : '¿',
|
||||
'¿' : '¿',
|
||||
'∈' : '∈',
|
||||
'ï' : 'ï',
|
||||
'Ï' : 'Ï',
|
||||
'Ï' : 'Ï',
|
||||
'ï' : 'ï',
|
||||
'κ' : 'κ',
|
||||
'Κ' : 'Κ',
|
||||
'λ' : 'λ',
|
||||
'Λ' : 'Λ',
|
||||
'«' : '«',
|
||||
'«' : '«',
|
||||
'←' : '←',
|
||||
'⇐' : '⇐',
|
||||
'⌈' : '⌈',
|
||||
'“' : '“',
|
||||
'≤' : '≤',
|
||||
'⌊' : '⌊',
|
||||
'∗' : '∗',
|
||||
'◊' : '◊',
|
||||
'‎' : '',
|
||||
'‹' : '‹',
|
||||
'‘' : '‘',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'¯' : '¯',
|
||||
'¯' : '¯',
|
||||
'—' : '—',
|
||||
'µ' : 'µ',
|
||||
'µ' : 'µ',
|
||||
'·' : '·',
|
||||
'·' : '·',
|
||||
'−' : '−',
|
||||
'μ' : 'μ',
|
||||
'Μ' : 'Μ',
|
||||
'∇' : '∇',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'–' : '–',
|
||||
'≠' : '≠',
|
||||
'∋' : '∋',
|
||||
'¬' : '¬',
|
||||
'¬' : '¬',
|
||||
'∉' : '∉',
|
||||
'⊄' : '⊄',
|
||||
'ñ' : 'ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'ñ' : 'ñ',
|
||||
'ν' : 'ν',
|
||||
'Ν' : 'Ν',
|
||||
'ó' : 'ó',
|
||||
'Ó' : 'Ó',
|
||||
'Ó' : 'Ó',
|
||||
'ó' : 'ó',
|
||||
'ô' : 'ô',
|
||||
'Ô' : 'Ô',
|
||||
'Ô' : 'Ô',
|
||||
'ô' : 'ô',
|
||||
'Œ' : 'Œ',
|
||||
'œ' : 'œ',
|
||||
'ò' : 'ò',
|
||||
'Ò' : 'Ò',
|
||||
'Ò' : 'Ò',
|
||||
'ò' : 'ò',
|
||||
'‾' : '‾',
|
||||
'ω' : 'ω',
|
||||
'Ω' : 'Ω',
|
||||
'ο' : 'ο',
|
||||
'Ο' : 'Ο',
|
||||
'⊕' : '⊕',
|
||||
'∨' : '∨',
|
||||
'ª' : 'ª',
|
||||
'ª' : 'ª',
|
||||
'º' : 'º',
|
||||
'º' : 'º',
|
||||
'ø' : 'ø',
|
||||
'Ø' : 'Ø',
|
||||
'Ø' : 'Ø',
|
||||
'ø' : 'ø',
|
||||
'õ' : 'õ',
|
||||
'Õ' : 'Õ',
|
||||
'Õ' : 'Õ',
|
||||
'õ' : 'õ',
|
||||
'⊗' : '⊗',
|
||||
'ö' : 'ö',
|
||||
'Ö' : 'Ö',
|
||||
'Ö' : 'Ö',
|
||||
'ö' : 'ö',
|
||||
'¶' : '¶',
|
||||
'¶' : '¶',
|
||||
'∂' : '∂',
|
||||
'‰' : '‰',
|
||||
'⊥' : '⊥',
|
||||
'φ' : 'φ',
|
||||
'Φ' : 'Φ',
|
||||
'π' : 'π',
|
||||
'Π' : 'Π',
|
||||
'ϖ' : 'ϖ',
|
||||
'±' : '±',
|
||||
'±' : '±',
|
||||
'£' : '£',
|
||||
'£' : '£',
|
||||
'′' : '′',
|
||||
'″' : '″',
|
||||
'∏' : '∏',
|
||||
'∝' : '∝',
|
||||
'ψ' : 'ψ',
|
||||
'Ψ' : 'Ψ',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'√' : '√',
|
||||
'»' : '»',
|
||||
'»' : '»',
|
||||
'→' : '→',
|
||||
'⇒' : '⇒',
|
||||
'⌉' : '⌉',
|
||||
'”' : '”',
|
||||
'ℜ' : 'ℜ',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'⌋' : '⌋',
|
||||
'ρ' : 'ρ',
|
||||
'Ρ' : 'Ρ',
|
||||
'‏' : '',
|
||||
'›' : '›',
|
||||
'’' : '’',
|
||||
'‚' : '‚',
|
||||
'š' : 'š',
|
||||
'Š' : 'Š',
|
||||
'⋅' : '⋅',
|
||||
'§' : '§',
|
||||
'§' : '§',
|
||||
'­' : '', # strange optional hyphenation control character, not just a dash
|
||||
'­' : '',
|
||||
'σ' : 'σ',
|
||||
'Σ' : 'Σ',
|
||||
'ς' : 'ς',
|
||||
'∼' : '∼',
|
||||
'♠' : '♠',
|
||||
'⊂' : '⊂',
|
||||
'⊆' : '⊆',
|
||||
'∑' : '∑',
|
||||
'¹' : '¹',
|
||||
'¹' : '¹',
|
||||
'²' : '²',
|
||||
'²' : '²',
|
||||
'³' : '³',
|
||||
'³' : '³',
|
||||
'⊃' : '⊃',
|
||||
'⊇' : '⊇',
|
||||
'ß' : 'ß',
|
||||
'ß' : 'ß',
|
||||
'τ' : 'τ',
|
||||
'Τ' : 'Τ',
|
||||
'∴' : '∴',
|
||||
'θ' : 'θ',
|
||||
'Θ' : 'Θ',
|
||||
'ϑ' : 'ϑ',
|
||||
' ' : ' ',
|
||||
'þ' : 'þ',
|
||||
'Þ' : 'Þ',
|
||||
'Þ' : 'Þ',
|
||||
'þ' : 'þ',
|
||||
'˜' : '˜',
|
||||
'×' : '×',
|
||||
'×' : '×',
|
||||
'™' : '™',
|
||||
'ú' : 'ú',
|
||||
'Ú' : 'Ú',
|
||||
'Ú' : 'Ú',
|
||||
'ú' : 'ú',
|
||||
'↑' : '↑',
|
||||
'⇑' : '⇑',
|
||||
'û' : 'û',
|
||||
'Û' : 'Û',
|
||||
'Û' : 'Û',
|
||||
'û' : 'û',
|
||||
'ù' : 'ù',
|
||||
'Ù' : 'Ù',
|
||||
'Ù' : 'Ù',
|
||||
'ù' : 'ù',
|
||||
'¨' : '¨',
|
||||
'¨' : '¨',
|
||||
'ϒ' : 'ϒ',
|
||||
'υ' : 'υ',
|
||||
'Υ' : 'Υ',
|
||||
'ü' : 'ü',
|
||||
'Ü' : 'Ü',
|
||||
'Ü' : 'Ü',
|
||||
'ü' : 'ü',
|
||||
'℘' : '℘',
|
||||
'ξ' : 'ξ',
|
||||
'Ξ' : 'Ξ',
|
||||
'ý' : 'ý',
|
||||
'Ý' : 'Ý',
|
||||
'Ý' : 'Ý',
|
||||
'ý' : 'ý',
|
||||
'¥' : '¥',
|
||||
'¥' : '¥',
|
||||
'ÿ' : 'ÿ',
|
||||
'Ÿ' : 'Ÿ',
|
||||
'ÿ' : 'ÿ',
|
||||
'ζ' : 'ζ',
|
||||
'Ζ' : 'Ζ',
|
||||
'‍' : '', # strange spacing control character, not just a space
|
||||
'‌' : '', # strange spacing control character, not just a space
|
||||
}
|
||||
|
||||
FB2_PROLOGUE = '<FictionBook>'
|
||||
FB2_DESCRIPTION = '''<description>
|
||||
<title-info>
|
||||
<genre>fanfiction</genre>
|
||||
<author>
|
||||
<first-name></first-name>
|
||||
<middle-name></middle-name>
|
||||
<last-name>%s</last-name>
|
||||
</author>
|
||||
<book-title>%s</book-title>
|
||||
<lang>eng</lang>
|
||||
</title-info>
|
||||
<document-info>
|
||||
<author>
|
||||
<nickname>sgzmd</nickname>
|
||||
</author>
|
||||
<date value="%s">%s</date>
|
||||
<id>sgzmd_%s</id>
|
||||
<version>2.0</version>
|
||||
</document-info>
|
||||
</description>'''
|
||||
|
||||
HTML_ESC_Definitions = 'HTML_Escape.def'
|
||||
205
fanficdownloader/downloader.py
Normal file
205
fanficdownloader/downloader.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import getpass
|
||||
import logging
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import zipdir
|
||||
|
||||
import output
|
||||
import adapter
|
||||
from adapter import StoryArchivedAlready
|
||||
from adapter import StoryDoesNotExist
|
||||
from adapter import FailedToDownload
|
||||
from adapter import InvalidStoryURL
|
||||
from adapter import LoginRequiredException
|
||||
import ffnet
|
||||
import fpcom
|
||||
import ficwad
|
||||
import fictionalley
|
||||
import hpfiction
|
||||
import twilighted
|
||||
import potionsNsnitches
|
||||
import mediaminer
|
||||
|
||||
import time
|
||||
|
||||
class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers.

    An adapter supplies the chapter URLs and texts; an instance of
    ``writerClass`` receives each chapter and produces the book.
    '''
    booksDirectory = "books"
    standAlone = False

    def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True, overwrite=False):
        """Store the collaborators and options.

        Args:
            adapter: site adapter the story is read from.
            writerClass: class instantiated in download() to write chapters.
            quiet: when true, no per-chapter progress line is printed.
            inmemory: passed on to the writer; when true, download()
                returns the writer's output value.
            compress: passed on to the writer.
            overwrite: when true, the existing-archive check is skipped.
        """
        self.adapter = adapter
        self.writerClass = writerClass
        self.quiet = quiet
        self.inmemory = inmemory
        self.compress = compress
        self.badLogin = False
        self.overWrite = overwrite

    def getBooksDirectory(self):
        """Return the output directory."""
        return self.booksDirectory

    def setBooksDirectory(self, bd):
        """Set the output directory and return the new value."""
        self.booksDirectory = bd
        return self.booksDirectory

    def getStandAlone(self):
        """Return the stand-alone flag."""
        return self.standAlone

    def setStandAlone(self, sa):
        """Set the stand-alone flag and return the new value."""
        self.standAlone = sa
        return self.standAlone

    def getOverWrite(self):
        """Return the overwrite flag."""
        return self.overWrite

    def setOverWrite(self, sa):
        """Set the overwrite flag and return the new value."""
        self.overWrite = sa
        return self.overWrite

    def getAdapter(self):
        """Return the adapter passed to the constructor."""
        return self.adapter

    def download(self):
        """Fetch every chapter through the adapter and hand it to the writer.

        Returns:
            The writer's output value when ``inmemory`` is set, else None.

        Raises:
            LoginRequiredException: the adapter needs a login and
                performLogin() reported failure.
            StoryArchivedAlready: in stand-alone, non-overwrite mode when
                zipdir.checkNewer() reports the existing archive as current.
        """
        logging.debug("Trying to download the story")
        if self.adapter.requiresLogin():
            logging.debug("Story requires login")
            if not self.adapter.performLogin():
                logging.debug("Login/password problem")
                self.badLogin = True
                # Use the directly imported class: the module-level name
                # ``adapter`` can be rebound by the script section.
                raise LoginRequiredException(self.adapter.url)

        urls = self.adapter.extractIndividualUrls()

        logging.debug("self.writerClass=%s", self.writerClass)
        if self.standAlone and not self.inmemory:
            s = self.adapter.getOutputFileName(self.booksDirectory, self.writerClass.getFormatExt())
            logging.debug("Always overwrite? %s", self.overWrite)
            if not self.overWrite:
                logging.debug("Checking if current archive of the story exists. Filename=%s", s)
                if not zipdir.checkNewer ( s, self.adapter.getStoryUpdated() ):
                    raise StoryArchivedAlready("A Current archive file \"" + s + "\" already exists! Skipping!")
            else:
                logging.debug("Do not check for existance of archive file.")

        self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress)

        total = len(urls)
        # Chapters are numbered from 1 in the order the adapter listed them.
        for i, (u, n) in enumerate(urls, 1):
            if not self.quiet:
                print('Downloading chapter %d/%d' % (i, total))
            text = self.adapter.getText(u)
            self.writer.writeChapter(i, n, text)

        self.writer.finalise()

        if self.inmemory:
            self.name = self.writer.name
            return self.writer.output.getvalue()
|
||||
|
||||
|
||||
# Command-line entry point: downloader.py URL [Type]
#
# Exit codes: -1 usage error, 0 success (or fanficauthors URL), 1 no adapter
# for the URL, 2 download failed, 3 invalid story URL, 10 archive already
# current, 20 story does not exist, 30 login required, 99 any other error.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    argvlen = len(sys.argv)
    url = None
    bookFormat = 'epub'
    if argvlen > 1:
        url = sys.argv[1]
    if argvlen > 2:
        bookFormat = sys.argv[2]

    if url is None:
        sys.stderr.write("Usage: downloader.py URL Type\n")
        sys.exit(-1)

    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')
    url = url.strip()
    # Named storyAdapter so the imported ``adapter`` module stays reachable
    # under its own name.
    storyAdapter = None
    writerClass = None

    if url.find('fanficauthors') != -1:
        sys.stderr.write("fanficauthors.net already provides ebooks\n")
        sys.exit(0)
    elif url.find('fictionalley') != -1:
        storyAdapter = fictionalley.FictionAlley(url)
    elif url.find('ficwad') != -1:
        storyAdapter = ficwad.FicWad(url)
    elif url.find('fanfiction.net') != -1:
        storyAdapter = ffnet.FFNet(url)
    elif url.find('fictionpress.com') != -1:
        storyAdapter = fpcom.FPCom(url)
    elif url.find('harrypotterfanfiction.com') != -1:
        storyAdapter = hpfiction.HPFiction(url)
    elif url.find('twilighted.net') != -1:
        storyAdapter = twilighted.Twilighted(url)
    elif url.find('potionsandsnitches.net') != -1:
        storyAdapter = potionsNsnitches.PotionsNSnitches(url)
    elif url.find('mediaminer.org') != -1:
        storyAdapter = mediaminer.MediaMiner(url)
    else:
        sys.stderr.write("Oi! I can haz not appropriate adapter for URL %s!\n" % url)
        sys.exit(1)

    if bookFormat == 'epub':
        writerClass = output.EPubFanficWriter
    elif bookFormat == 'html':
        writerClass = output.HTMLWriter
    elif bookFormat == 'text':
        writerClass = output.TextWriter
    else:
        # Fail with a clear message instead of passing writerClass=None on
        # to the loader; 99 is the exit code that path ended with before.
        sys.stderr.write("Unsupported book format '%s' (expected epub, html or text)\n" % bookFormat)
        sys.exit(99)

    if storyAdapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        # Echo the login only; the password must not be shown on screen.
        print("Login: `%s`" % login)

        storyAdapter.setLogin(login)
        storyAdapter.setPassword(password)

    loader = FanficLoader(storyAdapter, writerClass)
    loader.setStandAlone(True)
    if bookFormat != 'epub':
        loader.setOverWrite(True)

    try:
        loader.download()
    except FailedToDownload as ftd:
        sys.stderr.write(str(ftd) + "\n")
        sys.exit(2) # Error Downloading
    except InvalidStoryURL as isu:
        sys.stderr.write(str(isu) + "\n")
        sys.exit(3) # Invalid story URL
    except StoryArchivedAlready as se:
        sys.stderr.write(str(se) + "\n")
        sys.exit(10) # Skipped
    except StoryDoesNotExist as sdne:
        sys.stderr.write(str(sdne) + "\n")
        sys.exit(20) # Missing
    except LoginRequiredException as lre:
        sys.stderr.write(str(lre) + "\n")
        sys.exit(30) # Login required
    except Exception as e:
        sys.stderr.write(str(e) + "\n")
        sys.exit(99) # Unknown Error

    sys.exit(0)
|
||||
|
||||
358
fanficdownloader/ffnet.py
Normal file
358
fanficdownloader/ffnet.py
Normal file
|
|
@ -0,0 +1,358 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FFNet(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Parse a fanfiction.net story URL and seed default story metadata.

    The URL path must start with ``s`` (``/s/<storyId>/...``); the second
    path component, when present, becomes ``self.storyId``.  URLs on the
    mobile host ``m.fanfiction.net`` are rewritten to ``www.fanfiction.net``.
    A cookie-aware urllib2 opener is created unless ``self.appEngine``
    (not assigned in this method) is truthy.

    Raises:
        InvalidStoryURL: the path does not start with ``s``.
    """
    self.url = url
    parsedUrl = up.urlparse(url)
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path

    # Placeholder metadata; extractIndividualUrls() overwrites what it finds.
    self.storyName = 'FF.Net story'
    self.authorName = 'FF.Net author'
    self.storyDescription = 'Fanfiction Story'
    self.storyCharacters = []
    self.storySeries = ''
    self.authorId = '0'
    self.authorURL = self.path
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = []
    self.subjects.append ('FanFiction')
    logging.debug('self.subjects=%s' % self.subjects)
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'FF.Net Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'K'
    self.storyUserRating = '0'
    self.outputName = ''
    self.outputStorySep = '-ffnet_'

    logging.debug('self.path=%s' % self.path)

    if self.path.startswith('/'):
        self.path = self.path[1:]

    # str.split always yields at least one element, so spl[0] is safe.
    spl = self.path.split('/')
    logging.debug('spl=%s' % spl)
    if spl[0] != 's':
        raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
    if len(spl) > 1:
        self.storyId = spl[1]
    if len(spl) == 5:
        # NOTE(review): this slice also drops the leading 's' component,
        # which looks unintended -- confirm before relying on self.path.
        self.path = "/".join(spl[1:-1])

    if self.path.endswith('/'):
        self.path = self.path[:-1]

    logging.debug('self.path=%s' % self.path)

    if self.host is not None and self.host == "m.fanfiction.net":
        # Mobile pages are not parsed here; switch to the desktop site.
        self.host = "www.fanfiction.net"
        logging.debug('self.host=%s' % self.host)
        self.url = "http://" + self.host + "/" + self.path
        logging.debug('self.url=%s' % self.url)

    logging.debug('self.storyId=%s' % self.storyId)
    if not self.appEngine:
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
    else:
        self.opener = None

    logging.debug("Created FF.Net: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def _getVarValue(self, varstr):
|
||||
#logging.debug('_getVarValue varstr=%s' % varstr)
|
||||
vals = varstr.split('=')
|
||||
#logging.debug('vals=%s' % vals)
|
||||
retstr="".join(vals[+1:])
|
||||
#logging.debug('retstr=%s' % retstr)
|
||||
if retstr.startswith(' '):
|
||||
retstr = retstr[1:]
|
||||
if retstr.endswith(';'):
|
||||
retstr = retstr[:-1]
|
||||
return retstr
|
||||
|
||||
def _splitCrossover(self, subject):
|
||||
if "Crossover" in subject:
|
||||
self.addSubject ("Crossover")
|
||||
logging.debug('Crossover=%s' % subject)
|
||||
if subject.find(' and ') != -1:
|
||||
words = subject.split(' ')
|
||||
logging.debug('words=%s' % words)
|
||||
subj = ''
|
||||
for s in words:
|
||||
if s in "and Crossover":
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
subj = ''
|
||||
else:
|
||||
if len(subj) > 0:
|
||||
subj = subj + ' '
|
||||
subj = subj + s
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
return True
|
||||
|
||||
def _splitGenre(self, subject):
|
||||
if len(subject) > 0:
|
||||
words = subject.split('/')
|
||||
logging.debug('words=%s' % words)
|
||||
for subj in words:
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(self.url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
d2 = re.sub('&\#[0-9]+;', ' ', data)
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(d2)
|
||||
except:
|
||||
logging.error("Failed to decode: <%s>" % d2)
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
allA = soup.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
|
||||
self.authorName = a.string
|
||||
(u1, u2, self.authorId, u3) = a['href'].split('/')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
urls = []
|
||||
lines = data.split('\n')
|
||||
for l in lines:
|
||||
if l.find("»") != -1 and l.find('<b>') != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyName = unicode(s2.find('b').string)
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif l.find("<a href='/u/") != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.authorName = unicode(s2.a.string)
|
||||
(u1, u2, self.authorId, u3) = s2.a['href'].split('/')
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
elif l.find("Rated: <a href=") != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyRating = unicode(s2.a.string).strip()
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
logging.debug('s2.a=%s' % s2.a)
|
||||
s3 = l.split('-')
|
||||
logging.debug('s3=%s' % s3)
|
||||
if len(s3) > 0:
|
||||
if s3[1].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.language = s3[1].strip()
|
||||
logging.debug('self.language=%s' % self.language)
|
||||
if len(s3) > 1:
|
||||
if s3[2].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.genre = s3[2].strip()
|
||||
if "&" in self.genre:
|
||||
self.genre = ''
|
||||
continue
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
self._splitGenre(self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
if "Complete" in l:
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
elif l.find("<SELECT title='chapter navigation'") != -1:
|
||||
if len(urls) > 0:
|
||||
continue
|
||||
try:
|
||||
u = l.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
u = l
|
||||
except:
|
||||
u = l.encode('ascii', 'xmlcharrefreplace')
|
||||
u = re.sub('&\#[0-9]+;', ' ', u)
|
||||
s2 = bs.BeautifulSoup(u)
|
||||
options = s2.findAll('option')
|
||||
for o in options:
|
||||
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value']
|
||||
title = o.string
|
||||
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
|
||||
urls.append((url,title))
|
||||
elif l.find("var chapters") != -1:
|
||||
self.numChapters = self._getVarValue (l)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif l.find("var words") != -1:
|
||||
self.numWords = self._getVarValue (l)
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif l.find("var categoryid") != -1:
|
||||
categoryid = self._getVarValue (l)
|
||||
logging.debug('categoryid=%s' % categoryid)
|
||||
elif l.find("var cat_title") != -1:
|
||||
self.category = self._getVarValue (l).strip("'")
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self._splitCrossover(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif l.find("var summary") != -1:
|
||||
self.storyDescription = self._getVarValue (l).strip("'")
|
||||
if '&' in self.storyDescription:
|
||||
s = self.storyDescription.split('&')
|
||||
logging.debug('s=%s' % s)
|
||||
self.storyDescription = ''
|
||||
for ss in s:
|
||||
if len(self.storyDescription) > 0:
|
||||
if len(ss) > 4 and 'amp;' in ss[1:4]:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = ss
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
elif l.find("var datep") != -1:
|
||||
dateps = self._getVarValue (l)
|
||||
self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
elif l.find("var dateu") != -1:
|
||||
dateus = self._getVarValue (l)
|
||||
self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
|
||||
if len(urls) <= 0:
|
||||
# no chapters found, try url by itself.
|
||||
urls.append((self.url,self.storyName))
|
||||
|
||||
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
|
||||
|
||||
#logging.debug('urls=%s' % urls)
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
# time.sleep( 2.0 )
|
||||
data = ''
|
||||
try:
|
||||
logging.debug("Fetching URL: %s" % url)
|
||||
data = self.fetchUrl(url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
lines = data.split('\n')
|
||||
|
||||
textbuf = ''
|
||||
emit = False
|
||||
|
||||
olddata = data
|
||||
try:
|
||||
data = data.decode('utf8')
|
||||
except:
|
||||
data = olddata
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.debug(data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
logging.debug(data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
    """Smoke tests for the FFNet adapter.

    Every test constructs an FFNet adapter for a real story URL and calls
    its fetching methods.
    """

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        adapter.extractIndividualUrls()

        self.assertEqual('Beka0502', adapter.getAuthorName())
        self.assertEqual("Draco's Redemption", adapter.getStoryName())

    def testChaptersCountNames(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        chapterUrls = adapter.extractIndividualUrls()

        self.assertEqual(10, len(chapterUrls))

    def testGetText(self):
        storyUrl = 'http://www.fanfiction.net/s/5257563/1'
        adapter = FFNet(storyUrl)
        chapterText = adapter.getText(storyUrl)
        self.assertTrue('He was just about to look at some photos when he heard a crack' in chapterText)

    def testBrokenWands(self):
        # No assertions: this only checks that neither call raises.
        storyUrl = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
        adapter = FFNet(storyUrl)
        adapter.getText(storyUrl)

        adapter.extractIndividualUrls()

    def testFictionPress(self):
        storyUrl = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        adapter = FFNet(storyUrl)
        adapter.extractIndividualUrls()

        self.assertEqual('Behind This Facade', adapter.getStoryName())
        self.assertEqual('IntoxicatingMelody', adapter.getAuthorName())

        chapterText = adapter.getText(storyUrl)
        self.assertTrue('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to' in chapterText)
|
||||
# Run the unit tests when this module is executed directly.
if '__main__' == __name__:
    unittest.main()
|
||||
301
fanficdownloader/fictionalley.py
Normal file
301
fanficdownloader/fictionalley.py
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import logging
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import cookielib as cl
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time as time
|
||||
import datetime
|
||||
from adapter import *
|
||||
|
||||
|
||||
class FictionAlley(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Parse a fictionalley.org URL, build the opener and seed metadata.

    For paths of the form ``/authors/<author>/<story>.html`` the author id,
    author URL and story id are taken from the path.  The urllib2 opener
    carries a preset cookie (see the comment on the cookie below).
    """
    self.url = url
    parsed = up.urlparse(url)
    self.host = parsed.netloc
    self.path = parsed.path

    logging.debug('self.host=%s' % self.host)
    logging.debug('self.path=%s' % self.path)

    cookieHandler = u2.HTTPCookieProcessor()

    # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
    ageCookie = cl.Cookie(
        version=0, name='fauser', value='wizard',
        port=None, port_specified=False,
        domain='www.fictionalley.org', domain_specified=False, domain_initial_dot=False,
        path='/authors', path_specified=True,
        secure=False,
        expires=time.time()+10000,
        discard=False,
        comment=None,
        comment_url=None,
        rest={'HttpOnly': None},
        rfc2109=False)
    cookieHandler.cookiejar.set_cookie(ageCookie)
    self.opener = u2.build_opener(cookieHandler)

    pathParts = self.path.split('/')

    self.storyDescription = 'Fanfiction Story'
    self.authorId = ''
    self.authorURL = ''
    self.storyId = ''
    # Splitting '/authors/<author>/<story>.html' gives
    # ['', 'authors', '<author>', '<story>.html'].
    if len(pathParts) > 2 and pathParts[1] == 'authors':
        self.authorId = pathParts[2]
        self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
        if len(pathParts) > 3:
            self.storyId = pathParts[3].replace ('.html','')

    # Placeholder metadata; extractIndividualUrls() overwrites what it finds.
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction', 'Harry Potter']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = ''
    self.category = 'Harry Potter'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'K'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.storyName = ''
    self.outputName = ''
    self.outputStorySep = '-fa_'
|
||||
|
||||
def getPasswordLine(self):
    """Return the fixed placeholder string used as the password line."""
    passwordLine = 'opaopapassword'
    return passwordLine
|
||||
|
||||
def getLoginScript(self):
    """Return the fixed placeholder string used as the login script."""
    loginScript = 'opaopaloginscript'
    return loginScript
|
||||
|
||||
def getLoginPasswordOthers(self):
    """Return the login form description as a ``(login, other)`` tuple.

    ``login`` maps ``'login'`` and ``'password'`` to the form field names
    ``'name'`` and ``'pass'``; ``other`` holds the extra form values.
    """
    login = {'login': 'name', 'password': 'pass'}
    other = {'submit': 'Log In', 'remember': 'yes'}
    return (login, other)
|
||||
|
||||
def _processChapterHeaders(self, div):
    """Copy header fields found under the ``<br>`` tags of *div* onto self.

    For every ``<br>`` with more than two children, the second child
    (markup removed) is the label and the third is the value.  Recognised
    labels are ``Rating:``, ``Genre:``, ``Main Character(s):`` and
    ``Summary:``.

    Args:
        div: parsed element wrapping the chapter header block.
    """
    for br in div.findAll ('br'):
        label = ''
        value = ''
        if len(br.contents) > 2:
            label = br.contents[1]
            if label is not None:
                # Keep only the text between tags.
                label = ''.join(re.split ("<[^>]+>", unicode(label)))
                value = br.contents[2].strip(' ')
        if label is None:
            continue

        if label == 'Rating:':
            self.storyRating = value
            logging.debug('self.storyRating=%s' % self.storyRating)
        elif label == 'Genre:':
            self.genre = value
            logging.debug('self.genre=%s' % self.genre)
            for genreName in value.split(', '):
                self.addSubject(genreName)
            logging.debug('self.subjects=%s' % self.subjects)
        elif label == 'Main Character(s):':
            for characterName in value.split(', '):
                self.addCharacter(characterName)
            logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        elif label == 'Summary:':
            self.storyDescription = value
            logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(self.url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
# There is some usefull information in the headers of the first chapter page..
|
||||
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
|
||||
if breadcrumbs is not None:
|
||||
# Be aware that this means that the user has entered the {STORY}01.html
|
||||
# We will not have valid Publised and Updated dates. User should enter
|
||||
# the {STORY}.html instead. We should force that instead of this.
|
||||
#logging.debug('breadcrumbs=%s' % breadcrumbs )
|
||||
bcas = breadcrumbs.findAll('a')
|
||||
#logging.debug('bcas=%s' % bcas )
|
||||
if bcas is not None and len(bcas) > 1:
|
||||
bca = bcas[1]
|
||||
#logging.debug('bca=%s' % bca )
|
||||
if 'href' in bca._getAttrMap():
|
||||
#logging.debug('bca.href=%s' % bca['href'] )
|
||||
url = unicode(bca['href'])
|
||||
if url is not None and len(url) > 0:
|
||||
self.url = url
|
||||
logging.debug('self.url=%s' % self.url )
|
||||
ss = self.url.split('/')
|
||||
self.storyId = ss[-1].replace('.html','')
|
||||
self.storyName = bca.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
|
||||
data = self.opener.open(self.url).read()
|
||||
|
||||
# There is some usefull information in the headers of the first chapter page..
|
||||
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
# If it is decided that we really do care about number of words.. It's only available on the author's page..
|
||||
#d0 = self.opener.open(self.authorURL).read()
|
||||
#soupA = bs.BeautifulStoneSoup(d0)
|
||||
#dls = soupA.findAll('dl')
|
||||
#logging.debug('dls=%s' % dls)
|
||||
|
||||
# Get title from <title>, remove before '-'.
|
||||
if len(self.storyName) == 0:
|
||||
title = soup.find('title').string
|
||||
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
|
||||
|
||||
links = soup.findAll('li')
|
||||
|
||||
self.numChapters = 0;
|
||||
result = []
|
||||
if len(links) == 0:
|
||||
# Be aware that this means that the user has entered the {STORY}01.html
|
||||
# We will not have valid Publised and Updated dates. User should enter
|
||||
# the {STORY}.html instead. We should force that instead of this.
|
||||
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
|
||||
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
|
||||
result.append((self.url,self.storyName))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
|
||||
self.numChapters = self.numChapters + 1;
|
||||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
else:
|
||||
author = soup.find('h1', {'class' : 'title'})
|
||||
self.authorName = author.a.string
|
||||
|
||||
summary = soup.find('div', {'class' : 'summary'})
|
||||
ss = summary.contents
|
||||
if len(ss) > 1:
|
||||
ss1 = ss[0].split(': ')
|
||||
if len(ss1) > 1 and ss1[0] == 'Rating':
|
||||
self.storyRating = ss1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
self.storyDescription = unicode(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
for li in links:
|
||||
a = li.find('a', {'class' : 'chapterlink'})
|
||||
s = li.contents
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
title = a.string
|
||||
result.append((url,title))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
|
||||
if self.numChapters == 0:
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
d1 = self.opener.open(url).read()
|
||||
|
||||
# find <!-- headerstart --> & <!-- headerend --> and
|
||||
# replaced with matching div pair for easier parsing.
|
||||
# Yes, it's an evil kludge, but what can ya do? Using
|
||||
# something other than div prevents soup from pairing
|
||||
# our div with poor html inside the story text.
|
||||
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
sop = bs.BeautifulStoneSoup(d1)
|
||||
|
||||
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
|
||||
self.numChapters = self.numChapters + 1
|
||||
if len(s) > 1:
|
||||
datestr=''
|
||||
ss2 = s[1].replace('\n','').replace('(','').split(' ')
|
||||
if len(ss2) > 2 and ss2[0] == 'Posted:':
|
||||
datestr = ss2[1] + ' ' + ss2[2]
|
||||
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
|
||||
if self.numChapters == 1:
|
||||
self.storyPublished = tmpdate
|
||||
self.storyUpdated = tmpdate
|
||||
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
|
||||
else:
|
||||
logging.debug('li chapterlink not found! li=%s' % li)
|
||||
|
||||
|
||||
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
|
||||
# find <!-- headerend --> & <!-- footerstart --> and
|
||||
# replaced with matching div pair for easier parsing.
|
||||
# Yes, it's an evil kludge, but what can ya do? Using
|
||||
# something other than div prevents soup from pairing
|
||||
# our div with poor html inside the story text.
|
||||
data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
html = soup.findAll('html')
|
||||
if len(html) > 1:
|
||||
return html[1].__str__('utf8')
|
||||
else:
|
||||
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: list the chapters of one story and print the
    # text of the first chapter.  The adapter does its own fetching, so
    # no page data is passed in.
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        print(fw.getText(urls[0][0]))
|
||||
267
fanficdownloader/ficwad.py
Normal file
267
fanficdownloader/ficwad.py
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import logging
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
class FicWad(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Remember *url* and its host, and seed default story metadata."""
    self.url = url
    self.host = up.urlparse(url).netloc

    # Placeholder metadata; extractIndividualUrls() overwrites what it finds.
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-fw_'
|
||||
|
||||
def getPasswordLine(self):
    """Return the fixed placeholder string used as the password line."""
    passwordLine = 'opaopapassword'
    return passwordLine
|
||||
|
||||
def getLoginScript(self):
    """Return the fixed placeholder string used as the login script."""
    loginScript = 'opaopaloginscript'
    return loginScript
|
||||
|
||||
def getLoginPasswordOthers(self):
    """Return the login form description as a ``(login, other)`` tuple.

    ``login`` maps ``'login'`` and ``'password'`` to the form field names
    ``'name'`` and ``'pass'``; ``other`` holds the extra form values.
    """
    login = {'login': 'name', 'password': 'pass'}
    other = {'submit': 'Log In', 'remember': 'yes'}
    return (login, other)
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
oldurl = ''
|
||||
cururl = self.url
|
||||
data = ''
|
||||
try:
|
||||
data = u2.urlopen(self.url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
story = soup.find('div', {'id' : 'story'})
|
||||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
# last of crumbtrail
|
||||
storyinfo = allAhrefs[-1]
|
||||
(u0, u1, storyid) = storyinfo['href'].split('/')
|
||||
if u1 == "story":
|
||||
# This page does not have the correct information on it.. Need to get the Story Title Page
|
||||
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
|
||||
oldurl = self.url
|
||||
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
|
||||
data = u2.urlopen(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
story = soup.find('div', {'id' : 'story'})
|
||||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
|
||||
# save chapter name from header in case of one-shot.
|
||||
storyinfo = story.find('h4').find('a')
|
||||
(u0, u1, self.storyId) = storyinfo['href'].split('/')
|
||||
self.storyName = storyinfo.string.strip()
|
||||
|
||||
logging.debug('self.storyName=%s, self.storyId=%s' % (self.storyName, self.storyId))
|
||||
|
||||
author = soup.find('span', {'class' : 'author'})
|
||||
self.authorName = unicode(author.a.string)
|
||||
(u0, u1,self.authorId) = author.a['href'].split('/')
|
||||
self.authorURL = 'http://' + self.host + author.a['href']
|
||||
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
description = soup.find('blockquote', {'class' : 'summary'})
|
||||
if description is not None:
|
||||
self.storyDescription = unicode(description.p.string)
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
meta = soup.find('p', {'class' : 'meta'})
|
||||
if meta is not None:
|
||||
s = unicode(meta).replace('\n',' ').replace('\t','').split(' - ')
|
||||
#logging.debug('meta.s=%s' % s)
|
||||
for ss in s:
|
||||
s1 = ss.replace(' ','').split(':')
|
||||
#logging.debug('meta.s.s1=%s' % s1)
|
||||
if len(s1) > 1:
|
||||
s2 = re.split ('<[^>]+>', s1[0])
|
||||
#logging.debug('meta.s.s1.s2=%s' % s2)
|
||||
if len(s2) > 1:
|
||||
s1[0] = s2[1]
|
||||
skey = s1[0].strip()
|
||||
#logging.debug('Checking = %s' % skey)
|
||||
if skey == 'Category':
|
||||
soup1 = bs.BeautifulStoneSoup(s1[1])
|
||||
allAs = soup1.findAll('a')
|
||||
for a in allAs:
|
||||
if self.category == 'Category':
|
||||
self.category = unicode(a.string)
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self.addSubject(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Rating':
|
||||
self.storyRating = s1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif skey == 'Genres':
|
||||
self.genre = s1[1]
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Characters':
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif skey == 'Chapters':
|
||||
self.numChapters = s1[1]
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif skey == 'Warnings':
|
||||
logging.debug('Warnings=%s' % s1[1])
|
||||
elif skey == 'Published':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif skey == 'Updated':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
s3 = re.split ('<[^>]+>', s1[0])
|
||||
#logging.debug('meta.s.s1.s3=%s' % s3)
|
||||
if len(s3) > 1:
|
||||
s1[0] = s3[0]
|
||||
s4 = s1[0].split('w')
|
||||
#logging.debug('meta.s.s1.s4=%s' % s4)
|
||||
if len(s4) > 1 and s4[1] == 'ords':
|
||||
self.numWords = s4[0]
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
|
||||
|
||||
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
result = []
|
||||
ii = 1
|
||||
|
||||
if oldurl is not None and len(oldurl) > 0:
|
||||
logging.debug('Switching back to %s' % oldurl)
|
||||
cururl = oldurl
|
||||
data = u2.urlopen(oldurl).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
storylist = soup.find('ul', {'id' : 'storylist'})
|
||||
if storylist is not None:
|
||||
allBlocked = storylist.findAll('li', {'class' : 'blocked'})
|
||||
if allBlocked is not None:
|
||||
#logging.debug('allBlocked=%s' % allBlocked)
|
||||
raise LoginRequiredException(cururl)
|
||||
|
||||
allH4s = storylist.findAll('h4')
|
||||
#logging.debug('allH4s=%s' % allH4s)
|
||||
|
||||
if allH4s is not None:
|
||||
for h4 in allH4s:
|
||||
chapterinfo = h4.find('a')
|
||||
#logging.debug('Chapter1=%s' % chapterinfo)
|
||||
url = 'http://' + self.host + chapterinfo['href']
|
||||
title = chapterinfo.string.strip()
|
||||
#logging.debug('Chapter=%s, %s' % (url, title))
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
if ii == 1:
|
||||
select = soup.find('select', { 'name' : 'goto' } )
|
||||
|
||||
if select is None:
|
||||
self.numChapters = '1'
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
result.append((self.url,self.storyName))
|
||||
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = 'http://' + self.host + o['value']
|
||||
title = o.string
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
    """Download a chapter page and return its story text as UTF-8 HTML.

    A *url* that does not contain 'http://' is treated as a path on
    self.host.

    Raises:
        FailedToDownload: if the page cannot be fetched or parsed, or if it
            has no <div id="storytext"> element.
    """
    if url.find('http://') == -1:
        url = 'http://' + self.host + '/' + url

    try:
        data = u2.urlopen(url).read()
    except Exception:
        # logging.exception records the traceback itself, so the exception
        # does not have to be converted to text by hand.
        logging.exception("Caught an exception reading URL " + url + ".")
        data = None
    if not data:
        raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

    try:
        soup = bs.BeautifulStoneSoup(data)
    except Exception:
        logging.info("Failed to decode: <%s>" % data)
        raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

    div = soup.find('div', {'id' : 'storytext'})
    if div is None:
        raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return div.__str__('utf8')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story, then print the
    # text of its first chapter.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() expects a chapter URL; each entry of urls is (url, title).
    if urls:
        print(fw.getText(urls[0][0]))
|
||||
344
fanficdownloader/fpcom.py
Normal file
344
fanficdownloader/fpcom.py
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FPCom(FanfictionSiteAdapter):
    """Site adapter that scrapes story metadata and chapter text.

    Story URLs must have 's' as their first path segment and the story id
    as the second one (http://<host>/s/<storyId>/...).
    """

    def __init__(self, url):
        """Parse *url* and set the default metadata values.

        Raises:
            InvalidStoryURL: if the first path segment of *url* is not 's'.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        self.storyName = ''
        self.authorName = ''
        self.storyDescription = ''
        self.storyCharacters = []
        self.storySeries = ''
        self.authorId = '0'
        self.authorURL = self.path
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = ''
        self.category = ''
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.outputName = ''
        self.outputStorySep = '-fpcom_'

        if self.path.startswith('/'):
            self.path = self.path[1:]

        spl = self.path.split('/')
        if spl[0] != 's':
            raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
        if len(spl) > 1:
            self.storyId = spl[1]
        if len(spl) == 5:
            # Keep only the middle segments (drops 's' and the last segment).
            self.path = "/".join(spl[1:-1])

        if self.path.endswith('/'):
            self.path = self.path[:-1]

        logging.debug('self.path=%s' % self.path)

        if not self.appEngine:
            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        else:
            self.opener = None

        logging.debug("Created FP.Com: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the path used as the login script."""
        return self.path

    def _getVarValue(self, varstr):
        """Return the value part of a 'name = value;' string.

        Everything after the first '=' is returned with one leading space and
        one trailing ';' removed; '' is returned when there is no '='.
        """
        # partition keeps any further '=' characters inside the value.
        retstr = varstr.partition('=')[2]
        if retstr.startswith(' '):
            retstr = retstr[1:]
        if retstr.endswith(';'):
            retstr = retstr[:-1]
        return retstr

    def _splitCrossover(self, subject):
        """Add *subject* to the subjects, splitting crossover names apart.

        A subject containing "Crossover" adds the subject "Crossover"; if it
        also contains " and ", every run of words between the words "and" and
        "Crossover" is added as its own subject.  Always returns True.
        """
        if "Crossover" not in subject:
            self.addSubject(subject)
            return True

        self.addSubject ("Crossover")
        logging.debug('Crossover=%s' % subject)
        if subject.find(' and ') == -1:
            self.addSubject(subject)
            return True

        words = subject.split(' ')
        logging.debug('words=%s' % words)
        subj = ''
        for s in words:
            if not s:
                # Empty word produced by consecutive spaces.
                continue
            # Compare whole words; a substring test against "and Crossover"
            # would also swallow words such as "a" or "Cross".
            if s in ('and', 'Crossover'):
                if len(subj) > 0:
                    self.addSubject(subj)
                subj = ''
            else:
                if len(subj) > 0:
                    subj = subj + ' '
                subj = subj + s
        if len(subj) > 0:
            self.addSubject(subj)
        return True

    def _splitGenre(self, subject):
        """Add every non-empty '/'-separated part of *subject* as a subject.

        Always returns True.
        """
        if len(subject) > 0:
            words = subject.split('/')
            logging.debug('words=%s' % words)
            for subj in words:
                if len(subj) > 0:
                    self.addSubject(subj)
        return True

    def _parseDate(self, text, fmt):
        """Convert *text* (in strptime format *fmt*) to a datetime."""
        return datetime.datetime.fromtimestamp(time.mktime(time.strptime(text.strip(' '), fmt)))

    def _parseAuthor(self, soup):
        """Set authorName/authorId from links whose href contains '/u/'."""
        for a in soup.findAll('a'):
            if 'href' not in a._getAttrMap() or a['href'].find('/u/') == -1:
                continue
            parts = a['href'].split('/')
            # The id is the segment right after 'u'.  Because the href
            # contains '/u/', that segment always exists, whatever the total
            # number of segments is.
            self.authorName = a.string
            self.authorId = parts[parts.index('u') + 1]
            logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))

    def _parseDescriptionAndTitle(self, soup):
        """Set storyDescription, storyName, category and title subjects."""
        meta = None
        for meta in soup.findAll ('meta', {'name' : 'description'}):
            if 'content' in meta._getAttrMap():
                self.storyDescription = unicode(meta['content'])
                logging.debug('self.storyDescription=%s' % self.storyDescription)

        # The title is looked up below the last description <meta>; when that
        # is not possible the whole document is searched instead.
        title = None
        if meta is not None:
            title = meta.find('title')
        if title is None:
            title = soup.find('title')
        if title is None or title.string is None:
            return

        logging.debug('title=%s' % title.string)
        tt = title.string.split(',')
        self.storyName = tt[0]
        logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
        if len(tt) > 1:
            self.category = tt[1].split(' - ')[0].strip()
            logging.debug('self.category=%s' % self.category)
            for cc1 in self.category.split(' '):
                if cc1 != 'a':
                    if cc1 == 'fanfic':
                        self.addSubject('FanFiction')
                    else:
                        self.addSubject(cc1)
            logging.debug('self.subjects=%s' % self.subjects)

    def _parseChapters(self, fidochap):
        """Return the (url, title) chapter list and set numChapters."""
        urls = []
        urlstory = ''

        sl = fidochap.find('select', {'title':'chapter navigation'})
        if sl is not None:
            logging.debug('sl=%s' % sl )
            if 'onchange' in sl._getAttrMap():
                # The fourth quote-delimited piece of the onchange script is
                # appended to every chapter URL.
                ocs = sl['onchange'].split('\'')
                logging.debug('ocs=%s' % ocs)
                if len(ocs) > 3:
                    urlstory = ocs[3]
                    logging.debug('urlstory=%s' % urlstory)

            for o in sl.findAll('option'):
                if 'value' in o._getAttrMap():
                    url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value'] + urlstory
                    logging.debug('URL=%s, Title=%s' % (url, o.string))
                    urls.append((url, o.string))

        if not urls:
            # No chapter list: the story is treated as a single chapter.
            url = 'http://' + self.host + '/s/' + self.storyId + '/1' + urlstory
            logging.debug('URL=%s, Title=%s' % (url, self.storyName))
            urls.append((url, self.storyName))

        self.numChapters = unicode(len(urls))
        logging.debug('self.numChapters=%s' % self.numChapters)
        logging.debug('urls=%s' % urls)
        return urls

    def _parseDetails(self, fidochap):
        """Set genre, rating, category and dates from the form's cells."""
        self.genre = ''
        for td in fidochap.findAll('td'):
            tdb = td.find('b')
            if tdb is not None and tdb.string == self.storyName:
                for tda in td.findAll('a'):
                    ss = tda.string
                    if ss is not None:
                        if len(self.genre) > 0:
                            self.genre = self.genre + ', '
                        self.genre = self.genre + ss
                        self.addSubject(ss)
                logging.debug('self.genre=%s' % self.genre)
                logging.debug('self.subjects=%s' % self.subjects)

            tda = td.find ('a')
            if tda is None or tda.string is None or tda.string.find('Rated:') == -1:
                continue

            # Split the cell into its text fragments; non-breaking spaces are
            # normalised so that the ' - ' separators below can match.
            text = unicode(td).replace('\n','').replace('&nbsp;',' ').replace(u'\xa0',' ')
            tdas = re.split ("<[^>]+>", text)
            ll = len(tdas)
            if ll > 2:
                ss = tdas[2].split(': ')
                if len(ss) > 1:
                    self.storyRating = ss[1]
                    logging.debug('self.storyRating=%s' % self.storyRating)
            if ll > 3:
                ss = tdas[3].split(' - ')
                lls = len(ss)
                if lls > 1:
                    logging.debug('language=%s' % ss[1])
                if lls > 2:
                    self.category = ss[2]
                    logging.debug('self.category=%s' % self.category)
                    for sg in self.category.split('/'):
                        self.addSubject(sg)
                    logging.debug('self.subjects=%s' % self.subjects)
                if lls > 3 and ss[3].strip() == 'Reviews:' and ll > 4:
                    logging.debug('reviews=%s' % tdas[4])
            if ll > 5:
                ss = tdas[5].split(' - ')
                if len(ss) > 1:
                    sds = ss[1].split(': ')
                    if len(sds) > 1 and sds[0] == 'Published':
                        self.storyPublished = self._parseDate(sds[1], "%m-%d-%y")
                        logging.debug('self.storyPublished=%s' % self.storyPublished)
                if len(ss) > 2:
                    sds = ss[2].split(': ')
                    if len(sds) > 1 and sds[0] == 'Updated':
                        self.storyUpdated = self._parseDate(sds[1], "%m-%d-%y")
                        logging.debug('self.storyUpdated=%s' % self.storyUpdated)

    def extractIndividualUrls(self):
        """Fetch the story page, fill in the metadata and list the chapters.

        Returns:
            A list of (url, title) tuples, one per chapter.

        Raises:
            StoryDoesNotExist: if the story page could not be read.
            FailedToDownload: if the page cannot be parsed or has no
                'fidochap' form.
        """
        try:
            data = self.fetchUrl(self.url)
        except Exception:
            # logging.exception records the traceback itself, so the
            # exception does not have to be converted to text by hand.
            logging.exception("Caught an exception reading URL " + self.url + ".")
            data = None
        if not data:
            raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")

        # Numeric character references are blanked out before parsing.
        d2 = re.sub('&\#[0-9]+;', ' ', data)
        try:
            soup = bs.BeautifulStoneSoup(d2)
        except Exception:
            logging.error("Failed to decode: <%s>" % d2)
            raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)

        self._parseAuthor(soup)
        self._parseDescriptionAndTitle(soup)

        fidochap = soup.find('form', {'name':'fidochap'})
        if fidochap is None:
            raise FailedToDownload("Error downloading Story: %s! Missing required element!" % self.url)

        urls = self._parseChapters(fidochap)
        self._parseDetails(fidochap)

        self.authorURL = 'http://' + self.host + '/u/' + self.authorId

        return urls

    def getText(self, url):
        """Download a chapter page and return its story text as UTF-8 HTML.

        Sleeps two seconds before every request.

        Raises:
            FailedToDownload: if the page cannot be fetched or parsed, or if
                it has no <div id="storytext"> element.
        """
        time.sleep( 2.0 )
        try:
            data = self.fetchUrl(url)
        except Exception:
            logging.exception("Caught an exception reading URL " + url + ".")
            data = None
        if not data:
            raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

        # Parse decoded text when the page is valid UTF-8, the raw data
        # otherwise.
        try:
            data = data.decode('utf8')
        except (UnicodeError, AttributeError):
            pass

        try:
            soup = bs.BeautifulStoneSoup(data)
        except Exception:
            logging.info("Failed to decode: <%s>" % data)
            raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return div.__str__('utf8')
|
||||
|
||||
|
||||
class FPC_UnitTests(unittest.TestCase):
    """Live-site checks for the FPCom adapter (needs network access)."""

    def setUp(self):
        # Show the adapters' debug output while the tests run.
        logging.basicConfig(level=logging.DEBUG)

    def testFictionPress(self):
        """Story name, author name and chapter text of a known story."""
        story_url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        adapter = FPCom(story_url)
        adapter.extractIndividualUrls()

        self.assertEqual('Behind This Facade', adapter.getStoryName())
        self.assertEqual('IntoxicatingMelody', adapter.getAuthorName())

        chapter = adapter.getText(story_url)
        expected = 'Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to'
        self.assertTrue(chapter.find(expected) != -1)


if __name__ == '__main__':
    unittest.main()
|
||||
280
fanficdownloader/hpfiction.py
Normal file
280
fanficdownloader/hpfiction.py
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class HPFiction(FanfictionSiteAdapter):
    """Site adapter for stories served through viewstory.php pages.

    The URL query selects either a chapter ('chapterid') or a story
    ('psid' / 'sid').
    """

    def __init__(self, url):
        """Parse *url* and set the default metadata values."""
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        logging.debug('self.url=%s' % self.url)
        logging.debug('self.host=%s' % self.host)
        logging.debug('self.path=%s' % self.path)

        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

        # True when the URL points at a single chapter rather than the story.
        self.chapurl = False
        self.storyId = '0'

        sss = self.url.split('?')
        logging.debug('sss=%s' % sss)
        if len(sss) > 1:
            sc = sss[1].split('=')
            logging.debug('sc=%s' % sc)
            if len(sc) > 1:
                if sc[0] == 'chapterid':
                    self.chapurl = True
                elif sc[0] == 'psid' or sc[0] == 'sid':
                    # Drop any further '&name=value' parameters.
                    self.storyId = sc[1].split('&')[0]

        # Defaults so the names can be read even if the page has no
        # matching links.
        self.storyName = ''
        self.authorName = ''
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append ('fanfiction')
        self.subjects.append ('Harry Potter')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.outputName = ''
        self.outputStorySep = '-hp_'

        logging.debug("Created HPFiction: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the path used as the login script."""
        return self.path

    def _psidFromHref(self, href):
        """Return the 'psid' value found in *href*, or None if there is none."""
        sp = href.split('?')
        if len(sp) > 1:
            for sp1 in sp:
                if sp1.find('psid') != -1:
                    ps = sp1.split('=')
                    if len(ps) > 1:
                        return ps[1].replace('\'','')
        return None

    def _parseDate(self, text, fmt):
        """Convert *text* (in strptime format *fmt*) to a datetime."""
        return datetime.datetime.fromtimestamp(time.mktime(time.strptime(text.strip(' '), fmt)))

    def _parseDetails(self, hdrsoup):
        """Set rating, chapter count, characters, genre, status, dates and
        description from the table cells inside <center> elements."""
        for center in hdrsoup.findAll('center'):
            for td in center.findAll ('td'):
                # Split the cell into its text fragments; a label fragment
                # such as 'Rating:' is followed by its value fragment.
                text = unicode(td).replace('\n','').replace('&nbsp;',' ').replace(u'\xa0',' ')
                s = re.split ("<[^>]+>", text)
                ii = 0
                ll = len(s)
                sss = ''
                # The last fragment is never used as a label, so s[ii+1]
                # always exists.
                while ii < ll - 1:
                    label = s[ii]
                    value = s[ii+1]
                    step = 2
                    if not label:
                        step = 1
                    elif label == 'Rating:':
                        self.storyRating = value
                        logging.debug('self.storyRating=%s' % self.storyRating)
                    elif label == 'Chapters:':
                        self.numChapters = value
                        logging.debug('self.numChapters=%s' % self.numChapters)
                    elif label == 'Characters:':
                        for ss2 in value.split(', '):
                            self.addCharacter(ss2)
                        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                    elif label == 'Genre(s):':
                        self.genre = value
                        logging.debug('self.genre=%s' % self.genre)
                        for ss2 in value.split(', '):
                            self.addSubject(ss2)
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif label == 'Status:':
                        if value.strip(' ') == "Work In Progress":
                            self.storyStatus = 'In-Progress'
                        else:
                            self.storyStatus = 'Completed'
                    elif label == 'First Published:':
                        self.storyPublished = self._parseDate(value, "%Y.%m.%d")
                        logging.debug('self.storyPublished=%s' % self.storyPublished)
                    elif label == 'Last Updated:':
                        self.storyUpdated = self._parseDate(value, "%Y.%m.%d")
                        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                    elif label in ('Last Published Chapter:', 'Pairings:', 'Warnings:'):
                        # Known labels whose values are ignored.
                        pass
                    else:
                        # Unlabelled text is collected as the description.
                        sss = sss + ' ' + label
                        step = 1
                    ii = ii + step
                self.storyDescription = sss
                logging.debug('self.storyDescription=%s' % self.storyDescription)

    def extractIndividualUrls(self):
        """Fetch the story page, fill in the metadata and list the chapters.

        Returns:
            A list of (url, title) tuples, one per chapter.

        Raises:
            StoryDoesNotExist: if the page at self.url could not be read.
            FailedToDownload: if the page cannot be parsed.
        """
        try:
            data = self.opener.open(self.url).read()
        except Exception:
            # logging.exception records the traceback itself, so the
            # exception does not have to be converted to text by hand.
            logging.exception("Caught an exception reading URL " + self.url + ".")
            data = None
        if not data:
            raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")

        try:
            soup = bs.BeautifulSoup(data)
        except Exception:
            raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)

        links = soup.findAll('a')
        def_chapurl = ''
        def_chaptitle = ''

        hdrsoup = soup
        if self.chapurl:
            # A chapter page only links to its story; find the story id and
            # read the details from the story's title page instead.
            foundid = False
            for a in links:
                psid = self._psidFromHref(a.get('href') or '')
                if psid is not None:
                    self.storyId = psid
                    foundid = True
                    self.storyName = a.string
                    logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
            if foundid:
                self.url = "http://" + self.host + "/viewstory.php?psid=" + self.storyId
                logging.debug('Title Page URL=%s' % self.url)
                data1 = self.opener.open(self.url).read()
                hdrsoup = bs.BeautifulSoup(data1)

        for a in links:
            # Anchors without an href attribute carry nothing of interest.
            href = a.get('href')
            if href is None:
                continue
            if not self.chapurl and href.find('psid') != -1:
                psid = self._psidFromHref(href)
                if psid is not None:
                    self.storyId = psid
                    self.storyName = a.string
                    logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
            elif href.find('viewuser.php') != -1:
                self.authorName = a.string
                self.authorURL = 'http://' + self.host + '/' + href
                # The id is the value of the first query parameter.
                idparts = href.split('=', 1)
                if len(idparts) > 1:
                    self.authorId = idparts[1].split('&')[0]
                logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
            elif href.find('chapterid=') != -1 and len(def_chapurl) == 0:
                # Remember the first chapter link as a fallback.
                def_chapurl = 'http://' + self.host + '/viewstory.php' + unicode(href)
                def_chaptitle = a.string
                logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))

        self._parseDetails(hdrsoup)

        urls = []

        select = soup.find('select', {'name' : 'chapterid'})
        if select is None:
            # no chapters found, try url by itself.
            if len(def_chapurl) > 0:
                urls.append((def_chapurl, def_chaptitle))
            else:
                urls.append((self.url,self.storyName))
        else:
            for o in select.findAll('option'):
                if 'value' in o._getAttrMap():
                    url = 'http://' + self.host + self.path + o['value']
                    title = o.string
                    if title != "Story Index":
                        urls.append((url,title))

        return urls

    def getText(self, url):
        """Download a chapter page and return its story text as UTF-8 HTML.

        Raises:
            FailedToDownload: if the page cannot be fetched or parsed, or if
                it has no <div id="fluidtext"> element.
        """
        logging.debug('Downloading from URL: %s' % url)
        try:
            data = self.opener.open(url).read()
        except Exception:
            logging.exception("Caught an exception reading URL " + url + ".")
            data = None
        if not data:
            raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

        try:
            soup = bs.BeautifulSoup(data)
        except Exception:
            logging.info("Failed to decode: <%s>" % data)
            raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

        divtext = soup.find('div', {'id' : 'fluidtext'})
        if divtext is None:
            raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return divtext.__str__('utf8')
|
||||
|
||||
|
||||
class FF_UnitTests(unittest.TestCase):
    """Live-site checks for the HPFiction adapter (needs network access)."""

    CHAPTER_URL = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'

    def setUp(self):
        # Show the adapters' debug output while the tests run.
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        """Chapter count, author and title derived from a chapter URL."""
        adapter = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
        chapters = adapter.extractIndividualUrls()

        self.assertEqual(49, len(chapters))
        self.assertEqual('Elisha', adapter.getAuthorName())
        self.assertEqual('A Secret Thought', adapter.getStoryName())

    def testGetText(self):
        """The chapter text contains a known sentence."""
        chapter_url = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'
        adapter = HPFiction(chapter_url)
        chapter = adapter.getText(chapter_url)
        expected = 'She pulled out of his arms and felt the subtle regret'
        self.assertTrue(chapter.find(expected) != -1)


if __name__ == '__main__':
    unittest.main()
|
||||
|
||||
|
||||
452
fanficdownloader/html2text.py
Normal file
452
fanficdownloader/html2text.py
Normal file
|
|
@ -0,0 +1,452 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
||||
__version__ = "2.37"
|
||||
__author__ = "Aaron Swartz (me@aaronsw.com)"
|
||||
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
|
||||
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
||||
|
||||
# TODO:
|
||||
# Support decoded entities with unifiable.
|
||||
|
||||
if not hasattr(__builtins__, 'True'): True, False = 1, 0
|
||||
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
|
||||
import sgmllib
|
||||
import urlparse
|
||||
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||
|
||||
try: from textwrap import wrap
|
||||
except: pass
|
||||
|
||||
# Use Unicode characters instead of their ASCII pseudo-replacements
|
||||
UNICODE_SNOB = 0
|
||||
|
||||
# Put the links after each paragraph instead of at the end.
|
||||
LINKS_EACH_PARAGRAPH = 0
|
||||
|
||||
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
|
||||
BODY_WIDTH = 78
|
||||
|
||||
# Don't show internal links (href="#local-anchor") -- corresponding link targets
|
||||
# won't be visible in the plain text file anyway.
|
||||
SKIP_INTERNAL_LINKS = False
|
||||
|
||||
### Entity Nonsense ###
|
||||
|
||||
def name2cp(k):
    """Return the code point of the HTML entity named *k*.

    Raises KeyError when the entity name is not known.
    """
    # 'apos' is answered directly, ahead of any table lookup.
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback for interpreters without name2codepoint (it requires
        # Python 2.3): decode the entitydefs entry instead.
        definition = htmlentitydefs.entitydefs[k]
        if definition.startswith("&#") and definition.endswith(";"):
            return int(definition[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])
    return htmlentitydefs.name2codepoint[k]
|
||||
|
||||
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
||||
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
||||
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
||||
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
|
||||
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
|
||||
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
|
||||
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
|
||||
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
|
||||
|
||||
# The replacements of `unifiable`, keyed by code point instead of entity name.
unifiable_n = dict([(name2cp(k), unifiable[k]) for k in unifiable])
|
||||
|
||||
def charref(name):
    """Return the text for a numeric character reference.

    *name* is the reference without '&#' and ';': decimal digits, or 'x'/'X'
    followed by hexadecimal digits.  Unless UNICODE_SNOB is set, code points
    listed in unifiable_n yield their replacement text.
    """
    is_hex = name[0] in ['x','X']
    if is_hex:
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)

    if UNICODE_SNOB or codepoint not in unifiable_n:
        return unichr(codepoint)
    return unifiable_n[codepoint]
|
||||
|
||||
def entityref(c):
    """Return the text for the named entity *c* (given without '&' and ';').

    Unless UNICODE_SNOB is set, entities listed in `unifiable` yield their
    replacement text.  Other known entities yield their character, and an
    unknown entity is returned as written, '&name;'.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]
    try:
        codepoint = name2cp(c)
    except KeyError:
        # Keep the terminating ';' so the unknown entity passes through
        # unchanged instead of being truncated to '&name'.
        return "&" + c + ";"
    return unichr(codepoint)
|
||||
|
||||
def replaceEntities(s):
    """Return the replacement text for one entity match.

    *s* is a match object whose first group is the entity body; a body
    starting with '#' is a numeric reference, any other a named entity.
    """
    body = s.group(1)
    if body[0] != "#":
        return entityref(body)
    return charref(body[1:])
|
||||
|
||||
# Matches one '&...;' entity; group 1 is the text between '&' and ';'.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return *s* with every entity replaced through replaceEntities."""
    replaced = r_unescape.sub(replaceEntities, s)
    return replaced
|
||||
|
||||
def fixattrs(attrs):
    """Return the (name, value) pairs of *attrs* with the values unescaped.

    A false *attrs* (None or empty) is returned unchanged.
    """
    # Fix bug in sgmllib.py
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
|
||||
|
||||
### End Entity Nonsense ###
|
||||
|
||||
def onlywhite(line):
    """Return a true value if the line consists only of space characters.

    Returns False as soon as a character other than ' ' is found, and *line*
    itself otherwise (so an empty line yields a false value).
    """
    for c in line:
        # Compare by value: identity ('is') against a string literal depends
        # on interpreter-specific string caching.
        if c != ' ':
            return False
    return line
|
||||
|
||||
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* unchanged when BODY_WIDTH is false.  Otherwise every line
    that does not start with ' ', '-' or '*' is wrapped at BODY_WIDTH columns
    and followed by a blank line; other non-blank lines are copied as they
    are, and at most two consecutive newlines are kept for empty lines.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    chunks = []
    # Number of newlines that currently end the output (capped at 2).
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # Compare by value; identity against string literals is not
            # guaranteed to work.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    chunks.append(line + "\n")
                chunks.append("\n")
                newlines = 2
            else:
                if not onlywhite(para):
                    chunks.append(para + "\n")
                    newlines = 1
        else:
            if newlines < 2:
                chunks.append("\n")
                newlines += 1
    return ''.join(chunks)
|
||||
|
||||
def hn(tag):
    """Return the heading level (1-9) of tags 'h1'..'h9', and 0 otherwise."""
    # Checking the length first makes an empty tag name safe.
    if len(tag) == 2 and tag[0] == 'h':
        try:
            n = int(tag[1])
        except ValueError:
            # Two-letter tags such as 'hr' are not headings.
            return 0
        if 1 <= n <= 9:
            return n
    return 0
|
||||
|
||||
class _html2text(sgmllib.SGMLParser):
|
||||
def __init__(self, out=None, baseurl=''):
    """Set up the parser state.

    *out* is the callable that receives the output text; when it is None
    the text is collected in self.outtext.  *baseurl* is stored for
    resolving link targets.
    """
    sgmllib.SGMLParser.__init__(self)

    if out is not None:
        self.out = out
    else:
        self.out = self.outtextf
    self.outtext = u''
    self.baseurl = baseurl

    # Output state.
    self.quiet = 0
    self.p_p = 0
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.lastWasNL = 0

    # Links.
    self.a = []
    self.astack = []
    self.acount = 0

    # Block-level context.
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0

    # Abbreviations.
    self.abbr_title = None # current abbreviation definition
    self.abbr_data = None # last inner HTML (for abbr being defined)
    self.abbr_list = {} # stack of abbreviations to write later
|
||||
|
||||
def outtextf(self, s):
    """Append *s* to the collected output text."""
    self.outtext = self.outtext + s
|
||||
|
||||
def close(self):
    """Finish parsing, flush the pending output and return self.outtext."""
    sgmllib.SGMLParser.close(self)

    # Request a line break, then emit the end-of-document output.
    self.pbr()
    self.o('', puredata=0, force='end')

    return self.outtext
|
||||
|
||||
def handle_charref(self, c):
    """Output the text for the numeric character reference *c*."""
    text = charref(c)
    self.o(text)
|
||||
|
||||
def handle_entityref(self, c):
    """Output the text for the named entity *c*."""
    text = entityref(c)
    self.o(text)
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
    """Pass a start tag and its attributes on to handle_tag."""
    self.handle_tag(tag, attrs, start=1)
|
||||
|
||||
def unknown_endtag(self, tag):
    """Pass an end tag on to handle_tag; end tags carry no attributes."""
    self.handle_tag(tag, attrs=None, start=0)
|
||||
|
||||
def previousIndex(self, attrs):
    """Return the index in self.a of a link equal to *attrs*, else None.

    Two links are equal when their 'href' values are equal and either
    neither has a 'title' or both have the same 'title'.  None is also
    returned when *attrs* has no 'href'.
    """
    if 'href' not in attrs:
        return None

    for i, a in enumerate(self.a):
        if 'href' not in a or a['href'] != attrs['href']:
            continue
        if 'title' in a or 'title' in attrs:
            # When a title is involved, both links must carry the same one.
            if 'title' in a and 'title' in attrs and a['title'] == attrs['title']:
                return i
        else:
            return i
    return None
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
    """Emit the Markdown markup for one start tag or end tag.

    *start* is true for a start tag and false for an end tag.  *attrs* is
    the list of (name, value) pairs of a start tag (None for an end tag).
    """
    attrs = fixattrs(attrs)

    if hn(tag):
        self.p()
        if start: self.o(hn(tag)*"#" + ' ')

    if tag in ['p', 'div']: self.p()

    # A Markdown hard line break is two trailing spaces plus the newline.
    if tag == "br" and start: self.o("  \n")

    if tag == "hr" and start:
        self.p()
        self.o("* * *")
        self.p()

    if tag in ["head", "style", 'script']:
        if start: self.quiet += 1
        else: self.quiet -= 1

    if tag in ["body"]:
        self.quiet = 0 # sites like 9rules.com never close <head>

    if tag == "blockquote":
        if start:
            self.p(); self.o('> ', 0, 1); self.start = 1
            self.blockquote += 1
        else:
            self.blockquote -= 1
            self.p()

    if tag in ['em', 'i', 'u']: self.o("_")
    if tag in ['strong', 'b']: self.o("**")
    if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
    if tag == "abbr":
        if start:
            attrs = dict(attrs)

            self.abbr_title = None
            self.abbr_data = ''
            if 'title' in attrs:
                self.abbr_title = attrs['title']
        else:
            if self.abbr_title is not None:
                self.abbr_list[self.abbr_data] = self.abbr_title
                self.abbr_title = None
            self.abbr_data = ''

    if tag == "a":
        if start:
            attrs = dict(attrs)
            if 'href' in attrs and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                self.astack.append(attrs)
                self.o("[")
            else:
                # Placeholder so the matching end tag pops the right entry.
                self.astack.append(None)
        else:
            if self.astack:
                a = self.astack.pop()
                if a:
                    # Reuse the reference number of an identical earlier link.
                    i = self.previousIndex(a)
                    if i is not None:
                        a = self.a[i]
                    else:
                        self.acount += 1
                        a['count'] = self.acount
                        a['outcount'] = self.outcount
                        self.a.append(a)
                    self.o("][" + repr(a['count']) + "]")

    if tag == "img" and start:
        attrs = dict(attrs)
        if 'src' in attrs:
            # Images are numbered in the same reference list as links.
            attrs['href'] = attrs['src']
            alt = attrs.get('alt', '')
            i = self.previousIndex(attrs)
            if i is not None:
                attrs = self.a[i]
            else:
                self.acount += 1
                attrs['count'] = self.acount
                attrs['outcount'] = self.outcount
                self.a.append(attrs)
            self.o("![")
            self.o(alt)
            self.o("]["+repr(attrs['count'])+"]")

    if tag == 'dl' and start: self.p()
    if tag == 'dt' and not start: self.pbr()
    if tag == 'dd' and start: self.o('    ')
    if tag == 'dd' and not start: self.pbr()

    if tag in ["ol", "ul"]:
        if start:
            self.list.append({'name':tag, 'num':0})
        else:
            if self.list: self.list.pop()

        self.p()

    if tag == 'li':
        if start:
            self.pbr()
            if self.list: li = self.list[-1]
            else: li = {'name':'ul', 'num':0}
            # Two spaces of indentation per list nesting level.
            self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
            if li['name'] == "ul": self.o("* ")
            elif li['name'] == "ol":
                li['num'] += 1
                self.o(repr(li['num'])+". ")
            self.start = 1
        else:
            self.pbr()

    if tag in ["table", "tr"] and start: self.p()
    if tag == 'td': self.pbr()

    if tag == "pre":
        if start:
            self.startpre = 1
            self.pre = 1
        else:
            self.pre = 0
        self.p()
|
||||
|
||||
def pbr(self):
|
||||
if self.p_p == 0: self.p_p = 1
|
||||
|
||||
def p(self): self.p_p = 2
|
||||
|
||||
def o(self, data, puredata=0, force=0):
|
||||
if self.abbr_data is not None: self.abbr_data += data
|
||||
|
||||
if not self.quiet:
|
||||
if puredata and not self.pre:
|
||||
data = re.sub('\s+', ' ', data)
|
||||
if data and data[0] == ' ':
|
||||
self.space = 1
|
||||
data = data[1:]
|
||||
if not data and not force: return
|
||||
|
||||
if self.startpre:
|
||||
#self.out(" :") #TODO: not output when already one there
|
||||
self.startpre = 0
|
||||
|
||||
bq = (">" * self.blockquote)
|
||||
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
|
||||
|
||||
if self.pre:
|
||||
bq += " "
|
||||
data = data.replace("\n", "\n"+bq)
|
||||
|
||||
if self.start:
|
||||
self.space = 0
|
||||
self.p_p = 0
|
||||
self.start = 0
|
||||
|
||||
if force == 'end':
|
||||
# It's the end.
|
||||
self.p_p = 0
|
||||
self.out("\n")
|
||||
self.space = 0
|
||||
|
||||
|
||||
if self.p_p:
|
||||
self.out(('\n'+bq)*self.p_p)
|
||||
self.space = 0
|
||||
|
||||
if self.space:
|
||||
if not self.lastWasNL: self.out(' ')
|
||||
self.space = 0
|
||||
|
||||
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
|
||||
if force == "end": self.out("\n")
|
||||
|
||||
newa = []
|
||||
for link in self.a:
|
||||
if self.outcount > link['outcount']:
|
||||
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
|
||||
if link.has_key('title'): self.out(" ("+link['title']+")")
|
||||
self.out("\n")
|
||||
else:
|
||||
newa.append(link)
|
||||
|
||||
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
|
||||
|
||||
self.a = newa
|
||||
|
||||
if self.abbr_list and force == "end":
|
||||
for abbr, definition in self.abbr_list.items():
|
||||
self.out(" *[" + abbr + "]: " + definition + "\n")
|
||||
|
||||
self.p_p = 0
|
||||
self.out(data)
|
||||
self.lastWasNL = data and data[-1] == '\n'
|
||||
self.outcount += 1
|
||||
|
||||
def handle_data(self, data):
|
||||
if r'\/script>' in data: self.quiet -= 1
|
||||
self.o(data, 1)
|
||||
|
||||
def unknown_decl(self, data): pass
|
||||
|
||||
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
|
||||
|
||||
def html2text_file(html, out=wrapwrite, baseurl=''):
    """Convert ``html`` with a fresh ``_html2text`` converter.

    ``out`` and ``baseurl`` are handed straight to the converter's
    constructor.  The markup is fed in, followed by an empty string, and the
    value returned by the converter's ``close()`` is passed back to the
    caller.
    """
    converter = _html2text(out, baseurl)
    for piece in (html, ""):
        converter.feed(piece)
    return converter.close()
|
||||
|
||||
def html2text(html, baseurl=''):
|
||||
return optwrap(html2text_file(html, None, baseurl))
|
||||
|
||||
if __name__ == "__main__":
|
||||
baseurl = ''
|
||||
if sys.argv[1:]:
|
||||
arg = sys.argv[1]
|
||||
if arg.startswith('http://'):
|
||||
baseurl = arg
|
||||
j = urllib.urlopen(baseurl)
|
||||
try:
|
||||
from feedparser import _getCharacterEncoding as enc
|
||||
except ImportError:
|
||||
enc = lambda x, y: ('utf-8', 1)
|
||||
text = j.read()
|
||||
encoding = enc(j.headers, text)[0]
|
||||
if encoding == 'us-ascii': encoding = 'utf-8'
|
||||
data = text.decode(encoding)
|
||||
|
||||
else:
|
||||
encoding = 'utf8'
|
||||
if len(sys.argv) > 2:
|
||||
encoding = sys.argv[2]
|
||||
data = open(arg, 'r').read().decode(encoding)
|
||||
else:
|
||||
data = sys.stdin.read().decode('utf8')
|
||||
wrapwrite(html2text(data, baseurl))
|
||||
19
fanficdownloader/html_constants.py
Normal file
19
fanficdownloader/html_constants.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Page skeleton for the single-file HTML output.  It is a string.Template
# source: ${title}, ${author} and ${body} are substituted by the writer.
# The wrapping <div> is closed explicitly so the document is well-formed
# without relying on a parser to repair it.
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h1>${title} by ${author}</h1>
${body}
</div>
</body></html>
'''
|
||||
|
||||
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''
|
||||
|
||||
XHTML_END = ''''''
|
||||
406
fanficdownloader/mediaminer.py
Normal file
406
fanficdownloader/mediaminer.py
Normal file
|
|
@ -0,0 +1,406 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class MediaMiner(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
self.storyName = ''
|
||||
self.authorName = ''
|
||||
self.storyDescription = ''
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.authorId = '0'
|
||||
self.authorURL = self.path
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = ''
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-mm_'
|
||||
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
if self.url.find('view_st.php') != -1:
|
||||
ss = self.url.split('view_st.php')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyId = ss[1].replace('/','').strip()
|
||||
elif self.url.find('view_ch.php?') != -1:
|
||||
ss = self.url.split('=')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyId = ss[-1].replace('/','').strip()
|
||||
self.path = '/fanfic/view_st.php/' + self.storyId
|
||||
self.url = 'http://' + self.host + self.path
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
elif self.url.find('view_ch.php/') != -1:
|
||||
ss = self.url.split('/')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 2:
|
||||
self.storyId = ss[-2].strip()
|
||||
self.path = '/fanfic/view_st.php/' + self.storyId
|
||||
self.url = 'http://' + self.host + self.path
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
else:
|
||||
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
|
||||
|
||||
logging.debug('self.storyId=%s' % self.storyId)
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
if not self.appEngine:
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
else:
|
||||
self.opener = None
|
||||
|
||||
logging.debug("Created MediaMiner: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def _getVarValue(self, varstr):
|
||||
#logging.debug('_getVarValue varstr=%s' % varstr)
|
||||
vals = varstr.split('=')
|
||||
#logging.debug('vals=%s' % vals)
|
||||
retstr="".join(vals[+1:])
|
||||
#logging.debug('retstr=%s' % retstr)
|
||||
if retstr.startswith(' '):
|
||||
retstr = retstr[1:]
|
||||
if retstr.endswith(';'):
|
||||
retstr = retstr[:-1]
|
||||
return retstr
|
||||
|
||||
def _splitCrossover(self, subject):
|
||||
if "Crossover" in subject:
|
||||
self.addSubject ("Crossover")
|
||||
logging.debug('Crossover=%s' % subject)
|
||||
if subject.find(' and ') != -1:
|
||||
words = subject.split(' ')
|
||||
logging.debug('words=%s' % words)
|
||||
subj = ''
|
||||
for s in words:
|
||||
if s in "and Crossover":
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
subj = ''
|
||||
else:
|
||||
if len(subj) > 0:
|
||||
subj = subj + ' '
|
||||
subj = subj + s
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
return True
|
||||
|
||||
def _splitGenre(self, subject):
|
||||
if len(subject) > 0:
|
||||
words = subject.split('/')
|
||||
logging.debug('words=%s' % words)
|
||||
for subj in words:
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = None
|
||||
try:
|
||||
data = self.fetchUrl(self.url)
|
||||
except Exception, e:
|
||||
data = None
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
#data.replace('<br />',' ').replace('<br>',' ').replace('</br>',' ')
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
logging.error("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
#logging.debug('soap=%s' % soup)
|
||||
urls = []
|
||||
|
||||
td_ffh = soup.find('td', {'class' : 'ffh'})
|
||||
#logging.debug('td_ffh=%s' % td_ffh)
|
||||
if td_ffh is not None:
|
||||
#logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
|
||||
self.storyName = unicode(td_ffh.find(text=True)).strip()
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
fft = td_ffh.find('font', {'class' : 'smtxt'})
|
||||
#logging.debug('fft=%s' % fft)
|
||||
if fft is not None:
|
||||
ffts = fft.string.split(' ')
|
||||
if ffts is not None:
|
||||
if len(ffts) > 1:
|
||||
self.storyRating = ffts[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
self.genre = ''
|
||||
td_smtxt = soup.findAll('td')
|
||||
if td_smtxt is None:
|
||||
#logging.debug('td_smtxt is NONE!')
|
||||
pass
|
||||
else:
|
||||
ll = len(td_smtxt)
|
||||
#logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
|
||||
for ii in range(ll):
|
||||
td = td_smtxt[ii]
|
||||
if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
|
||||
#logging.debug('td has class attribute but is not smtxt')
|
||||
continue
|
||||
ss = unicode(td).replace('\n','').replace('\r','').replace(' ', ' ')
|
||||
#logging.debug('ss=%s' % ss)
|
||||
if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
|
||||
#logging.debug('ss=%s' % ss)
|
||||
ssbs = td.findAll('b')
|
||||
#logging.debug('ssbs=%s' % ssbs)
|
||||
bb = 0
|
||||
while bb < len(ssbs):
|
||||
nvs = bs.NavigableString('')
|
||||
sst=''
|
||||
ssb = ssbs[bb]
|
||||
ssbt = unicode(ssb.text).strip()
|
||||
#logging.debug('ssb=%s' % ssb)
|
||||
#logging.debug('ssbt=%s' % ssbt)
|
||||
ssbn = ssb.nextSibling
|
||||
while ssbn is not None:
|
||||
#logging.debug('ssbn=%s' % ssbn)
|
||||
#logging.debug('ssbn.class=%s' % ssbn.__class__)
|
||||
if nvs.__class__ == ssbn.__class__:
|
||||
st = unicode(ssbn)
|
||||
if st.strip() != '|':
|
||||
sst = sst + st
|
||||
else:
|
||||
#logging.debug('ssbn.name=%s' % ssbn.name)
|
||||
if ssbn.name == 'b':
|
||||
break
|
||||
ssbnts = ssbn.findAll(text=True)
|
||||
for ssbnt in ssbnts:
|
||||
sst = sst + ssbnt
|
||||
ssbn = ssbn.nextSibling
|
||||
sst = sst.replace(' ',' ').strip()
|
||||
#logging.debug('sst=%s' % sst)
|
||||
if bb == 0:
|
||||
ssbt = ssbt.replace(':','')
|
||||
self.addSubject(ssbt)
|
||||
self.addSubject(sst)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
else:
|
||||
if ssbt == 'Genre(s):':
|
||||
self.genre = sst
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
sts = sst.split(' / ')
|
||||
for st in sts:
|
||||
self.addSubject(st.strip())
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif ssbt == 'Type:':
|
||||
self.category = sst
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self.addSubject(sst)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif ssbt == 'Author:':
|
||||
pass
|
||||
elif ssbt == 'Visits:':
|
||||
pass
|
||||
elif ssbt == 'Size:':
|
||||
pass
|
||||
elif ssbt == 'Pages:':
|
||||
pass
|
||||
elif ssbt == 'Status:':
|
||||
if sst == "Completed":
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
elif ssbt == 'Words:':
|
||||
self.numWords = sst.replace('|','').strip()
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
pass
|
||||
elif ssbt == 'Summary:':
|
||||
self.storyDescription = sst.strip()
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
|
||||
#logging.debug('sst=%s' % sst)
|
||||
ssts = sst.split(' ')
|
||||
if ssts is not None and len(ssts) > 3:
|
||||
sst = ssts[0] + ' ' + ssts[1] + ' ' + ssts[2]
|
||||
#logging.debug('sst=%s' % sst)
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
pass
|
||||
bb = bb+1
|
||||
|
||||
smtxt_as = td_smtxt[ii].findAll('a')
|
||||
#logging.debug('smtxt_as=%s' % smtxt_as)
|
||||
for smtxt_a in smtxt_as:
|
||||
if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/'):
|
||||
sta = smtxt_a['href']
|
||||
#logging.debug('sta=%s' % sta)
|
||||
stas = sta.split('/u/')
|
||||
#logging.debug('stas=%s' % stas)
|
||||
if stas is not None and len(stas) > 1:
|
||||
self.authorId = stas[1]
|
||||
self.authorURL = 'http://' + self.host + sta
|
||||
self.authorName = smtxt_a.string
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
urlstory=''
|
||||
numchapters = 0
|
||||
td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
|
||||
if td_tbbrdr is not None:
|
||||
#logging.debug('td_tbbrdr=%s' % td_tbbrdr )
|
||||
|
||||
sl = td_tbbrdr.find('select', {'name':'cid'})
|
||||
if sl is not None:
|
||||
#logging.debug('sl=%s' % sl )
|
||||
opts = sl.findAll('option')
|
||||
for o in opts:
|
||||
#logging.debug('o=%s' % o)
|
||||
if 'value' in o._getAttrMap():
|
||||
url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
|
||||
logging.debug('URL=%s, Title=%s' % (url, o.string))
|
||||
if numchapters == 0:
|
||||
ss = o.string.split('[')
|
||||
if ss is not None and len(ss) > 1:
|
||||
ssd = ss[-1].replace(']','')
|
||||
#logging.debug('ssd=%s' % ssd)
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
urls.append((url, o.string))
|
||||
numchapters = numchapters + 1
|
||||
|
||||
if numchapters == 0:
|
||||
numchapters = 1
|
||||
url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
|
||||
self.storyPublished = self.storyUpdated
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
|
||||
logging.debug('URL=%s, Title=%s' % (url, ssd))
|
||||
urls.append((url, ssd))
|
||||
|
||||
self.numChapters = unicode(numchapters)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
#logging.debug('urls=%s' % urls)
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
time.sleep( 2.0 )
|
||||
logging.debug('url=%s' % url)
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
nvs = bs.NavigableString('')
|
||||
sst=''
|
||||
allAs = soup.findAll ('a', { 'name' : 'fic_c' })
|
||||
#logging.debug('allAs=%s' % allAs)
|
||||
for a in allAs:
|
||||
#logging.debug('a=%s' % a)
|
||||
foundfirst = False
|
||||
done = False
|
||||
nxta = a.nextSibling
|
||||
while nxta is not None and not done:
|
||||
#logging.debug('nxta=%s' % nxta)
|
||||
#logging.debug('nxta.class=%s' % nxta.__class__)
|
||||
st = unicode(nxta)
|
||||
if nvs.__class__ != nxta.__class__:
|
||||
#logging.debug('nxta.name=%s' % nxta.name)
|
||||
if nxta.name == 'table':
|
||||
st = ''
|
||||
if foundfirst:
|
||||
done = True
|
||||
if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
|
||||
st = ''
|
||||
done = True
|
||||
|
||||
if nxta.name == 'br':
|
||||
if not foundfirst:
|
||||
st = ''
|
||||
else:
|
||||
foundfirst = True
|
||||
else:
|
||||
foundfirst = True
|
||||
|
||||
sst = sst + st
|
||||
nxta = nxta.nextSibling
|
||||
|
||||
if sst is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return sst
|
||||
|
||||
class FPC_UnitTests(unittest.TestCase):
    # Live-network smoke test: it fetches a real story page, so it needs
    # internet access and depends on the remote content staying unchanged.
    def setUp(self):
        # Show debug-level logging while the test runs.
        logging.basicConfig(level=logging.DEBUG)
        pass

    def testFictionPress(self):
        # NOTE(review): FPCom is not defined in the visible part of this
        # module, and the URL is a fictionpress.com story rather than a
        # MediaMiner one -- presumably copied from the FictionPress adapter.
        # Confirm whether one of the star imports provides FPCom or whether
        # this test should exercise MediaMiner instead.
        url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
        f = FPCom(url)
        urls = f.extractIndividualUrls()

        # Metadata scraped from the story page.
        self.assertEquals('Behind This Facade', f.getStoryName())
        self.assertEquals('IntoxicatingMelody', f.getAuthorName())

        # The chapter text must contain a known sentence fragment.
        text = f.getText(url)
        self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
424
fanficdownloader/output.py
Normal file
424
fanficdownloader/output.py
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import unicodedata
|
||||
import codecs
|
||||
import shutil
|
||||
import string
|
||||
import os.path
|
||||
import zipfile
|
||||
import StringIO
|
||||
import logging
|
||||
import hashlib
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import zipdir
|
||||
import html_constants
|
||||
from constants import *
|
||||
|
||||
|
||||
import html2text
|
||||
import datetime
|
||||
|
||||
|
||||
class FanficWriter:
    """Common interface of the output writers; every operation is a no-op."""

    @staticmethod
    def getFormatName():
        """Return the short name identifying this output format."""
        return 'base'

    @staticmethod
    def getFormatExt():
        """Return the file extension, including the leading dot."""
        return '.bse'

    def __init__(self):
        """Nothing to set up in the base writer."""

    def writeChapter(self, index, title, text):
        """Accept one chapter; the base writer discards it."""

    def finalise(self):
        """Finish the output; the base writer has nothing to do."""
|
||||
|
||||
class TextWriter(FanficWriter):
    """Writer that produces the story as plain text.

    Chapters are collected by an in-memory HTMLWriter; finalise() converts
    the resulting HTML to text with html2text and stores it in
    ``self.output`` (in-memory mode) or writes it to ``self.fileName``.
    """
    htmlWriter = None

    @staticmethod
    def getFormatName():
        """Return the short name identifying this output format."""
        return 'text'

    @staticmethod
    def getFormatExt():
        """Return the file extension, including the leading dot."""
        return '.txt'

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Create the writer.

        ``base`` and ``adapter`` are forwarded to the HTMLWriter, which is
        always run in memory.  ``compress`` is accepted but not used here.
        """
        self.inmemory = inmemory
        self.htmlWriter = HTMLWriter(base, adapter, True, False)

    def writeChapter(self, index, title, text):
        """Hand the chapter to the underlying HTMLWriter."""
        self.htmlWriter.writeChapter(index, title, text)

    def finalise(self):
        """Render the collected HTML as UTF-8 text and emit it."""
        self.htmlWriter.finalise()
        self.name = self.htmlWriter.name

        # Swap only the trailing extension; a plain str.replace would also
        # rewrite ".html" occurring elsewhere in the path.
        stem = self.htmlWriter.fileName
        htmlExt = self.htmlWriter.getFormatExt()
        if stem.endswith(htmlExt):
            stem = stem[:-len(htmlExt)]
        self.fileName = stem + self.getFormatExt()

        # Convert before opening the target so a conversion failure does
        # not leave an empty, unclosed file behind.
        html = self.htmlWriter.output.getvalue().decode('utf-8')
        text = html2text.html2text(html).encode('utf-8')

        if self.inmemory:
            self.output = StringIO.StringIO()
            self.output.write(text)
        else:
            self.output = open(self.fileName, 'w')
            try:
                self.output.write(text)
            finally:
                self.output.close()
|
||||
|
||||
|
||||
class HTMLWriter(FanficWriter):
|
||||
body = ''
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'html'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.html'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.adapter = adapter
|
||||
|
||||
self.inmemory = inmemory
|
||||
|
||||
if not self.inmemory and os.path.exists(self.fileName):
|
||||
os.remove(self.fileName)
|
||||
|
||||
if self.inmemory:
|
||||
self.output = StringIO.StringIO()
|
||||
else:
|
||||
self.output = open(self.fileName, 'w')
|
||||
|
||||
self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
|
||||
self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
|
||||
|
||||
def _printableVersion(self, text):
|
||||
try:
|
||||
d = text.decode('utf-8')
|
||||
return d
|
||||
except:
|
||||
return text
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
title = self._printableVersion(title) #title.decode('utf-8')
|
||||
text = self._printableVersion(text) #text.decode('utf-8')
|
||||
self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
|
||||
self.body = self.body + '\n' + text
|
||||
|
||||
def finalise(self):
|
||||
html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
|
||||
soup = bs.BeautifulSoup(html)
|
||||
result = soup.__str__('utf8')
|
||||
|
||||
# f = open(self.fileName, 'w')
|
||||
# f.write(result)
|
||||
# f.close()
|
||||
|
||||
self.output.write(result)
|
||||
if not self.inmemory:
|
||||
self.output.close()
|
||||
|
||||
class EPubFanficWriter(FanficWriter):
|
||||
chapters = []
|
||||
|
||||
files = {}
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'epub'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.epub'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=True):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.directory = self.basePath + '/' + self.name
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.inmemory = inmemory
|
||||
self.adapter = adapter
|
||||
|
||||
self.files = {}
|
||||
self.chapters = []
|
||||
|
||||
if not self.inmemory:
|
||||
self.inmemory = True
|
||||
self.writeToFile = True
|
||||
else:
|
||||
self.writeToFile = False
|
||||
|
||||
if not self.inmemory:
|
||||
if os.path.exists(self.directory):
|
||||
shutil.rmtree(self.directory)
|
||||
|
||||
os.mkdir(self.directory)
|
||||
|
||||
os.mkdir(self.directory + '/META-INF')
|
||||
os.mkdir(self.directory + '/OEBPS')
|
||||
|
||||
self._writeFile('mimetype', MIMETYPE)
|
||||
self._writeFile('META-INF/container.xml', CONTAINER)
|
||||
self._writeFile('OEBPS/stylesheet.css', CSS)
|
||||
|
||||
def _writeFile(self, fileName, data):
|
||||
#logging.debug('_writeFile(`%s`, data)' % fileName)
|
||||
if fileName in self.files:
|
||||
try:
|
||||
d = data.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
d = data
|
||||
|
||||
self.files[fileName].write(d)
|
||||
else:
|
||||
if self.inmemory:
|
||||
self.files[fileName] = StringIO.StringIO()
|
||||
else:
|
||||
self.files[fileName] = open(self.directory + '/' + fileName, encoding='utf-8', mode='w')
|
||||
|
||||
self._writeFile(fileName, data)
|
||||
|
||||
|
||||
def _closeFiles(self):
|
||||
if not self.inmemory:
|
||||
for f in self.files:
|
||||
self.files[f].close()
|
||||
|
||||
def writeChapter(self, index, title, text):
    """Clean one chapter and store it as ``OEBPS/chapterNNNN.xhtml``.

    Entities in ``title`` and ``text`` are normalised with removeEntities,
    attributes not listed in ``acceptable_attributes`` are stripped,
    ``<u>`` and ``<center>`` are turned into classed ``<span>`` / ``<div>``
    elements, and tags whose only content is whitespace are removed.  The
    chapter is written through ``self._writeFile`` and recorded in
    ``self.chapters`` as ``(title, fileName)``.
    """
    title = removeEntities(title)
    logging.debug("Writing chapter: %s" % title)
    fileName = "chapter%04d.xhtml" % index
    fn = 'OEBPS/' + fileName

    text = removeEntities(text)

    # BeautifulStoneSoup doesn't have any selfClosingTags by default.
    # hr & br needs to be if they're going to work.
    # Some stories do use multiple br tags as their section breaks...
    self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))

    allTags = self.soup.findAll(recursive=True)
    for t in allTags:
        for attr in t._getAttrMap().keys():
            if attr not in acceptable_attributes:
                del t[attr]
        # these are not acceptable strict XHTML.  But we do already have
        # CSS classes of the same names defined in constants.py
        # Exact comparisons: ('u') and ('center') are plain strings, so
        # "in" would be a substring test rather than tuple membership.
        if t.name == 'u':
            t['class'] = t.name
            t.name = 'span'
        if t.name == 'center':
            t['class'] = t.name
            t.name = 'div'
        # removes paired, but empty tags.
        if t.string is not None and len(t.string.strip()) == 0:
            t.extract()

    text = self.soup.__str__('utf8')

    # ffnet(& maybe others) gives the whole chapter text
    # as one line.  This causes problems for nook(at
    # least) when the chapter size starts getting big
    # (200k+)  Using Soup's prettify() messes up italics
    # and such.  Done after soup extract so <p> and <br>
    # tags are normalized.  Doing it here seems less evil
    # than hacking BeautifulSoup, but it's debatable.
    text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

    # XHTML_START here is a %-format with two title slots (presumably the
    # one provided by the constants star import -- confirm).
    self._writeFile(fn, XHTML_START % (title, title))
    self._writeFile(fn, text)
    self._writeFile(fn, XHTML_END)

    self.chapters.append((title, fileName))
|
||||
|
||||
def finalise(self):
|
||||
logging.debug("Finalising...")
|
||||
### writing table of contents -- ncx file
|
||||
|
||||
tocFilePath = "OEBPS/toc.ncx"
|
||||
# toc = open(tocFilePath, 'w')
|
||||
# print >> toc, TOC_START % self.storyTitle
|
||||
self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
|
||||
|
||||
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
|
||||
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
|
||||
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
|
||||
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
|
||||
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
description = self.adapter.getStoryDescription()
|
||||
if hasattr(description, "text"):
|
||||
description = unicode(description.text)
|
||||
else:
|
||||
description = unicode(description)
|
||||
if description is not None and len(description) > 0:
|
||||
description = description.replace ('\\\'', '\'').replace('\\\"', '\"')
|
||||
description = removeEntities(description)
|
||||
else:
|
||||
description = ' '
|
||||
|
||||
### writing content -- title page
|
||||
titleFilePath = "OEBPS/title_page.xhtml"
|
||||
self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
|
||||
tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
|
||||
tmpstr = unicode(self.adapter.getNumChapters()) + " / " + unicode(self.adapter.getNumWords())
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))
|
||||
|
||||
self._writeFile(titleFilePath, TITLE_FOOTER % description )
|
||||
|
||||
### writing content -- opf file
|
||||
opfFilePath = "OEBPS/content.opf"
|
||||
|
||||
# opf = open(opfFilePath, 'w')
|
||||
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))
|
||||
|
||||
i = 0
|
||||
subjs = []
|
||||
subjs = self.adapter.getSubjects()
|
||||
for subj in subjs:
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
|
||||
i = i + 1
|
||||
if (i <= 0):
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
|
||||
|
||||
self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
|
||||
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
|
||||
|
||||
ids = []
|
||||
|
||||
i = 0
|
||||
|
||||
t = "Title Page"
|
||||
f = "title_page.xhtml"
|
||||
chapterId = "Title Page"
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
||||
i = i + 1
|
||||
|
||||
for t,f in self.chapters:
|
||||
chapterId = "chapter%04d" % i
|
||||
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
||||
i = i + 1
|
||||
|
||||
# logging.d('Toc and refs printed, proceesing to ref-ids....')
|
||||
|
||||
self._writeFile(tocFilePath, TOC_END)
|
||||
self._writeFile(opfFilePath, CONTENT_END_MANIFEST)
|
||||
|
||||
for chapterId in ids:
|
||||
self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)
|
||||
|
||||
self._writeFile(opfFilePath, CONTENT_END)
|
||||
|
||||
self._closeFiles()
|
||||
|
||||
filename = self.directory + self.getFormatExt()
|
||||
|
||||
zipdata = zipdir.inMemoryZip(self.files)
|
||||
|
||||
if self.writeToFile:
|
||||
f = open(filename, 'wb')
|
||||
f.write(zipdata.getvalue())
|
||||
f.close()
|
||||
else:
|
||||
self.output = zipdata
|
||||
|
||||
# zipdir.toZip(filename, self.directory)
|
||||
|
||||
def unirepl(match):
    """Return the character for one numeric character reference match.

    ``match`` must carry the hex marker in group 1 ('x' or 'X'; empty or
    unmatched for decimal) and the digits in group 2.  If the number is not
    a representable code point, the matched text is returned unchanged
    instead of raising.
    """
    if match.group(1) in ('x', 'X'):
        radix = 16
    else:
        radix = 10
    try:
        return unichr(int(match.group(2), radix))
    except (ValueError, OverflowError):
        return match.group(0)


def replaceNumberEntities(data):
    """Replace ``&#NNN;`` and ``&#xHHH;`` references in ``data`` by characters.

    Hexadecimal references may use the digits a-f / A-F; decimal references
    accept only 0-9.  Text that is not a complete reference is left alone.
    """
    # Group 2 is conditional on group 1: hex digits after an x marker,
    # decimal digits otherwise.  This keeps the two-group layout that
    # unirepl expects.
    p = re.compile(r'&#([xX])?((?(1)[0-9a-fA-F]+|\d+));')
    return p.sub(unirepl, data)
|
||||
|
||||
def removeEntities(text):
    """Return ``text`` with HTML entities reduced to what XHTML allows.

    Numeric references are turned into characters, the named entities
    listed in ``entities`` are replaced by their mapped values, and finally
    bare ``&`` characters are escaped again so that only ``&amp;``,
    ``&lt;`` and ``&gt;`` remain as entities.
    """
    # Byte strings are decoded as UTF-8.  A unicode string with non-ASCII
    # characters makes decode() raise UnicodeEncodeError; it is then turned
    # into ASCII with numeric character references, which are resolved
    # further down.
    try:
        t = text.decode('utf-8')
    except UnicodeEncodeError:
        try:
            t = text.encode('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            t = text
    text = t

    # replace numeric versions of [&<>] with named versions.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining &#000; entities with unicode value, such as &#039; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    # reverse sort will put entities with ; before the same one without, when valid.
    for e in reversed(sorted(entities.keys())):
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py
            text = text.replace(e, v.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
|
||||
|
||||
def makeAcceptableFilename(text):
    """Return ``text`` reduced to letters, digits, ``_`` and ``-``.

    Entities are expanded first; spaces and colons become underscores and
    every other character outside the allowed set is dropped.
    """
    plain = removeEntities(text)
    underscored = plain.replace(" ", "_").replace(":","_")
    return re.sub('[^a-zA-Z0-9_-]+','',underscored)
|
||||
367
fanficdownloader/potionsNsnitches.py
Normal file
367
fanficdownloader/potionsNsnitches.py
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copied from the twilighted.py because site is almost the same..
|
||||
# of course, now that we're trying to scrape more detail about the
|
||||
# story, there were differences in how headers are displayed
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
class PotionsNSnitches(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Set default story metadata and reduce ``url`` to the bare story URL.

    The story id is read from the ``sid`` query parameter (``'0'`` when it
    is missing) and ``self.chapurl`` records whether a ``chapter``
    parameter was present.  ``self.url`` is then rebuilt as
    ``http://<host><path>?sid=<storyId>``.
    """
    parsedUrl = up.urlparse(url)
    self.url = url
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())
    self.password = ''
    self.login = 'sigizmund'
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.authorURL = ''
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction', 'Harry Potter']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = 'FanFiction'
    self.category = 'Category'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-pns_'

    self.chapurl = False
    # The separator may arrive HTML-escaped ('&amp;'); normalise it before
    # splitting.  Parameters are recognised by name, in any position.
    query = parsedUrl.query.replace('&amp;', '&')
    for param in query.split('&'):
        key, sep, value = param.partition('=')
        if not sep:
            continue
        if key == 'sid' and self.storyId == '0':
            self.storyId = value
        elif key == 'chapter':
            self.chapurl = True

    self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId

    logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
|
||||
|
||||
|
||||
def _getLoginScript(self):
    """Return the site-relative path of the login script."""
    login_path = '/user.php?action=login'
    return login_path
|
||||
|
||||
def reqLoginData(self, data):
    """Return True when ``data`` contains one of the site's login-needed messages."""
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for marker in markers:
        if marker in data:
            return True
    return False
|
||||
|
||||
def _fillCharacters(self, strlist, idx, maxlen):
    """Register character names from ``strlist[idx:maxlen]``.

    Blank entries and entries containing a comma are skipped.  Scanning
    stops at the first entry containing ``':'`` (the next label) and the
    index just before it is returned; otherwise the index where scanning
    ended is returned.
    """
    if idx >= maxlen:
        return idx
    for pos in range(idx, maxlen):
        name = strlist[pos].strip()
        if not name:
            continue
        if ':' in name:
            return pos - 1
        if ',' not in name:
            self.addCharacter(name)
    return maxlen
|
||||
|
||||
def _buildGenre(self, strlist, idx, maxlen):
    """Rebuild ``self.genre`` from ``strlist[idx:maxlen]``.

    Each non-blank entry without a comma is added as a subject and
    appended to ``self.genre``; an entry containing a comma contributes
    the separator ``', '``.  Scanning stops at the first entry containing
    ``':'`` and the index just before it is returned; otherwise the index
    where scanning ended is returned.
    """
    self.genre = ''
    if idx >= maxlen:
        return idx
    for pos in range(idx, maxlen):
        piece = strlist[pos].strip()
        if not piece:
            continue
        if ':' in piece:
            return pos - 1
        if ',' in piece:
            piece = ', '
        else:
            self.addSubject(piece)
        self.genre += piece
    return maxlen
|
||||
|
||||
def _buildCategory(self, strlist, idx, maxlen):
    """Rebuild ``self.category`` from ``strlist[idx:maxlen]``.

    Each non-blank entry without a comma is added as a subject and
    appended to ``self.category``; an entry containing a comma contributes
    the separator ``', '``.  Scanning stops at the first entry containing
    ``':'`` and the index just before it is returned; otherwise the index
    where scanning ended is returned.
    """
    self.category = ''
    if idx >= maxlen:
        return idx
    for pos in range(idx, maxlen):
        piece = strlist[pos].strip()
        if not piece:
            continue
        if ':' in piece:
            return pos - 1
        if ',' in piece:
            piece = ', '
        else:
            self.addSubject(piece)
        self.category += piece
    return maxlen
|
||||
|
||||
# Fetch the story pages, fill in the metadata attributes on self and return
# the chapter list as (url, title) tuples.
#
# Steps, in order:
#  1. Read self.url + '&chapter=1'; if reqLoginData() flags the page, call
#     performLogin() and read it again, raising FailedToDownload when the
#     login message is still there.
#  2. Split the <title> text on ' by ' into storyName / authorName.
#  3. Build the result list from the <select name="chapter"> options, or a
#     single entry when no such select exists.
#  4. Read self.url + '&index=1' and scrape story/author ids, the summary,
#     the labelled fields inside <div class="content"> and the
#     Published/Updated dates.
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1'
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
# NOTE(review): data is '' (never None) after a failed read, so this check cannot fire.
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
self.storyName = ''
|
||||
self.authorName = ''
|
||||
self.storyId = '0'
|
||||
title = soup.find('title').string
|
||||
if title is not None and len(title) > 0:
|
||||
logging.debug('Title: %s' % title)
|
||||
ss = title.split(' by ')
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyName = ss[0].strip()
|
||||
self.authorName = ss[1].strip()
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
|
||||
if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
|
||||
result.append((url,chaptitle.string))
|
||||
else:
|
||||
result.append((url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
url = self.url + "&index=1"
|
||||
data = self.opener.open(url).read()
|
||||
# NOTE(review): 'lines' is not used again in this method.
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
pgt = soup.find('div', {'id' : 'pagetitle'})
|
||||
#logging.debug('pagetitle: %s' % pgt)
|
||||
pgtAs = pgt.findAll('a')
|
||||
#logging.debug('pgtAs: %s' % pgtAs)
|
||||
for a in pgtAs:
|
||||
if a['href'].find('viewstory.php') != -1:
|
||||
(u1, self.storyId) = a['href'].split('=')
|
||||
self.storyName = a.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif a['href'].find('viewuser.php') != -1:
|
||||
self.authorName = a.string
|
||||
self.authorURL = 'http://' + self.host + '/' + a['href']
|
||||
(u1, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
output = soup.find('div', {'id' : 'output'})
|
||||
#logging.debug('output: %s' % unicode(output))
|
||||
if output is not None and len(unicode(output)) > 1:
|
||||
s2 = re.split ('<[^>]+>', unicode(output))
|
||||
#logging.debug('s2=%s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll:
|
||||
if s2[ii] == 'Summary:' and ii+1 < ll:
|
||||
self.storyDescription = s2[ii+1].strip()
|
||||
logging.debug('self.storyDescription: %s' % self.storyDescription)
|
||||
break;
|
||||
ii = ii+1
|
||||
|
||||
cnt = soup.find('div', {'class' : 'content'})
|
||||
#logging.debug('content: %s' % cnt)
|
||||
cnttd = cnt.findAll('td')
|
||||
#logging.debug('cnttd: %s' % cnttd)
|
||||
for td in cnttd:
|
||||
#logging.debug('td: %s' % unicode(td))
|
||||
# NOTE(review): the last replace() presumably targeted the non-breaking-space entity before this text was HTML-decoded -- confirm against the repository.
ss = unicode(td).replace('\n','').replace('\r','').replace(' ', ' ')
|
||||
if len(ss) > 1:
|
||||
s2 = re.split ('<[^>]+>', ss)
|
||||
#logging.debug('s2=%s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll-1:
|
||||
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
||||
skey = s2[ii].strip()
|
||||
ii = ii+1
|
||||
if skey == 'Rated:':
|
||||
self.storyRating = s2[ii].strip()
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
ii = ii + 1
|
||||
elif skey == 'Chapters:':
|
||||
self.numChapters = s2[ii].strip()
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
ii = ii + 1
|
||||
elif skey == 'Characters:':
|
||||
ii = self._fillCharacters(s2, ii, ll)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
ii = ii + 1
|
||||
elif skey == 'Genres:':
|
||||
ii = self._buildGenre(s2, ii, ll)
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Categories:':
|
||||
ii = self._buildCategory(s2, ii, ll)
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Completed:':
|
||||
if s2[ii].strip(' ') == "No":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii + 1
|
||||
elif skey == 'Word count:':
|
||||
self.numWords = s2[ii].strip()
|
||||
if self.numWords is None or len(self.numWords) == 0:
|
||||
self.numWords = '0'
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
ii = ii + 1
|
||||
elif skey == 'Takes Place:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Awards:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Series:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Read:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Warnings:':
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
|
||||
tls = soup.findAll('div', {'style' : 'text-align: center;'})
|
||||
for tl in tls:
|
||||
#logging.debug('tl: %s' % tl)
|
||||
ss = unicode(tl).replace('\n','').replace('\r','').replace(' ', ' ')
|
||||
if ss.find('Published:') != -1:
|
||||
s2 = re.split ('<[^>]+>', ss)
|
||||
#logging.debug('s2: %s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll-1:
|
||||
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
||||
skey = s2[ii].strip()
|
||||
#logging.debug('skey: %s' % skey)
|
||||
ii = ii+1
|
||||
if skey == 'Published:':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ii = ii + 1
|
||||
elif skey == 'Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
|
||||
if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
|
||||
logging.error('self.storyName is empty!! Exitting!')
|
||||
# NOTE(review): exit(1) ends the whole process from inside this method; raising an exception may be preferable.
exit(1)
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
    """Download one chapter page and return its story ``<div>`` as UTF-8 markup.

    ``url`` may be absolute or relative to the site root.  Raises
    FailedToDownload when the page cannot be read, cannot be parsed, or
    has no ``<div id="story">`` element.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + self.host + '/' + url

    logging.debug('Getting data from: %s' % url)

    try:
        data = self.opener.open(url).read()
    except Exception:
        # sys.exc_info() keeps this independent of the except-clause syntax.
        err = sys.exc_info()[1]
        logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(err) + ".")
        raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
    if not data:
        raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

    # need to do this, because for some reason the <br /> tag in the story causes problems
    data = data.replace('<br />', ' SOMETHING_BR ')

    try:
        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
    except Exception:
        logging.info("Failed to decode: <%s>" % data)
        raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

    div = soup.find('div', {'id' : 'story'})
    if div is None:
        raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    # put the <br /> tags back in..
    return div.__str__('utf8').replace(' SOMETHING_BR ','<br />')
|
||||
|
||||
|
||||
class PotionsNSnitches_UnitTests(unittest.TestCase):
    """Live-site checks for the PotionsNSnitches adapter (network access needed)."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        # No login test: the adapter is created with an empty password.
        pass

    def testGetUrlsWorks(self):
        url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
        # Exercise this module's adapter; the Twilighted class is not
        # defined or imported here.
        # NOTE(review): the expected count of 32 matches the Twilighted
        # test -- confirm it is right for this story.
        self.assertEqual(32, len(PotionsNSnitches(url).extractIndividualUrls()))


if __name__ == '__main__':
    unittest.main()
|
||||
10
fanficdownloader/readme.txt
Normal file
10
fanficdownloader/readme.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
To use, do:
|
||||
|
||||
python downloader.py <url> (epub|html)
|
||||
|
||||
Eg:
|
||||
|
||||
python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo epub
|
||||
|
||||
This tool was written for Python 2.5.2 and should work with newer Python 2.x releases (the code uses Python 2-only syntax, so it does not run on Python 3).
|
||||
|
||||
316
fanficdownloader/twilighted.py
Normal file
316
fanficdownloader/twilighted.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
import twipassword
|
||||
|
||||
class Twilighted(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Set default story metadata and reduce ``url`` to the bare story URL.

    The story id is read from the ``sid`` query parameter (``'0'`` when it
    is missing) and ``self.chapurl`` records whether a ``chapter``
    parameter was present.  ``self.url`` is then rebuilt as
    ``http://<host><path>?sid=<storyId>``.
    """
    parsedUrl = up.urlparse(url)
    self.url = url
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())
    self.password = twipassword.password
    self.login = 'sigizmund'
    self.storyDescription = 'Fanfiction Story'
    self.authorId = '0'
    self.authorURL = ''
    self.storyId = '0'
    self.storyPublished = datetime.date(1970, 1, 31)
    self.storyCreated = datetime.datetime.now()
    self.storyUpdated = datetime.date(1970, 1, 31)
    self.languageId = 'en-UK'
    self.language = 'English'
    self.subjects = ['fanfiction', 'Twilight']
    self.publisher = self.host
    self.numChapters = 0
    self.numWords = 0
    self.genre = ''
    self.category = 'Fanfiction'
    self.storyStatus = 'In-Progress'
    self.storyRating = 'PG'
    self.storyUserRating = '0'
    self.storyCharacters = []
    self.storySeries = ''
    self.outputName = ''
    self.outputStorySep = '-tw_'

    self.chapurl = False
    # The separator may arrive HTML-escaped ('&amp;'); normalise it before
    # splitting.  Parameters are recognised by name, in any position.
    query = parsedUrl.query.replace('&amp;', '&')
    for param in query.split('&'):
        key, sep, value = param.partition('=')
        if not sep:
            continue
        if key == 'sid' and self.storyId == '0':
            self.storyId = value
        elif key == 'chapter':
            self.chapurl = True

    # parsedUrl.path already starts with '/', so no extra separator is
    # inserted (an extra one would yield 'http://host//viewstory.php').
    self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId

    logging.debug("Created Twilighted: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
    """Return the site-relative path that performLogin() posts to."""
    login_path = '/user.php?action=login'
    return login_path
|
||||
|
||||
def reqLoginData(self, data):
    """Return True when ``data`` contains one of the site's login-needed messages."""
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for marker in markers:
        if marker in data:
            return True
    return False
|
||||
|
||||
def requiresLogin(self, url = None):
    """Report that a login is needed; ``url`` is accepted but not consulted."""
    login_needed = True
    return login_needed
|
||||
|
||||
def performLogin(self, url = None):
    """POST the stored credentials to the login script.

    Returns True when the response does not contain a login-needed
    message, False otherwise.  ``url`` is accepted but not used.
    """
    form = {
        'penname': self.login,
        'password': self.password,
        'cookiecheck': '1',
        'submit': 'Submit',
    }
    urlvals = u.urlencode(form)

    loginUrl = 'http://' + self.host + self._getLoginScript()
    logging.debug("Will now login to URL %s" % loginUrl)

    response = self.opener.open(loginUrl, urlvals)
    page = response.read().decode('utf-8')

    return not self.reqLoginData(page)
|
||||
|
||||
# Fetch the story pages, fill in the metadata attributes on self and return
# the chapter list as (url, title) tuples.
#
# Steps, in order:
#  1. Read self.url + '&chapter=1'; if reqLoginData() flags the page, call
#     performLogin() and read it again, raising FailedToDownload when the
#     login message is still there.
#  2. Split the <title> text on ' by ' into storyName / authorName.
#  3. Build the result list from the <select name="chapter"> options, or a
#     single (self.url, storyName) entry when no such select exists.
#  4. Read self.url + '&index=1' and scrape the description, story/author
#     ids and the labelled fields (Categories, Characters, Completed, ...).
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1'
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
# NOTE(review): data is '' (never None) after a failed read, so this check cannot fire.
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
title = soup.find('title').string
|
||||
logging.debug('Title: %s' % title)
|
||||
# NOTE(review): a title without ' by ' makes the [1] lookup below raise IndexError.
self.storyName = title.split(' by ')[0].strip()
|
||||
self.authorName = title.split(' by ')[1].strip()
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
url = self.url + "&index=1"
|
||||
data = self.opener.open(url).read()
|
||||
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
metas = soup.findAll('meta')
|
||||
|
||||
for meta in metas:
|
||||
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
|
||||
#logging.debug('Meta: %s' % meta)
|
||||
if 'content' in meta._getAttrMap():
|
||||
s1 = bs.BeautifulStoneSoup(meta['content'])
|
||||
ps = s1.findAll('p')
|
||||
if len(ps) > 0:
|
||||
self.storyDescription = ps[0]
|
||||
logging.debug('self.storyDescription=%s' % (self.storyDescription))
|
||||
else:
|
||||
divs = meta.findAll('div')
|
||||
#logging.debug('Divs: %s' % divs)
|
||||
|
||||
for div in divs:
|
||||
#logging.debug('Div: %s' % div)
|
||||
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
|
||||
#logging.debug('Div PAGETITLE: %s' % div)
|
||||
allA = div.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap():
|
||||
if a['href'].find('viewstory.php?sid=') != -1:
|
||||
str1 = a.string
|
||||
(vs, self.storyId) = a['href'].split('=')
|
||||
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
|
||||
if a['href'].find('viewuser.php?uid=') != -1:
|
||||
str1 = a.string
|
||||
(vs, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
|
||||
logging.debug('self.authorURL=%s' % self.authorURL)
|
||||
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
|
||||
#logging.debug('Div CONTENT: %s' % div)
|
||||
brs = div.findAll('br')
|
||||
for br in brs:
|
||||
buf = unicode(br).encode('utf-8')
|
||||
strs = re.split ('<[^>]+>', buf)
|
||||
#logging.debug('BUF: %s' % strs)
|
||||
ii = 2
|
||||
stlen = len(strs)
|
||||
while stlen > ii+1:
|
||||
if len(strs[ii]) == 0:
|
||||
ii = ii+1
|
||||
continue
|
||||
if strs[ii] == 'Categories:':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
if len(self.genre) > 0:
|
||||
self.genre = self.genre + ', '
|
||||
# NOTE(review): this assignment replaces self.genre, discarding what the line above built; appending looks intended.
self.genre = strs[ii].strip(' ')
|
||||
if len(self.category) == 0:
|
||||
self.category = strs[ii].strip(' ')
|
||||
self.addSubject(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
# NOTE(review): this is a plain 'if' (not 'elif'); ii may already equal stlen here after the Categories scan -- verify.
if strs[ii] == 'Characters: ':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
self.addCharacter(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif strs[ii] == 'Completed:':
|
||||
if strs[ii+1].strip(' ') == "No":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii+2
|
||||
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
||||
elif strs[ii] == 'Rated:':
|
||||
self.storyRating = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif strs[ii] == 'Series:':
|
||||
self.storySeries = strs[ii+1].strip(' ')
|
||||
if self.storySeries == 'None':
|
||||
self.storySeries = ''
|
||||
ii = ii+2
|
||||
logging.debug('self.storySeries=%s' % self.storySeries)
|
||||
elif strs[ii] == 'Chapters: ':
|
||||
self.numChapters = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif strs[ii] == 'Word count:':
|
||||
self.numWords = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif strs[ii] == ' Published: ':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif strs[ii] == 'Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
|
||||
ii = ii+2
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
    """Download one chapter page and return its story ``<div>`` as UTF-8 markup.

    ``url`` may be absolute or relative to the site root.  Raises
    FailedToDownload when the page cannot be read, cannot be parsed, or
    has no ``<div id="story">`` element.
    """
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + self.host + '/' + url

    logging.debug('Getting data from: %s' % url)

    try:
        data = self.opener.open(url).read()
    except Exception:
        # sys.exc_info() keeps this independent of the except-clause syntax.
        err = sys.exc_info()[1]
        logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(err) + ".")
        raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
    if not data:
        raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

    try:
        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
    except Exception:
        logging.info("Failed to decode: <%s>" % data)
        raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

    div = soup.find('div', {'id' : 'story'})
    if div is None:
        raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

    return div.__str__('utf8')
|
||||
|
||||
|
||||
class Twilighted_UnitTests(unittest.TestCase):
    """Live-site checks for the Twilighted adapter (network access needed)."""

    STORY_URL = 'http://www.twilighted.net/viewstory.php?sid=10004'

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        adapter = Twilighted(self.STORY_URL)
        self.assertTrue(adapter.performLogin())

    def testGetUrlsWorks(self):
        chapters = Twilighted(self.STORY_URL).extractIndividualUrls()
        self.assertEquals(32, len(chapters))


if __name__ == '__main__':
    unittest.main()
|
||||
4
fanficdownloader/twipassword.py
Normal file
4
fanficdownloader/twipassword.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This is really for the web version; the command-line downloader will ask for the password.
|
||||
password='somepass'
|
||||
177
fanficdownloader/zipdir.py
Normal file
177
fanficdownloader/zipdir.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import with_statement
|
||||
|
||||
import sys
|
||||
import os
|
||||
import zlib
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
from contextlib import closing
|
||||
import logging
|
||||
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
from datetime import timedelta
|
||||
|
||||
import StringIO
|
||||
|
||||
class InvalidEPub(Exception):
    """Signals that a file is not a valid EPUB archive."""
|
||||
|
||||
def checkNewer(filename, curdte):
    """Return True when the epub at ``filename`` needs to be (re)generated.

    ``curdte`` is the story's current update date (anything with
    ``strftime``).  The date stored in the existing epub is read from the
    ``dc:date`` modification entry of ``OEBPS/content.opf`` or, failing
    that, from the "Updated:" cell of ``OEBPS/title_page.xhtml``.  False is
    returned only when such a date is found and ``curdte`` is not later
    than it; a missing, unreadable or date-less file yields True.
    """
    ret = True

    if not os.path.isfile(filename):
        logging.debug('File %s does not already exist.' % filename)
        return ret

    lastdate = None
    titleFilePath = "OEBPS/title_page.xhtml"
    contentFilePath = "OEBPS/content.opf"

    # ZipFile does not close a file object handed to it, so the handle is
    # managed here as well.
    with open(filename, 'rb') as epubfile:
        try:
            archive = ZipFile(epubfile)
        except zipfile.BadZipfile:
            logging.debug('File %s is not a valid EPub format file.' % filename)
            return ret

        with closing(archive) as epub:
            namelist = set(epub.namelist())
            if 'mimetype' not in namelist or \
               'META-INF/container.xml' not in namelist:
                logging.debug('File %s is not a valid EPub format file.' % filename)
                return ret

            if contentFilePath not in namelist:
                # no stored date to compare against: ask for an update
                return ret

            soup = bs.BeautifulStoneSoup(epub.read(contentFilePath))
            lstdte = soup.find('dc:date', {'opf:event' : 'modification'})
            if lstdte is not None:
                if lstdte.string is not None:
                    lastdate = lstdte.string.strip(' ')
            elif titleFilePath in namelist:
                soup = bs.BeautifulStoneSoup(epub.read(titleFilePath))
                fld = ''
                for td in soup.findAll('td'):
                    b = td.find('b')
                    if b is not None:
                        # remember the most recent label cell
                        fld = b.string
                    if td.string is not None and fld == "Updated:":
                        lastdate = td.string

    if lastdate is not None:
        currUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(curdte.strftime('%Y-%m-%d'), "%Y-%m-%d")))
        storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(lastdate, "%Y-%m-%d")))
        logging.debug('File %s last update date is %s, comparing to %s' % (filename, storyUpdated, currUpdated))
        if currUpdated <= storyUpdated :
            ret = False

    logging.debug("Does %s need to be updated? %s" % (filename, ret))
    return ret
|
||||
|
||||
|
||||
def toZip(filename, directory):
    """Write the contents of ``directory`` to the zip archive ``filename``.

    Entries whose name starts with ``'.'`` are skipped.  A top-level
    ``mimetype`` file is written first and uncompressed, as the epub
    standard requires; everything else is deflated.  Sub-directories are
    added through addFolderToZip().
    """
    zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
    try:
        entries = [name for name in os.listdir(directory)
                   if not name.startswith('.')]
        # os.listdir() order is arbitrary; force mimetype to the front.
        entries.sort(key=lambda name: (name != 'mimetype', name))

        for entity in entries:
            each = os.path.join(directory, entity)
            if os.path.isfile(each):
                if entity == 'mimetype':
                    zippedHelp.write(each, arcname=entity, compress_type=zipfile.ZIP_STORED)
                else:
                    zippedHelp.write(each, arcname=entity)
            else:
                addFolderToZip(zippedHelp, entity, each)
    finally:
        # close even when adding an entry fails, so the handle is released
        zippedHelp.close()
|
||||
|
||||
def addFolderToZip(zippedHelp,folder,fpath):
    """Recursively add the directory ``fpath`` to the open archive ``zippedHelp``.

    ``folder`` is the name the directory gets inside the archive; files are
    stored as ``folder/<name>`` and nested directories as
    ``folder/<sub>/<name>``.  Nothing is added when ``folder`` is ``'.'``
    or ``'..'``.
    """
    if folder == '.' or folder == '..':
        return

    for f in sorted(os.listdir(fpath)):
        # Resolve against fpath: a bare name would be checked against the
        # current working directory instead of the folder being zipped.
        full = os.path.join(fpath, f)
        arcname = folder + '/' + f
        if os.path.isfile(full):
            zippedHelp.write(full, arcname, zipfile.ZIP_DEFLATED)
        elif os.path.isdir(full):
            addFolderToZip(zippedHelp, arcname, full)
|
||||
|
||||
def inMemoryZip(files):
    """Build a zip archive in memory and return the buffer holding it.

    ``files`` maps archive paths to content; a value is either a string or
    an object with ``getvalue()`` (such as a StringIO).  Text content is
    stored UTF-8 encoded; byte strings are stored as they are.  A
    ``'mimetype'`` entry, when present, is written first and uncompressed
    as the epub standard requires.  ``files`` itself is left unmodified.
    """
    text_type = type(u'')

    def _store(archive, path):
        # Write one entry of ``files`` into ``archive``.
        content = files[path]
        if hasattr(content, 'getvalue'):
            content = content.getvalue()
        if isinstance(content, text_type):
            content = content.encode('utf-8')

        logging.debug("Writing ZIP path %s" % path)
        try:
            archive.writestr(path, content)
        except UnicodeDecodeError:
            # Retry with the path as a UTF-8 byte string.
            archive.writestr(path.encode('utf-8'), content)

    io = StringIO.StringIO()

    if 'mimetype' in files:
        # This fixes the uncompressed mimetype-first issue by opening
        # the in memory file as STORE, putting in the mimetype, then
        # closing and re-opening with DEFLATED.  while it is often
        # true that mimetype is the first file, we can't assume it,
        # because the dict object is defined as unordered.
        memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_STORED)
        memzip.debug = 3
        _store(memzip, 'mimetype')
        memzip.close()

    # open in 'a' append mode.
    memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_DEFLATED)
    memzip.debug = 3

    for path in files:
        if path == 'mimetype':
            # already written above
            continue
        _store(memzip, path)

    # declares all the files created by Windows.
    for zf in memzip.filelist:
        zf.create_system = 0

    memzip.close()

    return io
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: build a small archive in memory and save it.
    # toZip('sample.epub', "books/A_Time_To_Reflect")
    # z = zipfile.ZipFile('sample.epub', 'r')
    files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
    data = inMemoryZip(files)
    # inMemoryZip() returns the buffer object; write its bytes, in binary
    # mode so the archive is not altered by newline translation.
    f = open('res.zip', 'wb')
    try:
        f.write(data.getvalue())
    finally:
        f.close()
|
||||
21
ffstorage.py
Normal file
21
ffstorage.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from google.appengine.ext import db
|
||||
|
||||
# App Engine datastore model with one entity per download.
# NOTE(review): presumably one entity per download request, with 'failure'
# holding an error description -- confirm against the handlers that write it.
# NOTE(review): 'password' is a plain StringProperty, so whatever is stored
# in it is kept as-is in the datastore.
class OneDownload(db.Model):
|
||||
user = db.UserProperty()
|
||||
url = db.StringProperty()
|
||||
format = db.StringProperty()
|
||||
login = db.StringProperty()
|
||||
password = db.StringProperty()
|
||||
failure = db.StringProperty()
|
||||
# auto_now_add=True: the timestamp is set when the entity is first stored
date = db.DateTimeProperty(auto_now_add=True)
|
||||
|
||||
class DownloadedFanfic(db.Model):
    """Datastore record holding one generated story file and its metadata."""
    # Account the file was generated for.
    user = db.UserProperty()
    # Story URL the file was generated from.
    url = db.StringProperty()
    # Story name and author name as shown in the download lists.
    name = db.StringProperty()
    author = db.StringProperty()
    # Output format string of the stored file ('epub', 'html' or 'text').
    format = db.StringProperty()
    # Set automatically when the record is first stored.
    date = db.DateTimeProperty(auto_now_add=True)
    # The generated file contents.
    blob = db.BlobProperty()
    # NOTE(review): purpose of this field is not visible here - confirm.
    mac = db.StringProperty()
    # The "recent downloads" listing only shows records where this is False.
    cleared = db.BooleanProperty(default=False)
|
||||
109
index-ajax.html
Normal file
109
index-ajax.html
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="css/index.css" rel="stylesheet" type="text/css">
|
||||
<link type="text/css" href="http://jqueryui.com/latest/themes/base/ui.all.css" rel="stylesheet" />
|
||||
|
||||
<title>Fanfiction Downloader (fanfiction.net, fictionalley, ficwad to epub and HTML)</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<script src="/js/jquery-1.3.2.js"></script>
|
||||
<script src="/js/fdownloader.js"></script>
|
||||
|
||||
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.core.js"></script>
|
||||
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.progressbar.js"></script>
|
||||
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
<!-- <form action="/fdown" method="post"> -->
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites much easier. Please paste a URL of the <em>first chapter</em> in the box to start. Alternatively, see your personal list of <a href="/recent">previously downloaded fanfics</a>.
|
||||
</div>
|
||||
<input type="text" id='url' name="url" size="50" value='{{ url }}'>
|
||||
<div style="margin-top: 0.5em;">
|
||||
Ebook format <select name="format" id="format">
|
||||
<option value='epub'>ePub</option>
|
||||
<option value='html'>HTML</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div id='error' style='color: red'>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<div id='yourfile' style='display:none'>
|
||||
</div>
|
||||
|
||||
<div id='typebox'>
|
||||
</div>
|
||||
|
||||
<h3>
|
||||
Login and Password
|
||||
</h3>
|
||||
<div id='logpassword'>
|
||||
If the story requires a login and password to download (e.g. marked as Mature on FFA), you may need to provide your credentials to download it, otherwise just leave it empty
|
||||
</div>
|
||||
<div id='logpasswordtable'>
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Login</div>
|
||||
<div class='field'><input type='text' name='login' id='login' size='50'></div>
|
||||
</div>
|
||||
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Password</div>
|
||||
<div class='field'><input type='password' id='password' name='password' size='50'></div>
|
||||
</div>
|
||||
</div>
|
||||
<div id='submitbtn'>
|
||||
<span id='submit_button'><button onclick='downloadFanfic();'>Download</button></span>
|
||||
<span id='ajax_loader' style='display:none'><img src="/static/ajax-loader.gif"></span>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="progressbar">
|
||||
|
||||
</div>
|
||||
<div id='helpbox'>
|
||||
Few things to know, which will make your life substantially easier:
|
||||
<ol>
|
||||
<li>Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a> — how to read fiction in Stanza or any other ebook reader.</li>
|
||||
<li>Currently we support fanfiction.net, fictionpress.com, fanficauthors.net and ficwad.com</li>
|
||||
<li>Paste a URL of the first chapter of the fanfic, not the index page</li>
|
||||
<li>Fics with a single chapter are not supported (you can just copy and paste it)</li>
|
||||
<li>Stories which are too long may not be downloaded correctly and the application will report a time-out error — this is a limitation currently imposed by Google AppEngine on long-running activities</li>
|
||||
<li>FicWad support is somewhat flaky — if you feel it doesn't work for you, send all the details to me</li>
|
||||
<li>You can download fanfics and store them for 'later' by just downloading them and visiting <a href="/recent">recent downloads</a> section, but in future they will be deleted after 5 days to save the space</li>
|
||||
<li>If Downloader simply opens a download file window rather than saves the fanfic and gives you a link, it means it is too large to save in the database and you need to download it straight away</li>
|
||||
<li>If you think that something that should work in fact doesn't, drop me a mail to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a></li>
|
||||
</ol>
|
||||
Otherwise, just have fun, and if you want to say thank you — use the email above.
|
||||
</div>
|
||||
<div style='text-align: center'>
|
||||
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
|
||||
alt="Powered by Google App Engine" />
|
||||
<br/><br/>
|
||||
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
|
||||
Copyright © <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
|
||||
</div>
|
||||
<!-- </form> -->
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var pageTracker = _gat._getTracker("UA-12136939-1");
|
||||
pageTracker._trackPageview();
|
||||
} catch(err) {}</script>
|
||||
</body>
|
||||
</html>
|
||||
204
index.html
Normal file
204
index.html
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>Fanfiction Downloader — twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org to epub and HTML to Stanza, Kindle, Nook, Sony Reader</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
<div style="text-align: center">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* FFD */
|
||||
google_ad_slot = "7330682770";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
<!-- <div id='yourfile'> -->
|
||||
{{yourfile}}
|
||||
<!-- </div> -->
|
||||
|
||||
{% if authorized %}
|
||||
<form action="/fdown" method="post">
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
<p>Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites
|
||||
much easier. </p>
|
||||
<p>To support new features, such as including story summaries,
|
||||
the URL you need to use for some sites has changed. See below for example URLs for each site. </p>
|
||||
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
|
||||
</div>
|
||||
<div id='error'>
|
||||
{{ error_message }}
|
||||
</div>
|
||||
<input type="text" name="url" size="50" value='{{ url }}'>
|
||||
</div>
|
||||
|
||||
<div id='typebox'>
|
||||
<div id='typelabel'>Ebook format</div>
|
||||
<div id='typeoptions'>
|
||||
<input type='radio' name='format' value='epub' checked>EPub</input>
|
||||
<input type='radio' name='format' value='html'>HTML</input>
|
||||
<input type='radio' name='format' value='text'>Plain Text</input>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='logpasswordtable'>
|
||||
<h3>Login and Password</h3>
|
||||
<div id='logpassword'>
|
||||
|
||||
If the story requires a login and password to download (e.g. marked as Mature on FFA), you may need to provide
|
||||
your credentials to download it, otherwise just leave it empty
|
||||
</div>
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Login</div>
|
||||
<div class='field'><input type='text' name='login' size='50'></div>
|
||||
</div>
|
||||
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Password</div>
|
||||
<div class='field'><input type='password' name='password' size='50'></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='submitbtn'>
|
||||
<input type="submit" value="Download">
|
||||
</div>
|
||||
</form>
|
||||
{% else %}
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
<p>
|
||||
This is a fan fiction downloader, which makes reading stories from various websites much easier. Before you
|
||||
can start downloading fanfics, you need to login, so downloader can remember your fanfics and store them.
|
||||
</p>
|
||||
<p><a href="{{ login_url }}">Login using Google account</a></p>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div id='helpbox'>
|
||||
<dl>
|
||||
<dt>fictionalley.org
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.fictionalley.org/authors/drt/DA.html">http://www.fictionalley.org/authors/drt/DA.html</a>. Or the story text URL for
|
||||
fictionalley.org one-shots, such as
|
||||
<br /><a href="http://www.fictionalley.org/authors/drt/JOTP01a.html">http://www.fictionalley.org/authors/drt/JOTP01a.html</a>.
|
||||
<dt>fanfiction.net
|
||||
<dd>Use the URL of any story chapter, with or without story title such as
|
||||
<br /><a href="http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo">http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo</a> or
|
||||
<br /><a href="http://www.fanfiction.net/s/2345466/3/">http://www.fanfiction.net/s/2345466/3/</a>.
|
||||
<dt>fictionpress.com
|
||||
<dd>Use the URL of any story chapter, such as
|
||||
<br /><a href="http://www.fictionpress.com/s/2851771/1/Untouchable_Love">http://www.fictionpress.com/s/2851771/1/Untouchable_Love</a> or
|
||||
<br /><a href="http://www.fictionpress.com/s/2847338/6/">http://www.fictionpress.com/s/2847338/6/</a>.
|
||||
<dt>twilighted.net
|
||||
<dd>Use the URL of the start of the story, such as
|
||||
<br /><a href="http://twilighted.net/viewstory.php?sid=8422">http://twilighted.net/viewstory.php?sid=8422</a>.
|
||||
<dt>ficwad.com
|
||||
<dd>Use the URL of any story chapter, such as
|
||||
<br /><a href="http://www.ficwad.com/story/75246">http://www.ficwad.com/story/75246</a>.
|
||||
<dt>harrypotterfanfiction.com
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.harrypotterfanfiction.com/viewstory.php?psid=289208">http://www.harrypotterfanfiction.com/viewstory.php?psid=289208</a>.
|
||||
<dt>potionsandsnitches.net
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332">http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332</a>.
|
||||
<dt>mediaminer.org
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/156934">http://www.mediaminer.org/fanfic/view_st.php/156934</a>.
|
||||
Or the story URL for one-shots, such as
|
||||
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/167618">http://www.mediaminer.org/fanfic/view_st.php/167618</a>.
|
||||
</dl>
|
||||
|
||||
|
||||
A few additional things to know, which will make your life substantially easier:
|
||||
<ol>
|
||||
<li>
|
||||
First thing to know: I do not use your login and password. In fact, all I know about it is your ID – password
|
||||
is being verified by Google and is absolutely, totally unknown to anyone but you.
|
||||
</li>
|
||||
<li>
|
||||
Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a>
|
||||
— how to read fiction in Stanza or any other ebook reader.
|
||||
</li>
|
||||
<li>
|
||||
Currently we support fanfiction.net, fictionpress.com, ficwad.com, fictionalley.org, harrypotterfanfiction.com, potionsandsnitches.net, mediaminer.org and twilighted.net.
|
||||
fanficauthors.net and tthfanfic.org offer native ePub functionality.
|
||||
</li>
|
||||
<li>
|
||||
You can download fanfiction directly from your iPhone, Kindle or (possibly) other ebook reader.
|
||||
</li>
|
||||
<li>
|
||||
One-shots, fics with a single chapter, <em>are</em> now supported.
|
||||
</li>
|
||||
<li>
|
||||
You can download fanfics and store them for 'later' by just downloading them and visiting <a href="/recent">recent
|
||||
downloads</a> section.
|
||||
</li>
|
||||
<li>
|
||||
Downloaded stories are deleted after some time (which should give you enough time to download them and will keep
|
||||
Google happy about the app not going over the storage limit).
|
||||
</li>
|
||||
<li>
|
||||
If Downloader simply opens a download file window rather than saves the fanfic and gives you a link, it means it is
|
||||
too large to save in the database and you need to download it straight away.
|
||||
</li>
|
||||
<li>
|
||||
If you see some funny characters in downloaded Plain Text file, make sure you choose text file encoding UTF-8 and
|
||||
not something else.
|
||||
</li>
|
||||
<li>
|
||||
If you think that something that should work in fact doesn't, drop me a mail
|
||||
to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a>, or, even better, write an email to
|
||||
our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>. I also encourage you to join it so
|
||||
you will find out about latest updates and fixes as soon as possible
|
||||
</li>
|
||||
</ol>
|
||||
Otherwise, just have fun, and if you want to say thank you — use the contacts above.
|
||||
</div>
|
||||
<div style='text-align: center'>
|
||||
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
|
||||
alt="Powered by Google App Engine" />
|
||||
<br/><br/>
|
||||
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
|
||||
Copyright © <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
|
||||
</div>
|
||||
|
||||
<div style="margin-top: 1em; text-align: center">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* FFD */
|
||||
google_ad_slot = "7330682770";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var pageTracker = _gat._getTracker("UA-12136939-1");
|
||||
pageTracker._trackPageview();
|
||||
} catch(err) {}</script>
|
||||
</body>
|
||||
</html>
|
||||
22
index.yaml
Normal file
22
index.yaml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
indexes:
|
||||
|
||||
# AUTOGENERATED
|
||||
|
||||
# This index.yaml is automatically updated whenever the dev_appserver
|
||||
# detects that a new type of query is run. If you want to manage the
|
||||
# index.yaml file manually, remove the above marker line (the line
|
||||
# saying "# AUTOGENERATED"). If you want to manage some indexes
|
||||
# manually, move them above the marker line. The index.yaml file is
|
||||
# automatically uploaded to the admin console when you next deploy
|
||||
# your application using appcfg.py.
|
||||
|
||||
- kind: DownloadedFanfic
|
||||
properties:
|
||||
- name: cleared
|
||||
- name: date
|
||||
|
||||
- kind: DownloadedFanfic
|
||||
properties:
|
||||
- name: user
|
||||
- name: date
|
||||
direction: desc
|
||||
116
js/fdownloader.js
Normal file
116
js/fdownloader.js
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
// Key returned by the server for the download currently being polled;
// sent back as the 'key' parameter of every /progress request.
var g_CurrentKey = null;
// Number of /progress polls issued for the current download.
var g_Counter = 0;

// Polling stops (and a timeout error is shown) once g_Counter reaches this.
var COUNTER_MAX = 50;
|
||||
|
||||
|
||||
/**
 * Show an error message in the #error element, followed by a mailto link
 * the user can use to report the problem.
 *
 * @param error message to display; it is inserted as HTML.
 */
function setErrorState(error)
{
    // The original stored a copy in an undeclared (implicitly global) and
    // never-read variable and overwrote its own parameter; build the markup
    // in a local instead.
    var message = error + "<br/><a href='mailto:sigizmund@gmail.com?subject=Problem with the fanfiction downloader'>" + "Complain about this error</a>";
    $('#error').html(message);
}
|
||||
|
||||
/**
 * Remove whatever is currently displayed in the #error element.
 */
function clearErrorState()
{
    var errorBox = $('#error');
    errorBox.html('');
}
|
||||
|
||||
/**
 * Display a download link for a finished story in the #yourfile element.
 *
 * @param data object with the fields key (id passed to /file), name and
 *             author, as received from the /progress request.
 */
function showFile(data)
{
    // Build the link through the DOM API instead of string concatenation so
    // that the story name, author and key are always treated as plain text
    // and can never be interpreted as markup.
    var link = $('<a></a>')
        .attr('href', '/file?id=' + encodeURIComponent(data.key))
        .text(data.name + " by " + data.author);

    $('#yourfile').empty().append(link);
    $('#yourfile').show();
}
|
||||
|
||||
/**
 * Hide the #yourfile element that holds the download link.
 */
function hideFile()
{
    var fileBox = $('#yourfile');
    fileBox.hide();
}
|
||||
|
||||
/**
 * Poll /progress once for the download identified by g_CurrentKey and
 * schedule the next poll one second later.
 *
 * The server answers with an object whose result field is "Nope" while the
 * download is still running, "OK" when the file is ready, or an error
 * message otherwise. Polling stops after COUNTER_MAX attempts with a
 * timeout error, or as soon as a final answer has been received.
 */
function checkResults()
{
    if ( g_Counter >= COUNTER_MAX )
    {
        return;
    }

    g_Counter += 1;

    $.getJSON('/progress', { 'key' : g_CurrentKey }, function(data)
    {
        if ( data.result != "Nope" )
        {
            if ( data.result != "OK" )
            {
                leaveLoadingState();
                setErrorState(data.result);
            }
            else
            {
                showFile(data);
                leaveLoadingState();
            }

            $("#progressbar").progressbar('destroy');
            // Push the counter past the limit so that the poll which is
            // already scheduled returns immediately. Derived from
            // COUNTER_MAX (instead of a fixed 101) so it keeps working if
            // the limit is ever raised.
            g_Counter = COUNTER_MAX + 1;
        }
    });

    if ( g_Counter < COUNTER_MAX )
    {
        // Pass the function itself rather than a string to be evaluated.
        setTimeout(checkResults, 1000);
    }
    else
    {
        leaveLoadingState();
        setErrorState("Operation takes too long - terminating by timeout (story too long?)");
    }
}
|
||||
|
||||
/**
 * Switch the form into its "busy" look: the spinner replaces the
 * download button.
 */
function enterLoadingState()
{
    var button = $('#submit_button');
    var spinner = $('#ajax_loader');

    button.hide();
    spinner.show();
}
|
||||
|
||||
/**
 * Switch the form back to its idle look: the download button replaces
 * the spinner.
 */
function leaveLoadingState()
{
    var button = $('#submit_button');
    var spinner = $('#ajax_loader');

    button.show();
    spinner.hide();
}
|
||||
|
||||
/**
 * Click handler of the Download button: validate the form, submit the
 * download request to /submitDownload and start polling for the result.
 *
 * NOTE(review): the previous version stopped right after reading the format
 * (a debugging alert followed by a return), so none of the code below ever
 * ran. Confirm that the /submitDownload and /progress endpoints are deployed
 * before enabling this page.
 */
function downloadFanfic()
{
    clearErrorState();
    hideFile();

    // Declared locally; it used to leak into the global scope.
    var format = $("#format").val();

    var url = $('#url').val();
    var login = $('#login').val();
    var password = $('#password').val();

    if ( url == '' )
    {
        setErrorState('URL shouldn\'t be empty');
        return;
    }

    if ( (url.indexOf('fanfiction.net') == -1 && url.indexOf('fanficauthors') == -1 && url.indexOf('ficwad') == -1 && url.indexOf('fictionpress') == -1) || (url.indexOf('adultfanfiction.net') != -1) )
    {
        setErrorState("This source is not yet supported. Ping me if you want it!");
        return;
    }

    $.post('/submitDownload', {'url' : url, 'login' : login, 'password' : password, 'format' : format}, function(data)
    {
        // The response body is the key used to poll /progress.
        g_CurrentKey = data;
        g_Counter = 0;
        setTimeout(checkResults, 1000);
        enterLoadingState();
    });
}
|
||||
4376
js/jquery-1.3.2.js
vendored
Normal file
4376
js/jquery-1.3.2.js
vendored
Normal file
File diff suppressed because it is too large
Load diff
316
main.py
Normal file
316
main.py
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2007 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import traceback
|
||||
import StringIO
|
||||
|
||||
from google.appengine.runtime import DeadlineExceededError
|
||||
|
||||
from google.appengine.ext.webapp import template
|
||||
from google.appengine.api import users
|
||||
from google.appengine.ext import webapp
|
||||
from google.appengine.ext.webapp import util
|
||||
|
||||
from fanficdownloader.downloader import *
|
||||
from fanficdownloader.ffnet import *
|
||||
from fanficdownloader.output import *
|
||||
from fanficdownloader import twilighted
|
||||
|
||||
from google.appengine.ext import db
|
||||
|
||||
from fanficdownloader.zipdir import *
|
||||
|
||||
from ffstorage import *
|
||||
|
||||
|
||||
|
||||
class LoginRequired(webapp.RequestHandler):
    """Show the sign-in page to anonymous visitors; send everyone else home."""

    def get(self):
        """Redirect signed-in users to '/', otherwise render index-nonlogin.html."""
        if users.get_current_user():
            self.redirect('/')
            return

        logging.debug(users.create_login_url('/'))
        login_url = users.create_login_url(self.request.uri)
        page = os.path.join(os.path.dirname(__file__), 'index-nonlogin.html')
        self.response.out.write(template.render(page, {'login_url' : login_url}))
|
||||
|
||||
class MainHandler(webapp.RequestHandler):
    """Render the front page (index.html).

    Signed-in users get the download form, optionally with an error message
    or a link to a freshly generated file; anonymous visitors get a link to
    the sign-in page.
    """

    def _escapeHtml(self, text):
        """Return text with HTML-significant characters replaced by entities.

        Both quote characters are replaced because index.html places the URL
        inside a single-quoted attribute.
        """
        # '&' has to be handled first so that the entities produced by the
        # later replacements are not escaped a second time.
        for raw, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'),
                            ('"', '&quot;'), ("'", '&#39;')):
            text = text.replace(raw, entity)
        return text

    def get(self):
        """Write the rendered front page to the response.

        Reads the query parameters error, url, errtext, file, name and
        author. All of them come straight from the request, and the page
        inserts values verbatim (the 'yourfile' value is itself raw markup),
        so every one is escaped before it is handed to the template.
        """
        path = os.path.join(os.path.dirname(__file__), 'index.html')
        user = users.get_current_user()

        if user:
            error = self.request.get('error')
            template_values = {'nickname' : user.nickname(), 'authorized': True}
            url = self._escapeHtml(self.request.get('url'))
            template_values['url'] = url

            if error is not None and len(error) > 1:
                if error == 'login_required':
                    template_values['error_message'] = 'This story (or one of the chapters) requires you to be logged in.'
                elif error == 'bad_url':
                    template_values['error_message'] = 'Unsupported URL: ' + url
                elif error == 'custom':
                    template_values['error_message'] = 'Error happened: ' + self._escapeHtml(self.request.get('errtext'))

            filename = self.request.get('file')
            if len(filename) > 1:
                template_values['yourfile'] = '''<div id='yourfile'><a href='/file?id=%s'>"%s" by %s</a></div>''' % (
                    self._escapeHtml(filename),
                    self._escapeHtml(self.request.get('name')),
                    self._escapeHtml(self.request.get('author')))

            self.response.headers['Content-Type'] = 'text/html'
            self.response.out.write(template.render(path, template_values))
        else:
            logging.debug(users.create_login_url('/'))
            url = users.create_login_url(self.request.uri)
            template_values = {'login_url' : url, 'authorized': False}
            self.response.out.write(template.render(path, template_values))
|
||||
|
||||
|
||||
class FileServer(webapp.RequestHandler):
    """Serve a previously stored story file as a download."""

    def get(self):
        """Send the stored file whose datastore key is given in 'id'.

        Redirects to the front page when the id is missing or too short, or
        when no record exists for it.
        """
        fileId = self.request.get('id')

        if fileId is None or len(fileId) < 3:
            self.redirect('/')
            # redirect() only prepares the response; without returning, the
            # code below would go on to build a key from the unusable id.
            return

        key = db.Key(fileId)
        fanfic = db.get(key)

        if fanfic is None:
            # Nothing is stored under this key.
            self.redirect('/')
            return

        name = fanfic.name.encode('utf-8')
        name = makeAcceptableFilename(name)

        logging.info("Serving file: %s" % name)

        if fanfic.format == 'epub':
            self.response.headers['Content-Type'] = 'application/epub+zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
        elif fanfic.format == 'html':
            # HTML and text stories are zipped before they are stored, so
            # they are announced as ZIP archives - the same type the download
            # handler uses when it sends such a file directly.
            self.response.headers['Content-Type'] = 'application/zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
        elif fanfic.format == 'text':
            self.response.headers['Content-Type'] = 'application/zip'
            self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.txt.zip'

        self.response.out.write(fanfic.blob)
|
||||
|
||||
class RecentFilesServer(webapp.RequestHandler):
    """List the stories the signed-in user has downloaded."""

    def get(self):
        """Render recent.html with up to 100 of the user's uncleared files.

        Anonymous visitors are redirected to /login.
        """
        user = users.get_current_user()
        if not user:
            self.redirect('/login')
            # redirect() does not stop the handler; without returning, the
            # user.nickname() call below would fail because user is None.
            return

        q = DownloadedFanfic.all()
        q.filter('user =', user)
        q.filter('cleared =', False)
        fics = q.fetch(100)

        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class RecentAllFilesServer(webapp.RequestHandler):
    """List every stored story; only available to the 'sigizmund' account."""

    def get(self):
        """Render recent.html with all DownloadedFanfic records.

        Produces an empty response for anyone but the 'sigizmund' account,
        including visitors who are not signed in.
        """
        user = users.get_current_user()
        # get_current_user() returns None for anonymous visitors; check for
        # that first so nickname() is never called on None.
        if user is None or user.nickname() != 'sigizmund':
            return

        fics = db.GqlQuery("Select * From DownloadedFanfic")
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class FanfictionDownloader(webapp.RequestHandler):
|
||||
def _printableVersion(self, text):
|
||||
text = removeEntities(text)
|
||||
try:
|
||||
d = text.decode('utf-8')
|
||||
except:
|
||||
d = text
|
||||
return d
|
||||
|
||||
|
||||
def post(self):
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
user = users.get_current_user()
|
||||
if not user:
|
||||
self.redirect(users.create_login_url('/'))
|
||||
|
||||
format = self.request.get('format')
|
||||
url = self.request.get('url')
|
||||
login = self.request.get('login')
|
||||
password = self.request.get('password')
|
||||
|
||||
logging.info("Downloading: " + url)
|
||||
|
||||
adapter = None
|
||||
writerClass = None
|
||||
|
||||
download = OneDownload()
|
||||
download.user = user
|
||||
download.url = url
|
||||
download.login = login
|
||||
download.password = password
|
||||
download.format = format
|
||||
logging.info('Creating adapter...')
|
||||
|
||||
try:
|
||||
if url.find('fictionalley') != -1:
|
||||
adapter = fictionalley.FictionAlley(url)
|
||||
elif url.find('ficwad') != -1:
|
||||
adapter = ficwad.FicWad(url)
|
||||
elif url.find('fanfiction.net') != -1:
|
||||
adapter = ffnet.FFNet(url)
|
||||
elif url.find('fictionpress.com') != -1:
|
||||
adapter = fpcom.FPCom(url)
|
||||
elif url.find('harrypotterfanfiction.com') != -1:
|
||||
adapter = hpfiction.HPFiction(url)
|
||||
elif url.find('twilighted.net') != -1:
|
||||
adapter = twilighted.Twilighted(url)
|
||||
elif url.find('potionsandsnitches.net') != -1:
|
||||
adapter = potionsNsnitches.PotionsNSnitches(url)
|
||||
elif url.find('mediaminer.org') != -1:
|
||||
adapter = mediaminer.MediaMiner(url)
|
||||
else:
|
||||
logging.debug("Bad URL detected")
|
||||
self.redirect('/?error=bad_url&url=' + urlEscape(url) )
|
||||
return
|
||||
except Exception, e:
|
||||
logging.exception(e)
|
||||
download.failure = "Adapter was not created: " + str(e)
|
||||
download.put()
|
||||
|
||||
self.redirect('/?error=custom&url=' + urlEscape(url) + '&errtext=' + urlEscape(str(traceback.format_exc())) )
|
||||
return
|
||||
|
||||
logging.info('Created an adaper: %s' % adapter)
|
||||
|
||||
if len(login) > 1:
|
||||
adapter.setLogin(login)
|
||||
adapter.setPassword(password)
|
||||
|
||||
if format == 'epub':
|
||||
writerClass = output.EPubFanficWriter
|
||||
elif format == 'html':
|
||||
writerClass = output.HTMLWriter
|
||||
else:
|
||||
writerClass = output.TextWriter
|
||||
|
||||
loader = FanficLoader(adapter, writerClass, quiet = True, inmemory=True, compress=False)
|
||||
try:
|
||||
data = loader.download()
|
||||
|
||||
if format == 'html' or format == 'text':
|
||||
# data is uncompressed hence huge
|
||||
ext = '.html'
|
||||
if format == 'text':
|
||||
ext = '.txt'
|
||||
logging.debug(data)
|
||||
files = {makeAcceptableFilename(str(adapter.getOutputName())) + ext : StringIO.StringIO(data.decode('utf-8')) }
|
||||
d = inMemoryZip(files)
|
||||
data = d.getvalue()
|
||||
|
||||
|
||||
except LoginRequiredException, e:
|
||||
logging.exception(e)
|
||||
download.failure = 'Login problem detected'
|
||||
download.put()
|
||||
|
||||
self.redirect('/?error=login_required&url=' + urlEscape(url))
|
||||
return
|
||||
except:
|
||||
e = sys.exc_info()[0]
|
||||
|
||||
logging.exception(e)
|
||||
download.failure = 'Some exception happened in downloader: ' + str(e)
|
||||
download.put()
|
||||
|
||||
self.redirect('/?error=custom&url=' + urlEscape(url) + '&errtext=' + urlEscape(str(traceback.format_exc())) )
|
||||
return
|
||||
|
||||
if data == None:
|
||||
if loader.badLogin:
|
||||
logging.debug("Bad login detected")
|
||||
|
||||
download.failure = 'Login problem detected'
|
||||
download.put()
|
||||
|
||||
self.redirect('/?error=login_required&url=' + urlEscape(url))
|
||||
else:
|
||||
fic = DownloadedFanfic()
|
||||
fic.user = user
|
||||
fic.url = url
|
||||
fic.format = format
|
||||
fic.name = self._printableVersion(adapter.getOutputName())
|
||||
fic.author = self._printableVersion(adapter.getAuthorName())
|
||||
fic.blob = data
|
||||
|
||||
try:
|
||||
fic.put()
|
||||
|
||||
key = fic.key()
|
||||
|
||||
download.put()
|
||||
self.redirect('/?file='+str(key)+'&name=' + urlEscape(fic.name) + '&author=' + urlEscape(fic.author))
|
||||
|
||||
logging.info("Download finished OK")
|
||||
except Exception, e:
|
||||
logging.exception(e)
|
||||
# it was too large, won't save it
|
||||
name = str(makeAcceptableFilename(adapter.getStoryName()))
|
||||
if format == 'epub':
|
||||
self.response.headers['Content-Type'] = 'application/epub+zip'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
|
||||
elif format == 'html':
|
||||
self.response.headers['Content-Type'] = 'application/zip'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
|
||||
elif format == 'text':
|
||||
self.response.headers['Content-Type'] = 'application/zip'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.txt.zip'
|
||||
|
||||
self.response.out.write(data)
|
||||
|
||||
def toPercentDecimal(match):
    "Return the percent-escape (a percent sign plus two lower-case hex digits) for the character captured in group 1"
    code = ord(match.group(1))
    return "%" + ("%02x" % code)
|
||||
|
||||
def urlEscape(data):
    "Escape text, including unicode, for use in URLs"
    # Only unicode text needs encoding. A byte string is used as it is:
    # calling .encode() on one makes Python 2 decode it as ASCII first,
    # which raises as soon as it contains a non-ASCII byte (for example in
    # a traceback that quotes a story title).
    if not isinstance(data, str):
        data = data.encode("utf-8")
    p = re.compile(r'([^\w])')
    return p.sub(toPercentDecimal, data)
|
||||
|
||||
def main():
    """Build the WSGI application from the URL-to-handler table and run it."""
    routes = [
        ('/', MainHandler),
        ('/fdown', FanfictionDownloader),
        ('/file', FileServer),
        ('/recent', RecentFilesServer),
        ('/r2d2', RecentAllFilesServer),
        ('/login', LoginRequired),
    ]
    util.run_wsgi_app(webapp.WSGIApplication(routes, debug=False))
|
||||
|
||||
|
||||
# Script entry point: turn on debug-level logging, then start the application.
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)
    main()
|
||||
5
queue.yaml
Normal file
5
queue.yaml
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
queue:
|
||||
- name: default
|
||||
rate: 1/s
|
||||
- name: download
|
||||
rate: 10/s
|
||||
69
recent.html
Normal file
69
recent.html
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>Fanfiction Downloader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* 468x60, created 6/9/10 */
|
||||
google_ad_slot = "8817097473";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
|
||||
|
||||
|
||||
<!-- <div id='yourfile'> -->
|
||||
{{yourfile}}
|
||||
<!-- </div> -->
|
||||
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
Hi, {{ nickname }}! These are the fanfics you've downloaded previously.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='helpbox'>
|
||||
{% for fic in fics %}
|
||||
<p> <a href="/file?id={{ fic.key }}">{{ fic.name }}</a> by {{ fic.author }} ({{ fic.format }})<br/><small><a href="{{ fic.url }}">{{ fic.url }}</a></small></p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* 468x60, created 6/9/10 */
|
||||
google_ad_slot = "2009456648";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
|
||||
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var pageTracker = _gat._getTracker("UA-12136939-1");
|
||||
pageTracker._trackPageview();
|
||||
} catch(err) {}</script>
|
||||
</body>
|
||||
</html>
|
||||
318
simplejson/__init__.py
Normal file
318
simplejson/__init__.py
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
|
||||
JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
|
||||
interchange format.
|
||||
|
||||
:mod:`simplejson` exposes an API familiar to users of the standard library
|
||||
:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
|
||||
version of the :mod:`json` library contained in Python 2.6, but maintains
|
||||
compatibility with Python 2.4 and Python 2.5 and (currently) has
|
||||
significant performance advantages, even without using the optional C
|
||||
extension for speedups.
|
||||
|
||||
Encoding basic Python object hierarchies::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
|
||||
'["foo", {"bar": ["baz", null, 1.0, 2]}]'
|
||||
>>> print json.dumps("\"foo\bar")
|
||||
"\"foo\bar"
|
||||
>>> print json.dumps(u'\u1234')
|
||||
"\u1234"
|
||||
>>> print json.dumps('\\')
|
||||
"\\"
|
||||
>>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
|
||||
{"a": 0, "b": 0, "c": 0}
|
||||
>>> from StringIO import StringIO
|
||||
>>> io = StringIO()
|
||||
>>> json.dump(['streaming API'], io)
|
||||
>>> io.getvalue()
|
||||
'["streaming API"]'
|
||||
|
||||
Compact encoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
|
||||
'[1,2,3,{"4":5,"6":7}]'
|
||||
|
||||
Pretty printing::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4)
|
||||
>>> print '\n'.join([l.rstrip() for l in s.splitlines()])
|
||||
{
|
||||
"4": 5,
|
||||
"6": 7
|
||||
}
|
||||
|
||||
Decoding JSON::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
|
||||
>>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
|
||||
True
|
||||
>>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
|
||||
True
|
||||
>>> from StringIO import StringIO
|
||||
>>> io = StringIO('["streaming API"]')
|
||||
>>> json.load(io)[0] == 'streaming API'
|
||||
True
|
||||
|
||||
Specializing JSON object decoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> def as_complex(dct):
|
||||
... if '__complex__' in dct:
|
||||
... return complex(dct['real'], dct['imag'])
|
||||
... return dct
|
||||
...
|
||||
>>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
|
||||
... object_hook=as_complex)
|
||||
(1+2j)
|
||||
>>> import decimal
|
||||
>>> json.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')
|
||||
True
|
||||
|
||||
Specializing JSON object encoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> def encode_complex(obj):
|
||||
... if isinstance(obj, complex):
|
||||
... return [obj.real, obj.imag]
|
||||
... raise TypeError(repr(obj) + " is not JSON serializable")
|
||||
...
|
||||
>>> json.dumps(2 + 1j, default=encode_complex)
|
||||
'[2.0, 1.0]'
|
||||
>>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
|
||||
'[2.0, 1.0]'
|
||||
>>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
|
||||
'[2.0, 1.0]'
|
||||
|
||||
|
||||
Using simplejson.tool from the shell to validate and pretty-print::
|
||||
|
||||
$ echo '{"json":"obj"}' | python -m simplejson.tool
|
||||
{
|
||||
"json": "obj"
|
||||
}
|
||||
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
|
||||
Expecting property name: line 1 column 2 (char 2)
|
||||
"""
|
||||
__version__ = '2.0.9'
|
||||
__all__ = [
|
||||
'dump', 'dumps', 'load', 'loads',
|
||||
'JSONDecoder', 'JSONEncoder',
|
||||
]
|
||||
|
||||
__author__ = 'Bob Ippolito <bob@redivi.com>'
|
||||
|
||||
from decoder import JSONDecoder
|
||||
from encoder import JSONEncoder
|
||||
|
||||
_default_encoder = JSONEncoder(
|
||||
skipkeys=False,
|
||||
ensure_ascii=True,
|
||||
check_circular=True,
|
||||
allow_nan=True,
|
||||
indent=None,
|
||||
separators=None,
|
||||
encoding='utf-8',
|
||||
default=None,
|
||||
)
|
||||
|
||||
def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
        allow_nan=True, cls=None, indent=None, separators=None,
        encoding='utf-8', default=None, **kw):
    """Write ``obj`` to ``fp`` as a stream of JSON text.

    ``fp`` only has to provide a ``.write()`` method; the document is
    handed to it one chunk at a time.

    ``skipkeys``: when true, ``dict`` keys that are not basic types
    (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``,
    ``None``) are skipped instead of raising a ``TypeError``.

    ``ensure_ascii``: when false, some chunks written to ``fp`` may be
    ``unicode`` instances, subject to normal Python ``str`` to
    ``unicode`` coercion rules.  Unless ``fp.write()`` explicitly
    understands ``unicode`` (as in ``codecs.getwriter()``) this is likely
    to cause an error.

    ``check_circular``: when false, the circular reference check for
    container types is skipped and a circular reference will result in
    an ``OverflowError`` (or worse).

    ``allow_nan``: when false, serializing out of range ``float`` values
    (``nan``, ``inf``, ``-inf``) is a ``ValueError``, in strict
    compliance with the JSON specification, instead of using the
    JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).

    ``indent``: a non-negative integer pretty-prints array elements and
    object members with that indent level; 0 only inserts newlines and
    ``None`` gives the most compact representation.

    ``separators``: an ``(item_separator, dict_separator)`` tuple used
    instead of the default ``(', ', ': ')``; ``(',', ':')`` is the most
    compact JSON representation.

    ``encoding``: the character encoding for ``str`` instances, UTF-8 by
    default.

    ``default``: a function ``default(obj)`` that returns a serializable
    version of ``obj`` or raises ``TypeError``; the built-in behaviour
    simply raises ``TypeError``.

    ``cls``: a custom ``JSONEncoder`` subclass (e.g. one that overrides
    ``.default()`` to serialize additional types).

    """
    # When every option still has its default value the shared module
    # level encoder is reused instead of building a new one.
    only_defaults = (
        not skipkeys and ensure_ascii and
        check_circular and allow_nan and
        cls is None and indent is None and separators is None and
        encoding == 'utf-8' and default is None and not kw)
    if only_defaults:
        encoder = _default_encoder
    else:
        encoder_class = cls
        if encoder_class is None:
            encoder_class = JSONEncoder
        encoder = encoder_class(
            skipkeys=skipkeys, ensure_ascii=ensure_ascii,
            check_circular=check_circular, allow_nan=allow_nan,
            indent=indent, separators=separators, encoding=encoding,
            default=default, **kw)
    # Writing chunk by chunk (rather than via writelines) keeps failures
    # easy to trace back to the chunk that caused them.
    for piece in encoder.iterencode(obj):
        fp.write(piece)
|
||||
|
||||
|
||||
def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
        allow_nan=True, cls=None, indent=None, separators=None,
        encoding='utf-8', default=None, **kw):
    """Serialize ``obj`` to a JSON formatted ``str``.

    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
    will be skipped instead of raising a ``TypeError``.

    If ``ensure_ascii`` is false, then the return value will be a
    ``unicode`` instance subject to normal Python ``str`` to ``unicode``
    coercion rules instead of being escaped to an ASCII ``str``.

    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``OverflowError`` (or worse).

    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
    strict compliance of the JSON specification, instead of using the
    JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).

    If ``indent`` is a non-negative integer, then JSON array elements and
    object members will be pretty-printed with that indent level. An indent
    level of 0 will only insert newlines. ``None`` is the most compact
    representation.

    If ``separators`` is an ``(item_separator, dict_separator)`` tuple
    then it will be used instead of the default ``(', ', ': ')`` separators.
    ``(',', ':')`` is the most compact JSON representation.

    ``encoding`` is the character encoding for str instances, default is UTF-8.

    ``default(obj)`` is a function that should return a serializable version
    of obj or raise TypeError. The default simply raises TypeError.

    To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
    ``.default()`` method to serialize additional types), specify it with
    the ``cls`` kwarg.

    Any extra keyword arguments are passed on to the encoder constructor.

    """
    # Fast path: with every option at its default the shared module-level
    # encoder is reused instead of constructing a new one per call.
    if (not skipkeys and ensure_ascii and
        check_circular and allow_nan and
        cls is None and indent is None and separators is None and
        encoding == 'utf-8' and default is None and not kw):
        return _default_encoder.encode(obj)
    if cls is None:
        cls = JSONEncoder
    return cls(
        skipkeys=skipkeys, ensure_ascii=ensure_ascii,
        check_circular=check_circular, allow_nan=allow_nan, indent=indent,
        separators=separators, encoding=encoding, default=default,
        **kw).encode(obj)
|
||||
|
||||
|
||||
_default_decoder = JSONDecoder(encoding=None, object_hook=None)
|
||||
|
||||
|
||||
def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
        parse_int=None, parse_constant=None, **kw):
    """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    a JSON document) to a Python object.

    The whole of ``fp`` is read with a single ``fp.read()`` call and the
    result, together with every option, is passed to ``loads()``.

    If the contents of ``fp`` is encoded with an ASCII based encoding other
    than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must
    be specified. Encodings that are not ASCII based (such as UCS-2) are
    not allowed, and should be wrapped with
    ``codecs.getreader(encoding)(fp)``, or simply decoded to a ``unicode``
    object and passed to ``loads()``

    ``object_hook`` is an optional function that will be called with the
    result of any object literal decode (a ``dict``). The return value of
    ``object_hook`` will be used instead of the ``dict``. This feature
    can be used to implement custom decoders (e.g. JSON-RPC class hinting).

    ``parse_float``, ``parse_int`` and ``parse_constant`` are forwarded
    unchanged to ``loads()``.

    To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
    kwarg.

    """
    return loads(fp.read(),
        encoding=encoding, cls=cls, object_hook=object_hook,
        parse_float=parse_float, parse_int=parse_int,
        parse_constant=parse_constant, **kw)
|
||||
|
||||
|
||||
def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
        parse_int=None, parse_constant=None, **kw):
    """Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON
    document) to a Python object.

    If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding
    other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name
    must be specified. Encodings that are not ASCII based (such as UCS-2)
    are not allowed and should be decoded to ``unicode`` first.

    ``object_hook`` is an optional function that will be called with the
    result of any object literal decode (a ``dict``). The return value of
    ``object_hook`` will be used instead of the ``dict``. This feature
    can be used to implement custom decoders (e.g. JSON-RPC class hinting).

    ``parse_float``, if specified, will be called with the string
    of every JSON float to be decoded. By default this is equivalent to
    float(num_str). This can be used to use another datatype or parser
    for JSON floats (e.g. decimal.Decimal).

    ``parse_int``, if specified, will be called with the string
    of every JSON int to be decoded. By default this is equivalent to
    int(num_str). This can be used to use another datatype or parser
    for JSON integers (e.g. float).

    ``parse_constant``, if specified, will be called with one of the
    following strings: -Infinity, Infinity, NaN.
    This can be used to raise an exception if invalid JSON numbers
    are encountered.

    To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
    kwarg.

    """
    # Fast path: with no options given, reuse the shared module-level
    # decoder instead of constructing a new one per call.
    if (cls is None and encoding is None and object_hook is None and
            parse_int is None and parse_float is None and
            parse_constant is None and not kw):
        return _default_decoder.decode(s)
    if cls is None:
        cls = JSONDecoder
    # Only forward the hooks that were actually supplied, so a custom
    # ``cls`` is not handed keyword arguments the caller never set.
    if object_hook is not None:
        kw['object_hook'] = object_hook
    if parse_float is not None:
        kw['parse_float'] = parse_float
    if parse_int is not None:
        kw['parse_int'] = parse_int
    if parse_constant is not None:
        kw['parse_constant'] = parse_constant
    return cls(encoding=encoding, **kw).decode(s)
|
||||
BIN
simplejson/__init__.pyc
Normal file
BIN
simplejson/__init__.pyc
Normal file
Binary file not shown.
2329
simplejson/_speedups.c
Normal file
2329
simplejson/_speedups.c
Normal file
File diff suppressed because it is too large
Load diff
354
simplejson/decoder.py
Normal file
354
simplejson/decoder.py
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
"""Implementation of JSONDecoder
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import struct
|
||||
|
||||
from simplejson.scanner import make_scanner
|
||||
try:
|
||||
from simplejson._speedups import scanstring as c_scanstring
|
||||
except ImportError:
|
||||
c_scanstring = None
|
||||
|
||||
__all__ = ['JSONDecoder']
|
||||
|
||||
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
|
||||
|
||||
def _floatconstants():
|
||||
_BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
|
||||
if sys.byteorder != 'big':
|
||||
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
|
||||
nan, inf = struct.unpack('dd', _BYTES)
|
||||
return nan, inf, -inf
|
||||
|
||||
NaN, PosInf, NegInf = _floatconstants()
|
||||
|
||||
|
||||
def linecol(doc, pos):
    """Translate character offset ``pos`` in ``doc`` to ``(lineno, colno)``.

    ``lineno`` is 1-based.  ``colno`` equals ``pos`` itself on the first
    line (so it starts at 0 there); on later lines it is the distance
    from the preceding newline (so it starts at 1).
    """
    newlines_before = doc.count('\n', 0, pos)
    if newlines_before == 0:
        return 1, pos
    last_newline = doc.rindex('\n', 0, pos)
    return newlines_before + 1, pos - last_newline
|
||||
|
||||
|
||||
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location of ``pos`` inside ``doc``.

    With only ``pos`` the result names a single line/column/char; when
    ``end`` is given it names the range from ``pos`` to ``end``.
    Line and column numbers come from ``linecol``.
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    # %-formatting is used instead of str.format, which the original
    # kept only as a commented-out alternative.
    return '%s: line %d column %d (char %d)' % (
        msg, start_line, start_col, pos)
|
||||
|
||||
|
||||
_CONSTANTS = {
|
||||
'-Infinity': NegInf,
|
||||
'Infinity': PosInf,
|
||||
'NaN': NaN,
|
||||
}
|
||||
|
||||
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
|
||||
BACKSLASH = {
|
||||
'"': u'"', '\\': u'\\', '/': u'/',
|
||||
'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
|
||||
}
|
||||
|
||||
DEFAULT_ENCODING = "utf-8"
|
||||
|
||||
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode ``str`` content to ``unicode`` and
    falls back to ``DEFAULT_ENCODING`` when None.  ``_b`` and ``_m`` are
    the escape lookup table and chunk matcher, bound as defaults.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote.

    Raises ValueError (with line/column information) for an unterminated
    string, a literal control character in strict mode, an unknown
    backslash escape, a malformed four-digit unicode escape, or, on
    UCS-4 builds, a high surrogate that is not followed by a low
    surrogate escape."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated string" errors
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence
            esc = s[end + 1:end + 5]
            next_end = end + 5
            msg = "Invalid \\uXXXX escape"
            if len(esc) != 4:
                raise ValueError(errmsg(msg, s, end))
            try:
                uni = int(esc, 16)
            except ValueError:
                # Report malformed digits with position information like
                # every other decoding error, not int()'s bare message.
                raise ValueError(errmsg(msg, s, end))
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise ValueError(errmsg(msg, s, end))
                esc2 = s[end + 7:end + 11]
                if len(esc2) != 4:
                    raise ValueError(errmsg(msg, s, end))
                try:
                    uni2 = int(esc2, 16)
                except ValueError:
                    raise ValueError(errmsg(msg, s, end))
                # The second escape must be a low surrogate; otherwise the
                # arithmetic below would silently produce an unrelated
                # code point.
                if not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
|
||||
|
||||
|
||||
# Use speedup if available
|
||||
scanstring = c_scanstring or py_scanstring
|
||||
|
||||
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
|
||||
WHITESPACE_STR = ' \t\n\r'
|
||||
|
||||
def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object out of ``s`` starting at ``end``.

    ``(s, end)`` arrives as a single tuple argument.  ``end`` is
    presumably the index just past the opening brace -- the scanner that
    calls this is not shown here; TODO confirm.  Keys are read with
    ``scanstring`` (using ``encoding`` and ``strict``) and values with
    ``scan_once``.

    Returns ``(pairs, end)`` where ``pairs`` is the resulting ``dict``
    (or whatever ``object_hook`` returns for it when the hook is not
    None) and ``end`` is the index after the closing brace.

    Raises ValueError when a property name, a ``:`` delimiter, a value or
    a ``,`` delimiter is missing.
    """
    pairs = {}
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    # Step over the opening quote of the first key
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        # Step over the ':'
        end += 1

        # Skip whitespace before the value: one character is handled
        # inline, longer runs go through the regex.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            # Ran off the end of the input; scan_once below reports it
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs[key] = value

        # Find the character after the value, skipping whitespace
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        # end now points one past nextchar, hence ``end - 1`` in the error
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # After a comma, skip whitespace up to the next key's opening quote
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        # Step over the opening quote (or the offending character)
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
|
||||
|
||||
def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
values = []
|
||||
nextchar = s[end:end + 1]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end:end + 1]
|
||||
# Look-ahead for trivial empty array
|
||||
if nextchar == ']':
|
||||
return values, end + 1
|
||||
_append = values.append
|
||||
while True:
|
||||
try:
|
||||
value, end = scan_once(s, end)
|
||||
except StopIteration:
|
||||
raise ValueError(errmsg("Expecting object", s, end))
|
||||
_append(value)
|
||||
nextchar = s[end:end + 1]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end:end + 1]
|
||||
end += 1
|
||||
if nextchar == ']':
|
||||
break
|
||||
elif nextchar != ',':
|
||||
raise ValueError(errmsg("Expecting , delimiter", s, end))
|
||||
|
||||
try:
|
||||
if s[end] in _ws:
|
||||
end += 1
|
||||
if s[end] in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
return values, end
|
||||
|
||||
class JSONDecoder(object):
    """Decoder that turns JSON <http://json.org> text into Python objects.

    Default translations:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and mapped
    to the corresponding ``float`` values, although they are outside the
    JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default); it has no effect on
        ``unicode`` input.  Only encodings that are a superset of ASCII
        currently work; strings in other encodings should be passed in
        as ``unicode``.

        ``object_hook``, if given, is called with the result of every
        JSON object decoded and its return value replaces the ``dict``
        (e.g. to support JSON-RPC class hinting).

        ``parse_float``, if given, is called with the string of every
        JSON float; the default is ``float``.  It can select another
        datatype or parser (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int; the default is ``int``.  It can select another datatype or
        parser (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        ``strict`` is stored on the instance and is the flag handed to
        the string scanner.

        """
        # Fall back to the built-in parsers for any hook left unset
        if not parse_float:
            parse_float = float
        if not parse_int:
            parse_int = int
        if not parse_constant:
            parse_constant = _CONSTANTS.__getitem__
        self.encoding = encoding
        self.object_hook = object_hook
        self.parse_float = parse_float
        self.parse_int = parse_int
        self.parse_constant = parse_constant
        self.strict = strict
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last: the scanner is created from this fully set up instance
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` or ``unicode`` JSON document) to a
        Python object.

        Whitespace around the document is ignored; anything else left
        after it raises ValueError("Extra data ...").

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that begins at ``s[idx]``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended, so ``s`` may carry extraneous
        data after the document.  Raises ValueError when nothing can be
        decoded.

        """
        try:
            result, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        return result, stop
|
||||
BIN
simplejson/decoder.pyc
Normal file
BIN
simplejson/decoder.pyc
Normal file
Binary file not shown.
440
simplejson/encoder.py
Normal file
440
simplejson/encoder.py
Normal file
|
|
@ -0,0 +1,440 @@
|
|||
"""Implementation of JSONEncoder
|
||||
"""
|
||||
import re
|
||||
|
||||
try:
|
||||
from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii
|
||||
except ImportError:
|
||||
c_encode_basestring_ascii = None
|
||||
try:
|
||||
from simplejson._speedups import make_encoder as c_make_encoder
|
||||
except ImportError:
|
||||
c_make_encoder = None
|
||||
|
||||
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
|
||||
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
|
||||
HAS_UTF8 = re.compile(r'[\x80-\xff]')
|
||||
ESCAPE_DCT = {
|
||||
'\\': '\\\\',
|
||||
'"': '\\"',
|
||||
'\b': '\\b',
|
||||
'\f': '\\f',
|
||||
'\n': '\\n',
|
||||
'\r': '\\r',
|
||||
'\t': '\\t',
|
||||
}
|
||||
for i in range(0x20):
|
||||
#ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
|
||||
ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
|
||||
|
||||
# Assume this produces an infinity on all machines (probably not guaranteed)
|
||||
INFINITY = float('1e66666')
|
||||
FLOAT_REPR = repr
|
||||
|
||||
def encode_basestring(s):
    """Return ``s`` as a double-quoted JSON string.

    Every character matched by ``ESCAPE`` is replaced with its entry in
    ``ESCAPE_DCT``.

    """
    escaped = ESCAPE.sub(lambda found: ESCAPE_DCT[found.group(0)], s)
    return '"' + escaped + '"'
|
||||
|
||||
|
||||
def py_encode_basestring_ascii(s):
    """Return ``s`` as a double-quoted, ASCII-only JSON string.

    A byte ``str`` containing bytes in the 0x80-0xff range is decoded as
    UTF-8 first.  Every character matched by ``ESCAPE_ASCII`` is then
    replaced by its ``ESCAPE_DCT`` entry or, failing that, by a
    four-hex-digit unicode escape; code points of 0x10000 and above are
    written as a surrogate pair of two such escapes.

    """
    if isinstance(s, str) and HAS_UTF8.search(s) is not None:
        s = s.decode('utf-8')

    def escape_char(found):
        ch = found.group(0)
        if ch in ESCAPE_DCT:
            return ESCAPE_DCT[ch]
        code = ord(ch)
        if code < 0x10000:
            return '\\u%04x' % (code,)
        # Beyond the BMP: split into a high/low surrogate pair
        offset = code - 0x10000
        high = 0xd800 | ((offset >> 10) & 0x3ff)
        low = 0xdc00 | (offset & 0x3ff)
        return '\\u%04x\\u%04x' % (high, low)

    ascii_body = str(ESCAPE_ASCII.sub(escape_char, s))
    return '"' + ascii_body + '"'
|
||||
|
||||
|
||||
encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii
|
||||
|
||||
class JSONEncoder(object):
    """Extensible JSON <http://json.org> encoder for Python data structures.

    Supports the following objects and types by default:

    +-------------------+---------------+
    | Python            | JSON          |
    +===================+===============+
    | dict              | object        |
    +-------------------+---------------+
    | list, tuple       | array         |
    +-------------------+---------------+
    | str, unicode      | string        |
    +-------------------+---------------+
    | int, long, float  | number        |
    +-------------------+---------------+
    | True              | true          |
    +-------------------+---------------+
    | False             | false         |
    +-------------------+---------------+
    | None              | null          |
    +-------------------+---------------+

    To extend this to recognize other objects, subclass and implement a
    ``.default()`` method with another method that returns a serializable
    object for ``o`` if possible, otherwise it should call the superclass
    implementation (to raise ``TypeError``).

    """
    # Class-level defaults; __init__ overrides them per instance when a
    # ``separators`` tuple is supplied.
    item_separator = ', '
    key_separator = ': '

    def __init__(self, skipkeys=False, ensure_ascii=True,
            check_circular=True, allow_nan=True, sort_keys=False,
            indent=None, separators=None, encoding='utf-8', default=None):
        """Constructor for JSONEncoder, with sensible defaults.

        If skipkeys is false, then it is a TypeError to attempt
        encoding of keys that are not str, int, long, float or None.  If
        skipkeys is True, such items are simply skipped.

        If ensure_ascii is true, the output is guaranteed to be str
        objects with all incoming unicode characters escaped.  If
        ensure_ascii is false, the output will be unicode object.

        If check_circular is true, then lists, dicts, and custom encoded
        objects will be checked for circular references during encoding to
        prevent an infinite recursion (which would cause an OverflowError).
        Otherwise, no such check takes place.

        If allow_nan is true, then NaN, Infinity, and -Infinity will be
        encoded as such.  This behavior is not JSON specification compliant,
        but is consistent with most JavaScript based encoders and decoders.
        Otherwise, it will be a ValueError to encode such floats.

        If sort_keys is true, then the output of dictionaries will be
        sorted by key; this is useful for regression tests to ensure
        that JSON serializations can be compared on a day-to-day basis.

        If indent is a non-negative integer, then JSON array
        elements and object members will be pretty-printed with that
        indent level.  An indent level of 0 will only insert newlines.
        None is the most compact representation.

        If specified, separators should be a (item_separator, key_separator)
        tuple.  The default is (', ', ': ').  To get the most compact JSON
        representation you should specify (',', ':') to eliminate whitespace.

        If specified, default is a function that gets called for objects
        that can't otherwise be serialized.  It should return a JSON encodable
        version of the object or raise a ``TypeError``.

        If encoding is not None, then all input strings will be
        transformed into unicode using that encoding prior to JSON-encoding.
        The default is UTF-8.  With encoding=None no explicit decoding step
        is applied by this class before the string encoder is called.

        """

        self.skipkeys = skipkeys
        self.ensure_ascii = ensure_ascii
        self.check_circular = check_circular
        self.allow_nan = allow_nan
        self.sort_keys = sort_keys
        self.indent = indent
        if separators is not None:
            self.item_separator, self.key_separator = separators
        if default is not None:
            # Shadow the ``default`` method with the caller's hook.
            self.default = default
        self.encoding = encoding

    def default(self, o):
        """Implement this method in a subclass such that it returns
        a serializable object for ``o``, or calls the base implementation
        (to raise a ``TypeError``).

        For example, to support arbitrary iterators, you could
        implement default like this::

            def default(self, o):
                try:
                    iterable = iter(o)
                except TypeError:
                    pass
                else:
                    return list(iterable)
                return JSONEncoder.default(self, o)

        """
        raise TypeError(repr(o) + " is not JSON serializable")

    def encode(self, o):
        """Return a JSON string representation of a Python data structure.

        >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
        '{"foo": ["bar", "baz"]}'

        """
        # This is for extremely simple cases and benchmarks.
        if isinstance(o, basestring):
            if isinstance(o, str):
                _encoding = self.encoding
                if (_encoding is not None
                        and not (_encoding == 'utf-8')):
                    o = o.decode(_encoding)
            if self.ensure_ascii:
                return encode_basestring_ascii(o)
            else:
                return encode_basestring(o)
        # This doesn't pass the iterator directly to ''.join() because the
        # exceptions aren't as detailed.  The list call should be roughly
        # equivalent to the PySequence_Fast that ''.join() would do.
        chunks = self.iterencode(o, _one_shot=True)
        if not isinstance(chunks, (list, tuple)):
            chunks = list(chunks)
        return ''.join(chunks)

    def iterencode(self, o, _one_shot=False):
        """Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in JSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)

        """
        if self.check_circular:
            markers = {}
        else:
            markers = None
        if self.ensure_ascii:
            _encoder = encode_basestring_ascii
        else:
            _encoder = encode_basestring
        # Only add the decoding wrapper for a real, non-UTF-8 encoding.
        # encoding=None must skip it (as encode() already does); otherwise
        # every byte string would hit ``str.decode(None)`` and raise
        # TypeError.
        if self.encoding is not None and self.encoding != 'utf-8':
            def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
                if isinstance(o, str):
                    o = o.decode(_encoding)
                return _orig_encoder(o)

        def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
            # Check for specials.  Note that this type of test is processor- and/or
            # platform-specific, so do tests which don't depend on the internals.

            if o != o:
                text = 'NaN'
            elif o == _inf:
                text = 'Infinity'
            elif o == _neginf:
                text = '-Infinity'
            else:
                return _repr(o)

            if not allow_nan:
                raise ValueError(
                    "Out of range float values are not JSON compliant: " +
                    repr(o))

            return text

        # The C encoder is only used for one-shot encoding without
        # indentation or key sorting; everything else goes through the
        # pure-Python generator built by _make_iterencode.
        if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys:
            _iterencode = c_make_encoder(
                markers, self.default, _encoder, self.indent,
                self.key_separator, self.item_separator, self.sort_keys,
                self.skipkeys, self.allow_nan)
        else:
            _iterencode = _make_iterencode(
                markers, self.default, _encoder, self.indent, floatstr,
                self.key_separator, self.item_separator, self.sort_keys,
                self.skipkeys, _one_shot)
        return _iterencode(o, 0)
|
||||
|
||||
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
|
||||
## HACK: hand-optimized bytecode; turn globals into locals
|
||||
False=False,
|
||||
True=True,
|
||||
ValueError=ValueError,
|
||||
basestring=basestring,
|
||||
dict=dict,
|
||||
float=float,
|
||||
id=id,
|
||||
int=int,
|
||||
isinstance=isinstance,
|
||||
list=list,
|
||||
long=long,
|
||||
str=str,
|
||||
tuple=tuple,
|
||||
):
|
||||
|
||||
def _iterencode_list(lst, _current_indent_level):
|
||||
if not lst:
|
||||
yield '[]'
|
||||
return
|
||||
if markers is not None:
|
||||
markerid = id(lst)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = lst
|
||||
buf = '['
|
||||
if _indent is not None:
|
||||
_current_indent_level += 1
|
||||
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
|
||||
separator = _item_separator + newline_indent
|
||||
buf += newline_indent
|
||||
else:
|
||||
newline_indent = None
|
||||
separator = _item_separator
|
||||
first = True
|
||||
for value in lst:
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
buf = separator
|
||||
if isinstance(value, basestring):
|
||||
yield buf + _encoder(value)
|
||||
elif value is None:
|
||||
yield buf + 'null'
|
||||
elif value is True:
|
||||
yield buf + 'true'
|
||||
elif value is False:
|
||||
yield buf + 'false'
|
||||
elif isinstance(value, (int, long)):
|
||||
yield buf + str(value)
|
||||
elif isinstance(value, float):
|
||||
yield buf + _floatstr(value)
|
||||
else:
|
||||
yield buf
|
||||
if isinstance(value, (list, tuple)):
|
||||
chunks = _iterencode_list(value, _current_indent_level)
|
||||
elif isinstance(value, dict):
|
||||
chunks = _iterencode_dict(value, _current_indent_level)
|
||||
else:
|
||||
chunks = _iterencode(value, _current_indent_level)
|
||||
for chunk in chunks:
|
||||
yield chunk
|
||||
if newline_indent is not None:
|
||||
_current_indent_level -= 1
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
yield ']'
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
def _iterencode_dict(dct, _current_indent_level):
|
||||
if not dct:
|
||||
yield '{}'
|
||||
return
|
||||
if markers is not None:
|
||||
markerid = id(dct)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = dct
|
||||
yield '{'
|
||||
if _indent is not None:
|
||||
_current_indent_level += 1
|
||||
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
|
||||
item_separator = _item_separator + newline_indent
|
||||
yield newline_indent
|
||||
else:
|
||||
newline_indent = None
|
||||
item_separator = _item_separator
|
||||
first = True
|
||||
if _sort_keys:
|
||||
items = dct.items()
|
||||
items.sort(key=lambda kv: kv[0])
|
||||
else:
|
||||
items = dct.iteritems()
|
||||
for key, value in items:
|
||||
if isinstance(key, basestring):
|
||||
pass
|
||||
# JavaScript is weakly typed for these, so it makes sense to
|
||||
# also allow them. Many encoders seem to do something like this.
|
||||
elif isinstance(key, float):
|
||||
key = _floatstr(key)
|
||||
elif key is True:
|
||||
key = 'true'
|
||||
elif key is False:
|
||||
key = 'false'
|
||||
elif key is None:
|
||||
key = 'null'
|
||||
elif isinstance(key, (int, long)):
|
||||
key = str(key)
|
||||
elif _skipkeys:
|
||||
continue
|
||||
else:
|
||||
raise TypeError("key " + repr(key) + " is not a string")
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
yield item_separator
|
||||
yield _encoder(key)
|
||||
yield _key_separator
|
||||
if isinstance(value, basestring):
|
||||
yield _encoder(value)
|
||||
elif value is None:
|
||||
yield 'null'
|
||||
elif value is True:
|
||||
yield 'true'
|
||||
elif value is False:
|
||||
yield 'false'
|
||||
elif isinstance(value, (int, long)):
|
||||
yield str(value)
|
||||
elif isinstance(value, float):
|
||||
yield _floatstr(value)
|
||||
else:
|
||||
if isinstance(value, (list, tuple)):
|
||||
chunks = _iterencode_list(value, _current_indent_level)
|
||||
elif isinstance(value, dict):
|
||||
chunks = _iterencode_dict(value, _current_indent_level)
|
||||
else:
|
||||
chunks = _iterencode(value, _current_indent_level)
|
||||
for chunk in chunks:
|
||||
yield chunk
|
||||
if newline_indent is not None:
|
||||
_current_indent_level -= 1
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
yield '}'
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
def _iterencode(o, _current_indent_level):
|
||||
if isinstance(o, basestring):
|
||||
yield _encoder(o)
|
||||
elif o is None:
|
||||
yield 'null'
|
||||
elif o is True:
|
||||
yield 'true'
|
||||
elif o is False:
|
||||
yield 'false'
|
||||
elif isinstance(o, (int, long)):
|
||||
yield str(o)
|
||||
elif isinstance(o, float):
|
||||
yield _floatstr(o)
|
||||
elif isinstance(o, (list, tuple)):
|
||||
for chunk in _iterencode_list(o, _current_indent_level):
|
||||
yield chunk
|
||||
elif isinstance(o, dict):
|
||||
for chunk in _iterencode_dict(o, _current_indent_level):
|
||||
yield chunk
|
||||
else:
|
||||
if markers is not None:
|
||||
markerid = id(o)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = o
|
||||
o = _default(o)
|
||||
for chunk in _iterencode(o, _current_indent_level):
|
||||
yield chunk
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
return _iterencode
|
||||
BIN
simplejson/encoder.pyc
Normal file
BIN
simplejson/encoder.pyc
Normal file
Binary file not shown.
65
simplejson/scanner.py
Normal file
65
simplejson/scanner.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""JSON token scanner
|
||||
"""
|
||||
import re
|
||||
try:
|
||||
from simplejson._speedups import make_scanner as c_make_scanner
|
||||
except ImportError:
|
||||
c_make_scanner = None
|
||||
|
||||
__all__ = ['make_scanner']
|
||||
|
||||
# Matches a JSON number at the current position and captures its three
# parts separately so the scanner can tell integers from floats.
NUMBER_RE = re.compile(
    r'(-?(?:0|[1-9]\d*))'   # group 1: integer part, no leading zeroes
    r'(\.\d+)?'             # group 2: optional fraction
    r'([eE][-+]?\d+)?',     # group 3: optional exponent
    (re.VERBOSE | re.MULTILINE | re.DOTALL))
|
||||
|
||||
def py_make_scanner(context):
    """Return a ``_scan_once(string, idx)`` function bound to ``context``.

    The parse callbacks and options are read from ``context`` once, up
    front.  The returned function decodes the single JSON value that starts
    at ``string[idx]`` and returns ``(value, end_index)``; it raises
    ``StopIteration`` when ``idx`` is past the end of ``string`` or when no
    value starts there.
    """
    parse_object = context.parse_object
    parse_array = context.parse_array
    parse_string = context.parse_string
    match_number = NUMBER_RE.match
    encoding = context.encoding
    strict = context.strict
    parse_float = context.parse_float
    parse_int = context.parse_int
    parse_constant = context.parse_constant
    object_hook = context.object_hook

    def _scan_once(string, idx):
        try:
            nextchar = string[idx]
        except IndexError:
            raise StopIteration

        # Strings, containers and the three literal names.
        if nextchar == '"':
            return parse_string(string, idx + 1, encoding, strict)
        if nextchar == '{':
            return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook)
        if nextchar == '[':
            return parse_array((string, idx + 1), _scan_once)
        if nextchar == 'n' and string[idx:idx + 4] == 'null':
            return None, idx + 4
        if nextchar == 't' and string[idx:idx + 4] == 'true':
            return True, idx + 4
        if nextchar == 'f' and string[idx:idx + 5] == 'false':
            return False, idx + 5

        # Numbers: a fraction or exponent makes it a float.
        number = match_number(string, idx)
        if number is not None:
            integer, frac, exp = number.groups()
            if frac or exp:
                value = parse_float(integer + (frac or '') + (exp or ''))
            else:
                value = parse_int(integer)
            return value, number.end()

        # Non-standard constants, handed to parse_constant.
        if nextchar == 'N' and string[idx:idx + 3] == 'NaN':
            return parse_constant('NaN'), idx + 3
        if nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
            return parse_constant('Infinity'), idx + 8
        if nextchar == '-' and string[idx:idx + 9] == '-Infinity':
            return parse_constant('-Infinity'), idx + 9
        raise StopIteration

    return _scan_once
|
||||
|
||||
# c_make_scanner is None when simplejson._speedups could not be imported;
# in that case fall back to the pure-Python scanner factory.
make_scanner = c_make_scanner or py_make_scanner
|
||||
BIN
simplejson/scanner.pyc
Normal file
BIN
simplejson/scanner.pyc
Normal file
Binary file not shown.
23
simplejson/tests/__init__.py
Normal file
23
simplejson/tests/__init__.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import unittest
|
||||
import doctest
|
||||
|
||||
def additional_tests():
    """Return a TestSuite with the doctests of the simplejson modules plus
    the doctest file ``../../index.rst``."""
    import simplejson
    import simplejson.encoder
    import simplejson.decoder
    doctested_modules = (simplejson, simplejson.encoder, simplejson.decoder)
    suite = unittest.TestSuite()
    for module in doctested_modules:
        suite.addTest(doctest.DocTestSuite(module))
    suite.addTest(doctest.DocFileSuite('../../index.rst'))
    return suite
|
||||
|
||||
def main():
    """Run the suite built by additional_tests() with a text runner."""
    unittest.TextTestRunner().run(additional_tests())
|
||||
|
||||
if __name__ == '__main__':
    import os
    import sys
    # Prepend the directory three levels above this file to sys.path before
    # running, so that ``import simplejson`` is looked up there first.
    this_file = os.path.abspath(__file__)
    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file)))
    sys.path.insert(0, root_dir)
    main()
|
||||
30
simplejson/tests/test_check_circular.py
Normal file
30
simplejson/tests/test_check_circular.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
from unittest import TestCase
|
||||
import simplejson as json
|
||||
|
||||
def default_iterable(obj):
    """``default`` hook for the tests below: turn any iterable into a list."""
    as_list = list(obj)
    return as_list
|
||||
|
||||
class TestCheckCircular(TestCase):
    """Circular-reference detection and the ``default`` hook of dumps()."""

    def test_circular_dict(self):
        # A dict that contains itself must be rejected.
        cyclic = {}
        cyclic['a'] = cyclic
        self.assertRaises(ValueError, json.dumps, cyclic)

    def test_circular_list(self):
        # A list that contains itself must be rejected.
        cyclic = []
        cyclic.append(cyclic)
        self.assertRaises(ValueError, json.dumps, cyclic)

    def test_circular_composite(self):
        # dict -> list -> same dict.
        outer = {}
        outer['a'] = []
        outer['a'].append(outer)
        self.assertRaises(ValueError, json.dumps, outer)

    def test_circular_default(self):
        # Sets need a default hook; without one dumps() raises TypeError.
        json.dumps([set()], default=default_iterable)
        self.assertRaises(TypeError, json.dumps, [set()])

    def test_circular_off_default(self):
        # Same as above with circular checking switched off.
        json.dumps([set()], default=default_iterable, check_circular=False)
        self.assertRaises(TypeError, json.dumps, [set()], check_circular=False)
|
||||
22
simplejson/tests/test_decode.py
Normal file
22
simplejson/tests/test_decode.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import decimal
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDecode(TestCase):
    """Decoder hooks (parse_float / parse_int) and whitespace handling."""

    def test_decimal(self):
        parsed = json.loads('1.1', parse_float=decimal.Decimal)
        self.assertTrue(isinstance(parsed, decimal.Decimal))
        self.assertEqual(parsed, decimal.Decimal('1.1'))

    def test_float(self):
        parsed = json.loads('1', parse_int=float)
        self.assertTrue(isinstance(parsed, float))
        self.assertEqual(parsed, 1.0)

    def test_decoder_optimizations(self):
        # Several optimizations were made that skip over calls to
        # the whitespace regex, so this test is designed to try and
        # exercise the uncommon cases. The array cases are already covered.
        document = '{ "key" : "value" , "k":"v" }'
        parsed = json.loads(document)
        self.assertEqual(parsed, {"key":"value", "k":"v"})
|
||||
9
simplejson/tests/test_default.py
Normal file
9
simplejson/tests/test_default.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDefault(TestCase):
    """The ``default`` hook is applied to otherwise unserializable objects."""

    def test_default(self):
        # ``type`` is not serializable; with default=repr the output must be
        # the same as dumping repr(type) directly.
        via_hook = json.dumps(type, default=repr)
        direct = json.dumps(repr(type))
        self.assertEqual(via_hook, direct)
|
||||
21
simplejson/tests/test_dump.py
Normal file
21
simplejson/tests/test_dump.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from unittest import TestCase
|
||||
from cStringIO import StringIO
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDump(TestCase):
|
||||
def test_dump(self):
|
||||
sio = StringIO()
|
||||
json.dump({}, sio)
|
||||
self.assertEquals(sio.getvalue(), '{}')
|
||||
|
||||
def test_dumps(self):
|
||||
self.assertEquals(json.dumps({}), '{}')
|
||||
|
||||
def test_encode_truefalse(self):
|
||||
self.assertEquals(json.dumps(
|
||||
{True: False, False: True}, sort_keys=True),
|
||||
'{"false": true, "true": false}')
|
||||
self.assertEquals(json.dumps(
|
||||
{2: 3.0, 4.0: 5L, False: 1, 6L: True, "7": 0}, sort_keys=True),
|
||||
'{"false": 1, "2": 3.0, "4.0": 5, "6": true, "7": 0}')
|
||||
38
simplejson/tests/test_encode_basestring_ascii.py
Normal file
38
simplejson/tests/test_encode_basestring_ascii.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson.encoder
|
||||
|
||||
# (input string, expected encode_basestring_ascii output) pairs.  Both
# unicode inputs and UTF-8 encoded byte-string inputs are covered.
CASES = [
    (u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'),
    (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
    (u'controls', '"controls"'),
    (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
    (u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'),
    (u' s p a c e d ', '" s p a c e d "'),
    (u'\U0001d120', '"\\ud834\\udd20"'),
    (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
    ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
    (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
    ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
    (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
    (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
    (u"`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
    (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
    (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
]
|
||||
|
||||
class TestEncodeBaseStringAscii(TestCase):
    """Run every CASES pair through both encode_basestring_ascii variants."""

    def test_py_encode_basestring_ascii(self):
        py_encoder = simplejson.encoder.py_encode_basestring_ascii
        self._test_encode_basestring_ascii(py_encoder)

    def test_c_encode_basestring_ascii(self):
        # Nothing to check when the C implementation is not available.
        c_encoder = simplejson.encoder.c_encode_basestring_ascii
        if not c_encoder:
            return
        self._test_encode_basestring_ascii(c_encoder)

    def _test_encode_basestring_ascii(self, encode_basestring_ascii):
        fname = encode_basestring_ascii.__name__
        for input_string, expect in CASES:
            result = encode_basestring_ascii(input_string)
            message = '%r != %r for %s(%r)' % (result, expect, fname, input_string)
            self.assertEqual(result, expect, message)
|
||||
76
simplejson/tests/test_fail.py
Normal file
76
simplejson/tests/test_fail.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
# Fri Dec 30 18:57:26 2005
|
||||
# Documents from the json.org JSON_checker "fail" suite, in order
# (fail1.json first), plus one simplejson regression case at the end.
JSONDOCS = [
    # http://json.org/JSON_checker/test/fail1.json
    '"A JSON payload should be an object or array, not a string."',
    # http://json.org/JSON_checker/test/fail2.json
    '["Unclosed array"',
    # http://json.org/JSON_checker/test/fail3.json
    '{unquoted_key: "keys must be quoted}',
    # http://json.org/JSON_checker/test/fail4.json
    '["extra comma",]',
    # http://json.org/JSON_checker/test/fail5.json
    '["double extra comma",,]',
    # http://json.org/JSON_checker/test/fail6.json
    '[ , "<-- missing value"]',
    # http://json.org/JSON_checker/test/fail7.json
    '["Comma after the close"],',
    # http://json.org/JSON_checker/test/fail8.json
    '["Extra close"]]',
    # http://json.org/JSON_checker/test/fail9.json
    '{"Extra comma": true,}',
    # http://json.org/JSON_checker/test/fail10.json
    '{"Extra value after close": true} "misplaced quoted value"',
    # http://json.org/JSON_checker/test/fail11.json
    '{"Illegal expression": 1 + 2}',
    # http://json.org/JSON_checker/test/fail12.json
    '{"Illegal invocation": alert()}',
    # http://json.org/JSON_checker/test/fail13.json
    '{"Numbers cannot have leading zeroes": 013}',
    # http://json.org/JSON_checker/test/fail14.json
    '{"Numbers cannot be hex": 0x14}',
    # http://json.org/JSON_checker/test/fail15.json
    '["Illegal backslash escape: \\x15"]',
    # http://json.org/JSON_checker/test/fail16.json
    '["Illegal backslash escape: \\\'"]',
    # http://json.org/JSON_checker/test/fail17.json
    '["Illegal backslash escape: \\017"]',
    # http://json.org/JSON_checker/test/fail18.json
    '[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]',
    # http://json.org/JSON_checker/test/fail19.json
    '{"Missing colon" null}',
    # http://json.org/JSON_checker/test/fail20.json
    '{"Double colon":: null}',
    # http://json.org/JSON_checker/test/fail21.json
    '{"Comma instead of colon", null}',
    # http://json.org/JSON_checker/test/fail22.json
    '["Colon instead of comma": false]',
    # http://json.org/JSON_checker/test/fail23.json
    '["Bad value", truth]',
    # http://json.org/JSON_checker/test/fail24.json
    "['single quote']",
    # http://code.google.com/p/simplejson/issues/detail?id=3
    u'["A\u001FZ control characters in string"]',
]
|
||||
|
||||
# 1-based JSONDOCS positions that are expected to load successfully,
# mapped to the reason why.
SKIPS = {
    1: "why not have a string payload?",
    18: "spec doesn't specify any nesting limitations",
}
|
||||
|
||||
class TestFail(TestCase):
    """Every JSONDOCS entry must be rejected, except those listed in SKIPS."""

    def test_failures(self):
        for position, doc in enumerate(JSONDOCS):
            idx = position + 1  # fail<N>.json numbering is 1-based
            if idx in SKIPS:
                # Deliberately accepted: loading must not raise.
                json.loads(doc)
                continue
            try:
                json.loads(doc)
            except ValueError:
                continue
            self.fail("Expected failure for fail%d.json: %r" % (idx, doc))
|
||||
15
simplejson/tests/test_float.py
Normal file
15
simplejson/tests/test_float.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import math
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestFloat(TestCase):
|
||||
def test_floats(self):
|
||||
for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100, 3.1]:
|
||||
self.assertEquals(float(json.dumps(num)), num)
|
||||
self.assertEquals(json.loads(json.dumps(num)), num)
|
||||
|
||||
def test_ints(self):
|
||||
for num in [1, 1L, 1<<32, 1<<64]:
|
||||
self.assertEquals(json.dumps(num), str(num))
|
||||
self.assertEquals(int(json.dumps(num)), num)
|
||||
41
simplejson/tests/test_indent.py
Normal file
41
simplejson/tests/test_indent.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
import textwrap
|
||||
|
||||
class TestIndent(TestCase):
    """Pretty-printed output (indent=2) must match a fixed expected text."""

    def test_indent(self):
        h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
             {'nifty': 87}, {'field': 'yes', 'morefield': False} ]

        # The leading whitespace inside this literal is significant: after
        # dedent() each nesting level must be indented by two spaces to
        # match dumps(..., indent=2).
        expect = textwrap.dedent("""\
        [
          [
            "blorpie"
          ],
          [
            "whoops"
          ],
          [],
          "d-shtaeou",
          "d-nthiouh",
          "i-vhbjkhnth",
          {
            "nifty": 87
          },
          {
            "field": "yes",
            "morefield": false
          }
        ]""")

        d1 = json.dumps(h)
        d2 = json.dumps(h, indent=2, sort_keys=True, separators=(',', ': '))

        h1 = json.loads(d1)
        h2 = json.loads(d2)

        # Both encodings must round-trip, and the indented one must match
        # the expected layout exactly.
        self.assertEqual(h1, h)
        self.assertEqual(h2, h)
        self.assertEqual(d2, expect)
|
||||
76
simplejson/tests/test_pass1.py
Normal file
76
simplejson/tests/test_pass1.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
# from http://json.org/JSON_checker/test/pass1.json
|
||||
# NOTE: the "quotes" member must contain the literal entity text
# &#34; ... &#x22; (not bare quote characters), otherwise the document is
# not valid JSON.
JSON = r'''
[
    "JSON Test Pattern pass1",
    {"object with 1 member":["array with 1 element"]},
    {},
    [],
    -42,
    true,
    false,
    null,
    {
        "integer": 1234567890,
        "real": -9876.543210,
        "e": 0.123456789e-12,
        "E": 1.234567890E+34,
        "": 23456789012E666,
        "zero": 0,
        "one": 1,
        "space": " ",
        "quote": "\"",
        "backslash": "\\",
        "controls": "\b\f\n\r\t",
        "slash": "/ & \/",
        "alpha": "abcdefghijklmnopqrstuvwyz",
        "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ",
        "digit": "0123456789",
        "special": "`1~!@#$%^&*()_+-={':[,]}|;.</>?",
        "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A",
        "true": true,
        "false": false,
        "null": null,
        "array":[ ],
        "object":{ },
        "address": "50 St. James Street",
        "url": "http://www.JSON.org/",
        "comment": "// /* <!-- --",
        "# -- --> */": " ",
        " s p a c e d " :[1,2 , 3

,

4 , 5 , 6 ,7 ],
        "compact": [1,2,3,4,5,6,7],
        "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}",
        "quotes": "&#34; \u0022 %22 0x22 034 &#x22;",
        "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?"
: "A key can be any string"
    },
    0.5 ,98.6
,
99.44
,

1066


,"rosebud"]
'''
|
||||
|
||||
class TestPass1(TestCase):
    """Round-trip the pass1 document and check the out-of-range float."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))
        # With allow_nan=False the out-of-range value must be rejected.
        try:
            json.dumps(parsed, allow_nan=False)
        except ValueError:
            return
        self.fail("23456789012E666 should be out of range")
|
||||
14
simplejson/tests/test_pass2.py
Normal file
14
simplejson/tests/test_pass2.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
from unittest import TestCase
|
||||
import simplejson as json
|
||||
|
||||
# from http://json.org/JSON_checker/test/pass2.json
|
||||
# Deeply nested array taken from the JSON_checker "pass" suite.
JSON = r'''
[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]]
'''
|
||||
|
||||
class TestPass2(TestCase):
    """Round-trip the pass2 (nested array) document."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))
|
||||
20
simplejson/tests/test_pass3.py
Normal file
20
simplejson/tests/test_pass3.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
# from http://json.org/JSON_checker/test/pass3.json
|
||||
# Object-rooted document taken from the JSON_checker "pass" suite.
JSON = r'''
{
    "JSON Test Pattern pass3": {
        "The outermost value": "must be an object or array.",
        "In this test": "It is an object."
    }
}
'''
|
||||
|
||||
class TestPass3(TestCase):
    """Round-trip the pass3 (object-rooted) document."""

    def test_parse(self):
        # test in/out equivalence and parsing
        parsed = json.loads(JSON)
        encoded = json.dumps(parsed)
        self.assertEqual(parsed, json.loads(encoded))
|
||||
67
simplejson/tests/test_recursion.py
Normal file
67
simplejson/tests/test_recursion.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class JSONTestObject:
    """Marker class; the class object itself is what the tests encode."""
|
||||
|
||||
|
||||
class RecursiveJSONEncoder(json.JSONEncoder):
|
||||
recurse = False
|
||||
def default(self, o):
|
||||
if o is JSONTestObject:
|
||||
if self.recurse:
|
||||
return [JSONTestObject]
|
||||
else:
|
||||
return 'JSONTestObject'
|
||||
return json.JSONEncoder.default(o)
|
||||
|
||||
|
||||
class TestRecursion(TestCase):
    """Circular structures must raise ValueError; shared but non-circular
    references must still encode."""

    def test_listrecursion(self):
        # Direct self-reference.
        x = []
        x.append(x)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on list recursion")
        # Cycle through a second list.
        x = []
        y = [x]
        x.append(y)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on alternating list recursion")
        y = []
        x = [y, y]
        # ensure that the marker is cleared
        json.dumps(x)

    def test_dictrecursion(self):
        x = {}
        x["test"] = x
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on dict recursion")
        x = {}
        y = {"a": x, "b": x}
        # ensure that the marker is cleared: encode the dict that refers to
        # the same object twice (``y``), mirroring the list case above.
        json.dumps(y)

    def test_defaultrecursion(self):
        enc = RecursiveJSONEncoder()
        self.assertEqual(enc.encode(JSONTestObject), '"JSONTestObject"')
        # With recurse on, default() keeps returning a list containing the
        # same object, which must be detected as circular.
        enc.recurse = True
        try:
            enc.encode(JSONTestObject)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on default recursion")
|
||||
111
simplejson/tests/test_scanstring.py
Normal file
111
simplejson/tests/test_scanstring.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
import sys
|
||||
import decimal
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
import simplejson.decoder
|
||||
|
||||
class TestScanString(TestCase):
    """Run the same scanstring checks against each available implementation."""

    def test_py_scanstring(self):
        self._test_scanstring(simplejson.decoder.py_scanstring)

    def test_c_scanstring(self):
        # c_scanstring is falsy when the C implementation is unavailable;
        # in that case there is nothing to check.
        if simplejson.decoder.c_scanstring:
            self._test_scanstring(simplejson.decoder.c_scanstring)

    def _test_scanstring(self, scanstring):
        """Assert ``scanstring(doc, start, None, True)`` for a set of inputs.

        Each expected value is the ``(text, end)`` tuple returned by the
        scanner under test.
        """
        self.assertEqual(
            scanstring('"z\\ud834\\udd20x"', 1, None, True),
            (u'z\U0001d120x', 16))

        # The expected end index for a literal U+1D120 is one larger when
        # sys.maxunicode is 65535.
        if sys.maxunicode == 65535:
            astral_end = 6
        else:
            astral_end = 5
        self.assertEqual(
            scanstring(u'"z\U0001d120x"', 1, None, True),
            (u'z\U0001d120x', astral_end))

        # (document, start index, expected text, expected end index)
        cases = [
            ('"\\u007b"', 1, u'{', 8),
            ('"A JSON payload should be an object or array, not a string."',
             1,
             u'A JSON payload should be an object or array, not a string.',
             60),
            ('["Unclosed array"', 2, u'Unclosed array', 17),
            ('["extra comma",]', 2, u'extra comma', 14),
            ('["double extra comma",,]', 2, u'double extra comma', 21),
            ('["Comma after the close"],', 2, u'Comma after the close', 24),
            ('["Extra close"]]', 2, u'Extra close', 14),
            ('{"Extra comma": true,}', 2, u'Extra comma', 14),
            ('{"Extra value after close": true} "misplaced quoted value"',
             2, u'Extra value after close', 26),
            ('{"Illegal expression": 1 + 2}', 2, u'Illegal expression', 21),
            ('{"Illegal invocation": alert()}', 2, u'Illegal invocation', 21),
            ('{"Numbers cannot have leading zeroes": 013}',
             2, u'Numbers cannot have leading zeroes', 37),
            ('{"Numbers cannot be hex": 0x14}',
             2, u'Numbers cannot be hex', 24),
            ('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]',
             21, u'Too deep', 30),
            ('{"Missing colon" null}', 2, u'Missing colon', 16),
            ('{"Double colon":: null}', 2, u'Double colon', 15),
            ('{"Comma instead of colon", null}',
             2, u'Comma instead of colon', 25),
            ('["Colon instead of comma": false]',
             2, u'Colon instead of comma', 25),
            ('["Bad value", truth]', 2, u'Bad value', 12),
        ]
        for document, start, text, end in cases:
            self.assertEqual(
                scanstring(document, start, None, True),
                (text, end))

    def test_issue3623(self):
        # A non-string encoding argument is expected to raise ValueError.
        self.assertRaises(
            ValueError, json.decoder.scanstring, "xxx", 1, "xxx")
        # A byte string containing \xff is expected to raise
        # UnicodeDecodeError.
        self.assertRaises(
            UnicodeDecodeError,
            json.encoder.encode_basestring_ascii, "xx\xff")
|
||||
42
simplejson/tests/test_separators.py
Normal file
42
simplejson/tests/test_separators.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import textwrap
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
|
||||
class TestSeparators(TestCase):
    """Check encoding with non-default item and key separators."""

    def test_separators(self):
        payload = [
            ['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh',
            'i-vhbjkhnth', {'nifty': 87},
            {'field': 'yes', 'morefield': False},
        ]

        # Expected pretty-printed form: two-space indent, ' ,' between
        # items and ' : ' between keys and values.
        expect = textwrap.dedent("""\
        [
          [
            "blorpie"
          ] ,
          [
            "whoops"
          ] ,
          [] ,
          "d-shtaeou" ,
          "d-nthiouh" ,
          "i-vhbjkhnth" ,
          {
            "nifty" : 87
          } ,
          {
            "field" : "yes" ,
            "morefield" : false
          }
        ]""")

        compact = json.dumps(payload)
        pretty = json.dumps(
            payload, indent=2, sort_keys=True, separators=(' ,', ' : '))

        # Decode both encodings before asserting anything.
        from_compact = json.loads(compact)
        from_pretty = json.loads(pretty)

        self.assertEqual(from_compact, payload)
        self.assertEqual(from_pretty, payload)
        self.assertEqual(pretty, expect)
|
||||
64
simplejson/tests/test_unicode.py
Normal file
64
simplejson/tests/test_unicode.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestUnicode(TestCase):
    """Encoding and decoding of non-ASCII text."""

    def test_encoding1(self):
        # A unicode string and its UTF-8 bytes must encode identically.
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        raw = text.encode('utf-8')
        encoder = json.JSONEncoder(encoding='utf-8')
        from_text = encoder.encode(text)
        from_raw = encoder.encode(raw)
        self.assertEqual(from_text, from_raw)

    def test_encoding2(self):
        # Same check as test_encoding1, through the module-level dumps().
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        raw = text.encode('utf-8')
        from_text = json.dumps(text, encoding='utf-8')
        from_raw = json.dumps(raw, encoding='utf-8')
        self.assertEqual(from_text, from_raw)

    def test_encoding3(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        self.assertEqual(json.dumps(text), '"\\u03b1\\u03a9"')

    def test_encoding4(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        self.assertEqual(json.dumps([text]), '["\\u03b1\\u03a9"]')

    def test_encoding5(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps(text, ensure_ascii=False)
        self.assertEqual(encoded, u'"%s"' % (text,))

    def test_encoding6(self):
        text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        encoded = json.dumps([text], ensure_ascii=False)
        self.assertEqual(encoded, u'["%s"]' % (text,))

    def test_big_unicode_encode(self):
        astral = u'\U0001d120'
        self.assertEqual(json.dumps(astral), '"\\ud834\\udd20"')
        self.assertEqual(
            json.dumps(astral, ensure_ascii=False), u'"\U0001d120"')

    def test_big_unicode_decode(self):
        expected = u'z\U0001d120x'
        self.assertEqual(json.loads('"' + expected + '"'), expected)
        self.assertEqual(json.loads('"z\\ud834\\udd20x"'), expected)

    def test_unicode_decode(self):
        # Every \uXXXX escape in range(0, 0xd7ff) must decode to the
        # matching single character.
        for codepoint in range(0, 0xd7ff):
            expected = unichr(codepoint)
            escaped = '"\\u%04x"' % (codepoint,)
            self.assertEqual(json.loads(escaped), expected)

    def test_default_encoding(self):
        document = u'{"a": "\xe9"}'.encode('utf-8')
        self.assertEqual(json.loads(document), {'a': u'\xe9'})

    def test_unicode_preservation(self):
        # Unicode input must yield unicode strings, even for empty ones.
        empty = json.loads(u'""')
        self.assertEqual(type(empty), unicode)
        single = json.loads(u'"a"')
        self.assertEqual(type(single), unicode)
        first_item = json.loads(u'["a"]')[0]
        self.assertEqual(type(first_item), unicode)
|
||||
37
simplejson/tool.py
Normal file
37
simplejson/tool.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
r"""Command-line tool to validate and pretty-print JSON
|
||||
|
||||
Usage::
|
||||
|
||||
$ echo '{"json":"obj"}' | python -m simplejson.tool
|
||||
{
|
||||
"json": "obj"
|
||||
}
|
||||
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
|
||||
Expecting property name: line 1 column 2 (char 2)
|
||||
|
||||
"""
|
||||
import sys
|
||||
import simplejson
|
||||
|
||||
def main():
    """Validate JSON from a file or stdin and pretty-print it.

    Command line: ``[infile [outfile]]``.  With no arguments the document is
    read from stdin and written to stdout; ``infile`` replaces stdin and
    ``outfile`` replaces stdout.  Output is written with sorted keys, a
    four-space indent and a trailing newline.

    Raises:
        SystemExit: with a usage string when too many arguments are given,
            or with the ``ValueError`` raised by ``simplejson.load`` when
            the input cannot be decoded.
    """
    argc = len(sys.argv)
    if argc not in (1, 2, 3):
        raise SystemExit(sys.argv[0] + " [infile [outfile]]")

    infile = sys.stdin
    outfile = sys.stdout
    # Only files opened here are closed afterwards; the standard streams
    # belong to the interpreter and are left alone.
    opened = []
    try:
        if argc >= 2:
            infile = open(sys.argv[1], 'rb')
            opened.append(infile)
        if argc == 3:
            outfile = open(sys.argv[2], 'wb')
            opened.append(outfile)
        try:
            obj = simplejson.load(infile)
        except ValueError:
            # sys.exc_info() yields the active exception without relying on
            # either the "except X, e" or the "except X as e" syntax.
            raise SystemExit(sys.exc_info()[1])
        simplejson.dump(obj, outfile, sort_keys=True, indent=4)
        outfile.write('\n')
    finally:
        for handle in opened:
            handle.close()


if __name__ == '__main__':
    main()
|
||||
BIN
static/ajax-loader.gif
Normal file
BIN
static/ajax-loader.gif
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 11 KiB |
BIN
static/favicon.ico
Normal file
BIN
static/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 21 KiB |
53
utils/remover.py
Normal file
53
utils/remover.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
"""
|
||||
remover.py
|
||||
|
||||
Created by Roman on 2010-06-20.
|
||||
Copyright (c) 2010 __MyCompanyName__. All rights reserved.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
|
||||
from google.appengine.ext.webapp import util
|
||||
from google.appengine.ext import webapp
|
||||
from google.appengine.api import users
|
||||
|
||||
from ffstorage import *
|
||||
|
||||
class Remover(webapp.RequestHandler):
    """Request handler that deletes old ``DownloadedFanfic`` entities."""

    def get(self):
        """Delete up to 50 of the oldest entities past the two-day cutoff.

        The number of deleted entities is logged and written to the
        response body.
        """
        logging.debug("Starting r3m0v3r")
        user = users.get_current_user()
        logging.debug("Working as user %s" % user)
        theDate = datetime.date.today() - datetime.timedelta(days=2)
        logging.debug("Will delete stuff older than %s" % theDate)

        fics = DownloadedFanfic.all()
        # Apply the cutoff that is logged above; without this filter the
        # 50 oldest entities are deleted no matter how recent they are.
        # NOTE(review): assumes `date` holds the entity's creation time --
        # confirm against ffstorage.
        fics.filter("date <", theDate)
        fics.order("date")

        # Cap the batch size so a single request stays small.
        results = fics.fetch(50)

        logging.debug([x.name for x in results])

        num = 0
        for d in results:
            d.delete()
            num = num + 1
        logging.info('Deleted instances: %d' % num)
        self.response.out.write('Deleted instances: %d' % num)
|
||||
|
||||
|
||||
def main():
    """Serve the Remover handler at /r3m0v3r through run_wsgi_app."""
    routes = [('/r3m0v3r', Remover)]
    util.run_wsgi_app(webapp.WSGIApplication(routes, debug=False))


if __name__ == '__main__':
    # Raise root logger verbosity so the handler's debug messages are emitted.
    logging.getLogger().setLevel(logging.DEBUG)
    main()
|
||||
Loading…
Reference in a new issue