Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2025-12-06 08:52:55 +01:00

Commit 150316f460: "Commit first version of reorg/rewrite. Currently CLI only."

85 changed files with 20691 additions and 0 deletions

app.yaml (Normal file, 37 lines)
@@ -0,0 +1,37 @@
# fanfictionloader
application: fanfictionloader
version: 3-0-2
runtime: python
api_version: 1

handlers:
- url: /r3m0v3r
  script: utils/remover.py
  login: admin

- url: /r3m0v3r
  script: main.py
  login: admin

- url: /fdownloadtask
  script: main.py
  login: admin

- url: /css
  static_dir: css

- url: /js
  static_dir: js

- url: /static
  static_dir: static

- url: /favicon\.ico
  static_files: static/favicon.ico
  upload: static/favicon\.ico

- url: /.*
  script: main.py

builtins:
- datastore_admin: on

cron.yaml (Normal file, 4 lines)
@@ -0,0 +1,4 @@
cron:
- description: cleanup job
  url: /r3m0v3r
  schedule: every 2 hours

css/index.css (Normal file, 71 lines)
@@ -0,0 +1,71 @@
body
{
    font: 0.9em "Helvetica Neue", Arial, Helvetica, Geneva, sans-serif;
}

#main
{
    width: 43%;
    margin-left: 23%;
    background-color: #dae6ff;
    padding: 2em;
}

#greeting
{
    margin-bottom: 1em;
    border-color: #efefef;
}



#logpassword:hover, #logpasswordtable:hover, #urlbox:hover, #typebox:hover, #helpbox:hover, #yourfile:hover
{
    border: thin solid #fffeff;
}

h1
{
    text-decoration: none;
}

#logpasswordtable
{
    padding: 1em;
}

#logpassword, #logpasswordtable {
    // display: none;
}

#urlbox, #typebox, #logpasswordtable, #logpassword, #helpbox, #yourfile
{
    margin: 1em;
    padding: 1em;
    border: thin dotted #fffeff;
}

div.field
{
    margin-bottom: 0.5em;
}

#submitbtn
{
    padding: 1em;
}

#typelabel
{
}

#typeoptions
{
    margin-top: 0.5em;
}

#error
{
    font-size: small;
    color: #f00;
}

delete_fic.py (Normal file, 59 lines)
@@ -0,0 +1,59 @@
import os
import cgi
import sys
import logging
import traceback
import StringIO

from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util

from fanficdownloader.downaloder import *
from fanficdownloader.ffnet import *
from fanficdownloader.output import *

from google.appengine.ext import db

from fanficdownloader.zipdir import *

from ffstorage import *

def create_mac(user, fic_id, fic_url):
    return str(abs(hash(user)+hash(fic_id)))+str(abs(hash(fic_url)))

def check_mac(user, fic_id, fic_url, mac):
    return (create_mac(user, fic_id, fic_url) == mac)

def create_mac_for_fic(user, fic_id):
    key = db.Key(fic_id)
    fanfic = db.get(key)
    if fanfic.user != user:
        return None
    else:
        return create_mac(user, key, fanfic.url)

class DeleteFicHandler(webapp.RequestHandler):
    def get(self):
        user = users.get_current_user()
        if not user:
            self.redirect('/login')

        fic_id = self.request.get('fic_id')
        fic_mac = self.request.get('key_id')

        actual_mac = create_mac_for_fic(user, fic_id)
        if actual_mac != fic_mac:
            self.response.out.write("Ooops")
        else:
            key = db.Key(fic_id)
            fanfic = db.get(key)
            fanfic.delete()
            self.redirect('/recent')


        fics = db.GqlQuery("Select * From DownloadedFanfic WHERE user = :1", user)
        template_values = dict(fics = fics, nickname = user.nickname())
        path = os.path.join(os.path.dirname(__file__), 'recent.html')
        self.response.out.write(template.render(path, template_values))

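The create_mac()/check_mac() pair above is a lightweight token scheme: whatever page lists the stories has to embed both the datastore key and the matching token in each delete link, and DeleteFicHandler recomputes the token before deleting. A small sketch of the link-building side, assuming a DownloadedFanfic entity variable and a /delete route for the handler (both assumptions, not shown in this commit):

    # Hypothetical sketch: building a delete link that DeleteFicHandler would accept.
    # The '/delete' route name and the 'fanfic' entity are assumptions.
    import urllib

    def delete_link_for(user, fanfic):
        fic_id = str(fanfic.key())                           # same string the handler reads as 'fic_id'
        token = create_mac(user, fanfic.key(), fanfic.url)   # mirrors create_mac_for_fic() server-side
        return '/delete?' + urllib.urlencode({'fic_id': fic_id, 'key_id': token})
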
fanficdownloader/BeautifulSoup.py (Normal file, 2014 lines)
File diff suppressed because it is too large.

fanficdownloader/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

fanficdownloader/adapter.py (Normal file, 231 lines)
@@ -0,0 +1,231 @@
# -*- coding: utf-8 -*-

import logging
import datetime
from output import makeAcceptableFilename

try:
    from google.appengine.api.urlfetch import fetch as googlefetch
    appEngineGlob = True
except:
    appEngineGlob = False

class LoginRequiredException(Exception):
    def __init__(self, url):
        self.url = url

    def __str__(self):
        return repr(self.url + ' requires user to be logged in')

class StoryArchivedAlready(Exception):
    pass

class StoryDoesNotExist(Exception):
    pass

class FailedToDownload(Exception):
    pass

class InvalidStoryURL(Exception):
    pass

class FanfictionSiteAdapter:
    appEngine = appEngineGlob
    login = ''
    password = ''
    url = ''
    host = ''
    path = ''
    uuid = ''
    storyName = ''
    storyId = ''
    authorName = ''
    authorId = ''
    authorURL = ''
    outputStorySep = '-Ukn_'
    outputName = ''
    outputFileName = ''
    storyDescription = ''
    storyCharacters = []
    storySeries = ''
    storyPublished = datetime.date(1970, 01, 31)
    storyCreated = datetime.datetime.now()
    storyUpdated = datetime.date(1970, 01, 31)
    languageId = 'en-UK'
    language = 'English'
    subjects = []
    publisher = ''
    numChapters = '0'
    numWords = '0'
    genre = ''
    category = ''
    storyStatus = 'In-Progress'
    storyRating = ''
    storyUserRating = '0'

    def __init__(self, url):
        # basic plain url parsing...
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

    def hasAppEngine(self):
        return self.appEngine

    def fetchUrl(self, url):
        if not self.appEngine:
            return self.opener.open(url).read().decode('utf-8')
        else:
            return googlefetch(url,deadline=10).content

    def requiresLogin(self, url = None):
        return False

    def performLogin(self, url = None):
        return True

    def extractIndividualUrls(self):
        pass

    def getText(self, url):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def getHost(self):
        logging.debug('self.host=%s' % self.host)
        return self.host

    def getUUID(self):
        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getOutputName(self):
        self.outputName = makeAcceptableFilename(self.storyName.replace(" ", "_") + self.outputStorySep + self.storyId)
        logging.debug('self.outputName=%s' % self.outputName)
        return self.outputName

    def getOutputFileName(self, booksDirectory, bookExt):
        self.getOutputName() # make sure self.outputName is populated
        self.outputFileName = booksDirectory + "/" + self.outputName + bookExt
        logging.debug('self.outputFileName=%s' % self.outputFileName)
        return self.outputFileName

    def getAuthorURL(self):
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getAuthorId(self):
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getAuthorName(self):
        logging.debug('self.authorName=%s' % self.authorName)
        return self.authorName

    def getStoryURL(self):
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getStoryId(self):
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getStoryName(self):
        logging.debug('self.storyName=%s' % self.storyName)
        return self.storyName

    def getStoryDescription(self):
        ## with out stripping \n's, appengine treats additional lines from this debug
        ## output as error messages.
        #logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
        return self.storyDescription

    def getStoryCreated(self):
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def addCharacter(self, character):
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def getStoryCharacters(self):
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStoryPublished(self):
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryUpdated(self):
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getStorySeries(self):
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries

    def getLanguage(self):
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def addSubject(self, subject):
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def getSubjects(self):
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getCategory(self):
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        return url

fanficdownloader/adapters/__init__.py (Normal file, 81 lines)
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-

import os, sys, glob
from os.path import dirname, basename, normpath
import logging
import urlparse as up

## A few exceptions for different things for adapters

class FailedToDownload(Exception):
    def __init__(self,error):
        self.error=error

    def __str__(self):
        return self.error

class InvalidStoryURL(Exception):
    def __init__(self,url,domain,example):
        self.url=url
        self.domain=domain
        self.example=example

    def __str__(self):
        return "Bad Story URL: %s\nFor site: %s\nExample: %s" % (self.url, self.domain, self.example)

class FailedToLogin(Exception):
    def __init__(self,url,username):
        self.url=url
        self.username=username

    def __str__(self):
        return "Failed to Login for URL: %s with username: %s" % (self.url, self.username)

class StoryDoesNotExist(Exception):
    def __init__(self,url):
        self.url=url

    def __str__(self):
        return "Story Does Not Exit: " + self.url

class UnknownSite(Exception):
    def __init__(self,url,supported_sites_list):
        self.url=url
        self.supported_sites_list=supported_sites_list

    def __str__(self):
        return "Unknown Site("+self.url+"). Supported sites: "+", ".join(self.supported_sites_list)

## This bit of complexity allows adapters to be added by just adding
## the source file. It eliminates the long if/else clauses we used to
## need to pick out the adapter.

## List of registered site adapters.

__class_list = []

def _register_handler(cls):
    __class_list.append(cls)

def getAdapter(config,url):
    parsedUrl = up.urlparse(url)
    logging.debug("site:"+parsedUrl.netloc)
    for cls in __class_list:
        if cls.matchesSite(parsedUrl.netloc):
            adapter = cls(config,url) # raises InvalidStoryURL
            return adapter
    # No adapter found.
    raise UnknownSite( url, (cls.getSiteDomain() for cls in __class_list) )

## Automatically import each adapter_*.py file.
## Each must call _register_handler() with their class to be
## registered.

filelist = glob.glob(dirname(__file__)+'/adapter_*.py')
sys.path.insert(0,normpath(dirname(__file__)))

for file in filelist:
    #print "file: "+basename(file)[:-3]
    __import__(basename(file)[:-3])

del sys.path[0]

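This registration scheme means adding support for a new site is just a matter of dropping in a new adapters/adapter_*.py file: the loop above imports it, and the module registers its class. A minimal hypothetical adapter, only to illustrate the contract the loader expects (the domain and URLs are made up; this file is not part of the commit):

    # adapters/adapter_examplecom.py -- hypothetical file, for illustration only.
    # A real adapter would also override extractChapterUrlsAndMetadata() and
    # getChapterText(); see adapter_fanfictionnet.py above for a full example.
    from adapters import _register_handler
    from adapters.base_adapter import BaseSiteAdapter

    class ExampleComSiteAdapter(BaseSiteAdapter):
        def __init__(self, config, url):
            BaseSiteAdapter.__init__(self, config, url)
            self.story.setMetadata('siteabbrev','excom')

        @staticmethod
        def getSiteDomain():
            # getAdapter() matches this (via matchesSite) against urlparse(url).netloc
            return 'www.example.com'

        def getSiteExampleURLs(self):
            return "http://www.example.com/viewstory.php?sid=1234"

        def getSiteURLPattern(self):
            return r"http://www\.example\.com/viewstory\.php\?sid=\d+$"

    # imported automatically by adapters/__init__.py, which expects this call:
    _register_handler(ExampleComSiteAdapter)
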
fanficdownloader/adapters/adapter_fanfictionnet.py (Normal file, 183 lines)
@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-

import time
import datetime
import logging
import re
import urllib2

import BeautifulSoup as bs

import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup

class FanFictionNetSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','ffnet')

        # get storyId from url--url validation guarantees second part is storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        # normalized story URL.
        self._setURL("http://"+self.getSiteDomain()\
                         +"/s/"+self.story.getMetadata('storyId')+"/1/")

    @staticmethod
    def getSiteDomain():
        return 'www.fanfiction.net'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.fanfiction.net','m.fanfiction.net']

    def getSiteExampleURLs(self):
        return "http://www.fanfiction.net/s/1234/1/ http://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title"

    def getSiteURLPattern(self):
        return r"http://(www|m)?\.fanfiction\.net/s/\d+/\d+(/|/[a-zA-Z0-9_]+)?$"

    def extractChapterUrlsAndMetadata(self):

        # fetch the chapter. From that we will get almost all the
        # metadata and chapter list

        url = self.url
        logging.debug("URL: "+url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        try:
            soup = bs.BeautifulSoup(self._fetchUrl(url))
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise adapters.StoryDoesNotExist(self.url)
            else:
                raise e

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"^/u/\d+"))
        self.story.setMetadata('authorId',a['href'].split('/')[2])
        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
        self.story.setMetadata('author',a.string)


        # start by finding a script towards the bottom that has a
        # bunch of useful stuff in it.

        # var storyid = 6577076;
        # var chapter = 1;
        # var chapters = 17;
        # var words = 42787;
        # var userid = 2645830;
        # var title = 'The+Invitation';
        # var title_t = 'The Invitation';
        # var summary = 'Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter\'s wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story.';
        # var categoryid = 224;
        # var cat_title = 'Harry Potter';
        # var datep = '12-21-10';
        # var dateu = '04-06-11';
        # var author = 'U n F a b u l o u s M e';

        for script in soup.findAll('script', src=None):
            if 'var storyid' in script.string:
                for line in script.string.split('\n'):
                    m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line)
                    if m == None : continue
                    var,value = m.groups()
                    # remove javascript escaping from values.
                    value = re.sub(r'\\(.)',r'\1',value)
                    #print var,value
                    if 'words' in var:
                        self.story.setMetadata('numWords', value)
                    if 'title_t' in var:
                        self.story.setMetadata('title', value)
                    if 'summary' in var:
                        self.story.setMetadata('description', value)
                    if 'datep' in var:
                        self.story.setMetadata('datePublished',
                                               datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%m-%d-%y'))))
                    if 'dateu' in var:
                        self.story.setMetadata('dateUpdated',
                                               datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%m-%d-%y'))))
                    if 'cat_title' in var:
                        if "Crossover" in value:
                            value = re.sub(r' Crossover$','',value)
                            for c in value.split(' and '):
                                self.story.addToList('category',c)
                                # Screws up when the category itself
                                # contains ' and '. But that's rare
                                # and the only alternative is to find
                                # the 'Crossover' category URL and
                                # parse that page to search for <a>
                                # with href /crossovers/(name)/(num)/
                                # <a href="/crossovers/Harry_Potter/224/">Harry Potter</a>
                                # <a href="/crossovers/Naruto/1402/">Naruto</a>
                        else:
                            self.story.addToList('category',value)
                break # for script in soup.findAll('script', src=None):

        # Find the chapter selector
        select = soup.find('select', { 'name' : 'chapter' } )

        if select is None:
            # no selector found, so it's a one-chapter story.
            self.chapterUrls.append((self.story.getMetadata('title'),url))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = u'http://%s/s/%s/%s/' % ( self.getSiteDomain(),
                                                self.story.getMetadata('storyId'),
                                                o['value'])
                # just in case there's tags, like <i> in chapter titles.
                title = u"%s" % o
                title = re.sub(r'<[^>]+>','',title)
                self.chapterUrls.append((title,url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## Pull some additional data from html. Find Rating and look around it.

        a = soup.find('a', href='http://www.fictionratings.com/')
        self.story.setMetadata('rating',a.string)

        # after Rating, the same bit of text containing id:123456 contains
        # Complete--if completed.
        if 'Complete' in a.findNext(text=re.compile(r'id:\d+')):
            self.story.setMetadata('status', 'Completed')
        else:
            self.story.setMetadata('status', 'In-Progress')

        # Parse genre(s) from <meta name="description" content="..."
        # <meta name="description" content="Chapter 1 of a Harry Potter - Family/Friendship fanfiction. Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter's wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story..">
        # <meta name="description" content="A Gundam Wing/AC and Gundam Seed - Romance/Sci-Fi crossover fanfiction with characters: & Kira Y.. Story summary: One-Shoot dividido en dos partes. Kira va en camino a rescatar a Lacus, pero él no es el unico. Dos personajes de diferentes universos Gundams. SEED vs ZERO.">
        # <meta name="description" content="Chapter 1 of a Alvin and the chipmunks and Alpha and Omega crossover fanfiction with characters: Alvin S. & Humphrey. You'll just have to read to find out... No Flames Plesae... and tell me what you want to see by PM'ing me....">
        # genre is after first -, but before first 'fanfiction'.
        m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?)) (?:crossover )?fanfiction",
                     soup.find('meta',{'name':'description'})['content'])
        if m != None:
            genres=m.group('genres')
            # Hurt/Comfort is one genre.
            genres=re.sub('Hurt/Comfort','Hurt-Comfort',genres)
            for g in genres.split('/'):
                self.story.addToList('genre',g)

        return


    def getChapterText(self, url):
        logging.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        span = soup.find('div', {'id' : 'storytext'})

        if None == span:
            raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return utf8FromSoup(span)

_register_handler(FanFictionNetSiteAdapter)

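The metadata loop above works by regex-matching the `var name = value;` lines that fanfiction.net embeds in a script block (samples are quoted in the comments). A tiny standalone illustration of that same parsing, reusing the adapter's own regex on a few of the quoted sample lines:

    # Illustration only: the regex and sample lines come from the adapter above.
    import re

    sample = """    var storyid = 6577076;
        var words = 42787;
        var title_t = 'The Invitation';
        var datep = '12-21-10';"""

    for line in sample.split('\n'):
        m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$", line)
        if m is None:
            continue
        var, value = m.groups()
        value = re.sub(r'\\(.)', r'\1', value)  # strip javascript escaping
        print var, value   # e.g. "words 42787", "title_t The Invitation"
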
fanficdownloader/adapters/adapter_test1.py (Normal file, 89 lines)
File diff suppressed because one or more lines are too long.

fanficdownloader/adapters/adapter_twilightednet.py (Normal file, 200 lines)
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-

import time
import datetime
import logging
import re
import urllib
import urllib2

import BeautifulSoup as bs

import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup
from htmlcleanup import stripHTML

class TwilightedNetSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','tw')
        self.decode = "utf8"
        self.story.addToList("category","Twilight")
        self.username = "NoneGiven" # if left empty, twilighted.net doesn't return any message at all.
        self.password = ""

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
        logging.debug("storyId: (%s)"%self.story.getMetadata('storyId'))

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))


    @staticmethod
    def getSiteDomain():
        return 'www.twilighted.net'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.twilighted.net','twilighted.net']

    def getSiteExampleURLs(self):
        return "http://www.twilighted.net/viewstory.php?sid=1234 http://twilighted.net/viewstory.php?sid=5678"

    def getSiteURLPattern(self):
        return re.escape("http://")+r"(www\.)?"+re.escape("twilighted.net/viewstory.php?sid=")+r"\d+$"

    def needToLoginCheck(self, data):
        if 'Registered Users Only.' in data \
                or 'There is no such account on our website' in data \
                or "That password doesn't match the one in our database." in data:
            return True
        else:
            return False

    def performLogin(self, url):
        data = {}

        if self.password:
            data['penname'] = self.username
            data['password'] = self.password
        else:
            data['penname'] = self.getConfig("username")
            data['password'] = self.getConfig("password")
        data['cookiecheck'] = '1'
        data['submit'] = 'Submit'

        urlvals = urllib.urlencode(data)
        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logging.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                              data['penname']))

        d = self._fetchUrl(loginUrl, urlvals)

        if self.needToLoginCheck(d) :
            logging.info("Failed to login to URL %s as %s" % (loginUrl,
                                                              data['penname']))
            raise adapters.FailedToLogin(url,data['penname'])
            return False
        else:
            return True

    def extractChapterUrlsAndMetadata(self):

        url = self.url+'&index=1'
        logging.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise adapters.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        ## Title
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',a.string)

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## <meta name='description' content='<p>Description</p> ...' >
        ## Summary, strangely, is in the content attr of a <meta name='description'> tag
        ## which is escaped HTML. Unfortunately, we can't use it because they don't
        ## escape (') chars in the desc, breakin the tag.
        #meta_desc = soup.find('meta',{'name':'description'})
        #metasoup = bs.BeautifulStoneSoup(meta_desc['content'])
        #self.story.setMetadata('description',stripHTML(metasoup))

        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            value = labelspan.nextSibling
            label = labelspan.string

            if 'Summary' in label:
                ## Everything until the next span class='label'
                svalue = str(value)
                while not defaultGetattr(value,'class') == 'label':
                    svalue += str(value)
                    value = value.nextSibling
                self.story.setMetadata('description',stripHTML(svalue))

            if 'Rated' in label:
                self.story.setMetadata('rating', value.strip())

            if 'Word count' in label:
                self.story.setMetadata('numWords', value.strip())

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                catstext = [cat.string for cat in cats]
                for cat in catstext:
                    self.story.addToList('category',cat.string)

            ## twilighted.net doesn't use genre.
            # if 'Genre' in label:
            #     genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
            #     genrestext = [genre.string for genre in genres]
            #     self.genre = ', '.join(genrestext)
            #     for genre in genrestext:
            #         self.addSubject(genre.string)

            if 'Completed' in label:
                if 'Yes' in value:
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y"))))

            if 'Updated' in label:
                # there's a stray [ at the end.
                #value = value[0:-1]
                self.story.setMetadata('dateUpdated', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y"))))


    def getChapterText(self, url):

        logging.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.

        span = soup.find('div', {'id' : 'story'})

        if None == span:
            raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return utf8FromSoup(span)

_register_handler(TwilightedNetSiteAdapter)

fanficdownloader/adapters/adapter_whoficcom.py (Normal file, 183 lines)
@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-

import time
import datetime
import logging
import re
import urllib2

import BeautifulSoup as bs

import adapters
from adapters import _register_handler
from adapters.base_adapter import BaseSiteAdapter, utf8FromSoup

class WhoficComSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','whof')
        self.decode = "ISO-8859-1"

    @staticmethod
    def getSiteDomain():
        return 'www.whofic.com'

    def getSiteExampleURLs(self):
        return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+"\d+$"

    def extractChapterUrlsAndMetadata(self):

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # fetch the first chapter. From that we will:
        # - determine title, authorname, authorid
        # - get chapter list, if not one-shot.

        url = self.url+'&chapter=1'
        logging.debug("URL: "+url)

        # use BeautifulSoup HTML parser to make everything easier to find.
        try:
            soup = bs.BeautifulSoup(self._fetchUrl(url))
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise adapters.StoryDoesNotExist(self.url)
            else:
                raise e

        # pull title(title) and author from the HTML title.
        title = soup.find('title').string
        logging.debug('Title: %s' % title)
        title = title.split('::')[1].strip()
        self.story.setMetadata('title',title.split(' by ')[0].strip())
        self.story.setMetadata('author',title.split(' by ')[1].strip())

        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"viewuser.php"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])

        # Find the chapter selector
        select = soup.find('select', { 'name' : 'chapter' } )

        if select is None:
            # no selector found, so it's a one-chapter story.
            self.chapterUrls.append((self.story.getMetadata('title'),url))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = self.url + "&chapter=%s" % o['value']
                # just in case there's tags, like <i> in chapter titles.
                title = "%s" % o
                title = re.sub(r'<[^>]+>','',title)
                self.chapterUrls.append((title,url))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        ## Whofic.com puts none of the other meta data in the chapters
        ## or even the story chapter index page. Need to scrape the
        ## author page to find it.

        # <table width="100%" bordercolor="#333399" border="0" cellspacing="0" cellpadding="2"><tr><td>
        # <b><a href="viewstory.php?sid=38220">Accompaniment 2</a></b> by <a href="viewuser.php?uid=12412">clandestinemiscreant</a> [<a href="reviews.php?sid=38220">Reviews</a> - <a href="reviews.php?sid=38220">0</a>] <br>
        # This is a series of short stories written as an accompaniment to Season 2, Season 28 for us oldies, and each is unrelated except for that one factor. Each story is canon, in that it does not change established events at time of airing, based on things mentioned and/or implied and missing or deleted scenes that were not seen in the final aired episodes.<br>
        # <font size="-1"><b><a href="categories.php?catid=15">Tenth Doctor</a></b> - All Ages - None - Humor, Hurt/Comfort, Romance<br>
        # <i>Characters:</i> Rose Tyler<br>
        # <i>Series:</i> None<br>
        # <i>Published:</i> 2010.08.15 - <i>Updated:</i> 2010.08.16 - <i>Chapters:</i> 4 - <i>Completed:</i> Yes - <i>Word Count:</i> 4890 </font>
        # </td></tr></table>

        logging.debug("Author URL: "+self.story.getMetadata('authorUrl'))
        soup = bs.BeautifulStoneSoup(self._fetchUrl(self.story.getMetadata('authorUrl')),
                                     selfClosingTags=('br')) # normalize <br> tags to <br />

        # find this story in the list, parse it's metadata based on
        # lots of assumptions about the html, since there's little
        # tagging.
        a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')))
        metadata = a.findParent('td')
        metadatachunks = utf8FromSoup(metadata).split('<br />')
        # process metadata for this story.
        self.story.setMetadata('description', metadatachunks[1])

        # First line of the stuff with ' - ' separators
        moremeta = metadatachunks[2]
        moremeta = re.sub(r'<[^>]+>','',moremeta) # strip tags.

        moremetaparts = moremeta.split(' - ')

        # first part is category--whofic.com has categories
        # Doctor One-11, Torchwood, etc. We're going to
        # prepend any with 'Doctor' or 'Era' (Multi-Era, Other
        # Era) as 'Doctor Who'.
        #
        # Also push each in as 'extra tags'.
        category = moremetaparts[0]
        if 'Doctor' in category or 'Era' in category :
            self.story.addToList('category','Doctor Who')

        for cat in category.split(', '):
            self.story.addToList('category',cat)

        # next in that line is age rating.
        self.story.setMetadata('rating',moremetaparts[1])

        # after that is a possible list fo specific warnings,
        # Explicit Violence, Swearing, etc
        if "None" not in moremetaparts[2]:
            for warn in moremetaparts[2].split(', '):
                self.story.addToList('warnings',warn)

        # then genre. It's another comma list. All together
        # in genre, plus each in extra tags.
        genre=moremetaparts[3]
        for g in genre.split(r', '):
            self.story.addToList('genre',g)


        # the next line is stuff with ' - ' separators *and* names--with tags.
        moremeta = metadatachunks[5]
        moremeta = re.sub(r'<[^>]+>','',moremeta) # strip tags.

        moremetaparts = moremeta.split(' - ')

        for part in moremetaparts:
            (name,value) = part.split(': ')
            name=name.strip()
            value=value.strip()
            if name == 'Published':
                self.story.setMetadata('datePublished', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d'))))
            if name == 'Updated':
                self.story.setMetadata('dateUpdated', datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d'))))
            if name == 'Completed':
                if value == 'Yes':
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')
            if name == 'Word Count':
                self.story.setMetadata('numWords', value)

    def getChapterText(self, url):

        logging.debug('Getting chapter text from: %s' % url)

        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.


        # hardly a great identifier, I know, but whofic really doesn't
        # give us anything better to work with.
        span = soup.find('span', {'style' : 'font-size: 100%;'})

        if None == span:
            raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return utf8FromSoup(span)

_register_handler(WhoficComSiteAdapter)

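Most of the whofic.com metadata comes from plain string surgery on the author page rather than from markup: chunks split on '<br />', then on ' - ', then each 'Name: value' pair split on ': '. A quick illustration of the last two steps, using the tag-stripped sample line quoted in the comment block above:

    # Illustration only; the sample text comes from the HTML sample quoted above.
    line = "Published: 2010.08.15 - Updated: 2010.08.16 - Chapters: 4 - Completed: Yes - Word Count: 4890"
    for part in line.split(' - '):
        (name, value) = part.split(': ')
        print name.strip(), '->', value.strip()
    # Published -> 2010.08.15, Updated -> 2010.08.16, ..., Word Count -> 4890
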
fanficdownloader/adapters/base_adapter.py (Normal file, 102 lines)
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

import re
import datetime
import time
import urllib2 as u2
import urlparse as up

from story import Story
from configurable import Configurable
from htmlcleanup import removeEntities, removeAllEntities, stripHTML
from adapters import InvalidStoryURL

class BaseSiteAdapter(Configurable):

    @classmethod
    def matchesSite(cls,site):
        return site in cls.getAcceptDomains()

    @classmethod
    def getAcceptDomains(cls):
        return [cls.getSiteDomain()]

    def validateURL(self):
        return re.match(self.getSiteURLPattern(), self.url)

    def __init__(self, config, url):
        Configurable.__init__(self, config)
        self.addConfigSection(self.getSiteDomain())
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.storyDone = False
        self.story = Story()
        self.story.setMetadata('site',self.getSiteDomain())
        self.story.setMetadata('dateCreated',datetime.datetime.now())
        self.chapterUrls = [] # tuples of (chapter title,chapter url)
        self.decode = "utf8"
        self._setURL(url)
        if not self.validateURL():
            raise InvalidStoryURL(url,
                                  self.getSiteDomain(),
                                  self.getSiteExampleURLs())

    def _setURL(self,url):
        self.url = url
        self.parsedUrl = up.urlparse(url)
        self.host = self.parsedUrl.netloc
        self.path = self.parsedUrl.path
        self.story.setMetadata('storyUrl',self.url)

    def _fetchUrl(self, url, parameters=None):
        if self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))
        if parameters:
            return self.opener.open(url,parameters).read().decode(self.decode)
        else:
            return self.opener.open(url).read().decode(self.decode)

    # Does the download the first time it's called.
    def getStory(self):
        if not self.storyDone:
            self.extractChapterUrlsAndMetadata()
            for (title,url) in self.chapterUrls:
                self.story.addChapter(removeEntities(title),
                                      removeEntities(self.getChapterText(url)))
            self.storyDone = True
        return self.story

    ###############################

    @staticmethod
    def getSiteDomain():
        "Needs to be overriden in each adapter class."
        return 'no such domain'

    ## URL pattern validation is done *after* picking an adaptor based
    ## on domain instead of *as* the adaptor selector so we can offer
    ## the user example(s) for that particular site.
    def getSiteURLPattern(self):
        "Used to validate URL. Should be override in each adapter class."
        return '^http://'+re.escape(self.getSiteDomain())

    def getSiteExampleURLs(self):
        """
        Needs to be overriden in each adapter class. It's the adapter
        writer's responsibility to make sure the example(s) pass the
        URL validate.
        """
        return 'no such example'

    def extractChapterUrlsAndMetadata(self):
        "Needs to be overriden in each adapter class. Populates self.story metadata and self.chapterUrls"
        pass

    def getChapterText(self, url):
        "Needs to be overriden in each adapter class."
        pass


# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
def utf8FromSoup(soup):
    return soup.__str__('utf8').decode('utf-8')

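base_adapter.py is the template-method core of the rewrite: getStory() drives the two site-specific methods (extractChapterUrlsAndMetadata and getChapterText) and fills in a Story. A minimal sketch of how a caller, such as the new CLI, might use it; the ini contents here are made up and the actual CLI is not shown in this section:

    # Sketch only: exercising the adapter API defined above.
    import ConfigParser
    from adapters import getAdapter

    config = ConfigParser.SafeConfigParser()
    config.add_section('defaults')
    config.set('defaults', 'slow_down_sleep_time', '1')   # _fetchUrl() sleeps between requests

    adapter = getAdapter(config, 'http://www.fanfiction.net/s/1234/1/')
    story = adapter.getStory()            # first call fetches metadata and all chapters
    print story.getMetadata('title'), story.getMetadata('numChapters')
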
fanficdownloader/adastrafanfic.py (Normal file, 225 lines)
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-

import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime

from adapter import *

class Adastrafanfic(FanfictionSiteAdapter):
    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 01, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 01, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append ('fanfiction')
        self.subjects.append ('Ad Astra')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = ''
        self.category = 'Fanfiction'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.outputName = ''
        self.outputStorySep = '-aaff_'

        self.chapurl = False
        ss=self.url.split('?')
        logging.debug('ss=%s' % ss)
        if ss is not None and len(ss) > 1:
            sss = ss[1].replace('&amp;','&').split('&')
            logging.debug('sss=%s' % sss)
            if sss is not None and len(sss) > 0:
                ssss = sss[0].split('=')
                logging.debug('ssss=%s' % ssss)
                if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
                    self.storyId = ssss[1]
                if len(sss) > 1:
                    ssss = sss[1].split('=')
                    logging.debug('ssss=%s' % ssss)
                    if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
                        self.chapurl = True

        self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
        logging.debug('self.url=%s' % self.url)

        logging.debug("Created Adastrafanfic: url=%s" % (self.url))

    def requiresLogin(self, url = None):
        return False

    def extractIndividualUrls(self):
        # warning=5 bypasses 'are you old enough' checks.
        url = self.url + '&warning=5&chapter=1'

        data = ''
        try:
            data = self.opener.open(url).read()
        except Exception, e:
            data = ''
            logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
        if data is None:
            raise StoryDoesNotExist("Problem reading story URL " + url + "!")

        soup = None
        try:
            soup = bs.BeautifulStoneSoup(data)
        except:
            raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)

        title = soup.find('title').string
        logging.debug('Title: %s' % title)
        self.storyName = title.split(' by ')[0].strip()
        self.authorName = title.split(' by ')[1].strip()

        logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
        logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))

        select = soup.find('select', { 'name' : 'chapter' } )

        result = []
        if select is None:
            # no chapters found, try url by itself.
            result.append((url,self.storyName))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                # warning=5 bypasses 'are you old enough' checks.
                url = self.url + "&warning=5&chapter=%s" % o['value']
                # ad astra can have tags, like <i> in chapter titles.
                title = "%s" % o
                title = re.sub('<[^>]+>','',title)
                result.append((url,title))

        # warning=5 bypasses 'are you old enough' checks.
        url = self.url + "&warning=5&index=1"
        data = self.opener.open(url).read()

        soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
        # find authorId.
        titlediv = soup.find('div', {'id' : 'pagetitle'})
        for a in titlediv.findAll('a'):
            if a['href'].startswith('viewuser.php'):
                self.authorId = a['href'].split('=')[1]
                self.authorURL = 'http://'+self.host+'/'+a['href']

        # find other metadata
        contentdiv = soup.find('div', {'class' : 'content'})

        # adastra meta data is not well structured.  There's an
        # identifiable span class="label" around the *labels*, but
        # nothing around the content for each label.  And there's
        # <a href> around lots of the meta data values.

        # characters are given 'ln, fn'.  Need to parse out
        # separately.  Of course, I only realized *after* doing this
        # that output.py isn't actually doing anything with the
        # characters... <sigh>
        for a in contentdiv.findAll('a'):
            if a['href'].startswith('browse.php?type=characters'):
                name=a.text
                if a.text.find(', ') > -1:
                    names=a.text.split(', ')
                    names.reverse()
                    name=' '.join(names)
                self.addCharacter(name)

        contentdivstring = contentdiv.__str__('utf8')
        labeledlines = contentdivstring.strip().split('<span class="label">') # eats the <span class="label"> tags.
        metadata = dict()
        for labeledline in labeledlines:
            labeledline = re.sub(r'<[^>]+>','',labeledline)
            (label,sep,value)=labeledline.strip().partition(':') # a bit like split, but splits on first separator.
            metadata[label.strip()]=value.strip()
            #print label+"->"+value

        self.storyDescription = metadata['Summary']
        self.genre = metadata['Genre']
        for genre in self.genre.split(", "):
            self.addSubject(genre)
        self.category = metadata['Categories']
        for category in self.category.split(", "):
            self.addSubject(category)
        if metadata['Completed'] == "No":
            self.storyStatus = 'In-Progress'
        else:
            self.storyStatus = 'Completed'

        self.storyRating = metadata['Rated']
        self.storySeries = metadata['Series']
        self.numChapters = metadata['Chapters']
        self.numWords = metadata['Word count']
        self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(metadata['Published'], "%m/%d/%Y")))
        self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(metadata['Updated'], "%m/%d/%Y")))

        return result

    def getText(self, url):
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        logging.debug('Getting data from: %s' % url)

        data = ''
        try:
            data = self.opener.open(url).read()
        except Exception, e:
            data = ''
            logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
        if data is None:
            raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)

        soup = None
        try:
            # I really wish I knew why adastra needs the selfClosingTags to make <br /> work, but ficwad doesn't.
            soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES, selfClosingTags=('br','hr'))
        except:
            logging.info("Failed to decode: <%s>" % data)
            raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        return div.__str__('utf8')


class Adastrafanfic_UnitTests(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        pass

    def testGetUrlsWorks(self):
        url = 'http://www.adastrafanfic.com/viewstory.php?sid=426'
        self.assertEquals(32, len(Adastrafanfic(url).extractIndividualUrls()))

if __name__ == '__main__':
    unittest.main()

fanficdownloader/books/place holder.txt (Normal file, 0 lines)

fanficdownloader/configurable.py (Normal file, 49 lines)
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-

import ConfigParser

# All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc)
# inherit from Configurable.  The config file(s) uses ini format:
# [sections] with key:value settings.
#
# There's a [defaults] section which is overriden by the writer's
# section [epub], which is overriden by the adapter's section for each
# site.
#
# [defaults]
# titlepage_entries: category,genre, status
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com]
# titlepage_entries: category,genre, status,dateUpdated,rating

class Configurable(object):

    def __init__(self, config):
        self.config = config
        self.sectionslist = ['defaults']

    def addConfigSection(self,section):
        self.sectionslist.insert(0,section)

    def getConfig(self, key):
        val = ""
        for section in self.sectionslist:
            try:
                val = self.config.get(section,key)
                if val and val.lower() == "false":
                    val = False
                #print "getConfig(%s)=[%s]%s" % (key,section,val)
                return val
            except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e:
                pass

        return val

    # split and strip each.
    def getConfigList(self, key):
        vlist = self.getConfig(key).split(',')
        vlist = [ v.strip() for v in vlist ]
        #print "vlist("+key+"):"+str(vlist)
        return vlist

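The comment block at the top of configurable.py spells out the lookup order: the site section added by the adapter overrides the writer's section, which overrides [defaults]; getConfig() just walks self.sectionslist front to back and returns the first hit. A small sketch of that cascade with made-up ini contents:

    # Sketch of the section cascade described above (ini contents are made up).
    import ConfigParser, StringIO

    ini = """[defaults]
    titlepage_entries: category,genre,status
    [www.whofic.com]
    titlepage_entries: category,genre,status,dateUpdated,rating
    """
    config = ConfigParser.SafeConfigParser()
    config.readfp(StringIO.StringIO(ini))

    c = Configurable(config)
    c.addConfigSection('epub')             # writer's section; not present here, so it falls through
    c.addConfigSection('www.whofic.com')   # adapter's section; checked first and wins
    print c.getConfigList('titlepage_entries')
    # ['category', 'genre', 'status', 'dateUpdated', 'rating']
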
552
fanficdownloader/constants.py
Normal file
552
fanficdownloader/constants.py
Normal file
|
|
@ -0,0 +1,552 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
CSS = '''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
|
||||
pre { font-size: x-small; }
|
||||
sml { font-size: small; }
|
||||
h1 { text-align: center; }
|
||||
h2 { text-align: center; }
|
||||
h3 { text-align: center; }
|
||||
h4 { text-align: center; }
|
||||
h5 { text-align: center; }
|
||||
h6 { text-align: center; }
|
||||
h7 { text-align: left; font-size: large; font-weight: bold; }
|
||||
.CI {
|
||||
text-align:center;
|
||||
margin-top:0px;
|
||||
margin-bottom:0px;
|
||||
padding:0px;
|
||||
}
|
||||
.center {text-align: center;}
|
||||
.cover {text-align: center;}
|
||||
.full {width: 100%; }
|
||||
.quarter {width: 25%; }
|
||||
.smcap {font-variant: small-caps;}
|
||||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
'''
|
||||
|
||||
MIMETYPE = '''application/epub+zip'''
|
||||
|
||||
TITLE_HEADER = '''<?xml version="1.0" encoding="utf-8"?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
||||
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head><body>
|
||||
<p><h3 id="lnks"><b><a id="StoryLink" href="%s">%s</a></b> by <b><a id="AuthorLink" href="%s">%s</a></b></h3></p>
|
||||
'''
|
||||
|
||||
TITLE_ENTRY = '''<b>%s</b> %s<br />
|
||||
'''
|
||||
|
||||
TITLE_FOOTER = '''
|
||||
<br /><b>Summary:</b><br />%s<br />
|
||||
</body></html>
|
||||
'''
|
||||
|
||||
TABLE_TITLE_HEADER = TITLE_HEADER + '''
|
||||
<table class="full">
|
||||
'''
|
||||
|
||||
TABLE_TITLE_ENTRY = '''<tr><td><b>%s</b></td><td>%s</td></tr>
|
||||
'''
|
||||
|
||||
TABLE_TITLE_FOOTER = '''
|
||||
</table>
|
||||
''' + TITLE_FOOTER
|
||||
|
||||
CONTAINER = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||
<rootfiles>
|
||||
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
|
||||
</rootfiles>
|
||||
</container>
|
||||
'''
|
||||
|
||||
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
|
||||
unique-identifier="fanficdownloader-uuid">
|
||||
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dcterms="http://purl.org/dc/terms/"
|
||||
xmlns:opf="http://www.idpf.org/2007/opf"
|
||||
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
|
||||
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
|
||||
<dc:title>%s</dc:title>
|
||||
<dc:creator opf:role="aut">%s</dc:creator>
|
||||
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
|
||||
<dc:language>%s</dc:language>
|
||||
<dc:rights></dc:rights>
|
||||
<dc:date opf:event="publication">%s</dc:date>
|
||||
<dc:date opf:event="creation">%s</dc:date>
|
||||
<dc:date opf:event="modification">%s</dc:date>
|
||||
<meta name="calibre:timestamp" content="%s"/>
|
||||
<dc:description>%s</dc:description>
|
||||
'''
|
||||
|
||||
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
|
||||
<dc:identifier id="BookId">%s</dc:identifier>
|
||||
<dc:identifier opf:scheme="URL">%s</dc:identifier>
|
||||
<dc:source>%s</dc:source>
|
||||
<dc:type>FanFiction</dc:type>
|
||||
<meta name="calibre:rating" content="%s"/>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
|
||||
<item id="style" href="stylesheet.css" media-type="text/css" />
|
||||
'''
|
||||
|
||||
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
|
||||
'''
|
||||
|
||||
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
|
||||
'''
|
||||
|
||||
CONTENT_END_MANIFEST = ''' </manifest>
|
||||
<spine toc="ncx">
|
||||
'''
|
||||
|
||||
CONTENT_ITEMREF = ''' <itemref idref="%s" />
|
||||
'''
|
||||
|
||||
CONTENT_END = ''' </spine>
|
||||
</package>
|
||||
'''
|
||||
|
||||
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
|
||||
<head>
|
||||
<meta name="dtb:uid" content="%s"/>
|
||||
<meta name="dtb:depth" content="1"/>
|
||||
<meta name="dtb:totalPageCount" content="0"/>
|
||||
<meta name="dtb:maxPageNumber" content="0"/>
|
||||
</head>
|
||||
<docTitle>
|
||||
<text>%s</text>
|
||||
</docTitle>
|
||||
<navMap>
|
||||
'''
|
||||
|
||||
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
|
||||
<navLabel>
|
||||
<text>%s</text>
|
||||
</navLabel>
|
||||
<content src="%s"/>
|
||||
</navPoint>
|
||||
'''
|
||||
|
||||
TOC_END = '''</navMap>
|
||||
</ncx>
|
||||
'''
|
||||
|
||||
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>%s</title>
|
||||
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<h3>%s</h3>
|
||||
'''
|
||||
|
||||
XHTML_END = '''</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
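# --- Illustrative sketch, not part of the original file -------------------
# How the % templates above are filled in; the id/href/title values below
# are made-up placeholders rather than real story data.
_example_manifest_item = CONTENT_ITEM % ("file0001", "file0001.xhtml")
_example_spine_itemref = CONTENT_ITEMREF % "file0001"
_example_toc_entry = TOC_ITEM % ("file0001", 1, "Chapter 1", "file0001.xhtml")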
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
|
||||
'blockquote', 'br', 'center', 'cite', 'code', 'col',
|
||||
'colgroup', 'dd', 'del', 'dfn', 'dir', 'dl', 'dt', 'em',
|
||||
'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
|
||||
'ins', 'kbd', 'label', 'li', 'ol',
|
||||
'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
|
||||
'strong', 'sub', 'sup', 'u', 'ul']
|
||||
|
||||
acceptable_attributes = ['href']
|
||||
|
||||
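# Illustrative sketch, not part of the original file: one way the two
# whitelists above could be applied with BeautifulSoup (assumed available,
# as elsewhere in this package) to strip disallowed tags and attributes.
def _sanitize_example(html):
    import BeautifulSoup as bs
    soup = bs.BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in acceptable_elements:
            tag.extract()                    # drop the tag and its contents
        else:
            for attr, val in tag.attrs[:]:
                if attr not in acceptable_attributes:
                    del tag[attr]            # keep the tag, drop the attribute
    return unicode(soup)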
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
|
||||
entities = { 'á' : 'á',
|
||||
'Á' : 'Á',
|
||||
'Á' : 'Á',
|
||||
'á' : 'á',
|
||||
'â' : 'â',
|
||||
'Â' : 'Â',
|
||||
'Â' : 'Â',
|
||||
'â' : 'â',
|
||||
'´' : '´',
|
||||
'´' : '´',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'à' : 'à',
|
||||
'À' : 'À',
|
||||
'À' : 'À',
|
||||
'à' : 'à',
|
||||
'ℵ' : 'ℵ',
|
||||
'α' : 'α',
|
||||
'Α' : 'Α',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'∧' : '∧',
|
||||
'∠' : '∠',
|
||||
'å' : 'å',
|
||||
'Å' : 'Å',
|
||||
'Å' : 'Å',
|
||||
'å' : 'å',
|
||||
'≈' : '≈',
|
||||
'ã' : 'ã',
|
||||
'Ã' : 'Ã',
|
||||
'Ã' : 'Ã',
|
||||
'ã' : 'ã',
|
||||
'ä' : 'ä',
|
||||
'Ä' : 'Ä',
|
||||
'Ä' : 'Ä',
|
||||
'ä' : 'ä',
|
||||
'„' : '„',
|
||||
'β' : 'β',
|
||||
'Β' : 'Β',
|
||||
'¦' : '¦',
|
||||
'¦' : '¦',
|
||||
'•' : '•',
|
||||
'∩' : '∩',
|
||||
'ç' : 'ç',
|
||||
'Ç' : 'Ç',
|
||||
'Ç' : 'Ç',
|
||||
'ç' : 'ç',
|
||||
'¸' : '¸',
|
||||
'¸' : '¸',
|
||||
'¢' : '¢',
|
||||
'¢' : '¢',
|
||||
'χ' : 'χ',
|
||||
'Χ' : 'Χ',
|
||||
'ˆ' : 'ˆ',
|
||||
'♣' : '♣',
|
||||
'≅' : '≅',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'↵' : '↵',
|
||||
'∪' : '∪',
|
||||
'¤' : '¤',
|
||||
'¤' : '¤',
|
||||
'†' : '†',
|
||||
'‡' : '‡',
|
||||
'↓' : '↓',
|
||||
'⇓' : '⇓',
|
||||
'°' : '°',
|
||||
'°' : '°',
|
||||
'δ' : 'δ',
|
||||
'Δ' : 'Δ',
|
||||
'♦' : '♦',
|
||||
'÷' : '÷',
|
||||
'÷' : '÷',
|
||||
'é' : 'é',
|
||||
'É' : 'É',
|
||||
'É' : 'É',
|
||||
'é' : 'é',
|
||||
'ê' : 'ê',
|
||||
'Ê' : 'Ê',
|
||||
'Ê' : 'Ê',
|
||||
'ê' : 'ê',
|
||||
'è' : 'è',
|
||||
'È' : 'È',
|
||||
'È' : 'È',
|
||||
'è' : 'è',
|
||||
'∅' : '∅',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'ε' : 'ε',
|
||||
'Ε' : 'Ε',
|
||||
'≡' : '≡',
|
||||
'η' : 'η',
|
||||
'Η' : 'Η',
|
||||
'ð' : 'ð',
|
||||
'Ð' : 'Ð',
|
||||
'Ð' : 'Ð',
|
||||
'ð' : 'ð',
|
||||
'ë' : 'ë',
|
||||
'Ë' : 'Ë',
|
||||
'Ë' : 'Ë',
|
||||
'ë' : 'ë',
|
||||
'€' : '€',
|
||||
'∃' : '∃',
|
||||
'ƒ' : 'ƒ',
|
||||
'∀' : '∀',
|
||||
'½' : '½',
|
||||
'½' : '½',
|
||||
'¼' : '¼',
|
||||
'¼' : '¼',
|
||||
'¾' : '¾',
|
||||
'¾' : '¾',
|
||||
'⁄' : '⁄',
|
||||
'γ' : 'γ',
|
||||
'Γ' : 'Γ',
|
||||
'≥' : '≥',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'>' : '>',
|
||||
'↔' : '↔',
|
||||
'⇔' : '⇔',
|
||||
'♥' : '♥',
|
||||
'…' : '…',
|
||||
'í' : 'í',
|
||||
'Í' : 'Í',
|
||||
'Í' : 'Í',
|
||||
'í' : 'í',
|
||||
'î' : 'î',
|
||||
'Î' : 'Î',
|
||||
'Î' : 'Î',
|
||||
'î' : 'î',
|
||||
'¡' : '¡',
|
||||
'¡' : '¡',
|
||||
'ì' : 'ì',
|
||||
'Ì' : 'Ì',
|
||||
'Ì' : 'Ì',
|
||||
'ì' : 'ì',
|
||||
'ℑ' : 'ℑ',
|
||||
'∞' : '∞',
|
||||
'∫' : '∫',
|
||||
'ι' : 'ι',
|
||||
'Ι' : 'Ι',
|
||||
'¿' : '¿',
|
||||
'¿' : '¿',
|
||||
'∈' : '∈',
|
||||
'ï' : 'ï',
|
||||
'Ï' : 'Ï',
|
||||
'Ï' : 'Ï',
|
||||
'ï' : 'ï',
|
||||
'κ' : 'κ',
|
||||
'Κ' : 'Κ',
|
||||
'λ' : 'λ',
|
||||
'Λ' : 'Λ',
|
||||
'«' : '«',
|
||||
'«' : '«',
|
||||
'←' : '←',
|
||||
'⇐' : '⇐',
|
||||
'⌈' : '⌈',
|
||||
'“' : '“',
|
||||
'≤' : '≤',
|
||||
'⌊' : '⌊',
|
||||
'∗' : '∗',
|
||||
'◊' : '◊',
|
||||
'‎' : '',
|
||||
'‹' : '‹',
|
||||
'‘' : '‘',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'<' : '<',
|
||||
'¯' : '¯',
|
||||
'¯' : '¯',
|
||||
'—' : '—',
|
||||
'µ' : 'µ',
|
||||
'µ' : 'µ',
|
||||
'·' : '·',
|
||||
'·' : '·',
|
||||
'−' : '−',
|
||||
'μ' : 'μ',
|
||||
'Μ' : 'Μ',
|
||||
'∇' : '∇',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'–' : '–',
|
||||
'≠' : '≠',
|
||||
'∋' : '∋',
|
||||
'¬' : '¬',
|
||||
'¬' : '¬',
|
||||
'∉' : '∉',
|
||||
'⊄' : '⊄',
|
||||
'ñ' : 'ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'ñ' : 'ñ',
|
||||
'ν' : 'ν',
|
||||
'Ν' : 'Ν',
|
||||
'ó' : 'ó',
|
||||
'Ó' : 'Ó',
|
||||
'Ó' : 'Ó',
|
||||
'ó' : 'ó',
|
||||
'ô' : 'ô',
|
||||
'Ô' : 'Ô',
|
||||
'Ô' : 'Ô',
|
||||
'ô' : 'ô',
|
||||
'Œ' : 'Œ',
|
||||
'œ' : 'œ',
|
||||
'ò' : 'ò',
|
||||
'Ò' : 'Ò',
|
||||
'Ò' : 'Ò',
|
||||
'ò' : 'ò',
|
||||
'‾' : '‾',
|
||||
'ω' : 'ω',
|
||||
'Ω' : 'Ω',
|
||||
'ο' : 'ο',
|
||||
'Ο' : 'Ο',
|
||||
'⊕' : '⊕',
|
||||
'∨' : '∨',
|
||||
'ª' : 'ª',
|
||||
'ª' : 'ª',
|
||||
'º' : 'º',
|
||||
'º' : 'º',
|
||||
'ø' : 'ø',
|
||||
'Ø' : 'Ø',
|
||||
'Ø' : 'Ø',
|
||||
'ø' : 'ø',
|
||||
'õ' : 'õ',
|
||||
'Õ' : 'Õ',
|
||||
'Õ' : 'Õ',
|
||||
'õ' : 'õ',
|
||||
'⊗' : '⊗',
|
||||
'ö' : 'ö',
|
||||
'Ö' : 'Ö',
|
||||
'Ö' : 'Ö',
|
||||
'ö' : 'ö',
|
||||
'¶' : '¶',
|
||||
'¶' : '¶',
|
||||
'∂' : '∂',
|
||||
'‰' : '‰',
|
||||
'⊥' : '⊥',
|
||||
'φ' : 'φ',
|
||||
'Φ' : 'Φ',
|
||||
'π' : 'π',
|
||||
'Π' : 'Π',
|
||||
'ϖ' : 'ϖ',
|
||||
'±' : '±',
|
||||
'±' : '±',
|
||||
'£' : '£',
|
||||
'£' : '£',
|
||||
'′' : '′',
|
||||
'″' : '″',
|
||||
'∏' : '∏',
|
||||
'∝' : '∝',
|
||||
'ψ' : 'ψ',
|
||||
'Ψ' : 'Ψ',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'√' : '√',
|
||||
'»' : '»',
|
||||
'»' : '»',
|
||||
'→' : '→',
|
||||
'⇒' : '⇒',
|
||||
'⌉' : '⌉',
|
||||
'”' : '”',
|
||||
'ℜ' : 'ℜ',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'⌋' : '⌋',
|
||||
'ρ' : 'ρ',
|
||||
'Ρ' : 'Ρ',
|
||||
'‏' : '',
|
||||
'›' : '›',
|
||||
'’' : '’',
|
||||
'‚' : '‚',
|
||||
'š' : 'š',
|
||||
'Š' : 'Š',
|
||||
'⋅' : '⋅',
|
||||
'§' : '§',
|
||||
'§' : '§',
|
||||
'­' : '', # strange optional hyphenation control character, not just a dash
|
||||
'­' : '',
|
||||
'σ' : 'σ',
|
||||
'Σ' : 'Σ',
|
||||
'ς' : 'ς',
|
||||
'∼' : '∼',
|
||||
'♠' : '♠',
|
||||
'⊂' : '⊂',
|
||||
'⊆' : '⊆',
|
||||
'∑' : '∑',
|
||||
'¹' : '¹',
|
||||
'¹' : '¹',
|
||||
'²' : '²',
|
||||
'²' : '²',
|
||||
'³' : '³',
|
||||
'³' : '³',
|
||||
'⊃' : '⊃',
|
||||
'⊇' : '⊇',
|
||||
'ß' : 'ß',
|
||||
'ß' : 'ß',
|
||||
'τ' : 'τ',
|
||||
'Τ' : 'Τ',
|
||||
'∴' : '∴',
|
||||
'θ' : 'θ',
|
||||
'Θ' : 'Θ',
|
||||
'ϑ' : 'ϑ',
|
||||
' ' : ' ',
|
||||
'þ' : 'þ',
|
||||
'Þ' : 'Þ',
|
||||
'Þ' : 'Þ',
|
||||
'þ' : 'þ',
|
||||
'˜' : '˜',
|
||||
'×' : '×',
|
||||
'×' : '×',
|
||||
'™' : '™',
|
||||
'ú' : 'ú',
|
||||
'Ú' : 'Ú',
|
||||
'Ú' : 'Ú',
|
||||
'ú' : 'ú',
|
||||
'↑' : '↑',
|
||||
'⇑' : '⇑',
|
||||
'û' : 'û',
|
||||
'Û' : 'Û',
|
||||
'Û' : 'Û',
|
||||
'û' : 'û',
|
||||
'ù' : 'ù',
|
||||
'Ù' : 'Ù',
|
||||
'Ù' : 'Ù',
|
||||
'ù' : 'ù',
|
||||
'¨' : '¨',
|
||||
'¨' : '¨',
|
||||
'ϒ' : 'ϒ',
|
||||
'υ' : 'υ',
|
||||
'Υ' : 'Υ',
|
||||
'ü' : 'ü',
|
||||
'Ü' : 'Ü',
|
||||
'Ü' : 'Ü',
|
||||
'ü' : 'ü',
|
||||
'℘' : '℘',
|
||||
'ξ' : 'ξ',
|
||||
'Ξ' : 'Ξ',
|
||||
'ý' : 'ý',
|
||||
'Ý' : 'Ý',
|
||||
'Ý' : 'Ý',
|
||||
'ý' : 'ý',
|
||||
'¥' : '¥',
|
||||
'¥' : '¥',
|
||||
'ÿ' : 'ÿ',
|
||||
'Ÿ' : 'Ÿ',
|
||||
'ÿ' : 'ÿ',
|
||||
'ζ' : 'ζ',
|
||||
'Ζ' : 'Ζ',
|
||||
'‍' : '', # strange spacing control character, not just a space
|
||||
'‌' : '', # strange spacing control character, not just a space
|
||||
}
|
||||
|
||||
FB2_PROLOGUE = '<FictionBook>'
|
||||
FB2_DESCRIPTION = '''<description>
|
||||
<title-info>
|
||||
<genre>fanfiction</genre>
|
||||
<author>
|
||||
<first-name></first-name>
|
||||
<middle-name></middle-name>
|
||||
<last-name>%s</last-name>
|
||||
</author>
|
||||
<book-title>%s</book-title>
|
||||
<lang>eng</lang>
|
||||
</title-info>
|
||||
<document-info>
|
||||
<author>
|
||||
<nickname>sgzmd</nickname>
|
||||
</author>
|
||||
<date value="%s">%s</date>
|
||||
<id>sgzmd_%s</id>
|
||||
<version>2.0</version>
|
||||
</document-info>
|
||||
</description>'''
|
||||
|
||||
HTML_ESC_Definitions = 'HTML_Escape.def'
|
||||
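FB2_DESCRIPTION above is an ordinary %-template. A minimal sketch of filling it, assuming the constant is imported from this module; the author, title, dates and id below are placeholder values:

    fb2_description = FB2_DESCRIPTION % ("Beka0502",            # last-name
                                         "Draco's Redemption",  # book-title
                                         "2011-01-31",          # date value attribute
                                         "2011-01-31",          # date display text
                                         "5257563")             # id suffix (sgzmd_<id>)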
111
fanficdownloader/defaults.ini
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
[defaults]
|
||||
|
||||
## [defaults] section applies to all formats and sites but may be
|
||||
## overridden.
|
||||
|
||||
# All available titlepage_entries:
|
||||
# category
|
||||
# genre
|
||||
# status
|
||||
# datePublished
|
||||
# dateUpdated
|
||||
# dateCreated
|
||||
# rating
|
||||
# warnings
|
||||
# numChapters
|
||||
# numWords
|
||||
# site
|
||||
# siteabbrev
|
||||
# author
|
||||
# authorId
|
||||
# authorURL
|
||||
# title
|
||||
# storyId
|
||||
# storyUrl
|
||||
# extratags
|
||||
# description
|
||||
# formatname
|
||||
# formatext
|
||||
|
||||
## items to include in title page
|
||||
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyId,authorId,extratags,description
|
||||
|
||||
## include title page as first page.
|
||||
include_titlepage: true
|
||||
|
||||
## include TOC page immediately after title page.
|
||||
include_tocpage: true
|
||||
|
||||
## python string Template, string with ${title}, ${author} etc, same as titlepage_entries
|
||||
## Can include directories. ${formatext} will be added if not in name somewhere.
|
||||
output_filename: ${title}-${siteabbrev}_${storyId}${formatext}
|
||||
## Make directories as needed.
|
||||
make_directories: true
|
||||
|
||||
## put output (with output_filename) in a zip file zip_filename.
|
||||
zip_output: false
|
||||
## Can include directories. .zip will be added if not in name somewhere
|
||||
zip_filename: ${title}-${siteabbrev}_${storyId}${formatext}.zip
|
||||
|
||||
## try to make the output file name 'safe'--remove invalid filename chars.
|
||||
## applies to both output_filename & zip_filename
|
||||
safe_filename: true
|
||||
|
||||
## extra tags (comma separated) to include, primarily for epub.
|
||||
extratags: FanFiction
|
||||
|
||||
## number of seconds to sleep between calls to the story site.
|
||||
slow_down_sleep_time:0.5
|
||||
|
||||
## Each output format has a section that overrides [defaults]
|
||||
|
||||
[html]
|
||||
|
||||
[txt]
|
||||
## Add URLs since there aren't links.
|
||||
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,extratags,storyUrl, author URL, description
|
||||
|
||||
# use \r\n for line endings, the windows convention. txt output only.
|
||||
windows_eol: true
|
||||
|
||||
[epub]
|
||||
|
||||
## epub is already a zip file.
|
||||
zip_output: false
|
||||
|
||||
# possible subject tags: extratags, genre, category, warnings, lastupdate
|
||||
# lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
|
||||
include_subject_tags: extratags, genre, category, lastupdate
|
||||
include_tocpage: false
|
||||
|
||||
# epub->mobi conversions typically don't like tables.
|
||||
titlepage_use_table: true
|
||||
|
||||
## When using tables, make these span both columns.
|
||||
wide_titlepage_entries: description, storyUrl, author URL
|
||||
|
||||
|
||||
## Each site has a section that overrides [defaults] *and* the format section
|
||||
[test1.com]
|
||||
titlepage_entries: title,description,category,genre, status,dateCreated,rating,numChapters,numWords,extratags,description,storyUrl,extratags
|
||||
extratags: FanFiction,Testing
|
||||
|
||||
## If necessary, you can define [<site>:<format>] sections to customize
|
||||
## the formats differently for the same site. Overrides defaults, format and site.
|
||||
[test1.com:txt]
|
||||
extratags: FanFiction,Testing,Text
|
||||
|
||||
[test1.com:html]
|
||||
extratags: FanFiction,Testing,HTML
|
||||
|
||||
[www.twilighted.net]
|
||||
## Some sites require login (or login for some rated stories)
|
||||
## The program can prompt you, or you can save it in config.
|
||||
## This should go in your personal.ini, not defaults.ini.
|
||||
#username:YourName
|
||||
#password:yourpassword
|
||||
|
||||
[www.whofic.com]
|
||||
|
||||
[www.fanfiction.net]
|
||||
|
||||
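The output_filename and zip_filename settings above are ordinary Python string.Template patterns, as the comment notes. A quick sketch of how the default pattern expands; the metadata values are invented:

    from string import Template

    pattern = Template("${title}-${siteabbrev}_${storyId}${formatext}")
    name = pattern.substitute(title="Some Story", siteabbrev="ffnet",
                              storyId="5257563", formatext=".epub")
    print name   # Some Story-ffnet_5257563.epub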
220
fanficdownloader/downloader.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import getpass
|
||||
import logging
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import zipdir
|
||||
|
||||
import output
|
||||
import adapter
|
||||
from adapter import StoryArchivedAlready
|
||||
from adapter import StoryDoesNotExist
|
||||
from adapter import FailedToDownload
|
||||
from adapter import InvalidStoryURL
|
||||
from adapter import LoginRequiredException
|
||||
import ffnet
|
||||
import fpcom
|
||||
import ficwad
|
||||
import fictionalley
|
||||
import hpfiction
|
||||
import twilighted
|
||||
import twiwrite
|
||||
import adastrafanfic
|
||||
import whofic
|
||||
import potionsNsnitches
|
||||
import mediaminer
|
||||
|
||||
import time
|
||||
|
||||
class FanficLoader:
|
||||
'''A controller class which handles the interaction between various specific downloaders and writers'''
|
||||
booksDirectory = "books"
|
||||
standAlone = False
|
||||
|
||||
def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True, overwrite=False):
|
||||
self.adapter = adapter
|
||||
self.writerClass = writerClass
|
||||
self.quiet = quiet
|
||||
self.inmemory = inmemory
|
||||
self.compress = compress
|
||||
self.badLogin = False
|
||||
self.overWrite = overwrite
|
||||
|
||||
def getBooksDirectory(self):
|
||||
return self.booksDirectory
|
||||
|
||||
def setBooksDirectory(self, bd):
|
||||
self.booksDirectory = bd
|
||||
return self.booksDirectory
|
||||
|
||||
def getStandAlone(self):
|
||||
return self.standAlone
|
||||
|
||||
def setStandAlone(self, sa):
|
||||
self.standAlone = sa
|
||||
return self.standAlone
|
||||
|
||||
def getOverWrite(self):
|
||||
return self.overWrite
|
||||
|
||||
def setOverWrite(self, sa):
|
||||
self.overWrite = sa
|
||||
return self.overWrite
|
||||
|
||||
def getAdapter(self):
|
||||
return self.adapter
|
||||
|
||||
def download(self):
|
||||
logging.debug("Trying to download the story")
|
||||
if self.adapter.requiresLogin():
|
||||
logging.debug("Story requires login")
|
||||
if not self.adapter.performLogin():
|
||||
logging.debug("Login/password problem")
|
||||
self.badLogin = True
|
||||
raise adapter.LoginRequiredException(self.adapter.url)
|
||||
|
||||
urls = self.adapter.extractIndividualUrls()
|
||||
|
||||
logging.debug("self.writerClass=%s" % self.writerClass)
|
||||
if self.standAlone and not self.inmemory:
|
||||
s = self.adapter.getOutputFileName(self.booksDirectory, self.writerClass.getFormatExt())
|
||||
logging.debug("Always overwrite? %s" % self.overWrite)
|
||||
if not self.overWrite:
|
||||
logging.debug("Checking if current archive of the story exists. Filename=%s" % s)
|
||||
if not zipdir.checkNewer ( s, self.adapter.getStoryUpdated() ):
|
||||
raise StoryArchivedAlready("A Current archive file \"" + s + "\" already exists! Skipping!")
|
||||
else:
|
||||
logging.debug("Do not check for existance of archive file.")
|
||||
|
||||
self.writer = self.writerClass(self.booksDirectory,
|
||||
self.adapter,
|
||||
inmemory=self.inmemory,
|
||||
compress=self.compress)
|
||||
|
||||
i = 1
|
||||
for u,n in urls:
|
||||
if not self.quiet:
|
||||
print('Downloading chapter %d/%d' % (i, len(urls)))
|
||||
text = self.adapter.getText(u)
|
||||
self.writer.writeChapter(i, n, text)
|
||||
i = i+1
|
||||
#time.sleep(2)
|
||||
|
||||
self.writer.finalise()
|
||||
|
||||
if self.inmemory:
|
||||
self.name = self.writer.name
|
||||
return self.writer.output.getvalue()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
|
||||
argvlen = len(sys.argv)
|
||||
url = None
|
||||
bookFormat = 'epub'
|
||||
if argvlen > 1:
|
||||
url = sys.argv[1]
|
||||
if argvlen > 2:
|
||||
bookFormat = sys.argv[2]
|
||||
|
||||
if url is None:
|
||||
print >> sys.stderr, "Usage: downloader.py URL Type"
|
||||
sys.exit(-1)
|
||||
|
||||
if type(url) is unicode:
|
||||
print('URL is unicode')
|
||||
url = url.encode('latin1')
|
||||
url = url.strip()
|
||||
adapter = None
|
||||
writerClass = None
|
||||
|
||||
if url.find('fanficauthors') != -1:
|
||||
print >> sys.stderr, "fanficauthors.net already provides ebooks"
|
||||
sys.exit(0)
|
||||
elif url.find('fictionalley') != -1:
|
||||
adapter = fictionalley.FictionAlley(url)
|
||||
elif url.find('ficwad') != -1:
|
||||
adapter = ficwad.FicWad(url)
|
||||
elif url.find('fanfiction.net') != -1:
|
||||
adapter = ffnet.FFNet(url)
|
||||
elif url.find('fictionpress.com') != -1:
|
||||
adapter = fpcom.FPCom(url)
|
||||
elif url.find('harrypotterfanfiction.com') != -1:
|
||||
adapter = hpfiction.HPFiction(url)
|
||||
elif url.find('twilighted.net') != -1:
|
||||
adapter = twilighted.Twilighted(url)
|
||||
elif url.find('twiwrite.net') != -1:
|
||||
adapter = twiwrite.Twiwrite(url)
|
||||
elif url.find('adastrafanfic.com') != -1:
|
||||
adapter = adastrafanfic.Adastrafanfic(url)
|
||||
elif url.find('whofic.com') != -1:
|
||||
adapter = whofic.Whofic(url)
|
||||
elif url.find('potionsandsnitches.net') != -1:
|
||||
adapter = potionsNsnitches.PotionsNSnitches(url)
|
||||
elif url.find('mediaminer.org') != -1:
|
||||
adapter = mediaminer.MediaMiner(url)
|
||||
else:
|
||||
print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
|
||||
sys.exit(1)
|
||||
|
||||
if bookFormat == 'epub':
|
||||
writerClass = output.EPubFanficWriter
|
||||
elif bookFormat == 'html':
|
||||
writerClass = output.HTMLWriter
|
||||
elif bookFormat == 'mobi':
|
||||
writerClass = output.MobiWriter
|
||||
elif bookFormat == 'text':
|
||||
writerClass = output.TextWriter
|
||||
|
||||
if adapter.requiresLogin(url):
|
||||
print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
|
||||
sys.stdout.write("Can I haz ur login? ")
|
||||
login = sys.stdin.readline().strip()
|
||||
password = getpass.getpass(prompt='Can I haz ur password? ')
|
||||
print("Login: `%s`, Password: `%s`" % (login, password))
|
||||
|
||||
adapter.setLogin(login)
|
||||
adapter.setPassword(password)
|
||||
|
||||
|
||||
loader = FanficLoader(adapter,
|
||||
writerClass)
|
||||
loader.setStandAlone(True)
|
||||
if bookFormat != 'epub':
|
||||
loader.setOverWrite(True)
|
||||
|
||||
|
||||
try:
|
||||
loader.download()
|
||||
except FailedToDownload, ftd:
|
||||
print >> sys.stderr, str(ftd)
|
||||
sys.exit(2) # Error Downloading
|
||||
except InvalidStoryURL, isu:
|
||||
print >> sys.stderr, str(isu)
|
||||
sys.exit(3) # Unknown Error
|
||||
except StoryArchivedAlready, se:
|
||||
print >> sys.stderr, str(se)
|
||||
sys.exit(10) # Skipped
|
||||
except StoryDoesNotExist, sdne:
|
||||
print >> sys.stderr, str(sdne)
|
||||
sys.exit(20) # Missing
|
||||
except LoginRequiredException, lre:
|
||||
print >> sys.stderr, str(lre)
|
||||
sys.exit(30) # Missing
|
||||
except Exception, e:
|
||||
print >> sys.stderr, str(e)
|
||||
sys.exit(99) # Unknown Error
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
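Besides the __main__ entry point above, the same classes can be driven from another script. A minimal sketch, assuming the package imports as fanficdownloader; the URL is a placeholder and login/error handling is omitted:

    from fanficdownloader import downloader, ffnet, output

    adapter = ffnet.FFNet('http://www.fanfiction.net/s/1234567/1/')
    loader = downloader.FanficLoader(adapter, output.EPubFanficWriter)
    loader.setStandAlone(True)
    loader.download()   # writes the epub under the default "books" directory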
293
fanficdownloader/epubmerge.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# epubmerge.py 1.0
|
||||
|
||||
# Copyright 2011, Jim Miller
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import getopt
|
||||
import os
|
||||
|
||||
import zlib
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
from time import time
|
||||
|
||||
from xml.dom.minidom import parse, parseString, getDOMImplementation
|
||||
|
||||
def usage():
|
||||
print "epubmerge 1.0 Merges multiple epub format ebooks together"
|
||||
print "\nUsage: " + sys.argv[0]+" [options] <input epub> [<input epub> ...]\n"
|
||||
print " Options:"
|
||||
print " -h --help"
|
||||
print " -o <output file> --output=<output file> Default: merge.epub"
|
||||
print " -t <output title> --title=<output title> Default: '<First Title> Anthology'"
|
||||
print " -a <author name> --author=<author name> Default: <All authors from epubs>"
|
||||
print " Multiple authors may be given."
|
||||
|
||||
def main():
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], "t:a:o:h", ["title=","author=", "output=","help"])
|
||||
except getopt.GetoptError, err:
|
||||
# print help information and exit:
|
||||
print str(err) # will print something like "option -a not recognized"
|
||||
usage()
|
||||
sys.exit(2)
|
||||
|
||||
if( len(args) < 1 ):
|
||||
usage()
|
||||
sys.exit()
|
||||
|
||||
outputopt = "merge.epub"
|
||||
titleopt = None
|
||||
authoropts = [] # list of strings
|
||||
|
||||
for o, a in opts:
|
||||
if o in ("-h", "--help"):
|
||||
usage()
|
||||
sys.exit()
|
||||
elif o in ("-t", "--title"):
|
||||
titleopt = a
|
||||
elif o in ("-a", "--author"):
|
||||
authoropts.append(a)
|
||||
elif o in ("-o", "--output"):
|
||||
outputopt = a
|
||||
else:
|
||||
assert False, "unhandled option"
|
||||
|
||||
## Add .epub if not already there.
|
||||
if( not outputopt.lower().endswith(".epub") ):
|
||||
outputopt=outputopt+".epub"
|
||||
|
||||
print "output file: "+outputopt
|
||||
|
||||
## Write mimetype file, must be first and uncompressed.
|
||||
## Older versions of python(2.4/5) don't allow you to specify
|
||||
## compression by individual file.
|
||||
## Overwrite if existing output file.
|
||||
outputepub = ZipFile(outputopt, "w", compression=ZIP_STORED)
|
||||
outputepub.debug = 3
|
||||
outputepub.writestr("mimetype", "application/epub+zip")
|
||||
outputepub.close()
|
||||
|
||||
## Re-open file for content.
|
||||
outputepub = ZipFile(outputopt, "a", compression=ZIP_DEFLATED)
|
||||
outputepub.debug = 3
|
||||
|
||||
## Create META-INF/container.xml file. The only thing it does is
|
||||
## point to content.opf
|
||||
containerdom = getDOMImplementation().createDocument(None, "container", None)
|
||||
containertop = containerdom.documentElement
|
||||
containertop.setAttribute("version","1.0")
|
||||
containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
|
||||
rootfiles = containerdom.createElement("rootfiles")
|
||||
containertop.appendChild(rootfiles)
|
||||
rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
|
||||
"media-type":"application/oebps-package+xml"}))
|
||||
outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))
|
||||
|
||||
## Process input epubs.
|
||||
|
||||
items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
|
||||
items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
|
||||
## but it needs to be in the items manifest.
|
||||
itemrefs = [] # list of strings -- idrefs from .opfs' spines
|
||||
navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
|
||||
|
||||
booktitles = [] # list of strings -- Each book's title
|
||||
allauthors = [] # list of lists of strings -- Each book's list of authors.
|
||||
|
||||
booknum=1
|
||||
for filename in args:
|
||||
print "input file: "+filename
|
||||
book = "%d" % booknum
|
||||
|
||||
epub = ZipFile(filename, 'r')
|
||||
|
||||
## Find the .opf file.
|
||||
container = epub.read("META-INF/container.xml")
|
||||
containerdom = parseString(container)
|
||||
rootfilenodelist = containerdom.getElementsByTagName("rootfile")
|
||||
rootfilename = rootfilenodelist[0].getAttribute("full-path")
|
||||
|
||||
## Save the path to the .opf file--hrefs inside it are relative to it.
|
||||
relpath = os.path.dirname(rootfilename)
|
||||
if( len(relpath) > 0 ):
|
||||
relpath=relpath+"/"
|
||||
|
||||
metadom = parseString(epub.read(rootfilename))
|
||||
|
||||
## Save indiv book title
|
||||
booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
|
||||
|
||||
## Save authors.
|
||||
authors=[]
|
||||
for creator in metadom.getElementsByTagName("dc:creator"):
|
||||
if( creator.getAttribute("opf:role") == "aut" ):
|
||||
authors.append(creator.firstChild.data)
|
||||
allauthors.append(authors)
|
||||
|
||||
for item in metadom.getElementsByTagName("item"):
|
||||
if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
|
||||
# TOC file is only one with this type--as far as I know.
|
||||
# grab the whole navmap, deal with it later.
|
||||
tocdom = parseString(epub.read(relpath+item.getAttribute("href")))
|
||||
|
||||
for navpoint in tocdom.getElementsByTagName("navPoint"):
|
||||
navpoint.setAttribute("id","a"+book+navpoint.getAttribute("id"))
|
||||
|
||||
for content in tocdom.getElementsByTagName("content"):
|
||||
content.setAttribute("src",book+"/"+relpath+content.getAttribute("src"))
|
||||
|
||||
navmaps.append(tocdom.getElementsByTagName("navMap")[0])
|
||||
else:
|
||||
id="a"+book+item.getAttribute("id")
|
||||
href=book+"/"+relpath+item.getAttribute("href")
|
||||
href=href.encode('utf8')
|
||||
items.append((id,href,item.getAttribute("media-type")))
|
||||
outputepub.writestr(href,
|
||||
epub.read(relpath+item.getAttribute("href")))
|
||||
|
||||
for itemref in metadom.getElementsByTagName("itemref"):
|
||||
itemrefs.append("a"+book+itemref.getAttribute("idref"))
|
||||
|
||||
booknum=booknum+1;
|
||||
|
||||
## create content.opf file.
|
||||
uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme.
|
||||
contentdom = getDOMImplementation().createDocument(None, "package", None)
|
||||
package = contentdom.documentElement
|
||||
package.setAttribute("version","2.0")
|
||||
package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
|
||||
package.setAttribute("unique-identifier","epubmerge-id")
|
||||
metadata=newTag(contentdom,"metadata",
|
||||
attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
|
||||
"xmlns:opf":"http://www.idpf.org/2007/opf"})
|
||||
package.appendChild(metadata)
|
||||
metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"}))
|
||||
if( titleopt is None ):
|
||||
titleopt = booktitles[0]+" Anthology"
|
||||
metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))
|
||||
|
||||
# If cmdline authors, use those instead of those collected from the epubs
|
||||
# (allauthors kept for TOC & description gen below.)
|
||||
if( len(authoropts) > 1 ):
|
||||
useauthors=[authoropts]
|
||||
else:
|
||||
useauthors=allauthors
|
||||
|
||||
usedauthors=dict()
|
||||
for authorlist in useauthors:
|
||||
for author in authorlist:
|
||||
if( not usedauthors.has_key(author) ):
|
||||
usedauthors[author]=author
|
||||
metadata.appendChild(newTag(contentdom,"dc:creator",
|
||||
attrs={"opf:role":"aut"},
|
||||
text=author))
|
||||
|
||||
metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"}))
|
||||
metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
|
||||
|
||||
# created now, but not filled in until TOC generation to save loops.
|
||||
description = newTag(contentdom,"dc:description",text="Anthology containing:\n")
|
||||
metadata.appendChild(description)
|
||||
|
||||
manifest = contentdom.createElement("manifest")
|
||||
package.appendChild(manifest)
|
||||
for item in items:
|
||||
(id,href,type)=item
|
||||
manifest.appendChild(newTag(contentdom,"item",
|
||||
attrs={'id':id,
|
||||
'href':href,
|
||||
'media-type':type}))
|
||||
|
||||
spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
|
||||
package.appendChild(spine)
|
||||
for itemref in itemrefs:
|
||||
spine.appendChild(newTag(contentdom,"itemref",
|
||||
attrs={"idref":itemref,
|
||||
"linear":"yes"}))
|
||||
|
||||
## create toc.ncx file
|
||||
tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
|
||||
ncx = tocncxdom.documentElement
|
||||
ncx.setAttribute("version","2005-1")
|
||||
ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
|
||||
head = tocncxdom.createElement("head")
|
||||
ncx.appendChild(head)
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:uid", "content":uniqueid}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:depth", "content":"1"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:totalPageCount", "content":"0"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:maxPageNumber", "content":"0"}))
|
||||
|
||||
docTitle = tocncxdom.createElement("docTitle")
|
||||
docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt))
|
||||
ncx.appendChild(docTitle)
|
||||
|
||||
tocnavMap = tocncxdom.createElement("navMap")
|
||||
ncx.appendChild(tocnavMap)
|
||||
|
||||
## TOC navPoints can be nested, but this flattens them for
|
||||
## simplicity, plus adds a navPoint for each epub.
|
||||
booknum=0
|
||||
for navmap in navmaps:
|
||||
navpoints = navmap.getElementsByTagName("navPoint")
|
||||
## Copy first navPoint of each epub, give a different id and
|
||||
## text: bookname by authorname
|
||||
newnav = navpoints[0].cloneNode(True)
|
||||
newnav.setAttribute("id","book"+newnav.getAttribute("id"))
|
||||
## For purposes of TOC titling & desc, use first book author
|
||||
newtext = newTag(tocncxdom,"text",text=booktitles[booknum]+" by "+allauthors[booknum][0])
|
||||
description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n"))
|
||||
text = newnav.getElementsByTagName("text")[0]
|
||||
text.parentNode.replaceChild(newtext,text)
|
||||
tocnavMap.appendChild(newnav)
|
||||
|
||||
for navpoint in navpoints:
|
||||
tocnavMap.appendChild(navpoint)
|
||||
booknum=booknum+1;
|
||||
|
||||
## Force strict ordering of playOrder
|
||||
playorder=1
|
||||
for navpoint in tocncxdom.getElementsByTagName("navPoint"):
|
||||
navpoint.setAttribute("playOrder","%d" % playorder)
|
||||
if( not navpoint.getAttribute("id").startswith("book") ):
|
||||
playorder = playorder + 1
|
||||
|
||||
## content.opf written now due to description being filled in
|
||||
## during TOC generation to save loops.
|
||||
outputepub.writestr("content.opf",contentdom.toxml('utf-8'))
|
||||
outputepub.writestr("toc.ncx",tocncxdom.toxml('utf-8'))
|
||||
|
||||
outputepub.close()
|
||||
|
||||
## Utility method for creating new tags.
|
||||
def newTag(dom,name,attrs=None,text=None):
|
||||
tag = dom.createElement(name)
|
||||
if( attrs is not None ):
|
||||
for attr in attrs.keys():
|
||||
tag.setAttribute(attr,attrs[attr])
|
||||
if( text is not None ):
|
||||
tag.appendChild(dom.createTextNode(text))
|
||||
return tag
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
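The newTag() helper at the bottom of epubmerge.py is a thin convenience wrapper over minidom. A small sketch of what it produces; the import path and the attribute values are assumptions for illustration only:

    from xml.dom.minidom import getDOMImplementation
    from fanficdownloader.epubmerge import newTag

    dom = getDOMImplementation().createDocument(None, "package", None)
    item = newTag(dom, "item", attrs={"id": "a1item1",
                                      "href": "1/chapter1.xhtml",
                                      "media-type": "application/xhtml+xml"})
    print item.toxml()
    # <item href="1/chapter1.xhtml" id="a1item1" media-type="application/xhtml+xml"/>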
368
fanficdownloader/ffnet.py
Normal file
|
|
@ -0,0 +1,368 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FFNet(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
self.storyName = 'FF.Net story'
|
||||
self.authorName = 'FF.Net author'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.authorId = '0'
|
||||
self.authorURL = self.path
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('FanFiction')
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'FF.Net Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-ffnet_'
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
if self.path.startswith('/'):
|
||||
self.path = self.path[1:]
|
||||
|
||||
spl = self.path.split('/')
|
||||
logging.debug('spl=%s' % spl)
|
||||
if spl is not None:
|
||||
if len(spl) > 0 and spl[0] != 's':
|
||||
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
|
||||
if len(spl) > 1:
|
||||
self.storyId = spl[1]
|
||||
if len(spl) > 2:
|
||||
chapter = spl[1]
|
||||
else:
|
||||
chapter = '1'
|
||||
if len(spl) == 5:
|
||||
self.path = "/".join(spl[1:-1])
|
||||
|
||||
if self.path.endswith('/'):
|
||||
self.path = self.path[:-1]
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
if self.host is not None and self.host == "m.fanfiction.net":
|
||||
self.host = "www.fanfiction.net"
|
||||
logging.debug('self.host=%s' % self.host)
|
||||
self.url = "http://" + self.host + "/" + self.path
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
logging.debug('self.storyId=%s' % self.storyId)
|
||||
if not self.appEngine:
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
else:
|
||||
self.opener = None
|
||||
|
||||
logging.debug("Created FF.Net: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def _getVarValue(self, varstr):
|
||||
#logging.debug('_getVarValue varstr=%s' % varstr)
|
||||
vals = varstr.split('=')
|
||||
#logging.debug('vals=%s' % vals)
|
||||
retstr="".join(vals[+1:])
|
||||
#logging.debug('retstr=%s' % retstr)
|
||||
if retstr.startswith(' '):
|
||||
retstr = retstr[1:]
|
||||
if retstr.endswith(';'):
|
||||
retstr = retstr[:-1]
|
||||
return retstr
|
||||
|
||||
def _splitCrossover(self, subject):
|
||||
if "Crossover" in subject:
|
||||
self.addSubject ("Crossover")
|
||||
logging.debug('Crossover=%s' % subject)
|
||||
if subject.find(' and ') != -1:
|
||||
words = subject.split(' ')
|
||||
logging.debug('words=%s' % words)
|
||||
subj = ''
|
||||
for s in words:
|
||||
if s in "and Crossover":
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
subj = ''
|
||||
else:
|
||||
if len(subj) > 0:
|
||||
subj = subj + ' '
|
||||
subj = subj + s
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
else:
|
||||
self.addSubject(subject)
|
||||
return True
|
||||
|
||||
def _splitGenre(self, subject):
|
||||
if len(subject) > 0:
|
||||
words = subject.split('/')
|
||||
logging.debug('words=%s' % words)
|
||||
for subj in words:
|
||||
if len(subj) > 0:
|
||||
self.addSubject(subj)
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(self.url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
d2 = re.sub('&\#[0-9]+;', ' ', data)
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(d2)
|
||||
except:
|
||||
logging.error("Failed to decode: <%s>" % d2)
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
allA = soup.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
|
||||
self.authorName = a.string
|
||||
(u1, u2, self.authorId, u3) = a['href'].split('/')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
urls = []
|
||||
lines = data.split('\n')
|
||||
for l in lines:
|
||||
if l.find("»") != -1 and l.find('<b>') != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyName = unicode(s2.find('b').string)
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif l.find("<a href='/u/") != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.authorName = unicode(s2.a.string)
|
||||
(u1, u2, self.authorId, u3) = s2.a['href'].split('/')
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
elif l.find("Rated: <a href=") != -1:
|
||||
if "Complete" in l:
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyRating = unicode(s2.a.string).strip()
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
logging.debug('s2.a=%s' % s2.a)
|
||||
s3 = l.split(' - ')
|
||||
logging.debug('s3=%s' % s3)
|
||||
if len(s3) > 0:
|
||||
if s3[1].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.language = s3[1].strip()
|
||||
logging.debug('self.language=%s' % self.language)
|
||||
if len(s3) > 1:
|
||||
if s3[2].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.genre = s3[2].strip()
|
||||
if "&" in self.genre:
|
||||
self.genre = ''
|
||||
continue
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
self._splitGenre(self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif l.find("<SELECT title='chapter navigation'") != -1:
|
||||
if len(urls) > 0:
|
||||
continue
|
||||
try:
|
||||
u = l.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
u = l
|
||||
except:
|
||||
u = l.encode('ascii', 'xmlcharrefreplace')
|
||||
u = re.sub('&\#[0-9]+;', ' ', u)
|
||||
s2 = bs.BeautifulSoup(u)
|
||||
options = s2.findAll('option')
|
||||
for o in options:
|
||||
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value']
|
||||
title = o.string
|
||||
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
|
||||
urls.append((url,title))
|
||||
elif l.find("var chapters") != -1:
|
||||
self.numChapters = self._getVarValue (l)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif l.find("var words") != -1:
|
||||
self.numWords = self._getVarValue (l)
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif l.find("var categoryid") != -1:
|
||||
categoryid = self._getVarValue (l)
|
||||
logging.debug('categoryid=%s' % categoryid)
|
||||
elif l.find("var cat_title") != -1:
|
||||
self.category = self._getVarValue (l).strip("'")
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self._splitCrossover(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif l.find("var summary") != -1:
|
||||
self.storyDescription = self._getVarValue (l).strip("'")
|
||||
if '&' in self.storyDescription:
|
||||
s = self.storyDescription.split('&')
|
||||
logging.debug('s=%s' % s)
|
||||
self.storyDescription = ''
|
||||
for ss in s:
|
||||
if len(self.storyDescription) > 0:
|
||||
if len(ss) > 4 and 'amp;' in ss[1:4]:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = ss
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace('\n',' ').replace('\r',''))
|
||||
elif l.find("var datep") != -1:
|
||||
dateps = self._getVarValue (l)
|
||||
self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
elif l.find("var dateu") != -1:
|
||||
dateus = self._getVarValue (l)
|
||||
self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
|
||||
if len(urls) <= 0:
|
||||
# no chapters found, try url by itself.
|
||||
urls.append((self.url,self.storyName))
|
||||
|
||||
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
|
||||
|
||||
#logging.debug('urls=%s' % urls)
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
data = None
|
||||
|
||||
# try up to three times, with longer sleeps first.
|
||||
for sleeptime in [0.5, 4, 9]:
|
||||
time.sleep(sleeptime)
|
||||
try:
|
||||
logging.debug("Fetching URL: %s sleeptime: %f" % (url, sleeptime))
|
||||
data = self.fetchUrl(url)
|
||||
if data is not None:
|
||||
break
|
||||
except Exception, e:
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
logging.error("Data downloaded: <%s>" % data)
|
||||
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
lines = data.split('\n')
|
||||
|
||||
textbuf = ''
|
||||
emit = False
|
||||
|
||||
olddata = data
|
||||
try:
|
||||
data = data.decode('utf8')
|
||||
except:
|
||||
data = olddata
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.debug(data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
if "Story Not Found" in data:
|
||||
logging.info("Story not Found at %s" % url)
|
||||
raise FailedToDownload("Story not Found at %s" % url)
|
||||
logging.debug(data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testChaptersAuthStory(self):
|
||||
f = FFNet('http://www.fanfiction.net/s/5257563/1')
|
||||
f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals('Beka0502', f.getAuthorName())
|
||||
self.assertEquals("Draco's Redemption", f.getStoryName())
|
||||
|
||||
def testChaptersCountNames(self):
|
||||
f = FFNet('http://www.fanfiction.net/s/5257563/1')
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals(10, len(urls))
|
||||
|
||||
def testGetText(self):
|
||||
url = 'http://www.fanfiction.net/s/5257563/1'
|
||||
f = FFNet(url)
|
||||
text = f.getText(url)
|
||||
self.assertTrue(text.find('He was just about to look at some photos when he heard a crack') != -1)
|
||||
|
||||
def testBrokenWands(self):
|
||||
url = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
|
||||
f = FFNet(url)
|
||||
text = f.getText(url)
|
||||
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
def testFictionPress(self):
|
||||
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
|
||||
f = FFNet(url)
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals('Behind This Facade', f.getStoryName())
|
||||
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
|
||||
|
||||
text = f.getText(url)
|
||||
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
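FFNet rebuilds the chapter list from the chapter-navigation <SELECT>, where each option value is just a chapter number, so chapter URLs follow a simple /s/<storyId>/<chapter> pattern. A small sketch of that pattern with made-up values:

    import urlparse

    url = 'http://www.fanfiction.net/s/5257563/1/'
    path = urlparse.urlparse(url).path.strip('/')      # 's/5257563/1'
    storyId = path.split('/')[1]                       # '5257563'
    chapter_url = 'http://www.fanfiction.net/s/%s/%s' % (storyId, '3')
    print chapter_url   # http://www.fanfiction.net/s/5257563/3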
301
fanficdownloader/fictionalley.py
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import logging
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import cookielib as cl
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time as time
|
||||
import datetime
|
||||
from adapter import *
|
||||
|
||||
|
||||
class FictionAlley(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
logging.debug('self.host=%s' % self.host)
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
cookieproc = u2.HTTPCookieProcessor()
|
||||
|
||||
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
|
||||
cookie = cl.Cookie(version=0, name='fauser', value='wizard',
|
||||
port=None, port_specified=False,
|
||||
domain='www.fictionalley.org', domain_specified=False, domain_initial_dot=False,
|
||||
path='/authors', path_specified=True,
|
||||
secure=False,
|
||||
expires=time.time()+10000,
|
||||
discard=False,
|
||||
comment=None,
|
||||
comment_url=None,
|
||||
rest={'HttpOnly': None},
|
||||
rfc2109=False)
|
||||
cookieproc.cookiejar.set_cookie(cookie)
|
||||
self.opener = u2.build_opener(cookieproc)
|
||||
|
||||
ss = self.path.split('/')
|
||||
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = ''
|
||||
self.authorURL = ''
|
||||
self.storyId = ''
|
||||
if len(ss) > 2 and ss[1] == 'authors':
|
||||
self.authorId = ss[2]
|
||||
self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
|
||||
if len(ss) > 3:
|
||||
self.storyId = ss[3].replace ('.html','')
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Harry Potter')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = 'Harry Potter'
|
||||
self.storyStatus = 'Unknown' # fictionalley doesn't give us in-progress/completed anywhere.
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.storyName = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-fa_'
|
||||
|
||||
def getPasswordLine(self):
|
||||
return 'opaopapassword'
|
||||
|
||||
def getLoginScript(self):
|
||||
return 'opaopaloginscript'
|
||||
|
||||
def getLoginPasswordOthers(self):
|
||||
login = dict(login = 'name', password = 'pass')
|
||||
other = dict(submit = 'Log In', remember='yes')
|
||||
return (login, other)
|
||||
|
||||
def _processChapterHeaders(self, div):
|
||||
brs = div.findAll ('br')
|
||||
for br in brs:
|
||||
keystr=''
|
||||
valstr=''
|
||||
if len(br.contents) > 2:
|
||||
keystr = br.contents[1]
|
||||
if keystr is not None:
|
||||
strs = re.split ("<[^>]+>", unicode(keystr))
|
||||
keystr=''
|
||||
for s in strs:
|
||||
keystr = keystr + s
|
||||
valstr = br.contents[2].strip(' ')
|
||||
if keystr is not None:
|
||||
if keystr == 'Rating:':
|
||||
self.storyRating = valstr
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif keystr == 'Genre:':
|
||||
self.genre = valstr
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = valstr.split(', ')
|
||||
for ss2 in s2:
|
||||
self.addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif keystr == 'Main Character(s):':
|
||||
s2 = valstr.split(', ')
|
||||
for ss2 in s2:
|
||||
self.addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif keystr == 'Summary:':
|
||||
self.storyDescription = valstr
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(self.url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
# There is some useful information in the headers of the first chapter page.
|
||||
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
|
||||
if breadcrumbs is not None:
|
||||
# Be aware that this means that the user has entered the {STORY}01.html
|
||||
# We will not have valid Published and Updated dates. User should enter
|
||||
# the {STORY}.html instead. We should force that instead of this.
|
||||
#logging.debug('breadcrumbs=%s' % breadcrumbs )
|
||||
bcas = breadcrumbs.findAll('a')
|
||||
#logging.debug('bcas=%s' % bcas )
|
||||
if bcas is not None and len(bcas) > 1:
|
||||
bca = bcas[1]
|
||||
#logging.debug('bca=%s' % bca )
|
||||
if 'href' in bca._getAttrMap():
|
||||
#logging.debug('bca.href=%s' % bca['href'] )
|
||||
url = unicode(bca['href'])
|
||||
if url is not None and len(url) > 0:
|
||||
self.url = url
|
||||
logging.debug('self.url=%s' % self.url )
|
||||
ss = self.url.split('/')
|
||||
self.storyId = ss[-1].replace('.html','')
|
||||
self.storyName = bca.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
|
||||
data = self.opener.open(self.url).read()
|
||||
|
||||
# There is some useful information in the headers of the first chapter page.
|
||||
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
# If it is decided that we really do care about number of words.. It's only available on the author's page..
|
||||
#d0 = self.opener.open(self.authorURL).read()
|
||||
#soupA = bs.BeautifulStoneSoup(d0)
|
||||
#dls = soupA.findAll('dl')
|
||||
#logging.debug('dls=%s' % dls)
|
||||
|
||||
# Get title from <title>, remove before '-'.
|
||||
if len(self.storyName) == 0:
|
||||
title = soup.find('title').string
|
||||
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
|
||||
|
||||
links = soup.findAll('li')
|
||||
|
||||
self.numChapters = 0;
|
||||
result = []
|
||||
if len(links) == 0:
|
||||
# Be aware that this means that the user has entered the {STORY}01.html
|
||||
# We will not have valid Published and Updated dates. User should enter
|
||||
# the {STORY}.html instead. We should force that instead of this.
|
||||
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
|
||||
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
|
||||
result.append((self.url,self.storyName))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
|
||||
self.numChapters = self.numChapters + 1;
|
||||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
else:
|
||||
author = soup.find('h1', {'class' : 'title'})
|
||||
self.authorName = author.a.string
|
||||
|
||||
summary = soup.find('div', {'class' : 'summary'})
|
||||
ss = summary.contents
|
||||
if len(ss) > 1:
|
||||
ss1 = ss[0].split(': ')
|
||||
if len(ss1) > 1 and ss1[0] == 'Rating':
|
||||
self.storyRating = ss1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
self.storyDescription = unicode(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
|
||||
for li in links:
|
||||
a = li.find('a', {'class' : 'chapterlink'})
|
||||
s = li.contents
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
title = a.string
|
||||
result.append((url,title))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
|
||||
if self.numChapters == 0:
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
d1 = self.opener.open(url).read()
|
||||
|
||||
# find <!-- headerstart --> & <!-- headerend --> and
|
||||
# replaced with matching div pair for easier parsing.
|
||||
# Yes, it's an evil kludge, but what can ya do? Using
|
||||
# something other than div prevents soup from pairing
|
||||
# our div with poor html inside the story text.
|
||||
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
sop = bs.BeautifulStoneSoup(d1)
|
||||
|
||||
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
|
||||
self.numChapters = self.numChapters + 1
|
||||
if len(s) > 1:
|
||||
datestr=''
|
||||
ss2 = s[1].replace('\n','').replace('(','').split(' ')
|
||||
if len(ss2) > 2 and ss2[0] == 'Posted:':
|
||||
datestr = ss2[1] + ' ' + ss2[2]
|
||||
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
|
||||
if self.numChapters == 1:
|
||||
self.storyPublished = tmpdate
|
||||
self.storyUpdated = tmpdate
|
||||
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
|
||||
else:
|
||||
logging.debug('li chapterlink not found! li=%s' % li)
|
||||
|
||||
|
||||
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
|
||||
# find <!-- headerend --> & <!-- footerstart --> and
|
||||
# replaced with matching div pair for easier parsing.
|
||||
# Yes, it's an evil kludge, but what can ya do? Using
|
||||
# something other than div prevents soup from pairing
|
||||
# our div with poor html inside the story text.
|
||||
data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
html = soup.findAll('html')
|
||||
if len(html) > 1:
|
||||
return html[1].__str__('utf8')
|
||||
else:
|
||||
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    host = up.urlparse(url).netloc
    fw = FictionAlley(url)
    # the adapter owns the opener; there is no module-level 'self' here
    data = fw.opener.open(url).read()
    urls = fw.extractIndividualUrls(data, host, url)
    pp.pprint(urls)
    # getText() expects a chapter URL, not the already-fetched page data
    if urls:
        print(fw.getText(urls[0][0]))
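
# --- Illustrative sketch (not part of the original file) ---------------
# The comment-marker kludge used by getText() above, shown in isolation:
# the site's <!-- headerend --> / <!-- footerstart --> comments are turned
# into a made-up element so BeautifulSoup can grab the story body as one
# node, and the placeholder is renamed back to 'div' on output.  Assumes
# BeautifulSoup 3, imported the same way as in the surrounding files.
import BeautifulSoup as bs

def extract_between_markers(raw_html):
    placeholder = 'crazytagstringnobodywouldstumbleonaccidently'
    wrapped = raw_html.replace(
        '<!-- headerend -->', '<%s id="storytext">' % placeholder).replace(
        '<!-- footerstart -->', '</%s>' % placeholder)
    soup = bs.BeautifulStoneSoup(wrapped)
    node = soup.find(placeholder, {'id' : 'storytext'})
    if node is None:
        return None
    # emit the captured fragment as a plain <div>
    return node.__str__('utf8').replace(placeholder, 'div')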
|
||||
257
fanficdownloader/ficwad.py
Normal file
|
|
@@ -0,0 +1,257 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import logging
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
class FicWad(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.host = up.urlparse(url).netloc
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-fw_'
|
||||
|
||||
def getPasswordLine(self):
|
||||
return 'opaopapassword'
|
||||
|
||||
def getLoginScript(self):
|
||||
return 'opaopaloginscript'
|
||||
|
||||
def getLoginPasswordOthers(self):
|
||||
login = dict(login = 'name', password = 'pass')
|
||||
other = dict(submit = 'Log In', remember='yes')
|
||||
return (login, other)
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
oldurl = ''
|
||||
cururl = self.url
|
||||
data = ''
|
||||
try:
|
||||
data = u2.urlopen(self.url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
story = soup.find('div', {'id' : 'story'})
|
||||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
# last of crumbtrail
|
||||
storyinfo = allAhrefs[-1]
|
||||
(u0, u1, storyid) = storyinfo['href'].split('/')
|
||||
if u1 == "story":
|
||||
            # This page does not have the correct information on it.  Need to get the Story Title Page.
|
||||
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
|
||||
oldurl = self.url
|
||||
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
|
||||
data = u2.urlopen(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
story = soup.find('div', {'id' : 'story'})
|
||||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
|
||||
# save chapter name from header in case of one-shot.
|
||||
storyinfo = story.find('h4').find('a')
|
||||
(u0, u1, self.storyId) = storyinfo['href'].split('/')
|
||||
self.storyName = storyinfo.string.strip()
|
||||
|
||||
logging.debug('self.storyName=%s, self.storyId=%s' % (self.storyName, self.storyId))
|
||||
|
||||
author = soup.find('span', {'class' : 'author'})
|
||||
self.authorName = unicode(author.a.string)
|
||||
(u0, u1,self.authorId) = author.a['href'].split('/')
|
||||
self.authorURL = 'http://' + self.host + author.a['href']
|
||||
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
description = soup.find('blockquote', {'class' : 'summary'})
|
||||
if description is not None:
|
||||
self.storyDescription = unicode(description.p.string)
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace('\n',' ').replace('\r',''))
|
||||
|
||||
meta = soup.find('p', {'class' : 'meta'})
|
||||
if meta is not None:
|
||||
logging.debug('meta.s pre=%s' % meta.__str__('utf8'))
|
||||
s = re.sub('<[^>]+>','',unicode(meta)).replace('\n',' ').replace('\t','').split(' - ')
|
||||
#logging.debug('meta.s post=%s' % s)
|
||||
for ss in s:
|
||||
                s1 = ss.replace('&nbsp;','').split(':')
|
||||
#logging.debug('ss=%s' % ss)
|
||||
if len(s1) > 1:
|
||||
skey = s1[0].strip()
|
||||
#logging.debug('Checking = %s' % skey)
|
||||
if skey == 'Category':
|
||||
# ficwad doesn't allow multiple categories.
|
||||
self.category = unicode(s1[1])
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self.addSubject(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Rating':
|
||||
self.storyRating = s1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif skey == 'Genres':
|
||||
self.genre = s1[1]
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Characters':
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif skey == 'Chapters':
|
||||
self.numChapters = s1[1]
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif skey == 'Warnings':
|
||||
logging.debug('Warnings=%s' % s1[1])
|
||||
elif skey == 'Published':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif skey == 'Updated':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
if ss == 'Complete' :
|
||||
self.storyStatus = 'Completed'
|
||||
elif ss.endswith('words'):
|
||||
self.numWords=ss.replace('words','').replace(' ','')
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
|
||||
logging.debug('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
result = []
|
||||
ii = 1
|
||||
|
||||
if oldurl is not None and len(oldurl) > 0:
|
||||
logging.debug('Switching back to %s' % oldurl)
|
||||
cururl = oldurl
|
||||
data = u2.urlopen(oldurl).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
storylist = soup.find('ul', {'id' : 'storylist'})
|
||||
if storylist is not None:
|
||||
allBlocked = storylist.findAll('li', {'class' : 'blocked'})
|
||||
if allBlocked is not None:
|
||||
#logging.debug('allBlocked=%s' % allBlocked)
|
||||
raise FailedToDownload("Are you sure %s is a chapter URL(not the chapter list)?"%cururl)
|
||||
raise LoginRequiredException(cururl)
|
||||
|
||||
allH4s = storylist.findAll('h4')
|
||||
#logging.debug('allH4s=%s' % allH4s)
|
||||
|
||||
if allH4s is not None:
|
||||
for h4 in allH4s:
|
||||
chapterinfo = h4.find('a')
|
||||
#logging.debug('Chapter1=%s' % chapterinfo)
|
||||
url = 'http://' + self.host + chapterinfo['href']
|
||||
title = chapterinfo.string.strip()
|
||||
#logging.debug('Chapter=%s, %s' % (url, title))
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
if ii == 1:
|
||||
select = soup.find('select', { 'name' : 'goto' } )
|
||||
|
||||
if select is None:
|
||||
self.numChapters = '1'
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
result.append((self.url,self.storyName))
|
||||
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = 'http://' + self.host + o['value']
|
||||
title = o.string
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = u2.urlopen(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() takes a chapter URL; use the first extracted chapter
    if urls:
        print(fw.getText(urls[0][0]))
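
# --- Illustrative sketch (not part of the original file) ---------------
# The ' - ' / 'key: value' splitting that extractIndividualUrls() applies
# to ficwad's meta line, reduced to a standalone helper.  The sample call
# in the trailing comment is made up.
import re

def parse_meta_line(meta_html):
    fields = {}
    flags = []
    text = re.sub('<[^>]+>', '', meta_html).replace('\n', ' ').replace('\t', '')
    for chunk in text.split(' - '):
        parts = chunk.replace('&nbsp;', '').split(':')
        if len(parts) > 1:
            fields[parts[0].strip()] = parts[1].strip()
        else:
            flags.append(chunk.strip())
    return fields, flags

# parse_meta_line('Category: Harry Potter - Rating: PG - Chapters: 3 - Complete')
# -> ({'Category': 'Harry Potter', 'Rating': 'PG', 'Chapters': '3'}, ['Complete'])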
|
||||
301
fanficdownloader/fpcom.py
Normal file
|
|
@@ -0,0 +1,301 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class FPCom(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
self.storyName = ''
|
||||
self.authorName = ''
|
||||
self.storyDescription = ''
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.authorId = '0'
|
||||
self.authorURL = self.path
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = ''
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-fpcom_'
|
||||
|
||||
if self.path.startswith('/'):
|
||||
self.path = self.path[1:]
|
||||
|
||||
spl = self.path.split('/')
|
||||
if spl is not None:
|
||||
if len(spl) > 0 and spl[0] != 's':
|
||||
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
|
||||
if len(spl) > 1:
|
||||
self.storyId = spl[1]
|
||||
if len(spl) > 2:
|
||||
                chapter = spl[2]  # spl[2] is the chapter segment of /s/<storyid>/<chapter>/...
|
||||
else:
|
||||
chapter = '1'
|
||||
if len(spl) == 5:
|
||||
self.path = "/".join(spl[1:-1])
|
||||
|
||||
if self.path.endswith('/'):
|
||||
self.path = self.path[:-1]
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
if not self.appEngine:
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
else:
|
||||
self.opener = None
|
||||
|
||||
logging.debug("Created FP.Com: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
|
||||
def _processInfoLine(self, line):
|
||||
have_lang = False
|
||||
words = line.split(' - ')
|
||||
if words is not None:
|
||||
for word in words:
|
||||
if word.find(':') != -1:
|
||||
sds = word.split(': ')
|
||||
if sds is not None and len(sds) > 1:
|
||||
if sds[0] == 'Updated':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
elif sds[0] == 'Published':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sds[1].strip(' '), "%m-%d-%y")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif sds[0] == 'Reviews':
|
||||
reviews = sds[1]
|
||||
logging.debug('reviews=%s' % reviews)
|
||||
elif word.find('Complete') != -1:
|
||||
self.storyStatus = 'Completed'
|
||||
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
||||
elif not have_lang:
|
||||
have_lang = True
|
||||
language = word
|
||||
logging.debug('language=%s' % language)
|
||||
else:
|
||||
self.category = word
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
sgs = self.category.split('/')
|
||||
for sg in sgs:
|
||||
self.addSubject(sg)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(self.url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
d2 = re.sub('&\#[0-9]+;', ' ', data)
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(d2)
|
||||
except:
|
||||
logging.error("Failed to decode: <%s>" % d2)
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
allA = soup.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
|
||||
self.authorName = a.string
|
||||
                # use throwaway names here so the urllib2 alias 'u2' is not clobbered
                (ua, ub, self.authorId, uc) = a['href'].split('/')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
urls = []
|
||||
|
||||
metas = soup.findAll ('meta', {'name' : 'description'})
|
||||
if metas is not None:
|
||||
for meta in metas:
|
||||
if 'content' in meta._getAttrMap():
|
||||
self.storyDescription = unicode(meta['content'])
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
|
||||
title=meta.find('title')
|
||||
logging.debug('title=%s' % title.string)
|
||||
tt = title.string.split(',')
|
||||
if tt is not None:
|
||||
if len(tt) > 0:
|
||||
self.storyName = tt[0]
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
if len(tt) > 1:
|
||||
tt1 = tt[1].split(' - ')
|
||||
if tt1 is not None and len(tt1) > 0:
|
||||
self.category = tt1[0].strip()
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
cc = self.category.split(' ')
|
||||
for cc1 in cc:
|
||||
if cc1 is not None and cc1 != 'a':
|
||||
if cc1 == 'fanfic':
|
||||
self.addSubject('FanFiction')
|
||||
else:
|
||||
self.addSubject(cc1)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
|
||||
|
||||
numchapters = 0
|
||||
urlstory = ''
|
||||
|
||||
fidochap = soup.find('form', {'name':'fidochap'})
|
||||
sl = fidochap.find('select', {'title':'chapter navigation'})
|
||||
if sl is not None:
|
||||
logging.debug('sl=%s' % sl )
|
||||
if 'onchange' in sl._getAttrMap():
|
||||
ocs = sl['onchange'].split('\'')
|
||||
logging.debug('ocs=%s' % ocs)
|
||||
if ocs is not None and len(ocs) > 3:
|
||||
urlstory = ocs[3]
|
||||
logging.debug('urlstory=%s' % urlstory)
|
||||
|
||||
opts = sl.findAll('option')
|
||||
for o in opts:
|
||||
if 'value' in o._getAttrMap():
|
||||
url = 'http://' + self.host + '/s/' + self.storyId + '/' + o['value'] + urlstory
|
||||
logging.debug('URL=%s, Title=%s' % (url, o.string))
|
||||
urls.append((url, o.string))
|
||||
numchapters = numchapters + 1
|
||||
|
||||
if numchapters == 0:
|
||||
numchapters = 1
|
||||
url = 'http://' + self.host + '/s/' + self.storyId + '/1' + urlstory
|
||||
logging.debug('URL=%s, Title=%s' % (url, self.storyName))
|
||||
urls.append((url, self.storyName))
|
||||
|
||||
self.numChapters = unicode(numchapters)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
logging.debug('urls=%s' % urls)
|
||||
|
||||
self.genre = ''
|
||||
tds = fidochap.findAll('td')
|
||||
for td in tds:
|
||||
tdb = td.find('b')
|
||||
if tdb is not None and tdb.string == self.storyName:
|
||||
tdas = td.findAll('a')
|
||||
for tda in tdas:
|
||||
ss = tda.string
|
||||
if ss is not None:
|
||||
if len(self.genre) > 0:
|
||||
self.genre = self.genre + ', '
|
||||
self.genre = self.genre + ss
|
||||
self.addSubject(ss)
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
tda = td.find ('a')
|
||||
if tda is not None and tda.string.find('Rated:') != -1:
|
||||
                tdas = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
|
||||
if tdas is not None:
|
||||
ll = len(tdas)
|
||||
if ll > 2:
|
||||
ss = tdas[2].split(': ')
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyRating = ss[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
if ll > 3:
|
||||
self._processInfoLine (tdas[3])
|
||||
if ll > 5:
|
||||
self._processInfoLine (tdas[5])
|
||||
|
||||
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
# time.sleep( 2.0 )
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
lines = data.split('\n')
|
||||
|
||||
textbuf = ''
|
||||
emit = False
|
||||
|
||||
olddata = data
|
||||
try:
|
||||
data = data.decode('utf8')
|
||||
except:
|
||||
data = olddata
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
class FPC_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testFictionPress(self):
|
||||
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
|
||||
f = FPCom(url)
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals('Behind This Facade', f.getStoryName())
|
||||
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
|
||||
|
||||
text = f.getText(url)
|
||||
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
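
# --- Illustrative sketch (not part of the original file) ---------------
# The strptime/mktime/fromtimestamp chain the adapters above use to turn
# site date strings into datetime objects, factored into one helper.
import time
import datetime

def parse_site_date(text, fmt):
    # e.g. parse_site_date('03-14-10', '%m-%d-%y')   (fictionpress style)
    #      parse_site_date('2010/03/14', '%Y/%m/%d') (ficwad style)
    return datetime.datetime.fromtimestamp(
        time.mktime(time.strptime(text.strip(' '), fmt)))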
|
||||
280
fanficdownloader/hpfiction.py
Normal file
|
|
@@ -0,0 +1,280 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class HPFiction(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
logging.debug('self.host=%s' % self.host)
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
|
||||
self.chapurl = False
|
||||
self.storyId = '0'
|
||||
|
||||
sss = self.url.split('?')
|
||||
logging.debug('sss=%s' % sss)
|
||||
if sss is not None and len(sss) > 1:
|
||||
sc = sss[1].split('=')
|
||||
logging.debug('sc=%s' % sc)
|
||||
if sc is not None and len(sc) > 1:
|
||||
if sc[0] == 'chapterid':
|
||||
self.chapurl = True
|
||||
elif sc[0] == 'psid' or sc[0] == 'sid':
|
||||
self.storyId = sc[1]
|
||||
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Harry Potter')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-hp_'
|
||||
|
||||
logging.debug("Created HPFiction: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(self.url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
links = soup.findAll('a')
|
||||
def_chapurl = ''
|
||||
def_chaptitle = ''
|
||||
|
||||
if self.chapurl:
|
||||
foundid = False
|
||||
for a in links:
|
||||
if a['href'].find('psid') != -1:
|
||||
sp = a['href'].split('?')
|
||||
if sp is not None and len(sp) > 1:
|
||||
for sp1 in sp:
|
||||
if sp1.find('psid') != -1:
|
||||
ps = sp1.split('=')
|
||||
if ps is not None and len(ps) > 1:
|
||||
self.storyId = ps[1].replace('\'','')
|
||||
foundid = True
|
||||
self.storyName = a.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
break
|
||||
if foundid:
|
||||
self.url = "http://" + self.host + "/viewstory.php?psid=" + self.storyId
|
||||
logging.debug('Title Page URL=%s' % self.url)
|
||||
data1 = self.opener.open(self.url).read()
|
||||
hdrsoup = bs.BeautifulSoup(data1)
|
||||
else:
|
||||
hdrsoup = soup
|
||||
else:
|
||||
hdrsoup = soup
|
||||
|
||||
for a in links:
|
||||
if not self.chapurl and a['href'].find('psid') != -1:
|
||||
sp = a['href'].split('?')
|
||||
if sp is not None and len(sp) > 1:
|
||||
for sp1 in sp:
|
||||
if sp1.find('psid') != -1:
|
||||
ps = sp1.split('=')
|
||||
if ps is not None and len(ps) > 1:
|
||||
self.storyId = ps[1].replace('\'','')
|
||||
self.storyName = a.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif a['href'].find('viewuser.php') != -1:
|
||||
self.authorName = a.string
|
||||
self.authorURL = 'http://' + self.host + '/' + a['href']
|
||||
(u1, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
|
||||
def_chapurl = 'http://' + self.host + '/viewstory.php' + unicode(a['href'])
|
||||
def_chaptitle = a.string
|
||||
logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
|
||||
|
||||
centers = hdrsoup.findAll('center')
|
||||
for center in centers:
|
||||
tds = center.findAll ('td')
|
||||
if tds is not None and len(tds) > 0:
|
||||
for td in tds:
|
||||
                    s = re.split ("<[^>]+>", unicode(td).replace('\n','').replace('&nbsp;',' '))
|
||||
ii = 0
|
||||
ll = len(s)
|
||||
sss = ''
|
||||
while ii < ll - 1:
|
||||
if s[ii] is not None and len(s[ii]) > 0:
|
||||
if s[ii] == 'Rating:':
|
||||
self.storyRating = s[ii+1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Chapters:':
|
||||
self.numChapters = s[ii+1]
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Characters:':
|
||||
s2 = s[ii+1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Genre(s):':
|
||||
self.genre = s[ii+1]
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = s[ii+1].split(', ')
|
||||
for ss2 in s2:
|
||||
self.addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Status:':
|
||||
if s[ii+1].strip(' ') == "Work In Progress":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'First Published:':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Last Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Last Published Chapter:':
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Pairings:':
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Warnings:':
|
||||
ii = ii + 2
|
||||
else:
|
||||
sss = sss + ' ' + s[ii]
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
self.storyDescription = sss
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
|
||||
urls = []
|
||||
|
||||
select = soup.find('select', {'name' : 'chapterid'})
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
if len(def_chapurl) > 0:
|
||||
urls.append((def_chapurl, def_chaptitle))
|
||||
else:
|
||||
urls.append((self.url,self.storyName))
|
||||
else:
|
||||
for o in select.findAll('option'):
|
||||
if 'value' in o._getAttrMap():
|
||||
url = 'http://' + self.host + self.path + o['value']
|
||||
title = o.string
|
||||
if title != "Story Index":
|
||||
urls.append((url,title))
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
logging.debug('Downloading from URL: %s' % url)
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
divtext = soup.find('div', {'id' : 'fluidtext'})
|
||||
if None == divtext:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return divtext.__str__('utf8')
|
||||
|
||||
|
||||
class FF_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testChaptersAuthStory(self):
|
||||
f = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals(49, len(urls))
|
||||
self.assertEquals('Elisha', f.getAuthorName())
|
||||
self.assertEquals('A Secret Thought', f.getStoryName())
|
||||
|
||||
def testGetText(self):
|
||||
url = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'
|
||||
f = HPFiction(url)
|
||||
#urls = f.extractIndividualUrls()
|
||||
text = f.getText(url)
|
||||
self.assertTrue(text.find('She pulled out of his arms and felt the subtle regret') != -1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
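
# --- Illustrative sketch (not part of the original file) ---------------
# The tag-stripping token walk that extractIndividualUrls() performs on
# each header-table cell, reduced to a helper.  Labels and values are
# assumed to alternate, as they do on the site; the sample call is made up.
import re

def walk_labelled_cell(td_html, labels):
    found = {}
    tokens = re.split("<[^>]+>", td_html.replace('\n', ''))
    ii = 0
    while ii < len(tokens) - 1:
        tok = tokens[ii].strip()
        if tok in labels:
            found[tok] = tokens[ii + 1].strip()
            ii = ii + 2
        else:
            ii = ii + 1
    return found

# walk_labelled_cell('<b>Rating:</b> PG-13 <b>Chapters:</b> 12',
#                    ['Rating:', 'Chapters:'])
# -> {'Rating:': 'PG-13', 'Chapters:': '12'}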
|
||||
|
||||
|
||||
126
fanficdownloader/html.py
Normal file
|
|
@@ -0,0 +1,126 @@
|
|||
#!/usr/bin/python
|
||||
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
|
||||
|
||||
import re
|
||||
import sys
|
||||
import StringIO
|
||||
import urllib
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
class HtmlProcessor:
|
||||
WHITESPACE_RE = re.compile(r'\s')
|
||||
# Look for </blockquote <p>
|
||||
BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)
|
||||
|
||||
def __init__(self, html, unfill=0):
|
||||
self.unfill = unfill
|
||||
html = self._ProcessRawHtml(html)
|
||||
self._soup = BeautifulSoup(html)
|
||||
if self._soup.title:
|
||||
self.title = self._soup.title.contents[0]
|
||||
else:
|
||||
self.title = None
|
||||
|
||||
def _ProcessRawHtml(self, html):
|
||||
new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
|
||||
if count:
|
||||
print >>sys.stderr, 'Replaced %d bad tags' % count
|
||||
return new_html
|
||||
|
||||
def _StubInternalAnchors(self):
|
||||
'''Replace each internal anchor with a fixed-size filepos anchor.
|
||||
|
||||
Looks for every anchor with <a href="#myanchor"> and replaces that
|
||||
with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
|
||||
self._anchor_references = []
|
||||
anchor_num = 0
|
||||
# anchor links
|
||||
anchorlist = self._soup.findAll('a', href=re.compile('^#'))
|
||||
# treat reference tags like a tags for TOCTOP.
|
||||
anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
|
||||
for anchor in anchorlist:
|
||||
self._anchor_references.append((anchor_num, anchor['href']))
|
||||
del anchor['href']
|
||||
anchor['filepos'] = '%.10d' % anchor_num
|
||||
anchor_num += 1
|
||||
|
||||
def _ReplaceAnchorStubs(self):
|
||||
# TODO: Browsers allow extra whitespace in the href names.
|
||||
# use __str__ instead of prettify--it inserts extra spaces.
|
||||
assembled_text = self._soup.__str__('utf8')
|
||||
del self._soup # shouldn't touch this anymore
|
||||
for anchor_num, original_ref in self._anchor_references:
|
||||
ref = urllib.unquote(original_ref[1:]) # remove leading '#'
|
||||
# Find the position of ref in the utf-8 document.
|
||||
# TODO(chatham): Using regexes and looking for name= would be better.
|
||||
newpos = assembled_text.rfind(ref.encode('utf-8'))
|
||||
if newpos == -1:
|
||||
print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
|
||||
continue
|
||||
newpos += len(ref) + 2 # don't point into the middle of the <a name> tag
|
||||
old_filepos = 'filepos="%.10d"' % anchor_num
|
||||
new_filepos = 'filepos="%.10d"' % newpos
|
||||
assert assembled_text.find(old_filepos) != -1
|
||||
assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
|
||||
return assembled_text
|
||||
|
||||
def _FixPreTags(self):
|
||||
'''Replace <pre> tags with HTML-ified text.'''
|
||||
pres = self._soup.findAll('pre')
|
||||
for pre in pres:
|
||||
pre.replaceWith(self._FixPreContents(str(pre.contents[0])))
|
||||
|
||||
def _FixPreContents(self, text):
|
||||
if self.unfill:
|
||||
line_splitter = '\n\n'
|
||||
line_joiner = '<p>'
|
||||
else:
|
||||
line_splitter = '\n'
|
||||
line_joiner = '<br>'
|
||||
lines = []
|
||||
for line in text.split(line_splitter):
|
||||
lines.append(self.WHITESPACE_RE.subn(' ', line)[0])
|
||||
return line_joiner.join(lines)
|
||||
|
||||
def _RemoveUnsupported(self):
|
||||
'''Remove any tags which the kindle cannot handle.'''
|
||||
# TODO(chatham): <link> tags to script?
|
||||
unsupported_tags = ('script', 'style')
|
||||
for tag_type in unsupported_tags:
|
||||
for element in self._soup.findAll(tag_type):
|
||||
element.extract()
|
||||
|
||||
def RenameAnchors(self, prefix):
|
||||
'''Rename every internal anchor to have the given prefix, then
|
||||
return the contents of the body tag.'''
|
||||
for anchor in self._soup.findAll('a', href=re.compile('^#')):
|
||||
anchor['href'] = '#' + prefix + anchor['href'][1:]
|
||||
for a in self._soup.findAll('a'):
|
||||
if a.get('name'):
|
||||
a['name'] = prefix + a['name']
|
||||
|
||||
# TODO(chatham): figure out how to fix this. sometimes body comes out
|
||||
# as NoneType.
|
||||
content = []
|
||||
if self._soup.body is not None:
|
||||
content = [unicode(c) for c in self._soup.body.contents]
|
||||
return '\n'.join(content)
|
||||
|
||||
def CleanHtml(self):
|
||||
# TODO(chatham): fix_html_br, fix_html
|
||||
self._RemoveUnsupported()
|
||||
self._StubInternalAnchors()
|
||||
self._FixPreTags()
|
||||
return self._ReplaceAnchorStubs()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
FILE ='/tmp/documentation.html'
|
||||
#FILE = '/tmp/multipre.html'
|
||||
FILE = '/tmp/view.html'
|
||||
import codecs
|
||||
d = open(FILE).read()
|
||||
h = HtmlProcessor(d)
|
||||
s = h.CleanHtml()
|
||||
#print s
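
    # --- Illustrative usage sketch (not part of the original file) -----
    # How HtmlProcessor is meant to be driven; the input HTML is made up.
    sample = ('<html><head><title>T</title></head><body>'
              '<a href="#ch2">jump</a><a name="ch2">Chapter 2</a></body></html>')
    proc = HtmlProcessor(sample)
    cleaned = proc.CleanHtml()  # internal anchors now carry filepos="..." offsets
    print cleaned[:80]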
|
||||
452
fanficdownloader/html2text.py
Normal file
|
|
@@ -0,0 +1,452 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
||||
__version__ = "2.37"
|
||||
__author__ = "Aaron Swartz (me@aaronsw.com)"
|
||||
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
|
||||
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
|
||||
|
||||
# TODO:
|
||||
# Support decoded entities with unifiable.
|
||||
|
||||
if not hasattr(__builtins__, 'True'): True, False = 1, 0
|
||||
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
|
||||
import sgmllib
|
||||
import urlparse
|
||||
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||
|
||||
try: from textwrap import wrap
|
||||
except: pass
|
||||
|
||||
# Use Unicode characters instead of their ascii pseudo-replacements
|
||||
UNICODE_SNOB = 0
|
||||
|
||||
# Put the links after each paragraph instead of at the end.
|
||||
LINKS_EACH_PARAGRAPH = 0
|
||||
|
||||
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
|
||||
BODY_WIDTH = 78
|
||||
|
||||
# Don't show internal links (href="#local-anchor") -- corresponding link targets
|
||||
# won't be visible in the plain text file anyway.
|
||||
SKIP_INTERNAL_LINKS = False
|
||||
|
||||
### Entity Nonsense ###
|
||||
|
||||
def name2cp(k):
|
||||
if k == 'apos': return ord("'")
|
||||
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
|
||||
return htmlentitydefs.name2codepoint[k]
|
||||
else:
|
||||
k = htmlentitydefs.entitydefs[k]
|
||||
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
|
||||
return ord(codecs.latin_1_decode(k)[0])
|
||||
|
||||
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
|
||||
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
|
||||
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
|
||||
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
|
||||
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
|
||||
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
|
||||
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
|
||||
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
|
||||
|
||||
unifiable_n = {}
|
||||
|
||||
for k in unifiable.keys():
|
||||
unifiable_n[name2cp(k)] = unifiable[k]
|
||||
|
||||
def charref(name):
|
||||
if name[0] in ['x','X']:
|
||||
c = int(name[1:], 16)
|
||||
else:
|
||||
c = int(name)
|
||||
|
||||
if not UNICODE_SNOB and c in unifiable_n.keys():
|
||||
return unifiable_n[c]
|
||||
else:
|
||||
return unichr(c)
|
||||
|
||||
def entityref(c):
|
||||
if not UNICODE_SNOB and c in unifiable.keys():
|
||||
return unifiable[c]
|
||||
else:
|
||||
try: name2cp(c)
|
||||
except KeyError: return "&" + c
|
||||
else: return unichr(name2cp(c))
|
||||
|
||||
def replaceEntities(s):
|
||||
s = s.group(1)
|
||||
if s[0] == "#":
|
||||
return charref(s[1:])
|
||||
else: return entityref(s)
|
||||
|
||||
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
|
||||
def unescape(s):
|
||||
return r_unescape.sub(replaceEntities, s)
|
||||
|
||||
def fixattrs(attrs):
|
||||
# Fix bug in sgmllib.py
|
||||
if not attrs: return attrs
|
||||
newattrs = []
|
||||
for attr in attrs:
|
||||
newattrs.append((attr[0], unescape(attr[1])))
|
||||
return newattrs
|
||||
|
||||
### End Entity Nonsense ###
|
||||
|
||||
def onlywhite(line):
|
||||
"""Return true if the line does only consist of whitespace characters."""
|
||||
for c in line:
|
||||
if c is not ' ' and c is not ' ':
|
||||
return c is ' '
|
||||
return line
|
||||
|
||||
def optwrap(text):
|
||||
"""Wrap all paragraphs in the provided text."""
|
||||
if not BODY_WIDTH:
|
||||
return text
|
||||
|
||||
assert wrap, "Requires Python 2.3."
|
||||
result = ''
|
||||
newlines = 0
|
||||
for para in text.split("\n"):
|
||||
if len(para) > 0:
|
||||
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
|
||||
for line in wrap(para, BODY_WIDTH):
|
||||
result += line + "\n"
|
||||
result += "\n"
|
||||
newlines = 2
|
||||
else:
|
||||
if not onlywhite(para):
|
||||
result += para + "\n"
|
||||
newlines = 1
|
||||
else:
|
||||
if newlines < 2:
|
||||
result += "\n"
|
||||
newlines += 1
|
||||
return result
|
||||
|
||||
def hn(tag):
|
||||
if tag[0] == 'h' and len(tag) == 2:
|
||||
try:
|
||||
n = int(tag[1])
|
||||
if n in range(1, 10): return n
|
||||
except ValueError: return 0
|
||||
|
||||
class _html2text(sgmllib.SGMLParser):
|
||||
def __init__(self, out=None, baseurl=''):
|
||||
sgmllib.SGMLParser.__init__(self)
|
||||
|
||||
if out is None: self.out = self.outtextf
|
||||
else: self.out = out
|
||||
self.outtext = u''
|
||||
self.quiet = 0
|
||||
self.p_p = 0
|
||||
self.outcount = 0
|
||||
self.start = 1
|
||||
self.space = 0
|
||||
self.a = []
|
||||
self.astack = []
|
||||
self.acount = 0
|
||||
self.list = []
|
||||
self.blockquote = 0
|
||||
self.pre = 0
|
||||
self.startpre = 0
|
||||
self.lastWasNL = 0
|
||||
self.abbr_title = None # current abbreviation definition
|
||||
self.abbr_data = None # last inner HTML (for abbr being defined)
|
||||
self.abbr_list = {} # stack of abbreviations to write later
|
||||
self.baseurl = baseurl
|
||||
|
||||
def outtextf(self, s):
|
||||
self.outtext += s
|
||||
|
||||
def close(self):
|
||||
sgmllib.SGMLParser.close(self)
|
||||
|
||||
self.pbr()
|
||||
self.o('', 0, 'end')
|
||||
|
||||
return self.outtext
|
||||
|
||||
def handle_charref(self, c):
|
||||
self.o(charref(c))
|
||||
|
||||
def handle_entityref(self, c):
|
||||
self.o(entityref(c))
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
self.handle_tag(tag, attrs, 1)
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
self.handle_tag(tag, None, 0)
|
||||
|
||||
def previousIndex(self, attrs):
|
||||
""" returns the index of certain set of attributes (of a link) in the
|
||||
self.a list
|
||||
|
||||
If the set of attributes is not found, returns None
|
||||
"""
|
||||
if not attrs.has_key('href'): return None
|
||||
|
||||
i = -1
|
||||
for a in self.a:
|
||||
i += 1
|
||||
match = 0
|
||||
|
||||
if a.has_key('href') and a['href'] == attrs['href']:
|
||||
if a.has_key('title') or attrs.has_key('title'):
|
||||
if (a.has_key('title') and attrs.has_key('title') and
|
||||
a['title'] == attrs['title']):
|
||||
match = True
|
||||
else:
|
||||
match = True
|
||||
|
||||
if match: return i
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
attrs = fixattrs(attrs)
|
||||
|
||||
if hn(tag):
|
||||
self.p()
|
||||
if start: self.o(hn(tag)*"#" + ' ')
|
||||
|
||||
if tag in ['p', 'div']: self.p()
|
||||
|
||||
if tag == "br" and start: self.o(" \n")
|
||||
|
||||
if tag == "hr" and start:
|
||||
self.p()
|
||||
self.o("* * *")
|
||||
self.p()
|
||||
|
||||
if tag in ["head", "style", 'script']:
|
||||
if start: self.quiet += 1
|
||||
else: self.quiet -= 1
|
||||
|
||||
if tag in ["body"]:
|
||||
self.quiet = 0 # sites like 9rules.com never close <head>
|
||||
|
||||
if tag == "blockquote":
|
||||
if start:
|
||||
self.p(); self.o('> ', 0, 1); self.start = 1
|
||||
self.blockquote += 1
|
||||
else:
|
||||
self.blockquote -= 1
|
||||
self.p()
|
||||
|
||||
if tag in ['em', 'i', 'u']: self.o("_")
|
||||
if tag in ['strong', 'b']: self.o("**")
|
||||
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
|
||||
if tag == "abbr":
|
||||
if start:
|
||||
attrsD = {}
|
||||
for (x, y) in attrs: attrsD[x] = y
|
||||
attrs = attrsD
|
||||
|
||||
self.abbr_title = None
|
||||
self.abbr_data = ''
|
||||
if attrs.has_key('title'):
|
||||
self.abbr_title = attrs['title']
|
||||
else:
|
||||
if self.abbr_title != None:
|
||||
self.abbr_list[self.abbr_data] = self.abbr_title
|
||||
self.abbr_title = None
|
||||
self.abbr_data = ''
|
||||
|
||||
if tag == "a":
|
||||
if start:
|
||||
attrsD = {}
|
||||
for (x, y) in attrs: attrsD[x] = y
|
||||
attrs = attrsD
|
||||
if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
|
||||
self.astack.append(attrs)
|
||||
self.o("[")
|
||||
else:
|
||||
self.astack.append(None)
|
||||
else:
|
||||
if self.astack:
|
||||
a = self.astack.pop()
|
||||
if a:
|
||||
i = self.previousIndex(a)
|
||||
if i is not None:
|
||||
a = self.a[i]
|
||||
else:
|
||||
self.acount += 1
|
||||
a['count'] = self.acount
|
||||
a['outcount'] = self.outcount
|
||||
self.a.append(a)
|
||||
self.o("][" + `a['count']` + "]")
|
||||
|
||||
if tag == "img" and start:
|
||||
attrsD = {}
|
||||
for (x, y) in attrs: attrsD[x] = y
|
||||
attrs = attrsD
|
||||
if attrs.has_key('src'):
|
||||
attrs['href'] = attrs['src']
|
||||
alt = attrs.get('alt', '')
|
||||
i = self.previousIndex(attrs)
|
||||
if i is not None:
|
||||
attrs = self.a[i]
|
||||
else:
|
||||
self.acount += 1
|
||||
attrs['count'] = self.acount
|
||||
attrs['outcount'] = self.outcount
|
||||
self.a.append(attrs)
|
||||
self.o("![")
|
||||
self.o(alt)
|
||||
self.o("]["+`attrs['count']`+"]")
|
||||
|
||||
if tag == 'dl' and start: self.p()
|
||||
if tag == 'dt' and not start: self.pbr()
|
||||
if tag == 'dd' and start: self.o(' ')
|
||||
if tag == 'dd' and not start: self.pbr()
|
||||
|
||||
if tag in ["ol", "ul"]:
|
||||
if start:
|
||||
self.list.append({'name':tag, 'num':0})
|
||||
else:
|
||||
if self.list: self.list.pop()
|
||||
|
||||
self.p()
|
||||
|
||||
if tag == 'li':
|
||||
if start:
|
||||
self.pbr()
|
||||
if self.list: li = self.list[-1]
|
||||
else: li = {'name':'ul', 'num':0}
|
||||
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
|
||||
if li['name'] == "ul": self.o("* ")
|
||||
elif li['name'] == "ol":
|
||||
li['num'] += 1
|
||||
self.o(`li['num']`+". ")
|
||||
self.start = 1
|
||||
else:
|
||||
self.pbr()
|
||||
|
||||
if tag in ["table", "tr"] and start: self.p()
|
||||
if tag == 'td': self.pbr()
|
||||
|
||||
if tag == "pre":
|
||||
if start:
|
||||
self.startpre = 1
|
||||
self.pre = 1
|
||||
else:
|
||||
self.pre = 0
|
||||
self.p()
|
||||
|
||||
def pbr(self):
|
||||
if self.p_p == 0: self.p_p = 1
|
||||
|
||||
def p(self): self.p_p = 2
|
||||
|
||||
def o(self, data, puredata=0, force=0):
|
||||
if self.abbr_data is not None: self.abbr_data += data
|
||||
|
||||
if not self.quiet:
|
||||
if puredata and not self.pre:
|
||||
data = re.sub('\s+', ' ', data)
|
||||
if data and data[0] == ' ':
|
||||
self.space = 1
|
||||
data = data[1:]
|
||||
if not data and not force: return
|
||||
|
||||
if self.startpre:
|
||||
#self.out(" :") #TODO: not output when already one there
|
||||
self.startpre = 0
|
||||
|
||||
bq = (">" * self.blockquote)
|
||||
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
|
||||
|
||||
if self.pre:
|
||||
bq += " "
|
||||
data = data.replace("\n", "\n"+bq)
|
||||
|
||||
if self.start:
|
||||
self.space = 0
|
||||
self.p_p = 0
|
||||
self.start = 0
|
||||
|
||||
if force == 'end':
|
||||
# It's the end.
|
||||
self.p_p = 0
|
||||
self.out("\n")
|
||||
self.space = 0
|
||||
|
||||
|
||||
if self.p_p:
|
||||
self.out(('\n'+bq)*self.p_p)
|
||||
self.space = 0
|
||||
|
||||
if self.space:
|
||||
if not self.lastWasNL: self.out(' ')
|
||||
self.space = 0
|
||||
|
||||
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
|
||||
if force == "end": self.out("\n")
|
||||
|
||||
newa = []
|
||||
for link in self.a:
|
||||
if self.outcount > link['outcount']:
|
||||
self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
|
||||
if link.has_key('title'): self.out(" ("+link['title']+")")
|
||||
self.out("\n")
|
||||
else:
|
||||
newa.append(link)
|
||||
|
||||
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
|
||||
|
||||
self.a = newa
|
||||
|
||||
if self.abbr_list and force == "end":
|
||||
for abbr, definition in self.abbr_list.items():
|
||||
self.out(" *[" + abbr + "]: " + definition + "\n")
|
||||
|
||||
self.p_p = 0
|
||||
self.out(data)
|
||||
self.lastWasNL = data and data[-1] == '\n'
|
||||
self.outcount += 1
|
||||
|
||||
def handle_data(self, data):
|
||||
if r'\/script>' in data: self.quiet -= 1
|
||||
self.o(data, 1)
|
||||
|
||||
def unknown_decl(self, data): pass
|
||||
|
||||
def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
|
||||
|
||||
def html2text_file(html, out=wrapwrite, baseurl=''):
|
||||
h = _html2text(out, baseurl)
|
||||
h.feed(html)
|
||||
h.feed("")
|
||||
return h.close()
|
||||
|
||||
def html2text(html, baseurl=''):
|
||||
return optwrap(html2text_file(html, None, baseurl))
|
||||
|
||||
if __name__ == "__main__":
|
||||
baseurl = ''
|
||||
if sys.argv[1:]:
|
||||
arg = sys.argv[1]
|
||||
if arg.startswith('http://'):
|
||||
baseurl = arg
|
||||
j = urllib.urlopen(baseurl)
|
||||
try:
|
||||
from feedparser import _getCharacterEncoding as enc
|
||||
except ImportError:
|
||||
enc = lambda x, y: ('utf-8', 1)
|
||||
text = j.read()
|
||||
encoding = enc(j.headers, text)[0]
|
||||
if encoding == 'us-ascii': encoding = 'utf-8'
|
||||
data = text.decode(encoding)
|
||||
|
||||
else:
|
||||
encoding = 'utf8'
|
||||
if len(sys.argv) > 2:
|
||||
encoding = sys.argv[2]
|
||||
data = open(arg, 'r').read().decode(encoding)
|
||||
else:
|
||||
data = sys.stdin.read().decode('utf8')
|
||||
wrapwrite(html2text(data, baseurl))
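
# --- Illustrative usage sketch (not part of the original file) ---------
# The fanficdownloader text output presumably drives html2text() with
# captured story HTML; a minimal example of the conversion:
def _demo_html2text():
    sample = '<h1>Chapter 1</h1><p>It was a <b>dark</b> and stormy night.</p>'
    # yields, roughly:
    #   # Chapter 1
    #
    #   It was a **dark** and stormy night.
    return html2text(sample)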
|
||||
19
fanficdownloader/html_constants.py
Normal file
|
|
@@ -0,0 +1,19 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${title} by ${author}</title>
|
||||
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<h1>${title} by ${author}</h1>
|
||||
${body}
|
||||
</body></html>
|
||||
'''
|
||||
|
||||
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''
|
||||
|
||||
XHTML_END = ''''''
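
# --- Illustrative sketch (not part of the original file) ---------------
# The ${title}/${author}/${body} placeholders above look like
# string.Template markers; whether the writer code actually expands them
# this way is an assumption here.
from string import Template

def render_xhtml(title, author, body_html):
    return Template(XHTML_START).substitute(
        title=title, author=author, body=body_html)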
|
||||
448
fanficdownloader/htmlcleanup.py
Normal file
|
|
@@ -0,0 +1,448 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
|
||||
def _unirepl(match):
|
||||
"Return the unicode string for a decimal number"
|
||||
if match.group(1)=='x':
|
||||
radix=16
|
||||
else:
|
||||
radix=10
|
||||
value = int(match.group(2), radix )
|
||||
return unichr(value)
|
||||
|
||||
def _replaceNumberEntities(data):
|
||||
p = re.compile(r'&#(x?)(\d+);')
|
||||
return p.sub(_unirepl, data)
|
||||
|
||||
def _replaceNotEntities(data):
|
||||
# not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
|
||||
# (or equiv), SGMLParser, entityref
|
||||
p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
|
||||
return p.sub(r'&\1', data)
|
||||
|
||||
def stripHTML(soup):
|
||||
return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()
|
||||
|
||||
def conditionalRemoveEntities(value):
|
||||
if isinstance(value,str) or isinstance(value,unicode) :
|
||||
return removeEntities(value.strip())
|
||||
else:
|
||||
return value
|
||||
|
||||
def removeAllEntities(text):
|
||||
    # Remove &lt; &gt; and &amp; as well.
    return removeEntities(text).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
|
||||
|
||||
def removeEntities(text):
|
||||
|
||||
# replace numeric versions of [&<>] with named versions,
|
||||
# then replace named versions with actual characters,
|
||||
|
||||
if text is None:
|
||||
return ""
|
||||
if not (isinstance(text,str) or isinstance(text,unicode)):
|
||||
return str(text)
|
||||
|
||||
try:
|
||||
t = text.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
try:
|
||||
t = text.encode ('ascii', 'xmlcharrefreplace')
|
||||
except UnicodeEncodeError, e:
|
||||
t = text
|
||||
text = t
|
||||
    text = re.sub(r'&#0*38;','&amp;',text)
    text = re.sub(r'&#0*60;','&lt;',text)
    text = re.sub(r'&#0*62;','&gt;',text)
|
||||
|
||||
    # replace remaining &#nnnn; entities with their unicode value, such as &#39; -> '
|
||||
text = _replaceNumberEntities(text)
|
||||
|
||||
    # replace several named entities with their character, such as &mdash; -> -
|
||||
# see constants.py for the list.
|
||||
# reverse sort will put entities with ; before the same one without, when valid.
|
||||
for e in reversed(sorted(entities.keys())):
|
||||
v = entities[e]
|
||||
try:
|
||||
text = text.replace(e, v)
|
||||
except UnicodeDecodeError, ex:
|
||||
# for the pound symbol in constants.py
|
||||
text = text.replace(e, v.decode('utf-8'))
|
||||
|
||||
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
|
||||
# entities terribly well and inserts (;) after something that
|
||||
    # it thinks might be an entity.  AT&T becomes AT&amp;T; All of my
|
||||
# attempts to fix this by changing the input to
|
||||
# BeautifulStoneSoup break something else instead. But at
|
||||
# this point, there should be *no* real entities left, so find
|
||||
# these not-entities and removing them here should be safe.
|
||||
text = _replaceNotEntities(text)
|
||||
|
||||
    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
    return text.replace('&', '&amp;').replace('&amp;lt', '&lt;').replace('&amp;gt', '&gt;')
|
||||
|
||||
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
|
||||
entities = { 'á' : 'á',
|
||||
'Á' : 'Á',
|
||||
'Á' : 'Á',
|
||||
'á' : 'á',
|
||||
'â' : 'â',
|
||||
'Â' : 'Â',
|
||||
'Â' : 'Â',
|
||||
'â' : 'â',
|
||||
'´' : '´',
|
||||
'´' : '´',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'Æ' : 'Æ',
|
||||
'æ' : 'æ',
|
||||
'à' : 'à',
|
||||
'À' : 'À',
|
||||
'À' : 'À',
|
||||
'à' : 'à',
|
||||
'ℵ' : 'ℵ',
|
||||
'α' : 'α',
|
||||
'Α' : 'Α',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'&' : '&',
|
||||
'∧' : '∧',
|
||||
'∠' : '∠',
|
||||
'å' : 'å',
|
||||
'Å' : 'Å',
|
||||
'Å' : 'Å',
|
||||
'å' : 'å',
|
||||
'≈' : '≈',
|
||||
'ã' : 'ã',
|
||||
'Ã' : 'Ã',
|
||||
'Ã' : 'Ã',
|
||||
'ã' : 'ã',
|
||||
'ä' : 'ä',
|
||||
'Ä' : 'Ä',
|
||||
'Ä' : 'Ä',
|
||||
'ä' : 'ä',
|
||||
'„' : '„',
|
||||
'β' : 'β',
|
||||
'Β' : 'Β',
|
||||
'¦' : '¦',
|
||||
'¦' : '¦',
|
||||
'•' : '•',
|
||||
'∩' : '∩',
|
||||
'ç' : 'ç',
|
||||
'Ç' : 'Ç',
|
||||
'Ç' : 'Ç',
|
||||
'ç' : 'ç',
|
||||
'¸' : '¸',
|
||||
'¸' : '¸',
|
||||
'¢' : '¢',
|
||||
'¢' : '¢',
|
||||
'χ' : 'χ',
|
||||
'Χ' : 'Χ',
|
||||
'ˆ' : 'ˆ',
|
||||
'♣' : '♣',
|
||||
'≅' : '≅',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'©' : '©',
|
||||
'↵' : '↵',
|
||||
'∪' : '∪',
|
||||
'¤' : '¤',
|
||||
'¤' : '¤',
|
||||
'†' : '†',
|
||||
'‡' : '‡',
|
||||
'↓' : '↓',
|
||||
'⇓' : '⇓',
|
||||
'°' : '°',
|
||||
'°' : '°',
|
||||
'δ' : 'δ',
|
||||
'Δ' : 'Δ',
|
||||
'♦' : '♦',
|
||||
'÷' : '÷',
|
||||
'÷' : '÷',
|
||||
'é' : 'é',
|
||||
'É' : 'É',
|
||||
'É' : 'É',
|
||||
'é' : 'é',
|
||||
'ê' : 'ê',
|
||||
'Ê' : 'Ê',
|
||||
'Ê' : 'Ê',
|
||||
'ê' : 'ê',
|
||||
'è' : 'è',
|
||||
'È' : 'È',
|
||||
'È' : 'È',
|
||||
'è' : 'è',
|
||||
'∅' : '∅',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'ε' : 'ε',
|
||||
'Ε' : 'Ε',
|
||||
'≡' : '≡',
|
||||
'η' : 'η',
|
||||
'Η' : 'Η',
|
||||
'ð' : 'ð',
|
||||
'Ð' : 'Ð',
|
||||
'Ð' : 'Ð',
|
||||
'ð' : 'ð',
|
||||
'ë' : 'ë',
|
||||
'Ë' : 'Ë',
|
||||
'Ë' : 'Ë',
|
||||
'ë' : 'ë',
|
||||
'€' : '€',
|
||||
'∃' : '∃',
|
||||
'ƒ' : 'ƒ',
|
||||
'∀' : '∀',
|
||||
'½' : '½',
|
||||
'½' : '½',
|
||||
'¼' : '¼',
|
||||
'¼' : '¼',
|
||||
'¾' : '¾',
|
||||
'¾' : '¾',
|
||||
'⁄' : '⁄',
|
||||
'γ' : 'γ',
|
||||
'Γ' : 'Γ',
|
||||
'≥' : '≥',
|
||||
#'>' : '>',
|
||||
#'>' : '>',
|
||||
#'>' : '>',
|
||||
#'>' : '>',
|
||||
'↔' : '↔',
|
||||
'⇔' : '⇔',
|
||||
'♥' : '♥',
|
||||
'…' : '…',
|
||||
'í' : 'í',
|
||||
'Í' : 'Í',
|
||||
'Í' : 'Í',
|
||||
'í' : 'í',
|
||||
'î' : 'î',
|
||||
'Î' : 'Î',
|
||||
'Î' : 'Î',
|
||||
'î' : 'î',
|
||||
'¡' : '¡',
|
||||
'¡' : '¡',
|
||||
'ì' : 'ì',
|
||||
'Ì' : 'Ì',
|
||||
'Ì' : 'Ì',
|
||||
'ì' : 'ì',
|
||||
'ℑ' : 'ℑ',
|
||||
'∞' : '∞',
|
||||
'∫' : '∫',
|
||||
'ι' : 'ι',
|
||||
'Ι' : 'Ι',
|
||||
'¿' : '¿',
|
||||
'¿' : '¿',
|
||||
'∈' : '∈',
|
||||
'ï' : 'ï',
|
||||
'Ï' : 'Ï',
|
||||
'Ï' : 'Ï',
|
||||
'ï' : 'ï',
|
||||
'κ' : 'κ',
|
||||
'Κ' : 'Κ',
|
||||
'λ' : 'λ',
|
||||
'Λ' : 'Λ',
|
||||
'«' : '«',
|
||||
'«' : '«',
|
||||
'←' : '←',
|
||||
'⇐' : '⇐',
|
||||
'⌈' : '⌈',
|
||||
'“' : '“',
|
||||
'≤' : '≤',
|
||||
'⌊' : '⌊',
|
||||
'∗' : '∗',
|
||||
'◊' : '◊',
|
||||
'‎' : '',
|
||||
'‹' : '‹',
|
||||
'‘' : '‘',
|
||||
#'<' : '<',
|
||||
#'<' : '<',
|
||||
#'<' : '<',
|
||||
#'<' : '<',
|
||||
'¯' : '¯',
|
||||
'¯' : '¯',
|
||||
'—' : '—',
|
||||
'µ' : 'µ',
|
||||
'µ' : 'µ',
|
||||
'·' : '·',
|
||||
'·' : '·',
|
||||
'−' : '−',
|
||||
'μ' : 'μ',
|
||||
'Μ' : 'Μ',
|
||||
'∇' : '∇',
|
||||
' ' : ' ',
|
||||
' ' : ' ',
|
||||
'–' : '–',
|
||||
'≠' : '≠',
|
||||
'∋' : '∋',
|
||||
'¬' : '¬',
|
||||
'¬' : '¬',
|
||||
'∉' : '∉',
|
||||
'⊄' : '⊄',
|
||||
'ñ' : 'ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'Ñ' : 'Ñ',
|
||||
'ñ' : 'ñ',
|
||||
'ν' : 'ν',
|
||||
'Ν' : 'Ν',
|
||||
'ó' : 'ó',
|
||||
'Ó' : 'Ó',
|
||||
'Ó' : 'Ó',
|
||||
'ó' : 'ó',
|
||||
'ô' : 'ô',
|
||||
'Ô' : 'Ô',
|
||||
'Ô' : 'Ô',
|
||||
'ô' : 'ô',
|
||||
'Œ' : 'Œ',
|
||||
'œ' : 'œ',
|
||||
'ò' : 'ò',
|
||||
'Ò' : 'Ò',
|
||||
'Ò' : 'Ò',
|
||||
'ò' : 'ò',
|
||||
'‾' : '‾',
|
||||
'ω' : 'ω',
|
||||
'Ω' : 'Ω',
|
||||
'ο' : 'ο',
|
||||
'Ο' : 'Ο',
|
||||
'⊕' : '⊕',
|
||||
'∨' : '∨',
|
||||
'ª' : 'ª',
|
||||
'ª' : 'ª',
|
||||
'º' : 'º',
|
||||
'º' : 'º',
|
||||
'ø' : 'ø',
|
||||
'Ø' : 'Ø',
|
||||
'Ø' : 'Ø',
|
||||
'ø' : 'ø',
|
||||
'õ' : 'õ',
|
||||
'Õ' : 'Õ',
|
||||
'Õ' : 'Õ',
|
||||
'õ' : 'õ',
|
||||
'⊗' : '⊗',
|
||||
'ö' : 'ö',
|
||||
'Ö' : 'Ö',
|
||||
'Ö' : 'Ö',
|
||||
'ö' : 'ö',
|
||||
'¶' : '¶',
|
||||
'¶' : '¶',
|
||||
'∂' : '∂',
|
||||
'‰' : '‰',
|
||||
'⊥' : '⊥',
|
||||
'φ' : 'φ',
|
||||
'Φ' : 'Φ',
|
||||
'π' : 'π',
|
||||
'Π' : 'Π',
|
||||
'ϖ' : 'ϖ',
|
||||
'±' : '±',
|
||||
'±' : '±',
|
||||
'£' : '£',
|
||||
'£' : '£',
|
||||
'′' : '′',
|
||||
'″' : '″',
|
||||
'∏' : '∏',
|
||||
'∝' : '∝',
|
||||
'ψ' : 'ψ',
|
||||
'Ψ' : 'Ψ',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'"' : '"',
|
||||
'√' : '√',
|
||||
'»' : '»',
|
||||
'»' : '»',
|
||||
'→' : '→',
|
||||
'⇒' : '⇒',
|
||||
'⌉' : '⌉',
|
||||
'”' : '”',
|
||||
'ℜ' : 'ℜ',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'®' : '®',
|
||||
'⌋' : '⌋',
|
||||
'ρ' : 'ρ',
|
||||
'Ρ' : 'Ρ',
|
||||
'‏' : '',
|
||||
'›' : '›',
|
||||
'’' : '’',
|
||||
'‚' : '‚',
|
||||
'š' : 'š',
|
||||
'Š' : 'Š',
|
||||
'⋅' : '⋅',
|
||||
'§' : '§',
|
||||
'§' : '§',
|
||||
'­' : '', # strange optional hyphenation control character, not just a dash
|
||||
'­' : '',
|
||||
'σ' : 'σ',
|
||||
'Σ' : 'Σ',
|
||||
'ς' : 'ς',
|
||||
'∼' : '∼',
|
||||
'♠' : '♠',
|
||||
'⊂' : '⊂',
|
||||
'⊆' : '⊆',
|
||||
'∑' : '∑',
|
||||
'¹' : '¹',
|
||||
'¹' : '¹',
|
||||
'²' : '²',
|
||||
'²' : '²',
|
||||
'³' : '³',
|
||||
'³' : '³',
|
||||
'⊃' : '⊃',
|
||||
'⊇' : '⊇',
|
||||
'ß' : 'ß',
|
||||
'ß' : 'ß',
|
||||
'τ' : 'τ',
|
||||
'Τ' : 'Τ',
|
||||
'∴' : '∴',
|
||||
'θ' : 'θ',
|
||||
'Θ' : 'Θ',
|
||||
'ϑ' : 'ϑ',
|
||||
' ' : ' ',
|
||||
'þ' : 'þ',
|
||||
'Þ' : 'Þ',
|
||||
'Þ' : 'Þ',
|
||||
'þ' : 'þ',
|
||||
'˜' : '˜',
|
||||
'×' : '×',
|
||||
'×' : '×',
|
||||
'™' : '™',
|
||||
'ú' : 'ú',
|
||||
'Ú' : 'Ú',
|
||||
'Ú' : 'Ú',
|
||||
'ú' : 'ú',
|
||||
'↑' : '↑',
|
||||
'⇑' : '⇑',
|
||||
'û' : 'û',
|
||||
'Û' : 'Û',
|
||||
'Û' : 'Û',
|
||||
'û' : 'û',
|
||||
'ù' : 'ù',
|
||||
'Ù' : 'Ù',
|
||||
'Ù' : 'Ù',
|
||||
'ù' : 'ù',
|
||||
'¨' : '¨',
|
||||
'¨' : '¨',
|
||||
'ϒ' : 'ϒ',
|
||||
'υ' : 'υ',
|
||||
'Υ' : 'Υ',
|
||||
'ü' : 'ü',
|
||||
'Ü' : 'Ü',
|
||||
'Ü' : 'Ü',
|
||||
'ü' : 'ü',
|
||||
'℘' : '℘',
|
||||
'ξ' : 'ξ',
|
||||
'Ξ' : 'Ξ',
|
||||
'ý' : 'ý',
|
||||
'Ý' : 'Ý',
|
||||
'Ý' : 'Ý',
|
||||
'ý' : 'ý',
|
||||
'¥' : '¥',
|
||||
'¥' : '¥',
|
||||
'ÿ' : 'ÿ',
|
||||
'Ÿ' : 'Ÿ',
|
||||
'ÿ' : 'ÿ',
|
||||
'ζ' : 'ζ',
|
||||
'Ζ' : 'Ζ',
|
||||
'‍' : '', # strange spacing control character, not just a space
|
||||
'‌' : '', # strange spacing control character, not just a space
|
||||
}
|
||||
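The table above appears to be the named-entity map that removeEntities() in output.py later walks ("see constants.py for the list" in that function). A minimal sketch of that replacement loop, using a small illustrative sample map and a hypothetical function name rather than the full table:

# Sketch only: sample_entities and replace_named_entities are illustrative
# stand-ins for the full map above and for the loop inside removeEntities().
sample_entities = {
    '&mdash;'  : u'\u2014',  # em dash
    '&hellip;' : u'\u2026',  # horizontal ellipsis
    '&nbsp;'   : u' ',       # non-breaking space -> plain space
}

def replace_named_entities(text, entities=sample_entities):
    # Reverse sort puts an entity with ';' ahead of the same name without it.
    for name in reversed(sorted(entities.keys())):
        text = text.replace(name, entities[name])
    return text

print(replace_named_entities(u'Wait&hellip; he stopped&mdash;then went on.'))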
366
fanficdownloader/mediaminer.py
Normal file
366
fanficdownloader/mediaminer.py
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import shutil
|
||||
import os.path
|
||||
import logging
|
||||
import unittest
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
||||
try:
|
||||
import login_password
|
||||
except:
|
||||
# tough luck
|
||||
pass
|
||||
|
||||
class MediaMiner(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
self.storyName = ''
|
||||
self.authorName = ''
|
||||
self.storyDescription = ''
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.authorId = '0'
|
||||
self.authorURL = self.path
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = ''
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-mm_'
|
||||
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
if self.url.find('view_st.php') != -1:
|
||||
ss = self.url.split('view_st.php')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyId = ss[1].replace('/','').strip()
|
||||
elif self.url.find('view_ch.php?') != -1:
|
||||
ss = self.url.split('=')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyId = ss[-1].replace('/','').strip()
|
||||
self.path = '/fanfic/view_st.php/' + self.storyId
|
||||
self.url = 'http://' + self.host + self.path
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
elif self.url.find('view_ch.php/') != -1:
|
||||
ss = self.url.split('/')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 2:
|
||||
self.storyId = ss[-2].strip()
|
||||
self.path = '/fanfic/view_st.php/' + self.storyId
|
||||
self.url = 'http://' + self.host + self.path
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
else:
|
||||
raise InvalidStoryURL("Error URL \"%s\" is not a story." % self.url)
|
||||
|
||||
logging.debug('self.storyId=%s' % self.storyId)
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
if not self.appEngine:
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
else:
|
||||
self.opener = None
|
||||
|
||||
logging.debug("Created MediaMiner: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return self.path
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = None
|
||||
try:
|
||||
data = self.fetchUrl(self.url)
|
||||
except Exception, e:
|
||||
data = None
|
||||
logging.error("Caught an exception reading URL " + self.url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + self.url + "!")
|
||||
|
||||
#data.replace('<br />',' ').replace('<br>',' ').replace('</br>',' ')
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
logging.error("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % self.url)
|
||||
|
||||
#logging.debug('soap=%s' % soup)
|
||||
urls = []
|
||||
|
||||
td_ffh = soup.find('td', {'class' : 'ffh'})
|
||||
#logging.debug('td_ffh=%s' % td_ffh)
|
||||
if td_ffh is not None:
|
||||
#logging.debug('td_ffh.text=%s' % td_ffh.find(text=True))
|
||||
self.storyName = unicode(td_ffh.find(text=True)).strip()
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
fft = td_ffh.find('font', {'class' : 'smtxt'})
|
||||
#logging.debug('fft=%s' % fft)
|
||||
if fft is not None:
|
||||
ffts = fft.string.split(' ')
|
||||
if ffts is not None:
|
||||
if len(ffts) > 1:
|
||||
self.storyRating = ffts[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
self.genre = ''
|
||||
td_smtxt = soup.findAll('td')
|
||||
if td_smtxt is None:
|
||||
#logging.debug('td_smtxt is NONE!')
|
||||
pass
|
||||
else:
|
||||
ll = len(td_smtxt)
|
||||
#logging.debug('td_smtxt=%s, len=%s' % (td_smtxt, ll))
|
||||
for ii in range(ll):
|
||||
td = td_smtxt[ii]
|
||||
if 'class' in td._getAttrMap() and td['class'] != 'smtxt':
|
||||
#logging.debug('td has class attribute but is not smtxt')
|
||||
continue
|
||||
ss = unicode(td).replace('\n','').replace('\r','').replace('&nbsp;',' ')
|
||||
#logging.debug('ss=%s' % ss)
|
||||
if len(ss) > 1 and (ss.find('Genre(s):') != -1 or ss.find('Type:') != -1):
|
||||
#logging.debug('ss=%s' % ss)
|
||||
ssbs = td.findAll('b')
|
||||
#logging.debug('ssbs=%s' % ssbs)
|
||||
bb = 0
|
||||
while bb < len(ssbs):
|
||||
nvs = bs.NavigableString('')
|
||||
sst=''
|
||||
ssb = ssbs[bb]
|
||||
ssbt = unicode(ssb.text).strip()
|
||||
#logging.debug('ssb=%s' % ssb)
|
||||
#logging.debug('ssbt=%s' % ssbt)
|
||||
ssbn = ssb.nextSibling
|
||||
while ssbn is not None:
|
||||
#logging.debug('ssbn=%s' % ssbn)
|
||||
#logging.debug('ssbn.class=%s' % ssbn.__class__)
|
||||
if nvs.__class__ == ssbn.__class__:
|
||||
st = unicode(ssbn)
|
||||
if st.strip() != '|':
|
||||
sst = sst + st
|
||||
else:
|
||||
#logging.debug('ssbn.name=%s' % ssbn.name)
|
||||
if ssbn.name == 'b':
|
||||
break
|
||||
ssbnts = ssbn.findAll(text=True)
|
||||
for ssbnt in ssbnts:
|
||||
sst = sst + ssbnt
|
||||
ssbn = ssbn.nextSibling
|
||||
sst = sst.replace('&nbsp;',' ').strip()
|
||||
#logging.debug('sst=%s' % sst)
|
||||
if bb == 0:
|
||||
ssbt = ssbt.replace(':','')
|
||||
self.addSubject(ssbt)
|
||||
self.addSubject(sst)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
else:
|
||||
if ssbt == 'Genre(s):':
|
||||
self.genre = sst
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
sts = sst.split(' / ')
|
||||
for st in sts:
|
||||
self.addSubject(st.strip())
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif ssbt == 'Type:':
|
||||
self.category = sst
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self.addSubject(sst)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif ssbt == 'Author:':
|
||||
pass
|
||||
elif ssbt == 'Visits:':
|
||||
pass
|
||||
elif ssbt == 'Size:':
|
||||
pass
|
||||
elif ssbt == 'Pages:':
|
||||
pass
|
||||
elif ssbt == 'Status:':
|
||||
if sst == "Completed":
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
elif ssbt == 'Words:':
|
||||
self.numWords = sst.replace('|','').strip()
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
pass
|
||||
elif ssbt == 'Summary:':
|
||||
self.storyDescription = sst.strip()
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
elif ssbt == 'Latest Revision:' or ssbt == 'Uploaded On:':
|
||||
#logging.debug('sst=%s' % sst)
|
||||
ssts = sst.split(' ')
|
||||
if ssts is not None and len(ssts) > 3:
|
||||
sst = ssts[0] + ' ' + ssts[1] + ' ' + ssts[2]
|
||||
#logging.debug('sst=%s' % sst)
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(sst.strip(' '), "%B %d, %Y")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
pass
|
||||
bb = bb+1
|
||||
|
||||
smtxt_as = td_smtxt[ii].findAll('a')
|
||||
#logging.debug('smtxt_as=%s' % smtxt_as)
|
||||
for smtxt_a in smtxt_as:
|
||||
if 'href' in smtxt_a._getAttrMap() and smtxt_a['href'].find('/u/'):
|
||||
sta = smtxt_a['href']
|
||||
#logging.debug('sta=%s' % sta)
|
||||
stas = sta.split('/u/')
|
||||
#logging.debug('stas=%s' % stas)
|
||||
if stas is not None and len(stas) > 1:
|
||||
self.authorId = stas[1]
|
||||
self.authorURL = 'http://' + self.host + sta
|
||||
self.authorName = smtxt_a.string
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
urlstory=''
|
||||
numchapters = 0
|
||||
td_tbbrdr = soup.find('td', {'class' : 'tbbrdr'})
|
||||
if td_tbbrdr is not None:
|
||||
#logging.debug('td_tbbrdr=%s' % td_tbbrdr )
|
||||
|
||||
sl = td_tbbrdr.find('select', {'name':'cid'})
|
||||
if sl is not None:
|
||||
#logging.debug('sl=%s' % sl )
|
||||
opts = sl.findAll('option')
|
||||
for o in opts:
|
||||
#logging.debug('o=%s' % o)
|
||||
if 'value' in o._getAttrMap():
|
||||
url = 'http://' + self.host + '/fanfic/view_ch.php/' + self.storyId + '/' + o['value']
|
||||
logging.debug('URL=%s, Title=%s' % (url, o.string))
|
||||
if numchapters == 0:
|
||||
ss = o.string.split('[')
|
||||
if ss is not None and len(ss) > 1:
|
||||
ssd = ss[-1].replace(']','')
|
||||
#logging.debug('ssd=%s' % ssd)
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(ssd.strip(' '), "%b %d, %Y")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
urls.append((url, o.string))
|
||||
numchapters = numchapters + 1
|
||||
|
||||
if numchapters == 0:
|
||||
numchapters = 1
|
||||
url = 'http://' + self.host + '/fanfic/view_st.php/' + self.storyId
|
||||
self.storyPublished = self.storyUpdated
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ssd = self.storyName + ' [' + self.storyPublished.strftime("%b %d, %Y") + ']'
|
||||
logging.debug('URL=%s, Title=%s' % (url, ssd))
|
||||
urls.append((url, ssd))
|
||||
|
||||
self.numChapters = unicode(numchapters)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
#logging.debug('urls=%s' % urls)
|
||||
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
# time.sleep( 2.0 )
|
||||
logging.debug('url=%s' % url)
|
||||
data = ''
|
||||
try:
|
||||
data = self.fetchUrl(url)
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
# convert div's to p's. mediaminer uses div with a
|
||||
# margin for paragraphs.
|
||||
divlist = soup.findAll('div', {'class' : None})
|
||||
for tag in divlist:
|
||||
tag.name='p';
|
||||
|
||||
nvs = bs.NavigableString('')
|
||||
sst=''
|
||||
allAs = soup.findAll ('a', { 'name' : 'fic_c' })
|
||||
#logging.debug('allAs=%s' % allAs)
|
||||
for a in allAs:
|
||||
#logging.debug('a=%s' % a)
|
||||
foundfirst = False
|
||||
done = False
|
||||
nxta = a.nextSibling
|
||||
while nxta is not None and not done:
|
||||
#logging.debug('nxta=%s' % nxta)
|
||||
#logging.debug('nxta.class=%s' % nxta.__class__)
|
||||
st = unicode(nxta)
|
||||
if nvs.__class__ != nxta.__class__:
|
||||
#logging.debug('nxta.name=%s' % nxta.name)
|
||||
if nxta.name == 'table':
|
||||
st = ''
|
||||
if foundfirst:
|
||||
done = True
|
||||
if nxta.name == 'div' and 'class' in nxta._getAttrMap() and nxta['class'] == 'acl' and foundfirst:
|
||||
st = ''
|
||||
done = True
|
||||
|
||||
if nxta.name == 'br':
|
||||
if not foundfirst:
|
||||
st = ''
|
||||
else:
|
||||
foundfirst = True
|
||||
else:
|
||||
foundfirst = True
|
||||
|
||||
sst = sst + st
|
||||
nxta = nxta.nextSibling
|
||||
|
||||
if sst is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return sst
|
||||
|
||||
class FPC_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testFictionPress(self):
|
||||
url = 'http://www.fictionpress.com/s/2725180/1/Behind_This_Facade'
|
||||
f = FPCom(url)
|
||||
urls = f.extractIndividualUrls()
|
||||
|
||||
self.assertEquals('Behind This Facade', f.getStoryName())
|
||||
self.assertEquals('IntoxicatingMelody', f.getAuthorName())
|
||||
|
||||
text = f.getText(url)
|
||||
self.assertTrue(text.find('Kale Resgerald at your service" He answered, "So, can we go now? Or do you want to') != -1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
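For reference, the constructor above accepts three MediaMiner URL shapes (view_st.php/<id>, view_ch.php?...=<id>, and view_ch.php/<id>/<chapter>) and reduces them all to a canonical /fanfic/view_st.php/<id> address. A standalone sketch of just that normalisation step, with a hypothetical function name and made-up example URLs; the adapter itself does this inline in __init__:

import urlparse as up   # urllib.parse on Python 3

def mediaminer_story_url(url):
    # Mirror of the storyId extraction in MediaMiner.__init__ above.
    host = up.urlparse(url).netloc
    if url.find('view_st.php') != -1:
        story_id = url.split('view_st.php')[1].replace('/', '').strip()
    elif url.find('view_ch.php?') != -1:
        story_id = url.split('=')[-1].replace('/', '').strip()
    elif url.find('view_ch.php/') != -1:
        story_id = url.split('/')[-2].strip()
    else:
        raise ValueError('not a MediaMiner story URL: %s' % url)
    return 'http://' + host + '/fanfic/view_st.php/' + story_id

# Both (made-up) links below normalise to .../fanfic/view_st.php/12345
#   mediaminer_story_url('http://www.mediaminer.org/fanfic/view_st.php/12345')
#   mediaminer_story_url('http://www.mediaminer.org/fanfic/view_ch.php/12345/67890')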
384
fanficdownloader/mobi.py
Normal file
384
fanficdownloader/mobi.py
Normal file
|
|
@ -0,0 +1,384 @@
|
|||
#!/usr/bin/python
|
||||
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
|
||||
|
||||
|
||||
import StringIO
|
||||
import struct
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
|
||||
from html import HtmlProcessor
|
||||
|
||||
# http://wiki.mobileread.com/wiki/MOBI
|
||||
# http://membres.lycos.fr/microfirst/palm/pdb.html
|
||||
|
||||
encoding = {
|
||||
'UTF-8' : 65001,
|
||||
'latin-1' : 1252,
|
||||
}
|
||||
|
||||
languages = {"en-us" : 0x0409,
|
||||
"sv" : 0x041d,
|
||||
"fi" : 0x000b,
|
||||
"en" : 0x0009,
|
||||
"en-gb" : 0x0809}
|
||||
|
||||
def ToHex(s):
|
||||
v = ['%.2x' % ord(c) for c in s]
|
||||
return ' '.join(v)
|
||||
|
||||
class _SubEntry:
|
||||
def __init__(self, pos, html_data):
|
||||
self.pos = pos
|
||||
self.html = HtmlProcessor(html_data)
|
||||
self.title = self.html.title
|
||||
self._name = 'mobi_article_%d' % pos
|
||||
if not self.title:
|
||||
self.title = 'Article %d' % self.pos
|
||||
|
||||
def TocLink(self):
|
||||
return '<a href="#%s_MOBI_START">%.80s</a>' % (self._name, self.title)
|
||||
|
||||
def Anchor(self):
|
||||
return '<a name="%s_MOBI_START">' % self._name
|
||||
|
||||
def Body(self):
|
||||
return self.html.RenameAnchors(self._name + '_')
|
||||
|
||||
class Converter:
|
||||
def __init__(self, refresh_url='', title='Unknown', author='Unknown', publisher='Unknown'):
|
||||
self._header = Header()
|
||||
self._header.SetTitle(title)
|
||||
self._header.SetAuthor(author)
|
||||
self._header.SetPublisher(publisher)
|
||||
self._refresh_url = refresh_url
|
||||
|
||||
def ConvertString(self, s):
|
||||
out = StringIO.StringIO()
|
||||
self._ConvertStringToFile(s, out)
|
||||
return out.getvalue()
|
||||
|
||||
def ConvertStrings(self, html_strs):
|
||||
out = StringIO.StringIO()
|
||||
self._ConvertStringsToFile(html_strs, out)
|
||||
return out.getvalue()
|
||||
|
||||
def ConvertFile(self, html_file, out_file):
|
||||
self._ConvertStringToFile(open(html_file,'rb').read(),
|
||||
open(out_file, 'wb'))
|
||||
|
||||
def ConvertFiles(self, html_files, out_file):
|
||||
html_strs = [open(f,'rb').read() for f in html_files]
|
||||
self._ConvertStringsToFile(html_strs, open(out_file, 'wb'))
|
||||
|
||||
def MakeOneHTML(self, html_strs):
|
||||
"""This takes a list of HTML strings and returns a big HTML file with
|
||||
all contents consolidated. It constructs a table of contents and adds
|
||||
anchors within the text
|
||||
"""
|
||||
title_html = []
|
||||
toc_html = []
|
||||
body_html = []
|
||||
|
||||
PAGE_BREAK = '<mbp:pagebreak>'
|
||||
|
||||
# pull out the title page, assumed first html_strs.
|
||||
htmltitle = html_strs[0]
|
||||
entrytitle = _SubEntry(1, htmltitle)
|
||||
title_html.append(entrytitle.Body())
|
||||
|
||||
title_html.append(PAGE_BREAK)
|
||||
toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')
|
||||
|
||||
for pos, html in enumerate(html_strs[1:]):
|
||||
entry = _SubEntry(pos+1, html)
|
||||
toc_html.append('%s<br />' % entry.TocLink())
|
||||
|
||||
# give some space between bodies of work.
|
||||
body_html.append(PAGE_BREAK)
|
||||
|
||||
body_html.append(entry.Anchor())
|
||||
|
||||
body_html.append(entry.Body())
|
||||
|
||||
# TODO: this title can get way too long with RSS feeds. Not sure how to fix
|
||||
# cheat slightly and use the <a href> code to set filepos in references.
|
||||
header = '''<html>
|
||||
<head>
|
||||
<title>Bibliorize %s GMT</title>
|
||||
<guide>
|
||||
<reference href="#TOCTOP" type="toc" title="Table of Contents"/>
|
||||
</guide>
|
||||
</head>
|
||||
<body>
|
||||
''' % time.ctime(time.time())
|
||||
|
||||
footer = '</body></html>'
|
||||
all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
|
||||
#print "%s" % all_html.encode('utf8')
|
||||
return all_html
|
||||
|
||||
def _ConvertStringsToFile(self, html_strs, out_file):
|
||||
try:
|
||||
tmp = self.MakeOneHTML(html_strs)
|
||||
self._ConvertStringToFile(tmp, out_file)
|
||||
except Exception, e:
|
||||
logging.error('Error %s', e)
|
||||
logging.debug('Details: %s' % html_strs)
|
||||
|
||||
def _ConvertStringToFile(self, html_data, out):
|
||||
html = HtmlProcessor(html_data)
|
||||
data = html.CleanHtml()
|
||||
|
||||
# collect offsets of '<mbp:pagebreak>' tags, use to make index list.
|
||||
# indexlist = [] # list of (offset,length) tuples.
|
||||
# not in current use.
|
||||
|
||||
# j=0
|
||||
# lastj=0
|
||||
# while True:
|
||||
# j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
|
||||
# if j < 0:
|
||||
# break
|
||||
# indexlist.append((lastj,j-lastj))
|
||||
# print "index offset: %d length: %d" % (lastj,j-lastj)
|
||||
# lastj=j
|
||||
|
||||
records = []
|
||||
# title = html.title
|
||||
# if title:
|
||||
# self._header.SetTitle(title)
|
||||
record_id = 1
|
||||
for start_pos in range(0, len(data), Record.MAX_SIZE):
|
||||
end = min(len(data), start_pos + Record.MAX_SIZE)
|
||||
record_data = data[start_pos:end]
|
||||
records.append(self._header.AddRecord(record_data, record_id))
|
||||
#print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
|
||||
record_id += 1
|
||||
self._header.SetImageRecordIndex(record_id)
|
||||
records[0:0] = [self._header.MobiHeader()]
|
||||
|
||||
header, rec_offset = self._header.PDBHeader(len(records))
|
||||
out.write(header)
|
||||
for record in records:
|
||||
record.WriteHeader(out, rec_offset)
|
||||
#print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
|
||||
rec_offset += (len(record.data)+1) # plus one for trailing null
|
||||
|
||||
# Write two nulls for some reason
|
||||
out.write('\0\0')
|
||||
for record in records:
|
||||
record.WriteData(out)
|
||||
out.write('\0')
|
||||
# needs a trailing null, I believe it indicates zero length 'overlap'.
|
||||
# otherwise, the readers eat the last char of each html record.
|
||||
# Calibre writes another 6-7 bytes of stuff after that, but we seem
|
||||
# to be getting along without it.
|
||||
|
||||
class Record:
|
||||
MAX_SIZE = 4096
|
||||
INDEX_LEN = 8
|
||||
_unique_id_seed = 28 # should be arbitrary, but taken from MobiHeader
|
||||
|
||||
# TODO(chatham): Record compression doesn't look that hard.
|
||||
|
||||
def __init__(self, data, record_id):
|
||||
assert len(data) <= self.MAX_SIZE
|
||||
self.data = data
|
||||
if record_id != 0:
|
||||
self._id = record_id
|
||||
else:
|
||||
Record._unique_id_seed += 1
|
||||
self._id = 0
|
||||
|
||||
def __repr__(self):
|
||||
return 'Record: id=%d len=%d' % (self._id, len(self.data))
|
||||
|
||||
def _SetUniqueId(self):
|
||||
Record._unique_id_seed += 1
|
||||
# TODO(chatham): Wraparound crap
|
||||
self._id = Record._unique_id_seed
|
||||
|
||||
def WriteData(self, out):
|
||||
out.write(self.data)
|
||||
|
||||
def WriteHeader(self, out, rec_offset):
|
||||
attributes = 64 # dirty?
|
||||
header = struct.pack('>IbbH',
|
||||
rec_offset,
|
||||
attributes,
|
||||
0, self._id)
|
||||
assert len(header) == Record.INDEX_LEN
|
||||
out.write(header)
|
||||
|
||||
EXTH_HEADER_FIELDS = {
|
||||
'author' : 100,
|
||||
'publisher' : 101,
|
||||
}
|
||||
|
||||
class Header:
|
||||
EPOCH_1904 = 2082844800
|
||||
|
||||
def __init__(self):
|
||||
self._length = 0
|
||||
self._record_count = 0
|
||||
self._title = '2008_2_34'
|
||||
self._author = 'Unknown author'
|
||||
self._publisher = 'Unknown publisher'
|
||||
self._first_image_index = 0
|
||||
|
||||
def SetAuthor(self, author):
|
||||
self._author = author.encode('ascii','ignore')
|
||||
|
||||
def SetTitle(self, title):
|
||||
# TODO(chatham): Reevaluate whether this needs to be ASCII.
|
||||
# maybe just do sys.setdefaultencoding('utf-8')? Problems
|
||||
# appending self._title with other things.
|
||||
self._title = title.encode('ascii','ignore')
|
||||
|
||||
def SetPublisher(self, publisher):
|
||||
self._publisher = publisher.encode('ascii','ignore')
|
||||
|
||||
def AddRecord(self, data, record_id):
|
||||
self.max_record_size = max(Record.MAX_SIZE, len(data))
|
||||
self._record_count += 1
|
||||
self._length += len(data)
|
||||
return Record(data, record_id)
|
||||
|
||||
def _ReplaceWord(self, data, pos, word):
|
||||
return data[:pos] + struct.pack('>I', word) + data[pos+4:]
|
||||
|
||||
def PalmDocHeader(self):
|
||||
compression = 1 # no compression
|
||||
unused = 0
|
||||
encryption_type = 0 # no encryption
|
||||
records = self._record_count + 1 # the header record itself
|
||||
palmdoc_header = struct.pack('>HHIHHHH',
|
||||
compression,
|
||||
unused,
|
||||
self._length,
|
||||
records,
|
||||
Record.MAX_SIZE,
|
||||
encryption_type,
|
||||
unused)
|
||||
assert len(palmdoc_header) == 16
|
||||
return palmdoc_header
|
||||
|
||||
def PDBHeader(self, num_records):
|
||||
HEADER_LEN = 32+2+2+9*4
|
||||
RECORD_INDEX_HEADER_LEN = 6
|
||||
RESOURCE_INDEX_LEN = 10
|
||||
|
||||
index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN
|
||||
rec_offset = HEADER_LEN + index_len + 2
|
||||
|
||||
short_title = self._title[0:31]
|
||||
attributes = 0
|
||||
version = 0
|
||||
ctime = self.EPOCH_1904 + int(time.time())
|
||||
mtime = self.EPOCH_1904 + int(time.time())
|
||||
backup_time = self.EPOCH_1904 + int(time.time())
|
||||
modnum = 0
|
||||
appinfo_offset = 0
|
||||
sort_offset = 0
|
||||
type = 'BOOK'
|
||||
creator = 'MOBI'
|
||||
id_seed = 36
|
||||
header = struct.pack('>32sHHII',
|
||||
short_title, attributes, version,
|
||||
ctime, mtime)
|
||||
header += struct.pack('>IIII', backup_time, modnum,
|
||||
appinfo_offset, sort_offset)
|
||||
header += struct.pack('>4s4sI',
|
||||
type, creator, id_seed)
|
||||
next_record = 0 # not used?
|
||||
header += struct.pack('>IH', next_record, num_records)
|
||||
return header, rec_offset
|
||||
|
||||
def _GetExthHeader(self):
|
||||
# They set author, publisher, coveroffset, thumboffset
|
||||
data = {'author' : self._author,
|
||||
'publisher' : self._publisher,
|
||||
}
|
||||
# Turn string type names into EXTH typeids.
|
||||
r = []
|
||||
for key, value in data.items():
|
||||
typeid = EXTH_HEADER_FIELDS[key]
|
||||
length_encoding_len = 8
|
||||
r.append(struct.pack('>LL', typeid, len(value) + length_encoding_len,) + value)
|
||||
content = ''.join(r)
|
||||
|
||||
# Pad to word boundary
|
||||
while len(content) % 4:
|
||||
content += '\0'
|
||||
TODO_mysterious = 12
|
||||
exth = 'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content
|
||||
return exth
|
||||
|
||||
def SetImageRecordIndex(self, idx):
|
||||
self._first_image_index = idx
|
||||
|
||||
def MobiHeader(self):
|
||||
exth_header = self._GetExthHeader();
|
||||
palmdoc_header = self.PalmDocHeader()
|
||||
|
||||
fs = 0xffffffff
|
||||
|
||||
# Record 0
|
||||
header_len = 0xE4 # TODO
|
||||
mobi_type = 2 # BOOK
|
||||
text_encoding = encoding['UTF-8']
|
||||
unique_id = random.randint(1, 1<<32)
|
||||
creator_version = 4
|
||||
reserved = '%c' % 0xff * 40
|
||||
nonbook_index = fs
|
||||
full_name_offset = header_len + len(palmdoc_header) + len(exth_header) # put full name after header
|
||||
language = languages['en-us']
|
||||
unused = 0
|
||||
mobi_header = struct.pack('>4sIIIII40sIIIIII',
|
||||
'MOBI',
|
||||
header_len,
|
||||
mobi_type,
|
||||
text_encoding,
|
||||
unique_id,
|
||||
creator_version,
|
||||
reserved,
|
||||
nonbook_index,
|
||||
full_name_offset,
|
||||
len(self._title),
|
||||
language,
|
||||
fs, fs)
|
||||
assert len(mobi_header) == 104 - 16
|
||||
|
||||
unknown_fields = chr(0) * 32
|
||||
drm_offset = 0
|
||||
drm_count = 0
|
||||
drm_size = 0
|
||||
drm_flags = 0
|
||||
exth_flags = 0x50
|
||||
header_end = chr(0) * 64
|
||||
mobi_header += struct.pack('>IIIIIII',
|
||||
creator_version,
|
||||
self._first_image_index,
|
||||
fs,
|
||||
unused,
|
||||
fs,
|
||||
unused,
|
||||
exth_flags)
|
||||
mobi_header += '\0' * 112 # TODO: Why this much padding?
|
||||
# Set some magic offsets to be 0xFFFFFFF.
|
||||
for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc):
|
||||
mobi_header = self._ReplaceWord(mobi_header, pos, fs)
|
||||
|
||||
# 16 bytes?
|
||||
padding = '\0' * 48 * 4 # why?
|
||||
total_header = palmdoc_header + mobi_header + exth_header + self._title + padding
|
||||
|
||||
return self.AddRecord(total_header, 0)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
m = Converter(title='Testing Mobi', author='Mobi Author', publisher='mobi converter')
|
||||
m.ConvertFiles(sys.argv[1:], 'test.mobi')
|
||||
#m.ConvertFile(sys.argv[1], 'test.mobi')
|
||||
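The __main__ block above demonstrates ConvertFiles(); the in-memory path used by the writers goes through ConvertStrings() instead. A minimal usage sketch, with made-up HTML strings and output filename (the first string becomes the title page, the rest become TOC'd chapters, per MakeOneHTML above):

from mobi import Converter

chapters = [
    '<html><head><title>Title Page</title></head><body><h1>A Story</h1></body></html>',
    '<html><head><title>Chapter 1</title></head><body><p>Once upon a time...</p></body></html>',
]

c = Converter(title='A Story', author='Someone', publisher='example.org')
mobidata = c.ConvertStrings(chapters)   # returns the whole .mobi as a byte string
open('a_story.mobi', 'wb').write(mobidata)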
48
fanficdownloader/newdownload.py
Normal file
48
fanficdownloader/newdownload.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import sys, os
|
||||
|
||||
import adapters
|
||||
import writers
|
||||
|
||||
import ConfigParser
|
||||
|
||||
from writers.writer_html import HTMLWriter
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lineno)d):%(message)s")
|
||||
|
||||
config = ConfigParser.ConfigParser()
|
||||
|
||||
logging.debug('reading defaults.ini config file, if present')
|
||||
config.read('defaults.ini')
|
||||
logging.debug('reading personal.ini config file, if present')
|
||||
config.read('personal.ini')
|
||||
|
||||
def writeStory(adapter,writeformat):
|
||||
writer = writers.getWriter(writeformat,config,adapter.getStory())
|
||||
writer.writeStory()
|
||||
del writer
|
||||
|
||||
try:
|
||||
adapter = adapters.getAdapter(config,sys.argv[1])
|
||||
|
||||
#try:
|
||||
print adapter.getStory()
|
||||
#except adapters.FailedToLogin, ftl:
|
||||
# print "Login Failed, trying with user/pass"
|
||||
# adapter.username="BobsClue"
|
||||
# adapter.password="XXXXXXXXX"
|
||||
# print adapter.getStory()
|
||||
|
||||
writeStory(adapter,"epub")
|
||||
writeStory(adapter,"html")
|
||||
writeStory(adapter,"txt")
|
||||
del adapter
|
||||
|
||||
except adapters.InvalidStoryURL, isu:
|
||||
print isu
|
||||
except adapters.StoryDoesNotExist, dne:
|
||||
print dne
|
||||
except adapters.UnknownSite, us:
|
||||
print us
|
||||
643
fanficdownloader/output.py
Normal file
643
fanficdownloader/output.py
Normal file
|
|
@ -0,0 +1,643 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import cgi
|
||||
import uuid
|
||||
import unicodedata
|
||||
import codecs
|
||||
import shutil
|
||||
import string
|
||||
import os.path
|
||||
import zipfile
|
||||
import StringIO
|
||||
import logging
|
||||
import hashlib
|
||||
import urllib as u
|
||||
import pprint as pp
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
|
||||
import mobi
|
||||
import zipdir
|
||||
import html_constants
|
||||
from constants import *
|
||||
|
||||
|
||||
import html2text
|
||||
import datetime
|
||||
|
||||
|
||||
class FanficWriter:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
pass
|
||||
|
||||
def finalise(self):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'base'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.bse'
|
||||
|
||||
class TextWriter(FanficWriter):
|
||||
htmlWriter = None
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'text'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.txt'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False):
|
||||
self.inmemory = inmemory
|
||||
self.htmlWriter = HTMLWriter(base, adapter, True, False)
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
self.htmlWriter.writeChapter(index, title, text)
|
||||
|
||||
def finalise(self):
|
||||
self.htmlWriter.finalise()
|
||||
self.name=self.htmlWriter.name
|
||||
self.fileName = self.htmlWriter.fileName.replace(".html",".txt")
|
||||
if self.inmemory:
|
||||
self.output = StringIO.StringIO()
|
||||
else:
|
||||
self.output = open(self.fileName, 'w')
|
||||
|
||||
self.output.write(html2text.html2text(self.htmlWriter.output.getvalue().decode('utf-8')).encode('utf-8'))
|
||||
|
||||
if not self.inmemory:
|
||||
self.output.close()
|
||||
|
||||
|
||||
class MobiWriter(FanficWriter):
|
||||
chapters = []
|
||||
files = {}
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'mobi'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.mobi'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.publisher = adapter.getPublisher()
|
||||
self.adapter = adapter
|
||||
self.mobi = mobi
|
||||
self.inmemory = inmemory
|
||||
|
||||
self.files = {}
|
||||
self.chapters = []
|
||||
|
||||
if not self.inmemory and os.path.exists(self.fileName):
|
||||
os.remove(self.fileName)
|
||||
|
||||
if self.inmemory:
|
||||
self.output = StringIO.StringIO()
|
||||
else:
|
||||
self.output = open(self.fileName, 'wb')
|
||||
|
||||
self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
|
||||
self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
|
||||
|
||||
def _printableVersion(self, text):
|
||||
try:
|
||||
d = text.decode('utf-8')
|
||||
return d
|
||||
except:
|
||||
return text
|
||||
|
||||
def _writeFile(self, fileName, data):
|
||||
#logging.debug('_writeFile(`%s`, data)' % fileName)
|
||||
if fileName in self.files:
|
||||
try:
|
||||
d = data.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
d = data
|
||||
|
||||
self.files[fileName].write(d)
|
||||
else:
|
||||
self.files[fileName] = StringIO.StringIO()
|
||||
self._writeFile(fileName, data)
|
||||
|
||||
def _getFilesStrings(self):
|
||||
strings = []
|
||||
if "title_page.xhtml" in self.files:
|
||||
strings.append(self.files["title_page.xhtml"].getvalue())
|
||||
del(self.files["title_page.xhtml"])
|
||||
|
||||
keys = self.files.keys()
|
||||
keys.sort()
|
||||
|
||||
# Assumed all other files are chapter0000.xhtml.
|
||||
for fn in keys:
|
||||
strings.append(self.files[fn].getvalue())
|
||||
|
||||
return strings
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
title = removeEntities(title)
|
||||
logging.debug("Writing chapter: %s" % title)
|
||||
#title = self._printableVersion(title) #title.decode('utf-8')
|
||||
text = removeEntities(text)
|
||||
#text = self._printableVersion(text) #text.decode('utf-8')
|
||||
|
||||
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
|
||||
# hr & br needs to be if they're going to work.
|
||||
# Some stories do use multiple br tags as their section breaks...
|
||||
self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))
|
||||
|
||||
allTags = self.soup.findAll(recursive=True)
|
||||
for t in allTags:
|
||||
for attr in t._getAttrMap().keys():
|
||||
if attr not in acceptable_attributes:
|
||||
del t[attr]
|
||||
# these are not acceptable strict XHTML. But we do already have
|
||||
# CSS classes of the same names defined in constants.py
|
||||
if t.name in ('u'):
|
||||
t['class']=t.name
|
||||
t.name='span'
|
||||
if t.name in ('center'):
|
||||
t['class']=t.name
|
||||
t.name='div'
|
||||
# removes paired, but empty tags.
|
||||
if t.string != None and len(t.string.strip()) == 0 :
|
||||
t.extract()
|
||||
|
||||
text = self.soup.__str__('utf8')
|
||||
|
||||
# ffnet(& maybe others) gives the whole chapter text
|
||||
# as one line. This causes problems for nook(at
|
||||
# least) when the chapter size starts getting big
|
||||
# (200k+) Using Soup's prettify() messes up italics
|
||||
# and such. Done after soup extract so <p> and <br>
|
||||
# tags are normalized. Doing it here seems less evil
|
||||
# than hacking BeautifulSoup, but it's debatable.
|
||||
text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')
|
||||
|
||||
filename="chapter%04d.xhtml" % index
|
||||
self._writeFile(filename, XHTML_START % (title, title))
|
||||
self._writeFile(filename, text)
|
||||
self._writeFile(filename, XHTML_END)
|
||||
|
||||
#self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
|
||||
#self.body = self.body + '\n' + text
|
||||
|
||||
def finalise(self):
|
||||
logging.debug("Finalising...")
|
||||
|
||||
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
|
||||
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
|
||||
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
|
||||
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
|
||||
updateyy = self.adapter.getStoryUpdated().strftime("%Y")
|
||||
updatemm = self.adapter.getStoryUpdated().strftime("%m")
|
||||
updatedd = self.adapter.getStoryUpdated().strftime("%d")
|
||||
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
description = self.adapter.getStoryDescription()
|
||||
if hasattr(description, "text"):
|
||||
description = description.text
|
||||
prevalue=description
|
||||
try:
|
||||
description = unicode(description)
|
||||
except:
|
||||
description=prevalue
|
||||
|
||||
if description is not None and len(description) > 0:
|
||||
description = description.replace ('\\\'', '\'').replace('\\\"', '\"')
|
||||
description = removeEntities(description)
|
||||
else:
|
||||
description = ' '
|
||||
|
||||
### writing content -- title page
|
||||
titleFilePath = "title_page.xhtml"
|
||||
self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
|
||||
tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
|
||||
tmpstr = unicode(self.adapter.getNumChapters()) + " / " + commaGroups(unicode(self.adapter.getNumWords()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))
|
||||
|
||||
self._writeFile(titleFilePath, TITLE_FOOTER % description )
|
||||
|
||||
|
||||
|
||||
c = mobi.Converter(title=self.storyTitle,
|
||||
author=self.authorName,
|
||||
publisher=self.publisher)
|
||||
mobidata = c.ConvertStrings(self._getFilesStrings())
|
||||
|
||||
self.output.write(mobidata)
|
||||
if not self.inmemory:
|
||||
self.output.close()
|
||||
# zipdir.toZip(filename, self.directory)
|
||||
|
||||
|
||||
|
||||
class HTMLWriter(FanficWriter):
|
||||
body = ''
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'html'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.html'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False, mobi = False):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.adapter = adapter
|
||||
self.mobi = mobi
|
||||
self.inmemory = inmemory
|
||||
|
||||
if not self.inmemory and os.path.exists(self.fileName):
|
||||
os.remove(self.fileName)
|
||||
|
||||
if self.inmemory:
|
||||
self.output = StringIO.StringIO()
|
||||
else:
|
||||
self.output = open(self.fileName, 'w')
|
||||
|
||||
self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
|
||||
self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
|
||||
|
||||
def _printableVersion(self, text):
|
||||
try:
|
||||
d = text.decode('utf-8')
|
||||
return d
|
||||
except:
|
||||
return text
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
title = self._printableVersion(title) #title.decode('utf-8')
|
||||
text = self._printableVersion(text) #text.decode('utf-8')
|
||||
self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
|
||||
self.body = self.body + '\n' + text
|
||||
|
||||
def finalise(self):
|
||||
html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
|
||||
soup = bs.BeautifulSoup(html)
|
||||
result = soup.__str__('utf8')
|
||||
|
||||
# f = open(self.fileName, 'w')
|
||||
# f.write(result)
|
||||
# f.close()
|
||||
|
||||
self.output.write(result)
|
||||
if not self.inmemory:
|
||||
self.output.close()
|
||||
|
||||
class EPubFanficWriter(FanficWriter):
|
||||
chapters = []
|
||||
files = {}
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'epub'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.epub'
|
||||
|
||||
def __init__(self, base, adapter, inmemory=False, compress=True):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.directory = self.basePath + '/' + self.name
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.inmemory = inmemory
|
||||
self.adapter = adapter
|
||||
|
||||
self.files = {}
|
||||
self.chapters = []
|
||||
|
||||
if not self.inmemory:
|
||||
self.inmemory = True
|
||||
self.writeToFile = True
|
||||
else:
|
||||
self.writeToFile = False
|
||||
|
||||
if not self.inmemory:
|
||||
if os.path.exists(self.directory):
|
||||
shutil.rmtree(self.directory)
|
||||
|
||||
os.mkdir(self.directory)
|
||||
|
||||
os.mkdir(self.directory + '/META-INF')
|
||||
os.mkdir(self.directory + '/OEBPS')
|
||||
|
||||
self._writeFile('mimetype', MIMETYPE)
|
||||
self._writeFile('META-INF/container.xml', CONTAINER)
|
||||
self._writeFile('OEBPS/stylesheet.css', CSS)
|
||||
|
||||
def _writeFile(self, fileName, data):
|
||||
#logging.debug('_writeFile(`%s`, data)' % fileName)
|
||||
if fileName in self.files:
|
||||
try:
|
||||
d = data.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
d = data
|
||||
|
||||
self.files[fileName].write(d)
|
||||
else:
|
||||
if self.inmemory:
|
||||
self.files[fileName] = StringIO.StringIO()
|
||||
else:
|
||||
self.files[fileName] = open(self.directory + '/' + fileName, encoding='utf-8', mode='w')
|
||||
|
||||
self._writeFile(fileName, data)
|
||||
|
||||
|
||||
def _closeFiles(self):
|
||||
if not self.inmemory:
|
||||
for f in self.files:
|
||||
self.files[f].close()
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
title = removeEntities(title)
|
||||
logging.debug("Writing chapter: %s" % title)
|
||||
fileName="chapter%04d.xhtml" % index
|
||||
|
||||
filePath = self.directory + "/OEBPS/" + fileName
|
||||
|
||||
fn = 'OEBPS/' + fileName
|
||||
|
||||
# f = open(filePath, 'w')
|
||||
|
||||
text = removeEntities(text)
|
||||
|
||||
# BeautifulStoneSoup doesn't have any selfClosingTags by default.
|
||||
# hr & br needs to be if they're going to work.
|
||||
# Some stories do use multiple br tags as their section breaks...
|
||||
self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))
|
||||
|
||||
allTags = self.soup.findAll(recursive=True)
|
||||
for t in allTags:
|
||||
for attr in t._getAttrMap().keys():
|
||||
if attr not in acceptable_attributes:
|
||||
del t[attr]
|
||||
# these are not acceptable strict XHTML. But we do already have
|
||||
# CSS classes of the same names defined in constants.py
|
||||
if t.name in ('u'):
|
||||
t['class']=t.name
|
||||
t.name='span'
|
||||
if t.name in ('center'):
|
||||
t['class']=t.name
|
||||
t.name='div'
|
||||
# removes paired, but empty tags.
|
||||
if t.string != None and len(t.string.strip()) == 0 :
|
||||
t.extract()
|
||||
|
||||
text = self.soup.__str__('utf8')
|
||||
|
||||
# ffnet(& maybe others) gives the whole chapter text
|
||||
# as one line. This causes problems for nook(at
|
||||
# least) when the chapter size starts getting big
|
||||
# (200k+) Using Soup's prettify() messes up italics
|
||||
# and such. Done after soup extract so <p> and <br>
|
||||
# tags are normalized. Doing it here seems less evil
|
||||
# than hacking BeautifulSoup, but it's debatable.
|
||||
text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')
|
||||
|
||||
self._writeFile(fn, XHTML_START % (title, title))
|
||||
self._writeFile(fn, text)
|
||||
self._writeFile(fn, XHTML_END)
|
||||
# print >> f, XHTML_START % (title, title)
|
||||
# f.write(text)
|
||||
# print >> f, XHTML_END
|
||||
|
||||
self.chapters.append((title, fileName))
|
||||
|
||||
def finalise(self):
|
||||
logging.debug("Finalising...")
|
||||
### writing table of contents -- ncx file
|
||||
|
||||
tocFilePath = "OEBPS/toc.ncx"
|
||||
# toc = open(tocFilePath, 'w')
|
||||
# print >> toc, TOC_START % self.storyTitle
|
||||
self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
|
||||
|
||||
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
|
||||
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
|
||||
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
|
||||
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
|
||||
updateyy = self.adapter.getStoryUpdated().strftime("%Y")
|
||||
updatemm = self.adapter.getStoryUpdated().strftime("%m")
|
||||
updatedd = self.adapter.getStoryUpdated().strftime("%d")
|
||||
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
description = self.adapter.getStoryDescription()
|
||||
if hasattr(description, "text"):
|
||||
description = description.text
|
||||
prevalue=description
|
||||
try:
|
||||
description = unicode(description)
|
||||
except:
|
||||
description=prevalue
|
||||
|
||||
if description is not None and len(description) > 0:
|
||||
description = description.replace ('\\\'', '\'').replace('\\\"', '\"')
|
||||
description = removeEntities(description)
|
||||
else:
|
||||
description = ' '
|
||||
|
||||
### writing content -- title page
|
||||
titleFilePath = "OEBPS/title_page.xhtml"
|
||||
self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
|
||||
tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
|
||||
tmpstr = unicode(self.adapter.getNumChapters()) + " / " + commaGroups(unicode(self.adapter.getNumWords()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
|
||||
self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))
|
||||
|
||||
self._writeFile(titleFilePath, TITLE_FOOTER % description )
|
||||
|
||||
### writing content -- opf file
|
||||
opfFilePath = "OEBPS/content.opf"
|
||||
|
||||
# opf = open(opfFilePath, 'w')
|
||||
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))
|
||||
|
||||
if self.adapter.storyStatus != 'Unknown':
|
||||
self.adapter.addSubject(self.adapter.storyStatus)
|
||||
i = 0
|
||||
subjs = []
|
||||
subjs = self.adapter.getSubjects()
|
||||
for subj in subjs:
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
|
||||
i = i + 1
|
||||
if (i <= 0):
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
|
||||
|
||||
subj = "Last Update Year/Month: " + updateyy + "/" + updatemm
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
|
||||
|
||||
subj = "Last Update: " + updateyy + "/" + updatemm + "/" + updatedd
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
|
||||
|
||||
self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
|
||||
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
|
||||
|
||||
ids = []
|
||||
|
||||
i = 0
|
||||
|
||||
t = "Title Page"
|
||||
f = "title_page.xhtml"
|
||||
chapterId = "title_page"
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
||||
i = i + 1
|
||||
|
||||
for t,f in self.chapters:
|
||||
chapterId = "chapter%04d" % i
|
||||
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
||||
i = i + 1
|
||||
|
||||
# logging.d('Toc and refs printed, proceeding to ref-ids....')
|
||||
|
||||
self._writeFile(tocFilePath, TOC_END)
|
||||
self._writeFile(opfFilePath, CONTENT_END_MANIFEST)
|
||||
|
||||
for chapterId in ids:
|
||||
self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)
|
||||
|
||||
self._writeFile(opfFilePath, CONTENT_END)
|
||||
|
||||
self._closeFiles()
|
||||
|
||||
filename = self.directory + self.getFormatExt()
|
||||
|
||||
zipdata = zipdir.inMemoryZip(self.files)
|
||||
|
||||
if self.writeToFile:
|
||||
f = open(filename, 'wb')
|
||||
f.write(zipdata.getvalue())
|
||||
f.close()
|
||||
else:
|
||||
self.output = zipdata
|
||||
|
||||
# zipdir.toZip(filename, self.directory)
|
||||
|
||||
def unirepl(match):
|
||||
"Return the unicode string for a decimal number"
|
||||
if match.group(1)=='x':
|
||||
radix=16
|
||||
else:
|
||||
radix=10
|
||||
value = int(match.group(2), radix )
|
||||
return unichr(value)
|
||||
|
||||
def replaceNumberEntities(data):
|
||||
p = re.compile(r'&#(x?)(\d+);')
|
||||
return p.sub(unirepl, data)
|
||||
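A worked example of what unirepl()/replaceNumberEntities() above do, restated standalone so it can be run on its own (the sample string is made up; unichr becomes chr on Python 3):

import re

def unirepl(match):
    # decimal by default, hex when the reference is written &#x...;
    radix = 16 if match.group(1) == 'x' else 10
    return unichr(int(match.group(2), radix))

print(re.sub(r'&#(x?)(\d+);', unirepl, u'it&#8217;s &#x2014; done'))
# -> u'it\u2019s \u2014 done'  (right single quote, em dash)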
|
||||
def replaceNotEntities(data):
|
||||
# not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
|
||||
# (or equiv), SGMLParser, entityref
|
||||
p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
|
||||
return p.sub(r'&\1', data)
|
||||
|
||||
def removeEntities(text):
|
||||
# replace numeric versions of [&<>] with named versions.
|
||||
|
||||
if text is None:
|
||||
return text
|
||||
try:
|
||||
t = text.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
try:
|
||||
t = text.encode ('ascii', 'xmlcharrefreplace')
|
||||
except UnicodeEncodeError, e:
|
||||
t = text
|
||||
text = t
|
||||
text = re.sub(r'&#0*38;','&amp;',text)
text = re.sub(r'&#0*60;','&lt;',text)
text = re.sub(r'&#0*62;','&gt;',text)
|
||||
|
||||
# replace remaining &#xx; entities with unicode value, such as &#39; -> '
|
||||
text = replaceNumberEntities(text)
|
||||
|
||||
# replace several named entities with character, such as — -> -
|
||||
# see constants.py for the list.
|
||||
# reverse sort will put entities with ; before the same one without, when valid.
|
||||
for e in reversed(sorted(entities.keys())):
|
||||
v = entities[e]
|
||||
try:
|
||||
text = text.replace(e, v)
|
||||
except UnicodeDecodeError, ex:
|
||||
# for the pound symbol in constants.py
|
||||
text = text.replace(e, v.decode('utf-8'))
|
||||
|
||||
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
|
||||
# entities terribly well and inserts (;) after something that
|
||||
# it thinks might be an entity. AT&T becomes AT&T; All of my
|
||||
# attempts to fix this by changing the input to
|
||||
# BeautifulStoneSoup break something else instead. But at
|
||||
# this point, there should be *no* real entities left, so find
|
||||
# these not-entities and removing them here should be safe.
|
||||
text = replaceNotEntities(text)
|
||||
|
||||
# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
|
||||
|
||||
return text
|
||||
|
||||
def makeAcceptableFilename(text):
|
||||
return re.sub('[^a-zA-Z0-9_-]+','',removeEntities(text).replace(" ", "_").replace(":","_"))
|
||||
|
||||
def commaGroups(s):
|
||||
groups = []
|
||||
while s and s[-1].isdigit():
|
||||
groups.append(s[-3:])
|
||||
s = s[:-3]
|
||||
return s + ','.join(reversed(groups))
|
||||
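For reference, the two helpers that close output.py behave like this (illustrative values):

commaGroups('1234567')                     -> '1,234,567'
commaGroups('987')                         -> '987'
makeAcceptableFilename('A Story: Part 2!') -> 'A_Story__Part_2'

commaGroups() only groups the trailing run of digits, and makeAcceptableFilename() maps spaces and colons to underscores before stripping anything outside [a-zA-Z0-9_-].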
367
fanficdownloader/potionsNsnitches.py
Normal file
367
fanficdownloader/potionsNsnitches.py
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copied from twilighted.py because the site is almost the same;
# of course, now that we're trying to scrape more detail about the
# story, there were differences in how headers are displayed.
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
class PotionsNSnitches(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.password = ''
|
||||
self.login='sigizmund'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Harry Potter')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-pns_'
|
||||
|
||||
self.chapurl = False
|
||||
ss=self.url.split('?')
|
||||
if ss is not None and len(ss) > 1:
|
||||
sss = ss[1].replace('&amp;','&').split('&')
|
||||
if sss is not None and len(sss) > 0:
|
||||
ssss = sss[0].split('=')
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
|
||||
self.storyId = ssss[1]
|
||||
if len(sss) > 1:
|
||||
ssss = sss[1].split('=')
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
|
||||
self.chapurl = True
|
||||
|
||||
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
|
||||
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
|
||||
def reqLoginData(self, data):
|
||||
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def _fillCharacters(self, strlist, idx, maxlen):
|
||||
ii = idx
|
||||
while ii < maxlen:
|
||||
chara = strlist[ii].strip()
|
||||
if len(chara) > 0:
|
||||
if chara.find(':') != -1:
|
||||
return (ii-1)
|
||||
elif chara.find(',') == -1:
|
||||
self.addCharacter (chara)
|
||||
ii = ii + 1
|
||||
return (ii)
|
||||
|
||||
def _buildGenre(self, strlist, idx, maxlen):
|
||||
self.genre = ''
|
||||
ii = idx
|
||||
while ii < maxlen:
|
||||
genre = strlist[ii].strip()
|
||||
if len(genre) > 0:
|
||||
if genre.find(':') != -1:
|
||||
return (ii-1)
|
||||
elif genre.find(',') != -1:
|
||||
genre = ', '
|
||||
else:
|
||||
self.addSubject (genre)
|
||||
self.genre = self.genre + genre
|
||||
ii = ii + 1
|
||||
return (ii)
|
||||
|
||||
def _buildCategory(self, strlist, idx, maxlen):
|
||||
self.category = ''
|
||||
ii = idx
|
||||
while ii < maxlen:
|
||||
cat = strlist[ii].strip()
|
||||
if len(cat) > 0:
|
||||
if cat.find(':') != -1:
|
||||
return (ii-1)
|
||||
elif cat.find(',') != -1:
|
||||
cat = ', '
|
||||
else:
|
||||
self.addSubject (cat)
|
||||
self.category = self.category + cat
|
||||
ii = ii + 1
|
||||
return (ii)
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1'
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
self.storyName = ''
|
||||
self.authorName = ''
|
||||
self.storyId = '0'
|
||||
title = soup.find('title').string
|
||||
if title is not None and len(title) > 0:
|
||||
logging.debug('Title: %s' % title)
|
||||
ss = title.split(' by ')
|
||||
if ss is not None and len(ss) > 1:
|
||||
self.storyName = ss[0].strip()
|
||||
self.authorName = ss[1].strip()
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
|
||||
if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
|
||||
result.append((url,chaptitle.string))
|
||||
else:
|
||||
result.append((url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
url = self.url + "&index=1"
|
||||
data = self.opener.open(url).read()
|
||||
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
pgt = soup.find('div', {'id' : 'pagetitle'})
|
||||
#logging.debug('pagetitle: %s' % pgt)
|
||||
pgtAs = pgt.findAll('a')
|
||||
#logging.debug('pgtAs: %s' % pgtAs)
|
||||
for a in pgtAs:
|
||||
if a['href'].find('viewstory.php') != -1:
|
||||
(u1, self.storyId) = a['href'].split('=')
|
||||
self.storyName = a.string
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif a['href'].find('viewuser.php') != -1:
|
||||
self.authorName = a.string
|
||||
self.authorURL = 'http://' + self.host + '/' + a['href']
|
||||
(u1, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
output = soup.find('div', {'id' : 'output'})
|
||||
#logging.debug('output: %s' % unicode(output))
|
||||
if output is not None and len(unicode(output)) > 1:
|
||||
s2 = re.split ('<[^>]+>', unicode(output))
|
||||
#logging.debug('s2=%s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll:
|
||||
if s2[ii] == 'Summary:' and ii+1 < ll:
|
||||
self.storyDescription = s2[ii+1].strip()
|
||||
logging.debug('self.storyDescription: %s' % self.storyDescription)
|
||||
break;
|
||||
ii = ii+1
|
||||
|
||||
cnt = soup.find('div', {'class' : 'content'})
|
||||
#logging.debug('content: %s' % cnt)
|
||||
cnttd = cnt.findAll('td')
|
||||
#logging.debug('cnttd: %s' % cnttd)
|
||||
for td in cnttd:
|
||||
#logging.debug('td: %s' % unicode(td))
|
||||
ss = unicode(td).replace('\n','').replace('\r','').replace(' ', ' ')
|
||||
if len(ss) > 1:
|
||||
s2 = re.split ('<[^>]+>', ss)
|
||||
#logging.debug('s2=%s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll-1:
|
||||
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
||||
skey = s2[ii].strip()
|
||||
ii = ii+1
|
||||
if skey == 'Rated:':
|
||||
self.storyRating = s2[ii].strip()
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
ii = ii + 1
|
||||
elif skey == 'Chapters:':
|
||||
self.numChapters = s2[ii].strip()
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
ii = ii + 1
|
||||
elif skey == 'Characters:':
|
||||
ii = self._fillCharacters(s2, ii, ll)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
ii = ii + 1
|
||||
elif skey == 'Genres:':
|
||||
ii = self._buildGenre(s2, ii, ll)
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Categories:':
|
||||
ii = self._buildCategory(s2, ii, ll)
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Completed:':
|
||||
if s2[ii].strip(' ') == "No":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii + 1
|
||||
elif skey == 'Word count:':
|
||||
self.numWords = s2[ii].strip()
|
||||
if self.numWords is None or len(self.numWords) == 0:
|
||||
self.numWords = '0'
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
ii = ii + 1
|
||||
elif skey == 'Takes Place:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Awards:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Series:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Read:':
|
||||
ii = ii + 1
|
||||
elif skey == 'Warnings:':
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
|
||||
tls = soup.findAll('div', {'style' : 'text-align: center;'})
|
||||
for tl in tls:
|
||||
#logging.debug('tl: %s' % tl)
|
||||
ss = unicode(tl).replace('\n','').replace('\r','').replace(' ', ' ')
|
||||
if ss.find('Published:') != -1:
|
||||
s2 = re.split ('<[^>]+>', ss)
|
||||
#logging.debug('s2: %s' % s2)
|
||||
ii = 0
|
||||
ll = len(s2)
|
||||
while ii < ll-1:
|
||||
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
||||
skey = s2[ii].strip()
|
||||
#logging.debug('skey: %s' % skey)
|
||||
ii = ii+1
|
||||
if skey == 'Published:':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ii = ii + 1
|
||||
elif skey == 'Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
|
||||
if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
|
||||
logging.error('self.storyName is empty!! Exiting!')
|
||||
exit(1)
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.debug('Getting data from: %s' % url)
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
# need to do this, because for some reason the <br /> tag in the story causes problems
|
||||
data = data.replace('<br />', ' SOMETHING_BR ')
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
# put the <br /> tags back in..
|
||||
text = div.__str__('utf8').replace(' SOMETHING_BR ','<br />')
|
||||
return text
|
||||
|
||||
|
||||
class PotionsNSnitches_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testLoginWorks(self):
|
||||
pass
|
||||
|
||||
def testGetUrlsWorks(self):
|
||||
url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
|
||||
self.assertEquals(32, len(PotionsNSnitches(url).extractIndividualUrls()))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
10
fanficdownloader/readme.txt
Normal file
@@ -0,0 +1,10 @@
To use, do:

python downloader.py <url> (epub|html|text|mobi)

Eg:

python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo epub

This tool uses Python 2.5.2, but should work with newer versions of Python.
64
fanficdownloader/story.py
Normal file
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-

from htmlcleanup import conditionalRemoveEntities

class Story:

    def __init__(self):
        self.metadata = {}
        self.chapters = [] # chapters will be tuples of (title,html)
        self.listables = {} # some items (extratags, category, warnings & genres) are also kept as lists.

    def setMetadata(self, key, value):
        self.metadata[key]=conditionalRemoveEntities(value)

    def getMetadataRaw(self,key):
        if self.metadata.has_key(key):
            return self.metadata[key]

    def getMetadata(self, key):
        if self.getLists().has_key(key):
            return ', '.join(self.getList(key))
        if self.metadata.has_key(key):
            value = self.metadata[key]
            if value:
                if key == "numWords":
                    value = commaGroups(value)
                if key == "dateCreated":
                    value = value.strftime("%Y-%m-%d %H:%M:%S")
                if key == "datePublished" or key == "dateUpdated":
                    value = value.strftime("%Y-%m-%d")
            return value

    def addToList(self,listname,value):
        if not self.listables.has_key(listname):
            self.listables[listname]=[]
        # prevent duplicates.
        if not value in self.listables[listname]:
            self.listables[listname].append(conditionalRemoveEntities(value))

    def getList(self,listname):
        if not self.listables.has_key(listname):
            return []
        return self.listables[listname]

    def getLists(self):
        return self.listables

    def addChapter(self, title, html):
        self.chapters.append( (title,html) )

    def getChapters(self):
        "Chapters will be tuples of (title,html)"
        return self.chapters

    def __str__(self):
        return "Metadata: " +str(self.metadata) + "\nListables: " +str(self.listables) #+ "\nChapters: "+str(self.chapters)

def commaGroups(s):
    groups = []
    while s and s[-1].isdigit():
        groups.append(s[-3:])
        s = s[:-3]
    return s + ','.join(reversed(groups))
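A minimal usage sketch of the Story container above (illustrative, not part of
the commit; it assumes the htmlcleanup module that story.py imports is on the
path, and the sample values are invented):

    from story import Story, commaGroups

    story = Story()
    story.setMetadata('title', 'A Fox in Tokyo')
    story.setMetadata('numWords', '12345')
    story.addToList('genre', 'Drama')
    story.addToList('genre', 'Drama')        # duplicate, silently ignored
    story.addChapter('Chapter 1', '<p>text</p>')

    print story.getMetadata('genre')         # -> Drama
    print story.getMetadata('numWords')      # -> 12,345
    print len(story.getChapters())           # -> 1
    print commaGroups('1234567')             # -> 1,234,567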
316
fanficdownloader/twilighted.py
Normal file
|
|
@@ -0,0 +1,316 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
import twipassword
|
||||
|
||||
class Twilighted(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.password=twipassword.password
|
||||
self.login='sigizmund'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Twilight')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = 'Fanfiction'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-tw_'
|
||||
|
||||
self.chapurl = False
|
||||
ss=self.url.split('?')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
sss = ss[1].replace('&','&').split('&')
|
||||
logging.debug('sss=%s' % sss)
|
||||
if sss is not None and len(sss) > 0:
|
||||
ssss = sss[0].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
|
||||
self.storyId = ssss[1]
|
||||
if len(sss) > 1:
|
||||
ssss = sss[1].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
|
||||
self.chapurl = True
|
||||
|
||||
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
logging.debug("Created Twilighted: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
|
||||
def reqLoginData(self, data):
|
||||
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def requiresLogin(self, url = None):
|
||||
return True
|
||||
|
||||
def performLogin(self, url = None):
|
||||
data = {}
|
||||
|
||||
data['penname'] = self.login
|
||||
data['password'] = self.password
|
||||
data['cookiecheck'] = '1'
|
||||
data['submit'] = 'Submit'
|
||||
|
||||
urlvals = u.urlencode(data)
|
||||
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||
logging.debug("Will now login to URL %s" % loginUrl)
|
||||
|
||||
req = self.opener.open(loginUrl, urlvals)
|
||||
|
||||
d = req.read().decode('utf-8')
|
||||
|
||||
if self.reqLoginData(d) :
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1'
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
title = soup.find('title').string
|
||||
logging.debug('Title: %s' % title)
|
||||
self.storyName = title.split(' by ')[0].strip()
|
||||
self.authorName = title.split(' by ')[1].strip()
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
url = self.url + "&index=1"
|
||||
data = self.opener.open(url).read()
|
||||
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
metas = soup.findAll('meta')
|
||||
|
||||
for meta in metas:
|
||||
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
|
||||
#logging.debug('Meta: %s' % meta)
|
||||
if 'content' in meta._getAttrMap():
|
||||
s1 = bs.BeautifulStoneSoup(meta['content'])
|
||||
ps = s1.findAll('p')
|
||||
if len(ps) > 0:
|
||||
self.storyDescription = ps[0]
|
||||
#logging.debug('self.storyDescription=%s' % self.storyDescription.replace("\n"," ").replace('\r',''))
|
||||
else:
|
||||
divs = meta.findAll('div')
|
||||
#logging.debug('Divs: %s' % divs)
|
||||
|
||||
for div in divs:
|
||||
#logging.debug('Div: %s' % div)
|
||||
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
|
||||
#logging.debug('Div PAGETITLE: %s' % div)
|
||||
allA = div.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap():
|
||||
if a['href'].find('viewstory.php?sid=') != -1:
|
||||
str1 = a.string
|
||||
(vs, self.storyId) = a['href'].split('=')
|
||||
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
|
||||
if a['href'].find('viewuser.php?uid=') != -1:
|
||||
str1 = a.string
|
||||
(vs, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
|
||||
logging.debug('self.authorURL=%s' % self.authorURL)
|
||||
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
|
||||
#logging.debug('Div CONTENT: %s' % div)
|
||||
brs = div.findAll('br')
|
||||
for br in brs:
|
||||
buf = unicode(br).encode('utf-8')
|
||||
strs = re.split ('<[^>]+>', buf)
|
||||
#logging.debug('BUF: %s' % strs)
|
||||
ii = 2
|
||||
stlen = len(strs)
|
||||
while stlen > ii+1:
|
||||
if len(strs[ii]) == 0:
|
||||
ii = ii+1
|
||||
continue
|
||||
if strs[ii] == 'Categories:':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
if len(self.genre) > 0:
|
||||
self.genre = self.genre + ', '
|
||||
self.genre = strs[ii].strip(' ')
|
||||
if len(self.category) == 0:
|
||||
self.category = strs[ii].strip(' ')
|
||||
self.addSubject(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
if strs[ii] == 'Characters: ':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
self.addCharacter(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif strs[ii] == 'Completed:':
|
||||
if strs[ii+1].strip(' ') == "No":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii+2
|
||||
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
||||
elif strs[ii] == 'Rated:':
|
||||
self.storyRating = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif strs[ii] == 'Series:':
|
||||
self.storySeries = strs[ii+1].strip(' ')
|
||||
if self.storySeries == 'None':
|
||||
self.storySeries = ''
|
||||
ii = ii+2
|
||||
logging.debug('self.storySeries=%s' % self.storySeries)
|
||||
elif strs[ii] == 'Chapters: ':
|
||||
self.numChapters = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif strs[ii] == 'Word count:':
|
||||
self.numWords = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif strs[ii] == ' Published: ':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif strs[ii] == 'Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
|
||||
ii = ii+2
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.debug('Getting data from: %s' % url)
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
class Twilighted_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testLoginWorks(self):
|
||||
url = 'http://www.twilighted.net/viewstory.php?sid=10004'
|
||||
self.assertTrue(Twilighted(url).performLogin())
|
||||
|
||||
def testGetUrlsWorks(self):
|
||||
url = 'http://www.twilighted.net/viewstory.php?sid=10004'
|
||||
self.assertEquals(32, len(Twilighted(url).extractIndividualUrls()))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
5
fanficdownloader/twipassword.py
Normal file
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-

# This is really for the web version. download.py will ask.
password='somepass'
twiwritepassword='otherpass'
280
fanficdownloader/twiwrite.py
Normal file
|
|
@@ -0,0 +1,280 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
import twipassword
|
||||
|
||||
class Twiwrite(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.password=twipassword.twiwritepassword
|
||||
self.login='BobsClue'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Twiwrite')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = 'Fanfiction'
|
||||
self.storyStatus = 'Unknown'
|
||||
self.storyRating = 'Unknown'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-twrt_'
|
||||
|
||||
self.chapurl = False
|
||||
ss=self.url.split('?')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
sss = ss[1].replace('&','&').split('&')
|
||||
logging.debug('sss=%s' % sss)
|
||||
if sss is not None and len(sss) > 0:
|
||||
ssss = sss[0].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
|
||||
self.storyId = ssss[1]
|
||||
if len(sss) > 1:
|
||||
ssss = sss[1].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
|
||||
self.chapurl = True
|
||||
|
||||
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
logging.debug("Created Twiwrite: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
|
||||
def reqLoginData(self, data):
|
||||
if data.find('Registered Users Only') != -1 or data.find('There is no such account on our website') != -1:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def requiresLogin(self, url = None):
|
||||
return False
|
||||
|
||||
def performLogin(self, url = None):
|
||||
data = {}
|
||||
|
||||
data['penname'] = self.login
|
||||
data['password'] = self.password
|
||||
data['cookiecheck'] = '1'
|
||||
data['submit'] = 'Submit'
|
||||
|
||||
urlvals = u.urlencode(data)
|
||||
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||
logging.debug("Will now login to URL %s" % loginUrl)
|
||||
|
||||
req = self.opener.open(loginUrl, urlvals)
|
||||
|
||||
d = req.read().decode('utf-8')
|
||||
|
||||
if self.reqLoginData(d) :
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1&ageconsent=ok&warning=1'
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
if self.reqLoginData(data):
|
||||
raise FailedToDownload("Error downloading Story: %s! Login Failed!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
#<div id="pagetitle"><a href="viewstory.php?sid=280">Twilight for Dummies</a> by <a href="viewuser.php?uid=61">The Chick Norris</a> </div>
|
||||
|
||||
div = soup.find('div',{'id':'pagetitle'})
|
||||
titlea = div.find('a', href=re.compile(r"viewstory.php"))
|
||||
self.storyName = titlea.string
|
||||
|
||||
authora = div.find('a', href=re.compile(r"viewuser.php"))
|
||||
self.authorName = authora.string
|
||||
self.authorId= authora['href'].split('=')[1]
|
||||
self.authorURL = 'http://'+self.host+'/'+authora['href']
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s&ageconsent=ok&warning=1" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
url = self.url + "&index=1&ageconsent=ok&warning=1"
|
||||
data = self.opener.open(url).read()
|
||||
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
value = labelspan.nextSibling
|
||||
label = labelspan.string
|
||||
|
||||
if 'Rated' in label:
|
||||
self.storyRating = value.strip()
|
||||
|
||||
if 'Chapters' in label:
|
||||
self.numChapters = value.strip()
|
||||
|
||||
if 'Word count' in label:
|
||||
self.numWords = value.strip()
|
||||
|
||||
if 'Categories' in label:
|
||||
cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
|
||||
catstext = [cat.string for cat in cats]
|
||||
self.category = ', '.join(catstext)
|
||||
for cat in catstext:
|
||||
self.addSubject(cat.string)
|
||||
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.addSubject(genre.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
|
||||
if 'Published' in label:
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y")))
|
||||
|
||||
if 'Updated' in label:
|
||||
# there's a stray [ at the end.
|
||||
value = value[0:-1]
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value.strip(), "%B %d, %Y")))
|
||||
|
||||
# the only things in <p> tags in <div class='content'> are the parts of the summary.
|
||||
divcontent = soup.find('div',{'class':'content'})
|
||||
|
||||
# metadesc = soup.find('meta',{'name':'description'})
|
||||
# contentsoup = bs.BeautifulStoneSoup(metadesc['content'])
|
||||
ps = divcontent.findAll('p')
|
||||
pstext=[]
|
||||
for p in ps:
|
||||
if p.string:
|
||||
s = p.string.replace(' ',' ').strip()
|
||||
if s:
|
||||
pstext.append(p.string)
|
||||
|
||||
self.storyDescription = ' '.join(pstext)
|
||||
print "self.storyDescription: %s"%self.storyDescription
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.debug('Getting data from: %s' % url)
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
class Twiwrite_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testLoginWorks(self):
|
||||
url = 'http://www.twiwrite.net/viewstory.php?sid=117'
|
||||
self.assertTrue(Twiwrite(url).performLogin())
|
||||
|
||||
def testGetUrlsWorks(self):
|
||||
url = 'http://www.twiwrite.net/viewstory.php?sid=117'
|
||||
self.assertEquals(36, len(Twiwrite(url).extractIndividualUrls()))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
225
fanficdownloader/whofic.py
Normal file
|
|
@@ -0,0 +1,225 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import os.path
|
||||
import urllib as u
|
||||
import logging
|
||||
import pprint as pp
|
||||
import unittest
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
class Whofic(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('Fanfiction')
|
||||
self.subjects.append ('Doctor Who')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = ''
|
||||
self.category = ''
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.outputName = ''
|
||||
self.outputStorySep = '-whof_'
|
||||
|
||||
self.chapurl = False
|
||||
ss=self.url.split('?')
|
||||
logging.debug('ss=%s' % ss)
|
||||
if ss is not None and len(ss) > 1:
|
||||
sss = ss[1].replace('&','&').split('&')
|
||||
logging.debug('sss=%s' % sss)
|
||||
if sss is not None and len(sss) > 0:
|
||||
ssss = sss[0].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
|
||||
self.storyId = ssss[1]
|
||||
if len(sss) > 1:
|
||||
ssss = sss[1].split('=')
|
||||
logging.debug('ssss=%s' % ssss)
|
||||
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
|
||||
self.chapurl = True
|
||||
|
||||
self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
|
||||
logging.debug("Created Whofic: url=%s" % (self.url))
|
||||
|
||||
def requiresLogin(self, url = None):
|
||||
return False
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
url = self.url + '&chapter=1'
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise StoryDoesNotExist("Problem reading story URL " + url + "!")
|
||||
|
||||
soup = None
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
|
||||
|
||||
title = soup.find('title').string
|
||||
title = title.split('::')[1].strip()
|
||||
logging.debug('Title: %s' % title)
|
||||
self.storyName = title.split(' by ')[0].strip()
|
||||
self.authorName = title.split(' by ')[1].strip()
|
||||
|
||||
for a in soup.findAll('a'):
|
||||
if a['href'].startswith('viewuser.php'):
|
||||
self.authorId = a['href'].split('=')[1]
|
||||
self.authorURL = 'http://'+self.host+'/'+a['href']
|
||||
|
||||
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
||||
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
# just in case there's tags, like <i> in chapter titles.
|
||||
title = "%s" % o
|
||||
title = re.sub('<[^>]+>','',title)
|
||||
result.append((url,title))
|
||||
|
||||
## Whofic.com puts none of the meta data in the chapters or
|
||||
## even the story chapter index page. Need to scrape the
|
||||
## author page to find it.
|
||||
data = self.opener.open(self.authorURL).read()
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
|
||||
# find this story in the list, parse its metadata based on
|
||||
# lots of assumptions, since there's little tagging.
|
||||
for a in soup.findAll('a'):
|
||||
if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
|
||||
metadata = a.findParent('td')
|
||||
metadatachunks = metadata.__str__('utf8').split('<br />')
|
||||
# process metadata for this story.
|
||||
self.storyDescription = metadatachunks[1].strip()
|
||||
|
||||
# the stuff with ' - ' separators
|
||||
moremeta = metadatachunks[2]
|
||||
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
|
||||
|
||||
moremetaparts = moremeta.split(' - ')
|
||||
|
||||
self.category = moremetaparts[0]
|
||||
for cat in self.category.split(', '):
|
||||
self.addSubject(cat.strip())
|
||||
|
||||
self.storyRating = moremetaparts[1]
|
||||
|
||||
for warn in moremetaparts[2].split(', '):
|
||||
self.addSubject(warn.strip())
|
||||
|
||||
self.genre = moremetaparts[3]
|
||||
|
||||
# the stuff with ' - ' separators *and* names
|
||||
moremeta = metadatachunks[5]
|
||||
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
|
||||
|
||||
moremetaparts = moremeta.split(' - ')
|
||||
|
||||
for part in moremetaparts:
|
||||
(name,value) = part.split(': ')
|
||||
name=name.strip()
|
||||
value=value.strip()
|
||||
if name == 'Published':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
|
||||
if name == 'Updated':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
|
||||
if name == 'Completed' and value == 'Yes':
|
||||
self.storyStatus = name
|
||||
if name == 'Word Count':
|
||||
self.numWords = value
|
||||
|
||||
break
|
||||
|
||||
self.numChapters = len(result)
|
||||
|
||||
return result
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.debug('Getting data from: %s' % url)
|
||||
|
||||
data = ''
|
||||
try:
|
||||
data = self.opener.open(url).read()
|
||||
except Exception, e:
|
||||
data = ''
|
||||
logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
|
||||
if data is None:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
|
||||
|
||||
soup = None
|
||||
try:
|
||||
# I really wish I knew why adastra needs the selfClosingTags to make <br /> work, but ficwad doesn't.
|
||||
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES, selfClosingTags=('br','hr'))
|
||||
except:
|
||||
logging.info("Failed to decode: <%s>" % data)
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
|
||||
|
||||
# hardly a great identifier, I know, but whofic really doesn't
|
||||
# give us anything better to work with.
|
||||
span = soup.find('span', {'style' : 'font-size: 100%;'})
|
||||
|
||||
if None == span:
|
||||
raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return span.__str__('utf8')
|
||||
|
||||
|
||||
class Whofic_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
pass
|
||||
|
||||
def testGetUrlsWorks(self):
|
||||
url = 'http://www.whofic.com/viewstory.php?sid=37139'
|
||||
self.assertEquals(6, len(Whofic(url).extractIndividualUrls()))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
16
fanficdownloader/writers/__init__.py
Normal file
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-

## This could (should?) use a dynamic loader like adapters, but for
## now, it's static, since there's so few of them.

from writers.writer_html import HTMLWriter
from writers.writer_txt import TextWriter
from writers.writer_epub import EpubWriter

def getWriter(type,config,story):
    if type == "html":
        return HTMLWriter(config,story)
    if type == "txt":
        return TextWriter(config,story)
    if type == "epub":
        return EpubWriter(config,story)
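A hedged call sketch of the selector above (illustrative only; `config` is
assumed to be whatever Configurable consumes and `story` a populated Story,
neither of which is constructed here):

    from writers import getWriter

    writer = getWriter('epub', config, story)   # or 'html' / 'txt'
    writer.writeStory()   # falls back to ${title}-${siteabbrev}_${storyId}.epub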
168
fanficdownloader/writers/base_writer.py
Normal file
|
|
@@ -0,0 +1,168 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import os.path
|
||||
import string
|
||||
import StringIO
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_DEFLATED
|
||||
|
||||
from story import Story
|
||||
from configurable import Configurable
|
||||
from htmlcleanup import removeEntities, removeAllEntities, stripHTML
|
||||
|
||||
from adapters.base_adapter import *
|
||||
|
||||
class BaseStoryWriter(Configurable):
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'base'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.bse'
|
||||
|
||||
def __init__(self, config, story):
|
||||
Configurable.__init__(self, config)
|
||||
self.addConfigSection(self.getFormatName())
|
||||
self.story = story
|
||||
self.titleLabels = {
|
||||
'category':'Category',
|
||||
'genre':'Genre',
|
||||
'status':'Status',
|
||||
'datePublished':'Published',
|
||||
'dateUpdated':'Updated',
|
||||
'dateCreated':'Packaged',
|
||||
'rating':'Rating',
|
||||
'warnings':'Warnings',
|
||||
'numChapters':'Chapters',
|
||||
'numWords':'Words',
|
||||
'site':'Publisher',
|
||||
'storyId':'Story ID',
|
||||
'authorId':'Author ID',
|
||||
'extratags':'Extra Tags',
|
||||
'title':'Title',
|
||||
'storyUrl':'Story URL',
|
||||
'description':'Summary',
|
||||
'author':'Author',
|
||||
'authorUrl':'Author URL',
|
||||
'formatname':'File Format',
|
||||
'formatext':'File Extension',
|
||||
}
|
||||
self.story.setMetadata('formatname',self.getFormatName())
|
||||
self.story.setMetadata('formatext',self.getFormatExt())
|
||||
|
||||
def getOutputFileName(self):
|
||||
return self.getFileName(self.getConfig('output_filename'))
|
||||
|
||||
def getZipFileName(self):
|
||||
return self.getFileName(self.getConfig('zip_filename'),extension=".zip")
|
||||
|
||||
def getFileName(self,template,extension="${formatext}"):
|
||||
values = self.story.metadata
|
||||
fallback=False
|
||||
# fall back default:
|
||||
if not template:
|
||||
template="${title}-${siteabbrev}_${storyId}${formatext}"
|
||||
fallback=True
|
||||
|
||||
# Add extension if not already included.
|
||||
if extension not in template:
|
||||
template+=extension
|
||||
|
||||
if fallback or self.getConfig('safe_filename'):
|
||||
values={}
|
||||
pattern = re.compile(r"[^a-zA-Z0-9_\. \[\]\(\)&'-]+")
|
||||
for k in self.story.metadata.keys():
|
||||
values[k]=re.sub(pattern,'_', removeAllEntities(self.story.getMetadata(k)))
|
||||
|
||||
return string.Template(template).substitute(values).encode('utf8')
|
||||
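# Illustrative note (not part of the original commit): getFileName() drives both
# output_filename and zip_filename through string.Template, so an assumed
# ini-style setting such as
#     output_filename: books/${author}/${title}${formatext}
# expands, for metadata {'author': 'Alice', 'title': 'Example', 'formatext': '.epub'},
# to 'books/Alice/Example.epub'. With safe_filename (or the fallback template),
# each substituted value is first scrubbed by the pattern above.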
|
||||
def _write(self, out, text):
|
||||
out.write(text.encode('utf8'))
|
||||
|
||||
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None):
|
||||
"""
|
||||
Write the title page, but only include entries that there's
|
||||
metadata for. START, ENTRY and END are expected to already be
|
||||
string.Template(). START and END are expected to use the same
|
||||
names as Story.metadata, but ENTRY should use label and value.
|
||||
"""
|
||||
if self.getConfig("include_titlepage"):
|
||||
self._write(out,START.substitute(self.story.metadata))
|
||||
|
||||
if WIDE_ENTRY==None:
|
||||
WIDE_ENTRY=ENTRY
|
||||
|
||||
titleEntriesList = self.getConfigList("titlepage_entries")
|
||||
wideTitleEntriesList = self.getConfigList("wide_titlepage_entries")
|
||||
|
||||
for entry in titleEntriesList:
|
||||
if entry in self.titleLabels:
|
||||
if self.story.getMetadata(entry):
|
||||
if entry in wideTitleEntriesList:
|
||||
TEMPLATE=WIDE_ENTRY
|
||||
else:
|
||||
TEMPLATE=ENTRY
|
||||
self._write(out,TEMPLATE.substitute({'label':self.titleLabels[entry],
|
||||
'value':self.story.getMetadata(entry)}))
|
||||
|
||||
self._write(out,END.substitute(self.story.metadata))
|
||||
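# Illustrative sketch (not part of the original commit): a plain-text writer
# could hand this method templates along the lines of
#   START = string.Template('${title} by ${author}\n\n')
#   ENTRY = string.Template('${label}: ${value}\n')
#   END   = string.Template('\n')
# START/END are filled from Story.metadata, while ENTRY receives the
# label/value pairs built from titleLabels above.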
|
||||
def writeTOCPage(self, out, START, ENTRY, END):
|
||||
"""
|
||||
Write the Table of Contents page. START, ENTRY and END are expected to already be
|
||||
string.Template(). START and END are expected to use the same
|
||||
names as Story.metadata, but ENTRY should use index and chapter.
|
||||
"""
|
||||
# Only do TOC if there's more than one chapter and it's configured.
|
||||
if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage"):
|
||||
self._write(out,START.substitute(self.story.metadata))
|
||||
|
||||
for index, (title,html) in enumerate(self.story.getChapters()):
|
||||
self._write(out,ENTRY.substitute({'chapter':title, 'index':"%04d"%(index+1)}))
|
||||
|
||||
self._write(out,END.substitute(self.story.metadata))
|
||||
|
||||
# if no outstream is given, write to file.
|
||||
def writeStory(self,outstream=None):
|
||||
self.addConfigSection(self.story.getMetadata('site'))
|
||||
self.addConfigSection(self.story.getMetadata('site')+":"+self.getFormatName())
|
||||
for tag in self.getConfigList("extratags"):
|
||||
self.story.addToList("extratags",tag)
|
||||
|
||||
zipfilename=self.getZipFileName()
|
||||
filename=self.getOutputFileName()
|
||||
|
||||
if self.getConfig('zip_output'):
|
||||
outfilename=zipfilename
|
||||
else:
|
||||
outfilename=filename
|
||||
|
||||
if not outstream:
|
||||
if self.getConfig('make_directories'):
|
||||
path=""
|
||||
dirs = os.path.dirname(outfilename).split('/')
|
||||
for dir in dirs:
|
||||
path+=dir+"/"
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path) ## os.makedirs() doesn't work in 2.5.2?
|
||||
outstream = open(outfilename,"wb")
|
||||
|
||||
if self.getConfig('zip_output'):
|
||||
out = StringIO.StringIO()
|
||||
self.writeStoryImpl(out)
|
||||
zipout = ZipFile(outstream, 'w', compression=ZIP_DEFLATED)
|
||||
zipout.writestr(filename,out.getvalue())
|
||||
zipout.close()
|
||||
out.close()
|
||||
else:
|
||||
self.writeStoryImpl(outstream)
|
||||
|
||||
outstream.close()
|
||||
|
||||
def writeStoryImpl(self, out):
|
||||
"Must be overriden by sub classes."
|
||||
pass
|
||||
|
||||
404
fanficdownloader/writers/writer_epub.py
Normal file
|
|
@@ -0,0 +1,404 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import string
|
||||
import StringIO
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
|
||||
## XML isn't as forgiving as HTML, so rather than generate as strings,
|
||||
## use DOM to generate the XML files.
|
||||
from xml.dom.minidom import parse, parseString, getDOMImplementation
|
||||
|
||||
from writers.base_writer import *
|
||||
|
||||
class EpubWriter(BaseStoryWriter):
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'epub'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.epub'
|
||||
|
||||
def __init__(self, config, story):
|
||||
BaseStoryWriter.__init__(self, config, story)
|
||||
|
||||
self.EPUB_CSS='''body { margin-left: 2%; margin-right: 2%; margin-top: 2%; margin-bottom: 2%; text-align: justify; }
|
||||
pre { font-size: x-small; }
|
||||
sml { font-size: small; }
|
||||
h1 { text-align: center; }
|
||||
h2 { text-align: center; }
|
||||
h3 { text-align: center; }
|
||||
h4 { text-align: center; }
|
||||
h5 { text-align: center; }
|
||||
h6 { text-align: center; }
|
||||
.CI {
|
||||
text-align:center;
|
||||
margin-top:0px;
|
||||
margin-bottom:0px;
|
||||
padding:0px;
|
||||
}
|
||||
.center {text-align: center;}
|
||||
.cover {text-align: center;}
|
||||
.full {width: 100%; }
|
||||
.quarter {width: 25%; }
|
||||
.smcap {font-variant: small-caps;}
|
||||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
'''
|
||||
|
||||
self.EPUB_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${title} by ${author}</title>
|
||||
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
|
||||
</head>
|
||||
<body>
|
||||
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
|
||||
<div>
|
||||
''')
|
||||
|
||||
self.EPUB_TITLE_ENTRY = string.Template('''
|
||||
<b>${label}:</b> ${value}<br />
|
||||
''')
|
||||
|
||||
self.EPUB_TITLE_PAGE_END = string.Template('''
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${title} by ${author}</title>
|
||||
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
|
||||
</head>
|
||||
<body>
|
||||
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
|
||||
<table class="full">
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_TITLE_ENTRY = string.Template('''
|
||||
<tr><td><b>${label}:</b></td><td>${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_TITLE_WIDE_ENTRY = string.Template('''
|
||||
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_TITLE_PAGE_END = string.Template('''
|
||||
</table>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
self.EPUB_TOC_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${title} by ${author}</title>
|
||||
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<h3>Table of Contents</h3>
|
||||
''')
|
||||
|
||||
self.EPUB_TOC_ENTRY = string.Template('''
|
||||
<a href="file${index}.xhtml">${chapter}</a><br />
|
||||
''')
|
||||
|
||||
self.EPUB_TOC_PAGE_END = string.Template('''
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
self.EPUB_CHAPTER_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${chapter}</title>
|
||||
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
|
||||
</head>
|
||||
<body>
|
||||
<h2>${chapter}</h2>
|
||||
''')
|
||||
|
||||
self.EPUB_CHAPTER_END = string.Template('''
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
def getMetadata(self,key):
|
||||
return removeAllEntities(self.story.getMetadata(key))
|
||||
|
||||
def writeStoryImpl(self, out):
|
||||
|
||||
## Python 2.5 ZipFile is rather more primitive than later
|
||||
## versions. It can operate on a file, or on a StringIO, but
|
||||
## not on an open stream. OTOH, I suspect we would have had
|
||||
## problems with closing and opening again to change the
|
||||
## compression type anyway.
|
||||
zipio = StringIO.StringIO()
|
||||
|
||||
## mimetype must be first file and uncompressed. Python 2.5
|
||||
## ZipFile can't change compression type file-by-file, so we
|
||||
## have to close and re-open
|
||||
outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
|
||||
outputepub.writestr('mimetype','application/epub+zip')
|
||||
outputepub.close()
|
||||
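# Sanity-check sketch (not part of the original commit): the finished epub can
# be verified with the standard zipfile module, e.g.
#   zf = zipfile.ZipFile('story.epub')
#   first = zf.infolist()[0]
#   assert first.filename == 'mimetype'
#   assert first.compress_type == zipfile.ZIP_STORED
# i.e. the uncompressed mimetype entry written above really is the first file.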
|
||||
## Re-open file for content.
|
||||
outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
|
||||
|
||||
## Create META-INF/container.xml file. The only thing it does is
|
||||
## point to content.opf
|
||||
containerdom = getDOMImplementation().createDocument(None, "container", None)
|
||||
containertop = containerdom.documentElement
|
||||
containertop.setAttribute("version","1.0")
|
||||
containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
|
||||
rootfiles = containerdom.createElement("rootfiles")
|
||||
containertop.appendChild(rootfiles)
|
||||
rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
|
||||
"media-type":"application/oebps-package+xml"}))
|
||||
outputepub.writestr("META-INF/container.xml",containerdom.toxml(encoding='utf-8'))
|
||||
del containerdom
|
||||
|
||||
## Epub has two metadata files with real data. We're putting
|
||||
## them in content.opf (pointed to by META-INF/container.xml)
|
||||
## and toc.ncx (pointed to by content.opf)
|
||||
|
||||
## content.opf contains metadata, a 'manifest' list of all
|
||||
## other included files, and another 'spine' list of the items in the
|
||||
## file
|
||||
|
||||
uniqueid= 'fanficdownloader-uid:%s-u%s-s%s' % (
|
||||
self.getMetadata('site'),
|
||||
self.getMetadata('authorId'),
|
||||
self.getMetadata('storyId'))
|
||||
|
||||
contentdom = getDOMImplementation().createDocument(None, "package", None)
|
||||
package = contentdom.documentElement
|
||||
package.setAttribute("version","2.0")
|
||||
package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
|
||||
package.setAttribute("unique-identifier","fanficdownloader-uid")
|
||||
metadata=newTag(contentdom,"metadata",
|
||||
attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
|
||||
"xmlns:opf":"http://www.idpf.org/2007/opf"})
|
||||
package.appendChild(metadata)
|
||||
|
||||
metadata.appendChild(newTag(contentdom,"dc:identifier",
|
||||
text=uniqueid,
|
||||
attrs={"id":"fanficdownloader-uid"}))
|
||||
|
||||
if self.getMetadata('title'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:title",text=self.getMetadata('title')))
|
||||
|
||||
if self.getMetadata('author'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:creator",
|
||||
attrs={"opf:role":"aut"},
|
||||
text=self.getMetadata('author')))
|
||||
|
||||
metadata.appendChild(newTag(contentdom,"dc:contributor",text="fanficdownloader [http://fanficdownloader.googlecode.com]",attrs={"opf:role":"bkp"}))
|
||||
metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
|
||||
|
||||
# published, created, updated, calibre
|
||||
# Leave calling self.story.getMetadataRaw directly in case date format changes.
|
||||
if self.story.getMetadataRaw('datePublished'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:date",
|
||||
attrs={"opf:event":"publication"},
|
||||
text=self.story.getMetadataRaw('datePublished').strftime("%Y-%m-%d")))
|
||||
|
||||
if self.story.getMetadataRaw('dateCreated'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:date",
|
||||
attrs={"opf:event":"creation"},
|
||||
text=self.story.getMetadataRaw('dateCreated').strftime("%Y-%m-%d")))
|
||||
|
||||
if self.story.getMetadataRaw('dateUpdated'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:date",
|
||||
attrs={"opf:event":"modification"},
|
||||
text=self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%d")))
|
||||
metadata.appendChild(newTag(contentdom,"meta",
|
||||
attrs={"name":"calibre:timestamp",
|
||||
"content":self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%dT%H:%M:%S")}))
|
||||
# Last Update tags for Bill.
|
||||
self.story.addToList('lastupdate',self.story.getMetadataRaw('dateUpdated').strftime("Last Update Year/Month: %Y/%m"))
|
||||
self.story.addToList('lastupdate',self.story.getMetadataRaw('dateUpdated').strftime("Last Update: %Y/%m/%d"))
|
||||
|
||||
if self.getMetadata('description'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:description",text=
|
||||
self.getMetadata('description')))
|
||||
|
||||
## listables all go into dc:subject tags, but only if they are configured.
|
||||
for (name,lst) in self.story.getLists().iteritems():
|
||||
if name in self.getConfigList("include_subject_tags"):
|
||||
for tag in lst:
|
||||
metadata.appendChild(newTag(contentdom,"dc:subject",text=
|
||||
tag))
|
||||
|
||||
if self.getMetadata('site'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:publisher",
|
||||
text=self.getMetadata('site')))
|
||||
|
||||
if self.getMetadata('storyUrl'):
|
||||
metadata.appendChild(newTag(contentdom,"dc:identifier",
|
||||
attrs={"opf:scheme":"URL"},
|
||||
text=self.getMetadata('storyUrl')))
|
||||
metadata.appendChild(newTag(contentdom,"dc:source",
|
||||
text=self.getMetadata('storyUrl')))
|
||||
|
||||
## end of metadata, create manifest.
|
||||
items = [] # list of (id, href, type, title) tuples(all strings)
|
||||
itemrefs = [] # list of strings -- idrefs from .opfs' spines
|
||||
items.append(("ncx","toc.ncx","application/x-dtbncx+xml",None)) ## we'll generate the toc.ncx file,
|
||||
## but it needs to be in the items manifest.
|
||||
items.append(("style","OEBPS/stylesheet.css","text/css",None))
|
||||
if self.getConfig("include_titlepage"):
|
||||
items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
|
||||
itemrefs.append("title_page")
|
||||
if self.getConfig("include_tocpage"):
|
||||
items.append(("toc_page","OEBPS/toc_page.xhtml","application/xhtml+xml","Table of Contents"))
|
||||
itemrefs.append("toc_page")
|
||||
for index, (title,html) in enumerate(self.story.getChapters()):
|
||||
i=index+1
|
||||
items.append(("file%04d"%i,
|
||||
"OEBPS/file%04d.xhtml"%i,
|
||||
"application/xhtml+xml",
|
||||
title))
|
||||
itemrefs.append("file%04d"%i)
|
||||
|
||||
manifest = contentdom.createElement("manifest")
|
||||
package.appendChild(manifest)
|
||||
for item in items:
|
||||
(id,href,type,title)=item
|
||||
manifest.appendChild(newTag(contentdom,"item",
|
||||
attrs={'id':id,
|
||||
'href':href,
|
||||
'media-type':type}))
|
||||
|
||||
spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
|
||||
package.appendChild(spine)
|
||||
for itemref in itemrefs:
|
||||
spine.appendChild(newTag(contentdom,"itemref",
|
||||
attrs={"idref":itemref,
|
||||
"linear":"yes"}))
|
||||
# write content.opf to zip.
|
||||
outputepub.writestr("content.opf",contentdom.toxml(encoding='utf-8'))
|
||||
del contentdom
|
||||
|
||||
## create toc.ncx file
|
||||
tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
|
||||
ncx = tocncxdom.documentElement
|
||||
ncx.setAttribute("version","2005-1")
|
||||
ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
|
||||
head = tocncxdom.createElement("head")
|
||||
ncx.appendChild(head)
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:uid", "content":uniqueid}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:depth", "content":"1"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:totalPageCount", "content":"0"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:maxPageNumber", "content":"0"}))
|
||||
|
||||
docTitle = tocncxdom.createElement("docTitle")
|
||||
docTitle.appendChild(newTag(tocncxdom,"text",text=self.getMetadata('title')))
|
||||
ncx.appendChild(docTitle)
|
||||
|
||||
tocnavMap = tocncxdom.createElement("navMap")
|
||||
ncx.appendChild(tocnavMap)
|
||||
|
||||
# <navPoint id="<id>" playOrder="<risingnumberfrom0>">
|
||||
# <navLabel>
|
||||
# <text><chapter title></text>
|
||||
# </navLabel>
|
||||
# <content src="<chapterfile>"/>
|
||||
# </navPoint>
|
||||
index=0
|
||||
for item in items:
|
||||
(id,href,type,title)=item
|
||||
# Only items that should be skipped (toc.ncx, stylesheet.css) have no title.
|
||||
if title :
|
||||
navPoint = newTag(tocncxdom,"navPoint",
|
||||
attrs={'id':id,
|
||||
'playOrder':str(index)})
|
||||
tocnavMap.appendChild(navPoint)
|
||||
navLabel = newTag(tocncxdom,"navLabel")
|
||||
navPoint.appendChild(navLabel)
|
||||
navLabel.appendChild(newTag(tocncxdom,"text",text=title))
|
||||
navPoint.appendChild(newTag(tocncxdom,"content",attrs={"src":href}))
|
||||
index=index+1
|
||||
|
||||
# write toc.ncx to zip file
|
||||
outputepub.writestr("toc.ncx",tocncxdom.toxml(encoding='utf-8'))
|
||||
del tocncxdom
|
||||
|
||||
# write stylesheet.css file.
|
||||
outputepub.writestr("OEBPS/stylesheet.css",self.EPUB_CSS)
|
||||
|
||||
# write title page.
|
||||
if self.getConfig("titlepage_use_table"):
|
||||
TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
|
||||
TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
|
||||
else:
|
||||
TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.EPUB_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY # same, only wide in tables.
|
||||
TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END
|
||||
|
||||
titlepageIO = StringIO.StringIO()
|
||||
self.writeTitlePage(out=titlepageIO,
|
||||
START=TITLE_PAGE_START,
|
||||
ENTRY=TITLE_ENTRY,
|
||||
WIDE_ENTRY=WIDE_TITLE_ENTRY,
|
||||
END=TITLE_PAGE_END)
|
||||
if titlepageIO.getvalue(): # will be false if no title page.
|
||||
outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
|
||||
titlepageIO.close()
|
||||
|
||||
# write toc page.
|
||||
tocpageIO = StringIO.StringIO()
|
||||
self.writeTOCPage(tocpageIO,
|
||||
self.EPUB_TOC_PAGE_START,
|
||||
self.EPUB_TOC_ENTRY,
|
||||
self.EPUB_TOC_PAGE_END)
|
||||
if tocpageIO.getvalue(): # will be false if no toc page.
|
||||
outputepub.writestr("OEBPS/toc_page.xhtml",tocpageIO.getvalue())
|
||||
tocpageIO.close()
|
||||
|
||||
for index, (title,html) in enumerate(self.story.getChapters()):
|
||||
logging.debug('Writing chapter text for: %s' % title)
|
||||
fullhtml = self.EPUB_CHAPTER_START.substitute({'chapter':title, 'index':index+1}) + html + self.EPUB_CHAPTER_END.substitute({'chapter':title, 'index':index+1})
|
||||
# ffnet(& maybe others) gives the whole chapter text as
|
||||
# one line. This causes problems for nook(at least) when
|
||||
# the chapter size starts getting big (200k+)
|
||||
fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
|
||||
outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
|
||||
del fullhtml
|
||||
|
||||
outputepub.close()
|
||||
out.write(zipio.getvalue())
|
||||
zipio.close()
|
||||
|
||||
## Utility method for creating new tags.
|
||||
def newTag(dom,name,attrs=None,text=None):
|
||||
tag = dom.createElement(name)
|
||||
if( attrs is not None ):
|
||||
for attr in attrs.keys():
|
||||
tag.setAttribute(attr,attrs[attr])
|
||||
if( text is not None ):
|
||||
tag.appendChild(dom.createTextNode(text))
|
||||
return tag
|
||||
|
||||
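A minimal standalone sketch of the store-then-deflate trick used above (file names and content here are hypothetical): the epub 'mimetype' entry must be the first file and stored uncompressed, and since Python 2.5's ZipFile cannot change compression per file, the archive is written once with ZIP_STORED, closed, and reopened in append mode with ZIP_DEFLATED for the remaining entries.

import StringIO
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED

def start_epub_zip():
    # mimetype must be the first entry and must be uncompressed.
    buf = StringIO.StringIO()
    z = ZipFile(buf, 'w', compression=ZIP_STORED)
    z.writestr('mimetype', 'application/epub+zip')
    z.close()
    # Reopen in append mode with DEFLATE for every other entry.
    return buf, ZipFile(buf, 'a', compression=ZIP_DEFLATED)

# hypothetical usage:
buf, z = start_epub_zip()
z.writestr('META-INF/container.xml', '<container/>')
z.close()
open('example.epub', 'wb').write(buf.getvalue())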
84
fanficdownloader/writers/writer_html.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import string
|
||||
|
||||
from writers.base_writer import *
|
||||
|
||||
class HTMLWriter(BaseStoryWriter):
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'html'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.html'
|
||||
|
||||
def __init__(self, config, story):
|
||||
BaseStoryWriter.__init__(self, config, story)
|
||||
|
||||
self.HTML_FILE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>${title} by ${author}</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h1>
|
||||
''')
|
||||
|
||||
self.HTML_TITLE_PAGE_START = string.Template('''
|
||||
<table class="full">
|
||||
''')
|
||||
|
||||
self.HTML_TITLE_ENTRY = string.Template('''
|
||||
<tr><td><b>${label}:</b></td><td>${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.HTML_TITLE_PAGE_END = string.Template('''
|
||||
</table>
|
||||
''')
|
||||
|
||||
self.HTML_TOC_PAGE_START = string.Template('''
|
||||
<a name="TOCTOP"><h3>Table of Contents</h3>
|
||||
<p>
|
||||
''')
|
||||
|
||||
self.HTML_TOC_ENTRY = string.Template('''
|
||||
<a href="#section${index}">${chapter}</a><br />
|
||||
''')
|
||||
|
||||
self.HTML_TOC_PAGE_END = string.Template('''
|
||||
</p>
|
||||
''')
|
||||
|
||||
self.HTML_CHAPTER_START = string.Template('''
|
||||
<a name="section${index}"><h2>${chapter}</h2></a>
|
||||
''')
|
||||
|
||||
self.HTML_FILE_END = string.Template('''
|
||||
</body>
|
||||
</html>''')
|
||||
|
||||
|
||||
def writeStoryImpl(self, out):
|
||||
|
||||
self._write(out,self.HTML_FILE_START.substitute(self.story.metadata))
|
||||
|
||||
self.writeTitlePage(out,
|
||||
self.HTML_TITLE_PAGE_START,
|
||||
self.HTML_TITLE_ENTRY,
|
||||
self.HTML_TITLE_PAGE_END)
|
||||
|
||||
self.writeTOCPage(out,
|
||||
self.HTML_TOC_PAGE_START,
|
||||
self.HTML_TOC_ENTRY,
|
||||
self.HTML_TOC_PAGE_END)
|
||||
|
||||
for index, (title,html) in enumerate(self.story.getChapters()):
|
||||
logging.debug('Writing chapter text for: %s' % title)
|
||||
self._write(out,self.HTML_CHAPTER_START.substitute({'chapter':title, 'index':"%04d"%(index+1)}))
|
||||
self._write(out,html)
|
||||
|
||||
self._write(out,self.HTML_FILE_END.substitute(self.story.metadata))
|
||||
142
fanficdownloader/writers/writer_txt.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import string
|
||||
from textwrap import wrap
|
||||
|
||||
from writers.base_writer import *
|
||||
|
||||
from html2text import html2text, BODY_WIDTH
|
||||
|
||||
## In BaseStoryWriter, we define _write to encode <unicode> objects
|
||||
## back into <string> for true output. But txt needs to write the
|
||||
## title page and TOC to a buffer first to wordwrap. And StringIO
|
||||
## gets pissy about unicode bytes in its buflist. This decodes the
|
||||
## unicode containing <string> object passed in back to a <unicode>
|
||||
## object so they join up properly. Could override _write to not
|
||||
## encode and do out.write(whatever.encode('utf8')) instead. Honestly
|
||||
## not sure which is uglier.
|
||||
class KludgeStringIO():
|
||||
def __init__(self, buf = ''):
|
||||
self.buflist=[]
|
||||
def write(self,s):
|
||||
try:
|
||||
s=s.decode('utf-8')
|
||||
except:
|
||||
pass
|
||||
self.buflist.append(s)
|
||||
def getvalue(self):
|
||||
return u''.join(self.buflist)
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
class TextWriter(BaseStoryWriter):
|
||||
|
||||
@staticmethod
|
||||
def getFormatName():
|
||||
return 'txt'
|
||||
|
||||
@staticmethod
|
||||
def getFormatExt():
|
||||
return '.txt'
|
||||
|
||||
def __init__(self, config, story):
|
||||
|
||||
BaseStoryWriter.__init__(self, config, story)
|
||||
|
||||
self.TEXT_FILE_START = string.Template(u'''
|
||||
|
||||
|
||||
${title}
|
||||
|
||||
by ${author}
|
||||
|
||||
|
||||
''')
|
||||
|
||||
self.TEXT_TITLE_PAGE_START = string.Template(u'''
|
||||
''')
|
||||
|
||||
self.TEXT_TITLE_ENTRY = string.Template(u'''${label}: ${value}
|
||||
''')
|
||||
|
||||
self.TEXT_TITLE_PAGE_END = string.Template(u'''
|
||||
|
||||
|
||||
''')
|
||||
|
||||
self.TEXT_TOC_PAGE_START = string.Template(u'''
|
||||
|
||||
TABLE OF CONTENTS
|
||||
|
||||
''')
|
||||
|
||||
self.TEXT_TOC_ENTRY = string.Template(u'''
|
||||
${chapter}
|
||||
''')
|
||||
|
||||
self.TEXT_TOC_PAGE_END = string.Template(u'''
|
||||
''')
|
||||
|
||||
self.TEXT_CHAPTER_START = string.Template(u'''
|
||||
|
||||
\t${chapter}
|
||||
|
||||
''')
|
||||
|
||||
self.TEXT_FILE_END = string.Template(u'''
|
||||
|
||||
End file.
|
||||
''')
|
||||
|
||||
def writeStoryImpl(self, out):
|
||||
|
||||
wrapout = KludgeStringIO()
|
||||
|
||||
wrapout.write(self.TEXT_FILE_START.substitute(self.story.metadata))
|
||||
|
||||
self.writeTitlePage(wrapout,
|
||||
self.TEXT_TITLE_PAGE_START,
|
||||
self.TEXT_TITLE_ENTRY,
|
||||
self.TEXT_TITLE_PAGE_END)
|
||||
towrap = wrapout.getvalue()
|
||||
|
||||
self.writeTOCPage(wrapout,
|
||||
self.TEXT_TOC_PAGE_START,
|
||||
self.TEXT_TOC_ENTRY,
|
||||
self.TEXT_TOC_PAGE_END)
|
||||
|
||||
towrap = wrapout.getvalue()
|
||||
wrapout.close()
|
||||
towrap = removeAllEntities(towrap)
|
||||
|
||||
self._write(out,self.lineends(self.wraplines(towrap)))
|
||||
|
||||
for index, (title,html) in enumerate(self.story.getChapters()):
|
||||
logging.debug('Writing chapter text for: %s' % title)
|
||||
self._write(out,self.lineends(self.wraplines(removeAllEntities(self.TEXT_CHAPTER_START.substitute({'chapter':title, 'index':index+1})))))
|
||||
|
||||
self._write(out,self.lineends(html2text(html)))
|
||||
|
||||
self._write(out,self.lineends(self.wraplines(self.TEXT_FILE_END.substitute(self.story.metadata))))
|
||||
|
||||
def wraplines(self, text):
|
||||
result=''
|
||||
for para in text.split("\n"):
|
||||
first=True
|
||||
for line in wrap(para, BODY_WIDTH):
|
||||
if first:
|
||||
first=False
|
||||
else:
|
||||
result += u"\n"
|
||||
result += line
|
||||
result += u"\n"
|
||||
return result
|
||||
|
||||
## The appengine will return unix line endings.
|
||||
def lineends(self, txt):
|
||||
txt = txt.replace('\r','')
|
||||
if self.getConfig("windows_eol"):
|
||||
txt = txt.replace('\n',u'\r\n')
|
||||
return txt
|
||||
|
||||
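A simplified, standalone sketch of the approach used by wraplines()/lineends() above (the WIDTH constant stands in for html2text's BODY_WIDTH; the windows_eol argument mirrors the config option of the same name): each paragraph is wrapped separately with textwrap.wrap, and CRLF line endings are applied only on request.

from textwrap import wrap

WIDTH = 78  # stands in for html2text's BODY_WIDTH

def wraplines(text, width=WIDTH):
    # Wrap each paragraph on its own; blank lines stay as paragraph breaks.
    out = []
    for para in text.split("\n"):
        out.append(u"\n".join(wrap(para, width)))
    return u"\n".join(out) + u"\n"

def lineends(txt, windows_eol=False):
    # Normalize to \n first, then optionally convert to \r\n.
    txt = txt.replace('\r', '')
    if windows_eol:
        txt = txt.replace('\n', u'\r\n')
    return txt

print(lineends(wraplines(u"word " * 40), windows_eol=True))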
177
fanficdownloader/zipdir.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import with_statement
|
||||
|
||||
import sys
|
||||
import os
|
||||
import zlib
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
from contextlib import closing
|
||||
import logging
|
||||
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
from datetime import timedelta
|
||||
|
||||
import StringIO
|
||||
|
||||
class InvalidEPub(Exception):
|
||||
pass
|
||||
|
||||
def checkNewer(filename, curdte):
|
||||
ret = True
|
||||
|
||||
if not os.path.isfile(filename):
|
||||
logging.debug('File %s does not already exist.' % filename)
|
||||
return ret
|
||||
|
||||
#logging.debug('filename=%s, curdte=%s' % (filename, curdte))
|
||||
lastdate = None
|
||||
with closing(ZipFile(open(filename, 'rb'))) as epub:
|
||||
titleFilePath = "OEBPS/title_page.xhtml"
|
||||
contentFilePath = "OEBPS/content.opf"
|
||||
|
||||
namelist = set(epub.namelist())
|
||||
#logging.debug('namelist=%s' % namelist)
|
||||
if 'mimetype' not in namelist or \
|
||||
'META-INF/container.xml' not in namelist:
|
||||
#raise InvalidEPub('%s: not a valid EPUB' % filename)
|
||||
logging.debug('File %s is not a valid EPub format file.' % filename)
|
||||
return ret
|
||||
|
||||
if contentFilePath not in namelist:
|
||||
return ret # file is not newer
|
||||
|
||||
data = epub.read(contentFilePath)
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
lstdte = soup.find ('dc:date', {'opf:event' : 'modification'})
|
||||
#logging.debug('lstdte=%s' % lstdte.string)
|
||||
if lstdte is None and titleFilePath in namelist:
|
||||
data = epub.read(titleFilePath)
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
fld = ''
|
||||
allTDs = soup.findAll ('td')
|
||||
for td in allTDs:
|
||||
b = td.find ('b')
|
||||
if b is not None:
|
||||
fld = b.string
|
||||
if td.string is not None and fld == "Updated:":
|
||||
lastdate = td.string
|
||||
#logging.debug('title lastdate=%s' % lastdate)
|
||||
else:
|
||||
lastdate = lstdte.string.strip(' ')
|
||||
#logging.debug('contents lastdate=%s' % lastdate)
|
||||
|
||||
if lastdate is not None:
|
||||
currUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(curdte.strftime('%Y-%m-%d'), "%Y-%m-%d")))
|
||||
storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(lastdate, "%Y-%m-%d")))
|
||||
logging.debug('File %s last update date is %s, comparing to %s' % (filename, storyUpdated, currUpdated))
|
||||
if currUpdated <= storyUpdated :
|
||||
ret = False
|
||||
|
||||
logging.debug("Does %s need to be updated? %s" % (filename, ret))
|
||||
return ret
|
||||
|
||||
|
||||
def toZip(filename, directory):
|
||||
zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
|
||||
lst = os.listdir(directory)
|
||||
|
||||
for entity in lst:
|
||||
if entity.startswith('.'):
|
||||
continue
|
||||
|
||||
each = os.path.join(directory,entity)
|
||||
print(each)
|
||||
|
||||
if os.path.isfile(each):
|
||||
print(each)
|
||||
# epub standard requires mimetype to be uncompressed and first file.
|
||||
if entity == 'mimetype':
|
||||
zippedHelp.write(each, arcname=entity, compress_type=zipfile.ZIP_STORED)
|
||||
else:
|
||||
zippedHelp.write(each, arcname=entity)
|
||||
else:
|
||||
addFolderToZip(zippedHelp,entity, each)
|
||||
|
||||
zippedHelp.close()
|
||||
|
||||
def addFolderToZip(zippedHelp,folder,fpath):
|
||||
#print('addFolderToZip(%s)' % folder)
|
||||
|
||||
if folder == '.' or folder == '..':
|
||||
return
|
||||
|
||||
folderFiles = os.listdir(fpath)
|
||||
for f in folderFiles:
|
||||
if os.path.isfile(fpath + '/' + f):
|
||||
#print('basename=%s' % os.path.basename(fpath + '/' + f))
|
||||
zippedHelp.write(fpath + '/' + f, folder + '/' + f, zipfile.ZIP_DEFLATED)
|
||||
elif os.path.isdir(fpath + '/' + f):
|
||||
addFolderToZip(zippedHelp, folder + '/' + f, fpath + '/' + f)
|
||||
|
||||
def inMemoryZip(files):
|
||||
# files have a structure of {'path/to/file' => content} dictionary
|
||||
io = StringIO.StringIO()
|
||||
|
||||
if 'mimetype' in files:
|
||||
# This fixes the uncompressed mimetype-first issue by opening
|
||||
# the in memory file as STORE, putting in the mimetype, then
|
||||
# closing and re-opening with DEFLATED. while it is often
|
||||
# true that mimetype is the first file, we can't assume it,
|
||||
# because the dict object is defined as unordered.
|
||||
path='mimetype'
|
||||
memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_STORED)
|
||||
memzip.debug = 3
|
||||
if type(files[path]) != type('str'):
|
||||
data = files[path].getvalue()
|
||||
else:
|
||||
data = files[path]
|
||||
|
||||
logging.debug("Writing ZIP path %s" % path)
|
||||
try:
|
||||
memzip.writestr(path, data.encode('utf-8'))
|
||||
except UnicodeDecodeError, e:
|
||||
memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
|
||||
|
||||
memzip.close()
|
||||
|
||||
# remove it from the files dict.
|
||||
del(files['mimetype'])
|
||||
|
||||
# open in 'a' append mode.
|
||||
memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_DEFLATED)
|
||||
memzip.debug = 3
|
||||
|
||||
for path in files:
|
||||
if type(files[path]) != type('str'):
|
||||
data = files[path].getvalue()
|
||||
else:
|
||||
data = files[path]
|
||||
|
||||
# logging.debug(data)
|
||||
logging.debug("Writing ZIP path %s" % path)
|
||||
try:
|
||||
memzip.writestr(path, data.encode('utf-8'))
|
||||
except UnicodeDecodeError, e:
|
||||
memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
|
||||
|
||||
# declare all the files as created on Windows (create_system=0).
|
||||
for zf in memzip.filelist:
|
||||
zf.create_system = 0
|
||||
|
||||
memzip.close()
|
||||
|
||||
return io
|
||||
|
||||
if __name__ == '__main__':
|
||||
# toZip('sample.epub', "books/A_Time_To_Reflect")
|
||||
# z = zipfile.ZipFile('sample.epub', 'r')
|
||||
files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
|
||||
data = inMemoryZip(files)
|
||||
f = open('res.zip', 'wb')
|
||||
f.write(data.getvalue())
|
||||
f.close()
|
||||
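A hypothetical usage example of the files-dictionary convention that inMemoryZip() expects: when a 'mimetype' key is present it is written first and uncompressed, so the resulting archive is a valid epub container.

files = {
    'mimetype': 'application/epub+zip',
    'META-INF/container.xml': '<?xml version="1.0"?><container/>',  # placeholder content
    'content.opf': '<package/>',                                    # placeholder content
}
buf = inMemoryZip(files)          # returns a StringIO holding the zip bytes
with open('example.epub', 'wb') as f:
    f.write(buf.getvalue())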
19
ffstorage.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
from google.appengine.ext import db
|
||||
|
||||
class DownloadMeta(db.Model):
|
||||
user = db.UserProperty()
|
||||
url = db.StringProperty()
|
||||
name = db.StringProperty()
|
||||
title = db.StringProperty()
|
||||
author = db.StringProperty()
|
||||
format = db.StringProperty()
|
||||
failure = db.StringProperty()
|
||||
completed = db.BooleanProperty(default=False)
|
||||
date = db.DateTimeProperty(auto_now_add=True)
|
||||
# data_chunks is implicit from DownloadData def.
|
||||
|
||||
class DownloadData(db.Model):
|
||||
download = db.ReferenceProperty(DownloadMeta,
|
||||
collection_name='data_chunks')
|
||||
blob = db.BlobProperty()
|
||||
index = db.IntegerProperty()
|
||||
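A sketch of how these two models are used together (mirroring the task handler and file server in main.py below): the generated ebook is split into roughly 1 MB blobs so each entity stays under the datastore size limit, each chunk is stored as a DownloadData row pointing at its DownloadMeta, and the file is reassembled in index order when served. The helper names here are hypothetical.

def store_chunks(download, data, chunk_size=1000000):
    # Split the ebook bytes into datastore-sized pieces.
    index = 0
    while len(data) > 0:
        DownloadData(download=download, index=index, blob=data[:chunk_size]).put()
        index += 1
        data = data[chunk_size:]
    download.completed = True
    download.put()

def load_chunks(download):
    # Reassemble the file in the order it was written.
    chunks = DownloadData.all().filter("download =", download).order("index")
    return ''.join(chunk.blob for chunk in chunks)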
109
index-ajax.html
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="css/index.css" rel="stylesheet" type="text/css">
|
||||
<link type="text/css" href="http://jqueryui.com/latest/themes/base/ui.all.css" rel="stylesheet" />
|
||||
|
||||
<title>Fanfiction Downloader (fanfiction.net, fictionalley, ficwad to epub and HTML)</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<script src="/js/jquery-1.3.2.js"></script>
|
||||
<script src="/js/fdownloader.js"></script>
|
||||
|
||||
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.core.js"></script>
|
||||
<script type="text/javascript" src="http://jqueryui.com/latest/ui/ui.progressbar.js"></script>
|
||||
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
<!-- <form action="/fdown" method="post"> -->
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites much easier. Please paste a URL of the <em>first chapter</em> in the box to start. Alternatively, see your personal list of <a href="/recent">previously downloaded fanfics</a>.
|
||||
</div>
|
||||
<input type="text" id='url' name="url" size="50" value='{{ url }}'>
|
||||
<div style="margin-top: 0.5em;">
|
||||
Ebook format <select name="format" id="format">
|
||||
<option value='epub'>ePub</option>
|
||||
<option value='html'>HTML</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div id='error' style='color: red'>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<div id='yourfile' style='display:none'>
|
||||
</div>
|
||||
|
||||
<div id='typebox'>
|
||||
</div>
|
||||
|
||||
<h3>
|
||||
Login and Password
|
||||
</h3>
|
||||
<div id='logpassword'>
|
||||
If the story requires a login and password to download (e.g. marked as Mature on FFA), you may need to provide your credentials to download it; otherwise just leave it empty.
|
||||
</div>
|
||||
<div id='logpasswordtable'>
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Login</div>
|
||||
<div class='field'><input type='text' name='login' id='login' size='50'></div>
|
||||
</div>
|
||||
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Password</div>
|
||||
<div class='field'><input type='password' id='password' name='password' size='50'></div>
|
||||
</div>
|
||||
</div>
|
||||
<div id='submitbtn'>
|
||||
<span id='submit_button'><button onclick='downloadFanfic();'>Download</button></span>
|
||||
<span id='ajax_loader' style='display:none'><img src="/static/ajax-loader.gif"></span>
|
||||
</div>
|
||||
|
||||
|
||||
<div id="progressbar">
|
||||
|
||||
</div>
|
||||
<div id='helpbox'>
|
||||
A few things to know that will make your life substantially easier:
|
||||
<ol>
|
||||
<li>Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a> — how to read fiction in Stanza or any other ebook reader.</li>
|
||||
<li>Currently we support fanfiction.net, fictionpress.com, fanficauthors.net and ficwad.com</li>
|
||||
<li>Paste a URL of the first chapter of the fanfic, not the index page</li>
|
||||
<li>Fics with a single chapter are not supported (you can just copy and paste it)</li>
|
||||
<li>Stories which are too long may not be downloaded correctly and the application will report a time-out error — this is a limitation currently imposed by Google App Engine on long-running activities</li>
|
||||
<li>FicWad support is somewhat flaky — if you feel it doesn't work for you, send all the details to me</li>
|
||||
<li>You can download fanfics and store them for 'later' by just downloading them and visiting the <a href="/recent">recent downloads</a> section, but in the future they will be deleted after 5 days to save space</li>
|
||||
<li>If the Downloader simply opens a download file window rather than saving the fanfic and giving you a link, it means the file is too large to save in the database and you need to download it straight away</li>
|
||||
<li>If you think that something that should work in fact doesn't, drop me a mail to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a></li>
|
||||
</ol>
|
||||
Otherwise, just have fun, and if you want to say thank you — use the email above.
|
||||
</div>
|
||||
<div style='text-align: center'>
|
||||
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
|
||||
alt="Powered by Google App Engine" />
|
||||
<br/><br/>
|
||||
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
|
||||
Copyright © <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
|
||||
</div>
|
||||
<!-- </form> -->
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var pageTracker = _gat._getTracker("UA-12136939-1");
|
||||
pageTracker._trackPageview();
|
||||
} catch(err) {}</script>
|
||||
</body>
|
||||
</html>
|
||||
219
index.html
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="/css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>Fanfiction Downloader - read fanfiction from twilighted.net, fanfiction.net, fictionpress.com, fictionalley.org, ficwad.com, potionsandsnitches.net, harrypotterfanfiction.com, mediaminer.org on Kindle, Nook, Sony Reader, iPad, iPhone, Android, Aldiko, Stanza</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
<div style="text-align: center">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
<!-- <div id='yourfile'> -->
|
||||
{{yourfile}}
|
||||
<!-- </div> -->
|
||||
|
||||
{% if authorized %}
|
||||
<form action="/fdown" method="post">
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
<p>Hi, {{ nickname }}! This is a fan fiction downloader, which makes reading stories from various websites
|
||||
much easier. </p>
|
||||
<p>For Amazon Kindle use Mobi output (see notice below); for Sony Reader, Nook and iPad use ePub.</p>
|
||||
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
|
||||
</div>
|
||||
<h3>Experimental New Feature</h3>
|
||||
<p>
|
||||
If you select EPub format, when it's done you will also be given a 'Convert' link.
|
||||
</p>
|
||||
<p>
|
||||
That link will take you to <a href="http://convertfiles.com">convertfiles.com</a> where you can
|
||||
directly convert your new story to FictionBook (fb2), Mobipocket (mobi), MS Reader (lit) or Adobe Portable
|
||||
Document Format(pdf).
|
||||
There's also a 'Convert' link for EPubs on your <a href="/recent">recent downloads</a>
|
||||
page. We'd really like to hear from users about this in our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>.
|
||||
</p>
|
||||
<p>
|
||||
We'd especially like Kindle and other Mobi users to try it. The <a href="http://convertfiles.com">convertfiles.com</a> Mobi file
|
||||
appears to be more correct than our Mobi output.
|
||||
</p>
|
||||
|
||||
<div id='error'>
|
||||
{{ error_message }}
|
||||
</div>
|
||||
<input type="text" name="url" size="50" value='{{ url }}'>
|
||||
</div>
|
||||
|
||||
<div id='typebox'>
|
||||
<div id='typelabel'>Ebook format</div>
|
||||
<div id='typeoptions'>
|
||||
<input type='radio' name='format' value='epub' checked>EPub</input>
|
||||
<input type='radio' name='format' value='html'>HTML</input>
|
||||
<input type='radio' name='format' value='text'>Plain Text</input>
|
||||
<input type='radio' name='format' value='mobi'>Mobi (Kindle)</input>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='logpasswordtable'>
|
||||
<h3>Login and Password</h3>
|
||||
<div id='logpassword'>
|
||||
If the story requires a login and
|
||||
password to download, you may need
|
||||
to provide your credentials to
|
||||
download it, otherwise just leave
|
||||
it empty. Currently only needed
|
||||
by twilighted.net and twiwrite.net.
|
||||
</div>
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Login</div>
|
||||
<div class='field'><input type='text' name='login' size='50'></div>
|
||||
</div>
|
||||
|
||||
<div class='fieldandlabel'>
|
||||
<div class='label'>Password</div>
|
||||
<div class='field'><input type='password' name='password' size='50'></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='submitbtn'>
|
||||
<input type="submit" value="Download">
|
||||
</div>
|
||||
</form>
|
||||
{% else %}
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
<p>
|
||||
This is a fan fiction downloader, which makes reading stories from various websites much easier. Before you
|
||||
can start downloading fanfics, you need to log in so the downloader can remember your fanfics and store them.
|
||||
</p>
|
||||
<p><a href="{{ login_url }}">Login using Google account</a></p>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div id='helpbox'>
|
||||
<dl>
|
||||
<dt>fictionalley.org
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.fictionalley.org/authors/drt/DA.html">http://www.fictionalley.org/authors/drt/DA.html</a>. Or the story text URL for
|
||||
fictionalley.org one-shots, such as
|
||||
<br /><a href="http://www.fictionalley.org/authors/drt/JOTP01a.html">http://www.fictionalley.org/authors/drt/JOTP01a.html</a>.
|
||||
<dt>fanfiction.net
|
||||
<dd>Use the URL of any story chapter, with or without story title such as
|
||||
<br /><a href="http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo">http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo</a> or
|
||||
<br /><a href="http://www.fanfiction.net/s/2345466/3/">http://www.fanfiction.net/s/2345466/3/</a>.
|
||||
<dt>fictionpress.com
|
||||
<dd>Use the URL of any story chapter, such as
|
||||
<br /><a href="http://www.fictionpress.com/s/2851771/1/Untouchable_Love">http://www.fictionpress.com/s/2851771/1/Untouchable_Love</a> or
|
||||
<br /><a href="http://www.fictionpress.com/s/2847338/6/">http://www.fictionpress.com/s/2847338/6/</a>.
|
||||
<dt>twilighted.net
|
||||
<dd>Use the URL of the start of the story, such as
|
||||
<br /><a href="http://twilighted.net/viewstory.php?sid=8422">http://twilighted.net/viewstory.php?sid=8422</a>.
|
||||
<dt>twiwrite.net
|
||||
<dd>Use the URL of the start of the story, such as
|
||||
<br /><a href="http://twiwrite.net/viewstory.php?sid=427">http://twiwrite.net/viewstory.php?sid=427</a>.
|
||||
<dt>ficwad.com
|
||||
<dd>Use the URL of any story chapter, such as
|
||||
<br /><a href="http://www.ficwad.com/story/75246">http://www.ficwad.com/story/75246</a>.
|
||||
<dt>harrypotterfanfiction.com
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.harrypotterfanfiction.com/viewstory.php?psid=289208">http://www.harrypotterfanfiction.com/viewstory.php?psid=289208</a>.
|
||||
<dt>potionsandsnitches.net
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332">http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2332</a>.
|
||||
<dt>mediaminer.org
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/156934">http://www.mediaminer.org/fanfic/view_st.php/166653</a>.
|
||||
Or the story URL for one-shots, such as
|
||||
<br /><a href="http://www.mediaminer.org/fanfic/view_st.php/167618">http://www.mediaminer.org/fanfic/view_st.php/167618</a>.
|
||||
<dt>adastrafanfic.com
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.adastrafanfic.com/viewstory.php?sid=854">http://www.adastrafanfic.com/viewstory.php?sid=854</a>.
|
||||
<dt>whofic.com
|
||||
<dd>Use the URL of the story's chapter list, such as
|
||||
<br /><a href="http://www.whofic.com/viewstory.php?sid=16334">http://www.whofic.com/viewstory.php?sid=16334</a>.
|
||||
</dl>
|
||||
|
||||
|
||||
A few additional things to know, which will make your life substantially easier:
|
||||
<ol>
|
||||
<li>
|
||||
First thing to know: I do not use your login and password. In fact, all I know about you is your ID – your password
|
||||
is verified by Google and is absolutely, totally unknown to anyone but you.
|
||||
</li>
|
||||
<li>
|
||||
Small <a href="http://www.sigizmund.com/reading-fanfiction-off-line-in-stanza-and-oth">post written by me</a>
|
||||
— how to read fiction in Stanza or any other ebook reader.
|
||||
</li>
|
||||
<li>
|
||||
You can download fanfiction directly from your iPhone, Kindle or (possibly) other ebook reader.
|
||||
</li>
|
||||
<li>
|
||||
Downloaded stories are deleted after some time (which should give you enough time to download them and will keep
|
||||
Google happy about the app not going over the storage limit).
|
||||
</li>
|
||||
<li>
|
||||
If you see some funny characters in downloaded Plain Text file, make sure you choose text file encoding UTF-8 and
|
||||
not something else.
|
||||
</li>
|
||||
<li>
|
||||
If you think that something that should work in fact doesn't, drop me a mail
|
||||
to <a href='mailto:sigizmund@gmail.com'>sigizmund@gmail.com</a>, or, even better, write an email to
|
||||
our <a href="http://groups.google.com/group/fanfic-downloader">Google Group</a>. I also encourage you to join it so
|
||||
you will find out about the latest updates and fixes as soon as possible.
|
||||
</li>
|
||||
</ol>
|
||||
Otherwise, just have fun, and if you want to say thank you — use the contacts above.
|
||||
</div>
|
||||
<div style='text-align: center'>
|
||||
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
|
||||
alt="Powered by Google App Engine" />
|
||||
<br/><br/>
|
||||
FanfictionLoader is a web front-end to <A href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
|
||||
Copyright © <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
|
||||
</div>
|
||||
|
||||
<div style="margin-top: 1em; text-align: center'">
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "pub-2027714004231956";
|
||||
/* FFD */
|
||||
google_ad_slot = "7330682770";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
33
index.yaml
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
indexes:
|
||||
|
||||
# AUTOGENERATED
|
||||
|
||||
# This index.yaml is automatically updated whenever the dev_appserver
|
||||
# detects that a new type of query is run. If you want to manage the
|
||||
# index.yaml file manually, remove the above marker line (the line
|
||||
# saying "# AUTOGENERATED"). If you want to manage some indexes
|
||||
# manually, move them above the marker line. The index.yaml file is
|
||||
# automatically uploaded to the admin console when you next deploy
|
||||
# your application using appcfg.py.
|
||||
|
||||
- kind: DownloadData
|
||||
properties:
|
||||
- name: download
|
||||
- name: index
|
||||
|
||||
- kind: DownloadMeta
|
||||
properties:
|
||||
- name: user
|
||||
- name: date
|
||||
direction: desc
|
||||
|
||||
- kind: DownloadedFanfic
|
||||
properties:
|
||||
- name: cleared
|
||||
- name: date
|
||||
|
||||
- kind: DownloadedFanfic
|
||||
properties:
|
||||
- name: user
|
||||
- name: date
|
||||
direction: desc
|
||||
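For reference, the DownloadMeta index above (user ascending, date descending) corresponds to the recent-downloads query in main.py; a minimal sketch:

q = DownloadMeta.all()
q.filter('user =', user).order('-date')  # needs the composite user/date(desc) index
fics = q.fetch(100)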
116
js/fdownloader.js
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
var g_CurrentKey = null;
|
||||
var g_Counter = 0;
|
||||
|
||||
var COUNTER_MAX = 50;
|
||||
|
||||
|
||||
function setErrorState(error)
|
||||
{
|
||||
olderr = error;
|
||||
error = error + "<br/><a href='mailto:sigizmund@gmail.com?subject=Problem with the fanfiction downloader'>" + "Complain about this error</a>";
|
||||
$('#error').html(error);
|
||||
}
|
||||
|
||||
function clearErrorState()
|
||||
{
|
||||
$('#error').html('');
|
||||
}
|
||||
|
||||
function showFile(data)
|
||||
{
|
||||
$('#yourfile').html('<a href="/file?id=' + data.key + '">' + data.name + " by " + data.author + "</a>");
|
||||
$('#yourfile').show();
|
||||
}
|
||||
|
||||
function hideFile()
|
||||
{
|
||||
$('#yourfile').hide();
|
||||
}
|
||||
|
||||
function checkResults()
|
||||
{
|
||||
if ( g_Counter >= COUNTER_MAX )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
g_Counter+=1;
|
||||
|
||||
$.getJSON('/progress', { 'key' : g_CurrentKey }, function(data)
|
||||
{
|
||||
if ( data.result != "Nope")
|
||||
{
|
||||
if ( data.result != "OK" )
|
||||
{
|
||||
leaveLoadingState();
|
||||
setErrorState(data.result);
|
||||
}
|
||||
else
|
||||
{
|
||||
showFile(data);
|
||||
leaveLoadingState();
|
||||
// result = data.split("|");
|
||||
// showFile(result[1], result[2], result[3]);
|
||||
}
|
||||
|
||||
$("#progressbar").progressbar('destroy');
|
||||
g_Counter = 101;
|
||||
}
|
||||
});
|
||||
|
||||
if ( g_Counter < COUNTER_MAX )
|
||||
setTimeout("checkResults()", 1000);
|
||||
else
|
||||
{
|
||||
leaveLoadingState();
|
||||
setErrorState("Operation takes too long - terminating by timeout (story too long?)");
|
||||
}
|
||||
}
|
||||
|
||||
function enterLoadingState()
|
||||
{
|
||||
$('#submit_button').hide();
|
||||
$('#ajax_loader').show();
|
||||
}
|
||||
|
||||
function leaveLoadingState()
|
||||
{
|
||||
$('#submit_button').show();
|
||||
$('#ajax_loader').hide();
|
||||
}
|
||||
|
||||
function downloadFanfic()
|
||||
{
|
||||
clearErrorState();
|
||||
hideFile();
|
||||
|
||||
|
||||
format = $("#format").val();
|
||||
alert(format);
|
||||
|
||||
return;
|
||||
|
||||
var url = $('#url').val();
|
||||
var login = $('#login').val();
|
||||
var password = $('#password').val();
|
||||
|
||||
if ( url == '' )
|
||||
{
|
||||
setErrorState('URL shouldn\'t be empty');
|
||||
return;
|
||||
}
|
||||
|
||||
if ( (url.indexOf('fanfiction.net') == -1 && url.indexOf('fanficauthors') == -1 && url.indexOf('ficwad') == -1 && url.indexOf('fictionpress') == -1) || (url.indexOf('adultfanfiction.net') != -1) )
|
||||
{
|
||||
setErrorState("This source is not yet supported. Ping me if you want it!");
|
||||
return;
|
||||
}
|
||||
|
||||
$.post('/submitDownload', {'url' : url, 'login' : login, 'password' : password, 'format' : format}, function(data)
|
||||
{
|
||||
g_CurrentKey = data;
|
||||
g_Counter = 0;
|
||||
setTimeout("checkResults()", 1000);
|
||||
enterLoadingState();
|
||||
})
|
||||
}
|
||||
4376
js/jquery-1.3.2.js
vendored
Normal file
File diff suppressed because it is too large
Load diff
433
main.py
Normal file
|
|
@ -0,0 +1,433 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2007 Google Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import sys
|
||||
import zlib
|
||||
import logging
|
||||
import traceback
|
||||
import StringIO
|
||||
|
||||
from google.appengine.runtime import DeadlineExceededError
|
||||
|
||||
from google.appengine.api import taskqueue
|
||||
from google.appengine.ext.webapp import template
|
||||
from google.appengine.api import users
|
||||
from google.appengine.ext import webapp
|
||||
from google.appengine.ext.webapp import util
|
||||
|
||||
from fanficdownloader.downloader import *
|
||||
from fanficdownloader.ffnet import *
|
||||
from fanficdownloader.output import *
|
||||
from fanficdownloader import twilighted
|
||||
from fanficdownloader import adastrafanfic
|
||||
|
||||
from google.appengine.ext import db
|
||||
|
||||
from fanficdownloader.zipdir import *
|
||||
|
||||
from ffstorage import *
|
||||
|
||||
class LoginRequired(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if user:
|
||||
self.redirect('/')
|
||||
return
|
||||
else:
|
||||
logging.debug(users.create_login_url('/'))
|
||||
url = users.create_login_url(self.request.uri)
|
||||
template_values = {'login_url' : url}
|
||||
path = os.path.join(os.path.dirname(__file__), 'index-nonlogin.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class MainHandler(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if user:
|
||||
error = self.request.get('error')
|
||||
template_values = {'nickname' : user.nickname(), 'authorized': True}
|
||||
url = self.request.get('url')
|
||||
template_values['url'] = url
|
||||
|
||||
if error != None and len(error) > 1:
|
||||
if error == 'login_required':
|
||||
template_values['error_message'] = 'This story (or one of the chapters) requires you to be logged in.'
|
||||
elif error == 'bad_url':
|
||||
template_values['error_message'] = 'Unsupported URL: ' + url
|
||||
elif error == 'custom':
|
||||
template_values['error_message'] = 'Error happened: ' + self.request.get('errtext')
|
||||
|
||||
filename = self.request.get('file')
|
||||
if len(filename) > 1:
|
||||
template_values['yourfile'] = '''<div id='yourfile'><a href='/file?id=%s'>"%s" by %s</a></div>''' % (filename, self.request.get('name'), self.request.get('author'))
|
||||
|
||||
self.response.headers['Content-Type'] = 'text/html'
|
||||
path = os.path.join(os.path.dirname(__file__), 'index.html')
|
||||
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
else:
|
||||
logging.debug(users.create_login_url('/'))
|
||||
url = users.create_login_url(self.request.uri)
|
||||
template_values = {'login_url' : url, 'authorized': False}
|
||||
path = os.path.join(os.path.dirname(__file__), 'index.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
|
||||
class FileServer(webapp.RequestHandler):
|
||||
|
||||
def get(self):
|
||||
fileId = self.request.get('id')
|
||||
|
||||
if fileId == None or len(fileId) < 3:
|
||||
self.redirect('/')
|
||||
return
|
||||
|
||||
key = db.Key(fileId)
|
||||
fanfic = db.get(key)
|
||||
|
||||
# check for completed & failure.
|
||||
|
||||
name = fanfic.name.encode('utf-8')
|
||||
|
||||
name = makeAcceptableFilename(name)
|
||||
|
||||
logging.info("Serving file: %s" % name)
|
||||
|
||||
if fanfic.format == 'epub':
|
||||
self.response.headers['Content-Type'] = 'application/epub+zip'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
|
||||
elif fanfic.format == 'html':
|
||||
self.response.headers['Content-Type'] = 'text/html'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
|
||||
elif fanfic.format == 'text':
|
||||
self.response.headers['Content-Type'] = 'text/plain'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' +name + '.txt.zip'
|
||||
elif fanfic.format == 'mobi':
|
||||
self.response.headers['Content-Type'] = 'application/x-mobipocket-ebook'
|
||||
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.mobi'
|
||||
|
||||
data = DownloadData.all().filter("download =", fanfic).order("index")
|
||||
# epub, txt and html are all already compressed.
|
||||
# Each chunk is compressed individually to avoid having
|
||||
# to hold the whole in memory just for the
|
||||
# compress/uncompress
|
||||
if fanfic.format == 'mobi':
|
||||
def dc(data):
|
||||
try:
|
||||
return zlib.decompress(data)
|
||||
# if error, assume it's a chunk from before we started compressing.
|
||||
except zlib.error:
|
||||
return data
|
||||
else:
|
||||
def dc(data):
|
||||
return data
|
||||
|
||||
for datum in data:
|
||||
self.response.out.write(dc(datum.blob))
|
||||
|
||||
class FileStatusServer(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if not user:
|
||||
self.redirect(users.create_login_url(self.request.uri))
|
||||
return
|
||||
|
||||
fileId = self.request.get('id')
|
||||
|
||||
if fileId == None or len(fileId) < 3:
|
||||
self.redirect('/')
|
||||
|
||||
key = db.Key(fileId)
|
||||
fic = db.get(key)
|
||||
|
||||
logging.info("Status url: %s" % fic.url)
|
||||
if fic.completed and fic.format=='epub':
|
||||
escaped_url = urlEscape(self.request.host_url+"/file/"+fic.name+"."+fic.format+"?id="+fileId+"&fake=file."+fic.format)
|
||||
else:
|
||||
escaped_url=False
|
||||
template_values = dict(fic = fic,
|
||||
nickname = user.nickname(),
|
||||
escaped_url = escaped_url
|
||||
)
|
||||
path = os.path.join(os.path.dirname(__file__), 'status.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class RecentFilesServer(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if not user:
|
||||
self.redirect(users.create_login_url(self.request.uri))
|
||||
return
|
||||
|
||||
q = DownloadMeta.all()
|
||||
q.filter('user =', user).order('-date')
|
||||
fics = q.fetch(100)
|
||||
|
||||
for fic in fics:
|
||||
if fic.completed and fic.format == 'epub':
|
||||
fic.escaped_url = urlEscape(self.request.host_url+"/file/"+fic.name+"."+fic.format+"?id="+str(fic.key())+"&fake=file."+fic.format)
|
||||
|
||||
template_values = dict(fics = fics, nickname = user.nickname())
|
||||
path = os.path.join(os.path.dirname(__file__), 'recent.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class RecentAllFilesServer(webapp.RequestHandler):
|
||||
def get(self):
|
||||
user = users.get_current_user()
|
||||
if user.nickname() != 'sigizmund':
|
||||
return
|
||||
|
||||
fics = db.GqlQuery("Select * From DownloadedFanfic")
|
||||
template_values = dict(fics = fics, nickname = user.nickname())
|
||||
path = os.path.join(os.path.dirname(__file__), 'recent.html')
|
||||
self.response.out.write(template.render(path, template_values))
|
||||
|
||||
class FanfictionDownloader(webapp.RequestHandler):
|
||||
def get(self):
|
||||
self.post()
|
||||
|
||||
def post(self):
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
user = users.get_current_user()
|
||||
if not user:
|
||||
self.redirect(users.create_login_url(self.request.uri))
|
||||
return
|
||||
|
||||
format = self.request.get('format')
|
||||
url = self.request.get('url')
|
||||
login = self.request.get('login')
|
||||
password = self.request.get('password')
|
||||
|
||||
logging.info("Queuing Download: " + url)
|
||||
|
||||
# use existing record if available.
|
||||
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
|
||||
if( q is None or len(q) < 1 ):
|
||||
download = DownloadMeta()
|
||||
else:
|
||||
download = q[0]
|
||||
download.completed=False
|
||||
download.failure=None
|
||||
for c in download.data_chunks:
|
||||
c.delete()
|
||||
|
||||
download.user = user
|
||||
download.url = url
|
||||
download.format = format
|
||||
download.put()
|
||||
|
||||
|
||||
taskqueue.add(url='/fdowntask',
|
||||
queue_name="download",
|
||||
params={'format':format,
|
||||
'url':url,
|
||||
'login':login,
|
||||
'password':password,
|
||||
'user':user.email()})
|
||||
|
||||
logging.info("enqueued download key: " + str(download.key()))
|
||||
self.redirect('/status?id='+str(download.key()))
|
||||
|
||||
return
|
||||
|
||||
|
||||
class FanfictionDownloaderTask(webapp.RequestHandler):
|
||||
def _printableVersion(self, text):
|
||||
text = removeEntities(text)
|
||||
try:
|
||||
d = text.decode('utf-8')
|
||||
except:
|
||||
d = text
|
||||
return d
|
||||
|
||||
|
||||
def post(self):
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
|
||||
format = self.request.get('format')
|
||||
url = self.request.get('url')
|
||||
login = self.request.get('login')
|
||||
password = self.request.get('password')
|
||||
# User object can't pass, just email address
|
||||
user = users.User(self.request.get('user'))
|
||||
|
||||
logging.info("Downloading: " + url + " for user: "+user.nickname())
|
||||
|
||||
adapter = None
|
||||
writerClass = None
|
||||
|
||||
# use existing record if available.
|
||||
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
|
||||
if( q is None or len(q) < 1 ):
|
||||
download = DownloadMeta()
|
||||
else:
|
||||
download = q[0]
|
||||
download.completed=False
|
||||
for c in download.data_chunks:
|
||||
c.delete()
|
||||
|
||||
download.user = user
|
||||
download.url = url
|
||||
download.format = format
|
||||
download.put()
|
||||
logging.info('Creating adapter...')
|
||||
|
||||
try:
|
||||
if url.find('fictionalley') != -1:
|
||||
adapter = fictionalley.FictionAlley(url)
|
||||
elif url.find('ficwad') != -1:
|
||||
adapter = ficwad.FicWad(url)
|
||||
elif url.find('fanfiction.net') != -1:
|
||||
adapter = ffnet.FFNet(url)
|
||||
elif url.find('fictionpress.com') != -1:
|
||||
adapter = fpcom.FPCom(url)
|
||||
elif url.find('harrypotterfanfiction.com') != -1:
|
||||
adapter = hpfiction.HPFiction(url)
|
||||
elif url.find('twilighted.net') != -1:
|
||||
adapter = twilighted.Twilighted(url)
|
||||
elif url.find('twiwrite.net') != -1:
|
||||
adapter = twiwrite.Twiwrite(url)
|
||||
elif url.find('adastrafanfic.com') != -1:
|
||||
adapter = adastrafanfic.Adastrafanfic(url)
|
||||
elif url.find('whofic.com') != -1:
|
||||
adapter = whofic.Whofic(url)
|
||||
elif url.find('potionsandsnitches.net') != -1:
|
||||
adapter = potionsNsnitches.PotionsNSnitches(url)
|
||||
elif url.find('mediaminer.org') != -1:
|
||||
adapter = mediaminer.MediaMiner(url)
|
||||
else:
|
||||
logging.debug("Bad URL detected")
|
||||
download.failure = url +" is not a valid story URL."
|
||||
download.put()
|
||||
return
|
||||
except Exception, e:
|
||||
logging.exception(e)
|
||||
download.failure = "Adapter was not created: " + str(e)
|
||||
download.put()
|
||||
return
|
||||
|
||||
logging.info('Created an adapter: %s' % adapter)
|
||||
|
||||
if len(login) > 1:
|
||||
adapter.setLogin(login)
|
||||
adapter.setPassword(password)
|
||||
|
||||
if format == 'epub':
|
||||
writerClass = output.EPubFanficWriter
|
||||
elif format == 'html':
|
||||
writerClass = output.HTMLWriter
|
||||
elif format == 'mobi':
|
||||
writerClass = output.MobiWriter
|
||||
else:
|
||||
writerClass = output.TextWriter
|
||||
|
||||
loader = FanficLoader(adapter,
|
||||
writerClass,
|
||||
quiet = True,
|
||||
inmemory=True,
|
||||
compress=False)
|
||||
try:
|
||||
data = loader.download()
|
||||
|
||||
if format == 'html' or format == 'text':
|
||||
# data is uncompressed hence huge
|
||||
ext = '.html'
|
||||
if format == 'text':
|
||||
ext = '.txt'
|
||||
logging.debug(data)
|
||||
files = {makeAcceptableFilename(str(adapter.getOutputName())) + ext : StringIO.StringIO(data.decode('utf-8')) }
|
||||
d = inMemoryZip(files)
|
||||
data = d.getvalue()
|
||||
|
||||
|
||||
except LoginRequiredException, e:
|
||||
logging.exception(e)
|
||||
download.failure = 'Login problem detected'
|
||||
download.put()
|
||||
return
|
||||
except Exception, e:
|
||||
logging.exception(e)
|
||||
download.failure = 'Some exception happened in downloader: ' + str(e)
|
||||
download.put()
|
||||
return
|
||||
|
||||
if data == None:
|
||||
if loader.badLogin:
|
||||
logging.debug("Bad login detected")
|
||||
download.failure = 'Login failed'
|
||||
download.put()
|
||||
return
|
||||
download.failure = 'No data returned by adaptor'
|
||||
download.put()
|
||||
else:
|
||||
download.name = self._printableVersion(adapter.getOutputName())
|
||||
download.title = self._printableVersion(adapter.getStoryName())
|
||||
download.author = self._printableVersion(adapter.getAuthorName())
|
||||
download.put()
|
||||
index=0
|
||||
|
||||
# epub, txt and html are all already compressed.
|
||||
# Each chunk is compressed individually to avoid having
|
||||
# to hold the whole in memory just for the
|
||||
# compress/uncompress.
|
||||
if format == 'mobi':
|
||||
def c(data):
|
||||
return zlib.compress(data)
|
||||
else:
|
||||
def c(data):
|
||||
return data
|
||||
|
||||
while( len(data) > 0 ):
|
||||
DownloadData(download=download,
|
||||
index=index,
|
||||
blob=c(data[:1000000])).put()
|
||||
index += 1
|
||||
data = data[1000000:]
|
||||
download.completed=True
|
||||
download.put()
|
||||
|
||||
logging.info("Download finished OK")
|
||||
return
|
||||
|
||||
def toPercentDecimal(match):
|
||||
"Return the %decimal number for the character for url escaping"
|
||||
s = match.group(1)
|
||||
return "%%%02x" % ord(s)
|
||||
|
||||
def urlEscape(data):
|
||||
"Escape text, including unicode, for use in URLs"
|
||||
p = re.compile(r'([^\w])')
|
||||
return p.sub(toPercentDecimal, data.encode("utf-8"))
|
||||
|
||||
def main():
|
||||
application = webapp.WSGIApplication([('/', MainHandler),
|
||||
('/fdowntask', FanfictionDownloaderTask),
|
||||
('/fdown', FanfictionDownloader),
|
||||
(r'/file.*', FileServer),
|
||||
('/status', FileStatusServer),
|
||||
('/recent', RecentFilesServer),
|
||||
('/r2d2', RecentAllFilesServer),
|
||||
('/login', LoginRequired)],
|
||||
debug=False)
|
||||
util.run_wsgi_app(application)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
main()
|
||||
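A quick, hypothetical usage example of urlEscape() above: every non-word character (including the UTF-8 bytes of non-ASCII characters) is percent-encoded, which is what the convertfiles.com links built on the status and recent pages rely on.

escaped = urlEscape(u'Fox in Tokyo.epub')
# -> 'Fox%20in%20Tokyo%2eepub'  (space and '.' are non-word characters)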
7
queue.yaml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
queue:
|
||||
- name: default
|
||||
rate: 1/s
|
||||
- name: download
|
||||
rate: 10/s
|
||||
retry_parameters:
|
||||
task_retry_limit: 2
|
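# For illustration only: a handler could push work onto the 'download' queue
# above via the App Engine task queue API; the exact call made by main.py is
# an assumption here, not taken from this commit:
#   from google.appengine.api.labs import taskqueue
#   taskqueue.add(queue_name='download', url='/fdowntask',
#                 params={'format': format, 'url': url})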
||||
80
recent.html
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
|
||||
<html>
|
||||
<head>
|
||||
<link href="/css/index.css" rel="stylesheet" type="text/css">
|
||||
<title>Fanfiction Downloader (fanfiction.net, fanficauthors, fictionalley, ficwad to epub and HTML)</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
</head>
|
||||
<body>
|
||||
<div id='main'>
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
|
||||
</h1>
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
<!-- <div id='yourfile'> -->
|
||||
{{yourfile}}
|
||||
<!-- </div> -->
|
||||
|
||||
<div id='urlbox'>
|
||||
<div id='greeting'>
|
||||
Hi, {{ nickname }}! These are the fanfics you've recently requested.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id='helpbox'>
|
||||
{% for fic in fics %}
|
||||
<p>
|
||||
{% if fic.completed %}
|
||||
<a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
|
||||
by {{ fic.author }} ({{ fic.format }})<br/>
|
||||
{% if fic.escaped_url %}
|
||||
<a href="http://www.convertfiles.com/index.php?url={{ fic.escaped_url }}">Convert {{ fic.title }} to other formats</a><br />
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if fic.failure %}
|
||||
<div id='error'>{{ fic.failure }}</div>
|
||||
{% endif %}
|
||||
{% if not fic.completed and not fic.failure %}
|
||||
Request Processing...<br />
|
||||
{% endif %}
|
||||
<small><a href="{{ fic.url }}">{{ fic.url }}</a></small>
|
||||
|
||||
</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<script type="text/javascript"><!--
|
||||
google_ad_client = "ca-pub-0320924304307555";
|
||||
/* Standard */
|
||||
google_ad_slot = "8974025478";
|
||||
google_ad_width = 468;
|
||||
google_ad_height = 60;
|
||||
//-->
|
||||
</script>
|
||||
<script type="text/javascript"
|
||||
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
|
||||
</script>
|
||||
|
||||
</div>
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
try {
|
||||
var pageTracker = _gat._getTracker("UA-12136939-1");
|
||||
pageTracker._trackPageview();
|
||||
} catch(err) {}</script>
|
||||
</body>
|
||||
</html>
|
||||
318
simplejson/__init__.py
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
r"""JSON (JavaScript Object Notation) <http://json.org> is a subset of
|
||||
JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data
|
||||
interchange format.
|
||||
|
||||
:mod:`simplejson` exposes an API familiar to users of the standard library
|
||||
:mod:`marshal` and :mod:`pickle` modules. It is the externally maintained
|
||||
version of the :mod:`json` library contained in Python 2.6, but maintains
|
||||
compatibility with Python 2.4 and Python 2.5 and (currently) has
|
||||
significant performance advantages, even without using the optional C
|
||||
extension for speedups.
|
||||
|
||||
Encoding basic Python object hierarchies::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> json.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}])
|
||||
'["foo", {"bar": ["baz", null, 1.0, 2]}]'
|
||||
>>> print json.dumps("\"foo\bar")
|
||||
"\"foo\bar"
|
||||
>>> print json.dumps(u'\u1234')
|
||||
"\u1234"
|
||||
>>> print json.dumps('\\')
|
||||
"\\"
|
||||
>>> print json.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True)
|
||||
{"a": 0, "b": 0, "c": 0}
|
||||
>>> from StringIO import StringIO
|
||||
>>> io = StringIO()
|
||||
>>> json.dump(['streaming API'], io)
|
||||
>>> io.getvalue()
|
||||
'["streaming API"]'
|
||||
|
||||
Compact encoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> json.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':'))
|
||||
'[1,2,3,{"4":5,"6":7}]'
|
||||
|
||||
Pretty printing::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> s = json.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4)
|
||||
>>> print '\n'.join([l.rstrip() for l in s.splitlines()])
|
||||
{
|
||||
"4": 5,
|
||||
"6": 7
|
||||
}
|
||||
|
||||
Decoding JSON::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> obj = [u'foo', {u'bar': [u'baz', None, 1.0, 2]}]
|
||||
>>> json.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') == obj
|
||||
True
|
||||
>>> json.loads('"\\"foo\\bar"') == u'"foo\x08ar'
|
||||
True
|
||||
>>> from StringIO import StringIO
|
||||
>>> io = StringIO('["streaming API"]')
|
||||
>>> json.load(io)[0] == 'streaming API'
|
||||
True
|
||||
|
||||
Specializing JSON object decoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> def as_complex(dct):
|
||||
... if '__complex__' in dct:
|
||||
... return complex(dct['real'], dct['imag'])
|
||||
... return dct
|
||||
...
|
||||
>>> json.loads('{"__complex__": true, "real": 1, "imag": 2}',
|
||||
... object_hook=as_complex)
|
||||
(1+2j)
|
||||
>>> import decimal
|
||||
>>> json.loads('1.1', parse_float=decimal.Decimal) == decimal.Decimal('1.1')
|
||||
True
|
||||
|
||||
Specializing JSON object encoding::
|
||||
|
||||
>>> import simplejson as json
|
||||
>>> def encode_complex(obj):
|
||||
... if isinstance(obj, complex):
|
||||
... return [obj.real, obj.imag]
|
||||
... raise TypeError(repr(o) + " is not JSON serializable")
|
||||
...
|
||||
>>> json.dumps(2 + 1j, default=encode_complex)
|
||||
'[2.0, 1.0]'
|
||||
>>> json.JSONEncoder(default=encode_complex).encode(2 + 1j)
|
||||
'[2.0, 1.0]'
|
||||
>>> ''.join(json.JSONEncoder(default=encode_complex).iterencode(2 + 1j))
|
||||
'[2.0, 1.0]'
|
||||
|
||||
|
||||
Using simplejson.tool from the shell to validate and pretty-print::
|
||||
|
||||
$ echo '{"json":"obj"}' | python -m simplejson.tool
|
||||
{
|
||||
"json": "obj"
|
||||
}
|
||||
$ echo '{ 1.2:3.4}' | python -m simplejson.tool
|
||||
Expecting property name: line 1 column 2 (char 2)
|
||||
"""
|
||||
__version__ = '2.0.9'
|
||||
__all__ = [
|
||||
'dump', 'dumps', 'load', 'loads',
|
||||
'JSONDecoder', 'JSONEncoder',
|
||||
]
|
||||
|
||||
__author__ = 'Bob Ippolito <bob@redivi.com>'
|
||||
|
||||
from decoder import JSONDecoder
|
||||
from encoder import JSONEncoder
|
||||
|
||||
_default_encoder = JSONEncoder(
|
||||
skipkeys=False,
|
||||
ensure_ascii=True,
|
||||
check_circular=True,
|
||||
allow_nan=True,
|
||||
indent=None,
|
||||
separators=None,
|
||||
encoding='utf-8',
|
||||
default=None,
|
||||
)
|
||||
|
||||
def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True,
|
||||
allow_nan=True, cls=None, indent=None, separators=None,
|
||||
encoding='utf-8', default=None, **kw):
|
||||
"""Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
|
||||
``.write()``-supporting file-like object).
|
||||
|
||||
If ``skipkeys`` is true then ``dict`` keys that are not basic types
|
||||
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
|
||||
will be skipped instead of raising a ``TypeError``.
|
||||
|
||||
If ``ensure_ascii`` is false, then some chunks written to ``fp``
|
||||
may be ``unicode`` instances, subject to normal Python ``str`` to
|
||||
``unicode`` coercion rules. Unless ``fp.write()`` explicitly
|
||||
understands ``unicode`` (as in ``codecs.getwriter()``) this is likely
|
||||
to cause an error.
|
||||
|
||||
If ``check_circular`` is false, then the circular reference check
|
||||
for container types will be skipped and a circular reference will
|
||||
result in an ``OverflowError`` (or worse).
|
||||
|
||||
If ``allow_nan`` is false, then it will be a ``ValueError`` to
|
||||
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``)
|
||||
in strict compliance of the JSON specification, instead of using the
|
||||
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
|
||||
|
||||
If ``indent`` is a non-negative integer, then JSON array elements and object
|
||||
members will be pretty-printed with that indent level. An indent level
|
||||
of 0 will only insert newlines. ``None`` is the most compact representation.
|
||||
|
||||
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
|
||||
then it will be used instead of the default ``(', ', ': ')`` separators.
|
||||
``(',', ':')`` is the most compact JSON representation.
|
||||
|
||||
``encoding`` is the character encoding for str instances, default is UTF-8.
|
||||
|
||||
``default(obj)`` is a function that should return a serializable version
|
||||
of obj or raise TypeError. The default simply raises TypeError.
|
||||
|
||||
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
|
||||
``.default()`` method to serialize additional types), specify it with
|
||||
the ``cls`` kwarg.
|
||||
|
||||
"""
|
||||
# cached encoder
|
||||
if (not skipkeys and ensure_ascii and
|
||||
check_circular and allow_nan and
|
||||
cls is None and indent is None and separators is None and
|
||||
encoding == 'utf-8' and default is None and not kw):
|
||||
iterable = _default_encoder.iterencode(obj)
|
||||
else:
|
||||
if cls is None:
|
||||
cls = JSONEncoder
|
||||
iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii,
|
||||
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
|
||||
separators=separators, encoding=encoding,
|
||||
default=default, **kw).iterencode(obj)
|
||||
# could accelerate with writelines in some versions of Python, at
|
||||
# a debuggability cost
|
||||
for chunk in iterable:
|
||||
fp.write(chunk)
|
||||
|
||||
|
||||
def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True,
|
||||
allow_nan=True, cls=None, indent=None, separators=None,
|
||||
encoding='utf-8', default=None, **kw):
|
||||
"""Serialize ``obj`` to a JSON formatted ``str``.
|
||||
|
||||
If ``skipkeys`` is true then ``dict`` keys that are not basic types
|
||||
(``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``)
|
||||
will be skipped instead of raising a ``TypeError``.
|
||||
|
||||
If ``ensure_ascii`` is false, then the return value will be a
|
||||
``unicode`` instance subject to normal Python ``str`` to ``unicode``
|
||||
coercion rules instead of being escaped to an ASCII ``str``.
|
||||
|
||||
If ``check_circular`` is false, then the circular reference check
|
||||
for container types will be skipped and a circular reference will
|
||||
result in an ``OverflowError`` (or worse).
|
||||
|
||||
If ``allow_nan`` is false, then it will be a ``ValueError`` to
|
||||
serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
|
||||
strict compliance of the JSON specification, instead of using the
|
||||
JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``).
|
||||
|
||||
If ``indent`` is a non-negative integer, then JSON array elements and
|
||||
object members will be pretty-printed with that indent level. An indent
|
||||
level of 0 will only insert newlines. ``None`` is the most compact
|
||||
representation.
|
||||
|
||||
If ``separators`` is an ``(item_separator, dict_separator)`` tuple
|
||||
then it will be used instead of the default ``(', ', ': ')`` separators.
|
||||
``(',', ':')`` is the most compact JSON representation.
|
||||
|
||||
``encoding`` is the character encoding for str instances, default is UTF-8.
|
||||
|
||||
``default(obj)`` is a function that should return a serializable version
|
||||
of obj or raise TypeError. The default simply raises TypeError.
|
||||
|
||||
To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the
|
||||
``.default()`` method to serialize additional types), specify it with
|
||||
the ``cls`` kwarg.
|
||||
|
||||
"""
|
||||
# cached encoder
|
||||
if (not skipkeys and ensure_ascii and
|
||||
check_circular and allow_nan and
|
||||
cls is None and indent is None and separators is None and
|
||||
encoding == 'utf-8' and default is None and not kw):
|
||||
return _default_encoder.encode(obj)
|
||||
if cls is None:
|
||||
cls = JSONEncoder
|
||||
return cls(
|
||||
skipkeys=skipkeys, ensure_ascii=ensure_ascii,
|
||||
check_circular=check_circular, allow_nan=allow_nan, indent=indent,
|
||||
separators=separators, encoding=encoding, default=default,
|
||||
**kw).encode(obj)
|
||||
|
||||
|
||||
_default_decoder = JSONDecoder(encoding=None, object_hook=None)
|
||||
|
||||
|
||||
def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None,
|
||||
parse_int=None, parse_constant=None, **kw):
|
||||
"""Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
|
||||
a JSON document) to a Python object.
|
||||
|
||||
If the contents of ``fp`` is encoded with an ASCII based encoding other
|
||||
than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must
|
||||
be specified. Encodings that are not ASCII based (such as UCS-2) are
|
||||
not allowed, and should be wrapped with
|
||||
``codecs.getreader(encoding)(fp)``, or simply decoded to a ``unicode``
|
||||
object and passed to ``loads()``
|
||||
|
||||
``object_hook`` is an optional function that will be called with the
|
||||
result of any object literal decode (a ``dict``). The return value of
|
||||
``object_hook`` will be used instead of the ``dict``. This feature
|
||||
can be used to implement custom decoders (e.g. JSON-RPC class hinting).
|
||||
|
||||
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
|
||||
kwarg.
|
||||
|
||||
"""
|
||||
return loads(fp.read(),
|
||||
encoding=encoding, cls=cls, object_hook=object_hook,
|
||||
parse_float=parse_float, parse_int=parse_int,
|
||||
parse_constant=parse_constant, **kw)
|
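# A usage sketch for the encoding notes above (file names are illustrative):
#   obj = load(open('data.json'), encoding='latin-1')
#   import codecs
#   obj = loads(codecs.getreader('utf-16')(open('data.json', 'rb')).read())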
||||
|
||||
|
||||
def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None,
|
||||
parse_int=None, parse_constant=None, **kw):
|
||||
"""Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON
|
||||
document) to a Python object.
|
||||
|
||||
If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding
|
||||
other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name
|
||||
must be specified. Encodings that are not ASCII based (such as UCS-2)
|
||||
are not allowed and should be decoded to ``unicode`` first.
|
||||
|
||||
``object_hook`` is an optional function that will be called with the
|
||||
result of any object literal decode (a ``dict``). The return value of
|
||||
``object_hook`` will be used instead of the ``dict``. This feature
|
||||
can be used to implement custom decoders (e.g. JSON-RPC class hinting).
|
||||
|
||||
``parse_float``, if specified, will be called with the string
|
||||
of every JSON float to be decoded. By default this is equivalent to
|
||||
float(num_str). This can be used to use another datatype or parser
|
||||
for JSON floats (e.g. decimal.Decimal).
|
||||
|
||||
``parse_int``, if specified, will be called with the string
|
||||
of every JSON int to be decoded. By default this is equivalent to
|
||||
int(num_str). This can be used to use another datatype or parser
|
||||
for JSON integers (e.g. float).
|
||||
|
||||
``parse_constant``, if specified, will be called with one of the
|
||||
following strings: -Infinity, Infinity, NaN, null, true, false.
|
||||
This can be used to raise an exception if invalid JSON numbers
|
||||
are encountered.
|
||||
|
||||
To use a custom ``JSONDecoder`` subclass, specify it with the ``cls``
|
||||
kwarg.
|
||||
|
||||
"""
|
||||
if (cls is None and encoding is None and object_hook is None and
|
||||
parse_int is None and parse_float is None and
|
||||
parse_constant is None and not kw):
|
||||
return _default_decoder.decode(s)
|
||||
if cls is None:
|
||||
cls = JSONDecoder
|
||||
if object_hook is not None:
|
||||
kw['object_hook'] = object_hook
|
||||
if parse_float is not None:
|
||||
kw['parse_float'] = parse_float
|
||||
if parse_int is not None:
|
||||
kw['parse_int'] = parse_int
|
||||
if parse_constant is not None:
|
||||
kw['parse_constant'] = parse_constant
|
||||
return cls(encoding=encoding, **kw).decode(s)
|
||||
BIN
simplejson/__init__.pyc
Normal file
Binary file not shown.
2329
simplejson/_speedups.c
Normal file
File diff suppressed because it is too large
354
simplejson/decoder.py
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
"""Implementation of JSONDecoder
|
||||
"""
|
||||
import re
|
||||
import sys
|
||||
import struct
|
||||
|
||||
from simplejson.scanner import make_scanner
|
||||
try:
|
||||
from simplejson._speedups import scanstring as c_scanstring
|
||||
except ImportError:
|
||||
c_scanstring = None
|
||||
|
||||
__all__ = ['JSONDecoder']
|
||||
|
||||
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
|
||||
|
||||
def _floatconstants():
|
||||
_BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
|
||||
if sys.byteorder != 'big':
|
||||
_BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
|
||||
nan, inf = struct.unpack('dd', _BYTES)
|
||||
return nan, inf, -inf
|
||||
|
||||
NaN, PosInf, NegInf = _floatconstants()
|
||||
|
||||
|
||||
def linecol(doc, pos):
|
||||
lineno = doc.count('\n', 0, pos) + 1
|
||||
if lineno == 1:
|
||||
colno = pos
|
||||
else:
|
||||
colno = pos - doc.rindex('\n', 0, pos)
|
||||
return lineno, colno
|
||||
|
||||
|
||||
def errmsg(msg, doc, pos, end=None):
|
||||
# Note that this function is called from _speedups
|
||||
lineno, colno = linecol(doc, pos)
|
||||
if end is None:
|
||||
#fmt = '{0}: line {1} column {2} (char {3})'
|
||||
#return fmt.format(msg, lineno, colno, pos)
|
||||
fmt = '%s: line %d column %d (char %d)'
|
||||
return fmt % (msg, lineno, colno, pos)
|
||||
endlineno, endcolno = linecol(doc, end)
|
||||
#fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
|
||||
#return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
|
||||
fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
|
||||
return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
|
||||
|
||||
|
||||
_CONSTANTS = {
|
||||
'-Infinity': NegInf,
|
||||
'Infinity': PosInf,
|
||||
'NaN': NaN,
|
||||
}
|
||||
|
||||
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
|
||||
BACKSLASH = {
|
||||
'"': u'"', '\\': u'\\', '/': u'/',
|
||||
'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
|
||||
}
|
||||
|
||||
DEFAULT_ENCODING = "utf-8"
|
||||
|
||||
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
|
||||
"""Scan the string s for a JSON string. End is the index of the
|
||||
character in s after the quote that started the JSON string.
|
||||
Unescapes all valid JSON string escape sequences and raises ValueError
|
||||
on attempt to decode an invalid string. If strict is False then literal
|
||||
control characters are allowed in the string.
|
||||
|
||||
Returns a tuple of the decoded string and the index of the character in s
|
||||
after the end quote."""
|
||||
if encoding is None:
|
||||
encoding = DEFAULT_ENCODING
|
||||
chunks = []
|
||||
_append = chunks.append
|
||||
begin = end - 1
|
||||
while 1:
|
||||
chunk = _m(s, end)
|
||||
if chunk is None:
|
||||
raise ValueError(
|
||||
errmsg("Unterminated string starting at", s, begin))
|
||||
end = chunk.end()
|
||||
content, terminator = chunk.groups()
|
||||
# Content is contains zero or more unescaped string characters
|
||||
if content:
|
||||
if not isinstance(content, unicode):
|
||||
content = unicode(content, encoding)
|
||||
_append(content)
|
||||
# Terminator is the end of string, a literal control character,
|
||||
# or a backslash denoting that an escape sequence follows
|
||||
if terminator == '"':
|
||||
break
|
||||
elif terminator != '\\':
|
||||
if strict:
|
||||
msg = "Invalid control character %r at" % (terminator,)
|
||||
#msg = "Invalid control character {0!r} at".format(terminator)
|
||||
raise ValueError(errmsg(msg, s, end))
|
||||
else:
|
||||
_append(terminator)
|
||||
continue
|
||||
try:
|
||||
esc = s[end]
|
||||
except IndexError:
|
||||
raise ValueError(
|
||||
errmsg("Unterminated string starting at", s, begin))
|
||||
# If not a unicode escape sequence, must be in the lookup table
|
||||
if esc != 'u':
|
||||
try:
|
||||
char = _b[esc]
|
||||
except KeyError:
|
||||
msg = "Invalid \\escape: " + repr(esc)
|
||||
raise ValueError(errmsg(msg, s, end))
|
||||
end += 1
|
||||
else:
|
||||
# Unicode escape sequence
|
||||
esc = s[end + 1:end + 5]
|
||||
next_end = end + 5
|
||||
if len(esc) != 4:
|
||||
msg = "Invalid \\uXXXX escape"
|
||||
raise ValueError(errmsg(msg, s, end))
|
||||
uni = int(esc, 16)
|
||||
# Check for surrogate pair on UCS-4 systems
|
||||
if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
|
||||
msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
|
||||
if not s[end + 5:end + 7] == '\\u':
|
||||
raise ValueError(errmsg(msg, s, end))
|
||||
esc2 = s[end + 7:end + 11]
|
||||
if len(esc2) != 4:
|
||||
raise ValueError(errmsg(msg, s, end))
|
||||
uni2 = int(esc2, 16)
|
||||
uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
|
||||
next_end += 6
|
||||
char = unichr(uni)
|
||||
end = next_end
|
||||
# Append the unescaped character
|
||||
_append(char)
|
||||
return u''.join(chunks), end
|
||||
|
||||
|
||||
# Use speedup if available
|
||||
scanstring = c_scanstring or py_scanstring
|
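# For illustration: scan a JSON string literal starting just past its opening
# quote, e.g.
#   scanstring(r'"foo\nbar" tail', 1)  ->  (u'foo\nbar', 10)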
||||
|
||||
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
|
||||
WHITESPACE_STR = ' \t\n\r'
|
||||
|
||||
def JSONObject((s, end), encoding, strict, scan_once, object_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
pairs = {}
|
||||
# Use a slice to prevent IndexError from being raised, the following
|
||||
# check will raise a more specific ValueError if the string is empty
|
||||
nextchar = s[end:end + 1]
|
||||
# Normally we expect nextchar == '"'
|
||||
if nextchar != '"':
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end).end()
|
||||
nextchar = s[end:end + 1]
|
||||
# Trivial empty object
|
||||
if nextchar == '}':
|
||||
return pairs, end + 1
|
||||
elif nextchar != '"':
|
||||
raise ValueError(errmsg("Expecting property name", s, end))
|
||||
end += 1
|
||||
while True:
|
||||
key, end = scanstring(s, end, encoding, strict)
|
||||
|
||||
# To skip some function call overhead we optimize the fast paths where
|
||||
# the JSON key separator is ": " or just ":".
|
||||
if s[end:end + 1] != ':':
|
||||
end = _w(s, end).end()
|
||||
if s[end:end + 1] != ':':
|
||||
raise ValueError(errmsg("Expecting : delimiter", s, end))
|
||||
|
||||
end += 1
|
||||
|
||||
try:
|
||||
if s[end] in _ws:
|
||||
end += 1
|
||||
if s[end] in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
try:
|
||||
value, end = scan_once(s, end)
|
||||
except StopIteration:
|
||||
raise ValueError(errmsg("Expecting object", s, end))
|
||||
pairs[key] = value
|
||||
|
||||
try:
|
||||
nextchar = s[end]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end]
|
||||
except IndexError:
|
||||
nextchar = ''
|
||||
end += 1
|
||||
|
||||
if nextchar == '}':
|
||||
break
|
||||
elif nextchar != ',':
|
||||
raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
|
||||
|
||||
try:
|
||||
nextchar = s[end]
|
||||
if nextchar in _ws:
|
||||
end += 1
|
||||
nextchar = s[end]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end]
|
||||
except IndexError:
|
||||
nextchar = ''
|
||||
|
||||
end += 1
|
||||
if nextchar != '"':
|
||||
raise ValueError(errmsg("Expecting property name", s, end - 1))
|
||||
|
||||
if object_hook is not None:
|
||||
pairs = object_hook(pairs)
|
||||
return pairs, end
|
||||
|
||||
def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
values = []
|
||||
nextchar = s[end:end + 1]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end:end + 1]
|
||||
# Look-ahead for trivial empty array
|
||||
if nextchar == ']':
|
||||
return values, end + 1
|
||||
_append = values.append
|
||||
while True:
|
||||
try:
|
||||
value, end = scan_once(s, end)
|
||||
except StopIteration:
|
||||
raise ValueError(errmsg("Expecting object", s, end))
|
||||
_append(value)
|
||||
nextchar = s[end:end + 1]
|
||||
if nextchar in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
nextchar = s[end:end + 1]
|
||||
end += 1
|
||||
if nextchar == ']':
|
||||
break
|
||||
elif nextchar != ',':
|
||||
raise ValueError(errmsg("Expecting , delimiter", s, end))
|
||||
|
||||
try:
|
||||
if s[end] in _ws:
|
||||
end += 1
|
||||
if s[end] in _ws:
|
||||
end = _w(s, end + 1).end()
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
return values, end
|
||||
|
||||
class JSONDecoder(object):
|
||||
"""Simple JSON <http://json.org> decoder
|
||||
|
||||
Performs the following translations in decoding by default:
|
||||
|
||||
+---------------+-------------------+
|
||||
| JSON | Python |
|
||||
+===============+===================+
|
||||
| object | dict |
|
||||
+---------------+-------------------+
|
||||
| array | list |
|
||||
+---------------+-------------------+
|
||||
| string | unicode |
|
||||
+---------------+-------------------+
|
||||
| number (int) | int, long |
|
||||
+---------------+-------------------+
|
||||
| number (real) | float |
|
||||
+---------------+-------------------+
|
||||
| true | True |
|
||||
+---------------+-------------------+
|
||||
| false | False |
|
||||
+---------------+-------------------+
|
||||
| null | None |
|
||||
+---------------+-------------------+
|
||||
|
||||
It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
|
||||
their corresponding ``float`` values, which is outside the JSON spec.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, encoding=None, object_hook=None, parse_float=None,
|
||||
parse_int=None, parse_constant=None, strict=True):
|
||||
"""``encoding`` determines the encoding used to interpret any ``str``
|
||||
objects decoded by this instance (utf-8 by default). It has no
|
||||
effect when decoding ``unicode`` objects.
|
||||
|
||||
Note that currently only encodings that are a superset of ASCII work,
|
||||
strings of other encodings should be passed in as ``unicode``.
|
||||
|
||||
``object_hook``, if specified, will be called with the result
|
||||
of every JSON object decoded and its return value will be used in
|
||||
place of the given ``dict``. This can be used to provide custom
|
||||
deserializations (e.g. to support JSON-RPC class hinting).
|
||||
|
||||
``parse_float``, if specified, will be called with the string
|
||||
of every JSON float to be decoded. By default this is equivalent to
|
||||
float(num_str). This can be used to use another datatype or parser
|
||||
for JSON floats (e.g. decimal.Decimal).
|
||||
|
||||
``parse_int``, if specified, will be called with the string
|
||||
of every JSON int to be decoded. By default this is equivalent to
|
||||
int(num_str). This can be used to use another datatype or parser
|
||||
for JSON integers (e.g. float).
|
||||
|
||||
``parse_constant``, if specified, will be called with one of the
|
||||
following strings: -Infinity, Infinity, NaN.
|
||||
This can be used to raise an exception if invalid JSON numbers
|
||||
are encountered.
|
||||
|
||||
"""
|
||||
self.encoding = encoding
|
||||
self.object_hook = object_hook
|
||||
self.parse_float = parse_float or float
|
||||
self.parse_int = parse_int or int
|
||||
self.parse_constant = parse_constant or _CONSTANTS.__getitem__
|
||||
self.strict = strict
|
||||
self.parse_object = JSONObject
|
||||
self.parse_array = JSONArray
|
||||
self.parse_string = scanstring
|
||||
self.scan_once = make_scanner(self)
|
||||
|
||||
def decode(self, s, _w=WHITESPACE.match):
|
||||
"""Return the Python representation of ``s`` (a ``str`` or ``unicode``
|
||||
instance containing a JSON document)
|
||||
|
||||
"""
|
||||
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
|
||||
end = _w(s, end).end()
|
||||
if end != len(s):
|
||||
raise ValueError(errmsg("Extra data", s, end, len(s)))
|
||||
return obj
|
||||
|
||||
def raw_decode(self, s, idx=0):
|
||||
"""Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
|
||||
with a JSON document) and return a 2-tuple of the Python
|
||||
representation and the index in ``s`` where the document ended.
|
||||
|
||||
This can be used to decode a JSON document from a string that may
|
||||
have extraneous data at the end.
|
||||
|
||||
"""
|
||||
try:
|
||||
obj, end = self.scan_once(s, idx)
|
||||
except StopIteration:
|
||||
raise ValueError("No JSON object could be decoded")
|
||||
return obj, end
|
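# For illustration: decode() insists the whole input is one document, while
# raw_decode() tolerates trailing data, e.g.
#   JSONDecoder().raw_decode('{"a": 1} trailing')  ->  ({u'a': 1}, 8)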
||||
BIN
simplejson/decoder.pyc
Normal file
Binary file not shown.
440
simplejson/encoder.py
Normal file
|
|
@ -0,0 +1,440 @@
|
|||
"""Implementation of JSONEncoder
|
||||
"""
|
||||
import re
|
||||
|
||||
try:
|
||||
from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii
|
||||
except ImportError:
|
||||
c_encode_basestring_ascii = None
|
||||
try:
|
||||
from simplejson._speedups import make_encoder as c_make_encoder
|
||||
except ImportError:
|
||||
c_make_encoder = None
|
||||
|
||||
ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]')
|
||||
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
|
||||
HAS_UTF8 = re.compile(r'[\x80-\xff]')
|
||||
ESCAPE_DCT = {
|
||||
'\\': '\\\\',
|
||||
'"': '\\"',
|
||||
'\b': '\\b',
|
||||
'\f': '\\f',
|
||||
'\n': '\\n',
|
||||
'\r': '\\r',
|
||||
'\t': '\\t',
|
||||
}
|
||||
for i in range(0x20):
|
||||
#ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
|
||||
ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
|
||||
|
||||
# Assume this produces an infinity on all machines (probably not guaranteed)
|
||||
INFINITY = float('1e66666')
|
||||
FLOAT_REPR = repr
|
||||
|
||||
def encode_basestring(s):
|
||||
"""Return a JSON representation of a Python string
|
||||
|
||||
"""
|
||||
def replace(match):
|
||||
return ESCAPE_DCT[match.group(0)]
|
||||
return '"' + ESCAPE.sub(replace, s) + '"'
|
||||
|
||||
|
||||
def py_encode_basestring_ascii(s):
|
||||
"""Return an ASCII-only JSON representation of a Python string
|
||||
|
||||
"""
|
||||
if isinstance(s, str) and HAS_UTF8.search(s) is not None:
|
||||
s = s.decode('utf-8')
|
||||
def replace(match):
|
||||
s = match.group(0)
|
||||
try:
|
||||
return ESCAPE_DCT[s]
|
||||
except KeyError:
|
||||
n = ord(s)
|
||||
if n < 0x10000:
|
||||
#return '\\u{0:04x}'.format(n)
|
||||
return '\\u%04x' % (n,)
|
||||
else:
|
||||
# surrogate pair
|
||||
n -= 0x10000
|
||||
s1 = 0xd800 | ((n >> 10) & 0x3ff)
|
||||
s2 = 0xdc00 | (n & 0x3ff)
|
||||
#return '\\u{0:04x}\\u{1:04x}'.format(s1, s2)
|
||||
return '\\u%04x\\u%04x' % (s1, s2)
|
||||
return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"'
|
||||
|
||||
|
||||
encode_basestring_ascii = c_encode_basestring_ascii or py_encode_basestring_ascii
|
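# For illustration:
#   encode_basestring(u'caf\xe9')        ->  u'"caf\xe9"'
#   encode_basestring_ascii(u'caf\xe9')  ->  '"caf\\u00e9"'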
||||
|
||||
class JSONEncoder(object):
|
||||
"""Extensible JSON <http://json.org> encoder for Python data structures.
|
||||
|
||||
Supports the following objects and types by default:
|
||||
|
||||
+-------------------+---------------+
|
||||
| Python | JSON |
|
||||
+===================+===============+
|
||||
| dict | object |
|
||||
+-------------------+---------------+
|
||||
| list, tuple | array |
|
||||
+-------------------+---------------+
|
||||
| str, unicode | string |
|
||||
+-------------------+---------------+
|
||||
| int, long, float | number |
|
||||
+-------------------+---------------+
|
||||
| True | true |
|
||||
+-------------------+---------------+
|
||||
| False | false |
|
||||
+-------------------+---------------+
|
||||
| None | null |
|
||||
+-------------------+---------------+
|
||||
|
||||
To extend this to recognize other objects, subclass and implement a
|
||||
``.default()`` method that returns a serializable
|
||||
object for ``o`` if possible, otherwise it should call the superclass
|
||||
implementation (to raise ``TypeError``).
|
||||
|
||||
"""
|
||||
item_separator = ', '
|
||||
key_separator = ': '
|
||||
def __init__(self, skipkeys=False, ensure_ascii=True,
|
||||
check_circular=True, allow_nan=True, sort_keys=False,
|
||||
indent=None, separators=None, encoding='utf-8', default=None):
|
||||
"""Constructor for JSONEncoder, with sensible defaults.
|
||||
|
||||
If skipkeys is false, then it is a TypeError to attempt
|
||||
encoding of keys that are not str, int, long, float or None. If
|
||||
skipkeys is True, such items are simply skipped.
|
||||
|
||||
If ensure_ascii is true, the output is guaranteed to be str
|
||||
objects with all incoming unicode characters escaped. If
|
||||
ensure_ascii is false, the output will be a unicode object.
|
||||
|
||||
If check_circular is true, then lists, dicts, and custom encoded
|
||||
objects will be checked for circular references during encoding to
|
||||
prevent an infinite recursion (which would cause an OverflowError).
|
||||
Otherwise, no such check takes place.
|
||||
|
||||
If allow_nan is true, then NaN, Infinity, and -Infinity will be
|
||||
encoded as such. This behavior is not JSON specification compliant,
|
||||
but is consistent with most JavaScript based encoders and decoders.
|
||||
Otherwise, it will be a ValueError to encode such floats.
|
||||
|
||||
If sort_keys is true, then the output of dictionaries will be
|
||||
sorted by key; this is useful for regression tests to ensure
|
||||
that JSON serializations can be compared on a day-to-day basis.
|
||||
|
||||
If indent is a non-negative integer, then JSON array
|
||||
elements and object members will be pretty-printed with that
|
||||
indent level. An indent level of 0 will only insert newlines.
|
||||
None is the most compact representation.
|
||||
|
||||
If specified, separators should be an (item_separator, key_separator)
|
||||
tuple. The default is (', ', ': '). To get the most compact JSON
|
||||
representation you should specify (',', ':') to eliminate whitespace.
|
||||
|
||||
If specified, default is a function that gets called for objects
|
||||
that can't otherwise be serialized. It should return a JSON encodable
|
||||
version of the object or raise a ``TypeError``.
|
||||
|
||||
If encoding is not None, then all input strings will be
|
||||
transformed into unicode using that encoding prior to JSON-encoding.
|
||||
The default is UTF-8.
|
||||
|
||||
"""
|
||||
|
||||
self.skipkeys = skipkeys
|
||||
self.ensure_ascii = ensure_ascii
|
||||
self.check_circular = check_circular
|
||||
self.allow_nan = allow_nan
|
||||
self.sort_keys = sort_keys
|
||||
self.indent = indent
|
||||
if separators is not None:
|
||||
self.item_separator, self.key_separator = separators
|
||||
if default is not None:
|
||||
self.default = default
|
||||
self.encoding = encoding
|
||||
|
||||
def default(self, o):
|
||||
"""Implement this method in a subclass such that it returns
|
||||
a serializable object for ``o``, or calls the base implementation
|
||||
(to raise a ``TypeError``).
|
||||
|
||||
For example, to support arbitrary iterators, you could
|
||||
implement default like this::
|
||||
|
||||
def default(self, o):
|
||||
try:
|
||||
iterable = iter(o)
|
||||
except TypeError:
|
||||
pass
|
||||
else:
|
||||
return list(iterable)
|
||||
return JSONEncoder.default(self, o)
|
||||
|
||||
"""
|
||||
raise TypeError(repr(o) + " is not JSON serializable")
|
||||
|
||||
def encode(self, o):
|
||||
"""Return a JSON string representation of a Python data structure.
|
||||
|
||||
>>> JSONEncoder().encode({"foo": ["bar", "baz"]})
|
||||
'{"foo": ["bar", "baz"]}'
|
||||
|
||||
"""
|
||||
# This is for extremely simple cases and benchmarks.
|
||||
if isinstance(o, basestring):
|
||||
if isinstance(o, str):
|
||||
_encoding = self.encoding
|
||||
if (_encoding is not None
|
||||
and not (_encoding == 'utf-8')):
|
||||
o = o.decode(_encoding)
|
||||
if self.ensure_ascii:
|
||||
return encode_basestring_ascii(o)
|
||||
else:
|
||||
return encode_basestring(o)
|
||||
# This doesn't pass the iterator directly to ''.join() because the
|
||||
# exceptions aren't as detailed. The list call should be roughly
|
||||
# equivalent to the PySequence_Fast that ''.join() would do.
|
||||
chunks = self.iterencode(o, _one_shot=True)
|
||||
if not isinstance(chunks, (list, tuple)):
|
||||
chunks = list(chunks)
|
||||
return ''.join(chunks)
|
||||
|
||||
def iterencode(self, o, _one_shot=False):
|
||||
"""Encode the given object and yield each string
|
||||
representation as available.
|
||||
|
||||
For example::
|
||||
|
||||
for chunk in JSONEncoder().iterencode(bigobject):
|
||||
mysocket.write(chunk)
|
||||
|
||||
"""
|
||||
if self.check_circular:
|
||||
markers = {}
|
||||
else:
|
||||
markers = None
|
||||
if self.ensure_ascii:
|
||||
_encoder = encode_basestring_ascii
|
||||
else:
|
||||
_encoder = encode_basestring
|
||||
if self.encoding != 'utf-8':
|
||||
def _encoder(o, _orig_encoder=_encoder, _encoding=self.encoding):
|
||||
if isinstance(o, str):
|
||||
o = o.decode(_encoding)
|
||||
return _orig_encoder(o)
|
||||
|
||||
def floatstr(o, allow_nan=self.allow_nan, _repr=FLOAT_REPR, _inf=INFINITY, _neginf=-INFINITY):
|
||||
# Check for specials. Note that this type of test is processor- and/or
|
||||
# platform-specific, so do tests which don't depend on the internals.
|
||||
|
||||
if o != o:
|
||||
text = 'NaN'
|
||||
elif o == _inf:
|
||||
text = 'Infinity'
|
||||
elif o == _neginf:
|
||||
text = '-Infinity'
|
||||
else:
|
||||
return _repr(o)
|
||||
|
||||
if not allow_nan:
|
||||
raise ValueError(
|
||||
"Out of range float values are not JSON compliant: " +
|
||||
repr(o))
|
||||
|
||||
return text
|
||||
|
||||
|
||||
if _one_shot and c_make_encoder is not None and not self.indent and not self.sort_keys:
|
||||
_iterencode = c_make_encoder(
|
||||
markers, self.default, _encoder, self.indent,
|
||||
self.key_separator, self.item_separator, self.sort_keys,
|
||||
self.skipkeys, self.allow_nan)
|
||||
else:
|
||||
_iterencode = _make_iterencode(
|
||||
markers, self.default, _encoder, self.indent, floatstr,
|
||||
self.key_separator, self.item_separator, self.sort_keys,
|
||||
self.skipkeys, _one_shot)
|
||||
return _iterencode(o, 0)
|
||||
|
||||
def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
|
||||
## HACK: hand-optimized bytecode; turn globals into locals
|
||||
False=False,
|
||||
True=True,
|
||||
ValueError=ValueError,
|
||||
basestring=basestring,
|
||||
dict=dict,
|
||||
float=float,
|
||||
id=id,
|
||||
int=int,
|
||||
isinstance=isinstance,
|
||||
list=list,
|
||||
long=long,
|
||||
str=str,
|
||||
tuple=tuple,
|
||||
):
|
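# Passing these builtins in as keyword defaults makes them local to
# _make_iterencode; the nested encoder functions then reach them through
# closure cells rather than slower global/builtin lookups, which is the point
# of the "hand-optimized bytecode" hack noted above.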
||||
|
||||
def _iterencode_list(lst, _current_indent_level):
|
||||
if not lst:
|
||||
yield '[]'
|
||||
return
|
||||
if markers is not None:
|
||||
markerid = id(lst)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = lst
|
||||
buf = '['
|
||||
if _indent is not None:
|
||||
_current_indent_level += 1
|
||||
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
|
||||
separator = _item_separator + newline_indent
|
||||
buf += newline_indent
|
||||
else:
|
||||
newline_indent = None
|
||||
separator = _item_separator
|
||||
first = True
|
||||
for value in lst:
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
buf = separator
|
||||
if isinstance(value, basestring):
|
||||
yield buf + _encoder(value)
|
||||
elif value is None:
|
||||
yield buf + 'null'
|
||||
elif value is True:
|
||||
yield buf + 'true'
|
||||
elif value is False:
|
||||
yield buf + 'false'
|
||||
elif isinstance(value, (int, long)):
|
||||
yield buf + str(value)
|
||||
elif isinstance(value, float):
|
||||
yield buf + _floatstr(value)
|
||||
else:
|
||||
yield buf
|
||||
if isinstance(value, (list, tuple)):
|
||||
chunks = _iterencode_list(value, _current_indent_level)
|
||||
elif isinstance(value, dict):
|
||||
chunks = _iterencode_dict(value, _current_indent_level)
|
||||
else:
|
||||
chunks = _iterencode(value, _current_indent_level)
|
||||
for chunk in chunks:
|
||||
yield chunk
|
||||
if newline_indent is not None:
|
||||
_current_indent_level -= 1
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
yield ']'
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
def _iterencode_dict(dct, _current_indent_level):
|
||||
if not dct:
|
||||
yield '{}'
|
||||
return
|
||||
if markers is not None:
|
||||
markerid = id(dct)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = dct
|
||||
yield '{'
|
||||
if _indent is not None:
|
||||
_current_indent_level += 1
|
||||
newline_indent = '\n' + (' ' * (_indent * _current_indent_level))
|
||||
item_separator = _item_separator + newline_indent
|
||||
yield newline_indent
|
||||
else:
|
||||
newline_indent = None
|
||||
item_separator = _item_separator
|
||||
first = True
|
||||
if _sort_keys:
|
||||
items = dct.items()
|
||||
items.sort(key=lambda kv: kv[0])
|
||||
else:
|
||||
items = dct.iteritems()
|
||||
for key, value in items:
|
||||
if isinstance(key, basestring):
|
||||
pass
|
||||
# JavaScript is weakly typed for these, so it makes sense to
|
||||
# also allow them. Many encoders seem to do something like this.
|
||||
elif isinstance(key, float):
|
||||
key = _floatstr(key)
|
||||
elif key is True:
|
||||
key = 'true'
|
||||
elif key is False:
|
||||
key = 'false'
|
||||
elif key is None:
|
||||
key = 'null'
|
||||
elif isinstance(key, (int, long)):
|
||||
key = str(key)
|
||||
elif _skipkeys:
|
||||
continue
|
||||
else:
|
||||
raise TypeError("key " + repr(key) + " is not a string")
|
||||
if first:
|
||||
first = False
|
||||
else:
|
||||
yield item_separator
|
||||
yield _encoder(key)
|
||||
yield _key_separator
|
||||
if isinstance(value, basestring):
|
||||
yield _encoder(value)
|
||||
elif value is None:
|
||||
yield 'null'
|
||||
elif value is True:
|
||||
yield 'true'
|
||||
elif value is False:
|
||||
yield 'false'
|
||||
elif isinstance(value, (int, long)):
|
||||
yield str(value)
|
||||
elif isinstance(value, float):
|
||||
yield _floatstr(value)
|
||||
else:
|
||||
if isinstance(value, (list, tuple)):
|
||||
chunks = _iterencode_list(value, _current_indent_level)
|
||||
elif isinstance(value, dict):
|
||||
chunks = _iterencode_dict(value, _current_indent_level)
|
||||
else:
|
||||
chunks = _iterencode(value, _current_indent_level)
|
||||
for chunk in chunks:
|
||||
yield chunk
|
||||
if newline_indent is not None:
|
||||
_current_indent_level -= 1
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
yield '}'
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
def _iterencode(o, _current_indent_level):
|
||||
if isinstance(o, basestring):
|
||||
yield _encoder(o)
|
||||
elif o is None:
|
||||
yield 'null'
|
||||
elif o is True:
|
||||
yield 'true'
|
||||
elif o is False:
|
||||
yield 'false'
|
||||
elif isinstance(o, (int, long)):
|
||||
yield str(o)
|
||||
elif isinstance(o, float):
|
||||
yield _floatstr(o)
|
||||
elif isinstance(o, (list, tuple)):
|
||||
for chunk in _iterencode_list(o, _current_indent_level):
|
||||
yield chunk
|
||||
elif isinstance(o, dict):
|
||||
for chunk in _iterencode_dict(o, _current_indent_level):
|
||||
yield chunk
|
||||
else:
|
||||
if markers is not None:
|
||||
markerid = id(o)
|
||||
if markerid in markers:
|
||||
raise ValueError("Circular reference detected")
|
||||
markers[markerid] = o
|
||||
o = _default(o)
|
||||
for chunk in _iterencode(o, _current_indent_level):
|
||||
yield chunk
|
||||
if markers is not None:
|
||||
del markers[markerid]
|
||||
|
||||
return _iterencode
|
||||
BIN
simplejson/encoder.pyc
Normal file
Binary file not shown.
65
simplejson/scanner.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""JSON token scanner
|
||||
"""
|
||||
import re
|
||||
try:
|
||||
from simplejson._speedups import make_scanner as c_make_scanner
|
||||
except ImportError:
|
||||
c_make_scanner = None
|
||||
|
||||
__all__ = ['make_scanner']
|
||||
|
||||
NUMBER_RE = re.compile(
|
||||
r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?',
|
||||
(re.VERBOSE | re.MULTILINE | re.DOTALL))
|
||||
|
||||
def py_make_scanner(context):
|
||||
parse_object = context.parse_object
|
||||
parse_array = context.parse_array
|
||||
parse_string = context.parse_string
|
||||
match_number = NUMBER_RE.match
|
||||
encoding = context.encoding
|
||||
strict = context.strict
|
||||
parse_float = context.parse_float
|
||||
parse_int = context.parse_int
|
||||
parse_constant = context.parse_constant
|
||||
object_hook = context.object_hook
|
||||
|
||||
def _scan_once(string, idx):
|
||||
try:
|
||||
nextchar = string[idx]
|
||||
except IndexError:
|
||||
raise StopIteration
|
||||
|
||||
if nextchar == '"':
|
||||
return parse_string(string, idx + 1, encoding, strict)
|
||||
elif nextchar == '{':
|
||||
return parse_object((string, idx + 1), encoding, strict, _scan_once, object_hook)
|
||||
elif nextchar == '[':
|
||||
return parse_array((string, idx + 1), _scan_once)
|
||||
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
|
||||
return None, idx + 4
|
||||
elif nextchar == 't' and string[idx:idx + 4] == 'true':
|
||||
return True, idx + 4
|
||||
elif nextchar == 'f' and string[idx:idx + 5] == 'false':
|
||||
return False, idx + 5
|
||||
|
||||
m = match_number(string, idx)
|
||||
if m is not None:
|
||||
integer, frac, exp = m.groups()
|
||||
if frac or exp:
|
||||
res = parse_float(integer + (frac or '') + (exp or ''))
|
||||
else:
|
||||
res = parse_int(integer)
|
||||
return res, m.end()
|
||||
elif nextchar == 'N' and string[idx:idx + 3] == 'NaN':
|
||||
return parse_constant('NaN'), idx + 3
|
||||
elif nextchar == 'I' and string[idx:idx + 8] == 'Infinity':
|
||||
return parse_constant('Infinity'), idx + 8
|
||||
elif nextchar == '-' and string[idx:idx + 9] == '-Infinity':
|
||||
return parse_constant('-Infinity'), idx + 9
|
||||
else:
|
||||
raise StopIteration
|
||||
|
||||
return _scan_once
|
||||
|
||||
make_scanner = c_make_scanner or py_make_scanner
|
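# For illustration: JSONDecoder wires itself up via make_scanner(self), so
#   from simplejson.decoder import JSONDecoder
#   JSONDecoder().scan_once('[1, 2]', 0)  ->  ([1, 2], 6)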
||||
BIN
simplejson/scanner.pyc
Normal file
Binary file not shown.
23
simplejson/tests/__init__.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import unittest
|
||||
import doctest
|
||||
|
||||
def additional_tests():
|
||||
import simplejson
|
||||
import simplejson.encoder
|
||||
import simplejson.decoder
|
||||
suite = unittest.TestSuite()
|
||||
for mod in (simplejson, simplejson.encoder, simplejson.decoder):
|
||||
suite.addTest(doctest.DocTestSuite(mod))
|
||||
suite.addTest(doctest.DocFileSuite('../../index.rst'))
|
||||
return suite
|
||||
|
||||
def main():
|
||||
suite = additional_tests()
|
||||
runner = unittest.TextTestRunner()
|
||||
runner.run(suite)
|
||||
|
||||
if __name__ == '__main__':
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
main()
|
||||
30
simplejson/tests/test_check_circular.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
from unittest import TestCase
|
||||
import simplejson as json
|
||||
|
||||
def default_iterable(obj):
|
||||
return list(obj)
|
||||
|
||||
class TestCheckCircular(TestCase):
|
||||
def test_circular_dict(self):
|
||||
dct = {}
|
||||
dct['a'] = dct
|
||||
self.assertRaises(ValueError, json.dumps, dct)
|
||||
|
||||
def test_circular_list(self):
|
||||
lst = []
|
||||
lst.append(lst)
|
||||
self.assertRaises(ValueError, json.dumps, lst)
|
||||
|
||||
def test_circular_composite(self):
|
||||
dct2 = {}
|
||||
dct2['a'] = []
|
||||
dct2['a'].append(dct2)
|
||||
self.assertRaises(ValueError, json.dumps, dct2)
|
||||
|
||||
def test_circular_default(self):
|
||||
json.dumps([set()], default=default_iterable)
|
||||
self.assertRaises(TypeError, json.dumps, [set()])
|
||||
|
||||
def test_circular_off_default(self):
|
||||
json.dumps([set()], default=default_iterable, check_circular=False)
|
||||
self.assertRaises(TypeError, json.dumps, [set()], check_circular=False)
|
||||
22
simplejson/tests/test_decode.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import decimal
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDecode(TestCase):
|
||||
def test_decimal(self):
|
||||
rval = json.loads('1.1', parse_float=decimal.Decimal)
|
||||
self.assert_(isinstance(rval, decimal.Decimal))
|
||||
self.assertEquals(rval, decimal.Decimal('1.1'))
|
||||
|
||||
def test_float(self):
|
||||
rval = json.loads('1', parse_int=float)
|
||||
self.assert_(isinstance(rval, float))
|
||||
self.assertEquals(rval, 1.0)
|
||||
|
||||
def test_decoder_optimizations(self):
|
||||
# Several optimizations were made that skip over calls to
|
||||
# the whitespace regex, so this test is designed to try and
|
||||
# exercise the uncommon cases. The array cases are already covered.
|
||||
rval = json.loads('{ "key" : "value" , "k":"v" }')
|
||||
self.assertEquals(rval, {"key":"value", "k":"v"})
|
||||
9
simplejson/tests/test_default.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDefault(TestCase):
|
||||
def test_default(self):
|
||||
self.assertEquals(
|
||||
json.dumps(type, default=repr),
|
||||
json.dumps(repr(type)))
|
||||
21
simplejson/tests/test_dump.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from unittest import TestCase
|
||||
from cStringIO import StringIO
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestDump(TestCase):
|
||||
def test_dump(self):
|
||||
sio = StringIO()
|
||||
json.dump({}, sio)
|
||||
self.assertEquals(sio.getvalue(), '{}')
|
||||
|
||||
def test_dumps(self):
|
||||
self.assertEquals(json.dumps({}), '{}')
|
||||
|
||||
def test_encode_truefalse(self):
|
||||
self.assertEquals(json.dumps(
|
||||
{True: False, False: True}, sort_keys=True),
|
||||
'{"false": true, "true": false}')
|
||||
self.assertEquals(json.dumps(
|
||||
{2: 3.0, 4.0: 5L, False: 1, 6L: True, "7": 0}, sort_keys=True),
|
||||
'{"false": 1, "2": 3.0, "4.0": 5, "6": true, "7": 0}')
|
||||
38
simplejson/tests/test_encode_basestring_ascii.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson.encoder
|
||||
|
||||
CASES = [
|
||||
(u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'),
|
||||
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
|
||||
(u'controls', '"controls"'),
|
||||
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
|
||||
(u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'),
|
||||
(u' s p a c e d ', '" s p a c e d "'),
|
||||
(u'\U0001d120', '"\\ud834\\udd20"'),
|
||||
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
|
||||
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
|
||||
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
|
||||
('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'),
|
||||
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
|
||||
(u'\u03b1\u03a9', '"\\u03b1\\u03a9"'),
|
||||
(u"`1~!@#$%^&*()_+-={':[,]}|;.</>?", '"`1~!@#$%^&*()_+-={\':[,]}|;.</>?"'),
|
||||
(u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'),
|
||||
(u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'),
|
||||
]
|
||||
|
||||
class TestEncodeBaseStringAscii(TestCase):
|
||||
def test_py_encode_basestring_ascii(self):
|
||||
self._test_encode_basestring_ascii(simplejson.encoder.py_encode_basestring_ascii)
|
||||
|
||||
def test_c_encode_basestring_ascii(self):
|
||||
if not simplejson.encoder.c_encode_basestring_ascii:
|
||||
return
|
||||
self._test_encode_basestring_ascii(simplejson.encoder.c_encode_basestring_ascii)
|
||||
|
||||
def _test_encode_basestring_ascii(self, encode_basestring_ascii):
|
||||
fname = encode_basestring_ascii.__name__
|
||||
for input_string, expect in CASES:
|
||||
result = encode_basestring_ascii(input_string)
|
||||
self.assertEquals(result, expect,
|
||||
'%r != %r for %s(%r)' % (result, expect, fname, input_string))
|
||||
76
simplejson/tests/test_fail.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
# Fri Dec 30 18:57:26 2005
|
||||
JSONDOCS = [
|
||||
# http://json.org/JSON_checker/test/fail1.json
|
||||
'"A JSON payload should be an object or array, not a string."',
|
||||
# http://json.org/JSON_checker/test/fail2.json
|
||||
'["Unclosed array"',
|
||||
# http://json.org/JSON_checker/test/fail3.json
|
||||
'{unquoted_key: "keys must be quoted}',
|
||||
# http://json.org/JSON_checker/test/fail4.json
|
||||
'["extra comma",]',
|
||||
# http://json.org/JSON_checker/test/fail5.json
|
||||
'["double extra comma",,]',
|
||||
# http://json.org/JSON_checker/test/fail6.json
|
||||
'[ , "<-- missing value"]',
|
||||
# http://json.org/JSON_checker/test/fail7.json
|
||||
'["Comma after the close"],',
|
||||
# http://json.org/JSON_checker/test/fail8.json
|
||||
'["Extra close"]]',
|
||||
# http://json.org/JSON_checker/test/fail9.json
|
||||
'{"Extra comma": true,}',
|
||||
# http://json.org/JSON_checker/test/fail10.json
|
||||
'{"Extra value after close": true} "misplaced quoted value"',
|
||||
# http://json.org/JSON_checker/test/fail11.json
|
||||
'{"Illegal expression": 1 + 2}',
|
||||
# http://json.org/JSON_checker/test/fail12.json
|
||||
'{"Illegal invocation": alert()}',
|
||||
# http://json.org/JSON_checker/test/fail13.json
|
||||
'{"Numbers cannot have leading zeroes": 013}',
|
||||
# http://json.org/JSON_checker/test/fail14.json
|
||||
'{"Numbers cannot be hex": 0x14}',
|
||||
# http://json.org/JSON_checker/test/fail15.json
|
||||
'["Illegal backslash escape: \\x15"]',
|
||||
# http://json.org/JSON_checker/test/fail16.json
|
||||
'["Illegal backslash escape: \\\'"]',
|
||||
# http://json.org/JSON_checker/test/fail17.json
|
||||
'["Illegal backslash escape: \\017"]',
|
||||
# http://json.org/JSON_checker/test/fail18.json
|
||||
'[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]',
|
||||
# http://json.org/JSON_checker/test/fail19.json
|
||||
'{"Missing colon" null}',
|
||||
# http://json.org/JSON_checker/test/fail20.json
|
||||
'{"Double colon":: null}',
|
||||
# http://json.org/JSON_checker/test/fail21.json
|
||||
'{"Comma instead of colon", null}',
|
||||
# http://json.org/JSON_checker/test/fail22.json
|
||||
'["Colon instead of comma": false]',
|
||||
# http://json.org/JSON_checker/test/fail23.json
|
||||
'["Bad value", truth]',
|
||||
# http://json.org/JSON_checker/test/fail24.json
|
||||
"['single quote']",
|
||||
# http://code.google.com/p/simplejson/issues/detail?id=3
|
||||
u'["A\u001FZ control characters in string"]',
|
||||
]
|
||||
|
||||
SKIPS = {
|
||||
1: "why not have a string payload?",
|
||||
18: "spec doesn't specify any nesting limitations",
|
||||
}
|
||||
|
||||
class TestFail(TestCase):
|
||||
def test_failures(self):
|
||||
for idx, doc in enumerate(JSONDOCS):
|
||||
idx = idx + 1
|
||||
if idx in SKIPS:
|
||||
json.loads(doc)
|
||||
continue
|
||||
try:
|
||||
json.loads(doc)
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
self.fail("Expected failure for fail%d.json: %r" % (idx, doc))
|
||||
15
simplejson/tests/test_float.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import math
|
||||
from unittest import TestCase
|
||||
|
||||
import simplejson as json
|
||||
|
||||
class TestFloat(TestCase):
|
||||
def test_floats(self):
|
||||
for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100, 3.1]:
|
||||
self.assertEquals(float(json.dumps(num)), num)
|
||||
self.assertEquals(json.loads(json.dumps(num)), num)
|
||||
|
||||
def test_ints(self):
|
||||
for num in [1, 1L, 1<<32, 1<<64]:
|
||||
self.assertEquals(json.dumps(num), str(num))
|
||||
self.assertEquals(int(json.dumps(num)), num)
|
||||
41
simplejson/tests/test_indent.py
Normal file
@ -0,0 +1,41 @@
from unittest import TestCase

import simplejson as json
import textwrap

class TestIndent(TestCase):
    def test_indent(self):
        h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
             {'nifty': 87}, {'field': 'yes', 'morefield': False}]

        expect = textwrap.dedent("""\
        [
          [
            "blorpie"
          ],
          [
            "whoops"
          ],
          [],
          "d-shtaeou",
          "d-nthiouh",
          "i-vhbjkhnth",
          {
            "nifty": 87
          },
          {
            "field": "yes",
            "morefield": false
          }
        ]""")


        d1 = json.dumps(h)
        d2 = json.dumps(h, indent=2, sort_keys=True, separators=(',', ': '))

        h1 = json.loads(d1)
        h2 = json.loads(d2)

        self.assertEquals(h1, h)
        self.assertEquals(h2, h)
        self.assertEquals(d2, expect)
76
simplejson/tests/test_pass1.py
Normal file
@ -0,0 +1,76 @@
from unittest import TestCase

import simplejson as json

# from http://json.org/JSON_checker/test/pass1.json
JSON = r'''
[
    "JSON Test Pattern pass1",
    {"object with 1 member":["array with 1 element"]},
    {},
    [],
    -42,
    true,
    false,
    null,
    {
        "integer": 1234567890,
        "real": -9876.543210,
        "e": 0.123456789e-12,
        "E": 1.234567890E+34,
        "": 23456789012E666,
        "zero": 0,
        "one": 1,
        "space": " ",
        "quote": "\"",
        "backslash": "\\",
        "controls": "\b\f\n\r\t",
        "slash": "/ & \/",
        "alpha": "abcdefghijklmnopqrstuvwyz",
        "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ",
        "digit": "0123456789",
        "special": "`1~!@#$%^&*()_+-={':[,]}|;.</>?",
        "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A",
        "true": true,
        "false": false,
        "null": null,
        "array":[ ],
        "object":{ },
        "address": "50 St. James Street",
        "url": "http://www.JSON.org/",
        "comment": "// /* <!-- --",
        "# -- --> */": " ",
        " s p a c e d " :[1,2 , 3

,

4 , 5 , 6 ,7 ],
        "compact": [1,2,3,4,5,6,7],
        "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}",
        "quotes": "&#34; \u0022 %22 0x22 034 &#34;",
        "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?"
: "A key can be any string"
    },
    0.5 ,98.6
,
99.44
,

1066


,"rosebud"]
'''

class TestPass1(TestCase):
    def test_parse(self):
        # test in/out equivalence and parsing
        res = json.loads(JSON)
        out = json.dumps(res)
        self.assertEquals(res, json.loads(out))
        try:
            json.dumps(res, allow_nan=False)
        except ValueError:
            pass
        else:
            self.fail("23456789012E666 should be out of range")
14
simplejson/tests/test_pass2.py
Normal file
@ -0,0 +1,14 @@
from unittest import TestCase
import simplejson as json

# from http://json.org/JSON_checker/test/pass2.json
JSON = r'''
[[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]]
'''

class TestPass2(TestCase):
    def test_parse(self):
        # test in/out equivalence and parsing
        res = json.loads(JSON)
        out = json.dumps(res)
        self.assertEquals(res, json.loads(out))
20
simplejson/tests/test_pass3.py
Normal file
@ -0,0 +1,20 @@
from unittest import TestCase

import simplejson as json

# from http://json.org/JSON_checker/test/pass3.json
JSON = r'''
{
    "JSON Test Pattern pass3": {
        "The outermost value": "must be an object or array.",
        "In this test": "It is an object."
    }
}
'''

class TestPass3(TestCase):
    def test_parse(self):
        # test in/out equivalence and parsing
        res = json.loads(JSON)
        out = json.dumps(res)
        self.assertEquals(res, json.loads(out))
67
simplejson/tests/test_recursion.py
Normal file
@ -0,0 +1,67 @@
from unittest import TestCase

import simplejson as json

class JSONTestObject:
    pass


class RecursiveJSONEncoder(json.JSONEncoder):
    recurse = False
    def default(self, o):
        if o is JSONTestObject:
            if self.recurse:
                return [JSONTestObject]
            else:
                return 'JSONTestObject'
        return json.JSONEncoder.default(self, o)


class TestRecursion(TestCase):
    def test_listrecursion(self):
        x = []
        x.append(x)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on list recursion")
        x = []
        y = [x]
        x.append(y)
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on alternating list recursion")
        y = []
        x = [y, y]
        # ensure that the marker is cleared
        json.dumps(x)

    def test_dictrecursion(self):
        x = {}
        x["test"] = x
        try:
            json.dumps(x)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on dict recursion")
        x = {}
        y = {"a": x, "b": x}
        # ensure that the marker is cleared
        json.dumps(x)

    def test_defaultrecursion(self):
        enc = RecursiveJSONEncoder()
        self.assertEquals(enc.encode(JSONTestObject), '"JSONTestObject"')
        enc.recurse = True
        try:
            enc.encode(JSONTestObject)
        except ValueError:
            pass
        else:
            self.fail("didn't raise ValueError on default recursion")
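The default() hook exercised by RecursiveJSONEncoder above is the standard extension point for values simplejson cannot serialize on its own; a minimal sketch using the same API (the Point class is hypothetical, purely for illustration):

    import simplejson as json

    class Point(object):
        def __init__(self, x, y):
            self.x, self.y = x, y

    class PointEncoder(json.JSONEncoder):
        def default(self, o):
            if isinstance(o, Point):
                # return something simplejson already knows how to encode
                return {'x': o.x, 'y': o.y}
            return json.JSONEncoder.default(self, o)

    print PointEncoder(sort_keys=True).encode(Point(1, 2))   # {"x": 1, "y": 2}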
111
simplejson/tests/test_scanstring.py
Normal file
@ -0,0 +1,111 @@
import sys
import decimal
from unittest import TestCase

import simplejson as json
import simplejson.decoder

class TestScanString(TestCase):
    def test_py_scanstring(self):
        self._test_scanstring(simplejson.decoder.py_scanstring)

    def test_c_scanstring(self):
        if not simplejson.decoder.c_scanstring:
            return
        self._test_scanstring(simplejson.decoder.c_scanstring)

    def _test_scanstring(self, scanstring):
        self.assertEquals(
            scanstring('"z\\ud834\\udd20x"', 1, None, True),
            (u'z\U0001d120x', 16))

        if sys.maxunicode == 65535:
            self.assertEquals(
                scanstring(u'"z\U0001d120x"', 1, None, True),
                (u'z\U0001d120x', 6))
        else:
            self.assertEquals(
                scanstring(u'"z\U0001d120x"', 1, None, True),
                (u'z\U0001d120x', 5))

        self.assertEquals(
            scanstring('"\\u007b"', 1, None, True),
            (u'{', 8))

        self.assertEquals(
            scanstring('"A JSON payload should be an object or array, not a string."', 1, None, True),
            (u'A JSON payload should be an object or array, not a string.', 60))

        self.assertEquals(
            scanstring('["Unclosed array"', 2, None, True),
            (u'Unclosed array', 17))

        self.assertEquals(
            scanstring('["extra comma",]', 2, None, True),
            (u'extra comma', 14))

        self.assertEquals(
            scanstring('["double extra comma",,]', 2, None, True),
            (u'double extra comma', 21))

        self.assertEquals(
            scanstring('["Comma after the close"],', 2, None, True),
            (u'Comma after the close', 24))

        self.assertEquals(
            scanstring('["Extra close"]]', 2, None, True),
            (u'Extra close', 14))

        self.assertEquals(
            scanstring('{"Extra comma": true,}', 2, None, True),
            (u'Extra comma', 14))

        self.assertEquals(
            scanstring('{"Extra value after close": true} "misplaced quoted value"', 2, None, True),
            (u'Extra value after close', 26))

        self.assertEquals(
            scanstring('{"Illegal expression": 1 + 2}', 2, None, True),
            (u'Illegal expression', 21))

        self.assertEquals(
            scanstring('{"Illegal invocation": alert()}', 2, None, True),
            (u'Illegal invocation', 21))

        self.assertEquals(
            scanstring('{"Numbers cannot have leading zeroes": 013}', 2, None, True),
            (u'Numbers cannot have leading zeroes', 37))

        self.assertEquals(
            scanstring('{"Numbers cannot be hex": 0x14}', 2, None, True),
            (u'Numbers cannot be hex', 24))

        self.assertEquals(
            scanstring('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21, None, True),
            (u'Too deep', 30))

        self.assertEquals(
            scanstring('{"Missing colon" null}', 2, None, True),
            (u'Missing colon', 16))

        self.assertEquals(
            scanstring('{"Double colon":: null}', 2, None, True),
            (u'Double colon', 15))

        self.assertEquals(
            scanstring('{"Comma instead of colon", null}', 2, None, True),
            (u'Comma instead of colon', 25))

        self.assertEquals(
            scanstring('["Colon instead of comma": false]', 2, None, True),
            (u'Colon instead of comma', 25))

        self.assertEquals(
            scanstring('["Bad value", truth]', 2, None, True),
            (u'Bad value', 12))

    def test_issue3623(self):
        self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
                          "xxx")
        self.assertRaises(UnicodeDecodeError,
                          json.encoder.encode_basestring_ascii, "xx\xff")
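As the assertions above show, scanstring(s, end, encoding, strict) starts scanning just after an opening quote and returns a (decoded_string, end_index) pair, where end_index points past the closing quote; a small sketch calling the pure-Python implementation directly (same simplejson build as the tests):

    import simplejson.decoder

    s = '"hello" ...'
    # index 1 is the character just after the opening quote
    print simplejson.decoder.py_scanstring(s, 1, None, True)   # (u'hello', 7)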
42
simplejson/tests/test_separators.py
Normal file
@ -0,0 +1,42 @@
import textwrap
from unittest import TestCase

import simplejson as json


class TestSeparators(TestCase):
    def test_separators(self):
        h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth',
             {'nifty': 87}, {'field': 'yes', 'morefield': False}]

        expect = textwrap.dedent("""\
        [
          [
            "blorpie"
          ] ,
          [
            "whoops"
          ] ,
          [] ,
          "d-shtaeou" ,
          "d-nthiouh" ,
          "i-vhbjkhnth" ,
          {
            "nifty" : 87
          } ,
          {
            "field" : "yes" ,
            "morefield" : false
          }
        ]""")


        d1 = json.dumps(h)
        d2 = json.dumps(h, indent=2, sort_keys=True, separators=(' ,', ' : '))

        h1 = json.loads(d1)
        h2 = json.loads(d2)

        self.assertEquals(h1, h)
        self.assertEquals(h2, h)
        self.assertEquals(d2, expect)
64
simplejson/tests/test_unicode.py
Normal file
@ -0,0 +1,64 @@
from unittest import TestCase

import simplejson as json

class TestUnicode(TestCase):
    def test_encoding1(self):
        encoder = json.JSONEncoder(encoding='utf-8')
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        s = u.encode('utf-8')
        ju = encoder.encode(u)
        js = encoder.encode(s)
        self.assertEquals(ju, js)

    def test_encoding2(self):
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        s = u.encode('utf-8')
        ju = json.dumps(u, encoding='utf-8')
        js = json.dumps(s, encoding='utf-8')
        self.assertEquals(ju, js)

    def test_encoding3(self):
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = json.dumps(u)
        self.assertEquals(j, '"\\u03b1\\u03a9"')

    def test_encoding4(self):
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = json.dumps([u])
        self.assertEquals(j, '["\\u03b1\\u03a9"]')

    def test_encoding5(self):
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = json.dumps(u, ensure_ascii=False)
        self.assertEquals(j, u'"%s"' % (u,))

    def test_encoding6(self):
        u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
        j = json.dumps([u], ensure_ascii=False)
        self.assertEquals(j, u'["%s"]' % (u,))

    def test_big_unicode_encode(self):
        u = u'\U0001d120'
        self.assertEquals(json.dumps(u), '"\\ud834\\udd20"')
        self.assertEquals(json.dumps(u, ensure_ascii=False), u'"\U0001d120"')

    def test_big_unicode_decode(self):
        u = u'z\U0001d120x'
        self.assertEquals(json.loads('"' + u + '"'), u)
        self.assertEquals(json.loads('"z\\ud834\\udd20x"'), u)

    def test_unicode_decode(self):
        for i in range(0, 0xd7ff):
            u = unichr(i)
            s = '"\\u%04x"' % (i,)
            self.assertEquals(json.loads(s), u)

    def test_default_encoding(self):
        self.assertEquals(json.loads(u'{"a": "\xe9"}'.encode('utf-8')),
                          {'a': u'\xe9'})

    def test_unicode_preservation(self):
        self.assertEquals(type(json.loads(u'""')), unicode)
        self.assertEquals(type(json.loads(u'"a"')), unicode)
        self.assertEquals(type(json.loads(u'["a"]')[0]), unicode)
37
simplejson/tool.py
Normal file
@ -0,0 +1,37 @@
r"""Command-line tool to validate and pretty-print JSON

Usage::

    $ echo '{"json":"obj"}' | python -m simplejson.tool
    {
        "json": "obj"
    }
    $ echo '{ 1.2:3.4}' | python -m simplejson.tool
    Expecting property name: line 1 column 2 (char 2)

"""
import sys
import simplejson

def main():
    if len(sys.argv) == 1:
        infile = sys.stdin
        outfile = sys.stdout
    elif len(sys.argv) == 2:
        infile = open(sys.argv[1], 'rb')
        outfile = sys.stdout
    elif len(sys.argv) == 3:
        infile = open(sys.argv[1], 'rb')
        outfile = open(sys.argv[2], 'wb')
    else:
        raise SystemExit(sys.argv[0] + " [infile [outfile]]")
    try:
        obj = simplejson.load(infile)
    except ValueError, e:
        raise SystemExit(e)
    simplejson.dump(obj, outfile, sort_keys=True, indent=4)
    outfile.write('\n')


if __name__ == '__main__':
    main()
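The same validate-and-pretty-print round trip can be done from code rather than the shell; a minimal sketch using only calls already present in tool.py (the file name is illustrative):

    import sys
    import simplejson

    obj = simplejson.load(open('example.json', 'rb'))   # raises ValueError on bad JSON
    simplejson.dump(obj, sys.stdout, sort_keys=True, indent=4)
    sys.stdout.write('\n')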
BIN
static/ajax-loader.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 11 KiB
BIN
static/favicon.ico
Normal file
Binary file not shown.
After Width: | Height: | Size: 21 KiB
79
status.html
Normal file
@ -0,0 +1,79 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html>
<head>
<link href="/css/index.css" rel="stylesheet" type="text/css">
<title>{% if fic.completed %} Finished {% else %} {% if fic.failure %} Failed {% else %} Working... {% endif %} {% endif %} - Fanfiction Downloader</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="google-site-verification" content="kCFc-G4bka_pJN6Rv8CapPBcwmq0hbAUZPkKWqRsAYU" />
{% if not fic.completed and not fic.failure %}
<meta http-equiv="refresh" content="7">
{% endif %}
</head>
<body>
<div id='main'>
  <h1>
    <a href="/" style="text-decoration: none; color: black;">FanFiction Downloader</a>
  </h1>
  <div style="text-align: center">
    <script type="text/javascript"><!--
      google_ad_client = "ca-pub-0320924304307555";
      /* Standard */
      google_ad_slot = "8974025478";
      google_ad_width = 468;
      google_ad_height = 60;
      //-->
    </script>
    <script type="text/javascript"
            src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
    </script>
  </div>

  <div id='urlbox'>
    <div id='greeting'>
      <p><a href='{{ fic.url }}'>{{ fic.url }}</a></p>
    </div>
    <div>
      {% if fic.completed %}
      <p>Your fic has finished processing and you can download it now:</p>
      <p><a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
         by {{ fic.author }} ({{ fic.format }})</p>
      {% if escaped_url %}
      <p><a href="http://www.convertfiles.com/index.php?url={{ escaped_url }}">Convert {{ fic.title }} to other formats</a></p>
      {% endif %}
      {% else %}
      {% if fic.failure %}
      Your fic failed to process. Please check the URL and the error message below.<br />
      <div id='error'>
        {{ fic.failure }}
      </div>
      {% else %}
      <p>Not done yet. This page will refresh periodically to check whether your story has finished.</p>
      {% endif %}
      {% endif %}
      <p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
    </div>
  </div>
  <div style='text-align: center'>
    <img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"
         alt="Powered by Google App Engine" />
    <br/><br/>
    FanfictionLoader is a web front-end to <a href="http://code.google.com/p/fanficdownloader/">fanficdownloader</a><br/>
    Copyright © <a href="http://twitter.com/sigizmund">Roman Kirillov</a>
  </div>

  <div style="margin-top: 1em; text-align: center">
    <script type="text/javascript"><!--
      google_ad_client = "ca-pub-0320924304307555";
      /* Standard */
      google_ad_slot = "8974025478";
      google_ad_width = 468;
      google_ad_height = 60;
      //-->
    </script>
    <script type="text/javascript"
            src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
    </script>
  </div>
</div>
</body>
</html>
52
utils/remover.py
Normal file
@ -0,0 +1,52 @@
#!/usr/bin/env python
# encoding: utf-8
"""
remover.py

Created by Roman on 2010-06-20.
Copyright (c) 2010 __MyCompanyName__. All rights reserved.
"""

import datetime
import logging

from google.appengine.ext.webapp import util
from google.appengine.ext import webapp
from google.appengine.api import users

from ffstorage import *

class Remover(webapp.RequestHandler):
    def get(self):
        logging.debug("Starting r3m0v3r")
        user = users.get_current_user()
        logging.debug("Working as user %s" % user)
        theDate = datetime.date.today() - datetime.timedelta(days=5)
        logging.debug("Will delete stuff older than %s" % theDate)

        fics = DownloadMeta.all()
        fics.filter("date <", theDate).order("date")
        results = fics.fetch(100)
        logging.debug([x.name for x in results])

        num = 0
        for d in results:
            d.delete()
            for c in d.data_chunks:
                c.delete()
            num = num + 1
            logging.debug('Deleted ' + d.url)

        logging.info('Deleted instances: %d' % num)
        self.response.out.write('Deleted instances: %d' % num)


def main():
    application = webapp.WSGIApplication([('/r3m0v3r', Remover)],
                                         debug=False)
    util.run_wsgi_app(application)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.DEBUG)
    main()
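For reference, the filter/order/fetch chain in Remover.get() can also be written as a single GQL query; a sketch, assuming the DownloadMeta model from ffstorage exposes the date property the filter implies:

    import datetime
    from google.appengine.ext import db

    the_date = datetime.date.today() - datetime.timedelta(days=5)
    # same 100-entity batch as Remover.get(), expressed in GQL
    results = db.GqlQuery(
        "SELECT * FROM DownloadMeta WHERE date < :1 ORDER BY date",
        the_date).fetch(100)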