Whole lot of fixes related to appengine

This commit is contained in:
sigizmund 2009-12-18 14:51:53 +00:00
commit 1f897843e0
12 changed files with 2851 additions and 0 deletions

1711
BeautifulSoup.py Normal file

File diff suppressed because it is too large Load diff

0
__init__.py Normal file
View file

32
adapter.py Normal file
View file

@ -0,0 +1,32 @@
class FanfictionSiteAdapter:
    """Interface implemented by every site-specific adapter.

    Apart from the two credential setters, the methods here do nothing and
    return None; concrete adapters override them.
    """

    # Credentials read by subclasses when logging in; empty until set.
    login = ''
    password = ''

    def __init__(self, url):
        """Create an adapter for the story at `url` (no-op in the base class)."""
        pass

    def requiresLogin(self, url = None):
        """Tell whether the story needs an authenticated session (no-op here)."""
        pass

    def performLogin(self, url = None):
        """Log in with the stored credentials (no-op here)."""
        pass

    def extractIndividualUrls(self):
        """Return the chapters of the story (no-op here)."""
        pass

    def getText(self, url):
        """Return the text of the chapter at `url` (no-op here)."""
        pass

    def setLogin(self, login):
        """Remember the user name on this instance.

        Previously the base setter discarded the value, so an adapter that did
        not override it silently kept the empty class-level default.
        """
        self.login = login

    def setPassword(self, password):
        """Remember the password on this instance (see setLogin)."""
        self.password = password

    def getStoryName(self):
        """Return the story title (no-op here)."""
        pass

    def getAuthorName(self):
        """Return the author name (no-op here)."""
        pass

    def getPrintableUrl(self, url):
        """Return a printable variant of `url` (no-op here)."""
        pass

135
constants.py Normal file
View file

@ -0,0 +1,135 @@
# Stylesheet text; EPubFanficWriter stores it in the archive as
# OEBPS/stylesheet.css, which the XHTML chapter template links to.
CSS = '''body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; }
pre { font-size: x-small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
text-align:center;
margin-top:0px;
margin-bottom:0px;
padding:0px;
}
.center {text-align: center;}
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''
# Content of the archive's `mimetype` entry.
MIMETYPE = '''application/epub+zip'''
# Content of META-INF/container.xml; it names OEBPS/content.opf as the root file.
CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''
# Pieces of OEBPS/content.opf, written in this order by EPubFanficWriter.finalise():
# CONTENT_START, one CONTENT_ITEM per chapter, CONTENT_END_MANIFEST,
# one CONTENT_ITEMREF per chapter, CONTENT_END.
#
# CONTENT_START takes three %s values: (unique id, story title, author name).
# The language tag is "en-GB": "en-UK" is not a registered language tag
# (the region subtag for the United Kingdom is GB).
# NOTE(review): `unique-identifier` normally names the id of a dc:identifier
# element ("BookId" below), which the formatted value does not match. It is left
# alone because changing it would alter the %s arguments callers pass -- confirm.
CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookId-Epub-%s">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>en-GB</dc:language>
<dc:rights></dc:rights>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
# Manifest entry for one chapter: (item id, file name).
CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
# Closes the manifest and opens the spine.
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''
# Spine entry for one chapter: the id used in the matching CONTENT_ITEM.
CONTENT_ITEMREF = '''<itemref idref="%s" />'''
# Closes the spine and the package.
CONTENT_END = '''</spine>
</package>
'''
# Pieces of OEBPS/toc.ncx, written by EPubFanficWriter.finalise() as
# TOC_START, one TOC_ITEM per chapter, TOC_END.
# TOC_START takes one %s value: the story title.
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
'''
# One navigation entry: (navPoint id, play order as an integer, label text, chapter file name).
TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
<navLabel>
<text>%s</text>
</navLabel>
<content src="%s"/>
</navPoint>
'''
# Closes the navigation map and the NCX document.
TOC_END = '''</navMap>
</ncx>
'''
# Head of one chapter file; takes two %s values. EPubFanficWriter.writeChapter()
# passes the chapter title for both (page title and <h3> heading).
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>%s</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''
# Tail of one chapter file; closes the elements opened by XHTML_START.
XHTML_END = '''</div>
</body>
</html>
'''
# List of HTML element names. The writer code shown in this commit does not
# reference it -- presumably meant for tag filtering; confirm before relying on it.
acceptable_elements = (
    'a abbr acronym address area b big '
    'blockquote br center cite code col '
    'colgroup dd del dfn dir div dl dt em '
    'font h1 h2 h3 h4 h5 h6 i '
    'ins kbd label li ol '
    'p pre q s samp small span strike '
    'strong sub sup u ul'
).split()

# Attributes that EPubFanficWriter.writeChapter() keeps; every other attribute is deleted.
acceptable_attributes = ['href']

# Named entities and the plain text EPubFanficWriter._removeEntities() substitutes for them.
entities = {
    '&ndash;': ' - ',
    '&mdash;': ' - ',
    '&rdquo;': '"',
    '&ldquo;': '"',
    '&rsquo;': "'",
    '&lsquo;': "'",
    '&quot;': '"',
}
# FictionBook (FB2) fragments. None of the writers visible in this commit
# reference them -- presumably reserved for an FB2 writer; confirm.
# Opening tag of the document.
FB2_PROLOGUE = '<FictionBook>'
# Description block; takes five %s values in this order:
# author last name, book title, date `value` attribute, date text, id suffix.
FB2_DESCRIPTION = '''<description>
<title-info>
<genre>fanfiction</genre>
<author>
<first-name></first-name>
<middle-name></middle-name>
<last-name>%s</last-name>
</author>
<book-title>%s</book-title>
<lang>eng</lang>
</title-info>
<document-info>
<author>
<nickname>sgzmd</nickname>
</author>
<date value="%s">%s</date>
<id>sgzmd_%s</id>
<version>2.0</version>
</document-info>
</description>'''

103
downaloder.py Normal file
View file

@ -0,0 +1,103 @@
import os
import re
import sys
import shutil
import os.path
import getpass
import logging
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import ffa
import ffnet
import ficwad
import output
import fictionalley
class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers'''

    # Base path handed to the writer class as its first argument.
    booksDirectory = "books"

    def __init__(self, adapter, writerClass, quiet = False, inmemory = False, compress=True):
        """Store the collaborators and options.

        adapter     -- site adapter providing login, chapter list and chapter text
        writerClass -- class instantiated in download() to produce the output
        quiet       -- when true, no progress lines are printed
        inmemory    -- forwarded to the writer; also makes download() return the data
        compress    -- forwarded to the writer
        """
        self.adapter = adapter
        self.writerClass = writerClass
        self.quiet = quiet
        self.inmemory = inmemory
        self.compress = compress
        # Set to True by download() when the adapter reports a failed login.
        self.badLogin = False

    def download(self):
        """Fetch every chapter and feed it to a freshly created writer.

        Returns the writer's output buffer contents when `inmemory` is set,
        otherwise None. Also returns None (with `badLogin` set) when the story
        requires a login and the login attempt fails.
        """
        logging.debug("Trying to download the story")
        if self.adapter.requiresLogin():
            logging.debug("Story requires login")
            if not self.adapter.performLogin():
                logging.debug("Login/password problem")
                self.badLogin = True
                return None

        urls = self.adapter.extractIndividualUrls()
        self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)

        total = len(urls)
        for index, (chapterUrl, chapterTitle) in enumerate(urls):
            if not self.quiet:
                # Progress is 1-based: the first chapter is reported as 1/total.
                print('Downloading chapter %d/%d' % (index + 1, total))
            text = self.adapter.getText(chapterUrl)
            self.writer.writeChapter(chapterTitle, text)

        self.writer.finalise()

        if self.inmemory:
            self.name = self.writer.name
            return self.writer.output.getvalue()
        return None
if __name__ == '__main__':
    # Command-line entry point: <script> <story-url> <epub|html>
    logging.basicConfig(level=logging.DEBUG)

    if len(sys.argv) != 3:
        sys.stderr.write("Usage: %s <story-url> <epub|html>\n" % sys.argv[0])
        sys.exit(1)
    (url, format) = sys.argv[1:]

    if isinstance(url, unicode):
        print('URL is unicode')
        url = url.encode('latin1')

    adapter = None
    writerClass = None

    # Pick the adapter from a substring of the URL.
    if url.find('fanficauthors') != -1:
        adapter = ffa.FFA(url)
    elif url.find('fictionalley') != -1:
        # The adapter is known to be broken, so report that and stop without
        # constructing it (its constructor does not accept a URL).
        sys.stderr.write("FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors\n")
        sys.exit(0)
    elif url.find('ficwad') != -1:
        adapter = ficwad.FicWad(url)
    elif url.find('fanfiction.net') != -1:
        adapter = ffnet.FFNet(url)
    else:
        sys.stderr.write("Oi! I can haz not appropriate adapter for URL %s!\n" % url)
        sys.exit(1)

    if format == 'epub':
        writerClass = output.EPubFanficWriter
    elif format == 'html':
        writerClass = output.HTMLWriter
    else:
        # Without this the loader would later fail trying to call None.
        sys.stderr.write("Unknown output format `%s`, expected `epub` or `html`\n" % format)
        sys.exit(1)

    if adapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')
        # The password was read without echo; never print it back.
        print("Login: `%s`" % login)
        adapter.setLogin(login)
        adapter.setPassword(password)

    loader = FanficLoader(adapter, writerClass)
    loader.download()

197
ffa.py Normal file
View file

@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from adapter import *
try:
    import login_password
except ImportError:
    # Optional module providing `login` and `password` for the unit tests below.
    # Bind the name anyway so `login_password != None` checks do not raise NameError.
    login_password = None
class FFA(FanfictionSiteAdapter):
    """Adapter for stories served from *.fanficauthors.net hosts."""

    def __init__(self, url):
        """Remember the story URL, split it into host/path and build a cookie-aware opener."""
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        # Cookies are kept so that a successful login carries over to later requests.
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        logging.debug("Created FFA: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the path the login form is posted to (the story path itself)."""
        return self.path

    def requiresLogin(self, url = None):
        """Return True when the page at `url` (default: the story URL) shows the login form."""
        if url is None:
            url = self.url
        data = self.opener.open(url).read()
        return data.find('<legend>Please login to continue</legend>') != -1

    def performLogin(self, url = None):
        """Post the stored credentials; return True when the login form is gone afterwards."""
        if url is None:
            url = self.url

        data = {}
        data['username'] = self.login
        data['password'] = self.password
        data['submit'] = 'Submit'
        urlvals = u.urlencode(data)

        loginUrl = 'http://' + self.host + self._getLoginScript()
        logging.debug("Will now login to URL %s" % loginUrl)
        self.opener.open(loginUrl, urlvals)

        return not self.requiresLogin(url)

    def extractIndividualUrls(self):
        """Parse the story page; set author and story name, return [(chapter url, title), ...]."""
        data = self.opener.open(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        self.author = soup.find('a', {'href' : '/contact/'}).string
        self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
        logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))

        # The chapter drop-down lists one <option> per chapter.
        selector = soup.find('select', {'class' : 'tinput'})
        urls = []
        for o in selector.findAll('option'):
            urls.append((o['value'], o.string))
        return urls

    def getText(self, url):
        """Download a chapter and return the lines between the navigation forms.

        Emission starts after the first line containing '</div></form>' and
        stops at the next line containing '<form action="#">'.
        """
        if url.find('http://') == -1:
            # Chapter links are site-relative and may start with '/'; strip it so
            # the result is http://host/path rather than http://host//path.
            url = 'http://' + self.host + '/' + url.lstrip('/')
        logging.info('Downloading: %s' % url)
        data = self.opener.open(url).read()

        emit = False
        kept = []
        for line in data.split('\n'):
            if line.find('</div></form>') != -1:
                logging.debug('emit = True')
                emit = True
                continue
            elif line.find('<form action="#">') != -1:
                logging.debug('emit = False')
                if emit:
                    break
                else:
                    emit = False
            if emit:
                kept.append(line + '\n')
        return ''.join(kept)

    def setLogin(self, login):
        """Store the user name used by performLogin()."""
        self.login = login

    def setPassword(self, password):
        """Store the password used by performLogin()."""
        self.password = password

    def getStoryName(self):
        """Return the title found by extractIndividualUrls()."""
        return self.storyName

    def getAuthorName(self):
        """Return the author found by extractIndividualUrls()."""
        return self.author

    def getPrintableUrl(self, url):
        """Return `url` unchanged."""
        return url
class FFA_UnitTests(unittest.TestCase):
    # Live tests: every case fetches real pages from fanficauthors.net.

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def _applyCredentials(self, adapter):
        # Copy the optional test credentials onto `adapter`.
        # Returns False when no `login_password` module could be imported; the
        # lookup goes through globals() so a missing name cannot raise NameError.
        creds = globals().get('login_password')
        if creds is None:
            logging.warning('login_password module not available, skipping login-dependent check')
            return False
        adapter.setLogin(creds.login)
        adapter.setPassword(creds.password)
        return True

    def testRequiresLoginNeg(self):
        f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        self.assertFalse(f.requiresLogin())

    def testRequiresLogin(self):
        f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
        self.assertTrue(f.requiresLogin())

    def testPerformLogin(self):
        f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
        if not self._applyCredentials(f):
            # Nothing meaningful to assert without credentials.
            return
        self.assertTrue(f.performLogin(None))

    def testExtractURLsAuthorStoryName(self):
        f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        f.extractIndividualUrls()
        self.assertEquals('Draco664', f.getAuthorName())
        self.assertEquals('Apprentice Potter', f.getStoryName())

    def testExtractUrls(self):
        f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
        urls = f.extractIndividualUrls()
        self.assertEquals(25, len(urls))
        self.assertEquals('Grievances', urls[2][1])
        self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])

    def testGetText(self):
        f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
        self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)

    def testGetTextLogin(self):
        url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
        f = FFA(url)
        haveCredentials = self._applyCredentials(f)
        if f.requiresLogin():
            if not haveCredentials:
                return
            f.performLogin()
        data = f.getText(url)
        seek = 'So Hokage-sama” I said, “this is how we came'
        self.assertTrue(data.find(seek) != -1)


if __name__ == '__main__':
    unittest.main()

162
ffnet.py Normal file
View file

@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from adapter import *
try:
    import login_password
except ImportError:
    # Optional credentials module; keep the name bound when it is missing.
    login_password = None

try:
    # Google App Engine's URL fetch API; when it imports, it is used instead of urllib2.
    from google.appengine.api.urlfetch import fetch as googlefetch
    appEngine = True
except ImportError:
    appEngine = False
class FFNet(FanfictionSiteAdapter):
    """Adapter for stories on fanfiction.net; never requires a login."""

    def __init__(self, url):
        """Parse a story URL whose path looks like /s/<storyId>/<chapter>[/<title>]."""
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        # Placeholders used until extractIndividualUrls() finds the real values.
        # The second assignment used to overwrite storyName, leaving authorName unset.
        self.storyName = 'FF.Net story'
        self.authorName = 'FF.Net author'

        # A five-part split means there is a trailing component after the chapter
        # number (or a trailing slash); drop it together with the leading empty part.
        spl = self.path.split('/')
        if len(spl) == 5:
            self.path = "/".join(spl[1:-1])

        if self.path.startswith('/'):
            self.path = self.path[1:]

        if self.path.endswith('/'):
            self.path = self.path[:-1]

        (s, self.storyId, chapter) = self.path.split('/')
        logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter))

        if not appEngine:
            self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        else:
            # On App Engine pages are fetched through googlefetch instead.
            self.opener = None

        logging.debug("Created FF.Net: url=%s" % (self.url))

    def _getLoginScript(self):
        """Return the normalised story path."""
        return self.path

    def requiresLogin(self, url = None):
        """Always False."""
        return False

    def performLogin(self, url = None):
        """Nothing to do; always True."""
        return True

    def _fetchUrl(self, url):
        """Return the page body: decoded text via urllib2, raw content on App Engine."""
        if not appEngine:
            return self.opener.open(url).read().decode('utf-8')
        else:
            return googlefetch(url).content

    def extractIndividualUrls(self):
        """Scan the story page line by line.

        Sets storyName and authorName when their marker lines are found and
        returns [(chapter url, chapter title), ...] from the chapter drop-down.
        """
        data = self._fetchUrl(self.url)
        urls = []
        for line in data.split('\n'):
            if line.find("<img src='http://c.fanfiction.net/static/ficons/script.png' width=16 height=16 border=0 align=absmiddle>") != -1:
                s2 = bs.BeautifulStoneSoup(line)
                self.storyName = s2.find('b').string
            elif line.find("<a href='/u/") != -1:
                s2 = bs.BeautifulStoneSoup(line)
                self.authorName = s2.a.string
            elif line.find("<SELECT title='chapter navigation'") != -1:
                # Only the first chapter drop-down is used.
                if len(urls) > 0:
                    continue
                markup = line
                # _fetchUrl() already returns unicode outside App Engine; decoding
                # unicode again fails on non-ASCII titles, so only decode byte strings.
                if not isinstance(markup, unicode):
                    markup = markup.decode('utf-8')
                # Numeric character references are replaced by a space before parsing.
                markup = re.sub(r'&\#[0-9]+;', ' ', markup)
                s2 = bs.BeautifulSoup(markup)
                for o in s2.findAll('option'):
                    url = 'http://fanfiction.net/s/' + self.storyId + '/' + o['value']
                    title = o.string
                    logging.debug('URL = `%s`, Title = `%s`' % (url, title))
                    urls.append((url,title))
        return urls

    def getText(self, url):
        """Return the prettified <div> from the line carrying the story-start marker.

        Returns None when no line contains the marker.
        """
        data = self._fetchUrl(url)
        for line in data.split('\n'):
            if line.find('<!-- start story -->') != -1:
                s2 = bs.BeautifulStoneSoup(line)
                return s2.div.prettify()
        return None

    def setLogin(self, login):
        """Store the user name (unused: no login is needed)."""
        self.login = login

    def setPassword(self, password):
        """Store the password (unused: no login is needed)."""
        self.password = password

    def getStoryName(self):
        """Return the story title, or the placeholder when none was found."""
        return self.storyName

    def getAuthorName(self):
        """Return the author name, or the placeholder when none was found."""
        return self.authorName
class FFA_UnitTests(unittest.TestCase):
    # Live tests for the FFNet adapter; each case downloads pages from fanfiction.net.
    # NOTE(review): the class is named FFA_UnitTests although it exercises FFNet.

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        # Author and title are picked up while scanning the story page.
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        adapter.extractIndividualUrls()
        self.assertEquals('Beka0502', adapter.getAuthorName())
        self.assertEquals("Draco's Redemption", adapter.getStoryName())

    def testChaptersCountNames(self):
        adapter = FFNet('http://www.fanfiction.net/s/5257563/1')
        chapters = adapter.extractIndividualUrls()
        self.assertEquals(8, len(chapters))

    def testGetText(self):
        storyUrl = 'http://www.fanfiction.net/s/5257563/1'
        chapterText = FFNet(storyUrl).getText(storyUrl)
        expected = 'He was just about to look at some photos when he heard a crack'
        self.assertTrue(chapterText.find(expected) != -1)

    def testBrokenWands(self):
        # Smoke test only: both calls must complete, nothing is asserted.
        storyUrl = 'http://www.fanfiction.net/s/1527263/30/Harry_Potter_and_Broken_Wands'
        adapter = FFNet(storyUrl)
        text = adapter.getText(storyUrl)
        urls = adapter.extractIndividualUrls()


if __name__ == '__main__':
    unittest.main()

75
fictionalley.py Normal file
View file

@ -0,0 +1,75 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
class FictionAlley:
    """Helper for fictionalley.org pages.

    Unlike the other adapters it works on page data supplied by the caller and
    does not fetch anything itself.
    """

    def __init__(self, url = None):
        """Optionally remember the story URL.

        `url` is accepted (and defaults to None) so that both the no-argument
        call in this module and the one-argument call made by the downloader work.
        """
        self.url = url
        # Defaults so the getters and the progress message work before the
        # caller assigns an author or a page has been parsed.
        self.storyName = ''
        self.authorName = ''

    def extractIndividualUrls(self, data, host, contents):
        """Parse an index page and return [(chapter url, chapter title), ...].

        Sets storyName from the part of <title> after the first '-'.
        `host` and `contents` are accepted but not used. The author is not
        parsed from the page; authorName keeps whatever the caller assigned.
        """
        soup = bs.BeautifulStoneSoup(data)
        pageTitle = soup.find('title').string
        self.storyName = "-".join(pageTitle.split('-')[1:]).strip()

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        result = []
        for a in soup.findAll('a', { 'class' : 'chapterlink' } ):
            result.append((a['href'], a.string))
        return result

    def getStoryName(self):
        """Return the title set by extractIndividualUrls() ('' before that)."""
        return self.storyName

    def getAuthorName(self):
        """Return the author assigned by the caller ('' by default)."""
        return self.authorName

    def getText(self, data, fetch = False):
        """Return the prettified div#storytext from `data`, or '<html/>' when absent."""
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        """Return `url` unchanged."""
        return url

    def getPasswordLine(self):
        """Return a fixed placeholder string."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return a fixed placeholder string."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return (credential field names, extra form fields) as two dicts."""
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
if __name__ == '__main__':
    # Manual smoke run against one index page: list its chapter links, then
    # print whatever story text that same page contains.
    storyUrl = 'http://www.fictionalley.org/authors/drt/DA.html'
    page = u2.urlopen(storyUrl).read()
    siteHost = up.urlparse(storyUrl).netloc

    alley = FictionAlley()
    # The author is not parsed from the page, so it is supplied by hand.
    alley.authorName = 'DrT'

    chapterLinks = alley.extractIndividualUrls(page, siteHost, storyUrl)
    pp.pprint(chapterLinks)
    print(alley.getText(page))

98
ficwad.py Normal file
View file

@ -0,0 +1,98 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from adapter import *
class FicWad(FanfictionSiteAdapter):
    """Adapter for stories on ficwad.com; never requires a login."""

    def __init__(self, url):
        """Remember the story URL and its host."""
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url = None):
        """Always False. `url` is optional, matching the base class, because
        FanficLoader calls this without arguments."""
        return False

    def performLogin(self, url = None):
        """Nothing to do; reports success like the other login-free adapter."""
        return True

    def setLogin(self, login):
        """Store the user name (unused: no login is needed)."""
        self.login = login

    def setPassword(self, password):
        """Store the password (unused: no login is needed)."""
        self.password = password

    def extractIndividualUrls(self):
        """Fetch the story page; set story and author names, return [(url, title), ...]."""
        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        # The page title has the form '<story> :: ...'; keep the first part.
        pageTitle = soup.find('title').string
        self.storyName = pageTitle.split('::')[0].strip()

        author = soup.find('span', {'class' : 'author'})
        self.authorName = author.a.string

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        # The 'goto' drop-down lists one <option> per chapter.
        select = soup.find('select', { 'name' : 'goto' } )
        result = []
        for o in select.findAll('option'):
            result.append((o['value'], o.string))
        return result

    def getStoryName(self):
        """Return the title found by extractIndividualUrls()."""
        return self.storyName

    def getAuthorName(self):
        """Return the author found by extractIndividualUrls()."""
        return self.authorName

    def getText(self, url):
        """Fetch a chapter; return the prettified div#storytext or '<html/>' when absent."""
        if url.find('http://') == -1:
            # Site-relative links may start with '/'; strip it so the result is
            # http://host/path rather than http://host//path.
            url = 'http://' + self.host + '/' + url.lstrip('/')

        data = u2.urlopen(url).read()
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            return '<html/>'
        return div.prettify()

    def getPrintableUrl(self, url):
        """Return `url` unchanged."""
        return url

    def getPasswordLine(self):
        """Return a fixed placeholder string."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return a fixed placeholder string."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return (credential field names, extra form fields) as two dicts."""
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)
if __name__ == '__main__':
    # Manual smoke run: list the chapters of one story and print the text of its first page.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() expects a URL and downloads it itself; passing the page HTML
    # (as was done before) made it try to open the markup as an address.
    print(fw.getText(url))

17
html_constants.py Normal file
View file

@ -0,0 +1,17 @@
# Whole-page template used by HTMLWriter through string.Template.
# Placeholders: ${title}, ${author}, ${body}. Despite the name it contains the
# complete document, including the closing tags.
# The <div> wrapping the content is now closed before </body>; it was left open.
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h1>${title} by ${author}</h1>
${body}
</div>
</body></html>
'''
# Heading emitted before each chapter; placeholder: ${chapter}.
XHTML_CHAPTER_START = '''<h2>${chapter}</h2>'''
# Empty: the closing markup lives in XHTML_START.
XHTML_END = ''''''

252
output.py Normal file
View file

@ -0,0 +1,252 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import codecs
import shutil
import string
import base64
import os.path
import zipfile
import StringIO
import logging
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import zipdir
import html_constants
from constants import *
class FanficWriter:
    """Interface shared by the writers in this module; every method is a no-op here."""

    def __init__(self):
        """Nothing to initialise in the base class."""

    def writeChapter(self, title, text):
        """Accept one chapter (title and body text); does nothing in the base class."""

    def finalise(self):
        """Finish the output; does nothing in the base class."""
class HTMLWriter(FanficWriter):
    """Collects all chapters into one HTML page, written to a file or an in-memory buffer."""

    # Accumulated chapter markup; each instance rebinds its own copy.
    body = ''

    def __init__(self, base, name, author, inmemory=False, compress=False):
        """Prepare the output target.

        base     -- directory the .html file is written to
        name     -- story title; spaces become underscores in the file name
        author   -- author name shown in the page heading
        inmemory -- when true, write to a StringIO buffer (`self.output`) instead of a file
        compress -- accepted for signature compatibility with the other writer; not used
        """
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = author
        self.inmemory = inmemory
        self.body = ''

        if not self.inmemory and os.path.exists(self.fileName):
            os.remove(self.fileName)

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            self.output = open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def writeChapter(self, title, text):
        """Append a chapter heading and its text to the page body.

        Byte strings are decoded as UTF-8. Values that are already unicode are
        used as they are; decoding them again failed for non-ASCII characters.
        """
        if not isinstance(title, unicode):
            title = title.decode('utf-8')
        if not isinstance(text, unicode):
            text = text.decode('utf-8')
        self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
        self.body = self.body + '\n' + text

    def finalise(self):
        """Render the page template, prettify it and write it to the output."""
        html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
        soup = bs.BeautifulSoup(html)
        result = soup.prettify()

        try:
            self.output.write(result)
        finally:
            # The in-memory buffer stays open so callers can read it back.
            if not self.inmemory:
                self.output.close()
class EPubFanficWriter(FanficWriter):
    """Builds an EPUB: every archive member is staged in memory, then zipped by finalise()."""

    def _writeFile(self, fileName, data):
        """Append `data` (decoded as UTF-8) to the archive member `fileName`, creating it on first use."""
        if fileName not in self.files:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                self.files[fileName] = open(self.directory + '/' + fileName, 'w')
        self.files[fileName].write(data.decode('utf-8'))

    def _closeFiles(self):
        """Close staged members when they are real files (never the case at present)."""
        if not self.inmemory:
            for f in self.files:
                self.files[f].close()

    def __init__(self, base, name, author, inmemory=False, compress=True):
        """Prepare an empty book and stage the fixed EPUB members.

        base     -- directory the .epub is written to when not in-memory
        name     -- story title; spaces become underscores in the file name
        author   -- author name stored in the package metadata
        inmemory -- when true, finalise() leaves the zip in `self.output`;
                    otherwise it writes <base>/<name>.epub
        compress -- accepted but not used
        """
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.directory = self.basePath + '/' + self.name
        self.authorName = author

        # Archive path -> buffer, and (title, file name) per chapter in order.
        self.files = {}
        self.chapters = []

        # Members are always staged in memory; the caller's flag only decides
        # whether finalise() writes the archive to disk. (The old directory
        # creation branch could never run because inmemory was forced on first.)
        self.writeToFile = not inmemory
        self.inmemory = True

        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def _removeEntities(self, text):
        """Replace the entities listed in `entities`, then escape every remaining '&'."""
        for e in entities:
            v = entities[e]
            text = text.replace(e, v)

        text = text.replace('&', '&amp;')

        return text

    def writeChapter(self, title, text):
        """Clean up one chapter and stage it as OEBPS/<base64 of title>.xhtml."""
        # '/' can occur in base64 output and would otherwise look like a path separator.
        fileName = base64.b64encode(title).replace('/', '_') + ".xhtml"
        fn = 'OEBPS/' + fileName

        text = self._removeEntities(text)

        self.soup = bs.BeautifulStoneSoup(text)

        # Drop every attribute that is not explicitly allowed.
        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]

        # Remove elements whose only content is whitespace or a literal '&nbsp;'.
        allPs = self.soup.findAll(recursive=True)
        for p in allPs:
            if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;' ) :
                p.extract()

        # NOTE(review): `br.contents != None` looks always true, so every <br>/<hr>
        # that is found gets renamed to <p> -- confirm this is intended.
        allBrs = self.soup.findAll(recursive=True, name = ["br", "hr"])
        for br in allBrs:
            if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
                br.name = 'p'

        text = self.soup.prettify()

        tt = self._removeEntities(title)
        self._writeFile(fn, XHTML_START % (tt, tt))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)

        self.chapters.append((title, fileName))

    def finalise(self):
        """Stage toc.ncx and content.opf, zip everything, then write or expose the archive."""
        logging.debug("Finalising...")

        # Table of contents (NCX).
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(tocFilePath, TOC_START % self.storyTitle)

        # Package document (OPF).
        opfFilePath = "OEBPS/content.opf"
        self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName))

        ids = []
        i = 0
        for chapterTitle, chapterFile in self.chapters:
            chapterId = base64.b64encode(chapterTitle)
            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(chapterTitle), chapterFile))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, chapterFile))

            ids.append(chapterId)

            i = i + 1

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        for chapterId in ids:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'

        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            # A zip archive is binary data; text mode can corrupt it on
            # platforms that translate line endings.
            epubFile = open(filename, 'wb')
            try:
                epubFile.write(zipdata.getvalue())
            finally:
                epubFile.close()
        else:
            self.output = zipdata

69
zipdir.py Normal file
View file

@ -0,0 +1,69 @@
import os
import zipfile
import logging
import StringIO
def toZip(filename, directory):
    """Create the zip archive `filename` from the contents of `directory`.

    Top-level entries whose name starts with '.' are skipped. Files are stored
    under their own name; other entries are handed to addFolderToZip().
    The archive is closed even when adding an entry fails.
    """
    zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
    try:
        for entity in os.listdir(directory):
            if entity.startswith('.'):
                continue
            each = os.path.join(directory, entity)
            # Progress goes to the log instead of being printed to stdout.
            logging.debug('Adding %s', each)
            if os.path.isfile(each):
                zippedHelp.write(each, arcname=entity)
            else:
                addFolderToZip(zippedHelp, entity, each)
    finally:
        zippedHelp.close()
def addFolderToZip(zippedHelp,folder,fpath):
    """Add everything below the directory `fpath` to the open archive.

    zippedHelp -- zipfile.ZipFile opened for writing
    folder     -- archive path prefix for the entries ('.' and '..' are ignored)
    fpath      -- directory on disk to read from

    Sub-directories are added recursively under `folder/<name>`.
    """
    if folder == '.' or folder == '..':
        return
    for f in os.listdir(fpath):
        fullPath = fpath + '/' + f
        arcName = folder + '/' + f
        if os.path.isfile(fullPath):
            zippedHelp.write(fullPath, arcName, zipfile.ZIP_DEFLATED)
        elif os.path.isdir(fullPath):
            # Test the path inside fpath (not relative to the working directory)
            # and pass all three arguments to the recursive call.
            addFolderToZip(zippedHelp, arcName, fullPath)
def inMemoryZip(files):
    """Build a zip archive in memory and return the buffer holding it.

    files -- dict mapping archive path to either a string or an object with a
             getvalue() method (e.g. a StringIO buffer); content is stored UTF-8 encoded.

    A member named 'mimetype' is written first and without compression, which is
    what the EPUB container format requires of that entry; everything else is deflated.
    """
    buf = StringIO.StringIO()
    memzip = zipfile.ZipFile(buf, 'a', compression=zipfile.ZIP_DEFLATED)

    paths = list(files)
    if 'mimetype' in files:
        paths.remove('mimetype')
        paths.insert(0, 'mimetype')

    for path in paths:
        entry = files[path]
        # Buffers expose getvalue(); plain strings (byte or unicode) are used directly.
        if hasattr(entry, 'getvalue'):
            data = entry.getvalue()
        else:
            data = entry
        payload = data.encode('utf-8')

        if path == 'mimetype':
            info = zipfile.ZipInfo(path)
            info.compress_type = zipfile.ZIP_STORED
            memzip.writestr(info, payload)
        else:
            memzip.writestr(path, payload)

    # Mark every entry as created on system 0 before the directory is written by close().
    for zf in memzip.filelist:
        zf.create_system = 0

    memzip.close()

    return buf
if __name__ == '__main__':
    # Manual check: build a two-member archive and save it as res.zip.
    files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
    data = inMemoryZip(files)
    # inMemoryZip() returns a buffer, so write its contents (not the object),
    # and open the target in binary mode because a zip is binary data.
    f = open('res.zip', 'wb')
    try:
        f.write(data.getvalue())
    finally:
        f.close()