mirror of https://github.com/JimmXinu/FanFicFare.git
commit c0459faa43: added
7 changed files with 2415 additions and 0 deletions
BeautifulSoup.py (new file, 1711 lines)
(diff suppressed because the file is too large)
constants.py (new file, 135 lines)
@@ -0,0 +1,135 @@
CSS = '''body { margin-left: 5%; margin-right: 5%; margin-top: 5%; margin-bottom: 5%; text-align: justify; }
pre { font-size: x-small; }
h1 { text-align: center; }
h2 { text-align: center; }
h3 { text-align: center; }
h4 { text-align: center; }
h5 { text-align: center; }
h6 { text-align: center; }
.CI {
    text-align:center;
    margin-top:0px;
    margin-bottom:0px;
    padding:0px;
}
.center {text-align: center;}
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
'''

MIMETYPE = '''application/epub+zip'''

CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
'''

CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
         unique-identifier="BookId-Epub-%s">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
            xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:title>%s</dc:title>
    <dc:creator opf:role="aut">%s</dc:creator>
    <dc:language>en-UK</dc:language>
    <dc:rights></dc:rights>
    <dc:publisher>sgzmd</dc:publisher>
    <dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
  </metadata>
  <manifest>
    <item id="ncx" href="toc.ncx" media-type="text/xml" />
    <item id="style" href="stylesheet.css" media-type="text/css" />
'''

CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'

CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
'''

CONTENT_ITEMREF = '''<itemref idref="%s" />'''

CONTENT_END = '''</spine>
</package>
'''

TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="sigizmund.com062820072147132"/>
    <meta name="dtb:depth" content="1"/>
    <meta name="dtb:totalPageCount" content="0"/>
    <meta name="dtb:maxPageNumber" content="0"/>
  </head>
  <docTitle>
    <text>%s</text>
  </docTitle>
  <navMap>
'''

TOC_ITEM = '''<navPoint id="%s" playOrder="%d">
  <navLabel>
    <text>%s</text>
  </navLabel>
  <content src="%s"/>
</navPoint>
'''

TOC_END = '''</navMap>
</ncx>
'''

XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>%s</title>
  <link href="stylesheet.css" type="text/css" rel="stylesheet" />
</head>
<body>
<div>
<h3>%s</h3>
'''

XHTML_END = '''</div>
</body>
</html>
'''

acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
                       'blockquote', 'br', 'center', 'cite', 'code', 'col',
                       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em',
                       'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'i',
                       'ins', 'kbd', 'label', 'li', 'ol',
                       'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike',
                       'strong', 'sub', 'sup', 'u', 'ul']

acceptable_attributes = ['href']
# HTML entities mapped to plain-text replacements.
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'', '&lsquo;' : '\'', '&quot;' : '"' }

FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>
  <title-info>
    <genre>fanfiction</genre>
    <author>
      <first-name></first-name>
      <middle-name></middle-name>
      <last-name>%s</last-name>
    </author>
    <book-title>%s</book-title>
    <lang>eng</lang>
  </title-info>
  <document-info>
    <author>
      <nickname>sgzmd</nickname>
    </author>
    <date value="%s">%s</date>
    <id>sgzmd_%s</id>
    <version>2.0</version>
  </document-info>
</description>'''
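
For orientation: the OPF templates above are meant to be concatenated in manifest-then-spine order, which is exactly what `finalise()` in output.py does. A minimal sketch of that composition; the ids, hrefs, title and author here are made up for illustration:

```python
# Illustrative only: compose a two-chapter OPF from the templates above.
# The chapter ids/hrefs and metadata values are hypothetical.
import uuid

chapters = [('ch1', 'chapter1.xhtml'), ('ch2', 'chapter2.xhtml')]

opf = CONTENT_START % (uuid.uuid4().urn, 'Some Story', 'Some Author')
for cid, href in chapters:
    opf += CONTENT_ITEM % (cid, href) + '\n'   # manifest entries
opf += CONTENT_END_MANIFEST
for cid, href in chapters:
    opf += CONTENT_ITEMREF % cid + '\n'        # spine entries, reading order
opf += CONTENT_END
print(opf)
```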
downaloder.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import os
import re
import sys
import shutil
import os.path
import getpass
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

import ffa
import ficwad
import output
import fictionalley


class FanficLoader:
    '''A controller class which handles the interaction between various specific downloaders and writers'''
    booksDirectory = "books"

    def __init__(self, adapter, writerClass):
        self.adapter = adapter
        self.writerClass = writerClass

    def download(self):
        urls = self.adapter.extractIndividualUrls()
        self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())

        for u, n in urls:
            text = self.adapter.getText(u)
            self.writer.writeChapter(n, text)

        self.writer.finalise()


if __name__ == '__main__':
    (url, format) = sys.argv[1:]

    if type(url) is unicode:
        print('URL is unicode')
        url = url.encode('latin1')

    adapter = None
    writerClass = None

    # Pick the site-specific adapter based on the URL.
    if url.find('fanficauthors') != -1:
        adapter = ffa.FFA(url)
    elif url.find('fictionalley') != -1:
        adapter = fictionalley.FictionAlley(url)
    elif url.find('ficwad') != -1:
        adapter = ficwad.FicWad(url)
    else:
        print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
        sys.exit(1)

    if format == 'epub':
        writerClass = output.EPubFanficWriter

    if adapter.requiresLogin(url):
        print("Meow, URL %s requires you to haz been logged in! Please can I haz this datas?" % url)
        sys.stdout.write("Can I haz ur login? ")
        login = sys.stdin.readline().strip()
        password = getpass.getpass(prompt='Can I haz ur password? ')

        adapter.setLogin(login)
        adapter.setPassword(password)

    loader = FanficLoader(adapter, writerClass)
    loader.download()
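
`FanficLoader` only ever touches an adapter through a small informal interface: a constructor taking the story URL, plus the login and chapter-extraction methods called above. As a reading aid, a minimal stub that would satisfy it; every name and return value here is made up (the real adapters are `FFA`, `FictionAlley` and `FicWad`):

```python
# Illustrative stub of the adapter interface FanficLoader expects.
class DummyAdapter:
    def __init__(self, url):
        self.url = url

    def requiresLogin(self, url):
        return False

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def extractIndividualUrls(self):
        # (chapter_url, chapter_title) pairs, in reading order
        return [('chapter1', 'Chapter 1'), ('chapter2', 'Chapter 2')]

    def getStoryName(self):
        return 'Dummy Story'

    def getAuthorName(self):
        return 'Dummy Author'

    def getText(self, url):
        # xhtml fragment for one chapter
        return '<div><p>...</p></div>'
```

`FanficLoader(DummyAdapter('http://example.invalid'), output.EPubFanficWriter).download()` would then produce a two-chapter book under `books/`.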
ffa.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

from constants import *
from ficwad import *
from output import *


class FFA:
    storyName = None

    def __init__(self, url=None):
        self.url = url
        self.grabUrl = re.compile(r'(<option.+value=")(.+?)">(.+?)<')
        self.grabAuthor = re.compile(r".+pemail.+'(\w+)")

    def getPasswordLine(self):
        return '<input type="password" name="pass"'

    def getLoginScript(self):
        return '/scripts/login.php'

    def getLoginPasswordOthers(self):
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getPrintableUrl(self, url):
        return url + '?print=yes'

    def _findIndex(self, lines, what, start):
        for i in range(start, len(lines)):
            if lines[i].find(what) != -1:
                return i
        return -1

    def extractIndividualUrls(self, data, host, first, fetch=False):
        lines = data.split('\n')

        optionLines = filter(lambda x: x.find('<option value="') != -1, lines)

        authorLines = filter(lambda x: x.find('pemail') != -1, lines)
        for al in authorLines:
            m = self.grabAuthor.match(al)
            if m is not None:
                self.authorName = m.group(1)
                break

        # The chapter <select> appears twice on the page; keep one copy.
        optionLines = optionLines[:len(optionLines) / 2]

        storyName = first.split("/")[1]

        result = []
        urls = []
        for line in optionLines:
            m = self.grabUrl.match(line)
            u = m.group(2)
            if u.find('" selected="selected') != -1:
                u = u.replace('" selected="selected', '')

            if u in urls:
                continue
            else:
                urls.append(u)

            result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))

        self.soup = bs.BeautifulSoup(data)
        titles = self.soup.findAll(name='title', recursive=True)
        if len(titles) > 0:
            title = titles[0]
            print(title)
            (website, rest) = title.string.split('::')
            story_chapter = rest.split("-")

            story = story_chapter[0].strip()
            self.storyName = story

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch=False):
        lines = data.split('\n')
        begin = self._findIndex(lines, '</select>', 0) + 1
        if begin == 0:
            begin = self._findIndex(lines, '<div><p>', 24)

        if begin == 0:
            print('BAD start')
            pp.pprint(lines)
            sys.exit(1)
        end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
        print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
        return "\n".join(lines[begin:end])


class Downloader:
    login = None
    password = None
    url = None
    host = None
    first = None
    opener = None

    writer = None

    def __init__(self, url, login, password):
        self.login = login
        self.password = password
        self.url = url

        self.infoProvider = FicWad(url)  # FFA()

        parse = up.urlparse(url)
        self.host = parse.scheme + '://' + parse.netloc
        self.first = parse.path

        self.loginUrl = self.host + self.infoProvider.getLoginScript()

        # Cookie-aware opener so the login session survives across requests.
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

    def _loginRequired(self):
        print('is login required?')
        resp = self.opener.open(self.url)
        data = resp.read()
        if data.find(self.infoProvider.getPasswordLine()) != -1:
            print('yep')
            return True
        else:
            print('nada')
            return False

    def _login(self):
        (login, data) = self.infoProvider.getLoginPasswordOthers()

        data[login['login']] = self.login
        data[login['password']] = self.password

        urlvals = u.urlencode(data)
        req = self.opener.open(self.loginUrl, urlvals)

        # If the password prompt is still present, the login failed.
        if req.read().find(self.infoProvider.getPasswordLine()) != -1:
            return False
        else:
            return True

    def _getContent(self, url):
        print("<!-- Opening %s -->" % url)
        return self.opener.open(url).read()

    def download(self):
        first = self._getContent(self.host + self.first)
        urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)

        self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())

        for u, n in urls:
            text = self.infoProvider.getText(self._getContent(self.host + "/" + u))
            self.writer.writeChapter(n, text)

        self.writer.finalise()


if __name__ == '__main__':
    f = Downloader(sys.argv[1], 'sigizmund', '***************')
    if f._loginRequired():
        f._login()
    f.download()
fictionalley.py (new file, 75 lines)
@@ -0,0 +1,75 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs


class FictionAlley:
    def __init__(self, url=None):
        self.url = url

    def extractIndividualUrls(self, data, host, contents):
        soup = bs.BeautifulStoneSoup(data)

        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip()

        # authorName is expected to have been set by the caller beforehand
        # (see __main__ below); the page is not parsed for it yet.
        print('Story "%s" by %s' % (self.storyName, self.authorName))

        links = soup.findAll('a', {'class': 'chapterlink'})

        result = []
        for a in links:
            url = a['href']
            title = a.string
            result.append((url, title))

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, data, fetch=False):
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id': 'storytext'})
        if div is None:
            return '<html/>'

        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)


if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    data = u2.urlopen(url).read()
    host = up.urlparse(url).netloc
    fw = FictionAlley()
    fw.authorName = 'DrT'
    urls = fw.extractIndividualUrls(data, host, url)
    pp.pprint(urls)
    print(fw.getText(data))
ficwad.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs


class FicWad:
    def __init__(self, url):
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url):
        return False

    def performLogin(self, url):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def extractIndividualUrls(self):
        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        title = soup.find('title').string
        self.storyName = title.split('::')[0].strip()

        author = soup.find('span', {'class': 'author'})
        self.authorName = author.a.string

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        # The chapter list lives in a <select name="goto"> drop-down.
        select = soup.find('select', {'name': 'goto'})

        allOptions = select.findAll('option')
        result = []
        for o in allOptions:
            url = o['value']
            # if type(url) is unicode:
            #     url = url.encode('utf-8')
            title = o.string
            result.append((url, title))

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getText(self, url):
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        data = u2.urlopen(url).read()

        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id': 'storytext'})
        if div is None:
            return '<html/>'

        return div.prettify()

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)


if __name__ == '__main__':
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    print(fw.getText(urls[0][0]))
output.py (new file, 136 lines)
@@ -0,0 +1,136 @@
import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

from constants import *


class FanficWriter:
    def __init__(self):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass


class HTMLWriter(FanficWriter):
    def __init__(self, base, name, author):
        pass

    def writeChapter(self, title, text):
        pass

    def finalise(self):
        pass


class EPubFanficWriter(FanficWriter):
    def __init__(self, base, name, author):
        self.chapters = []
        self.basePath = base
        self.name = name.replace(" ", "_")
        self.storyTitle = name
        self.directory = self.basePath + '/' + self.name

        self.authorName = author

        if os.path.exists(self.directory):
            shutil.rmtree(self.directory)

        os.mkdir(self.directory)

        os.mkdir(self.directory + '/META-INF')
        os.mkdir(self.directory + '/OEBPS')

        print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
        print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
        print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS

    def _removeEntities(self, text):
        for e in entities:
            v = entities[e]
            text = text.replace(e, v)

        return text

    def writeChapter(self, title, text):
        # The base64 of the title doubles as file name and manifest id.
        fileName = base64.b64encode(title) + ".xhtml"
        filePath = self.directory + "/OEBPS/" + fileName
        f = open(filePath, 'w')

        text = self._removeEntities(text)

        self.soup = bs.BeautifulStoneSoup(text)

        # Strip every attribute that is not whitelisted in constants.py.
        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]

        # Drop tags that contain only whitespace or a lone &nbsp;.
        allPs = self.soup.findAll(recursive=True)
        for p in allPs:
            if p.string is not None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;'):
                p.extract()

        # Promote br/hr tags that carry actual content to paragraphs.
        allBrs = self.soup.findAll(recursive=True, name=["br", "hr"])
        for br in allBrs:
            if (br.string is not None and len(br.string.strip()) != 0) or len(br.contents) > 0:
                br.name = 'p'

        # cleanup(self.soup)

        text = self.soup.prettify()

        print >> f, XHTML_START % (title, title)
        print >> f, text
        print >> f, XHTML_END

        self.chapters.append((title, fileName))

    def finalise(self):
        ### writing table of contents -- ncx file
        tocFilePath = self.directory + "/OEBPS/toc.ncx"
        toc = open(tocFilePath, 'w')
        print >> toc, TOC_START % self.storyTitle

        ### writing content -- opf file
        opfFilePath = self.directory + "/OEBPS/content.opf"
        opf = open(opfFilePath, 'w')

        print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)

        ids = []

        i = 0
        for t, f in self.chapters:
            chapterId = base64.b64encode(t)
            print >> toc, TOC_ITEM % (chapterId, i, cgi.escape(t), f)

            print >> opf, CONTENT_ITEM % (chapterId, f)

            ids.append(chapterId)

            i = i + 1

        print >> toc, TOC_END
        print >> opf, CONTENT_END_MANIFEST

        for chapterId in ids:
            print >> opf, CONTENT_ITEMREF % chapterId

        print >> opf, CONTENT_END
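
`EPubFanficWriter` leaves behind an unzipped EPUB directory tree; nothing in this commit packs it into a single .epub container. A rough sketch of what that step could look like, assuming the directory layout created above (the OCF spec requires `mimetype` to be the first entry and stored uncompressed); `packEpub` is a hypothetical helper, not part of the commit:

```python
# Hypothetical packaging step: zip the directory produced by
# EPubFanficWriter into a .epub, with mimetype first and uncompressed.
import os
import zipfile

def packEpub(directory, epubPath):
    z = zipfile.ZipFile(epubPath, 'w', zipfile.ZIP_DEFLATED)
    z.write(os.path.join(directory, 'mimetype'), 'mimetype', zipfile.ZIP_STORED)
    for root, dirs, files in os.walk(directory):
        for name in files:
            if name == 'mimetype':
                continue
            full = os.path.join(root, name)
            z.write(full, os.path.relpath(full, directory))
    z.close()
```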