Adding Mobi format and making final configuration changes before uploading a beta version.

This commit is contained in:
sigizmund 2010-11-23 07:15:18 +00:00
parent cf0d1bf09b
commit ebcce7e42d
5 changed files with 533 additions and 9 deletions

View file

@ -1,13 +1,9 @@
application: fanfictionloader
version: 2-5-5
version: 2-6-beta
runtime: python
api_version: 1
handlers:
- url: /generate_mock_data
script: mocks/generate_mock_data.py
login: admin
- url: /r3m0v3r
script: utils/remover.py
login: admin
@ -25,7 +21,5 @@ handlers:
- url: /static
static_dir: static
- url: /.*
script: main.py

View file

@ -160,6 +160,8 @@ if __name__ == '__main__':
writerClass = output.EPubFanficWriter
elif bookFormat == 'html':
writerClass = output.HTMLWriter
elif bookFormat == 'mobi':
writerClass = output.MobiWriter
elif bookFormat == 'text':
writerClass = output.TextWriter

121
fanficdownloader/html.py Normal file
View file

@ -0,0 +1,121 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
import re
import sys
import StringIO
import urllib
from BeautifulSoup import BeautifulSoup
class HtmlProcessor:
    '''Parses an HTML document and rewrites it for use inside a mobi book.

    The constructor parses the markup with BeautifulSoup and records the
    document title in self.title (None when there is no usable title).
    CleanHtml() returns the cleaned, serialised document; RenameAnchors()
    returns the body contents with every internal anchor prefixed.
    '''

    WHITESPACE_RE = re.compile(r'\s')
    # Look for </blockquote <p>
    BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)

    def __init__(self, html, unfill=0):
        '''Parse html.

        html:   the raw markup to process.
        unfill: when true, <pre> text is split on blank lines and joined
                with <p>; otherwise it is split on every newline and
                joined with <br> (see _FixPreContents).
        '''
        self.unfill = unfill
        html = self._ProcessRawHtml(html)
        self._soup = BeautifulSoup(html)
        title_tag = self._soup.title
        # An empty <title></title> has no children; indexing contents[0]
        # would raise IndexError, so treat it like a missing title.
        if title_tag is not None and title_tag.contents:
            self.title = title_tag.contents[0]
        else:
            self.title = None

    def _ProcessRawHtml(self, html):
        '''Collapse unterminated tags such as "</blockquote <p>" to "<".

        Returns the repaired markup; the number of repairs (if any) is
        reported on stderr.
        '''
        new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
        if count:
            print >>sys.stderr, 'Replaced %d bad tags' % count
        return new_html

    def _StubInternalAnchors(self):
        '''Replace each internal anchor with a fixed-size filepos anchor.

        Looks for every anchor with <a href="#myanchor"> and replaces that
        with <a filepos="0000000050"> (a ten-digit, zero-padded sequence
        number). Stores (number, original href) pairs in
        self._anchor_references so _ReplaceAnchorStubs can patch in the
        real positions later.
        '''
        self._anchor_references = []
        internal_links = self._soup.findAll('a', href=re.compile('^#'))
        for anchor_num, anchor in enumerate(internal_links):
            self._anchor_references.append((anchor_num, anchor['href']))
            del anchor['href']
            anchor['filepos'] = '%.10d' % anchor_num

    def _ReplaceAnchorStubs(self):
        '''Serialise the document and swap each filepos stub for the
        position of the anchor it refers to.

        Deletes self._soup; the parsed tree must not be used afterwards.
        Returns the serialised document.
        '''
        # TODO: Browsers allow extra whitespace in the href names.
        assembled_text = self._soup.prettify()
        del self._soup  # shouldn't touch this anymore
        for anchor_num, original_ref in self._anchor_references:
            ref = urllib.unquote(original_ref[1:])  # remove leading '#'
            # Find the position of ref in the utf-8 document.
            # TODO(chatham): Using regexes and looking for name= would be
            # better.
            encoded_ref = ref.encode('utf-8')
            newpos = assembled_text.rfind(encoded_ref)
            if newpos == -1:
                print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
                continue
            # Don't point into the middle of the <a name> tag. The offset
            # is measured in the encoded text, so the encoded length must
            # be used; len(ref) is too small for non-ASCII anchor names.
            newpos += len(encoded_ref) + 2
            old_filepos = 'filepos="%.10d"' % anchor_num
            new_filepos = 'filepos="%.10d"' % newpos
            assert assembled_text.find(old_filepos) != -1
            assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
        return assembled_text

    def _FixPreTags(self):
        '''Replace <pre> tags with HTML-ified text.

        Only the first child of each <pre> is converted, as before.
        '''
        for pre in self._soup.findAll('pre'):
            if not pre.contents:
                # Empty <pre></pre>: nothing to convert, and contents[0]
                # would raise IndexError.
                continue
            pre.replaceWith(self._FixPreContents(str(pre.contents[0])))

    def _FixPreContents(self, text):
        '''Return text with its line structure expressed as HTML.

        Every whitespace character inside a line becomes &nbsp;. Lines are
        separated by blank lines and joined with <p> when self.unfill is
        true, otherwise separated by newlines and joined with <br>.
        '''
        if self.unfill:
            line_splitter, line_joiner = '\n\n', '<p>'
        else:
            line_splitter, line_joiner = '\n', '<br>'
        return line_joiner.join(
            [self.WHITESPACE_RE.sub('&nbsp;', line)
             for line in text.split(line_splitter)])

    def _RemoveUnsupported(self):
        '''Remove any tags which the kindle cannot handle.'''
        # TODO(chatham): <link> tags to script?
        unsupported_tags = ('script', 'style')
        for tag_type in unsupported_tags:
            for element in self._soup.findAll(tag_type):
                element.extract()

    def RenameAnchors(self, prefix):
        '''Rename every internal anchor to have the given prefix, then
        return the contents of the body tag.

        Returns an empty string when the document has no <body>.
        '''
        for anchor in self._soup.findAll('a', href=re.compile('^#')):
            anchor['href'] = '#' + prefix + anchor['href'][1:]
        for a in self._soup.findAll('a'):
            if a.get('name'):
                a['name'] = prefix + a['name']
        # TODO(chatham): figure out how to fix this. sometimes body comes out
        # as NoneType.
        body = self._soup.body
        if body is None:
            return ''
        return '\n'.join([unicode(c) for c in body.contents])

    def CleanHtml(self):
        '''Strip unsupported tags, convert anchors and <pre> blocks, and
        return the serialised document.

        The parsed tree is discarded by _ReplaceAnchorStubs, so this can
        only be called once per instance.
        '''
        # TODO(chatham): fix_html_br, fix_html
        self._RemoveUnsupported()
        self._StubInternalAnchors()
        self._FixPreTags()
        return self._ReplaceAnchorStubs()
if __name__ == '__main__':
    # Ad-hoc smoke test: run the cleaner over a sample file on disk.
    # Other samples used during development: '/tmp/documentation.html',
    # '/tmp/multipre.html'.
    FILE = '/tmp/view.html'
    f = open(FILE)
    try:
        d = f.read()
    finally:
        # Close the handle even when the read fails.
        f.close()
    h = HtmlProcessor(d)
    s = h.CleanHtml()
    #print s

344
fanficdownloader/mobi.py Normal file
View file

@ -0,0 +1,344 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
import StringIO
import struct
import time
import random
import logging
from html import HtmlProcessor
# http://wiki.mobileread.com/wiki/MOBI
# http://membres.lycos.fr/microfirst/palm/pdb.html
# Text-encoding codes; MobiHeader() writes encoding['UTF-8'] into the MOBI
# header. The numbers look like Windows code-page ids -- confirm against
# the format notes linked above.
encoding = {
    'UTF-8': 65001,
    'latin-1': 1252,
}

# Language codes; MobiHeader() writes languages['en-us'] into the MOBI
# header. Presumably Windows locale identifiers -- confirm.
languages = {
    "en": 0x0009,
    "en-gb": 0x0809,
    "en-us": 0x0409,
    "fi": 0x000b,
    "sv": 0x041d,
}
def ToHex(s):
    """Return s as space-separated two-digit lowercase hex character codes."""
    return ' '.join('%.2x' % ord(ch) for ch in s)
class _SubEntry:
    """One HTML document that is being merged into a combined book.

    pos is the 1-based position of the document; it is used to build a
    unique anchor name and a fallback title.
    """

    def __init__(self, pos, html_data):
        self.pos = pos
        self.html = HtmlProcessor(html_data)
        self._name = 'mobi_article_%d' % pos
        # Documents without a usable <title> get a positional one.
        self.title = self.html.title or 'Article %d' % self.pos

    def TocLink(self):
        """Return a table-of-contents link to this entry (title cut to 80
        characters)."""
        link_template = '<a href="#%s_MOBI_START">%.80s</a>'
        return link_template % (self._name, self.title)

    def Anchor(self):
        """Return the opening anchor tag that TocLink() points at."""
        anchor_template = '<a name="%s_MOBI_START">'
        return anchor_template % self._name

    def Body(self):
        """Return the document body with its internal anchors prefixed by
        this entry's name so they cannot clash with other entries."""
        anchor_prefix = self._name + '_'
        return self.html.RenameAnchors(anchor_prefix)
class Converter:
    """Converts HTML text into the bytes of a .mobi file.

    NOTE(review): the single Header created in __init__ keeps accumulating
    record counts and text length across conversions, so an instance looks
    single-use -- confirm before reusing one Converter for several books.
    """

    def __init__(self, refresh_url=''):
        """refresh_url: optional URL; when non-empty MakeOneHTML() puts an
        "Update Reading List" link to it at the top of the contents."""
        self._header = Header()
        self._refresh_url = refresh_url

    def ConvertString(self, s):
        """Convert one HTML string and return the mobi data as a string."""
        out = StringIO.StringIO()
        self._ConvertStringToFile(s, out)
        return out.getvalue()

    def ConvertStrings(self, html_strs):
        """Merge several HTML strings into one book and return the mobi
        data as a string. Conversion errors are logged, not raised."""
        out = StringIO.StringIO()
        self._ConvertStringsToFile(html_strs, out)
        return out.getvalue()

    def ConvertFile(self, html_file, out_file):
        """Convert the HTML file at path html_file into path out_file."""
        html_data = self._ReadFile(html_file)
        # The output is binary: text mode would corrupt it on platforms
        # that translate line endings.
        out = open(out_file, 'wb')
        try:
            self._ConvertStringToFile(html_data, out)
        finally:
            out.close()

    def ConvertFiles(self, html_files, out_file):
        """Merge the HTML files at paths html_files into one book written
        to path out_file. Conversion errors are logged, not raised."""
        html_strs = [self._ReadFile(f) for f in html_files]
        out = open(out_file, 'wb')
        try:
            self._ConvertStringsToFile(html_strs, out)
        finally:
            out.close()

    def _ReadFile(self, path):
        """Return the whole content of the file at path, closing it after."""
        f = open(path)
        try:
            return f.read()
        finally:
            f.close()

    def MakeOneHTML(self, html_strs):
        """This takes a list of HTML strings and returns a big HTML file with
        all contents consolidated. It constructs a table of contents and adds
        anchors within the text
        """
        toc_html = []
        if self._refresh_url:
            toc_html.append('<a href="%s">Update Reading List</a><br>' %
                            self._refresh_url)
        body_html = []
        # NOTE(review): the Mobipocket page-break tag is usually spelled
        # '<mbp:pagebreak/>'; kept byte-for-byte until that is confirmed.
        PAGE_BREAK = '<mdb;pagebreak>'
        for pos, html in enumerate(html_strs):
            entry = _SubEntry(pos + 1, html)
            toc_html.append('%s<br>' % entry.TocLink())
            # give some space between bodies of work.
            body_html.append(PAGE_BREAK)
            body_html.append(entry.Anchor())
            body_html.append('<h1>%s</h1>' % entry.title)
            body_html.append(entry.Body())
        # TODO: this title can get way too long with RSS feeds. Not sure how
        # to fix
        # The title is labelled GMT, so format UTC rather than local time
        # (same layout as time.ctime()).
        header = '<html><title>Bibliorize %s GMT</title><body>' % time.asctime(
            time.gmtime())
        footer = '</body></html>'
        all_html = header + '\n'.join(toc_html + body_html) + footer
        return all_html

    def _ConvertStringsToFile(self, html_strs, out_file):
        """Merge html_strs and write the mobi data to the file object
        out_file. Best effort: any failure is logged and swallowed, which
        can leave out_file empty or partially written."""
        # Local import: sys.exc_info() fetches the active exception without
        # version-specific "except" syntax.
        import sys
        try:
            tmp = self.MakeOneHTML(html_strs)
            self._ConvertStringToFile(tmp, out_file)
        except Exception:
            # exc_info=True keeps the traceback in the log.
            logging.error('Error %s', sys.exc_info()[1], exc_info=True)
            logging.debug('Details: %s', html_strs)

    def _ConvertStringToFile(self, html_data, out):
        """Clean html_data, split it into records and write the complete
        mobi file to the file object out."""
        html = HtmlProcessor(html_data)
        data = html.CleanHtml()
        title = html.title
        if title:
            self._header.SetTitle(title)
        # Text records are numbered from 1 and hold at most MAX_SIZE each.
        records = []
        for start_pos in range(0, len(data), Record.MAX_SIZE):
            record_data = data[start_pos:start_pos + Record.MAX_SIZE]
            records.append(
                self._header.AddRecord(record_data, len(records) + 1))
        # The first image record would follow the last text record.
        self._header.SetImageRecordIndex(len(records) + 1)
        # The MOBI header is record 0 and must be built after the text
        # records so that it sees their count and total length.
        records.insert(0, self._header.MobiHeader())
        header, rec_offset = self._header.PDBHeader(len(records))
        out.write(header)
        for record in records:
            record.WriteHeader(out, rec_offset)
            rec_offset += len(record.data)
        # Two nul bytes separate the record index from the record data;
        # PDBHeader() already counts them in the first record offset.
        out.write('\0\0')
        for record in records:
            record.WriteData(out)
class Record:
    """One record of the output file: a chunk of data plus the id that is
    written into its 8-byte index entry."""

    MAX_SIZE = 4096  # largest payload a single record may carry
    INDEX_LEN = 8  # size in bytes of one index entry (see WriteHeader)
    _unique_id_seed = 28  # should be arbitrary, but taken from MobiHeader

    # TODO(chatham): Record compression doesn't look that hard.
    def __init__(self, data, record_id):
        """data: the record payload, at most MAX_SIZE long.
        record_id: id written to the index; 0 is stored as 0 and also
        advances the class-wide id seed."""
        assert len(data) <= self.MAX_SIZE
        self.data = data
        if record_id != 0:
            self._id = record_id
        else:
            Record._unique_id_seed += 1
            self._id = 0

    def __repr__(self):
        return 'Record: id=%d len=%d' % (self._id, len(self.data))

    def _SetUniqueId(self):
        """Give this record the next id from the class-wide seed."""
        Record._unique_id_seed += 1
        # TODO(chatham): Wraparound crap
        self._id = Record._unique_id_seed

    def WriteData(self, out):
        """Write the payload to the file object out."""
        out.write(self.data)

    def WriteHeader(self, out, rec_offset):
        """Write this record's index entry to the file object out.

        The entry is a 4-byte offset (rec_offset), a 1-byte attribute
        field and a 3-byte id, all big-endian.
        """
        attributes = 64  # dirty?
        # Pack the id as a real 24-bit value: take a 4-byte big-endian
        # integer and drop its first byte. Packing a zero byte plus a
        # 16-bit id failed for ids above 0xFFFF; the output is identical
        # for smaller ids. Larger ids wrap at 24 bits.
        unique_id = struct.pack('>I', self._id & 0xFFFFFF)[1:]
        header = struct.pack('>IB', rec_offset, attributes) + unique_id
        assert len(header) == Record.INDEX_LEN
        out.write(header)
# Numeric EXTH record type for each metadata field that
# Header._GetExthHeader() can emit.
EXTH_HEADER_FIELDS = {
    'author': 100,
    'publisher': 101,
}
class Header:
    """Collects book metadata and record statistics and serialises the
    PDB, PalmDOC, EXTH and MOBI header structures.

    AddRecord() keeps a running record count and total length that are
    never reset by this class.
    """

    # Seconds from 1904-01-01 to 1970-01-01: (66 * 365 + 17) * 86400.
    EPOCH_1904 = 2082844800

    def __init__(self):
        self._length = 0  # total bytes of all records added so far
        self._record_count = 0  # number of records added so far
        self._title = '2008_2_34'
        self._author = 'Unknown author'
        self._publisher = 'Unknown publisher'
        self._first_image_index = 0

    def SetAuthor(self, author):
        """Set the author written into the EXTH header."""
        self._author = author

    def SetTitle(self, title):
        """Set the book title, stored as an ASCII byte string.

        Characters outside ASCII are replaced with '?' instead of raising
        UnicodeError, so a title such as u'Caf\\xe9' no longer aborts the
        whole conversion.
        """
        # TODO(chatham): Reevaluate whether this needs to be ASCII.
        # maybe just do sys.setdefaultencoding('utf-8')?  Problems
        # appending self._title with other things.
        text_type = type(u'')
        if not isinstance(title, text_type):
            # Byte string: decode leniently first, otherwise the implicit
            # ASCII decode inside encode() raises on non-ASCII bytes.
            title = title.decode('utf-8', 'replace')
        self._title = title.encode('ascii', 'replace')

    def SetPublisher(self, publisher):
        """Set the publisher written into the EXTH header."""
        self._publisher = publisher

    def AddRecord(self, data, record_id):
        """Account for a new record holding data and return it as a Record."""
        self.max_record_size = max(Record.MAX_SIZE, len(data))
        self._record_count += 1
        self._length += len(data)
        return Record(data, record_id)

    def _ReplaceWord(self, data, pos, word):
        """Return data with the 4 bytes at pos replaced by word packed as
        a big-endian unsigned 32-bit integer."""
        return data[:pos] + struct.pack('>I', word) + data[pos+4:]

    def PalmDocHeader(self):
        """Return the 16-byte PalmDOC header describing the records added
        so far (text length, record count, record size)."""
        compression = 1  # no compression
        unused = 0
        encryption_type = 0  # no encryption
        records = self._record_count + 1  # the header record itself
        palmdoc_header = struct.pack('>HHIHHHH',
                                     compression,
                                     unused,
                                     self._length,
                                     records,
                                     Record.MAX_SIZE,
                                     encryption_type,
                                     unused)
        assert len(palmdoc_header) == 16
        return palmdoc_header

    def PDBHeader(self, num_records):
        """Build the PDB file header for a file with num_records records.

        Returns (header, rec_offset): the header bytes, and the file
        offset at which the first record's data starts (after the header,
        the record index and the 2-byte gap the caller writes).
        """
        HEADER_LEN = 32 + 2 + 2 + 9 * 4
        RECORD_INDEX_HEADER_LEN = 6
        index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN
        rec_offset = HEADER_LEN + index_len + 2

        short_title = self._title[0:31]
        attributes = 0
        version = 0
        # Sample the clock once so the three timestamps always agree.
        now = self.EPOCH_1904 + int(time.time())
        ctime = mtime = backup_time = now
        modnum = 0
        appinfo_offset = 0
        sort_offset = 0
        db_type = 'BOOK'
        creator = 'MOBI'
        id_seed = 36
        header = struct.pack('>32sHHII',
                             short_title, attributes, version,
                             ctime, mtime)
        header += struct.pack('>IIII', backup_time, modnum,
                              appinfo_offset, sort_offset)
        header += struct.pack('>4s4sI',
                              db_type, creator, id_seed)
        next_record = 0  # not used?
        header += struct.pack('>IH', next_record, num_records)
        return header, rec_offset

    def _GetExthHeader(self):
        """Return the EXTH block carrying the author and publisher."""
        # They set author, publisher, coveroffset, thumboffset
        data = {'author' : self._author,
                'publisher' : self._publisher,
                }
        # Turn string type names into EXTH typeids.
        r = []
        for key, value in data.items():
            typeid = EXTH_HEADER_FIELDS[key]
            # Each entry's length covers its own 8-byte (type, length)
            # prefix as well as the value.
            length_encoding_len = 8
            r.append(struct.pack('>LL', typeid,
                                 len(value) + length_encoding_len) + value)
        content = ''.join(r)

        # Pad to word boundary
        while len(content) % 4:
            content += '\0'
        # 12 is the size of what precedes the entries: the 4-byte 'EXTH'
        # tag plus the two 4-byte fields packed below.
        TODO_mysterious = 12
        exth = ('EXTH' +
                struct.pack('>LL', len(content) + TODO_mysterious, len(data)) +
                content)
        return exth

    def SetImageRecordIndex(self, idx):
        """Set the record index written as the first image record."""
        self._first_image_index = idx

    def MobiHeader(self):
        """Build record 0 (PalmDOC header, MOBI header, EXTH block, title
        and padding), register it through AddRecord and return the Record.

        Must be called after all text records have been added, because
        the PalmDOC header captures the running count and length.
        """
        exth_header = self._GetExthHeader()
        palmdoc_header = self.PalmDocHeader()

        fs = 0xffffffff

        # Record 0
        header_len = 0xE4  # TODO
        mobi_type = 2  # BOOK
        text_encoding = encoding['UTF-8']
        # randint() includes both ends, and 1 << 32 does not fit the
        # unsigned 32-bit field, so stop one short.
        unique_id = random.randint(1, (1 << 32) - 1)
        creator_version = 4
        reserved = '%c' % 0xff * 40
        nonbook_index = fs
        # put full name after header
        full_name_offset = header_len + len(palmdoc_header) + len(exth_header)
        language = languages['en-us']
        unused = 0
        mobi_header = struct.pack('>4sIIIII40sIIIIII',
                                  'MOBI',
                                  header_len,
                                  mobi_type,
                                  text_encoding,
                                  unique_id,
                                  creator_version,
                                  reserved,
                                  nonbook_index,
                                  full_name_offset,
                                  len(self._title),
                                  language,
                                  fs, fs)
        assert len(mobi_header) == 104 - 16

        exth_flags = 0x50
        mobi_header += struct.pack('>IIIIIII',
                                   creator_version,
                                   self._first_image_index,
                                   fs,
                                   unused,
                                   fs,
                                   unused,
                                   exth_flags)
        mobi_header += '\0' * 112  # TODO: Why this much padding?
        # Set some magic offsets to be 0xFFFFFFFF.
        for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc):
            mobi_header = self._ReplaceWord(mobi_header, pos, fs)

        # 16 bytes?
        padding = '\0' * 48 * 4  # why?
        total_header = (palmdoc_header + mobi_header + exth_header +
                        self._title + padding)

        return self.AddRecord(total_header, 0)
if __name__ == '__main__':
    # Command-line use: merge every HTML file named on the command line
    # into a single book at a fixed scratch location.
    import sys
    input_paths = sys.argv[1:]
    m = Converter()
    m.ConvertFiles(input_paths, '/tmp/test.mobi')

View file

@ -21,6 +21,7 @@ import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import mobi
import zipdir
import html_constants
from constants import *
@ -81,6 +82,68 @@ class TextWriter(FanficWriter):
self.output.close()
class MobiWriter(FanficWriter):
    '''Writes a story as a .mobi book.

    Chapters are collected as HTML in self.body; finalise() wraps them in
    the XHTML template, converts the page with mobi.Converter and writes
    the result to self.output (a file, or a StringIO when inmemory).
    '''

    body = ''

    @staticmethod
    def getFormatName():
        return 'mobi'

    @staticmethod
    def getFormatExt():
        return '.mobi'

    def __init__(self, base, adapter, inmemory=False, compress=False):
        '''base: directory the book is written into.
        adapter: source of the story name, author name and output name.
        inmemory: write into a StringIO instead of a file on disk.
        compress: accepted for signature parity with the other writers;
                  not used here.
        '''
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
        self.authorName = removeEntities(adapter.getAuthorName())

        self.adapter = adapter
        # Kept as before: this stores the imported mobi module itself.
        self.mobi = mobi
        self.inmemory = inmemory

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            # The book is binary data: text mode would corrupt it on
            # platforms that translate line endings.
            self.output = open(self.fileName, 'wb')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def _printableVersion(self, text):
        '''Return text decoded from UTF-8, or text unchanged when it
        cannot be decoded.'''
        try:
            return text.decode('utf-8')
        except (UnicodeError, AttributeError):
            # Already-decoded or undecodable input is passed through. A
            # bare except here would also swallow KeyboardInterrupt and
            # SystemExit.
            return text

    def writeChapter(self, index, title, text):
        '''Append a chapter heading and its text to the book body.'''
        title = self._printableVersion(title)
        text = self._printableVersion(text)
        self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
        self.body = self.body + '\n' + text

    def finalise(self):
        '''Render the collected chapters, convert them to mobi and write
        the result. The output is closed unless it is in memory.'''
        html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
        soup = bs.BeautifulSoup(html)
        result = soup.__str__('utf8')

        c = mobi.Converter()
        mobidata = c.ConvertString(result)

        self.output.write(mobidata)
        if not self.inmemory:
            self.output.close()
class HTMLWriter(FanficWriter):
body = ''
@ -92,14 +155,14 @@ class HTMLWriter(FanficWriter):
def getFormatExt():
return '.html'
def __init__(self, base, adapter, inmemory=False, compress=False):
def __init__(self, base, adapter, inmemory=False, compress=False, mobi = False):
self.basePath = base
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
self.authorName = removeEntities(adapter.getAuthorName())
self.adapter = adapter
self.mobi = mobi
self.inmemory = inmemory
if not self.inmemory and os.path.exists(self.fileName):