Adding Mobi format and making final configuration changes before uploading a beta version.

2025-12-06 08:52:55 +01:00 · 2010-11-23 07:15:18 +00:00 · 2010-11-23 07:15:18 +00:00 · ebcce7e42d
commit ebcce7e42d
parent cf0d1bf09b
5 changed files with 533 additions and 9 deletions
--- a/app.yaml
+++ b/app.yaml
@ -1,13 +1,9 @@
 application: fanfictionloader
-version: 2-5-5
+version: 2-6-beta
 runtime: python
 api_version: 1

 handlers:
- url: /generate_mock_data
-  script: mocks/generate_mock_data.py
-  login: admin
-
 - url: /r3m0v3r
  script: utils/remover.py
  login: admin
@ -25,7 +21,5 @@ handlers:
 - url: /static
  static_dir: static

-
 - url: /.*
  script: main.py
- 
--- a/fanficdownloader/downloader.py
+++ b/fanficdownloader/downloader.py
@ -160,6 +160,8 @@ if __name__ == '__main__':
 		writerClass = output.EPubFanficWriter
 	elif bookFormat == 'html':
 		writerClass = output.HTMLWriter
+	elif bookFormat == 'mobi':
+		writerClass = output.MobiWriter
 	elif bookFormat == 'text':
 		writerClass = output.TextWriter
 	
--- a/fanficdownloader/html.py
+++ b/fanficdownloader/html.py
@ -0,0 +1,121 @@
+#!/usr/bin/python
+# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
+
+import re
+import sys
+import StringIO
+import urllib
+
+from BeautifulSoup import BeautifulSoup
+
+class HtmlProcessor:
+  WHITESPACE_RE = re.compile(r'\s')
+  # Look for </blockquote  <p>
+  BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)
+
+  def __init__(self, html, unfill=0):
+    self.unfill = unfill
+    html = self._ProcessRawHtml(html)
+    self._soup = BeautifulSoup(html)
+    if self._soup.title:
+      self.title = self._soup.title.contents[0]
+    else:
+      self.title = None
+
+  def _ProcessRawHtml(self, html):
+    new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
+    if count:
+      print >>sys.stderr, 'Replaced %d bad tags' % count
+    return new_html
+
+  def _StubInternalAnchors(self):
+    '''Replace each internal anchor with a fixed-size filepos anchor.
+
+    Looks for every anchor with <a href="#myanchor"> and replaces that
+    with <a filepos="0000000050">. Stores anchors in self._anchor_references'''
+    self._anchor_references = []
+    anchor_num = 0
+    for anchor in self._soup.findAll('a', href=re.compile('^#')):
+      self._anchor_references.append((anchor_num, anchor['href']))
+      del anchor['href']
+      anchor['filepos'] = '%.10d' % anchor_num
+      anchor_num += 1
+
+  def _ReplaceAnchorStubs(self):
+    # TODO: Browsers allow extra whitespace in the href names.
+    assembled_text = self._soup.prettify()
+    del self._soup # shouldn't touch this anymore
+    for anchor_num, original_ref in self._anchor_references:
+      ref = urllib.unquote(original_ref[1:]) # remove leading '#'
+      # Find the position of ref in the utf-8 document.
+      # TODO(chatham): Using regexes and looking for name= would be better.
+      newpos = assembled_text.rfind(ref.encode('utf-8'))
+      if newpos == -1:
+        print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
+        continue
+      newpos += len(ref) + 2  # don't point into the middle of the <a name> tag
+      old_filepos = 'filepos="%.10d"' % anchor_num
+      new_filepos = 'filepos="%.10d"' % newpos
+      assert assembled_text.find(old_filepos) != -1
+      assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
+    return assembled_text
+
+  def _FixPreTags(self):
+    '''Replace <pre> tags with HTML-ified text.'''
+    pres = self._soup.findAll('pre')
+    for pre in pres:
+      pre.replaceWith(self._FixPreContents(str(pre.contents[0])))
+
+  def _FixPreContents(self, text):
+    if self.unfill:
+      line_splitter = '\n\n'
+      line_joiner = '<p>'
+    else:
+      line_splitter = '\n'
+      line_joiner = '<br>'
+    lines = []
+    for line in text.split(line_splitter):
+      lines.append(self.WHITESPACE_RE.subn('&nbsp;', line)[0])
+    return line_joiner.join(lines)
+
+  def _RemoveUnsupported(self):
+    '''Remove any tags which the kindle cannot handle.'''
+    # TODO(chatham): <link> tags to script?
+    unsupported_tags = ('script', 'style')
+    for tag_type in unsupported_tags:
+      for element in self._soup.findAll(tag_type):
+        element.extract()
+
+  def RenameAnchors(self, prefix):
+    '''Rename every internal anchor to have the given prefix, then
+    return the contents of the body tag.'''
+    for anchor in self._soup.findAll('a', href=re.compile('^#')):
+      anchor['href'] = '#' + prefix + anchor['href'][1:]
+    for a in self._soup.findAll('a'):
+      if a.get('name'):
+        a['name'] = prefix + a['name']
+
+    # TODO(chatham): figure out how to fix this. sometimes body comes out
+    # as NoneType.
+    content = []
+    if self._soup.body is not None:
+      content = [unicode(c) for c in self._soup.body.contents]
+    return '\n'.join(content)
+
+  def CleanHtml(self):
+    # TODO(chatham): fix_html_br, fix_html
+    self._RemoveUnsupported()
+    self._StubInternalAnchors()
+    self._FixPreTags()
+    return self._ReplaceAnchorStubs()
+
+
+if __name__ == '__main__':
+  FILE ='/tmp/documentation.html'
+  #FILE = '/tmp/multipre.html'
+  FILE = '/tmp/view.html'
+  import codecs
+  d = open(FILE).read()
+  h = HtmlProcessor(d)
+  s = h.CleanHtml()
+  #print s
--- a/fanficdownloader/mobi.py
+++ b/fanficdownloader/mobi.py
@ -0,0 +1,344 @@
+#!/usr/bin/python
+# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan
+
+    
+import StringIO
+import struct
+import time
+import random
+import logging
+
+from html import HtmlProcessor
+
+# http://wiki.mobileread.com/wiki/MOBI
+# http://membres.lycos.fr/microfirst/palm/pdb.html
+
+encoding = {
+  'UTF-8' : 65001,
+  'latin-1' : 1252,
+}
+
+languages = {"en-us" : 0x0409,
+             "sv"    : 0x041d,
+             "fi"    : 0x000b,
+             "en"    : 0x0009,
+             "en-gb" : 0x0809}
+
+def ToHex(s):
+  v = ['%.2x' % ord(c) for c in s]
+  return ' '.join(v)
+
+class _SubEntry:
+  def __init__(self, pos, html_data):
+    self.pos = pos
+    self.html = HtmlProcessor(html_data)
+    self.title = self.html.title
+    self._name = 'mobi_article_%d' % pos
+    if not self.title:
+      self.title = 'Article %d' % self.pos
+
+  def TocLink(self):
+    return '<a href="#%s_MOBI_START">%.80s</a>' % (self._name, self.title)
+  
+  def Anchor(self):
+    return '<a name="%s_MOBI_START">' % self._name
+
+  def Body(self):
+    return self.html.RenameAnchors(self._name + '_')
+
+class Converter:
+  def __init__(self, refresh_url=''):
+    self._header = Header()
+    self._refresh_url = refresh_url
+
+  def ConvertString(self, s):
+    out = StringIO.StringIO()
+    self._ConvertStringToFile(s, out)
+    return out.getvalue()
+
+  def ConvertStrings(self, html_strs):
+    out = StringIO.StringIO()
+    self._ConvertStringsToFile(html_strs, out)
+    return out.getvalue()
+
+  def ConvertFile(self, html_file, out_file):
+    self._ConvertStringToFile(open(html_file).read(),
+                              open(out_file, 'w'))
+
+  def ConvertFiles(self, html_files, out_file):
+    html_strs = [open(f).read() for f in html_files]
+    self._ConvertStringsToFile(html_strs, open(out_file, 'w'))
+
+  def MakeOneHTML(self, html_strs):
+    """This takes a list of HTML strings and returns a big HTML file with
+    all contents consolidated.  It constructs a table of contents and adds
+    anchors within the text
+    """
+    toc_html = []
+    if self._refresh_url:
+      toc_html.append('<a href="%s">Update Reading List</a><br>' %
+                      self._refresh_url)
+    body_html = []
+    titles = []
+
+    PAGE_BREAK = '<mdb;pagebreak>'
+    for pos, html in enumerate(html_strs):
+      entry = _SubEntry(pos+1, html)
+      titles.append(entry.title[:10])
+      toc_html.append('%s<br>' % entry.TocLink())
+
+      # give some space between bodies of work.
+      body_html.append(PAGE_BREAK)
+      body_html.append(entry.Anchor())
+      
+      body_html.append('<h1>%s</h1>' % entry.title)
+      body_html.append(entry.Body())
+      
+    # TODO: this title can get way too long with RSS feeds. Not sure how to fix
+    header = '<html><title>Bibliorize %s GMT</title><body>' % time.ctime(
+      time.time())
+
+    footer = '</body></html>'
+    all_html = header + '\n'.join(toc_html + body_html) + footer
+    return all_html
+
+  def _ConvertStringsToFile(self, html_strs, out_file):
+    try:
+      tmp = self.MakeOneHTML(html_strs)
+      self._ConvertStringToFile(tmp, out_file)
+    except Exception, e:
+      logging.error('Error %s', e)
+      logging.debug('Details: %s' % html_strs)
+
+  def _ConvertStringToFile(self, html_data, out):
+    html = HtmlProcessor(html_data)
+    data = html.CleanHtml()
+    records = []
+    title = html.title
+    if title:
+      self._header.SetTitle(title)
+    record_id = 1
+    for start_pos in range(0, len(data), Record.MAX_SIZE):
+      end = min(len(data), start_pos + Record.MAX_SIZE)
+      record_data = data[start_pos:end]
+      records.append(self._header.AddRecord(record_data, record_id))
+      record_id += 1
+    self._header.SetImageRecordIndex(record_id)
+    records[0:0] = [self._header.MobiHeader()]
+
+    header, rec_offset = self._header.PDBHeader(len(records))
+    out.write(header)
+    for record in records:
+      record.WriteHeader(out, rec_offset)
+      rec_offset += len(record.data)
+
+    # Write two NUL bytes before the record data (the "+ 2" in PDBHeader's rec_offset).
+    out.write('\0\0')
+    for record in records:
+      record.WriteData(out)
+
+class Record:
+  MAX_SIZE = 4096
+  INDEX_LEN = 8
+  _unique_id_seed = 28  # should be arbitrary, but taken from MobiHeader
+
+  # TODO(chatham): Record compression doesn't look that hard.
+
+  def __init__(self, data, record_id):
+    assert len(data) <= self.MAX_SIZE
+    self.data = data
+    if record_id != 0:
+      self._id = record_id
+    else:
+      Record._unique_id_seed += 1
+      self._id = 0
+
+  def __repr__(self):
+    return 'Record: id=%d len=%d' % (self._id, len(self.data))
+
+  def _SetUniqueId(self):
+    Record._unique_id_seed += 1
+    # TODO(chatham): Wraparound crap
+    self._id = Record._unique_id_seed
+
+  def WriteData(self, out):
+    out.write(self.data)
+
+  def WriteHeader(self, out, rec_offset):
+    attributes =  64 # dirty?
+    header = struct.pack('>IbbH',
+                         rec_offset,
+                         attributes,
+                         0, self._id)
+    assert len(header) == Record.INDEX_LEN
+    out.write(header)
+
+EXTH_HEADER_FIELDS = {
+  'author' : 100,
+  'publisher' : 101,
+}
+
+class Header:
+  EPOCH_1904 = 2082844800
+
+  def __init__(self):
+    self._length = 0
+    self._record_count = 0
+    self._title = '2008_2_34'
+    self._author = 'Unknown author'
+    self._publisher = 'Unknown publisher'
+    self._first_image_index = 0
+
+  def SetAuthor(self, author):
+    self._author = author
+
+  def SetTitle(self, title):
+    # TODO(chatham): Reevaluate whether this needs to be ASCII.
+    # maybe just do sys.setdefaultencoding('utf-8')? Problems
+    # appending self._title with other things.
+    self._title = title.encode('ascii')
+
+  def SetPublisher(self, publisher):
+    self._publisher = publisher
+
+  def AddRecord(self, data, record_id):
+    self.max_record_size = max(Record.MAX_SIZE, len(data))
+    self._record_count += 1
+    self._length += len(data)
+    return Record(data, record_id)
+
+  def _ReplaceWord(self, data, pos, word):
+    return data[:pos] + struct.pack('>I', word) + data[pos+4:]
+
+  def PalmDocHeader(self):
+    compression = 1  # no compression
+    unused = 0
+    encryption_type = 0  # no encryption
+    records = self._record_count + 1  # the header record itself
+    palmdoc_header = struct.pack('>HHIHHHH',
+                                 compression,
+                                 unused,
+                                 self._length,
+                                 records,
+                                 Record.MAX_SIZE,
+                                 encryption_type,
+                                 unused)
+    assert len(palmdoc_header) == 16
+    return palmdoc_header
+
+  def PDBHeader(self, num_records):
+    HEADER_LEN = 32+2+2+9*4
+    RECORD_INDEX_HEADER_LEN = 6
+    RESOURCE_INDEX_LEN = 10
+
+    index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN
+    rec_offset = HEADER_LEN + index_len + 2
+
+    short_title = self._title[0:31]
+    attributes = 0
+    version = 0
+    ctime = self.EPOCH_1904 + int(time.time())
+    mtime = self.EPOCH_1904 + int(time.time())
+    backup_time = self.EPOCH_1904 + int(time.time())
+    modnum = 0
+    appinfo_offset = 0
+    sort_offset = 0
+    type = 'BOOK'
+    creator = 'MOBI'
+    id_seed = 36
+    header = struct.pack('>32sHHII',
+                         short_title, attributes, version,
+                         ctime, mtime)
+    header += struct.pack('>IIII', backup_time, modnum,
+                         appinfo_offset, sort_offset)
+    header += struct.pack('>4s4sI',
+                         type, creator, id_seed)
+    next_record = 0  # not used?
+    header += struct.pack('>IH', next_record, num_records)
+    return header, rec_offset
+
+  def _GetExthHeader(self):
+    # They set author, publisher, coveroffset, thumboffset
+    data = {'author' : self._author,
+            'publisher' : self._publisher,
+            }
+    # Turn string type names into EXTH typeids.
+    r = []
+    for key, value in data.items():
+      typeid = EXTH_HEADER_FIELDS[key]
+      length_encoding_len = 8
+      r.append(struct.pack('>LL', typeid, len(value) + length_encoding_len,) + value)
+    content = ''.join(r)
+
+    # Pad to word boundary
+    while len(content) % 4:
+      content += '\0'
+    TODO_mysterious = 12
+    exth = 'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content
+    return exth
+
+  def SetImageRecordIndex(self, idx):
+    self._first_image_index = idx
+
+  def MobiHeader(self):
+    exth_header = self._GetExthHeader();
+    palmdoc_header = self.PalmDocHeader()
+
+    fs = 0xffffffff
+
+    # Record 0
+    header_len = 0xE4 # TODO
+    mobi_type = 2 # BOOK
+    text_encoding = encoding['UTF-8']
+    unique_id = random.randint(1, 1<<32)
+    creator_version = 4
+    reserved = '%c' % 0xff * 40
+    nonbook_index = fs
+    full_name_offset = header_len + len(palmdoc_header) + len(exth_header) # put full name after header
+    language = languages['en-us']
+    unused = 0
+    mobi_header = struct.pack('>4sIIIII40sIIIIII',
+                              'MOBI',
+                              header_len,
+                              mobi_type,
+                              text_encoding,
+                              unique_id,
+                              creator_version,
+                              reserved,
+                              nonbook_index,
+                              full_name_offset,
+                              len(self._title),
+                              language,
+                              fs, fs)
+    assert len(mobi_header) == 104 - 16
+
+    unknown_fields = chr(0) * 32
+    drm_offset = 0
+    drm_count = 0
+    drm_size = 0
+    drm_flags = 0
+    exth_flags = 0x50
+    header_end = chr(0) * 64
+    mobi_header += struct.pack('>IIIIIII',
+                               creator_version,
+                               self._first_image_index,
+                               fs,
+                               unused,
+                               fs,
+                               unused,
+                               exth_flags)
+    mobi_header += '\0' * 112 # TODO: Why this much padding?
+    # Set some magic offsets to be 0xFFFFFFFF.
+    for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc):
+      mobi_header = self._ReplaceWord(mobi_header, pos, fs)
+
+    # 192 bytes (48 * 4) of NUL padding appended after the title.
+    padding = '\0' * 48 * 4 # why?
+    total_header = palmdoc_header + mobi_header + exth_header + self._title + padding
+
+    return self.AddRecord(total_header, 0)
+
+if __name__ == '__main__':
+  import sys
+  m = Converter()
+  m.ConvertFiles(sys.argv[1:], '/tmp/test.mobi')
--- a/fanficdownloader/output.py
+++ b/fanficdownloader/output.py
@ -21,6 +21,7 @@ import urlparse as up
 import BeautifulSoup as bs
 import htmlentitydefs as hdefs

+import mobi
 import zipdir
 import html_constants
 from constants import *
@ -81,6 +82,68 @@ class TextWriter(FanficWriter):
 			self.output.close()
 		

+class MobiWriter(FanficWriter):
+	body = ''
+
+	@staticmethod
+	def getFormatName():
+		return 'mobi'
+
+	@staticmethod	
+	def getFormatExt():
+		return '.mobi'
+
+	def __init__(self, base, adapter, inmemory=False, compress=False):
+		self.basePath = base
+		self.storyTitle = removeEntities(adapter.getStoryName())
+		self.name = makeAcceptableFilename(adapter.getOutputName())
+		self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
+		self.authorName = removeEntities(adapter.getAuthorName())
+		self.adapter = adapter
+		self.mobi = mobi
+		self.inmemory = inmemory
+
+		if not self.inmemory and os.path.exists(self.fileName):
+			os.remove(self.fileName)
+
+		if self.inmemory:
+			self.output = StringIO.StringIO()
+		else:
+			self.output = open(self.fileName, 'w')
+
+		self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
+		self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)
+
+	def _printableVersion(self, text):
+		try:
+			d = text.decode('utf-8')
+			return d
+		except:
+			return text
+
+	def writeChapter(self, index, title, text):
+		title = self._printableVersion(title) #title.decode('utf-8')
+		text = self._printableVersion(text) #text.decode('utf-8')
+		self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
+		self.body = self.body + '\n' + text
+
+	def finalise(self):
+		html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
+		soup = bs.BeautifulSoup(html)
+		result = soup.__str__('utf8')
+
+#		f = open(self.fileName, 'w')
+#		f.write(result)
+#		f.close()
+
+		c = mobi.Converter()
+		mobidata = c.ConvertString(result)
+
+		self.output.write(mobidata)
+		if not self.inmemory:
+			self.output.close()
+
+
 class HTMLWriter(FanficWriter):
 	body = ''
 	
@ -92,14 +155,14 @@ class HTMLWriter(FanficWriter):
 	def getFormatExt():
 		return '.html'
 	
-	def __init__(self, base, adapter, inmemory=False, compress=False):
+	def __init__(self, base, adapter, inmemory=False, compress=False, mobi = False):
 		self.basePath = base
 		self.storyTitle = removeEntities(adapter.getStoryName())
 		self.name = makeAcceptableFilename(adapter.getOutputName())
 		self.fileName = self.basePath + '/' + self.name + self.getFormatExt()
 		self.authorName = removeEntities(adapter.getAuthorName())
 		self.adapter = adapter
-		
+		self.mobi = mobi
 		self.inmemory = inmemory

 		if not self.inmemory and os.path.exists(self.fileName):