4-0-1 Add mobi back in, changes to recent & status appearance, more reliability.

Jim Miller 2011-05-30 23:02:00 -05:00
parent f7857ad6dc
commit 50b80b6d68
12 changed files with 803 additions and 64 deletions

@@ -1,6 +1,6 @@
# fanfictionloader ffd-retief
application: fanfictionloader
version: 4-0-0
version: 4-0-1
runtime: python
api_version: 1

@@ -13,7 +13,7 @@ body
#greeting
{
margin-bottom: 1em;
# margin-bottom: 1em;
border-color: #efefef;
}
@@ -66,6 +66,8 @@ div.field
#error
{
font-size: small;
color: #f00;
}
.recent {
font-size: large;
}

@@ -108,7 +108,7 @@ extratags: FanFiction
[txt]
## Add URLs since there aren't links.
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, author URL, description
titlepage_entries: category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
## use \r\n for line endings, the windows convention. text output only.
windows_eol: true
@@ -132,6 +132,10 @@ titlepage_use_table: false
## When using tables, make these span both columns.
wide_titlepage_entries: description, storyUrl, author URL
[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true
## Each site has a section that overrides [defaults] *and* the format
## sections. test1.com specifically is not a real story site. Instead,
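
The two comment lines above describe the lookup order (a site section overrides both [defaults] and the per-format sections such as [txt] and [mobi]) without showing how it is resolved. As a rough illustration only -- this is not the downloader's actual configuration class, and the class name, helper and usage lines are invented -- the resolution could be sketched with a plain RawConfigParser:

# -*- coding: utf-8 -*-
# Minimal sketch of the [defaults] / [format] / [site] override order
# described above.  Not the project's real config code.
from ConfigParser import RawConfigParser

class LayeredConfig(object):
    def __init__(self, inifile, site, format):
        self.parser = RawConfigParser()
        self.parser.read(inifile)
        # most-specific section wins: site, then format, then defaults.
        self.sections = [site, format, 'defaults']

    def get(self, key, default=''):
        for section in self.sections:
            if self.parser.has_section(section) and self.parser.has_option(section, key):
                return self.parser.get(section, key)
        return default

# usage (hypothetical): a mobi download from test1.com would look up keys
# in [test1.com], then [mobi], then [defaults].
# config = LayeredConfig('defaults.ini', 'test1.com', 'mobi')
# print config.get('titlepage_entries')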

@@ -16,6 +16,7 @@
#
import datetime
import time
import logging
import fanficdownloader.BeautifulSoup as bs
@@ -105,6 +106,10 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
if self.story.getMetadata('storyId') == '667':
raise exceptions.FailedToDownload("Error downloading Chapter: %s!" % url)
if self.story.getMetadata('storyId') == '670' and self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))
if "chapter=1" in url :
text=u'''
<div>
@@ -116,6 +121,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
<p>http://test1.com?sid=667 - raises FailedToDownload on chapter 1</p>
<p>http://test1.com?sid=668 - raises FailedToLogin unless username='Me'</p>
<p>http://test1.com?sid=669 - Succeeds with Updated Date=now</p>
<p>http://test1.com?sid=670 - Succeeds, but applies slow_down_sleep_time</p>
<p>And other storyId will succeed with the same output.</p>
</div>
'''

fanficdownloader/html.py (new file, 126 lines)

@@ -0,0 +1,126 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan

import re
import sys
import StringIO
import urllib

from BeautifulSoup import BeautifulSoup

class HtmlProcessor:
    WHITESPACE_RE = re.compile(r'\s')
    # Look for </blockquote <p>
    BAD_TAG_RE = re.compile(r'<[^>]+<', re.MULTILINE)

    def __init__(self, html, unfill=0):
        self.unfill = unfill
        html = self._ProcessRawHtml(html)
        self._soup = BeautifulSoup(html)
        if self._soup.title:
            self.title = self._soup.title.contents[0]
        else:
            self.title = None

    def _ProcessRawHtml(self, html):
        new_html, count = HtmlProcessor.BAD_TAG_RE.subn('<', html)
        if count:
            print >>sys.stderr, 'Replaced %d bad tags' % count
        return new_html

    def _StubInternalAnchors(self):
        '''Replace each internal anchor with a fixed-size filepos anchor.
        Looks for every anchor with <a href="#myanchor"> and replaces that
        with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
        self._anchor_references = []
        anchor_num = 0
        # anchor links
        anchorlist = self._soup.findAll('a', href=re.compile('^#'))
        # treat reference tags like a tags for TOCTOP.
        anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
        for anchor in anchorlist:
            self._anchor_references.append((anchor_num, anchor['href']))
            del anchor['href']
            anchor['filepos'] = '%.10d' % anchor_num
            anchor_num += 1

    def _ReplaceAnchorStubs(self):
        # TODO: Browsers allow extra whitespace in the href names.
        # use __str__ instead of prettify--it inserts extra spaces.
        assembled_text = self._soup.__str__('utf8')
        del self._soup # shouldn't touch this anymore
        for anchor_num, original_ref in self._anchor_references:
            ref = urllib.unquote(original_ref[1:]) # remove leading '#'
            # Find the position of ref in the utf-8 document.
            # TODO(chatham): Using regexes and looking for name= would be better.
            newpos = assembled_text.rfind(ref.encode('utf-8'))
            if newpos == -1:
                print >>sys.stderr, 'Could not find anchor "%s"' % original_ref
                continue
            newpos += len(ref) + 2 # don't point into the middle of the <a name> tag
            old_filepos = 'filepos="%.10d"' % anchor_num
            new_filepos = 'filepos="%.10d"' % newpos
            assert assembled_text.find(old_filepos) != -1
            assembled_text = assembled_text.replace(old_filepos, new_filepos, 1)
        return assembled_text

    def _FixPreTags(self):
        '''Replace <pre> tags with HTML-ified text.'''
        pres = self._soup.findAll('pre')
        for pre in pres:
            pre.replaceWith(self._FixPreContents(str(pre.contents[0])))

    def _FixPreContents(self, text):
        if self.unfill:
            line_splitter = '\n\n'
            line_joiner = '<p>'
        else:
            line_splitter = '\n'
            line_joiner = '<br>'
        lines = []
        for line in text.split(line_splitter):
            lines.append(self.WHITESPACE_RE.subn('&nbsp;', line)[0])
        return line_joiner.join(lines)

    def _RemoveUnsupported(self):
        '''Remove any tags which the kindle cannot handle.'''
        # TODO(chatham): <link> tags to script?
        unsupported_tags = ('script', 'style')
        for tag_type in unsupported_tags:
            for element in self._soup.findAll(tag_type):
                element.extract()

    def RenameAnchors(self, prefix):
        '''Rename every internal anchor to have the given prefix, then
        return the contents of the body tag.'''
        for anchor in self._soup.findAll('a', href=re.compile('^#')):
            anchor['href'] = '#' + prefix + anchor['href'][1:]
        for a in self._soup.findAll('a'):
            if a.get('name'):
                a['name'] = prefix + a['name']
        # TODO(chatham): figure out how to fix this. sometimes body comes out
        # as NoneType.
        content = []
        if self._soup.body is not None:
            content = [unicode(c) for c in self._soup.body.contents]
        return '\n'.join(content)

    def CleanHtml(self):
        # TODO(chatham): fix_html_br, fix_html
        self._RemoveUnsupported()
        self._StubInternalAnchors()
        self._FixPreTags()
        return self._ReplaceAnchorStubs()

if __name__ == '__main__':
    FILE = '/tmp/documentation.html'
    #FILE = '/tmp/multipre.html'
    FILE = '/tmp/view.html'
    import codecs
    d = open(FILE).read()
    h = HtmlProcessor(d)
    s = h.CleanHtml()
    #print s
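
As a quick way to see what _StubInternalAnchors and _ReplaceAnchorStubs accomplish, the following sketch runs CleanHtml() over a made-up document containing one internal link. Only HtmlProcessor itself comes from the file above; the sample HTML is invented, and it assumes the script is run from the directory containing html.py:

# Illustration only: the internal <a href="#ch1"> link comes out of
# CleanHtml() carrying a 10-digit filepos attribute that points at the
# byte offset of <a name="ch1">, which is the Mobipocket convention.
from html import HtmlProcessor   # the new fanficdownloader/html.py module

sample = '''<html><head><title>Anchor demo</title></head>
<body>
<p><a href="#ch1">Jump to chapter 1</a></p>
<pre>keep   spacing</pre>
<a name="ch1"><h3>Chapter 1</h3>
</body></html>'''

processor = HtmlProcessor(sample)
cleaned = processor.CleanHtml()
# The href is gone, a filepos="00000000xx" attribute has replaced it, and
# whitespace inside the <pre> block has been preserved as &nbsp; entities.
print cleaned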

fanficdownloader/mobi.py (new file, 384 lines)

@@ -0,0 +1,384 @@
#!/usr/bin/python
# Copyright(c) 2009 Andrew Chatham and Vijay Pandurangan

import StringIO
import struct
import time
import random
import logging

from html import HtmlProcessor

# http://wiki.mobileread.com/wiki/MOBI
# http://membres.lycos.fr/microfirst/palm/pdb.html

encoding = {
    'UTF-8' : 65001,
    'latin-1' : 1252,
}

languages = {"en-us" : 0x0409,
             "sv" : 0x041d,
             "fi" : 0x000b,
             "en" : 0x0009,
             "en-gb" : 0x0809}

def ToHex(s):
    v = ['%.2x' % ord(c) for c in s]
    return ' '.join(v)

class _SubEntry:
    def __init__(self, pos, html_data):
        self.pos = pos
        self.html = HtmlProcessor(html_data)
        self.title = self.html.title
        self._name = 'mobi_article_%d' % pos
        if not self.title:
            self.title = 'Article %d' % self.pos

    def TocLink(self):
        return '<a href="#%s_MOBI_START">%.80s</a>' % (self._name, self.title)

    def Anchor(self):
        return '<a name="%s_MOBI_START">' % self._name

    def Body(self):
        return self.html.RenameAnchors(self._name + '_')

class Converter:
    def __init__(self, refresh_url='', title='Unknown', author='Unknown', publisher='Unknown'):
        self._header = Header()
        self._header.SetTitle(title)
        self._header.SetAuthor(author)
        self._header.SetPublisher(publisher)
        self._refresh_url = refresh_url

    def ConvertString(self, s):
        out = StringIO.StringIO()
        self._ConvertStringToFile(s, out)
        return out.getvalue()

    def ConvertStrings(self, html_strs):
        out = StringIO.StringIO()
        self._ConvertStringsToFile(html_strs, out)
        return out.getvalue()

    def ConvertFile(self, html_file, out_file):
        self._ConvertStringToFile(open(html_file,'rb').read(),
                                  open(out_file, 'wb'))

    def ConvertFiles(self, html_files, out_file):
        html_strs = [open(f,'rb').read() for f in html_files]
        self._ConvertStringsToFile(html_strs, open(out_file, 'wb'))

    def MakeOneHTML(self, html_strs):
        """This takes a list of HTML strings and returns a big HTML file with
        all contents consolidated. It constructs a table of contents and adds
        anchors within the text
        """
        title_html = []
        toc_html = []
        body_html = []

        PAGE_BREAK = '<mbp:pagebreak>'

        # pull out the title page, assumed first html_strs.
        htmltitle = html_strs[0]
        entrytitle = _SubEntry(1, htmltitle)
        title_html.append(entrytitle.Body())
        title_html.append(PAGE_BREAK)

        toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')

        for pos, html in enumerate(html_strs[1:]):
            entry = _SubEntry(pos+1, html)
            toc_html.append('%s<br />' % entry.TocLink())
            # give some space between bodies of work.
            body_html.append(PAGE_BREAK)
            body_html.append(entry.Anchor())
            body_html.append(entry.Body())

        # TODO: this title can get way too long with RSS feeds. Not sure how to fix
        # cheat slightly and use the <a href> code to set filepos in references.
        header = '''<html>
<head>
<title>Bibliorize %s GMT</title>
<guide>
<reference href="#TOCTOP" type="toc" title="Table of Contents"/>
</guide>
</head>
<body>
''' % time.ctime(time.time())

        footer = '</body></html>'
        all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
        #print "%s" % all_html.encode('utf8')
        return all_html

    def _ConvertStringsToFile(self, html_strs, out_file):
        try:
            tmp = self.MakeOneHTML(html_strs)
            self._ConvertStringToFile(tmp, out_file)
        except Exception, e:
            logging.error('Error %s', e)
            logging.debug('Details: %s' % html_strs)

    def _ConvertStringToFile(self, html_data, out):
        html = HtmlProcessor(html_data)
        data = html.CleanHtml()

        # collect offsets of '<mbp:pagebreak>' tags, use to make index list.
        # indexlist = [] # list of (offset,length) tuples.
        # not in current use.
        # j=0
        # lastj=0
        # while True:
        #     j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
        #     if j < 0:
        #         break
        #     indexlist.append((lastj,j-lastj))
        #     print "index offset: %d length: %d" % (lastj,j-lastj)
        #     lastj=j

        records = []

        # title = html.title
        # if title:
        #     self._header.SetTitle(title)
        record_id = 1
        for start_pos in range(0, len(data), Record.MAX_SIZE):
            end = min(len(data), start_pos + Record.MAX_SIZE)
            record_data = data[start_pos:end]
            records.append(self._header.AddRecord(record_data, record_id))
            #print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
            record_id += 1
        self._header.SetImageRecordIndex(record_id)
        records[0:0] = [self._header.MobiHeader()]

        header, rec_offset = self._header.PDBHeader(len(records))
        out.write(header)
        for record in records:
            record.WriteHeader(out, rec_offset)
            #print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
            rec_offset += (len(record.data)+1) # plus one for trailing null

        # Write two nulls for some reason
        out.write('\0\0')
        for record in records:
            record.WriteData(out)
            out.write('\0')
            # needs a trailing null, I believe it indicates zero length 'overlap'.
            # otherwise, the readers eat the last char of each html record.
            # Calibre writes another 6-7 bytes of stuff after that, but we seem
            # to be getting along without it.

class Record:
    MAX_SIZE = 4096
    INDEX_LEN = 8
    _unique_id_seed = 28 # should be arbitrary, but taken from MobiHeader
    # TODO(chatham): Record compression doesn't look that hard.

    def __init__(self, data, record_id):
        assert len(data) <= self.MAX_SIZE
        self.data = data
        if record_id != 0:
            self._id = record_id
        else:
            Record._unique_id_seed += 1
            self._id = 0

    def __repr__(self):
        return 'Record: id=%d len=%d' % (self._id, len(self.data))

    def _SetUniqueId(self):
        Record._unique_id_seed += 1
        # TODO(chatham): Wraparound crap
        self._id = Record._unique_id_seed

    def WriteData(self, out):
        out.write(self.data)

    def WriteHeader(self, out, rec_offset):
        attributes = 64 # dirty?
        header = struct.pack('>IbbH',
                             rec_offset,
                             attributes,
                             0, self._id)
        assert len(header) == Record.INDEX_LEN
        out.write(header)

EXTH_HEADER_FIELDS = {
    'author' : 100,
    'publisher' : 101,
}

class Header:
    EPOCH_1904 = 2082844800

    def __init__(self):
        self._length = 0
        self._record_count = 0
        self._title = '2008_2_34'
        self._author = 'Unknown author'
        self._publisher = 'Unknown publisher'
        self._first_image_index = 0

    def SetAuthor(self, author):
        self._author = author.encode('ascii','ignore')

    def SetTitle(self, title):
        # TODO(chatham): Reevaluate whether this needs to be ASCII.
        # maybe just do sys.setdefaultencoding('utf-8')? Problems
        # appending self._title with other things.
        self._title = title.encode('ascii','ignore')

    def SetPublisher(self, publisher):
        self._publisher = publisher.encode('ascii','ignore')

    def AddRecord(self, data, record_id):
        self.max_record_size = max(Record.MAX_SIZE, len(data))
        self._record_count += 1
        self._length += len(data)
        return Record(data, record_id)

    def _ReplaceWord(self, data, pos, word):
        return data[:pos] + struct.pack('>I', word) + data[pos+4:]

    def PalmDocHeader(self):
        compression = 1 # no compression
        unused = 0
        encryption_type = 0 # no encryption
        records = self._record_count + 1 # the header record itself
        palmdoc_header = struct.pack('>HHIHHHH',
                                     compression,
                                     unused,
                                     self._length,
                                     records,
                                     Record.MAX_SIZE,
                                     encryption_type,
                                     unused)
        assert len(palmdoc_header) == 16
        return palmdoc_header

    def PDBHeader(self, num_records):
        HEADER_LEN = 32+2+2+9*4
        RECORD_INDEX_HEADER_LEN = 6
        RESOURCE_INDEX_LEN = 10
        index_len = RECORD_INDEX_HEADER_LEN + num_records * Record.INDEX_LEN
        rec_offset = HEADER_LEN + index_len + 2

        short_title = self._title[0:31]
        attributes = 0
        version = 0
        ctime = self.EPOCH_1904 + int(time.time())
        mtime = self.EPOCH_1904 + int(time.time())
        backup_time = self.EPOCH_1904 + int(time.time())
        modnum = 0
        appinfo_offset = 0
        sort_offset = 0
        type = 'BOOK'
        creator = 'MOBI'
        id_seed = 36
        header = struct.pack('>32sHHII',
                             short_title, attributes, version,
                             ctime, mtime)
        header += struct.pack('>IIII', backup_time, modnum,
                              appinfo_offset, sort_offset)
        header += struct.pack('>4s4sI',
                              type, creator, id_seed)
        next_record = 0 # not used?
        header += struct.pack('>IH', next_record, num_records)
        return header, rec_offset

    def _GetExthHeader(self):
        # They set author, publisher, coveroffset, thumboffset
        data = {'author' : self._author,
                'publisher' : self._publisher,
                }
        # Turn string type names into EXTH typeids.
        r = []
        for key, value in data.items():
            typeid = EXTH_HEADER_FIELDS[key]
            length_encoding_len = 8
            r.append(struct.pack('>LL', typeid, len(value) + length_encoding_len,) + value)
        content = ''.join(r)
        # Pad to word boundary
        while len(content) % 4:
            content += '\0'
        TODO_mysterious = 12
        exth = 'EXTH' + struct.pack('>LL', len(content) + TODO_mysterious, len(data)) + content
        return exth

    def SetImageRecordIndex(self, idx):
        self._first_image_index = idx

    def MobiHeader(self):
        exth_header = self._GetExthHeader()
        palmdoc_header = self.PalmDocHeader()

        fs = 0xffffffff
        # Record 0
        header_len = 0xE4 # TODO
        mobi_type = 2 # BOOK
        text_encoding = encoding['UTF-8']
        unique_id = random.randint(1, 1<<32)
        creator_version = 4
        reserved = '%c' % 0xff * 40
        nonbook_index = fs
        full_name_offset = header_len + len(palmdoc_header) + len(exth_header) # put full name after header
        language = languages['en-us']
        unused = 0
        mobi_header = struct.pack('>4sIIIII40sIIIIII',
                                  'MOBI',
                                  header_len,
                                  mobi_type,
                                  text_encoding,
                                  unique_id,
                                  creator_version,
                                  reserved,
                                  nonbook_index,
                                  full_name_offset,
                                  len(self._title),
                                  language,
                                  fs, fs)
        assert len(mobi_header) == 104 - 16
        unknown_fields = chr(0) * 32
        drm_offset = 0
        drm_count = 0
        drm_size = 0
        drm_flags = 0
        exth_flags = 0x50
        header_end = chr(0) * 64
        mobi_header += struct.pack('>IIIIIII',
                                   creator_version,
                                   self._first_image_index,
                                   fs,
                                   unused,
                                   fs,
                                   unused,
                                   exth_flags)
        mobi_header += '\0' * 112 # TODO: Why this much padding?
        # Set some magic offsets to be 0xFFFFFFFF.
        for pos in (0x94, 0x98, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xdc):
            mobi_header = self._ReplaceWord(mobi_header, pos, fs)
        # 16 bytes?
        padding = '\0' * 48 * 4 # why?
        total_header = palmdoc_header + mobi_header + exth_header + self._title + padding
        return self.AddRecord(total_header, 0)

if __name__ == '__main__':
    import sys
    m = Converter(title='Testing Mobi', author='Mobi Author', publisher='mobi converter')
    m.ConvertFiles(sys.argv[1:], 'test.mobi')
    #m.ConvertFile(sys.argv[1], 'test.mobi')
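
MakeOneHTML treats the first string as the title page and builds the TOCTOP table of contents from the rest, so a caller drives Converter.ConvertStrings roughly as below. The HTML strings and output filename are made up for illustration; writer_mobi.py, later in this commit, is the real caller:

# Hedged sketch of driving Converter directly with in-memory HTML strings.
from fanficdownloader.mobi import Converter

title_page = '<html><head><title>A Test Story</title></head><body><h3>A Test Story</h3></body></html>'
chapters = ['<html><head><title>Chapter %d</title></head><body><h3>Chapter %d</h3><p>text</p></body></html>' % (i, i)
            for i in range(1, 4)]

conv = Converter(title='A Test Story', author='Somebody', publisher='test1.com')
# first string becomes the title page; the rest become TOC entries and bodies.
mobidata = conv.ConvertStrings([title_page] + chapters)
open('test_story.mobi', 'wb').write(mobidata)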

@@ -23,6 +23,7 @@ from fanficdownloader.exceptions import FailedToDownload
from writer_html import HTMLWriter
from writer_txt import TextWriter
from writer_epub import EpubWriter
from writer_mobi import MobiWriter
def getWriter(type,config,story):
if type == "html":
@@ -31,5 +32,7 @@ def getWriter(type,config,story):
return TextWriter(config,story)
if type == "epub":
return EpubWriter(config,story)
if type == "mobi":
return MobiWriter(config,story)
raise FailedToDownload("(%s) is not a supported download format."%type)

writer_mobi.py (new file, 196 lines)

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-

# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import string
import StringIO

from base_writer import *
from fanficdownloader.htmlcleanup import stripHTML
from fanficdownloader.mobi import Converter

class MobiWriter(BaseStoryWriter):

    @staticmethod
    def getFormatName():
        return 'mobi'

    @staticmethod
    def getFormatExt():
        return '.mobi'

    def __init__(self, config, story):
        BaseStoryWriter.__init__(self, config, story)

        self.MOBI_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h3><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h3>
<div>
''')

        self.MOBI_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')

        self.MOBI_TITLE_PAGE_END = string.Template('''
</div>
</body>
</html>
''')

        self.MOBI_TABLE_TITLE_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h3><a href="${storyUrl}">${title}</a> by <a href="${authorUrl}">${author}</a></h3>
<table class="full">
''')

        self.MOBI_TABLE_TITLE_ENTRY = string.Template('''
<tr><td><b>${label}:</b></td><td>${value}</td></tr>
''')

        self.MOBI_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')

        self.MOBI_TABLE_TITLE_PAGE_END = string.Template('''
</table>
</body>
</html>
''')

        self.MOBI_TOC_PAGE_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${title} by ${author}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<div>
<h3>Table of Contents</h3>
''')

        self.MOBI_TOC_ENTRY = string.Template('''
<a href="file${index}.xhtml">${chapter}</a><br />
''')

        self.MOBI_TOC_PAGE_END = string.Template('''
</div>
</body>
</html>
''')

        self.MOBI_CHAPTER_START = string.Template('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>${chapter}</title>
<link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/>
</head>
<body>
<h3>${chapter}</h3>
''')

        self.MOBI_CHAPTER_END = string.Template('''
</body>
</html>
''')

    def getMetadata(self,key):
        return stripHTML(self.story.getMetadata(key))

    def writeStoryImpl(self, out):

        files = []

        # write title page.
        if self.getConfig("titlepage_use_table"):
            TITLE_PAGE_START = self.MOBI_TABLE_TITLE_PAGE_START
            TITLE_ENTRY = self.MOBI_TABLE_TITLE_ENTRY
            WIDE_TITLE_ENTRY = self.MOBI_TABLE_TITLE_WIDE_ENTRY
            TITLE_PAGE_END = self.MOBI_TABLE_TITLE_PAGE_END
        else:
            TITLE_PAGE_START = self.MOBI_TITLE_PAGE_START
            TITLE_ENTRY = self.MOBI_TITLE_ENTRY
            WIDE_TITLE_ENTRY = self.MOBI_TITLE_ENTRY # same, only wide in tables.
            TITLE_PAGE_END = self.MOBI_TITLE_PAGE_END

        titlepageIO = StringIO.StringIO()
        self.writeTitlePage(out=titlepageIO,
                            START=TITLE_PAGE_START,
                            ENTRY=TITLE_ENTRY,
                            WIDE_ENTRY=WIDE_TITLE_ENTRY,
                            END=TITLE_PAGE_END)
        if titlepageIO.getvalue(): # will be false if no title page.
            files.append(titlepageIO.getvalue())
        titlepageIO.close()

        ## MOBI always has a TOC injected by mobi.py because there's
        ## no meta-data TOC.
        # # write toc page.
        # tocpageIO = StringIO.StringIO()
        # self.writeTOCPage(tocpageIO,
        #                   self.MOBI_TOC_PAGE_START,
        #                   self.MOBI_TOC_ENTRY,
        #                   self.MOBI_TOC_PAGE_END)
        # if tocpageIO.getvalue(): # will be false if no toc page.
        #     files.append(tocpageIO.getvalue())
        # tocpageIO.close()

        for index, (title,html) in enumerate(self.story.getChapters()):
            logging.debug('Writing chapter text for: %s' % title)
            fullhtml = self.MOBI_CHAPTER_START.substitute({'chapter':title, 'index':index+1}) + html + self.MOBI_CHAPTER_END.substitute({'chapter':title, 'index':index+1})
            # ffnet(& maybe others) gives the whole chapter text as
            # one line. This causes problems for nook(at least) when
            # the chapter size starts getting big (200k+)
            fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
            files.append(fullhtml.encode('utf-8'))
            del fullhtml

        c = Converter(title=self.getMetadata('title'),
                      author=self.getMetadata('author'),
                      publisher=self.getMetadata('site'))
        mobidata = c.ConvertStrings(files)
        out.write(mobidata)

        del files
        del mobidata

## Utility method for creating new tags.
def newTag(dom,name,attrs=None,text=None):
    tag = dom.createElement(name)
    if( attrs is not None ):
        for attr in attrs.keys():
            tag.setAttribute(attr,attrs[attr])
    if( text is not None ):
        tag.appendChild(dom.createTextNode(text))
    return tag
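
For context, this is roughly how the new writer plugs into the existing flow: writers.getWriter() from the hunk above dispatches 'mobi' to MobiWriter, and main.py (below) calls writer.writeStory(outbuffer). The wrapper function and the import path are assumptions for illustration, not code from this commit:

# Hedged sketch; config and adapter come from the surrounding app code.
import StringIO
from fanficdownloader import writers   # import path assumed

def write_mobi(config, adapter):
    # 'mobi' now dispatches to MobiWriter via writers.getWriter().
    writer = writers.getWriter('mobi', config, adapter)
    outbuffer = StringIO.StringIO()
    writer.writeStory(outbuffer)   # MobiWriter.writeStoryImpl feeds Converter
    return writer.getOutputFileName(), outbuffer.getvalue()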

@@ -57,7 +57,9 @@
Login/Password is now only asked for when required, as is 'Are you an Adult?'.
</p>
<p>
Mobi support (for Kindle) is only via EPub conversion in this version.
The same (rather crude) Mobi support (for Kindle) that we
had before is restored. Mobi via EPub conversion is still
available and provides better output.
</p>
<p>
If you have any problems with this new version, please
@@ -81,12 +83,12 @@
<input type='radio' name='format' value='epub' checked>EPub</input>
<input type='radio' name='format' value='html'>HTML</input>
<input type='radio' name='format' value='txt'>Plain Text</input>
<input type='radio' name='format' value='mobi'>Mobi(Kindle)</input>
</div>
<div>
<br />
<input type="submit" value="Download">
<p><i>For most readers, including Sony Reader, Nook and iPad, use EPub.</i></p>
<p><i>For Kindle and other Mobi readers, select EPub and use the Convert link when it's finished.</i></p>
</div>
</div>
<div id='typebox'>

main.py (74 lines changed)

@@ -158,6 +158,8 @@ class FileServer(webapp.RequestHandler):
self.response.headers['Content-Type'] = 'text/html'
elif name.endswith('.txt'):
self.response.headers['Content-Type'] = 'text/plain'
elif name.endswith('.mobi'):
self.response.headers['Content-Type'] = 'application/x-mobipocket-ebook'
elif name.endswith('.zip'):
self.response.headers['Content-Type'] = 'application/zip'
else:
@@ -334,13 +336,14 @@ class FanfictionDownloader(UserConfigServer):
taskqueue.add(url='/fdowntask',
queue_name="download",
params={'format':format,
'url':download.url,
'login':login,
'password':password,
'user':user.email(),
'is_adult':is_adult})
params={'id':str(download.key()),
'format':format,
'url':download.url,
'login':login,
'password':password,
'user':user.email(),
'is_adult':is_adult})
logging.info("enqueued download key: " + str(download.key()))
except (exceptions.FailedToLogin,exceptions.AdultCheckRequired), e:
@@ -390,6 +393,7 @@ class FanfictionDownloaderTask(UserConfigServer):
def post(self):
logging.getLogger().setLevel(logging.DEBUG)
fileId = self.request.get('id')
format = self.request.get('format')
url = self.request.get('url')
login = self.request.get('login')
@@ -399,27 +403,42 @@ class FanfictionDownloaderTask(UserConfigServer):
user = users.User(self.request.get('user'))
logging.info("Downloading: " + url + " for user: "+user.nickname())
logging.info("ID: " + fileId)
adapter = None
writerClass = None
# use existing record if available.
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
if( q is None or len(q) < 1 ):
download = DownloadMeta()
else:
download = q[0]
download.failure=None
download.date=datetime.datetime.now()
download.completed=False
for c in download.data_chunks:
c.delete()
if fileId:
try:
## try to get download rec from passed id first.
## may need to fall back to user/url/format during transition.
download = db.get(db.Key(fileId))
logging.info("DownloadMeta found by ID:"+fileId)
except:
pass
if not download:
# use existing record if available.
q = DownloadMeta.all().filter('user =', user).filter('url =',url).filter('format =',format).fetch(1)
if( q is None or len(q) < 1 ):
logging.info("New DownloadMeta")
download = DownloadMeta()
else:
logging.info("DownloadMeta found by user/url/format")
download = q[0]
## populate DownloadMeta, regardless of how found or created.
download.failure=None
download.date=datetime.datetime.now()
download.completed=False
download.version = "%s:%s" % (os.environ['APPLICATION_ID'],os.environ['CURRENT_VERSION_ID'])
download.user = user
download.url = url
download.format = format
for c in download.data_chunks:
c.delete()
download.put()
logging.info('Creating adapter...')
try:
@@ -441,21 +460,19 @@ class FanfictionDownloaderTask(UserConfigServer):
try:
# adapter.getStory() is what does all the heavy lifting.
writer = writers.getWriter(format,config,adapter)
download.name = writer.getOutputFileName()
logging.debug('output_filename:'+writer.getConfig('output_filename'))
logging.debug('getOutputFileName:'+writer.getOutputFileName())
download.title = adapter.getStory().getMetadata('title')
download.author = adapter.getStory().getMetadata('author')
download.url = adapter.getStory().getMetadata('storyUrl')
download.put()
except Exception, e:
logging.exception(e)
download.failure = unicode(e)
download.put()
return
download.name = writer.getOutputFileName()
logging.debug('output_filename:'+writer.getConfig('output_filename'))
logging.debug('getOutputFileName:'+writer.getOutputFileName())
download.title = adapter.getStory().getMetadata('title')
download.author = adapter.getStory().getMetadata('author')
download.url = adapter.getStory().getMetadata('storyUrl')
download.put()
index=0
outbuffer = StringIO.StringIO()
writer.writeStory(outbuffer)
data = outbuffer.getvalue()
@@ -474,6 +491,7 @@ class FanfictionDownloaderTask(UserConfigServer):
def c(data):
return data
index=0
while( len(data) > 0 ):
DownloadData(download=download,
index=index,

@@ -49,20 +49,20 @@
{% for fic in fics %}
<p>
{% if fic.completed %}
<a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
by {{ fic.author }} ({{ fic.format }})<br/>
{% if fic.escaped_url %}
<a href="http://www.convertfiles.com/index.php?url={{ fic.escaped_url }}">Convert {{ fic.title }} to other formats</a><br />
<span class="recent"><a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a></span>
by {{ fic.author }} ({{ fic.format }})
{% endif %}
{% if not fic.completed and not fic.failure %}
<span class="recent">Processing {{ fic.title }}</span>
by {{ fic.author }} ({{ fic.format }})
{% endif %}
{% if fic.failure %}
<div id='error'>{{ fic.failure }}</div>
<span id='error'>{{ fic.failure }}</span>
{% endif %}
{% if not fic.completed and not fic.failure %}
Request Processing...<br />
<a href="{{ fic.url }}" title="Link to original story">Source</a>
{% if fic.completed and fic.escaped_url %}
<a href="http://www.convertfiles.com/index.php?url={{ fic.escaped_url }}" title="Convert to other formats using Convertfiles.com">Convert</a>
{% endif %}
<small><a href="{{ fic.url }}">{{ fic.url }}</a></small>
</p>
{% endfor %}
</div>

@@ -44,29 +44,27 @@
<div id='urlbox'>
{% if fic.url %}
<div id='greeting'>
<p><a href='{{ fic.url }}'>{{ fic.url }}</a></p>
<p>
{% if fic.completed %}
<p>Your fic has finished processing and you can download it now.</p>
<span class="recent"><a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a></span>
by {{ fic.author }} ({{ fic.format }})
{% endif %}
{% if fic.failure %}
<span id='error'>{{ fic.failure }}</span>
{% endif %}
{% if not fic.completed and not fic.failure %}
<span class="recent">Processing {{ fic.title }}</span>
by {{ fic.author }} ({{ fic.format }})
{% endif %}
<a href="{{ fic.url }}" title="Link to original story">Source</a>
{% if fic.completed and escaped_url %}
<a href="http://www.convertfiles.com/index.php?url={{ escaped_url }}" title="Convert to other formats using Convertfiles.com">Convert</a>
{% endif %}
</p>
</div>
{% endif %}
<div>
{% if fic.completed %}
<p>Your fic has finished processing and you can download it now:</p>
<p><a href="/file?id={{ fic.key }}">Download {{ fic.title }}</a>
by {{ fic.author }} ({{ fic.format }})</p>
{% if escaped_url %}
<p><a href="http://www.convertfiles.com/index.php?url={{ escaped_url }}">Convert {{ fic.title }} to other formats</a></p>
{% endif %}
{% else %}
{% if fic.failure %}
Your fic failed to process. Please check the URL and the error message below.<br />
<div id='error'>
{{ fic.failure }}
</div>
{% else %}
<p>Not done yet. This page will periodically poll to see if your story has finished.</p>
{% endif %}
{% endif %}
<p>Or see your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
</div>
<p>See your personal list of <a href="/recent">previously downloaded fanfics</a>.</p>
</div>
<div style='text-align: center'>
<img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif"