Mobi improvements: Mark TOC so reader can find it, don't let reader eat every 4096th char, don't prettify. Remove debug outputs from whofic. Increase fetch size of remover.

This commit is contained in:
retiefjimm 2011-03-24 12:57:38 -05:00
parent efb521c829
commit bd82311d51
4 changed files with 44 additions and 20 deletions

View file

@ -35,15 +35,20 @@ class HtmlProcessor:
with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
self._anchor_references = []
anchor_num = 0
for anchor in self._soup.findAll('a', href=re.compile('^#')):
# anchor links
anchorlist = self._soup.findAll('a', href=re.compile('^#'))
# treat reference tags like a tags for TOCTOP.
anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
for anchor in anchorlist:
self._anchor_references.append((anchor_num, anchor['href']))
del anchor['href']
anchor['filepos'] = '%.10d' % anchor_num
anchor_num += 1
def _ReplaceAnchorStubs(self):
# TODO: Browsers allow extra whitespace in the href names.
assembled_text = self._soup.prettify()
# use __str__ instead of prettify--it inserts extra spaces.
assembled_text = self._soup.__str__('utf8')
del self._soup # shouldn't touch this anymore
for anchor_num, original_ref in self._anchor_references:
ref = urllib.unquote(original_ref[1:]) # remove leading '#'

View file

@ -88,8 +88,8 @@ class Converter:
entrytitle = _SubEntry(1, htmltitle)
title_html.append(entrytitle.Body())
toc_html.append(PAGE_BREAK)
toc_html.append('<h3>Table of Contents</h3><br />')
title_html.append(PAGE_BREAK)
toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')
for pos, html in enumerate(html_strs[1:]):
entry = _SubEntry(pos+1, html)
@ -103,8 +103,16 @@ class Converter:
body_html.append(entry.Body())
# TODO: this title can get way too long with RSS feeds. Not sure how to fix
header = '<html><head><title>Bibliorize %s GMT</title></head><body>' % time.ctime(
time.time())
# cheat slightly and use the <a href> code to set filepos in references.
header = '''<html>
<head>
<title>Bibliorize %s GMT</title>
<guide>
<reference href="#TOCTOP" type="toc" title="Table of Contents"/>
</guide>
</head>
<body>
''' % time.ctime(time.time())
footer = '</body></html>'
all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
@ -122,6 +130,21 @@ class Converter:
def _ConvertStringToFile(self, html_data, out):
html = HtmlProcessor(html_data)
data = html.CleanHtml()
# collect offsets of '<mbp:pagebreak>' tags, use to make index list.
# indexlist = [] # list of (offset,length) tuples.
# not in current use.
# j=0
# lastj=0
# while True:
# j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
# if j < 0:
# break
# indexlist.append((lastj,j-lastj))
# print "index offset: %d length: %d" % (lastj,j-lastj)
# lastj=j
records = []
# title = html.title
# if title:
@ -131,6 +154,7 @@ class Converter:
end = min(len(data), start_pos + Record.MAX_SIZE)
record_data = data[start_pos:end]
records.append(self._header.AddRecord(record_data, record_id))
#print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
record_id += 1
self._header.SetImageRecordIndex(record_id)
records[0:0] = [self._header.MobiHeader()]
@ -139,12 +163,18 @@ class Converter:
out.write(header)
for record in records:
record.WriteHeader(out, rec_offset)
rec_offset += len(record.data)
#print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
rec_offset += (len(record.data)+1) # plus one for trailing null
# Write two nuls for some reason
out.write('\0\0')
for record in records:
record.WriteData(out)
out.write('\0')
# needs a trailing null, I believe it indicates zero length 'overlap'.
# otherwise, the readers eat the last char of each html record.
# Calibre writes another 6-7 bytes of stuff after that, but we seem
# to be getting along without it.
class Record:
MAX_SIZE = 4096

View file

@ -131,25 +131,15 @@ class Whofic(FanfictionSiteAdapter):
# find this story in the list, parse its metadata based on
# lots of assumptions, since there's little tagging.
for a in soup.findAll('a'):
#print "a href:"+a['href']
if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
metadata = a.findParent('td')
metadatachunks = metadata.__str__('utf8').split('<br />')
# process metadata for this story.
#print a.findParent('td').__str__('utf8')
self.storyDescription = metadatachunks[1]
# for cata in metadata.findAll('a'):
# if cata['href'].startswith('categories.php'):
# if len(self.category) == 0:
# self.category = cata.string
# else:
# self.category = self.category + ", " + cata.string
# the stuff with ' - ' separators
moremeta = metadatachunks[2]
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
print "====== moremeta: "+moremeta
moremetaparts = moremeta.split(' - ')
@ -167,7 +157,6 @@ class Whofic(FanfictionSiteAdapter):
# the stuff with ' - ' separators *and* names
moremeta = metadatachunks[5]
moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
print "====== moremeta 2: "+moremeta
moremetaparts = moremeta.split(' - ')

View file

@ -26,7 +26,7 @@ class Remover(webapp.RequestHandler):
fics = DownloadMeta.all()
fics.filter("date <",theDate).order("date")
results = fics.fetch(100)
results = fics.fetch(500)
logging.debug([x.name for x in results])
num = 0