Mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2026-04-27 17:38:19 +02:00)
Mobi improvements: Mark TOC so reader can find it, don't let reader eat every 4096th char, don't prettify. Remove debug outputs from whofic. Increase fetch size of remover.
parent efb521c829
commit bd82311d51

4 changed files with 44 additions and 20 deletions
@@ -35,15 +35,20 @@ class HtmlProcessor:
     with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
     self._anchor_references = []
     anchor_num = 0
-    for anchor in self._soup.findAll('a', href=re.compile('^#')):
+    # anchor links
+    anchorlist = self._soup.findAll('a', href=re.compile('^#'))
+    # treat reference tags like a tags for TOCTOP.
+    anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
+    for anchor in anchorlist:
       self._anchor_references.append((anchor_num, anchor['href']))
       del anchor['href']
       anchor['filepos'] = '%.10d' % anchor_num
       anchor_num += 1

   def _ReplaceAnchorStubs(self):
     # TODO: Browsers allow extra whitespace in the href names.
-    assembled_text = self._soup.prettify()
+    # use __str__ instead of prettify--it inserts extra spaces.
+    assembled_text = self._soup.__str__('utf8')
     del self._soup # shouldn't touch this anymore
     for anchor_num, original_ref in self._anchor_references:
       ref = urllib.unquote(original_ref[1:]) # remove leading '#'
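A note on the two changes above: prettify() re-indents the whole tree, and every inserted space shifts the byte offsets that the 10-digit filepos stubs are later patched to, so the plain __str__ serialization is needed for the offsets to stay honest. A minimal runnable sketch of the anchor-marking step, using the same BeautifulSoup 3 API as this codebase (the filepos value here is just the stub; _ReplaceAnchorStubs patches in the real byte position):

    import re
    from BeautifulSoup import BeautifulSoup  # BS3, as elsewhere in this code

    soup = BeautifulSoup('<a href="#TOCTOP">Contents</a><a name="TOCTOP"></a>')
    anchor_num = 0
    for anchor in soup.findAll('a', href=re.compile('^#')):
        del anchor['href']                        # drop the html-style link
        anchor['filepos'] = '%.10d' % anchor_num  # stub, patched later
        anchor_num += 1
    print soup.__str__('utf8')   # not prettify(): no whitespace inserted
    # -> <a filepos="0000000000">Contents</a><a name="TOCTOP"></a>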
@@ -88,8 +88,8 @@ class Converter:
     entrytitle = _SubEntry(1, htmltitle)
     title_html.append(entrytitle.Body())

     toc_html.append(PAGE_BREAK)
-    toc_html.append('<h3>Table of Contents</h3><br />')
     title_html.append(PAGE_BREAK)
+    toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')

     for pos, html in enumerate(html_strs[1:]):
       entry = _SubEntry(pos+1, html)
@@ -103,8 +103,16 @@ class Converter:
     body_html.append(entry.Body())

     # TODO: this title can get way too long with RSS feeds. Not sure how to fix
-    header = '<html><head><title>Bibliorize %s GMT</title></head><body>' % time.ctime(
-      time.time())
+    # cheat slightly and use the <a href> code to set filepos in references.
+    header = '''<html>
+<head>
+<title>Bibliorize %s GMT</title>
+<guide>
+<reference href="#TOCTOP" type="toc" title="Table of Contents"/>
+</guide>
+</head>
+<body>
+''' % time.ctime(time.time())

     footer = '</body></html>'
     all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
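The <guide> block is what gives Mobipocket/Kindle readers a working "go to Table of Contents" menu entry, and it is why the first hunk above now treats <reference> tags like <a> tags: the reader wants a filepos, not an href, so HtmlProcessor rewrites

    <reference href="#TOCTOP" type="toc" title="Table of Contents"/>

into something like (offset value illustrative)

    <reference filepos="0000012345" type="toc" title="Table of Contents"/>

pointing at the <a name="TOCTOP"> anchor now emitted into toc_html above.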
@@ -122,6 +130,21 @@ class Converter:
   def _ConvertStringToFile(self, html_data, out):
     html = HtmlProcessor(html_data)
     data = html.CleanHtml()
+
+    # collect offsets of '<mbp:pagebreak>' tags, use to make index list.
+    # indexlist = [] # list of (offset,length) tuples.
+    # not in current use.
+
+    # j=0
+    # lastj=0
+    # while True:
+    #   j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
+    #   if j < 0:
+    #     break
+    #   indexlist.append((lastj,j-lastj))
+    #   print "index offset: %d length: %d" % (lastj,j-lastj)
+    #   lastj=j
+
     records = []
     # title = html.title
     # if title:
@@ -131,6 +154,7 @@ class Converter:
       end = min(len(data), start_pos + Record.MAX_SIZE)
       record_data = data[start_pos:end]
       records.append(self._header.AddRecord(record_data, record_id))
+      #print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
       record_id += 1
     self._header.SetImageRecordIndex(record_id)
     records[0:0] = [self._header.MobiHeader()]
@@ -139,12 +163,18 @@ class Converter:
     out.write(header)
     for record in records:
       record.WriteHeader(out, rec_offset)
-      rec_offset += len(record.data)
+      #print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
+      rec_offset += (len(record.data)+1) # plus one for trailing null

     # Write two nulls for some reason
     out.write('\0\0')
     for record in records:
       record.WriteData(out)
+      out.write('\0')
+      # needs a trailing null, I believe it indicates zero length 'overlap'.
+      # otherwise, the readers eat the last char of each html record.
+      # Calibre writes another 6-7 bytes of stuff after that, but we seem
+      # to be getting along without it.

 class Record:
   MAX_SIZE = 4096
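Why the trailing nulls matter: the text is carved into Record.MAX_SIZE (4096-byte) records, and the null after each one appears to act as a zero-length 'overlap' marker; without it some readers drop the final character of every record, i.e. every 4096th character of the book, the bug named in the commit message. A hedged sketch of the scheme (split_records/write_palm_records are illustrative stand-ins for the loops above):

    MAX_SIZE = 4096  # Record.MAX_SIZE above

    def split_records(data):
        # carve the html into MAX_SIZE-byte records
        return [data[i:i + MAX_SIZE] for i in range(0, len(data), MAX_SIZE)]

    def write_palm_records(out, records):
        for rec in records:
            out.write(rec)
            out.write('\0')  # zero-length overlap marker; record headers
                             # must count this extra byte in their offsets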
@@ -131,25 +131,15 @@ class Whofic(FanfictionSiteAdapter):
        # find this story in the list, parse its metadata based on
        # lots of assumptions, since there's little tagging.
        for a in soup.findAll('a'):
-           #print "a href:"+a['href']
            if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
                metadata = a.findParent('td')
                metadatachunks = metadata.__str__('utf8').split('<br />')
                # process metadata for this story.
-               #print a.findParent('td').__str__('utf8')
                self.storyDescription = metadatachunks[1]

-               # for cata in metadata.findAll('a'):
-               #   if cata['href'].startswith('categories.php'):
-               #     if len(self.category) == 0:
-               #       self.category = cata.string
-               #     else:
-               #       self.category = self.category + ", " + cata.string
-
                # the stuff with ' - ' separators
                moremeta = metadatachunks[2]
                moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-               print "====== moremeta: "+moremeta

                moremetaparts = moremeta.split(' - ')

@@ -167,7 +157,6 @@ class Whofic(FanfictionSiteAdapter):
                # the stuff with ' - ' separators *and* names
                moremeta = metadatachunks[5]
                moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-               print "====== moremeta 2: "+moremeta

                moremetaparts = moremeta.split(' - ')

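For context, the parsing these removed debug prints were watching: each <br />-separated chunk of the story's table cell is stripped of tags and split on ' - '. A small sketch with illustrative input (not actual site markup):

    import re

    chunk = '<b>Rating:</b> All Ages - <i>Chapters:</i> 3'
    moremeta = re.sub('<[^>]+>', '', chunk)  # strip tags, as above
    moremetaparts = moremeta.split(' - ')
    # -> ['Rating: All Ages', 'Chapters: 3']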
@@ -26,7 +26,7 @@ class Remover(webapp.RequestHandler):

         fics = DownloadMeta.all()
         fics.filter("date <",theDate).order("date")
-        results = fics.fetch(100)
+        results = fics.fetch(500)
         logging.debug([x.name for x in results])

         num = 0
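The larger batch means each cron pass of the remover clears up to 500 expired downloads instead of 100, so the backlog drains in fewer invocations. A hedged sketch of the surrounding handler logic (the delete step is assumed, not shown in this hunk):

    fics = DownloadMeta.all()
    fics.filter("date <", theDate).order("date")
    results = fics.fetch(500)   # bigger batch, fewer cron passes
    logging.debug([x.name for x in results])
    for fic in results:
        fic.delete()            # standard GAE db.Model.delete()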