Mobi improvements: Mark TOC so reader can find it, don't let reader eat every 4096th char, don't prettify. Remove debug outputs from whofic. Increase fetch size of remover.

2026-05-02 20:02:51 +02:00 · 2011-03-24 12:57:38 -05:00 · 2011-03-24 12:57:38 -05:00 · bd82311d51
commit bd82311d51
parent efb521c829
4 changed files with 44 additions and 20 deletions
--- a/fanficdownloader/html.py
+++ b/fanficdownloader/html.py
@@ -35,15 +35,20 @@ class HtmlProcessor:
    with <a filepos="00000000050">. Stores anchors in self._anchor_references'''
    self._anchor_references = []
    anchor_num = 0
-    for anchor in self._soup.findAll('a', href=re.compile('^#')):
+    # anchor links
+    anchorlist = self._soup.findAll('a', href=re.compile('^#'))
+    # treat <reference> tags like <a> tags so the TOCTOP guide entry also gets a filepos.
+    anchorlist.extend(self._soup.findAll('reference', href=re.compile('^#')))
+    for anchor in anchorlist:
      self._anchor_references.append((anchor_num, anchor['href']))
      del anchor['href']
      anchor['filepos'] = '%.10d' % anchor_num
      anchor_num += 1
-
+            
  def _ReplaceAnchorStubs(self):
    # TODO: Browsers allow extra whitespace in the href names.
-    assembled_text = self._soup.prettify()
+    # use __str__ instead of prettify, because prettify inserts extra whitespace.
+    assembled_text = self._soup.__str__('utf8')
    del self._soup # shouldn't touch this anymore
    for anchor_num, original_ref in self._anchor_references:
      ref = urllib.unquote(original_ref[1:]) # remove leading '#'
--- a/fanficdownloader/mobi.py
+++ b/fanficdownloader/mobi.py
@@ -88,8 +88,8 @@ class Converter:
    entrytitle = _SubEntry(1, htmltitle)
    title_html.append(entrytitle.Body())
    
-    toc_html.append(PAGE_BREAK)
-    toc_html.append('<h3>Table of Contents</h3><br />')
+    title_html.append(PAGE_BREAK)
+    toc_html.append('<a name="TOCTOP"><h3>Table of Contents</h3><br />')

    for pos, html in enumerate(html_strs[1:]):
      entry = _SubEntry(pos+1, html)
@@ -103,8 +103,16 @@ class Converter:
      body_html.append(entry.Body())
      
    # TODO: this title can get way too long with RSS feeds. Not sure how to fix
-    header = '<html><head><title>Bibliorize %s GMT</title></head><body>' % time.ctime(
-      time.time())
+    # cheat slightly and use the <a href> code to set filepos in references.
+    header = '''<html>
+<head>
+<title>Bibliorize %s GMT</title>
+  <guide>
+    <reference href="#TOCTOP" type="toc" title="Table of Contents"/>
+  </guide>
+</head>
+<body>
+''' % time.ctime(time.time())

    footer = '</body></html>'
    all_html = header + '\n'.join(title_html + toc_html + body_html) + footer
@@ -122,6 +130,21 @@ class Converter:
  def _ConvertStringToFile(self, html_data, out):
    html = HtmlProcessor(html_data)
    data = html.CleanHtml()
+
+    # collect offsets of '<mbp:pagebreak>' tags, use to make index list.
+    # indexlist = [] # list of (offset,length) tuples.
+    # not in current use.
+    
+    # j=0
+    # lastj=0
+    # while True:
+    #   j=data.find('<mbp:pagebreak>',lastj+10) # plus a bit so we find the next.
+    #   if j < 0:
+    #     break
+    #   indexlist.append((lastj,j-lastj))
+    #   print "index offset: %d length: %d" % (lastj,j-lastj)
+    #   lastj=j
+
    records = []
 #    title = html.title
 #    if title:
@@ -131,6 +154,7 @@ class Converter:
      end = min(len(data), start_pos + Record.MAX_SIZE)
      record_data = data[start_pos:end]
      records.append(self._header.AddRecord(record_data, record_id))
+      #print "HTML Record %03d: (size:%d) [[%s ... %s]]" % ( record_id, len(record_data), record_data[:20], record_data[-20:] )
      record_id += 1
    self._header.SetImageRecordIndex(record_id)
    records[0:0] = [self._header.MobiHeader()]
@@ -139,12 +163,18 @@ class Converter:
    out.write(header)
    for record in records:
      record.WriteHeader(out, rec_offset)
-      rec_offset += len(record.data)
+      #print "rec_offset: %d len(record.data): %d" % (rec_offset,len(record.data))
+      rec_offset += (len(record.data)+1) # plus one for trailing null

    # Write to nuls for some reason
    out.write('\0\0')
    for record in records:
      record.WriteData(out)
+      out.write('\0')
+      # needs a trailing null, I believe it indicates zero length 'overlap'.
+      # otherwise, the readers eat the last char of each html record.
+      # Calibre writes another 6-7 bytes of stuff after that, but we seem
+      # to be getting along without it.

 class Record:
  MAX_SIZE = 4096
--- a/fanficdownloader/whofic.py
+++ b/fanficdownloader/whofic.py
@@ -131,25 +131,15 @@ class Whofic(FanfictionSiteAdapter):
        # find this story in the list, parse it's metadata based on
        # lots of assumptions, since there's little tagging.
        for a in soup.findAll('a'):
-            #print "a href:"+a['href']
            if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
                metadata = a.findParent('td')
                metadatachunks = metadata.__str__('utf8').split('<br />')
                # process metadata for this story.
-                #print a.findParent('td').__str__('utf8')
                self.storyDescription = metadatachunks[1]
-                
-                # for cata in metadata.findAll('a'):
-                #     if cata['href'].startswith('categories.php'):
-                #         if len(self.category) == 0:
-                #             self.category = cata.string
-                #         else:
-                #             self.category = self.category + ", " + cata.string

                # the stuff with ' - ' separators
                moremeta = metadatachunks[2]
                moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-                print "====== moremeta: "+moremeta
                
                moremetaparts = moremeta.split(' - ')
                
@@ -167,7 +157,6 @@ class Whofic(FanfictionSiteAdapter):
                # the stuff with ' - ' separators *and* names
                moremeta = metadatachunks[5]
                moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-                print "====== moremeta 2: "+moremeta
                
                moremetaparts = moremeta.split(' - ')

--- a/utils/remover.py
+++ b/utils/remover.py
@@ -26,7 +26,7 @@ class Remover(webapp.RequestHandler):

 		fics = DownloadMeta.all()
 		fics.filter("date <",theDate).order("date")
-		results = fics.fetch(100)
+		results = fics.fetch(500)
 		logging.debug([x.name for x in results])

 		num = 0