Fix for mobi output: the link to the TOC works again. It was broken by html5lib enforcing HTML5 rules.

2025-12-23 17:23:40 +01:00 · 2018-08-09 19:54:01 -05:00 · 2018-08-09 19:54:01 -05:00 · a93eeec5eb
commit a93eeec5eb
parent 6fbf3bc282
2 changed files with 10 additions and 4 deletions
--- a/fanficfare/mobi.py
+++ b/fanficfare/mobi.py
@@ -91,7 +91,9 @@ class Converter:
    toc_html = []
    body_html = []

-    PAGE_BREAK = '<mbp:pagebreak>'
+    ## This gets broken by html5lib/bs4fixed being helpful, but we'll
+    ## fix it inside mobihtml.py
+    PAGE_BREAK = '<mbp:pagebreak/>'

    # pull out the title page, assumed first html_strs.
    htmltitle = html_strs[0]
--- a/fanficfare/mobihtml.py
+++ b/fanficfare/mobihtml.py
@@ -33,6 +33,12 @@ class HtmlProcessor:
    self.unfill = unfill
 #    html = self._ProcessRawHtml(html)
    self._soup = BeautifulSoup(html,'html5lib')
+    ## mobi format wants to find this <guide> tag inside <head>.
+    ## html5lib, on the other hand, moved it to <body>.  So we'll move
+    ## it back.
+    guide = self._soup.find('guide')
+    if guide:
+      self._soup.head.append(guide)
    if self._soup.title.contents:
      self.title = self._soup.title.contents[0]
    else:
@@ -66,10 +72,8 @@ class HtmlProcessor:
  def _ReplaceAnchorStubs(self):
    # TODO: Browsers allow extra whitespace in the href names.

-    # str() instead of unicode() rather than figure out how to fix
-    # ancient mobi.py code.
    assembled_text = ensure_binary(unicode(self._soup))
-    # bs4 creating close tags for <mbp:pagebreak>
+    # html5lib/bs4 creates close tags for <mbp:pagebreak>
    assembled_text = assembled_text.replace(b'<mbp:pagebreak>',b'<mbp:pagebreak/>')
    assembled_text = assembled_text.replace(b'</mbp:pagebreak>',b'')