Remember the original href in a data-orighref attribute when epub internalize_text_links rewrites links, so that chapters inserted earlier in the story don't break internal links.

This commit is contained in:
Jim Miller 2019-01-17 12:49:34 -06:00
parent 3488d35c1f
commit 5e0a036814
5 changed files with 26 additions and 6 deletions

View file

@ -337,11 +337,13 @@ keep_summary_html:true
## Some attributes cause problems for EBook readers. By default,
## FanFicFare will remove all attributes except the ones specified
## from all tags. (The only exception is that <img> tags will also
## keep src, alt and longdesc attributes.)
## keep src, alt and longdesc attributes. data-orighref is used by
## internalize_text_links to preserve links when chapters are
## inserted.)
## Example: To add 'style', 'title' and 'align' to the list to keep,
## in your personal.ini [defaults] put:
## add_to_keep_html_attrs:,style,title,align
keep_html_attrs:href,name,class,id,colspan,rowspan
keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref
## Tags listed here will be replaced with <span class="tagname">.
## For example: <u>underlined text</u> becomes

View file

@ -451,7 +451,7 @@ class BaseSiteAdapter(Configurable):
if not fetch:
fetch=self._fetchUrlRaw
acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id'])
acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id','data-orighref'])
if self.getConfig("keep_style_attr"):
acceptable_attributes.append('style')

View file

@ -340,11 +340,13 @@ keep_summary_html:true
## Some attributes cause problems for EBook readers. By default,
## FanFicFare will remove all attributes except the ones specified
## from all tags. (The only exception is that <img> tags will also
## keep src, alt and longdesc attributes.)
## keep src, alt and longdesc attributes. data-orighref is used by
## internalize_text_links to preserve links when chapters are
## inserted.)
## Example: To add 'style', 'title' and 'align' to the list to keep,
## in your personal.ini [defaults] put:
## add_to_keep_html_attrs:,style,title,align
keep_html_attrs:href,name,class,id,colspan,rowspan
keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref
## Tags listed here will be replaced with <span class="tagname">.
## For example: <u>underlined text</u> becomes

View file

@ -700,14 +700,27 @@ div { margin: 0pt; padding: 0pt; }
CHAPTER_END = self.EPUB_CHAPTER_END
for index, chap in enumerate(self.story.getChapters()): # (url,title,html)
logger.debug("chapter:%s %s %s"%(len(chap['html']), chap['title'],chap['url']))
if chap['html']:
chap_data = chap['html']
if self.getConfig('internalize_text_links'):
soup = bs4.BeautifulSoup(chap['html'],'html5lib')
changed=False
for alink in soup.find_all('a'):
if alink.has_attr('href') and alink['href'] in chapurlmap:
## Chapters can be inserted in the middle
## which can break existing internal links.
## So let's save the original href and update.
logger.debug("found %s"%alink)
if alink.has_attr('data-orighref') and alink['data-orighref'] in chapurlmap:
alink['href']=chapurlmap[alink['data-orighref']]
logger.debug("set1 %s"%alink)
changed=True
elif alink.has_attr('href') and alink['href'] in chapurlmap:
if not alink['href'].startswith('file'):
# only save orig href if not already internal.
alink['data-orighref']=alink['href']
alink['href']=chapurlmap[alink['href']]
logger.debug("set2 %s"%alink)
changed=True
if changed:
chap_data = unicode(soup)
@ -734,6 +747,7 @@ div { margin: 0pt; padding: 0pt; }
# (200k+)
fullhtml = re.sub(r'(</p>|<br ?/>)\n*',r'\1\n',fullhtml)
logger.debug("write OEBPS/file%s.xhtml"%chap['index04'])
outputepub.writestr("OEBPS/file%s.xhtml"%chap['index04'],fullhtml.encode('utf-8'))
del fullhtml

View file

@ -146,6 +146,8 @@ ${output_css}
if chap['html']:
chap_data = chap['html']
if self.getConfig('internalize_text_links'):
# html output doesn't need data-orighref because the
# html writer doesn't do updates.
soup = bs4.BeautifulSoup(chap['html'],'html5lib')
changed=False
for alink in soup.find_all('a'):