Add remove_class_chapter feature, true by default.

This commit is contained in:
Jim Miller 2021-12-11 19:32:06 -06:00
parent 80fb72928e
commit 3b72126f5f
5 changed files with 40 additions and 2 deletions

View file

@ -372,6 +372,15 @@ keep_summary_html:true
## add_to_keep_html_attrs:,style,title,align
keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref
## Some tags, notably chapter div tags from Base eFiction, have
## class='chapter', which causes calibre convert to identify them as
## chapters and 'pagebreak' at that point, aka split the file. That
## adds unexpected pagebreaks and breaks FFF update if an epub to
## epub conversion is done. Remove class='chapter' from all tags by
## default. Also affects previously downloaded chapters on epub
## update.
remove_class_chapter:true
## Tags listed here will be replaced with <span class="tagname">.
## For example: <u>underlined text</u> becomes
## <span class="u">underlined text</span>

View file

@ -395,7 +395,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
else:
text=u'''
<div>
<div class='chapter'>
<h3 extra="value">Chapter title from site</h3>
<style>
p { color: red; }
@ -403,7 +403,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
</style>
<p>chapter URL:'''+url+'''</p>
<p style="color:blue;">Timestamp:'''+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'''</p>
<p>Lorem '''+self.crazystring+u''' <i>italics</i>, <b>bold</b>, <u>underline</u>, <s>Strike through</s> consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
<p class='chapter ptag'>Lorem '''+self.crazystring+u''' <i>italics</i>, <b>bold</b>, <u>underline</u>, <s>Strike through</s> consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
br breaks<br><br>
Puella Magi Madoka Magica/魔法少女まどかマギカ
<!-- a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl by Jim &amp; Sarah, on Flickr"><img src="http://i.imgur.com/bo8eD.png"></a --><br/>

View file

@ -592,6 +592,18 @@ class BaseSiteAdapter(Requestable):
self.times.add("utf8FromSoup", datetime.now() - start)
return retval
def remove_class_chapter(self,soup):
    """
    Remove the 'chapter' class from soup and from every tag under it.

    Tags carrying class='chapter' cause calibre convert to treat them
    as chapter starts and insert a pagebreak, so the class is dropped
    while any other classes on the same tag are kept.  A tag whose
    only class was 'chapter' loses its class attribute entirely.

    soup is modified in place; nothing is returned.
    """
    def rm_chp_cls(t):
        classes = t['class']
        # class is normally a list of tokens, but can be a plain
        # string when it was assigned by code or parsed without
        # multi-valued attribute handling.
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        # Filter instead of list.remove() so repeated tokens
        # (class="chapter chapter") are all dropped, not just the
        # first one.
        kept = [c for c in classes if c != 'chapter']
        if len(kept) == len(classes):
            return # no 'chapter' token, leave the tag untouched.
        if kept:
            t['class'] = kept
        else:
            del t['class'] # remove attribute if nothing is left.

    for t in soup.select('.chapter'):
        rm_chp_cls(t)
    # if soup is itself a tag with class='chapter', select doesn't
    # find it.  rm_chp_cls compares whole tokens, so a string class
    # such as 'chapters' is not mistaken for 'chapter'.
    if soup.has_attr('class'):
        rm_chp_cls(soup)
def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
if not fetch:
fetch=self.get_request_raw
@ -624,6 +636,13 @@ class BaseSiteAdapter(Requestable):
if attr not in acceptable_attributes:
del soup[attr] ## strip all tag attributes except configured
## some tags, notable chapter div from Base eFiction have
## class='chapter', which causes calibre convert to id it as a
## chapter and 'pagebreak' - AKA split the file. Remove by
## default, but only if class otherwise allowed (minor perf opt).
if 'class' in acceptable_attributes and self.getConfig('remove_class_chapter',True):
self.remove_class_chapter(soup)
## Make relative links in text into absolute links using page
## URL.
if self.getConfig('fix_relative_text_links'):

View file

@ -268,6 +268,7 @@ def get_valid_set_options():
'fix_relative_text_links':(None,['epub','html'],boollist),
'normalize_text_links':(None,['epub','html'],boollist),
'internalize_text_links':(None,['epub','html'],boollist),
'remove_class_chapter':(None,['epub','html'],boollist),
'capitalize_forumtags':(base_xenforo_list,None,boollist),
'minimum_threadmarks':(base_xenforo_list,None,None),

View file

@ -375,6 +375,15 @@ keep_summary_html:true
## add_to_keep_html_attrs:,style,title,align
keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref
## Some tags, notably chapter div tags from Base eFiction, have
## class='chapter', which causes calibre convert to identify them as
## chapters and 'pagebreak' at that point, aka split the file. That
## adds unexpected pagebreaks and breaks FFF update if an epub to
## epub conversion is done. Remove class='chapter' from all tags by
## default. Also affects previously downloaded chapters on epub
## update.
remove_class_chapter:true
## Tags listed here will be replaced with <span class="tagname">.
## For example: <u>underlined text</u> becomes
## <span class="u">underlined text</span>