From 3b72126f5f1cecb2e0c20306e2c9b16a217185dd Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sat, 11 Dec 2021 19:32:06 -0600 Subject: [PATCH] Add remove_class_chapter feature, true by default. --- calibre-plugin/plugin-defaults.ini | 9 +++++++++ fanficfare/adapters/adapter_test1.py | 4 ++-- fanficfare/adapters/base_adapter.py | 19 +++++++++++++++++++ fanficfare/configurable.py | 1 + fanficfare/defaults.ini | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index b8147c64..44562794 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -372,6 +372,15 @@ keep_summary_html:true ## add_to_keep_html_attrs:,style,title,align keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref +## Some tags, notably chapter div tags from Base eFiction, have +## class='chapter', which causes calibre convert to identify it as a +## chapter and 'pagebreak' at that point, aka split the file, which +## adds unexpected pagebreaks and breaks FFF update if an epub to epub +## conversion is done. Remove class='chapter' from all tags by +## default. Also affects previously downloaded chapters on epub +## update. +remove_class_chapter:true + ## Tags listed here will be replaced with . ## For example: underlined text becomes ## underlined text diff --git a/fanficfare/adapters/adapter_test1.py b/fanficfare/adapters/adapter_test1.py index 66073966..df9ddd6f 100644 --- a/fanficfare/adapters/adapter_test1.py +++ b/fanficfare/adapters/adapter_test1.py @@ -395,7 +395,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!" else: text=u''' -
+

Chapter title from site

chapter URL:'''+url+'''

Timestamp:'''+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'''

-

Lorem '''+self.crazystring+u''' italics, bold, underline, Strike through consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem '''+self.crazystring+u''' italics, bold, underline, Strike through consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

br breaks

Puella Magi Madoka Magica/魔法少女まどか★マギカ
diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 602f5e25..2afe2c12 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -592,6 +592,18 @@ class BaseSiteAdapter(Requestable): self.times.add("utf8FromSoup", datetime.now() - start) return retval + def remove_class_chapter(self,soup): + def rm_chp_cls(t): + t['class'].remove('chapter') + if not t['class']: # remove if list empty now. + del t['class'] + for t in soup.select('.chapter'): + rm_chp_cls(t) + # if soup is itself a tag with class='chapter', select doesn't + # find it. + if soup.has_attr('class') and 'chapter' in soup['class']: + rm_chp_cls(soup) + def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True): if not fetch: fetch=self.get_request_raw @@ -624,6 +636,13 @@ class BaseSiteAdapter(Requestable): if attr not in acceptable_attributes: del soup[attr] ## strip all tag attributes except configured + ## some tags, notably the chapter div from Base eFiction, have + ## class='chapter', which causes calibre convert to id it as a + ## chapter and 'pagebreak' - AKA split the file. Remove by + ## default, but only if class otherwise allowed (minor perf opt). + if 'class' in acceptable_attributes and self.getConfig('remove_class_chapter',True): + self.remove_class_chapter(soup) + ## Make relative links in text into absolute links using page ## URL. 
if self.getConfig('fix_relative_text_links'): diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 4970fa9c..f1690c6c 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -268,6 +268,7 @@ def get_valid_set_options(): 'fix_relative_text_links':(None,['epub','html'],boollist), 'normalize_text_links':(None,['epub','html'],boollist), 'internalize_text_links':(None,['epub','html'],boollist), + 'remove_class_chapter':(None,['epub','html'],boollist), 'capitalize_forumtags':(base_xenforo_list,None,boollist), 'minimum_threadmarks':(base_xenforo_list,None,None), diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index f9bb2842..b9f285e9 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -375,6 +375,15 @@ keep_summary_html:true ## add_to_keep_html_attrs:,style,title,align keep_html_attrs:href,name,class,id,colspan,rowspan,data-orighref +## Some tags, notably chapter div tags from Base eFiction, have +## class='chapter', which causes calibre convert to identify it as a +## chapter and 'pagebreak' at that point, aka split the file, which +## adds unexpected pagebreaks and breaks FFF update if an epub to epub +## conversion is done. Remove class='chapter' from all tags by +## default. Also affects previously downloaded chapters on epub +## update. +remove_class_chapter:true + ## Tags listed here will be replaced with . ## For example: underlined text becomes ## underlined text