Perf improvement for unnew

2025-12-06 00:43:00 +01:00 · 2025-11-04 12:20:39 -06:00 · 2025-11-04 12:20:39 -06:00 · e5b5768f11
commit e5b5768f11
parent 6cf2519ef9
1 changed files with 27 additions and 3 deletions
--- a/fanficfare/epubutils.py
+++ b/fanficfare/epubutils.py
@ -20,6 +20,26 @@ from .six import ensure_text, text_type as unicode
 from .six import string_types as basestring
 from io import BytesIO

+# from io import StringIO
+# import cProfile, pstats
+# from pstats import SortKey
+# def do_cprofile(func):
+#     def profiled_func(*args, **kwargs):
+#         profile = cProfile.Profile()
+#         try:
+#             profile.enable()
+#             result = func(*args, **kwargs)
+#             profile.disable()
+#             return result
+#         finally:
+#             # profile.sort_stats(SortKey.CUMULATIVE).print_stats(20)
+#             s = StringIO()
+#             sortby = SortKey.CUMULATIVE
+#             ps = pstats.Stats(profile, stream=s).sort_stats(sortby)
+#             ps.print_stats(20)
+#             print(s.getvalue())
+#     return profiled_func
+
 import bs4

 def get_dcsource(inputio):
@ -293,6 +313,7 @@ def get_story_url_from_zip_html(inputio,_is_good_url=None):
                    return ahref
    return None

+# @do_cprofile
 def reset_orig_chapters_epub(inputio,outfile):
    inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob

@ -345,7 +366,9 @@ def reset_orig_chapters_epub(inputio,outfile):
            if re.match(r'.*/file\d+\.xhtml',zf):
                #logger.debug("zf:%s"%zf)
                data = data.decode('utf-8')
-                soup = make_soup(data)
+                # Input is expected to be an epub previously written by FFF, so one
+                # soup parse (dblsoup=False) should suffice and halve processing time.
+                soup = make_soup(data,dblsoup=False)

                chapterorigtitle = None
                tag = soup.find('meta',{'name':'chapterorigtitle'})
@ -458,7 +481,7 @@ def _replace_navxhtml(navxhtmldom,zf,chaptertoctitle):
            # logger.debug("a href=%s label:%s"%(zf,atag.toxml()))
            continue

-def make_soup(data):
+def make_soup(data,dblsoup=True):
    '''
    Convenience method for getting a bs4 soup.  bs3 has been removed.
    '''
@ -473,7 +496,8 @@ def make_soup(data):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        soup = bs4.BeautifulSoup(data,'html5lib')
-        soup = bs4.BeautifulSoup(unicode(soup),'html5lib')
+        if dblsoup:
+            soup = bs4.BeautifulSoup(unicode(soup),'html5lib')

    for ns in soup.find_all('fff_hide_noscript'):
        ns.name = 'noscript'