Consolidate and aggregate times for perf prof.

2025-12-06 08:52:55 +01:00 · 2017-07-19 17:19:13 -05:00 · 2017-07-19 17:19:13 -05:00 · 7adc1b4d54
commit 7adc1b4d54
parent 09c684d744
2 changed files with 23 additions and 9 deletions
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@@ -17,6 +17,7 @@

 import re
 from datetime import datetime, timedelta
+from collections import defaultdict

 import logging
 import urlparse as up
@@ -36,6 +37,19 @@ from ..configurable import Configurable
 from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
 from ..exceptions import InvalidStoryURL

+# Convenience accumulator: maps a timer name to its running timedelta total.
+class TimeKeeper(defaultdict):
+    def __init__(self):
+        defaultdict.__init__(self, timedelta)
+
+    def add(self, name, td):
+        self[name] = self[name] + td
+
+    def __unicode__(self):
+        keys = self.keys()
+        keys.sort()
+        return u"\n".join([ u"%s: %s"%(k,self[k]) for k in keys ])
+
 class BaseSiteAdapter(Configurable):

    @classmethod
@@ -72,6 +86,9 @@ class BaseSiteAdapter(Configurable):
        self.calibrebookmark = None
        self.logfile = None

+        ## Accumulates named timings for performance profiling.
+        self.times = TimeKeeper()
+
        self._setURL(url)
        if not self.validateURL():
            raise InvalidStoryURL(url,
@@ -193,6 +210,7 @@ class BaseSiteAdapter(Configurable):
            if self.logfile:
                self.story.logfile = self.logfile

+        logger.debug(u"getStory times:\n%s"%self.times)
        return self.story

    def getStoryMetadataOnly(self,get_cover=True):
@@ -210,6 +228,7 @@ class BaseSiteAdapter(Configurable):
            for index, (title,url) in enumerate(self.chapterUrls):
                self.chapterUrls[index] = (title,self.normalize_chapterurl(url))

+        logger.debug(u"getStoryMetadataOnly times:\n%s"%self.times)
        return self.story

    def setStoryMetadata(self,metahtml):
@@ -357,10 +376,11 @@ class BaseSiteAdapter(Configurable):
                               # image problems when same chapter URL
                               # included more than once (base_xenforo
                               # always_include_first_post setting)
+        self.times.add("utf8FromSoup->copy", datetime.now() - start)
        ## _do_utf8FromSoup broken out to separate copy & timing and
        ## allow for inherit override.
        retval = self._do_utf8FromSoup(url,soup,fetch,allow_replace_br_with_p)
-        logger.debug("utf8FromSoup time:%s"%(datetime.now() - start))
+        self.times.add("utf8FromSoup", datetime.now() - start)
        return retval

    def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
@@ -446,7 +466,9 @@ class BaseSiteAdapter(Configurable):
        if self.getConfig("replace_br_with_p") and allow_replace_br_with_p:
            # Apply heuristic processing to replace <br> paragraph
            # breaks with <p> tags.
+            start = datetime.now()
            retval = replace_br_with_p(retval)
+            self.times.add("utf8FromSoup->replace_br_with_p", datetime.now() - start)

        if self.getConfig('replace_hr'):
            # replacing a self-closing tag with a container tag in the
--- a/fanficfare/htmlheuristics.py
+++ b/fanficfare/htmlheuristics.py
@@ -21,7 +21,6 @@ import re
 import codecs
 import bs4 as bs
 import HtmlTagStack as stack
-from datetime import datetime

 from . import exceptions as exceptions

@@ -36,13 +35,6 @@ def replace_br_with_p(body):
        logger.debug("replace_br_with_p previously applied, skipping.")
        return body

-    start = datetime.now()
-    retval = _replace_br_with_p(body)
-    logger.debug("replace_br_with_p time:%s"%(datetime.now() - start))
-    return retval
-
-def _replace_br_with_p(body):
-
    # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
    # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
    # .strip() so "\n<div>" at beginning is also recognized.