Consolidate and aggregate timings for performance profiling.

This commit is contained in:
Jim Miller 2017-07-19 17:19:13 -05:00
parent 09c684d744
commit 7adc1b4d54
2 changed files with 23 additions and 9 deletions

View file

@ -17,6 +17,7 @@
import re
from datetime import datetime, timedelta
from collections import defaultdict
import logging
import urlparse as up
@ -36,6 +37,19 @@ from ..configurable import Configurable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
from ..exceptions import InvalidStoryURL
# quick convenience class
class TimeKeeper(defaultdict):
    """Accumulate elapsed time per label for ad-hoc performance profiling.

    A ``defaultdict`` whose missing entries start out as a zero
    ``timedelta``, so a label can be added to without being initialised.
    """

    def __init__(self):
        defaultdict.__init__(self, timedelta)

    def add(self, name, td):
        """Add the ``timedelta`` *td* to the running total kept under *name*."""
        self[name] += td

    def __unicode__(self):
        """Return one ``name: total`` line per label, sorted by label."""
        # sorted() instead of keys() + .sort(): keys() is only a sortable
        # list on Python 2; on Python 3 it is a view with no .sort().
        return u"\n".join(u"%s: %s" % (k, self[k]) for k in sorted(self))

    # Python 3 never calls __unicode__; alias it so "%s" % timekeeper
    # produces the same report there instead of the raw defaultdict repr.
    __str__ = __unicode__
class BaseSiteAdapter(Configurable):
@classmethod
@ -72,6 +86,9 @@ class BaseSiteAdapter(Configurable):
self.calibrebookmark = None
self.logfile = None
## for doing some performance profiling.
self.times = TimeKeeper()
self._setURL(url)
if not self.validateURL():
raise InvalidStoryURL(url,
@ -193,6 +210,7 @@ class BaseSiteAdapter(Configurable):
if self.logfile:
self.story.logfile = self.logfile
logger.debug(u"getStory times:\n%s"%self.times)
return self.story
def getStoryMetadataOnly(self,get_cover=True):
@ -210,6 +228,7 @@ class BaseSiteAdapter(Configurable):
for index, (title,url) in enumerate(self.chapterUrls):
self.chapterUrls[index] = (title,self.normalize_chapterurl(url))
logger.debug(u"getStoryMetadataOnly times:\n%s"%self.times)
return self.story
def setStoryMetadata(self,metahtml):
@ -357,10 +376,11 @@ class BaseSiteAdapter(Configurable):
# image problems when same chapter URL
# included more than once (base_xenforo
# always_include_first_post setting)
self.times.add("utf8FromSoup->copy", datetime.now() - start)
## _do_utf8FromSoup is broken out so the copy & timing stay here,
## separate from the conversion, and so subclasses can override it.
retval = self._do_utf8FromSoup(url,soup,fetch,allow_replace_br_with_p)
logger.debug("utf8FromSoup time:%s"%(datetime.now() - start))
self.times.add("utf8FromSoup", datetime.now() - start)
return retval
def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
@ -446,7 +466,9 @@ class BaseSiteAdapter(Configurable):
if self.getConfig("replace_br_with_p") and allow_replace_br_with_p:
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
start = datetime.now()
retval = replace_br_with_p(retval)
self.times.add("utf8FromSoup->replace_br_with_p", datetime.now() - start)
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the

View file

@ -21,7 +21,6 @@ import re
import codecs
import bs4 as bs
import HtmlTagStack as stack
from datetime import datetime
from . import exceptions as exceptions
@ -36,13 +35,6 @@ def replace_br_with_p(body):
logger.debug("replace_br_with_p previously applied, skipping.")
return body
start = datetime.now()
retval = _replace_br_with_p(body)
logger.debug("replace_br_with_p time:%s"%(datetime.now() - start))
return retval
def _replace_br_with_p(body):
# Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
# However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
# .strip() so "\n<div>" at beginning is also recognized.