# quick convenience class
class TimeKeeper(defaultdict):
    """Accumulate named ``timedelta`` totals for ad-hoc performance profiling.

    Behaves like a dict mapping a label to a running ``timedelta``. Labels
    that have not been seen yet start at ``timedelta()`` (zero) because the
    default factory is ``timedelta``.
    """

    def __init__(self):
        defaultdict.__init__(self, timedelta)

    def add(self, name, td):
        """Add the duration ``td`` to the running total stored under ``name``."""
        self[name] = self[name] + td

    def __unicode__(self):
        """Return one ``name: total`` line per entry, ordered by name."""
        # sorted() instead of keys().sort(): on Python 3 dict.keys() is a
        # view object without a .sort() method.
        return u"\n".join([u"%s: %s" % (k, self[k]) for k in sorted(self)])

    # Python 3 never calls __unicode__, so "%s" formatting would fall back
    # to defaultdict's repr. Alias __str__ only there (str is bytes on
    # Python 2), leaving Python 2's str() behaviour untouched.
    if str is not bytes:
        __str__ = __unicode__
retval = self._do_utf8FromSoup(url,soup,fetch,allow_replace_br_with_p) - logger.debug("utf8FromSoup time:%s"%(datetime.now() - start)) + self.times.add("utf8FromSoup", datetime.now() - start) return retval def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True): @@ -446,7 +466,9 @@ class BaseSiteAdapter(Configurable): if self.getConfig("replace_br_with_p") and allow_replace_br_with_p: # Apply heuristic processing to replace <br> paragraph # breaks with <p> tags. + start = datetime.now() retval = replace_br_with_p(retval) + self.times.add("utf8FromSoup->replace_br_with_p", datetime.now() - start) if self.getConfig('replace_hr'): # replacing a self-closing tag with a container tag in the diff --git a/fanficfare/htmlheuristics.py b/fanficfare/htmlheuristics.py index 13965704..3b67eeee 100644 --- a/fanficfare/htmlheuristics.py +++ b/fanficfare/htmlheuristics.py @@ -21,7 +21,6 @@ import re import codecs import bs4 as bs import HtmlTagStack as stack -from datetime import datetime from . import exceptions as exceptions @@ -36,13 +35,6 @@ def replace_br_with_p(body): logger.debug("replace_br_with_p previously applied, skipping.") return body - start = datetime.now() - retval = _replace_br_with_p(body) - logger.debug("replace_br_with_p time:%s"%(datetime.now() - start)) - return retval - -def _replace_br_with_p(body): - # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160. # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space. # .strip() so "\n

" at beginning is also recognized.