diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py
index c0634578..688378b5 100644
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@@ -17,6 +17,7 @@
import re
from datetime import datetime, timedelta
+from collections import defaultdict
import logging
import urlparse as up
@@ -36,6 +37,19 @@ from ..configurable import Configurable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
from ..exceptions import InvalidStoryURL
+# quick convenience class
+class TimeKeeper(defaultdict):
+    """Accumulate named ``timedelta`` totals for ad-hoc profiling.
+
+    Keys are timer names and values are running ``timedelta`` totals.
+    A name that has not been seen yet starts at ``timedelta()`` (zero),
+    so ``add`` never needs the key to be initialised first.
+    """
+
+    def __init__(self):
+        defaultdict.__init__(self, timedelta)
+
+    def add(self, name, td):
+        """Add the ``timedelta`` *td* to the total stored under *name*."""
+        self[name] += td
+
+    def __unicode__(self):
+        """Return one ``name: total`` line per timer, sorted by name."""
+        # sorted() accepts any iterable of keys, so this does not rely on
+        # keys() returning a list with an in-place .sort() method.
+        return u"\n".join(u"%s: %s" % (k, self[k]) for k in sorted(self))
+
class BaseSiteAdapter(Configurable):
@classmethod
@@ -72,6 +86,9 @@ class BaseSiteAdapter(Configurable):
self.calibrebookmark = None
self.logfile = None
+ ## for doing some performance profiling.
+ self.times = TimeKeeper()
+
self._setURL(url)
if not self.validateURL():
raise InvalidStoryURL(url,
@@ -193,6 +210,7 @@ class BaseSiteAdapter(Configurable):
if self.logfile:
self.story.logfile = self.logfile
+ logger.debug(u"getStory times:\n%s"%self.times)
return self.story
def getStoryMetadataOnly(self,get_cover=True):
@@ -210,6 +228,7 @@ class BaseSiteAdapter(Configurable):
for index, (title,url) in enumerate(self.chapterUrls):
self.chapterUrls[index] = (title,self.normalize_chapterurl(url))
+ logger.debug(u"getStoryMetadataOnly times:\n%s"%self.times)
return self.story
def setStoryMetadata(self,metahtml):
@@ -357,10 +376,11 @@ class BaseSiteAdapter(Configurable):
# image problems when same chapter URL
# included more than once (base_xenforo
# always_include_first_post setting)
+ self.times.add("utf8FromSoup->copy", datetime.now() - start)
## _do_utf8FromSoup broken out to separate copy & timing and
## allow for inherit override.
retval = self._do_utf8FromSoup(url,soup,fetch,allow_replace_br_with_p)
- logger.debug("utf8FromSoup time:%s"%(datetime.now() - start))
+ self.times.add("utf8FromSoup", datetime.now() - start)
return retval
def _do_utf8FromSoup(self,url,soup,fetch=None,allow_replace_br_with_p=True):
@@ -446,7 +466,9 @@ class BaseSiteAdapter(Configurable):
if self.getConfig("replace_br_with_p") and allow_replace_br_with_p:
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
+ start = datetime.now()
retval = replace_br_with_p(retval)
+ self.times.add("utf8FromSoup->replace_br_with_p", datetime.now() - start)
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
diff --git a/fanficfare/htmlheuristics.py b/fanficfare/htmlheuristics.py
index 13965704..3b67eeee 100644
--- a/fanficfare/htmlheuristics.py
+++ b/fanficfare/htmlheuristics.py
@@ -21,7 +21,6 @@
import re
import codecs
import bs4 as bs
import HtmlTagStack as stack
-from datetime import datetime
from . import exceptions as exceptions
@@ -36,13 +35,6 @@ def replace_br_with_p(body):
logger.debug("replace_br_with_p previously applied, skipping.")
return body
- start = datetime.now()
- retval = _replace_br_with_p(body)
- logger.debug("replace_br_with_p time:%s"%(datetime.now() - start))
- return retval
-
-def _replace_br_with_p(body):
-
# Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
# However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
# .strip() so "\n