From d335bccd67da45f2fd6b69b81e7e8d6db89fa378 Mon Sep 17 00:00:00 2001
From: ldolse \s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s* )+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
@@ -397,6 +395,8 @@ def __call__(self, html, remove_special_chars=None,
# Un wrap using punctuation
(re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?(i|b|u)>)?\s*( \s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
From 936451853caa1190eff41bf07a28f39005da5fb3 Mon Sep 17 00:00:00 2001
From: ldolse )', re.DOTALL)
+ linere = re.compile('(?<= ]*>\s* |[iub]>\s* \s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P \s*(?P \s*(?P )?'), chap_head),)
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+ length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'), 'median')
if length:
# print "The pdf line length returned is " + str(length)
end_rules.append(
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index f38d02309a..7e85e24a83 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -153,7 +153,6 @@ def __call__(self, html):
###### Unwrap lines ######
#
- self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@@ -168,25 +167,40 @@ def __call__(self, html):
format = 'html'
else:
format = 'html'
-
+ # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+ # more of the lines break in the same region of the document then unwrapping is required
+ hardbreaks = line_length(format, html, .50, 'histogram')
+ print "Hard line breaks check returned "+str(hardbreaks)
# Calculate Length
- length = line_length(format, html, getattr(self.extra_opts,
- 'html_unwrap_factor', 0.4))
+ unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+ length = line_length(format, html, unwrap_factor, 'median')
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
- max_length = length * 1.4
- min_max = str("(?<=.{"+str(length)+"})(?\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+ # Dehyphenate
+ self.log("Unwrapping/Removing hyphens")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
+ self.log("Done dehyphenating")
+ # Unwrap lines using punctuation and line length
+ unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*((p|span|div)>)?\s*(?P )', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P ]*>)\s*(?P ]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s* \s*", "\n ", html)
# detect chapters/sections to match xpath or splitting logic
+ #
+ # Build the Regular Expressions in pieces
+ lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?P ]*>\s* ]*>\s* ]*>\s* tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
- # check if content is in pre tags, use txt procesor to mark up if so
+ # check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
From 301af532c6940ec8082dbe6ece4dca351417ac63 Mon Sep 17 00:00:00 2001
From: ldolse
).*?(?=
)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=)', re.DOTALL)
lines = linere.findall(raw)
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+ if test_type == 'median':
+ lengths = []
+ for line in lines:
+ if len(line) > 0:
+ lengths.append(len(line))
- if not lengths:
- return 0
+ if not lengths:
+ return 0
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
+ lengths = list(set(lengths))
+ total = sum(lengths)
+ avg = total / len(lengths)
+ max_line = avg * 2
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
+ lengths = sorted(lengths)
+ for i in range(len(lengths) - 1, -1, -1):
+ if lengths[i] > max_line:
+ del lengths[i]
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
- index = int(len(lengths) * percent) - 1
+ index = int(len(lengths) * percent) - 1
- return lengths[index]
+ return lengths[index]
+
+ if test_type == 'histogram':
+ minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+ maxLineLength=1900 # Discard larger than this to stay in range
+ buckets=20 # Each line is divided into a bucket based on length
+
+ #print "there are "+str(len(lines))+" lines"
+ max = 0
+ for line in lines:
+ l = len(line)
+ if l > max:
+ max = l
+ print "max line found is "+str(max)
+ # Build the line length histogram
+ hRaw = [ 0 for i in range(0,buckets) ]
+ for line in lines:
+ l = len(line)
+ if l > minLineLength and l < maxLineLength:
+ l = int(l/100)
+ #print "adding "+str(l)
+ hRaw[l]+=1
+
+ # Normalize the histogram into percents
+ totalLines = len(lines)
+ h = [ float(count)/totalLines for count in hRaw ]
+ print "\nhRaw histogram lengths are: "+str(hRaw)
+ print " percents are: "+str(h)+"\n"
+
+ # Find the biggest bucket
+ maxValue = 0
+ peakPosition = 0
+ for i in range(0,len(h)):
+ if h[i] > maxValue:
+ maxValue = h[i]
+ peakPosition = i
+
+ if maxValue < percent:
+ #print "Line lengths are too variable. Not unwrapping."
+ return False
+ else:
+ #print str(maxValue)+" of the lines were in one bucket"
+ return True
class Dehyphenator(object):
'''
@@ -117,7 +165,7 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
@@ -125,34 +173,54 @@ def __init__(self):
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
+ try:
+ wraptags = match.group('wraptags')
+ except:
+ wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
- #print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
- match = booklookup.search(self.html)
- if match:
- #print "returned dehyphenated word: " + str(dehyphenated)
- return dehyphenated
+ print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
+ if self.format == 'html_cleanup':
+ match = booklookup.search(self.html)
+ hyphenmatch = re.search(u'%s' % hyphenated, self.html)
+ if match:
+ print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ elif hyphenmatch:
+ print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+ else:
+ print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ return firsthalf+u'\u2014'+wraptags+secondhalf
+
else:
- #print "returned hyphenated word: " + str(hyphenated)
- return hyphenated
+ match = booklookup.search(self.html)
+ if match:
+ print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ print "returned hyphenated word: " + str(hyphenated)
+ return hyphenated
def __call__(self, html, format, length=1):
self.html = html
+ self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P
]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P\s*(?=[[a-z\d])' % length), lambda match: '')) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 555f42702b..f41f6abd08 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -77,6 +77,11 @@ def no_markup(self, raw, percent): def __call__(self, html): self.log("********* Preprocessing HTML *********") + + # Arrange line feeds and
tags so the line_length and no_markup functions work correctly + html = re.sub(r"\s*", "\n", html) + html = re.sub(r"\s*\s*", "\n
", html) + ###### Check Markup ###### # # some lit files don't have any
tags or equivalent (generally just plain text between @@ -135,9 +140,7 @@ def __call__(self, html): #print "blanks between paragraphs is marked True" else: blanks_between_paragraphs = False - # Arrange line feeds and
tags so the line_length and no_markup functions work correctly - html = re.sub(r"\s*", "\n", html) - html = re.sub(r"\s*\s*", "\n
", html)
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
#
# Build the Regular Expressions in pieces
@@ -160,11 +163,10 @@ def __call__(self, html):
default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
- uppercase_chapters = r"\s*.?([A-Z#\-\s]+)\s*"
+ uppercase_chapters = r"\s*.?([A-Z#\-]+\s{0,3}){1,5}\s*"
chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
- #print chapter_marker
- #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
+ #print chapter_marker
heading = re.compile(' |[iub]>\s* \s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P ]*>\s*
]*>\s*
).*?(?=)', re.DOTALL) + elif format == 'pdf': + linere = re.compile('(?<=\s*(?=[[a-z\d])' % length), lambda match: '')) end_rules.append( # Un wrap using punctuation (re.compile(u'(?<=.{%i}([a-z,:)\IA\u00DF]|(?(i|b|u)>)?\s*(
\s*(?=[[a-z\d])' % length), lambda match: ''))
for rule in self.PREPROCESS + start_rules:
html = rule[0].sub(rule[1], html)
diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py
index 9c57756d28..96df37f631 100644
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@@ -6,7 +6,7 @@
__docformat__ = 'restructuredtext en'
import re
-from calibre.ebooks.conversion.preprocess import line_length, Dehyphenator
+from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
class PreProcessor(object):
@@ -204,11 +204,12 @@ def __call__(self, html):
format = 'html'
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required
- hardbreaks = line_length(format, html, .50, 'histogram')
- #print "Hard line breaks check returned "+str(hardbreaks)
+ docanalysis = DocAnalysis(format, html)
+ hardbreaks = docanalysis.line_histogram(.50)
+ self.log("Hard line breaks check returned "+str(hardbreaks))
# Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
- length = line_length(format, html, unwrap_factor, 'median')
+ length = docanalysis.line_length(unwrap_factor)
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
From b7f6d820a77c64ae15139ea80870f64922b10823 Mon Sep 17 00:00:00 2001
From: ldolse ]*>\s*