diff --git a/resources/recipes/ajc.recipe b/resources/recipes/ajc.recipe
index ccd0efebdd..ea989b4b4c 100644
--- a/resources/recipes/ajc.recipe
+++ b/resources/recipes/ajc.recipe
@@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__author__ = 'Tony Stegall'
+__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.03'
__date__ = '27, September 2010'
@@ -9,6 +9,8 @@
import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
now = datetime.datetime.now()
title = 'The AJC'
@@ -20,39 +22,39 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
-
+
masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
extra_css = '''
h1.articleHeadline{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2.articleSubheadline{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
-
+
p.byline{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
p.organization{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
-
-
+
+
p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
-
-
+
+
keep_only_tags = [
dict(name='div', attrs={'class':['cxArticleHeader']})
,dict(attrs={'id':['cxArticleText']})
]
-
-
+
+
remove_tags = [
dict(name='div' , attrs={'class':'cxArticleList' })
,dict(name='div' , attrs={'class':'cxFeedTease' })
,dict(name='div' , attrs={'class':'cxElementEnlarge' })
,dict(name='div' , attrs={'id':'cxArticleTools' })
]
-
-
-
+
+
+
feeds = [
('Breaking News', 'http://www.ajc.com/genericList-rss.do?source=61499'),
# -------------------------------------------------------------------
- # Here are the different area feeds. Choose which ever one you wish to
+    # Here are the different area feeds. Choose whichever one you wish to
# read by simply removing the pound sign from it. I currently have it
# set to only get the Cobb area
# --------------------------------------------------------------------
@@ -70,7 +72,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
('Opinions', 'http://www.ajc.com/section-rss.do?source=opinion'),
('Ga Politics', 'http://www.ajc.com/section-rss.do?source=georgia-politics-elections'),
# ------------------------------------------------------------------------
- # Here are the different sports feeds. I only follow the Falcons, and Highschool
+    # Here are the different sports feeds. I only follow the Falcons and high school
# but again
# You can enable which ever team you like by removing the pound sign
# ------------------------------------------------------------------------
@@ -85,25 +87,25 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
('Music', 'http://www.accessatlanta.com/section-rss.do?source=music'),
]
-
+
def postprocess_html(self, soup, first):
for credit_tag in soup.findAll('span', attrs={'class':['imageCredit rightFloat']}):
credit_tag.extract()
-
+
return soup
-
+
#def print_version(self, url):
# return url.partition('?')[0] +'?printArticle=y'
-
-
-
-
-
-
+
+
+
+
+
+
diff --git a/resources/recipes/boortz.recipe b/resources/recipes/boortz.recipe
index dfb624c4bc..b281798ac8 100644
--- a/resources/recipes/boortz.recipe
+++ b/resources/recipes/boortz.recipe
@@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
-__author__ = 'Tony Stegall'
+__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1.04'
__date__ = '27, September 2010'
@@ -8,7 +8,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
+
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
title = 'Nealz Nuze'
language = 'en'
@@ -18,7 +18,7 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
category = 'news, politics, USA, talkshow'
oldest_article = 1
max_articles_per_feed = 100
-
+
no_stylesheets = True
remove_javascript = True
use_embedded_content = True
@@ -26,5 +26,5 @@ class AdvancedUserRecipe1282101454(BasicNewsRecipe):
conversion_options = {'linearize_tables' : True}
feeds = [
('NUZE', 'http://boortz.com/nealz_nuze_rss/rss.xml')
-
+
]
diff --git a/resources/recipes/popscience.recipe b/resources/recipes/popscience.recipe
index 1527a1bb71..5f66d048a6 100644
--- a/resources/recipes/popscience.recipe
+++ b/resources/recipes/popscience.recipe
@@ -1,5 +1,5 @@
from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import re
+import re
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
title = 'Popular Science'
diff --git a/resources/recipes/telegraph_uk.recipe b/resources/recipes/telegraph_uk.recipe
index 2c261987b2..f79f0fa50c 100644
--- a/resources/recipes/telegraph_uk.recipe
+++ b/resources/recipes/telegraph_uk.recipe
@@ -1,6 +1,5 @@
-#!/usr/bin/env python
__license__ = 'GPL v3'
-__copyright__ = '2008, Darko Miletic )', re.DOTALL)
- elif format == 'pdf':
- linere = re.compile('(?<= ]*>\s* |[iub]>\s* \s*<[iub]>)\s*(?P |[iub]>\s* \s*<[iub]>)\s*(?P \s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens
end_rules.append((re.compile(u'[](\s* )+\s*(?=[[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
@@ -391,12 +464,15 @@ def __call__(self, html, remove_special_chars=None,
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
- length = line_length('pdf', html, getattr(self.extra_opts, 'unwrap_factor'))
+ docanalysis = DocAnalysis('pdf', html)
+ length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
if length:
- # print "The pdf line length returned is " + str(length)
+ #print "The pdf line length returned is " + str(length)
+ # unwrap em/en dashes
+ end_rules.append((re.compile(u'(?<=.{%i}[–—])\s* \s*(?=[[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
- (re.compile(r'(?<=.{%i}([a-z,:)\IA]|(?(i|b|u)>)?\s*(
).*?(?=
)', re.DOTALL)
- elif format == 'spanned_html':
- linere = re.compile('(?<=)', re.DOTALL)
- lines = linere.findall(raw)
- lengths = []
- for line in lines:
- if len(line) > 0:
- lengths.append(len(line))
+ def __init__(self, format='html', raw=''):
+ raw = raw.replace(' ', ' ')
+ if format == 'html':
+ linere = re.compile('(?<=
)(?!\s*
).*?(?=
)', re.DOTALL)
+ elif format == 'spanned_html':
+ linere = re.compile('(?<=)', re.DOTALL)
+ self.lines = linere.findall(raw)
- if not lengths:
- return 0
+ def line_length(self, percent):
+ '''
+ Analyses the document to find the median line length.
+        percent is a decimal number, 0 - 1 which is used to determine
+        how far in the list of line lengths to use. The list of line lengths is
+        ordered smallest to largest and does not include duplicates. 0.5 is the
+        median value.
+ '''
+ lengths = []
+ for line in self.lines:
+ if len(line) > 0:
+ lengths.append(len(line))
- lengths = list(set(lengths))
- total = sum(lengths)
- avg = total / len(lengths)
- max_line = avg * 2
+ if not lengths:
+ return 0
- lengths = sorted(lengths)
- for i in range(len(lengths) - 1, -1, -1):
- if lengths[i] > max_line:
- del lengths[i]
+ lengths = list(set(lengths))
+ total = sum(lengths)
+ avg = total / len(lengths)
+ max_line = avg * 2
- if percent > 1:
- percent = 1
- if percent < 0:
- percent = 0
+ lengths = sorted(lengths)
+ for i in range(len(lengths) - 1, -1, -1):
+ if lengths[i] > max_line:
+ del lengths[i]
- index = int(len(lengths) * percent) - 1
+ if percent > 1:
+ percent = 1
+ if percent < 0:
+ percent = 0
- return lengths[index]
+ index = int(len(lengths) * percent) - 1
+
+ return lengths[index]
+
+ def line_histogram(self, percent):
+ '''
+ Creates a broad histogram of the document to determine whether it incorporates hard
+ line breaks. Lines are sorted into 20 'buckets' based on length.
+ percent is the percentage of lines that should be in a single bucket to return true
+ The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
+ '''
+ minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
+ maxLineLength=1900 # Discard larger than this to stay in range
+ buckets=20 # Each line is divided into a bucket based on length
+
+ #print "there are "+str(len(lines))+" lines"
+ #max = 0
+ #for line in self.lines:
+ # l = len(line)
+ # if l > max:
+ # max = l
+ #print "max line found is "+str(max)
+ # Build the line length histogram
+ hRaw = [ 0 for i in range(0,buckets) ]
+ for line in self.lines:
+ l = len(line)
+ if l > minLineLength and l < maxLineLength:
+ l = int(l/100)
+ #print "adding "+str(l)
+ hRaw[l]+=1
+
+ # Normalize the histogram into percents
+ totalLines = len(self.lines)
+ h = [ float(count)/totalLines for count in hRaw ]
+ #print "\nhRaw histogram lengths are: "+str(hRaw)
+ #print " percents are: "+str(h)+"\n"
+
+ # Find the biggest bucket
+ maxValue = 0
+ for i in range(0,len(h)):
+ if h[i] > maxValue:
+ maxValue = h[i]
+
+ if maxValue < percent:
+ #print "Line lengths are too variable. Not unwrapping."
+ return False
+ else:
+ #print str(maxValue)+" of the lines were in one bucket"
+ return True
class Dehyphenator(object):
'''
@@ -117,42 +172,62 @@ class Dehyphenator(object):
def __init__(self):
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
- self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)ion(s|al(ly)?)?|ings?|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
+ self.removesuffixes = re.compile(r"((ed)?ly|('e)?s|a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|ment(s)?|ism|ated|(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex)$", re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
- self.prefixes = re.compile(r'^(un|in|ex)$', re.IGNORECASE)
- self.removeprefix = re.compile(r'^(un|in|ex)', re.IGNORECASE)
+ self.prefixes = re.compile(r'^(dis|re|un|in|ex)$', re.IGNORECASE)
+ self.removeprefix = re.compile(r'^(dis|re|un|in|ex)', re.IGNORECASE)
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
+ try:
+ wraptags = match.group('wraptags')
+ except:
+ wraptags = ''
hyphenated = str(firsthalf) + "-" + str(secondhalf)
dehyphenated = str(firsthalf) + str(secondhalf)
lookupword = self.removesuffixes.sub('', dehyphenated)
if self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
- booklookup = re.compile(u'%s' % lookupword, re.IGNORECASE)
#print "lookup word is: "+str(lookupword)+", orig is: " + str(hyphenated)
- match = booklookup.search(self.html)
- if match:
- #print "returned dehyphenated word: " + str(dehyphenated)
- return dehyphenated
- else:
- #print "returned hyphenated word: " + str(hyphenated)
+ try:
+ searchresult = self.html.find(str.lower(lookupword))
+ except:
return hyphenated
+ if self.format == 'html_cleanup':
+ if self.html.find(lookupword) != -1 or searchresult != -1:
+ #print "Cleanup:returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ elif self.html.find(hyphenated) != -1:
+ #print "Cleanup:returned hyphenated word: " + str(hyphenated)
+ return hyphenated
+ else:
+ #print "Cleanup:returning original text "+str(firsthalf)+" + linefeed "+str(secondhalf)
+ return firsthalf+u'\u2014'+wraptags+secondhalf
+
+ else:
+ if self.html.find(lookupword) != -1 or searchresult != -1:
+ #print "returned dehyphenated word: " + str(dehyphenated)
+ return dehyphenated
+ else:
+ #print " returned hyphenated word: " + str(hyphenated)
+ return hyphenated
def __call__(self, html, format, length=1):
self.html = html
+ self.format = format
if format == 'html':
- intextmatch = re.compile(u'(?<=.{%i})(?P
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
+ (re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
)?', re.IGNORECASE), chap_head),
# Cover the case where every letter in a chapter title is separated by a space
(re.compile(r'
\s*(?P
\s*){1,3}\s*(?P
))?'), chap_head),
@@ -374,10 +449,8 @@ def __call__(self, html, remove_special_chars=None,
print 'Failed to parse remove_footer regexp'
traceback.print_exc()
- # unwrap em/en dashes, delete soft hyphens - moved here so it's executed after header/footer removal
+ # delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
- # unwrap em/en dashes
- end_rules.append((re.compile(u'(?<=[–—])\s*
\s*", "\n
", html) + ###### Check Markup ###### # # some lit files don't have any
tags or equivalent (generally just plain text between #
tags), check and mark up line endings if required before proceeding
if self.no_markup(html, 0.1):
self.log("not enough paragraph markers, adding now")
- # check if content is in pre tags, use txt procesor to mark up if so
+ # check if content is in pre tags, use txt processor to mark up if so
pre = re.compile(r'', re.IGNORECASE)
if len(pre.findall(html)) == 1:
self.log("Running Text Processing")
@@ -113,47 +118,77 @@ def __call__(self, html):
# Get rid of empty tags to simplify other processing
html = re.sub(ur'\s*\s* ', ' ', html)
# Get rid of empty span, bold, & italics tags
- html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
+ html = re.sub(r"\s*]*>\s*(]*>\s*){0,2}\s*\s*", " ", html)
html = re.sub(r"\s*<[ibu][^>]*>\s*(<[ibu][^>]*>\s*[ibu]>\s*){0,2}\s*[ibu]>", " ", html)
html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
- # If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
+ # If more than 40% of the lines are empty paragraphs and the user has enabled remove
+ # paragraph spacing then delete blank lines to clean up spacing
linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
+ blanks_between_paragraphs = False
if len(lines) > 1:
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
- # Arrange line feeds and
tags so the line_length and no_markup functions work correctly
- html = re.sub(r"\s*", "\n", html)
- html = re.sub(r"\s*\s*", "\n
", html)
+ elif float(len(blanklines)) / float(len(lines)) > 0.40:
+ blanks_between_paragraphs = True
+ #print "blanks between paragraphs is marked True"
+ else:
+ blanks_between_paragraphs = False
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
+ #
+ # Build the Regular Expressions in pieces
+ lookahead = "(?=<(p|div))"
+ chapter_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ chapter_header_open = r"(?P"
+ chapter_header_close = ")\s*"
+ chapter_line_close = "((?P=inner3)>)?\s*((?P=inner2)>)?\s*((?P=inner1)\s[^>]*>)?\s*(?P=outer)>\s*"
+ if blanks_between_paragraphs:
+ blank_lines = "(\s*]*>\s*
){0,2}\s*"
+ else:
+ blank_lines = ""
+ opt_title_open = "("
+ title_line_open = "<(?Pp|div)[^>]*>\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*(<(?Pfont|span|[ibu])[^>]*>)?\s*"
+ title_header_open = "(?P"
+ title_header_close = ")\s*"
+ title_line_close = "((?P=inner6)>)?\s*((?P=inner5)>)?\s*((?P=inner4)\s[^>]*>)?\s*(?P=outer2)>"
+ opt_title_close = ")?"
+
+ default_title = r"(\s*[\w\'\"-]+){1,5}(?!<)"
+ typical_chapters = r".?(Introduction|Synopsis|Acknowledgements|Chapter|Kapitel|Epilogue|Volume\s|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,4}"
+ numeric_chapters = r".?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*"
+ uppercase_chapters = r"\s*.?([A-Z#]+(\s|-){0,3}){1,5}\s*"
+
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+typical_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ #print chapter_marker
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log("found " + str(self.html_preprocess_sections) + " pre-existing headings")
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*( )?s*([ibu]>){0,2}\s*()?\s*((p|/?br)>)\s*\s*(\s*]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE|re.VERBOSE)
+ chapdetect = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?(\d+\.?|(CHAPTER\s*([\dA-Z\-\'\"\?\.!#,]+\s*){1,10}))\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+numeric_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.IGNORECASE)
html = chapdetect2.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}\s*.?([A-Z#\-\s]+)\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu][^>]*>){0,2}\s*(]*>)?\s*(?P(<[ibu][^>]*>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapter_marker = lookahead+chapter_line_open+chapter_header_open+uppercase_chapters+chapter_header_close+chapter_line_close+blank_lines+opt_title_open+title_line_open+title_header_open+default_title+title_header_close+title_line_close+opt_title_close
+ chapdetect2 = re.compile(r'%s' % chapter_marker, re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
-
###### Unwrap lines ######
#
- self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
@@ -168,25 +203,40 @@ def __call__(self, html):
format = 'html'
else:
format = 'html'
-
+ # Check Line histogram to determine if the document uses hard line breaks, If 50% or
+ # more of the lines break in the same region of the document then unwrapping is required
+ docanalysis = DocAnalysis(format, html)
+ hardbreaks = docanalysis.line_histogram(.50)
+ self.log("Hard line breaks check returned "+str(hardbreaks))
# Calculate Length
- length = line_length(format, html, getattr(self.extra_opts,
- 'html_unwrap_factor', 0.4))
+ unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
+ length = docanalysis.line_length(unwrap_factor)
self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
- max_length = length * 1.4
- min_max = str("(?<=.{"+str(length)+"})(?\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
- html = re.sub(u'%s(?<=[\u2013\u2014])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % min_max, '', html)
- # Dehyphenate
- dehyphenator = Dehyphenator()
- html = dehyphenator(html,'html', length)
+ # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
+ if hardbreaks or unwrap_factor < 0.4:
+ self.log("Unwrapping required, unwrapping Lines")
+ # Unwrap em/en dashes
+ html = re.sub(u'(?<=.{%i}[\u2013\u2014])\s*(?=<)(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*(?=[[a-z\d])' % length, '', html)
+ # Dehyphenate
+ self.log("Unwrapping/Removing hyphens")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html', length)
+ self.log("Done dehyphenating")
+            # Unwrap lines using punctuation and line length
+ unwrap = re.compile(u"(?<=.{%i}([a-z,:)\IA\u00DF]|(?\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
+ html = unwrap.sub(' ', html)
+ #check any remaining hyphens, but only unwrap if there is a match
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html_cleanup', length)
+ else:
+ # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
+ self.log("Cleaning up hyphenation")
+ dehyphenator = Dehyphenator()
+ html = dehyphenator(html,'html_cleanup', length)
+ self.log("Done dehyphenating")
- # Unwrap lines using punctation and line length
- unwrap = re.compile(r"(?<=.{%i}([a-z,;):\IA]|(?\s*((p|span|div)>)?\s*(?P<(p|span|div)[^>]*>\s*(<(p|span|div)[^>]*>\s*(span|p|div)>\s*)(span|p|div)>\s*){0,3}\s*<(span|div|p)[^>]*>\s*(<(span|div|p)[^>]*>)?\s*" % length, re.UNICODE)
- html = unwrap.sub(' ', html)
+ # delete soft hyphens
+ html = re.sub(u'\xad\s*( \s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index b05444c1c6..2f6fb46540 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -9,6 +9,7 @@
from functools import partial
from threading import Thread, Event
from Queue import Queue, Empty
+from lxml import etree
import mechanize
@@ -216,6 +217,68 @@ def download_covers(mi, result_queue, max_covers=50, timeout=5.): # {{{
# }}}
+class DoubanCovers(CoverDownload): # {{{
+ 'Download covers from Douban.com'
+
+ DOUBAN_ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
+ CALIBRE_DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
+ name = 'Douban.com covers'
+ description = _('Download covers from Douban.com')
+ author = 'Li Fanxi'
+
+ def get_cover_url(self, isbn, br, timeout=5.):
+ try:
+ url = self.DOUBAN_ISBN_URL + isbn + "?apikey=" + self.CALIBRE_DOUBAN_API_KEY
+ src = br.open(url, timeout=timeout).read()
+ except Exception, err:
+ if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
+ err = Exception(_('Douban.com API timed out. Try again later.'))
+ raise err
+ else:
+ feed = etree.fromstring(src)
+ NAMESPACES = {
+ 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
+ 'atom' : 'http://www.w3.org/2005/Atom',
+ 'db': 'http://www.douban.com/xmlns/'
+ }
+ XPath = partial(etree.XPath, namespaces=NAMESPACES)
+ entries = XPath('//atom:entry')(feed)
+ if len(entries) < 1:
+ return None
+ try:
+ cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
+ u = cover_url(entries[0])[0].replace('/spic/', '/lpic/');
+ # If URL contains "book-default", the book doesn't have a cover
+ if u.find('book-default') != -1:
+ return None
+ except:
+ return None
+ return u
+
+ def has_cover(self, mi, ans, timeout=5.):
+ if not mi.isbn:
+ return False
+ br = browser()
+ try:
+ if self.get_cover_url(mi.isbn, br, timeout=timeout) != None:
+ self.debug('cover for', mi.isbn, 'found')
+ ans.set()
+ except Exception, e:
+ self.debug(e)
+
+ def get_covers(self, mi, result_queue, abort, timeout=5.):
+ if not mi.isbn:
+ return
+ br = browser()
+ try:
+ url = self.get_cover_url(mi.isbn, br, timeout=timeout)
+ cover_data = br.open_novisit(url).read()
+ result_queue.put((True, cover_data, 'jpg', self.name))
+ except Exception, e:
+ result_queue.put((False, self.exception_to_string(e),
+ traceback.format_exc(), self.name))
+# }}}
+
def download_cover(mi, timeout=5.): # {{{
results = Queue()
download_covers(mi, results, max_covers=1, timeout=timeout)
diff --git a/src/calibre/ebooks/metadata/meta.py b/src/calibre/ebooks/metadata/meta.py
index 68deca5e10..b02ae2dbff 100644
--- a/src/calibre/ebooks/metadata/meta.py
+++ b/src/calibre/ebooks/metadata/meta.py
@@ -181,7 +181,7 @@ def swap(a):
mi.isbn = si
except (IndexError, ValueError):
pass
- if not mi.title:
+ if mi.is_null('title'):
mi.title = name
return mi
diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py
index bd9728989b..cc74b3c515 100644
--- a/src/calibre/gui2/actions/edit_metadata.py
+++ b/src/calibre/gui2/actions/edit_metadata.py
@@ -184,7 +184,7 @@ def edit_bulk_metadata(self, checked):
self.gui.tags_view.blockSignals(True)
try:
changed = MetadataBulkDialog(self.gui, rows,
- self.gui.library_view.model().db).changed
+ self.gui.library_view.model()).changed
finally:
self.gui.tags_view.blockSignals(False)
if changed:
diff --git a/src/calibre/gui2/dialogs/metadata_bulk.py b/src/calibre/gui2/dialogs/metadata_bulk.py
index 9c83b3aee5..b0ce0a1e6d 100644
--- a/src/calibre/gui2/dialogs/metadata_bulk.py
+++ b/src/calibre/gui2/dialogs/metadata_bulk.py
@@ -142,12 +142,13 @@ class MetadataBulkDialog(QDialog, Ui_MetadataBulkDialog):
_('Append to field'),
]
- def __init__(self, window, rows, db):
+ def __init__(self, window, rows, model):
QDialog.__init__(self, window)
Ui_MetadataBulkDialog.__init__(self)
self.setupUi(self)
- self.db = db
- self.ids = [db.id(r) for r in rows]
+ self.model = model
+ self.db = model.db
+ self.ids = [self.db.id(r) for r in rows]
self.box_title.setText('' +
_('Editing meta information for %d books') %
len(rows))
@@ -170,7 +171,7 @@ def __init__(self, window, rows, db):
self.tag_editor_button.clicked.connect(self.tag_editor)
self.autonumber_series.stateChanged[int].connect(self.auto_number_changed)
- if len(db.custom_field_keys(include_composites=False)) == 0:
+ if len(self.db.custom_field_keys(include_composites=False)) == 0:
self.central_widget.removeTab(1)
else:
self.create_custom_column_editors()
@@ -617,8 +618,15 @@ def accept(self):
self.worker = Worker(args, self.db, self.ids,
getattr(self, 'custom_column_widgets', []),
Dispatcher(bb.accept, parent=bb))
- self.worker.start()
- bb.exec_()
+
+ # The metadata backup thread causes database commits
+ # which can slow down bulk editing of large numbers of books
+ self.model.stop_metadata_backup()
+ try:
+ self.worker.start()
+ bb.exec_()
+ finally:
+ self.model.start_metadata_backup()
if self.worker.error is not None:
return error_dialog(self, _('Failed'),
diff --git a/src/calibre/gui2/dialogs/scheduler.py b/src/calibre/gui2/dialogs/scheduler.py
index fd8184933f..30f4a2d8a2 100644
--- a/src/calibre/gui2/dialogs/scheduler.py
+++ b/src/calibre/gui2/dialogs/scheduler.py
@@ -57,6 +57,10 @@ def __init__(self, recipe_model, parent=None):
self.old_news.setValue(gconf['oldest_news'])
+ def keyPressEvent(self, ev):
+ if ev.key() not in (Qt.Key_Enter, Qt.Key_Return):
+ return QDialog.keyPressEvent(self, ev)
+
def break_cycles(self):
self.disconnect(self.recipe_model, SIGNAL('searched(PyQt_PyObject)'),
self.search_done)
diff --git a/src/calibre/gui2/library/models.py b/src/calibre/gui2/library/models.py
index b2a7f08055..9da5420681 100644
--- a/src/calibre/gui2/library/models.py
+++ b/src/calibre/gui2/library/models.py
@@ -159,17 +159,24 @@ def col_idx(name):
# do something on the GUI thread. Deadlock.
self.cover_cache = CoverCache(db, FunctionDispatcher(self.db.cover))
self.cover_cache.start()
- if self.metadata_backup is not None:
- self.metadata_backup.stop()
- # Would like to to a join here, but the thread might be waiting to
- # do something on the GUI thread. Deadlock.
- self.metadata_backup = MetadataBackup(db)
- self.metadata_backup.start()
+ self.stop_metadata_backup()
+ self.start_metadata_backup()
def refresh_cover(event, ids):
if event == 'cover' and self.cover_cache is not None:
self.cover_cache.refresh(ids)
db.add_listener(refresh_cover)
+ def start_metadata_backup(self):
+ self.metadata_backup = MetadataBackup(self.db)
+ self.metadata_backup.start()
+
+ def stop_metadata_backup(self):
+ if getattr(self, 'metadata_backup', None) is not None:
+ self.metadata_backup.stop()
+            # Would like to do a join here, but the thread might be waiting to
+ # do something on the GUI thread. Deadlock.
+
+
def refresh_ids(self, ids, current_row=-1):
rows = self.db.refresh_ids(ids)
if rows:
diff --git a/src/calibre/gui2/preferences/misc.py b/src/calibre/gui2/preferences/misc.py
index 865115c2ed..582d110c6c 100644
--- a/src/calibre/gui2/preferences/misc.py
+++ b/src/calibre/gui2/preferences/misc.py
@@ -106,14 +106,13 @@ def debug_device_detection(self, *args):
d.exec_()
def compact(self, *args):
- from calibre.library.caches import MetadataBackup
m = self.gui.library_view.model()
- if m.metadata_backup is not None:
- m.metadata_backup.stop()
- d = CheckIntegrity(m.db, self)
- d.exec_()
- m.metadata_backup = MetadataBackup(m.db)
- m.metadata_backup.start()
+ m.stop_metadata_backup()
+ try:
+ d = CheckIntegrity(m.db, self)
+ d.exec_()
+ finally:
+ m.start_metadata_backup()
def open_config_dir(self, *args):
from calibre.utils.config import config_dir
diff --git a/src/calibre/gui2/tools.py b/src/calibre/gui2/tools.py
index 2f0452a773..c068168247 100644
--- a/src/calibre/gui2/tools.py
+++ b/src/calibre/gui2/tools.py
@@ -217,9 +217,12 @@ def fetch_scheduled_recipe(arg):
if 'output_profile' in ps:
recs.append(('output_profile', ps['output_profile'],
OptionRecommendation.HIGH))
- if ps['output_profile'] == 'kindle':
- recs.append(('no_inline_toc', True,
- OptionRecommendation.HIGH))
+ # Disabled since apparently some people use
+ # K4PC and, surprise, surprise, it doesn't support
+ # indexed MOBIs.
+ #if ps['output_profile'] == 'kindle':
+ # recs.append(('no_inline_toc', True,
+ # OptionRecommendation.HIGH))
lf = load_defaults('look_and_feel')
if lf.get('base_font_size', 0.0) != 0.0:
diff --git a/src/calibre/gui2/wizard/__init__.py b/src/calibre/gui2/wizard/__init__.py
index f3234d48d5..37b7c7bd7c 100644
--- a/src/calibre/gui2/wizard/__init__.py
+++ b/src/calibre/gui2/wizard/__init__.py
@@ -73,6 +73,14 @@ class JetBook(Device):
manufacturer = 'Ectaco'
id = 'jetbook'
+class JetBookMini(Device):
+
+ output_profile = 'jetbook5'
+ output_format = 'FB2'
+ name = 'JetBook Mini'
+ manufacturer = 'Ectaco'
+ id = 'jetbookmini'
+
class KindleDX(Kindle):
output_profile = 'kindle_dx'
@@ -584,12 +592,42 @@ def change_language(self, idx):
qt_app.load_translations()
self.emit(SIGNAL('retranslate()'))
self.init_languages()
+ try:
+ if prefs['language'].lower().startswith('zh'):
+ from calibre.customize.ui import enable_plugin
+ for name in ('Douban Books', 'Douban.com covers'):
+ enable_plugin(name)
+ except:
+ pass
+
+ def is_library_dir_suitable(self, x):
+ return LibraryDatabase2.exists_at(x) or not os.listdir(x)
+
+ def validatePage(self):
+ newloc = unicode(self.location.text())
+ if not self.is_library_dir_suitable(newloc):
+ self.show_library_dir_error(newloc)
+ return False
+ return True
def change(self):
- dir = choose_dir(self, 'database location dialog',
+ x = choose_dir(self, 'database location dialog',
_('Select location for books'))
- if dir:
- self.location.setText(dir)
+ if x:
+ if self.is_library_dir_suitable(x):
+ self.location.setText(x)
+ else:
+ self.show_library_dir_error(x)
+
+ def show_library_dir_error(self, x):
+ if not isinstance(x, unicode):
+ try:
+ x = x.decode(filesystem_encoding)
+ except:
+ x = unicode(repr(x))
+ error_dialog(self, _('Bad location'),
+ _('You must choose an empty folder for '
+ 'the calibre library. %s is not empty.')%x, show=True)
def initializePage(self):
lp = prefs['library_path']