diff --git a/recipes/people_us_mashup.recipe b/recipes/people_us_mashup.recipe
index 28c76d820c..5d820bacc0 100644
--- a/recipes/people_us_mashup.recipe
+++ b/recipes/people_us_mashup.recipe
@@ -18,6 +18,7 @@ class PeopleMag(BasicNewsRecipe):
no_stylesheets = True
auto_cleanup = True
+ auto_cleanup_keep = '//div[@id="article-image"]'
feeds = [
diff --git a/src/calibre/ebooks/readability/readability.py b/src/calibre/ebooks/readability/readability.py
index 8d4a23b338..028a4d6ede 100644
--- a/src/calibre/ebooks/readability/readability.py
+++ b/src/calibre/ebooks/readability/readability.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import (unicode_literals, division, absolute_import,
+ print_function)
+
import re, sys
from collections import defaultdict
@@ -72,10 +77,15 @@ def __init__(self, input, log, **options):
self.options[k] = v
self.html = None
self.log = log
+ self.keep_elements = set()
def _html(self, force=False):
if force or self.html is None:
self.html = self._parse(self.input)
+ path = self.options['keep_elements']
+ if path is not None:
+ self.keep_elements = set(self.html.xpath(path))
+
return self.html
def _parse(self, input):
@@ -152,8 +162,9 @@ def get_article(self, candidates, best_candidate):
append = False
if sibling is best_elem:
append = True
- sibling_key = sibling #HashableElement(sibling)
- if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+ if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
+ append = True
+ if sibling in self.keep_elements:
append = True
if sibling.tag == "p":
@@ -283,6 +294,8 @@ def debug(self, *a):
def remove_unlikely_candidates(self):
for elem in self.html.iter():
+ if elem in self.keep_elements:
+ continue
s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
#self.debug(s)
if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag != 'body':
@@ -337,7 +350,7 @@ def sanitize(self, node, candidates):
allowed = {}
# Conditionally clean <table>s, <ul>s, and <div>s
for el in self.reverse_tags(node, "table", "ul", "div"):
- if el in allowed:
+ if el in allowed or el in self.keep_elements:
continue
weight = self.class_weight(el)
if el in candidates:
@@ -450,46 +463,17 @@ def sanitize(self, node, candidates):
#self.debug("pname %s pweight %.3f" %(pname, pweight))
el.drop_tree()
- for el in ([node] + [n for n in node.iter()]):
- if not (self.options['attributes']):
- #el.attrib = {} #FIXME:Checkout the effects of disabling this
- pass
-
return clean_attributes(tounicode(node))
-
-class HashableElement():
- def __init__(self, node):
- self.node = node
- self._path = None
-
- def _get_path(self):
- if self._path is None:
- reverse_path = []
- node = self.node
- while node is not None:
- node_id = (node.tag, tuple(node.attrib.items()), node.text)
- reverse_path.append(node_id)
- node = node.getparent()
- self._path = tuple(reverse_path)
- return self._path
- path = property(_get_path)
-
- def __hash__(self):
- return hash(self.path)
-
- def __eq__(self, other):
- return self.path == other.path
-
- def __getattr__(self, tag):
- return getattr(self.node, tag)
-
def option_parser():
from calibre.utils.config import OptionParser
parser = OptionParser(usage='%prog: [options] file')
parser.add_option('-v', '--verbose', default=False, action='store_true',
- dest='verbose',
- help=_('Show detailed output information. Useful for debugging'))
+ dest='verbose',
+ help='Show detailed output information. Useful for debugging')
+ parser.add_option('-k', '--keep-elements', default=None, action='store',
+ dest='keep_elements',
+ help='XPath specifying elements that should not be removed')
return parser
@@ -506,7 +490,12 @@ def main():
raw = f.read()
enc = sys.__stdout__.encoding or 'utf-8'
- print Document(raw, default_log, debug=options.verbose).summary().encode(enc, 'replace')
+ if options.verbose:
+ default_log.filter_level = default_log.DEBUG
+ print (Document(raw, default_log,
+ debug=options.verbose,
+ keep_elements=options.keep_elements).summary().encode(enc,
+ 'replace'))
if __name__ == '__main__':
main()
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 436612af7e..b7efd611e0 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -144,6 +144,18 @@ class BasicNewsRecipe(Recipe):
#: manually (though manual cleanup will always be superior).
auto_cleanup = False
+ #: Specify elements that the auto cleanup algorithm should never remove.
+ #: The syntax is an XPath expression. For example::
+ #:
+ #: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
+ #: id="article-image"
+ #: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
+ #: with class="important"
+ #: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
+ #: will keep all divs with id="article-image" and spans
+ #: with class="important"
+ auto_cleanup_keep = None
+
#: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
#: It will be inserted into `