Input plugin for recipes

This commit is contained in:
Kovid Goyal 2009-04-27 15:41:10 -07:00
parent 996dda3ffe
commit 2da5589964
8 changed files with 108 additions and 31 deletions

View file

@ -287,6 +287,7 @@ def set_metadata(self, stream, mi, type):
from calibre.ebooks.rtf.input import RTFInput
from calibre.ebooks.html.input import HTMLInput
from calibre.ebooks.comic.input import ComicInput
from calibre.web.feeds.input import RecipeInput
from calibre.ebooks.oeb.output import OEBOutput
from calibre.ebooks.epub.output import EPUBOutput
from calibre.ebooks.txt.output import TXTOutput
@ -296,7 +297,7 @@ def set_metadata(self, stream, mi, type):
plugins = [HTML2ZIP, EPUBInput, MOBIInput, PDBInput, PDFInput, HTMLInput,
TXTInput, OEBOutput, TXTOutput, PDFOutput, LITInput, ComicInput,
FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput]
FB2Input, ODTInput, RTFInput, EPUBOutput, EREADEROutput, RecipeInput]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \
x.__name__.endswith('MetadataReader')]
plugins += [x for x in list(locals().values()) if isinstance(x, type) and \

View file

@ -52,7 +52,7 @@ def check_command_line_options(parser, args, log):
raise SystemExit(1)
input = os.path.abspath(args[1])
if not os.access(input, os.R_OK):
if not input.endswith('.recipe') and not os.access(input, os.R_OK):
log.error('Cannot read from', input)
raise SystemExit(1)
@ -169,6 +169,9 @@ def add_pipeline_options(parser, plumber):
if rec.level < rec.HIGH:
option_recommendation_to_cli_option(add_option, rec)
option_recommendation_to_cli_option(parser.add_option,
plumber.get_option_by_name('list_recipes'))
def option_parser():
return OptionParser(usage=USAGE)

View file

@ -360,6 +360,10 @@ def __init__(self, input, output, log):
OptionRecommendation(name='language',
recommended_value=None, level=OptionRecommendation.LOW,
help=_('Set the language.')),
OptionRecommendation(name='list_recipes',
recommended_value=False, help=_('List available recipes.')),
]
input_fmt = os.path.splitext(self.input)[1]
@ -525,6 +529,13 @@ def run(self):
self.setup_options()
if self.opts.verbose:
self.log.filter_level = self.log.DEBUG
if self.opts.list_recipes:
from calibre.web.feeds.recipes import titles
self.log('Available recipes:')
for title in sorted(titles):
self.log('\t'+title)
self.log('%d recipes available'%len(titles))
raise SystemExit(0)
# Run any preprocess plugins
from calibre.customize.ui import run_plugins_on_preprocess
@ -535,8 +546,13 @@ def run(self):
accelerators = {}
tdir = PersistentTemporaryDirectory('_plumber')
stream = self.input if self.input_fmt == 'recipe' else \
open(self.input, 'rb')
self.oeb = self.input_plugin(open(self.input, 'rb'), self.opts,
if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
self.opts.lrf = True
self.oeb = self.input_plugin(stream, self.opts,
self.input_fmt, self.log,
accelerators, tdir)
if self.opts.debug_input is not None:

View file

@ -1578,15 +1578,15 @@ def decode(self, data):
return data.decode('utf-16')
except UnicodeDecodeError:
pass
try:
return data.decode('utf-8')
except UnicodeDecodeError:
pass
if self.encoding is not None:
try:
return data.decode(self.encoding)
except UnicodeDecodeError:
pass
try:
return data.decode('utf-8')
except UnicodeDecodeError:
pass
data, _ = xml_to_unicode(data)
data = data.replace('\r\n', '\n')
data = data.replace('\r', '\n')

View file

@ -59,6 +59,7 @@ def __call__(self, oeb, context):
self.fix_links()
def split_item(self, item):
page_breaks, page_break_ids = [], []
if self.split_on_page_breaks:
page_breaks, page_break_ids = self.find_page_breaks(item)

View file

@ -2,5 +2,6 @@
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
class Recipe(object):
    # Intentionally empty: serves only as a common base type for recipe
    # classes, which subclass it elsewhere in the package.
    pass

View file

@ -0,0 +1,65 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class RecipeInput(InputFormatPlugin):
    '''
    Input plugin that builds its content by running a recipe.

    The "input file" is either a readable ``.recipe`` file, which is
    compiled, or the name of a builtin recipe (derived from the basename
    with its extension removed).
    '''

    name        = 'Recipe Input'
    author      = 'Kovid Goyal'
    description = _('Download periodical content from the internet')
    file_types  = set(['recipe'])

    # Conversion settings this plugin pins at HIGH priority.
    recommendations = set([
        ('chapter_mark', 'none', OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ])

    options = set([
        OptionRecommendation(name='test', recommended_value=False,
            help=_('Useful for recipe development. Forces '
                'max_articles_per_feed to 2 and downloads at most 2 feeds.')),
        OptionRecommendation(name='username', recommended_value=None,
            help=_('Username for sites that require a login to access '
                'content.')),
        OptionRecommendation(name='password', recommended_value=None,
            help=_('Password for sites that require a login to access '
                'content.')),
        OptionRecommendation(name='lrf', recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
        ])

    def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators, progress=lambda x, y: x):
        '''
        Run the recipe and return the path to the OPF file it produced.

        :param recipe_or_file: Path to a recipe file, or a name of the form
            ``<builtin title>.<ext>`` identifying a builtin recipe.
        :param opts: Conversion options; passed to the recipe and updated
            in place (``opts.output_profile.flow_size`` is set to 0).
        :param file_ext: Unused here.
        :param log: Logger handed to the recipe.
        :param accelerators: Unused here.
        :param progress: Progress callback handed to the recipe.
        :return: Absolute path of the first ``.opf`` file found in the
            current working directory, or ``None`` if there is none.
        :raises ValueError: If ``recipe_or_file`` is neither a readable
            recipe file nor the name of a builtin recipe.
        '''
        from calibre.web.feeds.recipes import \
            get_builtin_recipe, compile_recipe

        if os.access(recipe_or_file, os.R_OK):
            # Use a context manager so the file handle is closed promptly
            # instead of being left to the garbage collector.
            with open(recipe_or_file, 'rb') as recipe_file:
                recipe = compile_recipe(recipe_file.read())
        else:
            # Not a readable file: treat the basename (minus extension) as
            # the title of a builtin recipe.
            title = os.path.basename(recipe_or_file).rpartition('.')[0]
            recipe = get_builtin_recipe(title)

        if recipe is None:
            raise ValueError('%s is not a valid recipe file or builtin recipe' %
                    recipe_or_file)

        ro = recipe(opts, log, progress)
        ro.download()

        # NOTE(review): presumably this turns off size-based flow splitting
        # for downloaded content -- confirm against the output profile code.
        opts.output_profile.flow_size = 0

        # NOTE(review): assumes the recipe wrote its output into the
        # current working directory -- confirm against the recipe class.
        for fname in os.listdir('.'):
            if fname.endswith('.opf'):
                return os.path.abspath(fname)

        # No OPF was produced; callers receive None (unchanged behaviour).
        return None

View file

@ -20,6 +20,7 @@
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.lrf import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks import render_html
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
@ -27,12 +28,11 @@
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.utils.logging import Log
from calibre.ptempfile import PersistentTemporaryFile, \
PersistentTemporaryDirectory
class BasicNewsRecipe(object):
class BasicNewsRecipe(Recipe):
'''
Abstract base class that contains logic needed in all feed fetchers.
'''
@ -443,40 +443,34 @@ def get_obfuscated_article(self, url):
'''
raise NotImplementedError
def __init__(self, options, parser, progress_reporter):
def __init__(self, options, log, progress_reporter):
'''
Initialize the recipe.
:param options: Parsed commandline options
:param parser: Command line option parser. Used to intelligently merge options.
:param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
self.log = Log()
if options.verbose:
self.log.filter_level = self.log.DEBUG
self.log = log
if not isinstance(self.title, unicode):
self.title = unicode(self.title, 'utf-8', 'replace')
for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug', 'test'):
setattr(self, attr, getattr(options, attr))
self.debug = options.verbose > 1
self.output_dir = os.getcwd()
self.verbose = options.verbose
self.test = options.test
self.username = options.username
self.password = options.password
self.lrf = options.lrf
self.output_dir = os.path.abspath(self.output_dir)
if options.test:
self.max_articles_per_feed = 2
self.simultaneous_downloads = min(4, self.simultaneous_downloads)
if self.debug:
self.verbose = True
self.report_progress = progress_reporter
self.username = self.password = None
#: If True optimize downloading for eventual conversion to LRF
self.lrf = False
defaults = parser.get_default_values()
for opt in options.__dict__.keys():
if getattr(options, opt) != getattr(defaults, opt, None):
setattr(self, opt, getattr(options, opt))
if isinstance(self.feeds, basestring):
self.feeds = eval(self.feeds)
if isinstance(self.feeds, basestring):
@ -493,7 +487,6 @@ def __init__(self, options, parser, progress_reporter):
'--timeout', str(self.timeout),
'--max-recursions', str(self.recursions),
'--delay', str(self.delay),
'--timeout', str(self.timeout),
]
if self.encoding is not None:
web2disk_cmdline.extend(['--encoding', self.encoding])
@ -520,9 +513,6 @@ def __init__(self, options, parser, progress_reporter):
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine', '--header', '--encoding', 'utf-8'])
if '--base-font-size' not in self.html2lrf_options:
self.html2lrf_options.extend(['--base-font-size', '12'])
self.failed_downloads = []
self.partial_failures = []
@ -557,7 +547,7 @@ def _postprocess_html(self, soup, first_fetch, job_info):
return self.postprocess_html(soup, first_fetch)
def download(self, for_lrf=False):
def download(self):
'''
Download and pre-process all articles from the feeds in this recipe.
This method should be called only once on a particular Recipe instance.