calibre/recipes/la_jornada.recipe
Rogelio Domínguez Hernández b54734383d
Change cover URL to the new domain www.jornada.com.mx
The old domain still works, but it will probably stop working any day, so it is better to use the new one.
2020-10-03 08:52:36 -05:00

106 lines
4.6 KiB
Text

__license__ = 'GPL v3'
__copyright__ = '2010-2012, Darko Miletic <darko.miletic at gmail.com>, Rogelio Domínguez <rogelio.dominguez@gmail.com>'
'''
www.jornada.com.mx
'''
import re
try:
from urllib.parse import urlencode
except ImportError:
from urllib import urlencode
try:
from urllib.parse import urlparse, urlunparse, parse_qs
except ImportError:
from urlparse import urlparse, urlunparse, parse_qs
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
class LaJornada_mx(BasicNewsRecipe):
    """Calibre news recipe for the Mexican newspaper La Jornada.

    The class is purely declarative apart from two hooks:

    * ``preprocess_html`` removes inline ``style`` attributes from the
      parsed page.
    * ``get_article_url`` picks the URL for a feed entry, preferring the
      entry's ``guid`` with its ``partner`` query parameter removed.

    All other attributes are settings read by ``BasicNewsRecipe`` (see the
    calibre recipe API documentation for their exact semantics).
    """

    # --- Publication metadata -------------------------------------------
    title = 'La Jornada (Mexico)'
    __author__ = 'Darko Miletic/Rogelio Domínguez'
    description = 'Noticias del diario mexicano La Jornada'
    publisher = 'DEMOS, Desarrollo de Medios, S.A. de C.V.'
    category = 'news, Mexico'
    language = 'es_MX'
    publication_type = 'newspaper'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    remove_empty_feeds = True

    # NOTE: strftime() runs once, while the class body is executed, so the
    # date baked into the cover URL is fixed at that moment (no explicit
    # time is passed; presumably the current local time is used).
    cover_url = strftime("http://www.jornada.com.mx/%Y/%m/%d/portada.pdf")
    masthead_url = 'http://www.jornada.com.mx/v7.0/imagenes/la-jornada-trans.png'

    # Styling for the class names kept by keep_only_tags, plus the
    # ``p.inicial`` / ``p.s-s`` pair produced by preprocess_regexps below.
    # The CSS text is kept flush-left so the string value is unchanged.
    extra_css = """
body{font-family: "Times New Roman",serif }
.cabeza{font-size: xx-large; font-weight: bold }
.documentFirstHeading{font-size: xx-large; font-weight: bold }
.credito-articulo{font-variant: small-caps; font-weight: bold }
.foto{text-align: center}
.pie-foto{font-size: 0.9em}
.credito{font-weight: bold; margin-left: 1em}
.credito-autor{font-variant: small-caps; font-weight: bold }
.credito-titulo{text-align: right}
.hemero{text-align: right; font-size: 0.9em; margin-bottom: 0.5em }
.loc{font-weight: bold}
.carton{text-align: center}
.credit{font-weight: bold}
.sumario{font-weight: bold; text-align: center}
.text{margin-top: 1.4em}
p.inicial{display: inline; font-size: xx-large; font-weight: bold}
p.s-s{display: inline; text-indent: 0}
"""

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Rewrite the drop-cap ``<div class="inicial">`` as ``<p class="inicial">``
    # so that it can be rendered inline with the ``<p class="s-s">`` that
    # follows it (see the matching rules in extra_css).
    #
    # The capture group is deliberately non-greedy: combined with DOTALL, a
    # greedy ``.*`` would stretch from the first drop-cap div to the LAST
    # ``</div><p class="s-s">`` in the page and fold everything in between
    # into a single oversized paragraph whenever a page has several drop
    # caps.
    preprocess_regexps = [
        (
            re.compile(
                r'<div class="inicial">(.*?)</div><p class="s-s">',
                re.DOTALL | re.IGNORECASE,
            ),
            lambda match: '<p class="inicial">' + match.group(1) + '</p><p class="s-s">',
        ),
    ]

    # --- Content selection ----------------------------------------------
    keep_only_tags = [
        dict(
            name='div',
            attrs={
                'class': [
                    'documentContent', 'cabeza', 'sumarios',
                    'credito-articulo', 'text', 'carton',
                ]
            },
        ),
        dict(name='div', attrs={'id': 'renderComments'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['buttonbar', 'comment-cont']}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Opinion', u'http://www.jornada.com.mx/rss/opinion.xml'),
        (u'Cartones', u'http://www.jornada.com.mx/rss/cartones.xml'),
        (u'Politica', u'http://www.jornada.com.mx/rss/politica.xml'),
        (u'Economia', u'http://www.jornada.com.mx/rss/economia.xml'),
        (u'Mundo', u'http://www.jornada.com.mx/rss/mundo.xml'),
        (u'Estados', u'http://www.jornada.com.mx/rss/estados.xml'),
        (u'Capital', u'http://www.jornada.com.mx/rss/capital.xml'),
        (u'Sociedad y justicia', u'http://www.jornada.com.mx/rss/sociedad.xml'),
        (u'Ciencias', u'http://www.jornada.com.mx/rss/ciencias.xml'),
        (u'Cultura', u'http://www.jornada.com.mx/rss/cultura.xml'),
        (u'Gastronomia', u'http://www.jornada.com.mx/rss/gastronomia.xml'),
        (u'Espectaculos', u'http://www.jornada.com.mx/rss/espectaculos.xml'),
        (u'Deportes', u'http://www.jornada.com.mx/rss/deportes.xml'),
        (u'Ultimas noticias', u'http://www.jornada.com.mx/ultimas/news/RSS'),
    ]

    def preprocess_html(self, soup):
        """Remove the ``style`` attribute from every element that has one.

        :param soup: parsed document exposing ``findAll`` (a BeautifulSoup
            tree); it is modified in place.
        :return: the same ``soup`` object.
        """
        for item in soup.findAll(style=True):
            del item['style']
        return soup

    def get_article_url(self, article):
        """Return the URL to download for a feed entry.

        The entry's ``guid`` is preferred, with any ``partner`` query
        parameter removed. When ``guid`` is missing or empty the entry's
        ``link`` is returned unchanged (``None`` if that is missing too).

        :param article: mapping-like feed entry supporting ``.get()``.
        :return: the article URL, or ``None``.
        """
        rurl = article.get('guid', None)
        if not rurl:
            # Use the "link" attribute as failover
            return article.get('link', None)

        # Remove the "partner" query parameter and rebuild the URL.
        # parse_qs is called with its defaults, so parameters with blank
        # values are not carried over either.
        u = urlparse(rurl)
        query = parse_qs(u.query)
        query.pop('partner', None)
        # doseq=True: parse_qs maps each key to a list of values.
        u = u._replace(query=urlencode(query, True))
        return urlunparse(u)