diff --git a/resources/recipes/criticadigital.recipe b/resources/recipes/criticadigital.recipe
index e1e5030a00..d1ef97aef9 100644
--- a/resources/recipes/criticadigital.recipe
+++ b/resources/recipes/criticadigital.recipe
@@ -10,7 +10,7 @@
class CriticaDigital(BasicNewsRecipe):
title = 'Critica de la Argentina'
- __author__ = 'Darko Miletic'
+ __author__ = 'Darko Miletic and Sujata Raman'
description = 'Noticias de Argentina'
oldest_article = 2
max_articles_per_feed = 100
@@ -20,17 +20,22 @@ class CriticaDigital(BasicNewsRecipe):
use_embedded_content = False
encoding = 'cp1252'
- html2lrf_options = [
- '--comment' , description
- , '--category' , 'news, Argentina'
- , '--publisher' , title
- ]
-
+ extra_css = '''
+ h1{font-family:"Trebuchet MS";}
+ h3{color:#9A0000; font-family:Tahoma; font-size:x-small;}
+ h2{color:#504E53; font-family:Arial,Helvetica,sans-serif ;font-size:small;}
+ #epigrafe{font-family:Arial,Helvetica,sans-serif ;color:#666666 ; font-size:x-small;}
+ p {font-family:Arial,Helvetica,sans-serif;}
+ #fecha{color:#858585; font-family:Tahoma; font-size:x-small;}
+ #autor{color:#858585; font-family:Tahoma; font-size:x-small;}
+ #hora{color:#F00000;font-family:Tahoma; font-size:x-small;}
+ '''
keep_only_tags = [
- dict(name='div', attrs={'class':'bloqueTitulosNoticia'})
- ,dict(name='div', attrs={'id':'c453-1' })
+ dict(name='div', attrs={'class':['bloqueTitulosNoticia','cfotonota']})
+ ,dict(name='div', attrs={'id':'boxautor'})
+ ,dict(name='p', attrs={'id':'textoNota'})
]
-
+
remove_tags = [
dict(name='div', attrs={'class':'box300' })
,dict(name='div', style=True )
@@ -38,7 +43,7 @@ class CriticaDigital(BasicNewsRecipe):
,dict(name='div', attrs={'class':'comentario' })
,dict(name='div', attrs={'class':'paginador' })
]
-
+
feeds = [
(u'Politica', u'http://www.criticadigital.com/herramientas/rss.php?ch=politica' )
,(u'Economia', u'http://www.criticadigital.com/herramientas/rss.php?ch=economia' )
@@ -60,3 +65,5 @@ def get_cover_url(self):
if link_item:
cover_url = index + link_item.img['src']
return cover_url
+
+
diff --git a/resources/recipes/infobae.recipe b/resources/recipes/infobae.recipe
index 78d00677b6..79937ce4f7 100644
--- a/resources/recipes/infobae.recipe
+++ b/resources/recipes/infobae.recipe
@@ -5,55 +5,90 @@
'''
infobae.com
'''
-
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class Infobae(BasicNewsRecipe):
title = 'Infobae.com'
- __author__ = 'Darko Miletic'
+ __author__ = 'Darko Miletic and Sujata Raman'
description = 'Informacion Libre las 24 horas'
publisher = 'Infobae.com'
- category = 'news, politics, Argentina'
+ category = 'news, politics, Argentina'
oldest_article = 1
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
language = 'es'
+ lang = 'es-AR'
encoding = 'cp1252'
cover_url = 'http://www.infobae.com/imgs/header/header.gif'
remove_javascript = True
-
- html2lrf_options = [
- '--comment' , description
- , '--category' , category
- , '--publisher', publisher
- , '--ignore-tables'
- , '--ignore-colors'
- ]
-
+ preprocess_regexps = [(re.compile(
+ r''), lambda m:'')]  # NOTE(review): empty pattern with an empty-string replacement looks like a no-op - confirm the intended regex
+
+
html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
+ extra_css = '''
+ .col-center{font-family:Arial,Helvetica,sans-serif;}
+ h1{font-family:Arial,Helvetica,sans-serif; color:#0D4261;}
+ .fuenteIntNota{font-family:Arial,Helvetica,sans-serif; color:#1D1D1D; font-size:x-small;}
+ '''
+
+ keep_only_tags = [dict(name='div', attrs={'class':['content']})]
+
+
remove_tags = [
- dict(name=['embed','link','object'])
- ,dict(name='a', attrs={'onclick':'javascript:window.print()'})
- ]
-
- feeds = [
+ dict(name='div', attrs={'class':['options','col-right','controles', 'bannerLibre','tiulo-masleidas','masleidas-h']}),
+ dict(name='a', attrs={'name' : 'comentario',}),
+ dict(name='iframe'),
+ dict(name='img', alt = "Ver galerias de imagenes"),
+
+ ]
+
+
+ feeds = [
(u'Noticias' , u'http://www.infobae.com/adjuntos/html/RSS/hoy.xml' )
,(u'Salud' , u'http://www.infobae.com/adjuntos/html/RSS/salud.xml' )
,(u'Tecnologia', u'http://www.infobae.com/adjuntos/html/RSS/tecnologia.xml')
,(u'Deportes' , u'http://www.infobae.com/adjuntos/html/RSS/deportes.xml' )
]
- def print_version(self, url):
- main, sep, article_part = url.partition('contenidos/')
- article_id, rsep, rrest = article_part.partition('-')
- return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
+# def print_version(self, url):
+# main, sep, article_part = url.partition('contenidos/')
+# article_id, rsep, rrest = article_part.partition('-')
+# return u'http://www.infobae.com/notas/nota_imprimir.php?Idx=' + article_id
+
+    def get_article_url(self, article):
+        # Percent-quote the path of the feed entry's link; returns None when the entry has no link.
+        import urllib, urlparse
+        parts = list(urlparse.urlparse(article.get('link') or ''))  # '' avoids a crash on a missing link
+        parts[2] = urllib.quote(parts[2])  # index 2 of the parsed tuple is the path component
+        return urlparse.urlunparse(parts) or None
+
def preprocess_html(self, soup):
- mtag = '\n\n'
- soup.head.insert(0,mtag)
+ # Extract <strong> elements found under <head>, then extract every <meta> element in the document.
+ for tag in soup.head.findAll('strong'):
+ tag.extract()
+ for tag in soup.findAll('meta'):
+ del tag['content']
+ tag.extract()
+ # Insert mtag as the first child of <head>. NOTE(review): mtag holds only two newlines here - confirm the intended markup.
+ mtag = '\n\n'
+ soup.head.insert(0,mtag)
for item in soup.findAll(style=True):
del item['style']
+ # The loop above deletes the inline style attribute from every element that has one.
return soup
+
+    def postprocess_html(self, soup, first):
+        # Rename every <strong> element in the soup to <b>; the `first` argument is not used.
+        for bold in soup.findAll('strong'):
+            bold.name = 'b'
+        # Hand the modified soup back to the caller.
+        return soup
+
+
+