merge from trunk

This commit is contained in:
ldolse 2010-11-23 20:37:28 +08:00
commit 806fe56d69
24 changed files with 813 additions and 141 deletions

View file

@ -0,0 +1,50 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
180.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Noticias(BasicNewsRecipe):
    '''Download recipe for the Uruguayan news site 180.com.uy.'''

    title = '180.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # Fetch behaviour
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    # Keep only the main article container; drop embedded objects and links.
    keep_only_tags = [dict(name='div', attrs={'class':'tef-md tef-md-seccion-sociedad'})]
    remove_tags = [
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Titulares', u'http://www.180.com.uy/feed.php'),
    ]

    def get_cover_url(self):
        '''The site logo doubles as the cover image.'''
        return 'http://www.180.com.uy/tplef/img/logo.gif'

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View file

@ -0,0 +1,58 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
bitacora.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    '''Download recipe for the Uruguayan news site bitacora.com.uy.'''

    title = 'bitacora.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'iso-8859-1'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    keep_only_tags = [dict(id=['txt'])]
    # The two name-list entries both listed 'object'; merged into one entry.
    # BeautifulSoup matches any name in the list, so behaviour is unchanged.
    remove_tags = [
        dict(name='div', attrs={'class':'tablafoot'}),
        dict(name=['object', 'h4', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Titulares', u'http://www.bitacora.com.uy/anxml.cgi?15'),
    ]

    def get_cover_url(self):
        '''Scrape the front-page image; return None when it is not found.'''
        cover_url = None
        soup = self.index_to_soup('http://www.bitacora.com.uy')
        link_item = soup.find('img', attrs={'class':'imgtapa'})
        if link_item:
            # lstrip('/') avoids a double slash when 'src' is site-absolute.
            cover_url = 'http://www.bitacora.com.uy/' + link_item['src'].lstrip('/')
        return cover_url

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
cosmohispano.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    '''Download recipe for Cosmopolitan, Spanish edition (cosmohispano.com).'''

    title = 'Cosmopolitan'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Cosmopolitan, Edicion Espanola'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}
    oldest_article = 180
    max_articles_per_feed = 100

    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='td', attrs={'class':['contentheading', 'txt_articulo']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class':['breadcrumb', 'bloque1', 'article', 'bajo_title', 'tags_articles', 'otrosenlaces_title', 'otrosenlaces_parent', 'compartir']}),
        dict(name='div', attrs={'id':'comment'}),
        dict(name='table', attrs={'class':'pagenav'}),
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Articulos', u'http://feeds.feedburner.com/cosmohispano/FSSt'),
    ]

    def preprocess_html(self, soup):
        '''Flatten layout tables into divs and drop presentational attributes.'''
        attribs = ['style', 'font', 'valign',
                   'colspan', 'width', 'height',
                   'rowspan', 'summary', 'align',
                   'cellspacing', 'cellpadding',
                   'frames', 'rules', 'border']
        for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
            item.name = 'div'
            for attrib in attribs:
                # NOTE: Tag.has_key() tests *attributes* in BeautifulSoup 3;
                # `attrib in item` would search the tag's children instead,
                # so has_key is kept deliberately.
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        '''Scrape the current cover image from the magazine index page.

        Returns the absolute image URL, or None when the cover cannot be
        found. Previously the not-found path returned an unbound local,
        raising UnboundLocalError.
        '''
        cover_url = None
        index = 'http://www.cosmohispano.com/revista'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', attrs={'class':'img_portada'})
        if link_item:
            cover_url = "http://www.cosmohispano.com" + link_item['src']
        return cover_url

View file

@ -0,0 +1,67 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.elpais.com.uy/
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    '''Download recipe for Diario El Pais (Uruguay).'''

    title = 'Diario El Pais'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias | Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 2
    encoding = 'iso-8859-1'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'id':'Contenido'}),
    ]
    # The two name-list entries both listed 'object'; merged into one entry.
    # BeautifulSoup matches any name in the list, so behaviour is unchanged.
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object', 'form', 'table']),
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Ultimo Momento', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=umomento'),
        (u'Editorial', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=editorial'),
        (u'Nacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=nacional'),
        (u'Internacional', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=internacional'),
        (u'Espectaculos', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=espectaculos'),
        (u'Deportes', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=deportes'),
        (u'Ciudades', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=ciudades'),
        (u'Economia', u'http://www.elpais.com.uy/formatos/rss/index.asp?seccion=economia'),
    ]

    def get_cover_url(self):
        '''Scrape the cover image from the front page; None when not found.

        The stray debug `print link_item` has been removed.
        '''
        cover_url = None
        index = 'http://www.elpais.com.uy'
        soup = self.index_to_soup(index)
        link_item = soup.find('div', attrs={'class':'boxmedio box257'})
        if link_item:
            cover_url = 'http://www.elpais.com.uy' + link_item.img['src']
        return cover_url

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View file

@ -0,0 +1,100 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://freeway.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    '''Download recipe for Revista Freeway (Montevideo, Uruguay).'''

    title = 'freeway.com.uy'
    __author__ = 'Gustavo Azambuja'
    description = 'Revista Freeway, Montevideo, Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 1
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}
    oldest_article = 180
    max_articles_per_feed = 100

    keep_only_tags = [
        dict(id=['contenido']),
        dict(name='a', attrs={'class':'titulo_art_ppal'}),
        dict(name='img', attrs={'class':'recuadro'}),
        dict(name='td', attrs={'class':'txt_art_ppal'}),
    ]
    remove_tags = [
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        img {float:left; clear:both; margin:10px}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    def parse_index(self):
        '''Build the feed list by scraping the magazine index page.'''
        feeds = []
        for title, url in [('Articulos', 'http://freeway.com.uy/revista/')]:
            articles = self.art_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def art_parse_section(self, url):
        '''Scrape one section page; return a list of article dicts.'''
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'id': 'tbl_1'})

        current_articles = []
        for tag in div.findAllNext(attrs={'class': 'ancho_articulos'}):
            # NOTE(review): this break looks unreachable — the filter above
            # only yields tags whose class is 'ancho_articulos'. Kept as-is
            # to preserve behaviour; confirm before removing.
            if tag.get('class') == 'link-list-heading':
                break
            for td in tag.findAll('td'):
                a = td.find('a', attrs={'class': 'titulo_articulos'})
                if a is None:
                    continue
                title = self.tag_to_string(a)
                url = a.get('href', False)
                if not url or not title:
                    continue
                if url.startswith('/'):
                    url = 'http://freeway.com.uy' + url
                p = td.find('p', attrs={'class': 'txt_articulos'})
                description = self.tag_to_string(p)
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                self.log('\t\t\t', description)
                current_articles.append({'title': title, 'url': url, 'description': description, 'date': ''})
        return current_articles

    def preprocess_html(self, soup):
        '''Flatten layout tables into divs and drop presentational attributes.'''
        attribs = ['style', 'font', 'valign',
                   'colspan', 'width', 'height',
                   'rowspan', 'summary', 'align',
                   'cellspacing', 'cellpadding',
                   'frames', 'rules', 'border']
        for item in soup.body.findAll(name=['table', 'td', 'tr', 'th', 'caption', 'thead', 'tfoot', 'tbody', 'colgroup', 'col']):
            item.name = 'div'
            for attrib in attribs:
                # NOTE: Tag.has_key() tests *attributes* in BeautifulSoup 3;
                # `attrib in item` would search children, so it is kept.
                if item.has_key(attrib):
                    del item[attrib]
        return soup

    def get_cover_url(self):
        '''Hard-coded cover image for the current issue.

        NOTE(review): this points at a fixed November 2010 image and will go
        stale; scraping it from http://freeway.com.uy/revista/ would be more
        robust. (Commented-out scraping code copied from another recipe was
        removed here.)
        '''
        return 'http://freeway.com.uy/_upload/_n_foto_grande/noticia_1792_tapanoviembre2010.jpg'

View file

@ -0,0 +1,48 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
ladiaria.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class General(BasicNewsRecipe):
    '''Download recipe for La Diaria (Uruguay).'''

    title = 'La Diaria'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # Fetch behaviour
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    # Keep the article container; strip bylines, separators and comments.
    keep_only_tags = [dict(id=['article'])]
    remove_tags = [
        dict(name='div', attrs={'class':['byline', 'hr', 'titlebar', 'volver-arriba-right']}),
        dict(name='div', attrs={'id':'discussion'}),
        dict(name=['object', 'link']),
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Articulos', u'http://ladiaria.com/feeds/articulos'),
    ]

    def get_cover_url(self):
        '''The site serves the current front page image at a fixed URL.'''
        return 'http://ladiaria.com/edicion/imagenportada/'

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View file

@ -8,7 +8,7 @@
from calibre.web.feeds.news import BasicNewsRecipe
class LaRazon_Bol(BasicNewsRecipe):
title = 'La Razón - Bolivia'
title = u'La Razón - Bolivia'
__author__ = 'Darko Miletic'
description = 'El diario nacional de Bolivia'
publisher = 'Praxsis S.R.L.'

View file

@ -0,0 +1,56 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.montevideo.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Noticias(BasicNewsRecipe):
    '''Download recipe for Montevideo COMM (Uruguay).'''

    title = 'Montevideo COMM'
    __author__ = 'Gustavo Azambuja'
    description = 'Noticias de Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # Fetch behaviour
    use_embedded_content = False
    recursion = 5
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    keep_only_tags = [dict(id=['txt'])]
    remove_tags = [
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Destacados', u'http://www.montevideo.com.uy/anxml.aspx?58'),
        (u'Noticias', u'http://www.montevideo.com.uy/anxml.aspx?59'),
        (u'Tecnologia', u'http://www.montevideo.com.uy/anxml.aspx?133'),
        (u'Tiempo Libre', u'http://www.montevideo.com.uy/anxml.aspx?60'),
        # Deliberately disabled feeds, kept for reference:
        # (u'Deportes', u'http://www.montevideo.com.uy/anxml.aspx?968'),
        # (u'Pantallazo', u'http://www.montevideo.com.uy/anxml.aspx?1022'),
        (u'Gastronomia', u'http://www.montevideo.com.uy/anxml.aspx?1023'),
    ]

    def get_cover_url(self):
        '''Fixed cover image hosted on the site's Facebook CDN.'''
        return 'http://sphotos.ak.fbcdn.net/hphotos-ak-snc1/hs276.snc1/10319_147339559330_147337559330_2625816_6636564_n.jpg'

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View file

@ -0,0 +1,63 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
observa.com.uy
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Noticias(BasicNewsRecipe):
    '''Download recipe for Observa Digital (Uruguay).'''

    title = 'Observa Digital'
    __author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
    description = 'Noticias desde Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 100

    keep_only_tags = [dict(id=['contenido'])]
    remove_tags = [
        dict(name='div', attrs={'id':'contenedorVinculadas'}),
        dict(name='p', attrs={'id':'nota_firma'}),
        dict(name=['object', 'link']),
    ]
    remove_attributes = ['width', 'height', 'style', 'font', 'color']

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Actualidad', u'http://www.observa.com.uy/RSS/actualidad.xml'),
        (u'Deportes', u'http://www.observa.com.uy/RSS/deportes.xml'),
        (u'Vida', u'http://www.observa.com.uy/RSS/vida.xml'),
        (u'Ciencia y Tecnologia', u'http://www.observa.com.uy/RSS/ciencia.xml'),
    ]

    def get_cover_url(self):
        '''Scrape the cover from elobservador.com.uy; None when not found.

        The stray debug `print cover_url` was replaced with self.log so the
        URL still appears in the recipe's normal log output.
        '''
        cover_url = None
        index = 'http://www.elobservador.com.uy/elobservador/nav_portada.asp?suplemento=dia'
        soup = self.index_to_soup(index)
        link_item = soup.find('img', attrs={'usemap':'#mapeo_imagenes'})
        if link_item:
            cover_url = 'http://www.elobservador.com.uy' + link_item['src'].strip()
            self.log('Cover URL:', cover_url)
        return cover_url

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for item in soup.findAll(style=True):
            del item['style']
        return soup

View file

@ -0,0 +1,54 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__author__ = '2010, Gustavo Azambuja <hola at gazambuja.com>'
'''
http://www.revistabla.com
'''

from calibre.web.feeds.news import BasicNewsRecipe


class Noticias(BasicNewsRecipe):
    '''Download recipe for Revista Bla, a Uruguayan fashion magazine.'''

    title = 'Revista Bla'
    __author__ = 'Gustavo Azambuja'
    description = 'Moda | Uruguay'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'

    # Fetch behaviour
    use_embedded_content = False
    recursion = 5
    encoding = 'utf8'
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 20
    max_articles_per_feed = 100

    keep_only_tags = [dict(id=['body_container'])]
    remove_tags = [
        dict(name='div', attrs={'class':['date_text', 'comments', 'form_section', 'share_it']}),
        dict(name='div', attrs={'id':['relatedPosts', 'spacer', 'banner_izquierda', 'right_container']}),
        dict(name='p', attrs={'class':'FacebookLikeButton'}),
        dict(name=['object', 'link']),
    ]

    extra_css = '''
        h1{font-family:Geneva, Arial, Helvetica, sans-serif;color:#154B7A;}
        h3{font-size: 14px;color:#999999; font-family:Geneva, Arial, Helvetica, sans-serif;font-weight: bold;}
        h2{color:#666666; font-family:Geneva, Arial, Helvetica, sans-serif;font-size:small;}
        p {font-family:Arial,Helvetica,sans-serif;}
    '''

    feeds = [
        (u'Articulos', u'http://www.revistabla.com/feed/'),
    ]

    def get_cover_url(self):
        '''Return the cover image URL from the site header, or None.'''
        soup = self.index_to_soup('http://www.revistabla.com')
        header = soup.find('div', attrs={'class':'header_right'})
        if header:
            return header.img['src']
        return None

    def preprocess_html(self, soup):
        '''Remove inline style attributes from every tag before conversion.'''
        for styled in soup.findAll(style=True):
            del styled['style']
        return soup

View file

@ -108,3 +108,10 @@ def parse_index(self):
feeds.append((title, articles))
return feeds
def get_cover_url(self):
    '''Scrape the current cover image from the magazine index page.

    Returns the absolute image URL, or None when the cover image is not
    found. Previously the not-found path returned an unbound local,
    raising UnboundLocalError.
    '''
    cover_url = None
    index = 'http://www.muyinteresante.es/revista'
    soup = self.index_to_soup(index)
    link_item = soup.find('img', attrs={'class':'img_portada'})
    if link_item:
        cover_url = "http://www.muyinteresante.es" + link_item['src']
    return cover_url

View file

@ -3,12 +3,12 @@
__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
''' http://www.derstandard.at - Austrian Newspaper '''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class TelepolisNews(BasicNewsRecipe):
title = u'Telepolis (News)'
title = u'Telepolis (News+Artikel)'
__author__ = 'Gerhard Aigner'
publisher = 'Heise Zeitschriften Verlag GmbH & Co KG'
description = 'News from telepolis'
@ -20,16 +20,16 @@ class TelepolisNews(BasicNewsRecipe):
encoding = "utf-8"
language = 'de_AT'
use_embedded_content = False
use_embedded_content =False
remove_empty_feeds = True
preprocess_regexps = [(re.compile(r'<a[^>]*>', re.DOTALL|re.IGNORECASE), lambda match: ''),
(re.compile(r'</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),]
keep_only_tags = [dict(name = 'table',attrs={'class':'blogtable'})]
remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'})]
keep_only_tags = [dict(name = 'td',attrs={'class':'bloghead'}),dict(name = 'td',attrs={'class':'blogfliess'})]
remove_tags = [dict(name='img'), dict(name='td',attrs={'class':'blogbottom'}), dict(name='td',attrs={'class':'forum'})]
feeds = [(u'News', u'http://www.heise.de/tp/news.rdf')]
feeds = [(u'News', u'http://www.heise.de/tp/news-atom.xml')]
html2lrf_options = [
'--comment' , description
@ -41,7 +41,7 @@ class TelepolisNews(BasicNewsRecipe):
def get_article_url(self, article):
'''if the linked article is of kind artikel don't take it'''
if (article.link.count('artikel') > 0) :
if (article.link.count('artikel') > 1) :
return None
return article.link
@ -49,3 +49,5 @@ def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
soup.head.insert(0,mtag)
return soup

View file

@ -132,7 +132,7 @@ def ignore_lib(root, items):
shutil.copytree(self.j(comext, 'shell'), self.j(sp_dir, 'win32com', 'shell'))
shutil.rmtree(comext)
for pat in (r'numpy', r'PyQt4\uic\port_v3'):
for pat in (r'PyQt4\uic\port_v3', ):
x = glob.glob(self.j(self.lib_dir, 'site-packages', pat))[0]
shutil.rmtree(x)

View file

@ -19,7 +19,7 @@ Set CMAKE_PREFIX_PATH environment variable to C:\cygwin\home\kovid\sw
This is where all dependencies will be installed.
Add C:\Python26\Scripts and C:\Python26 to PATH
Add C:\Python27\Scripts and C:\Python27 to PATH
Install setuptools from http://pypi.python.org/pypi/setuptools
If there are no windows binaries already compiled for the version of python you are using then download the source and run the following command in the folder where the source has been unpacked::
@ -28,7 +28,7 @@ If there are no windows binaries already compiled for the version of python you
Run the following command to install python dependencies::
easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython cssutils clientform
easy_install --always-unzip -U ipython mechanize pyreadline python-dateutil dnspython cssutils clientform pycrypto
Install BeautifulSoup 3.0.x manually into site-packages (3.1.x parses broken HTML very poorly)

View file

@ -229,6 +229,10 @@ def delete_via_sql(self, ContentID, ContentType):
#Delete the volume_shortcovers second
cursor.execute('delete from volume_shortcovers where volumeid = ?', t)
# Delete the rows from content_keys
if self.dbversion >= 8:
cursor.execute('delete from content_keys where volumeid = ?', t)
# Delete the chapters associated with the book next
t = (ContentID,ContentID,)
cursor.execute('delete from content where BookID = ? or ContentID = ?', t)

View file

@ -145,18 +145,21 @@ def config_widget(self):
setattr(w, '_'+x, cb)
cb.setChecked(c.get(x, True))
w._layout.addWidget(cb)
cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name))
setattr(w, '_textcomments', cb)
cb.setChecked(c.get('textcomments', False))
w._layout.addWidget(cb)
if self.has_html_comments:
cb = QCheckBox(_('Convert comments downloaded from %s to plain text')%(self.name))
setattr(w, '_textcomments', cb)
cb.setChecked(c.get('textcomments', False))
w._layout.addWidget(cb)
return w
def save_settings(self, w):
dl_settings = {}
for x in ('rating', 'tags', 'comments', 'textcomments'):
for x in ('rating', 'tags', 'comments'):
dl_settings[x] = getattr(w, '_'+x).isChecked()
if self.has_html_comments:
dl_settings['textcomments'] = getattr(w, '_textcomments').isChecked()
c = self.config_store()
c.set(self.name, dl_settings)
if hasattr(w, '_sc'):

View file

@ -90,10 +90,8 @@ def build_isbn(base_url, opts):
return base_url + 'index1=isbn&value1='+opts.isbn
def build_combined(base_url, opts):
query = ''
for e in (opts.title, opts.author, opts.publisher):
if e is not None:
query += ' ' + e
query = ' '.join([e for e in (opts.title, opts.author, opts.publisher) \
if e is not None ])
query = query.strip()
if len(query) == 0:
raise ISBNDBError('You must specify at least one of --author, --title or --publisher')
@ -141,15 +139,8 @@ def create_books(opts, args, timeout=5.):
print ('ISBNDB query: '+url)
tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)]
ans = []
for x in tans:
add = True
for y in ans:
if y.isbn == x.isbn:
add = False
if add:
ans.append(x)
return ans
#remove duplicates ISBN
return list(dict((book.isbn, book) for book in tans).values())
def main(args=sys.argv):
parser = option_parser()

View file

@ -6,3 +6,53 @@ def db(path=None):
from calibre.library.database2 import LibraryDatabase2
from calibre.utils.config import prefs
return LibraryDatabase2(path if path else prefs['library_path'])
def generate_test_db(library_path,
num_of_records=20000,
num_of_authors=6000,
num_of_tags=10000,
tag_length=7,
author_length=7,
title_length=10,
max_authors=10,
max_tags=10
):
import random, string, os, sys, time
if not os.path.exists(library_path):
os.makedirs(library_path)
def randstr(length):
return ''.join(random.choice(string.letters) for i in
xrange(length))
all_tags = [randstr(tag_length) for j in xrange(num_of_tags)]
print 'Generated', num_of_tags, 'tags'
all_authors = [randstr(author_length) for j in xrange(num_of_authors)]
print 'Generated', num_of_authors, 'authors'
all_titles = [randstr(title_length) for j in xrange(num_of_records)]
print 'Generated', num_of_records, 'titles'
testdb = db(library_path)
print 'Creating', num_of_records, 'records...'
start = time.time()
for i, title in enumerate(all_titles):
print i+1,
sys.stdout.flush()
authors = random.randint(1, max_authors)
authors = [random.choice(all_authors) for i in xrange(authors)]
tags = random.randint(0, max_tags)
tags = [random.choice(all_tags) for i in xrange(tags)]
from calibre.ebooks.metadata.book.base import Metadata
mi = Metadata(title, authors)
mi.tags = tags
testdb.import_book(mi, [])
t = time.time() - start
print '\nGenerated', num_of_records, 'records in:', t, 'seconds'
print 'Time per record:', t/float(num_of_records)

View file

@ -278,10 +278,10 @@ def run(self, path_to_output, opts, db, notification=DummyReporter()):
from calibre.library.save_to_disk import preprocess_template
#Bibtex functions
from calibre.utils.bibtex import bibtex_author_format, utf8ToBibtex, ValidateCitationKey
from calibre.utils.bibtex import BibTeX
def create_bibtex_entry(entry, fields, mode, template_citation,
asccii_bibtex = True, citation_bibtex = True):
bibtexdict, citation_bibtex = True):
#Bibtex doesn't like UTF-8 but keep unicode until writing
#Define starting chain or if book valid strict and not book return a Fail string
@ -297,7 +297,8 @@ def create_bibtex_entry(entry, fields, mode, template_citation,
if citation_bibtex :
# Citation tag
bibtex_entry.append(make_bibtex_citation(entry, template_citation, asccii_bibtex))
bibtex_entry.append(make_bibtex_citation(entry, template_citation,
bibtexdict))
bibtex_entry = [u' '.join(bibtex_entry)]
for field in fields:
@ -312,11 +313,11 @@ def create_bibtex_entry(entry, fields, mode, template_citation,
pass
if field == 'authors' :
bibtex_entry.append(u'author = "%s"' % bibtex_author_format(item))
bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))
elif field in ['title', 'publisher', 'cover', 'uuid',
'author_sort', 'series'] :
bibtex_entry.append(u'%s = "%s"' % (field, utf8ToBibtex(item, asccii_bibtex)))
bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))
elif field == 'id' :
bibtex_entry.append(u'calibreid = "%s"' % int(item))
@ -329,13 +330,13 @@ def create_bibtex_entry(entry, fields, mode, template_citation,
elif field == 'tags' :
#A list to flatten
bibtex_entry.append(u'tags = "%s"' % utf8ToBibtex(u', '.join(item), asccii_bibtex))
bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item)))
elif field == 'comments' :
#\n removal
item = item.replace(u'\r\n',u' ')
item = item.replace(u'\n',u' ')
bibtex_entry.append(u'note = "%s"' % utf8ToBibtex(item, asccii_bibtex))
bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item))
elif field == 'isbn' :
# Could be 9, 10 or 13 digits
@ -353,8 +354,7 @@ def create_bibtex_entry(entry, fields, mode, template_citation,
elif field == 'pubdate' :
bibtex_entry.append(u'year = "%s"' % item.year)
bibtex_entry.append(u'month = "%s"' % utf8ToBibtex(strftime("%b", item),
asccii_bibtex))
bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))
bibtex_entry = u',\n '.join(bibtex_entry)
bibtex_entry += u' }\n\n'
@ -371,7 +371,7 @@ def check_entry_book_valid(entry):
else :
return True
def make_bibtex_citation(entry, template_citation, asccii_bibtex):
def make_bibtex_citation(entry, template_citation, bibtexclass):
#define a function to replace the template entry by its value
def tpl_replace(objtplname) :
@ -392,8 +392,9 @@ def tpl_replace(objtplname) :
return u''
if len(template_citation) >0 :
tpl_citation = utf8ToBibtex(ValidateCitationKey(re.sub(u'\{[^{}]*\}',
tpl_replace, template_citation)), asccii_bibtex)
tpl_citation = bibtexclass.utf8ToBibtex(
bibtexclass.ValidateCitationKey(re.sub(u'\{[^{}]*\}',
tpl_replace, template_citation)))
if len(tpl_citation) >0 :
return tpl_citation
@ -404,10 +405,7 @@ def tpl_replace(objtplname) :
else :
template_citation = u'%s' % str(entry["id"])
if asccii_bibtex :
return ValidateCitationKey(template_citation.encode('ascii', 'replace'))
else :
return ValidateCitationKey(template_citation)
return bibtexclass.ValidateCitationKey(template_citation)
self.fmt = path_to_output.rpartition('.')[2]
self.notification = notification
@ -475,13 +473,16 @@ def tpl_replace(objtplname) :
if not len(data):
log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text)
#Initialize BibTeX class
bibtexc = BibTeX()
#Entries writing after Bibtex formating (or not)
if bibfile_enc != 'ascii' :
asccii_bibtex = False
bibtexc.ascii_bibtex = False
else :
asccii_bibtex = True
bibtexc.ascii_bibtex = True
#Check and go to default in case of bad CLI
#Check citation choice and go to default in case of bad CLI
if isinstance(opts.impcit, (StringType, UnicodeType)) :
if opts.impcit == 'False' :
citation_bibtex= False
@ -493,6 +494,7 @@ def tpl_replace(objtplname) :
else :
citation_bibtex= opts.impcit
#Preprocess for error and light correction
template_citation = preprocess_template(opts.bib_cit)
#Open output and write entries
@ -514,7 +516,7 @@ def tpl_replace(objtplname) :
for entry in data:
outfile.write(create_bibtex_entry(entry, fields, bib_entry, template_citation,
asccii_bibtex, citation_bibtex))
bibtexc, citation_bibtex))
outfile.close()

View file

@ -1248,15 +1248,20 @@ def doit(func, *args, **kwargs):
traceback.print_exc()
else:
raise
path_changed = False
if set_title and mi.title:
self.set_title(id, mi.title, commit=False)
self._set_title(id, mi.title)
path_changed = True
if set_authors:
if not mi.authors:
mi.authors = [_('Unknown')]
authors = []
for a in mi.authors:
authors += string_to_authors(a)
self.set_authors(id, authors, notify=False, commit=False)
self._set_authors(id, authors)
path_changed = True
if path_changed:
self.set_path(id, index_is_id=True)
if mi.author_sort:
doit(self.set_author_sort, id, mi.author_sort, notify=False,
commit=False)
@ -1348,13 +1353,7 @@ def author_sort_from_authors(self, authors):
result.append(r)
return ' & '.join(result).replace('|', ',')
def set_authors(self, id, authors, notify=True, commit=True):
'''
Note that even if commit is False, the db will still be committed to
because this causes the location of files to change
:param authors: A list of authors.
'''
def _set_authors(self, id, authors):
if not authors:
authors = [_('Unknown')]
self.conn.execute('DELETE FROM books_authors_link WHERE book=?',(id,))
@ -1379,25 +1378,30 @@ def set_authors(self, id, authors, notify=True, commit=True):
ss = self.author_sort_from_book(id, index_is_id=True)
self.conn.execute('UPDATE books SET author_sort=? WHERE id=?',
(ss, id))
self.dirtied([id], commit=False)
if commit:
self.conn.commit()
self.data.set(id, self.FIELD_MAP['authors'],
','.join([a.replace(',', '|') for a in authors]),
row_is_id=True)
self.data.set(id, self.FIELD_MAP['author_sort'], ss, row_is_id=True)
def set_authors(self, id, authors, notify=True, commit=True):
'''
Note that even if commit is False, the db will still be committed to
because this causes the location of files to change
:param authors: A list of authors.
'''
self._set_authors(id, authors)
self.dirtied([id], commit=False)
if commit:
self.conn.commit()
self.set_path(id, index_is_id=True)
if notify:
self.notify('metadata', [id])
def set_title(self, id, title, notify=True, commit=True):
'''
Note that even if commit is False, the db will still be committed to
because this causes the location of files to change
'''
def _set_title(self, id, title):
if not title:
return
if not isinstance(title, unicode):
return False
if isbytestring(title):
title = title.decode(preferred_encoding, 'replace')
self.conn.execute('UPDATE books SET title=? WHERE id=?', (title, id))
self.data.set(id, self.FIELD_MAP['title'], title, row_is_id=True)
@ -1405,6 +1409,15 @@ def set_title(self, id, title, notify=True, commit=True):
self.data.set(id, self.FIELD_MAP['sort'], title_sort(title), row_is_id=True)
else:
self.data.set(id, self.FIELD_MAP['sort'], title, row_is_id=True)
return True
def set_title(self, id, title, notify=True, commit=True):
'''
Note that even if commit is False, the db will still be committed to
because this causes the location of files to change
'''
if not self._set_title(id, title):
return
self.set_path(id, index_is_id=True)
self.dirtied([id], commit=False)
if commit:
@ -2072,13 +2085,11 @@ def create_book_entry(self, mi, cover=None, add_duplicates=True,
(id, title, series_index, aus))
self.data.books_added([id], self)
self.set_path(id, True)
self.conn.commit()
if mi.timestamp is None:
mi.timestamp = utcnow()
if mi.pubdate is None:
mi.pubdate = utcnow()
self.set_metadata(id, mi, ignore_errors=True)
self.set_metadata(id, mi, ignore_errors=True, commit=True)
if cover is not None:
try:
self.set_cover(id, cover)
@ -2114,13 +2125,11 @@ def add_books(self, paths, formats, metadata, add_duplicates=True):
id = obj.lastrowid
self.data.books_added([id], self)
ids.append(id)
self.set_path(id, True)
self.conn.commit()
if mi.timestamp is None:
mi.timestamp = utcnow()
if mi.pubdate is None:
mi.pubdate = utcnow()
self.set_metadata(id, mi)
self.set_metadata(id, mi, commit=True, ignore_errors=True)
npath = self.run_import_plugins(path, format)
format = os.path.splitext(npath)[-1].lower().replace('.', '').upper()
stream = lopen(npath, 'rb')
@ -2154,12 +2163,11 @@ def import_book(self, mi, formats, notify=True, import_hooks=True,
(title, series_index, aus))
id = obj.lastrowid
self.data.books_added([id], self)
self.set_path(id, True)
if mi.timestamp is None:
mi.timestamp = utcnow()
if mi.pubdate is None:
mi.pubdate = utcnow()
self.set_metadata(id, mi, ignore_errors=True)
self.set_metadata(id, mi, ignore_errors=True, commit=True)
if preserve_uuid and mi.uuid:
self.set_uuid(id, mi.uuid, commit=False)
for path in formats:

View file

@ -129,7 +129,7 @@ def __init__(self, name, fobject):
def __getattribute__(self, attr):
if attr in ('name', '__enter__', '__str__', '__unicode__',
'__repr__'):
'__repr__', '__exit__'):
return object.__getattribute__(self, attr)
fobject = object.__getattribute__(self, 'fobject')
return getattr(fobject, attr)
@ -155,6 +155,11 @@ def __enter__(self):
fobject.__enter__()
return self
def __exit__(self, *args):
    # Delegate context-manager exit to the wrapped file object, using
    # object.__getattribute__ to bypass this proxy's attribute interception.
    return object.__getattribute__(self, 'fobject').__exit__(*args)
m = mode[0]
random = len(mode) > 1 and mode[1] == '+'
binary = mode[-1] == 'b'

View file

@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
""" Collection of python utility-methodes commonly used by other
bibliograph packages.
From http://pypi.python.org/pypi/bibliograph.core/
@ -62,11 +60,14 @@
DAMAGE.
"""
__docformat__ = 'reStructuredText'
__author__ = 'sengian <sengian1 at gmail.com>'
__docformat__ = 'restructuredtext en'
import re, string
from calibre.constants import preferred_encoding
from calibre.utils.mreplace import MReplace
utf8enc2latex_mapping = {
# This is a mapping of Unicode characters to LaTeX equivalents.
# The information has been extracted from
@ -2463,7 +2464,7 @@
u'\U0001d7fd': '$\\mathtt{7}$',
u'\U0001d7fe': '$\\mathtt{8}$',
u'\U0001d7ff': '$\\mathtt{9}$',
#Items from simple list
u'\u0106': "{\\a\\'C}",
u'\u0408': '{\\CYRJE}',
@ -2842,69 +2843,66 @@
'"':'{"}',
}
def ValidateCitationKey(text):
"""
removes characters not allowed in BibTeX keys
class BibTeX:
def __init__(self):
    # Precompiled multi-replace tables: unicode -> LaTeX and entity -> LaTeX.
    self.rep_utf8 = MReplace(utf8enc2latex_mapping)
    self.rep_ent = MReplace(entity_mapping)
    #Set default conversion to ASCII BibTeX
    self.ascii_bibtex = True
    # This substitution is based on the description of cite key restrictions at
    # http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html
    self.invalid_cit = re.compile(u'[ "@\',\\#}{~%&$^]')
    # Uppercase letters (decoded from the preferred encoding, so locale
    # letters are included) used by braceUppercase for case protection.
    self.upper = re.compile(u'[' +
        string.uppercase.decode(preferred_encoding) + u']')
    # Characters that escapeSpecialCharacters prefixes with a backslash.
    self.escape = re.compile(u'[~#&%_]')
>>> from bibliograph.core.utils import _validKey
>>> _validKey(DummyEntry('Foo Bar'))
'FooBar'
def ValidateCitationKey(self, text):
    """
    removes characters not allowed in BibTeX keys

    >>> ValidateCitationKey(DummyEntry('my@id'))
    'myid'
    """
    # Strip every character matched by self.invalid_cit (space, quote,
    # @ , ' # { } ~ % & $ ^ and backslash).
    return self.invalid_cit.sub(u'', text)
>>> _validKey(DummyEntry('my@id'))
'myid'
def braceUppercase(self, text):
    """ Convert uppercase letters to bibtex encoded uppercase

    >>> braceUppercase('Foo Bar')
    '{F}oo {B}ar'
    """
    # Wrap each uppercase letter in braces so BibTeX preserves its case.
    return self.upper.sub(lambda m: u'{%s}' % m.group(), text)
"""
# This substitution is based on the description of cite key restrictions at
# http://bibdesk.sourceforge.net/manual/BibDesk%20Help_2.html
return re.sub(u'[ "@\',\\#}{~%&$^]', u'', text)
def resolveEntities(self, text):
    # Replace character entities with their LaTeX equivalents in a single
    # precompiled pass (supersedes the per-entity loop kept below).
    #for entity, entity_map in entity_mapping.iteritems():
    #    text = text.replace(entity, entity_map)
    #return text
    return self.rep_ent.mreplace(text)
def BraceUppercase(text):
""" Convert uppercase letters to bibtex encoded uppercase
def resolveUnicode(self, text):
    #UTF-8 text as entry
    # Single-pass replacement of unicode characters by LaTeX escapes
    # (supersedes the per-character loop kept below).
    #for unichar, latexenc in utf8enc2latex_mapping.iteritems() :
    #    text = text.replace(unichar, latexenc)
    text = self.rep_utf8.mreplace(text)
    # Merge adjacent math-mode boundaries produced by the mapping values.
    return text.replace(u'$}{$', u'')
>>> from bibliograph.core.utils import _braceUppercase
>>> _braceUppercase('foo bar')
'foo bar'
def escapeSpecialCharacters(self, text):
    """
    latex escaping some (not all) special characters
    """
    # NOTE(review): no-op as written — str.replace returns a new string
    # that is discarded, so backslashes are never actually doubled.
    # Confirm intent before "fixing": utf8ToBibtex performs the same call,
    # and activating both would double-escape.
    text.replace('\\', '\\\\')
    # Prefix each of ~ # & % _ (matched by self.escape) with a backslash.
    return self.escape.sub(lambda m: u'\\%s' % m.group(), text)
>>> _braceUppercase('Foo Bar')
'{F}oo {B}ar'
"""
for uc in string.uppercase:
text = text.replace(uc, u'{%s}' % uc)
return text
#Calibre functions
#Option to go to official ASCII Bibtex or unofficial UTF-8
#Go from an unicode entry to ASCII Bibtex format without encoding
def utf8ToBibtex(self, text):
    # Empty input short-circuits to an empty BibTeX value.
    if len(text) == 0:
        return ''
    # NOTE(review): no-op — str.replace returns a new string that is
    # discarded here; same dead line exists in escapeSpecialCharacters.
    # Confirm intent before activating either.
    text.replace('\\', '\\\\')
    # Replace character entities with LaTeX equivalents.
    text = self.resolveEntities(text)
    if self.ascii_bibtex :
        # Official BibTeX is ASCII: transliterate unicode to LaTeX escapes.
        text = self.resolveUnicode(text)
    return self.escapeSpecialCharacters(text)
def resolveEntities(text):
for entity, entity_map in entity_mapping.iteritems():
text = text.replace(entity, entity_map)
return text
def resolveUnicode(text):
#UTF-8 text as entry
for unichar, latexenc in utf8enc2latex_mapping.iteritems() :
text = text.replace(unichar, latexenc)
return text.replace(u'$}{$', u'')
def escapeSpecialCharacters(text):
"""
latex escaping some (not all) special characters
"""
text.replace('\\', '\\\\')
escape = ['~', '#', '&', '%', '_']
for c in escape:
text = text.replace(c, '\\' + c )
return text
#Calibre functions
#Go from an unicode entry to ASCII Bibtex format without encoding
#Option to go to official ASCII Bibtex or unofficial UTF-8
def utf8ToBibtex(text, asccii_bibtex = True):
if len(text) == 0:
return ''
text.replace('\\', '\\\\')
text = resolveEntities(text)
if asccii_bibtex :
text = resolveUnicode(text)
return escapeSpecialCharacters(text)
def bibtex_author_format(item):
#Format authors for Bibtex compliance (get a list as input)
return utf8ToBibtex(u' and'.join([author for author in item]))
def bibtex_author_format(self, item):
    """Format an author list for BibTeX compliance.

    :param item: an iterable of author-name strings
    :return: the names joined with the BibTeX ``and`` separator, passed
        through utf8ToBibtex for entity/unicode/special-char escaping.
    """
    # BUG FIX: the separator was u' and' (no trailing space), which glued
    # the following name onto 'and' ("A andB" instead of "A and B").
    return self.utf8ToBibtex(u' and '.join([author for author in item]))

View file

@ -0,0 +1,32 @@
#multiple replace from dictionnary : http://code.activestate.com/recipes/81330/
__license__ = 'GPL v3'
__copyright__ = '2010, sengian <sengian1 @ gmail.com>'
__docformat__ = 'restructuredtext en'
import re
from UserDict import UserDict
class MReplace(UserDict):
    """Perform many literal string replacements in a single regex pass.

    Builds one alternation pattern from the mapping's keys, ordered
    longest-first so that longer keys win over their own prefixes, and
    substitutes every match with its mapped value.

    Based on http://code.activestate.com/recipes/81330/
    """

    def __init__(self, dict=None):
        # NOTE: the parameter name shadows the builtin ``dict``; kept
        # unchanged for backward compatibility with existing callers.
        UserDict.__init__(self, dict)
        self.re = None      # alternation pattern source, or None when empty
        self.regex = None   # compiled pattern, or None when empty
        self.compile_regex()

    def compile_regex(self):
        """(Re)build the alternation regex from the current mapping keys."""
        if len(self.data) > 0:
            # Longest-first ordering so e.g. 'ab' is preferred over 'a'.
            keys = sorted(self.data.keys(), key=len, reverse=True)
            pattern = "(%s)" % "|".join(map(re.escape, keys))
            # Skip recompiling when the pattern is unchanged.
            if self.re != pattern:
                self.re = pattern
                self.regex = re.compile(self.re)

    def __call__(self, mo):
        # Replacement callable for re.sub: map the matched text to its value.
        return self[mo.group(0)]

    def mreplace(self, text):
        """Return ``text`` with every mapped key replaced by its value."""
        # Nothing to do when the mapping is empty (regex never compiled).
        if len(self.data) < 1 or self.re is None:
            return text
        return self.regex.sub(self, text)

View file

@ -61,6 +61,11 @@ def attr(n, d):
def serialize_collection(mapping_of_recipe_classes):
collection = E.recipe_collection()
'''for u, x in mapping_of_recipe_classes.items():
print 11111, u, repr(x.title)
if isinstance(x.title, str):
x.title.decode('ascii')
'''
for urn in sorted(mapping_of_recipe_classes.keys(),
key=lambda key: getattr(mapping_of_recipe_classes[key], 'title',
'zzz')):