Implement #5016 (Add formats to existing ebook records)

This commit is contained in:
Kovid Goyal 2010-03-11 23:26:10 -07:00
commit a0f2163403
8 changed files with 101 additions and 20 deletions

View file

@ -20,7 +20,7 @@ def string_to_authors(raw):
raw = raw.replace('&&', u'\uffff')
raw = _author_pat.sub('&', raw)
authors = [a.strip().replace(u'\uffff', '&') for a in raw.split('&')]
return authors
return [a for a in authors if a]
def authors_to_string(authors):
if authors is not None:

View file

@ -4,10 +4,9 @@
__docformat__ = 'restructuredtext en'
''' Read/write metadata from Amazon's topaz format '''
import copy, StringIO, sys
from struct import pack, unpack
import StringIO, sys
from struct import pack
from calibre import prints
from calibre.ebooks.metadata import MetaInformation
class StreamSlicer(object):
@ -200,7 +199,6 @@ def get_headers(self, offset):
# Build a dict of topaz_header records
topaz_headers = {}
for x in range(self.header_records):
c_marker = self.data[offset]
offset += 1
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
@ -259,7 +257,6 @@ def get_original_metadata(self):
self.metadata = {}
for x in range(self.md_header['num_recs']):
md_record = {}
taglen, consumed = self.decode_vwi(self.data[offset:offset+4])
offset += consumed
tag = self.data[offset:offset+taglen]
@ -380,7 +377,6 @@ def set_metadata(stream, mi):
return
if __name__ == '__main__':
import cStringIO, sys
#print get_metadata(open(sys.argv[1], 'rb'))
mi = MetaInformation(title="My New Title", authors=['Smith, John'])
set_metadata(open(sys.argv[1], 'rb'), mi)

View file

@ -1,7 +1,7 @@
'''
UI for adding books to the database and saving books to disk
'''
import os, shutil, time
import os, shutil, time, re
from Queue import Queue, Empty
from threading import Thread
@ -13,9 +13,10 @@
from calibre.ebooks.metadata.opf2 import OPF
from calibre.ebooks.metadata import MetaInformation
from calibre.constants import preferred_encoding, filesystem_encoding
from calibre.utils.config import prefs
class DuplicatesAdder(QThread):
# Add duplicate books
def __init__(self, parent, db, duplicates, db_adder):
QThread.__init__(self, parent)
self.db, self.db_adder = db, db_adder
@ -27,6 +28,7 @@ def run(self):
formats = [f for f in formats if not f.lower().endswith('.opf')]
id = self.db.create_book_entry(mi, cover=cover,
add_duplicates=True)
# here we add all the formats for dupe book record created above
self.db_adder.add_formats(id, formats)
self.db_adder.number_of_books_added += 1
self.emit(SIGNAL('added(PyQt_PyObject)'), count)
@ -90,6 +92,15 @@ def __init__(self, db, ids, nmap):
self.daemon = True
self.input_queue = Queue()
self.output_queue = Queue()
self.fuzzy_title_patterns = [(re.compile(pat), repl) for pat, repl in
[
(r'[\[\](){}<>\'";,:#]', ''),
(r'^(the|a|an) ', ''),
(r'[-._]', ' '),
(r'\s+', ' ')
]
]
self.merged_books = set([])
def run(self):
while not self.end:
@ -125,6 +136,34 @@ def process_formats(self, opf, formats):
fmts[-1] = fmt
return fmts
def fuzzy_title(self, title):
    '''
    Reduce ``title`` to a normalized form used for approximate title
    comparison.

    The title is stripped and lower-cased, then every ``(compiled pattern,
    replacement)`` pair in ``self.fuzzy_title_patterns`` is applied in order.

    :param title: the title string to normalize.
    :return: the normalized title, without leading or trailing whitespace.
    '''
    title = title.strip().lower()
    for pat, repl in self.fuzzy_title_patterns:
        title = pat.sub(repl, title)
    # A substitution that maps a character to a space can leave whitespace at
    # either end (e.g. a trailing '.'), which would make "Title." and "Title"
    # compare unequal. Strip again so both normalize to the same string.
    return title.strip()
def find_identical_books(self, mi):
    '''
    Return the ids of books already in the database that look like the same
    book as ``mi``.

    Candidates are obtained from ``self.db.data.parse`` with a query that
    requires an ``author:"=<name>"`` term for every author of ``mi`` (terms
    joined with ``and``). A candidate is kept when its title and ``mi.title``
    are equal after both have been passed through ``self.fuzzy_title``.

    :param mi: metadata object; only ``mi.authors`` and ``mi.title`` are read.
    :return: a set of book ids. It is empty when ``mi`` has no authors, when
        the query cannot be built or executed, or when no title matches.
    '''
    identical_book_ids = set([])
    if not mi.authors:
        # Without authors there is nothing to build a query from.
        return identical_book_ids
    try:
        # Double quotes are removed from names because each name is embedded
        # in a double-quoted search term.
        terms = [u'author:"=%s"' % (a.replace('"', '')) for a in mi.authors]
        query = u' and '.join(terms)
    except ValueError:
        # NOTE(review): presumably guards against UnicodeDecodeError (a
        # ValueError subclass) for non-ASCII byte-string names -- confirm.
        return identical_book_ids
    try:
        book_ids = self.db.data.parse(query)
    except Exception:
        # Best effort: a failed search must not abort adding the book, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        import traceback
        traceback.print_exc()
        return identical_book_ids
    if not book_ids:
        return identical_book_ids
    # The incoming title is the same for every candidate: normalize it once.
    mbook_title = self.fuzzy_title(mi.title)
    for book_id in book_ids:
        fbook_title = self.db.title(book_id, index_is_id=True)
        if self.fuzzy_title(fbook_title) == mbook_title:
            identical_book_ids.add(book_id)
    return identical_book_ids
def add(self, id, opf, cover, name):
formats = self.ids.pop(id)
if opf.endswith('.error'):
@ -145,25 +184,38 @@ def add(self, id, opf, cover, name):
if self.db is not None:
if cover:
cover = open(cover, 'rb').read()
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=False)
self.number_of_books_added += 1
if id is None:
self.duplicates.append((mi, cover, formats))
orig_formats = formats
formats = [f for f in formats if not f.lower().endswith('.opf')]
if prefs['add_formats_to_existing']:
identical_book_list = self.find_identical_books(mi)
if identical_book_list: # books with same author and nearly same title exist in db
self.merged_books.add(mi.title)
for identical_book in identical_book_list:
self.add_formats(identical_book, formats, replace=False)
else:
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=True)
self.number_of_books_added += 1
self.add_formats(id, formats)
else:
formats = [f for f in formats if not f.lower().endswith('.opf')]
self.add_formats(id, formats)
id = self.db.create_book_entry(mi, cover=cover, add_duplicates=False)
self.number_of_books_added += 1
if id is None:
self.duplicates.append((mi, cover, orig_formats))
else:
self.add_formats(id, formats)
else:
self.names.append(name)
self.paths.append(formats[0])
self.infos.append(mi)
return mi.title
def add_formats(self, id, formats, replace=True):
    '''
    Attach each file in ``formats`` to the book record ``id``.

    The format name is the file extension, upper-cased and without the dot.
    Every file is opened in binary mode and handed to ``self.db.add_format``
    with ``index_is_id=True`` and ``notify=False``.

    :param id: id of the book record that receives the files.
    :param formats: iterable of file paths.
    :param replace: forwarded unchanged to ``self.db.add_format``; defaults
        to ``True`` so existing two-argument callers are unaffected.
    '''
    for path in formats:
        fmt = os.path.splitext(path)[-1].replace('.', '').upper()
        with open(path, 'rb') as f:
            self.db.add_format(id, fmt, f, index_is_id=True,
                    notify=False, replace=replace)
class Adder(QObject):
@ -330,6 +382,11 @@ def number_of_books_added(self):
return getattr(getattr(self, 'db_adder', None), 'number_of_books_added',
0)
@property
def merged_books(self):
    '''
    The ``merged_books`` attribute of ``self.db_adder``.

    Falls back to an empty set when ``self`` has no ``db_adder`` attribute or
    when the adder has no ``merged_books`` attribute.
    '''
    adder = getattr(self, 'db_adder', None)
    return getattr(adder, 'merged_books', set([]))
@property
def critical(self):
return getattr(getattr(self, 'db_adder', None), 'critical',

View file

@ -44,6 +44,7 @@ def __init__(self, parent=None):
self.filename_pattern = FilenamePattern(self)
self.metadata_box.layout().insertWidget(0, self.filename_pattern)
self.opt_swap_author_names.setChecked(prefs['swap_author_names'])
self.opt_add_formats_to_existing.setChecked(prefs['add_formats_to_existing'])
help = '\n'.join(textwrap.wrap(c.get_option('template').help, 75))
self.save_template.initialize('save_to_disk', opts.template, help)
self.send_template.initialize('send_to_device', opts.send_template, help)
@ -69,6 +70,7 @@ def save_settings(self):
pattern = self.filename_pattern.commit()
prefs['filename_pattern'] = pattern
prefs['swap_author_names'] = bool(self.opt_swap_author_names.isChecked())
prefs['add_formats_to_existing'] = bool(self.opt_add_formats_to_existing.isChecked())
return True

View file

@ -6,7 +6,7 @@
<rect>
<x>0</x>
<y>0</y>
<width>645</width>
<width>588</width>
<height>516</height>
</rect>
</property>
@ -49,6 +49,19 @@
</widget>
</item>
<item row="2" column="0" colspan="2">
<widget class="QCheckBox" name="opt_add_formats_to_existing">
<property name="toolTip">
<string>If an existing book with a similar title and author is found that does not have the format being added, the format is added
to the existing book, instead of creating a new entry. If the existing book already has the format, then it is silently ignored.
Title match ignores leading articles (&quot;the&quot;, &quot;a&quot;, &quot;an&quot;), punctuation, case, etc. Author match is exact.</string>
</property>
<property name="text">
<string>If books with similar titles and authors found, &amp;merge the new files automatically</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<widget class="QGroupBox" name="metadata_box">
<property name="title">
<string>&amp;Configure metadata from file name</string>

View file

@ -24,7 +24,7 @@
from calibre import prints, patheq, strftime
from calibre.constants import __version__, __appname__, isfrozen, islinux, \
iswindows, isosx, filesystem_encoding
iswindows, isosx, filesystem_encoding, preferred_encoding
from calibre.utils.filenames import ascii_filename
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.config import prefs, dynamic
@ -1244,6 +1244,13 @@ def _files_added(self, paths=[], names=[], infos=[], on_card=None):
self.library_view.model().books_added(self._adder.number_of_books_added)
if hasattr(self, 'db_images'):
self.db_images.reset()
if getattr(self._adder, 'merged_books', False):
books = u'\n'.join([x if isinstance(x, unicode) else
x.decode(preferred_encoding, 'replace') for x in
self._adder.merged_books])
info_dialog(self, _('Merged some books'),
_('Some duplicates were found and merged into the '
'following existing books:'), det_msg=books, show=True)
if getattr(self._adder, 'critical', None):
det_msg = []
for name, log in self._adder.critical.items():

View file

@ -998,12 +998,15 @@ def add_format_with_hooks(self, index, format, fpath, index_is_id=False,
return self.add_format(index, format, stream,
index_is_id=index_is_id, path=path, notify=notify)
def add_format(self, index, format, stream, index_is_id=False, path=None, notify=True):
def add_format(self, index, format, stream, index_is_id=False, path=None,
notify=True, replace=True):
id = index if index_is_id else self.id(index)
if path is None:
path = os.path.join(self.library_path, self.path(id, index_is_id=True))
name = self.conn.get('SELECT name FROM data WHERE book=? AND format=?', (id, format), all=False)
if name:
if not replace:
return False
self.conn.execute('DELETE FROM data WHERE book=? AND format=?', (id, format))
name = self.construct_file_name(id)
ext = ('.' + format.lower()) if format else ''
@ -1021,6 +1024,7 @@ def add_format(self, index, format, stream, index_is_id=False, path=None, notify
self.refresh_ids([id])
if notify:
self.notify('metadata', [id])
return True
def delete_book(self, id, notify=True):
'''

View file

@ -670,6 +670,8 @@ def _prefs():
help=_('The priority of worker processes'))
c.add_opt('swap_author_names', default=False,
help=_('Swap author first and last names when reading metadata'))
c.add_opt('add_formats_to_existing', default=False,
help=_('Add new formats to existing book records'))
c.add_opt('migrated', default=False, help='For Internal use. Don\'t modify.')
return c