Fix bug processing author names with initials when downloading metadata from ozon.ru. Fixes #845420 (Problems with processing metadata in plugin ozon.ru)

2026-04-23 09:14:29 +02:00 · 2011-09-14 17:03:43 -06:00 · 2011-09-14 17:03:43 -06:00 · c8a78a83bc
commit c8a78a83bc
parent fb0f73a7ec 640f345640
4 changed files with 109 additions and 45 deletions
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -11,7 +11,7 @@
 Input plugin for HTML or OPF ebooks.
 '''

-import os, re, sys, uuid, tempfile, errno
+import os, re, sys, uuid, tempfile
 from urlparse import urlparse, urlunparse
 from urllib import unquote
 from functools import partial
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -116,7 +116,8 @@ def cap_author_token(token):
    lt = lower(token)
    if lt in ('von', 'de', 'el', 'van', 'le'):
        return lt
-    if re.match(r'([a-z]\.){2,}$', lt) is not None:
+    # no digits no spez. characters
+    if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
        # Normalize tokens of the form J.K. to J. K.
        parts = token.split('.')
        return '. '.join(map(capitalize, parts)).strip()
--- a/src/calibre/ebooks/metadata/sources/ozon.py
+++ b/src/calibre/ebooks/metadata/sources/ozon.py
@ -28,7 +28,7 @@ class Ozon(Source):
    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
                               'publisher', 'pubdate', 'comments', 'series', 'rating', 'language'])
    # Test purpose only, test function does not like when sometimes some filed are empty
-    #touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
+    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                          'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
@ -109,8 +109,16 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers): # {{{
+        # some book titles have extra charactes like this
+        # TODO: make a twick
+        reRemoveFromTitle = None 
+        #reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')
+        
        title = unicode(title).upper() if title else ''
-        authors = map(unicode.upper, map(unicode, authors)) if authors else None
+        if reRemoveFromTitle:
+            title = reRemoveFromTitle.sub('', title) 
+        authors = map(_normalizeAuthorNameWithInitials, 
+                      map(unicode.upper, map(unicode, authors))) if authors else None
        ozon_id = identifiers.get('ozon', None)

        unk = unicode(_('Unknown')).upper()
@ -124,6 +132,7 @@ def get_metadata(self, log, entries, title, authors, identifiers): # {{{
        def in_authors(authors, miauthors):
            for author in authors:
                for miauthor in miauthors:
+                    #log.debug(u'=> %s <> %s'%(author, miauthor))
                    if author in miauthor: return True
            return None

@ -131,7 +140,10 @@ def ensure_metadata_match(mi): # {{{
            match = True
            if title:
                mititle = unicode(mi.title).upper() if mi.title else ''
+                if reRemoveFromTitle:
+                    mititle = reRemoveFromTitle.sub('', mititle)
                match = title in mititle
+                #log.debug(u't=> %s <> %s'%(title, mititle))
            if match and authors:
                miauthors = map(unicode.upper, map(unicode, mi.authors)) if mi.authors else []
                match = in_authors(authors, miauthors)
@ -190,7 +202,8 @@ def to_metadata(self, log, entry): # {{{

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
-        mi = Metadata(title, author.split(','))
+        norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u',')))
+        mi = Metadata(title, norm_authors)

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon':ozon_id}
@ -202,6 +215,11 @@ def to_metadata(self, log, entry): # {{{
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

+        pub_year = entry.xpath(xp_template.format('Year'))
+        if pub_year:
+            mi.pubdate = toPubdate(log, pub_year)
+            #log.debug('pubdate %s'%mi.pubdate)
+
        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
@ -269,13 +287,17 @@ def get_book_details(self, log, metadata, timeout): # {{{
        raw = self.browser.open_novisit(url, timeout=timeout).read()
        doc = html.fromstring(raw)

+        xpt_prod_det_at = u'string(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "%s")]/a[1]/@title)'
+        xpt_prod_det_tx = u'substring-after(//div[contains(@class, "product-detail")]//text()[contains(., "%s")], ":")'
+
        # series
-        xpt = u'normalize-space(//div[@class="frame_content"]//div[contains(normalize-space(text()), "Серия:")]//a/@title)'
+        xpt = xpt_prod_det_at % u'Сери'
+        # % u'Серия:'
        series = doc.xpath(xpt)
        if series:
            metadata.series = series

-        xpt = u'substring-after(//meta[@name="description"]/@content, "ISBN")'
+        xpt = u'normalize-space(substring-after(//meta[@name="description"]/@content, "ISBN"))'
        isbn_str = doc.xpath(xpt)
        if isbn_str:
            all_isbns = [check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if check_isbn(isbn)]
@ -283,38 +305,42 @@ def get_book_details(self, log, metadata, timeout): # {{{
                metadata.all_isbns = all_isbns
                metadata.isbn = all_isbns[0]

-        xpt = u'//div[@class="frame_content"]//div[contains(normalize-space(text()), "Издатель")]//a[@title="Издательство"]'
+        xpt = xpt_prod_det_at % u'Издатель'
        publishers = doc.xpath(xpt)
        if publishers:
-            metadata.publisher = publishers[0].text
+            metadata.publisher = publishers

-            xpt = u'string(../text()[contains(., "г.")])'
-            yearIn = publishers[0].xpath(xpt)
+        displ_lang = None
+        xpt = xpt_prod_det_tx % u'Язык'
+        langs = doc.xpath(xpt)
+        if langs:
+            lng_splt = langs.split(u',')
+            if lng_splt:
+                displ_lang = lng_splt[0].strip()
+        metadata.language = _translageLanguageToCode(displ_lang)
+        #log.debug(u'language: %s'%displ_lang)
+        
+        # can be set before from xml search responce
+        if not metadata.pubdate:
+            xpt = u'normalize-space(//div[@class="product-misc"]//text()[contains(., "г.")])'
+            yearIn = doc.xpath(xpt)
            if yearIn:
                matcher = re.search(r'\d{4}', yearIn)
                if matcher:
-                    year = int(matcher.group(0))
-                    # only year is available, so use 1-st of Jan
-                    metadata.pubdate = datetime.datetime(year, 1, 1) #<- failed comparation in identify.py
-                    #metadata.pubdate = datetime(year, 1, 1)
-            xpt = u'substring-after(string(../text()[contains(., "Язык")]), ": ")'
-            displLang = publishers[0].xpath(xpt)
-            lang_code =_translageLanguageToCode(displLang)
-            if lang_code:
-                metadata.language = lang_code
+                    metadata.pubdate = toPubdate(log, matcher.group(0))

        # overwrite comments from HTML if any
-        # tr/td[contains(.//text(), "От издателя")] -> does not work, why?
-        xpt = u'//div[contains(@class, "detail")]//tr/td//text()[contains(., "От издателя")]'\
-              u'/ancestor::tr[1]/following-sibling::tr[1]/td[contains(./@class, "description")][1]'
+        xpt = u'//table[@id="detail_description"]//tr/td'
        comment_elem = doc.xpath(xpt)
        if comment_elem:
            comments = unicode(etree.tostring(comment_elem[0]))
            if comments:
                # cleanup root tag, TODO: remove tags like object/embeded
-                comments = re.sub(r'^<td.+?>|</td>.+?$', u'', comments).strip()
-                if comments:
+                comments = re.sub(r'\A.*?<td.*?>|</td>.*\Z', u'', comments.strip(), re.MULTILINE).strip()
+                if comments and (not metadata.comments or len(comments) > len(metadata.comments)):
                    metadata.comments = comments
+                else:
+                    log.debug('HTML book description skipped in favour of search service xml responce')
        else:
            log.debug('No book description found in HTML')
    # }}}
@ -390,10 +416,40 @@ def _translageLanguageToCode(displayLang): # {{{
                u'Итальянский': 'it',
                u'Испанский': 'es',
                u'Китайский': 'zh',
-                u'Японский': 'ja' }
+                u'Японский': 'ja',
+                u'Финский' : 'fi',
+                u'Польский' : 'pl',}
    return langTbl.get(displayLang, None)
 # }}}

+# [В.П. Колесников | Колесников В.П.]-> В. П. BКолесников
+def _normalizeAuthorNameWithInitials(name): # {{{
+    res = name
+    if name: 
+        re1 = u'^(?P<lname>\S+)\s+(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?$' 
+        re2 = u'^(?P<fname>[^\d\W]\.)(?:\s*(?P<mname>[^\d\W]\.))?\s+(?P<lname>\S+)$'
+        matcher = re.match(re1, unicode(name), re.UNICODE)
+        if not matcher:
+            matcher = re.match(re2, unicode(name), re.UNICODE)
+            
+        if matcher:
+            d = matcher.groupdict()
+            res = ' '.join(x for x in (d['fname'], d['mname'], d['lname']) if x)
+    return res
+# }}}
+
+def toPubdate(log, yearAsString):
+    res = None
+    if yearAsString:
+        try:
+            year = int(yearAsString)
+            # only year is available, so use 1-st of Jan
+            res = datetime.datetime(year, 1, 1)
+        except:
+            log.error('cannot parse to date %s'%yearAsString)
+    return res
+
+
 if __name__ == '__main__': # tests {{{
    # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/ozon.py
    # comment some touched_fields before run thoses tests
@ -403,40 +459,45 @@ def _translageLanguageToCode(displayLang): # {{{

    test_identify_plugin(Ozon.name,
        [
-
-            (
+#            (
+#                {'identifiers':{}, 'title':u'Норвежский язык: Практический курс',
+#                    'authors':[u'Колесников В.П.', u'Г.В. Шатков']},
+#                [title_test(u'Норвежский язык: Практический курс', exact=True),
+#                 authors_test([u'В. П. Колесников', u'Г. В. Шатков'])]
+#            ),
+             (
                {'identifiers':{'isbn': '9785916572629'} },
                [title_test(u'На все четыре стороны', exact=True),
                 authors_test([u'А. А. Гилл'])]
-            ),
-            (
+             ),
+             (
                {'identifiers':{}, 'title':u'Der Himmel Kennt Keine Gunstlinge',
                    'authors':[u'Erich Maria Remarque']},
                [title_test(u'Der Himmel Kennt Keine Gunstlinge', exact=True),
                 authors_test([u'Erich Maria Remarque'])]
-            ),
-            (
+             ),
+             (
                {'identifiers':{ }, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=False)]
-            ),
-            (
+             ),
+             (
                {'identifiers':{'isbn': '9785170727209'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                    authors_test([u'Дмитрий Глуховский']),
                    isbn_test('9785170727209')]
-            ),
-            (
+             ),
+             (
                {'identifiers':{'isbn': '5-699-13613-4'}, 'title':u'Метро 2033',
                    'authors':[u'Дмитрий Глуховский']},
                [title_test(u'Метро 2033', exact=True),
                 authors_test([u'Дмитрий Глуховский'])]
-            ),
-            (
+             ),
+             (
                {'identifiers':{}, 'title':u'Метро',
                    'authors':[u'Глуховский']},
                [title_test(u'Метро', exact=False)]
-            ),
+             ),
    ])
 # }}}
--- a/src/calibre/gui2/store/stores/ozon_ru_plugin.py
+++ b/src/calibre/gui2/store/stores/ozon_ru_plugin.py
@ -80,13 +80,15 @@ def get_details(self, search_result, timeout=60):
            doc = html.fromstring(f.read())
            
            # example where we are going to find formats
-            # <div class="box">
-            # ...
-            #     <b>Доступные&nbsp;форматы:</b>
-            #     <div class="vertpadd">.epub, .fb2, .pdf, .pdf, .txt</div>
-            # ...
+            # <div class="l">
+            #     <p>            
+            #         Доступно:
+            #    </p>
            # </div>
-            xpt = u'normalize-space(//div[@class="box"]//*[contains(normalize-space(text()), "Доступные форматы:")][1]/following-sibling::div[1]/text())'
+            # <div class="l">
+            #     <p>.epub, .fb2.zip, .pdf</p>
+            # </div>
+            xpt = u'normalize-space(//div[contains(@class, "product-detail")]//*[contains(normalize-space(text()), "Доступ")]/ancestor-or-self::div[1]/following-sibling::div[1]/*[1])'
            formats = doc.xpath(xpt)
            if formats:
                result = True