Merge branch 'master' into import-filefilter

Conflicts: test/test_importer.py
2026-02-20 14:21:34 +01:00 · 2014-12-31 17:04:16 +01:00 · 2014-12-31 17:04:16 +01:00 · ab996a0de0
commit ab996a0de0
parent 7f4a06d12c 405726ba13
9 changed files with 99 additions and 50 deletions
--- a/beets/importer.py
+++ b/beets/importer.py
@ -1313,10 +1313,12 @@ def log_files(session, task):
        return

    if isinstance(task, SingletonImportTask):
-        log.info(displayable_path(task.item['path']))
+        log.info(
+            'Singleton: {0}'.format(displayable_path(task.item['path'])))
    elif task.items:
+        log.info('Album {0}'.format(displayable_path(task.paths[0])))
        for item in task.items:
-            log.info(displayable_path(item['path']))
+            log.info('  {0}'.format(displayable_path(item['path'])))


 def group_albums(session):
--- a/beetsplug/lastgenre/init.py
+++ b/beetsplug/lastgenre/init.py
@ -41,6 +41,10 @@ PYLAST_EXCEPTIONS = (
    pylast.NetworkError,
 )

+REPLACE = {
+    u'\u2010': '-',
+}
+

 def deduplicate(seq):
    """Remove duplicates from sequence wile preserving order.
@ -235,10 +239,13 @@ class LastGenrePlugin(plugins.BeetsPlugin):

    # Cached entity lookups.

-    def _cached_lookup(self, entity, method, *args):
+    def _last_lookup(self, entity, method, *args):
        """Get a genre based on the named entity using the callable `method`
        whose arguments are given in the sequence `args`. The genre lookup
-        is cached based on the entity name and the arguments.
+        is cached based on the entity name and the arguments. Before the
+        lookup, each argument is has some Unicode characters replaced with
+        rough ASCII equivalents in order to return better results from the
+        Last.fm database.
        """
        # Shortcut if we're missing metadata.
        if any(not s for s in args):
@ -248,32 +255,38 @@ class LastGenrePlugin(plugins.BeetsPlugin):
        if key in self._genre_cache:
            return self._genre_cache[key]
        else:
-            genre = self.fetch_genre(method(*args))
+            args_replaced = []
+            for arg in args:
+                for k, v in REPLACE.items():
+                    arg = arg.replace(k, v)
+                args_replaced.append(arg)
+
+            genre = self.fetch_genre(method(*args_replaced))
            self._genre_cache[key] = genre
            return genre

    def fetch_album_genre(self, obj):
        """Return the album genre for this Item or Album.
        """
-        return self._cached_lookup(u'album', LASTFM.get_album, obj.albumartist,
-                                   obj.album)
+        return self._last_lookup(u'album', LASTFM.get_album, obj.albumartist,
+                                 obj.album)

    def fetch_album_artist_genre(self, obj):
        """Return the album artist genre for this Item or Album.
        """
-        return self._cached_lookup(u'artist', LASTFM.get_artist,
-                                   obj.albumartist)
+        return self._last_lookup(u'artist', LASTFM.get_artist,
+                                 obj.albumartist)

    def fetch_artist_genre(self, item):
        """Returns the track artist genre for this Item.
        """
-        return self._cached_lookup(u'artist', LASTFM.get_artist, item.artist)
+        return self._last_lookup(u'artist', LASTFM.get_artist, item.artist)

    def fetch_track_genre(self, obj):
        """Returns the track genre for this Item.
        """
-        return self._cached_lookup(u'track', LASTFM.get_track, obj.artist,
-                                   obj.title)
+        return self._last_lookup(u'track', LASTFM.get_track, obj.artist,
+                                 obj.title)

    def _get_genre(self, obj):
        """Get the genre string for an Album or Item object based on
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -90,7 +90,7 @@ def extract_text_between(html, start_marker, end_marker):
        html, _ = html.split(end_marker, 1)
    except ValueError:
        return u''
-    return _scrape_strip_cruft(html, True)
+    return html


 def extract_text_in(html, starttag):
@ -124,8 +124,7 @@ def extract_text_in(html, starttag):
    else:
        print('no closing tag found!')
        return
-    lyrics = ''.join(parts)
-    return _scrape_strip_cruft(lyrics, True)
+    return u''.join(parts)


 def search_pairs(item):
@ -221,7 +220,7 @@ def fetch_lyricswiki(artist, title):
    if not html:
        return

-    lyrics = extract_text_in(html, "<div class='lyricbox'>")
+    lyrics = extract_text_in(html, u"<div class='lyricbox'>")
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics

@ -360,13 +359,14 @@ def _scrape_strip_cruft(html, plain_text_out=False):
        html = COMMENT_RE.sub('', html)
        html = TAG_RE.sub('', html)

-    # Strip lines
    html = '\n'.join([x.strip() for x in html.strip().split('\n')])
+    html = re.sub(r'\n{3,}', r'\n\n', html)
    return html


 def _scrape_merge_paragraphs(html):
-    return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
+    html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
+    return re.sub(r'<div .*>\s*</div>', '\n', html)


 def scrape_lyrics_from_html(html):
@ -541,4 +541,4 @@ class LyricsPlugin(plugins.BeetsPlugin):
            if lyrics:
                log.debug(u'got lyrics from backend: {0}'
                          .format(backend.__name__))
-                return lyrics.strip()
+                return _scrape_strip_cruft(lyrics, True)
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -30,6 +30,8 @@ New:

 Fixed:

+* :doc:`/plugins/lyrics`: Avoid fetching truncated lyrics from the Google
+  backed by merging text blocks separated by empty ``<div>`` before scraping.
 * Fix a new crash with the latest version of Mutagen (1.26).
 * We now print a better error message when the database file is corrupted.
 * :doc:`/plugins/discogs`: Only prompt for authentication when running the
@ -64,6 +66,8 @@ Fixed:
  the import process. Thanks to :user:`Freso`. :bug:`1176` :bug:`1172`
 * :doc:`/plugins/ftintitle`: Fix weird behavior when the same artist appears
  twice in the artist string. Thanks to Marc Addeo. :bug:`1179` :bug:`1181`
+* :doc:`/plugins/lastgenre`: Match songs more robustly when they contain
+  dashes. Thanks to :user:`djl`. :bug:`1156`

 .. _API changes: http://developer.echonest.com/forums/thread/3650
 .. _Plex: https://plex.tv/
--- a/test/helper.py
+++ b/test/helper.py
@ -120,6 +120,8 @@ def has_program(cmd, args=['--version']):
                                  stdout=devnull, stdin=devnull)
    except OSError:
        return False
+    except subprocess.CalledProcessError:
+        return False
    else:
        return True

--- a/test/rsrc/lyrics/examplecom/beetssong.txt
+++ b/test/rsrc/lyrics/examplecom/beetssong.txt
@ -222,7 +222,9 @@ e9.size = "120x600, 160x600";
        <h2>John Doe <br> beets song lyrics</h2>
 <img src="images/phone-left.gif" alt="Ringtones left icon" width="16" height="17"> <a href="http://www.ringtonematcher.com/go/?sid=LBSMros&amp;artist=The+John Doe&amp;song=Beets+Song" target="_blank"><b><font size="+1" color="red" face="arial">Send "beets song" Ringtone to your Cell</font></b></a> <img src="images/phone-right.gif" alt="Ringtones right icon" width="16" height="17"><br><br><center>Beets is the media library management system for obsessive-compulsive music geeks.<br>
 The purpose of beets is to get your music collection right once and for all. It catalogs your collection, automatically improving its metadata as it goes. It then provides a bouquet of tools for manipulating and accessing your music.<br>
-Here's an example of beets' brainy tag corrector doing its thing:</center>
+<div class='flow breaker'>           </div>
+Here's an example of beets' brainy tag corrector doing its thing:
+Because beets is designed as a library, it can do almost anything you can imagine for your music collection. Via plugins, beets becomes a panacea</center>
 <img src="images/phone-left.gif" alt="Ringtones left icon" width="16" height="17"> <a href="http://www.ringtonematcher.com/go/?sid=LBSMros&amp;artist=The+John Doe&amp;song=Beets+Song" target="_blank"><b><font size="+1" color="red" face="arial">Send "beets song" Ringtone to your Cell</font></b></a> <img src="images/phone-right.gif" alt="Ringtones right icon" width="16" height="17"><br><br>
        <center>
         <font color="black" size="2" face="arial">Share <strong>beets song lyrics</strong></font><br><p style="height: 1px; margin: 3pt; padding: 0pt;"></p>
--- a/test/rsrc/lyricstext.yaml
+++ b/test/rsrc/lyricstext.yaml
@ -1,3 +1,8 @@
+Beets_song:
+    - geeks
+    - bouquet
+    - panacea
+
 Amsterdam:
    - oriflammes
    - fortune
--- a/test/test_importer.py
+++ b/test/test_importer.py
@ -1542,7 +1542,8 @@ class ImportPretendTest(_common.TestCase, ImportHelper):
    def setUp(self):
        super(ImportPretendTest, self).setUp()
        self.setup_beets()
-        self._create_import_dir(1)
+        self.__create_import_dir()
+        self.__create_empty_import_dir()
        self._setup_import_session()
        config['import']['pretend'] = True
        self.matcher = AutotagStub().install()
@ -1552,37 +1553,29 @@ class ImportPretendTest(_common.TestCase, ImportHelper):
        self.teardown_beets()
        self.matcher.restore()

-    def test_import_pretend(self):
+    def __create_import_dir(self):
+        self._create_import_dir(1)
        resource_path = os.path.join(_common.RSRC, u'empty.mp3')
        single_path = os.path.join(self.import_dir, u'track_2.mp3')
-
        shutil.copy(resource_path, single_path)
-        import_files = [
+        self.import_paths = [
            os.path.join(self.import_dir, u'the_album'),
            single_path
        ]
-        self._setup_import_session(singletons=True)
-        self.importer.paths = import_files
+        self.import_files = [
+            displayable_path(
+                os.path.join(self.import_paths[0], u'track_1.mp3')),
+            displayable_path(single_path)
+        ]

-        with capture_log() as logs:
-            self.importer.run()
-
-        logs = [line for line in logs if not line.startswith('Sending event:')]
-
-        self.assertEqual(len(self.lib.items()), 0)
-        self.assertEqual(len(self.lib.albums()), 0)
-
-        self.assertEqual(len(logs), 2)
-        self.assertEqual(logs[0], os.path.join(import_files[0],
-                                               u'track_1.mp3'))
-        self.assertEqual(logs[1], import_files[1])
-
-    def test_import_pretend_empty(self):
+    def __create_empty_import_dir(self):
        path = os.path.join(self.temp_dir, 'empty')
        os.makedirs(path)
+        self.empty_path = path

-        self._setup_import_session(singletons=True)
-        self.importer.paths = [path]
+    def __run(self, import_paths, singletons=True):
+        self._setup_import_session(singletons=singletons)
+        self.importer.paths = import_paths

        with capture_log() as logs:
            self.importer.run()
@ -1592,9 +1585,29 @@ class ImportPretendTest(_common.TestCase, ImportHelper):
        self.assertEqual(len(self.lib.items()), 0)
        self.assertEqual(len(self.lib.albums()), 0)

-        self.assertEqual(len(logs), 1)
-        self.assertEqual(logs[0], 'No files imported from {0}'
-                                  .format(displayable_path(path)))
+        return logs
+
+    def test_import_singletons_pretend(self):
+        logs = self.__run(self.import_paths)
+
+        self.assertEqual(logs, [
+            'Singleton: %s' % self.import_files[0],
+            'Singleton: %s' % self.import_paths[1]])
+
+    def test_import_album_pretend(self):
+        logs = self.__run(self.import_paths, singletons=False)
+
+        self.assertEqual(logs, [
+            'Album %s' % displayable_path(self.import_paths[0]),
+            '  %s' % self.import_files[0],
+            'Album %s' % displayable_path(self.import_paths[1]),
+            '  %s' % self.import_paths[1]])
+
+    def test_import_pretend_empty(self):
+        logs = self.__run([self.empty_path])
+
+        self.assertEqual(logs, ['No files imported from {0}'
+                         .format(displayable_path(self.empty_path))])


 def suite():
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -300,6 +300,15 @@ class LyricsGooglePluginTest(unittest.TestCase):
        lyrics.LyricsPlugin()
        lyrics.fetch_url = MockFetchUrl()

+    def test_mocked_source_ok(self):
+        """Test that lyrics of the mocked page are correctly scraped"""
+        url = self.source['url'] + self.source['path']
+        if os.path.isfile(url_to_filename(url)):
+            res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
+            self.assertTrue(lyrics.is_lyrics(res), url)
+            self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
+                            url)
+
    def test_google_sources_ok(self):
        """Test if lyrics present on websites registered in beets google custom
        search engine are correctly scraped."""
@ -319,12 +328,11 @@ class LyricsGooglePluginTest(unittest.TestCase):
        for (fun, s) in zip([lyrics.fetch_lyricswiki,
                             lyrics.fetch_lyricscom,
                             lyrics.fetch_musixmatch], DEFAULT_SOURCES):
-            if os.path.isfile(url_to_filename(
-                              s['url'] + s['path'])):
+            url = s['url'] + s['path']
+            if os.path.isfile(url_to_filename(url)):
                res = fun(s['artist'], s['title'])
-                self.assertTrue(lyrics.is_lyrics(res))
-                self.assertTrue(is_lyrics_content_ok(
-                                s['title'], res))
+                self.assertTrue(lyrics.is_lyrics(res), url)
+                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)

    def test_is_page_candidate_exact_match(self):
        """Test matching html page title with song infos -- when song infos are