diff --git a/beetsplug/bareasc.py b/beetsplug/bareasc.py
new file mode 100644
index 000000000..4d574c756
--- /dev/null
+++ b/beetsplug/bareasc.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# This file is part of beets.
+# Copyright 2016, Philippe Mongeau.
+# Copyright 2021, Graham R. Cobb.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# This module is adapted from Fuzzy in accordance with the licence of
+# that module.
+
+"""Provides a bare-ASCII matching query."""
+
+from __future__ import division, absolute_import, print_function
+
+from beets import ui
+from beets.ui import print_, decargs
+from beets.plugins import BeetsPlugin
+from beets.dbcore.query import StringFieldQuery
+from unidecode import unidecode
+import six
+
+
+class BareascQuery(StringFieldQuery):
+    """Compare items using bare ASCII, without accents etc."""
+    @classmethod
+    def string_match(cls, pattern, val):
+        """Convert both pattern and string to plain ASCII before matching.
+
+        If pattern is all lower case, the match is also case insensitive
+        ("smartcase"); otherwise it is case sensitive.
+
+        Returns True when the transliterated pattern is a substring of
+        the transliterated value.
+        """
+        smartcase = pattern.islower()
+        if smartcase:
+            val = val.lower()
+        pattern = unidecode(pattern)
+        val = unidecode(val)
+        if smartcase:
+            # Transliteration can itself produce capitals (for example
+            # CJK text becomes capitalised romanisation), so fold case
+            # again afterwards or a lower-case pattern could never match.
+            pattern = pattern.lower()
+            val = val.lower()
+        return pattern in val
+
+
+class BareascPlugin(BeetsPlugin):
+    """Plugin to provide bare-ASCII option for beets matching."""
+    def __init__(self):
+        """Default prefix for selecting bare-ASCII matching is #."""
+        super(BareascPlugin, self).__init__()
+        self.config.add({
+            'prefix': '#',
+        })
+
+    def queries(self):
+        """Register bare-ASCII matching under the configured prefix."""
+        prefix = self.config['prefix'].as_str()
+        return {prefix: BareascQuery}
+
+    def commands(self):
+        """Add bareasc command as unidecode version of 'list'."""
+        cmd = ui.Subcommand('bareasc',
+                            help='unidecode version of beet list command')
+        cmd.parser.usage += u"\n" \
+            u'Example: %prog -f \'$album: $title\' artist:beatles'
+        cmd.parser.add_all_common_options()
+        cmd.func = self.unidecode_list
+        return [cmd]
+
+    def unidecode_list(self, lib, opts, args):
+        """Emulate normal 'list' command but with unidecode output."""
+        query = decargs(args)
+        # Modelled on list_items in commands.py. A separate loop name is
+        # used so the opts.album flag is not clobbered by the results.
+        objs = lib.albums(query) if opts.album else lib.items(query)
+        for obj in objs:
+            bare = unidecode(six.ensure_text(str(obj)))
+            print_(six.ensure_text(bare))
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 7338282f5..07fa573f9 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -38,6 +38,11 @@ Major new features:
   `Deezer`_ database.
   Thanks to :user:`rhlahuja`.
   :bug:`3355`
+* A new :doc:`/plugins/bareasc` provides a new query type: `bare ASCII`
+  which ignores accented characters, treating them as though they
+  were the base ASCII character. To perform `bare ASCII` searches, use
+  the ``#`` prefix with :ref:`list-cmd` or other commands.
+  :bug:`3882`
 
 Other new things:
 
diff --git a/docs/plugins/bareasc.rst b/docs/plugins/bareasc.rst
new file mode 100644
index 000000000..0c8d6636c
--- /dev/null
+++ b/docs/plugins/bareasc.rst
@@ -0,0 +1,69 @@
+Bare-ASCII Search Plugin
+========================
+
+The ``bareasc`` plugin provides a prefixed query that searches your library using
+simple ASCII character matching, with accented characters folded to their base
+ASCII character. This can be useful if you want to find a track with accented
+characters in the title or artist, particularly if you are not confident
+you have the accents correct. It is also not unusual for the accents
+to be incorrect in the database entry or in the CD information.
+
+First, enable the plugin named ``bareasc`` (see :ref:`using-plugins`).
+You'll then be able to use the ``#`` prefix to request bare-ASCII matching::
+
+    $ beet ls '#dvorak'
+    István Kertész - REQUIEM - Dvořàk: Requiem, op.89 - Confutatis maledictis
+
+Command
+-------
+
+In addition to the query prefix, the plugin provides a utility ``bareasc`` command.
+This command is **exactly** the same as the ``beet list`` command except that
+the output is passed through the bare-ASCII transformation before being printed.
+This allows you to easily check what the library data looks like in bare ASCII,
+which can be useful if you are trying to work out why a query is not matching.
+
+Using the same example track as above::
+
+    $ beet bareasc 'Dvořàk'
+    Istvan Kertesz - REQUIEM - Dvorak: Requiem, op.89 - Confutatis maledictis
+
+Note: the ``bareasc`` command does *not* automatically use bare-ASCII queries.
+If you want a bare-ASCII query you still need to specify the ``#`` prefix.
+
+Notes
+-----
+
+If the query string is all in lower case, the comparison ignores case as well as
+accents.
+
+The default ``bareasc`` prefix (``#``) is used as a comment character in some shells
+so may need to be protected (for example in quotes) when typed into the command line.
+
+The bare ASCII transliteration is quite simple. It may not give the expected output
+for all languages. For example, German u-umlaut ``ü`` is transformed into ASCII ``u``,
+not into ``ue``.
+
+The bare ASCII transformation also changes Unicode punctuation like double quotes,
+apostrophes and even some hyphens. It is often best to leave out punctuation
+in the queries. Note that the punctuation changes are often not even visible
+with normal terminal fonts. You can always use the ``bareasc`` command to print the
+transformed entries and use a command like ``diff`` to compare with the output
+from the ``list`` command.
+
+Configuration
+-------------
+
+To configure the plugin, make a ``bareasc:`` section in your configuration
+file. The only available option is:
+
+- **prefix**: The character used to designate bare-ASCII queries.
+  Default: ``#``, which may need to be escaped in some shells.
+
+Credits
+-------
+
+The hard work in this plugin is done in Sean Burke's
+`Unidecode <https://pypi.org/project/Unidecode/>`__ library.
+Thanks are due to Sean and to all the people who created the Python
+version and the beets extensible query architecture.
diff --git a/docs/plugins/index.rst b/docs/plugins/index.rst
index ae14b8166..14dd5137b 100644
--- a/docs/plugins/index.rst
+++ b/docs/plugins/index.rst
@@ -63,6 +63,7 @@ following to your configuration::
    acousticbrainz
    aura
    badfiles
+   bareasc
    beatport
    bpd
    bpm
@@ -218,6 +219,7 @@ Interoperability
 Miscellaneous
 -------------
 
+* :doc:`bareasc`: Search albums and tracks with bare ASCII string matching.
 * :doc:`bpd`: A music player for your beets library that emulates `MPD`_ and is
   compatible with `MPD clients`_.
 * :doc:`convert`: Transcode music and embed album art while exporting to
diff --git a/test/test_bareasc.py b/test/test_bareasc.py
new file mode 100644
index 000000000..1ce4e6176
--- /dev/null
+++ b/test/test_bareasc.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+# This file is part of beets.
+# Copyright 2021, Graham R. Cobb.
+
+"""Tests for the 'bareasc' plugin."""
+
+from __future__ import division, absolute_import, print_function
+
+import unittest
+
+from test.helper import capture_stdout, TestHelper
+
+
+class BareascPluginTest(unittest.TestCase, TestHelper):
+    """Test bare ASCII query matching."""
+
+    def setUp(self):
+        """Set up test environment for bare ASCII query matching."""
+        self.setup_beets()
+        self.config['bareasc']['prefix'] = u'#'
+        self.load_plugins('bareasc')
+
+        # Add library elements. Note that self.lib.add overrides any "id="
+        # and assigns the next free id number.
+        self.add_item(title=u'with accents',
+                      album_id=2,
+                      artist=u'Antonín Dvořák')
+        self.add_item(title=u'without accents',
+                      artist=u'Antonín Dvorak')
+        self.add_item(title=u'with umlaut',
+                      album_id=2,
+                      artist=u'Brüggen')
+        self.add_item(title=u'without umlaut or e',
+                      artist=u'Bruggen')
+        self.add_item(title=u'without umlaut with e',
+                      artist=u'Brueggen')
+
+    def tearDown(self):
+        """Unload the plugin and remove the temporary beets environment."""
+        self.unload_plugins()
+        self.teardown_beets()
+
+    def test_search_normal_noaccent(self):
+        """Normal search, no accents, not using bare-ASCII match.
+
+        Finds just the unaccented entry.
+        """
+        items = self.lib.items(u'dvorak')
+
+        self.assertEqual(len(items), 1)
+        self.assertEqual([items[0].title], [u'without accents'])
+
+    def test_search_normal_accent(self):
+        """Normal search, with accents, not using bare-ASCII match.
+
+        Finds just the accented entry.
+        """
+        items = self.lib.items(u'dvořák')
+
+        self.assertEqual(len(items), 1)
+        self.assertEqual([items[0].title], [u'with accents'])
+
+    def test_search_bareasc_noaccent(self):
+        """Bare-ASCII search, no accents.
+
+        Finds both entries.
+        """
+        items = self.lib.items(u'#dvorak')
+
+        self.assertEqual(len(items), 2)
+        self.assertEqual(
+            {items[0].title, items[1].title},
+            {u'without accents', u'with accents'}
+        )
+
+    def test_search_bareasc_accent(self):
+        """Bare-ASCII search, with accents.
+
+        Finds both entries.
+        """
+        items = self.lib.items(u'#dvořák')
+
+        self.assertEqual(len(items), 2)
+        self.assertEqual(
+            {items[0].title, items[1].title},
+            {u'without accents', u'with accents'}
+        )
+
+    def test_search_bareasc_wrong_accent(self):
+        """Bare-ASCII search, with incorrect accent.
+
+        Finds both entries.
+        """
+        items = self.lib.items(u'#dvořäk')
+
+        self.assertEqual(len(items), 2)
+        self.assertEqual(
+            {items[0].title, items[1].title},
+            {u'without accents', u'with accents'}
+        )
+
+    def test_search_bareasc_noumlaut(self):
+        """Bare-ASCII search, with no umlaut.
+
+        Finds entry with 'u' not 'ue', although German speaker would
+        normally replace ü with ue.
+
+        This is expected behaviour for this simple plugin.
+        """
+        items = self.lib.items(u'#Bruggen')
+
+        self.assertEqual(len(items), 2)
+        self.assertEqual(
+            {items[0].title, items[1].title},
+            {u'without umlaut or e', u'with umlaut'}
+        )
+
+    def test_search_bareasc_umlaut(self):
+        """Bare-ASCII search, with umlaut.
+
+        Finds entry with 'u' not 'ue', although German speaker would
+        normally replace ü with ue.
+
+        This is expected behaviour for this simple plugin.
+        """
+        items = self.lib.items(u'#Brüggen')
+
+        self.assertEqual(len(items), 2)
+        self.assertEqual(
+            {items[0].title, items[1].title},
+            {u'without umlaut or e', u'with umlaut'}
+        )
+
+    def test_bareasc_list_output(self):
+        """Bare-ASCII version of list command - check output."""
+        with capture_stdout() as output:
+            self.run_command('bareasc', 'with accents')
+
+        self.assertIn('Antonin Dvorak', output.getvalue())
+
+    def test_bareasc_format_output(self):
+        """Bare-ASCII version of list -f command - check output."""
+        with capture_stdout() as output:
+            self.run_command('bareasc', 'with accents',
+                             '-f', '$artist:: $title')
+
+        self.assertEqual('Antonin Dvorak:: with accents\n',
+                         output.getvalue())
+
+
+def suite():
+    """Return a test suite containing every test in this module."""
+    return unittest.TestLoader().loadTestsFromName(__name__)
+
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='suite')