Merge pull request #3883 from GrahamCobb/bareasc

Experimental "bare-ASCII" matching query
This commit is contained in:
Graham Cobb 2021-03-17 11:06:31 +00:00 committed by GitHub
commit 39cb013bd3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 315 additions and 0 deletions

85
beetsplug/bareasc.py Normal file
View file

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
# This file is part of beets.
# Copyright 2016, Philippe Mongeau.
# Copyright 2021, Graham R. Cobb.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# This module is adapted from Fuzzy in accordance with the licence of
# that module.
"""Provides a bare-ASCII matching query."""
from __future__ import division, absolute_import, print_function
from beets import ui
from beets.ui import print_, decargs
from beets.plugins import BeetsPlugin
from beets.dbcore.query import StringFieldQuery
from unidecode import unidecode
import six
class BareascQuery(StringFieldQuery):
    """Compare items using bare ASCII, without accents etc."""

    @classmethod
    def string_match(cls, pattern, val):
        """Return whether `pattern` occurs in `val` after ASCII folding.

        Both the pattern and the value are transliterated to plain ASCII
        with ``unidecode``; the pattern is then looked up as a substring
        of the value.

        If the pattern is all lower case ("smartcase"), the comparison
        also ignores case.
        """
        # smartcase: decided on the pattern as typed, before transliteration
        ignore_case = pattern.islower()
        pattern = unidecode(pattern)
        val = unidecode(val)
        if ignore_case:
            # Fold case *after* transliterating. unidecode can introduce
            # capital letters for characters that have no case of their own
            # (CJK text, for instance, comes out as capitalised
            # romanisation), so lower-casing only the raw value beforehand
            # would leave the comparison case sensitive for such text.
            pattern = pattern.lower()
            val = val.lower()
        return pattern in val
class BareascPlugin(BeetsPlugin):
    """Beets plugin offering bare-ASCII matching and a ``bareasc`` command."""

    def __init__(self):
        """Initialise the plugin; the query prefix defaults to ``#``."""
        super(BareascPlugin, self).__init__()
        defaults = {'prefix': '#'}
        self.config.add(defaults)

    def queries(self):
        """Map the configured prefix to the bare-ASCII query class."""
        return {self.config['prefix'].as_str(): BareascQuery}

    def commands(self):
        """Return the ``bareasc`` subcommand, a unidecode variant of 'list'."""
        bareasc_cmd = ui.Subcommand(
            'bareasc',
            help='unidecode version of beet list command'
        )
        example = u'Example: %prog -f \'$album: $title\' artist:beatles'
        bareasc_cmd.parser.usage += u"\n" + example
        bareasc_cmd.parser.add_all_common_options()
        bareasc_cmd.func = self.unidecode_list
        return [bareasc_cmd]

    def unidecode_list(self, lib, opts, args):
        """Print the matching albums or items, transliterated to ASCII.

        Albums are listed when ``opts.album`` is set, items otherwise.
        Each result is converted to text, passed through ``unidecode``
        and printed on its own.
        """
        query = decargs(args)
        # Selection logic follows list_items in commands.py.
        matches = lib.albums(query) if opts.album else lib.items(query)
        for entry in matches:
            bare = unidecode(six.ensure_text(str(entry)))
            print_(six.ensure_text(bare))

View file

@ -38,6 +38,11 @@ Major new features:
`Deezer`_ database.
Thanks to :user:`rhlahuja`.
:bug:`3355`
* A new :doc:`/plugins/bareasc` provides a new query type: `bare ASCII`
which ignores accented characters, treating them as though they
were the base ASCII character. To perform `bare ASCII` searches, use
the ``#`` prefix with :ref:`list-cmd` or other commands.
:bug:`3882`
Other new things:

69
docs/plugins/bareasc.rst Normal file
View file

@ -0,0 +1,69 @@
Bare-ASCII Search Plugin
========================
The ``bareasc`` plugin provides a prefixed query that searches your library using
simple ASCII character matching, with accented characters folded to their base
ASCII character. This can be useful if you want to find a track with accented
characters in the title or artist, particularly if you are not confident
you have the accents correct. It is also not unknown for the accents
to be incorrect in the database entry or in the CD information.
First, enable the plugin named ``bareasc`` (see :ref:`using-plugins`).
You'll then be able to use the ``#`` prefix to use bare-ASCII matching::
$ beet ls '#dvorak'
István Kertész - REQUIEM - Dvořàk: Requiem, op.89 - Confutatis maledictis
Command
-------
In addition to the query prefix, the plugin provides a utility ``bareasc`` command.
This command is **exactly** the same as the ``beet list`` command except that
the output is passed through the bare-ASCII transformation before being printed.
This allows you to easily check what the library data looks like in bare ASCII,
which can be useful if you are trying to work out why a query is not matching.
Using the same example track as above::
$ beet bareasc 'Dvořàk'
Istvan Kertesz - REQUIEM - Dvorak: Requiem, op.89 - Confutatis maledictis
Note: the ``bareasc`` command does *not* automatically use bare-ASCII queries.
If you want a bare-ASCII query you still need to specify the ``#`` prefix.
Notes
-----
If the query string is all in lower case, the comparison ignores case as well as
accents.
The default ``bareasc`` prefix (``#``) is used as a comment character in some shells,
so it may need to be protected (for example, in quotes) when typed on the command line.
The bare ASCII transliteration is quite simple. It may not give the expected output
for all languages. For example, German u-umlaut ``ü`` is transformed into ASCII ``u``,
not into ``ue``.
The bare ASCII transformation also changes Unicode punctuation like double quotes,
apostrophes and even some hyphens. It is often best to leave out punctuation
in the queries. Note that the punctuation changes are often not even visible
with normal terminal fonts. You can always use the ``bareasc`` command to print the
transformed entries and use a command like ``diff`` to compare with the output
from the ``list`` command.
Configuration
-------------
To configure the plugin, make a ``bareasc:`` section in your configuration
file. The only available option is:
- **prefix**: The character used to designate bare-ASCII queries.
Default: ``#``, which may need to be escaped in some shells.
Credits
-------
The hard work in this plugin is done in Sean Burke's
`Unidecode <https://pypi.org/project/Unidecode/>`__ library.
Thanks are due to Sean and to all the people who created the Python
version and the beets extensible query architecture.

View file

@ -63,6 +63,7 @@ following to your configuration::
acousticbrainz
aura
badfiles
bareasc
beatport
bpd
bpm
@ -218,6 +219,7 @@ Interoperability
Miscellaneous
-------------
* :doc:`bareasc`: Search albums and tracks with bare ASCII string matching.
* :doc:`bpd`: A music player for your beets library that emulates `MPD`_ and is
compatible with `MPD clients`_.
* :doc:`convert`: Transcode music and embed album art while exporting to

154
test/test_bareasc.py Normal file
View file

@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
# This file is part of beets.
# Copyright 2021, Graham R. Cobb.
"""Tests for the 'bareasc' plugin."""
from __future__ import division, absolute_import, print_function
import unittest
from test.helper import capture_stdout, TestHelper
from beets import logging
class BareascPluginTest(unittest.TestCase, TestHelper):
    """Test bare ASCII query matching."""

    def setUp(self):
        """Set up test environment for bare ASCII query matching."""
        self.setup_beets()
        # Not used by any test in this class; kept so the attribute remains.
        self.log = logging.getLogger('beets.web')
        self.config['bareasc']['prefix'] = u'#'
        self.load_plugins('bareasc')

        # Add library elements. Note that self.lib.add overrides any "id=<n>"
        # and assigns the next free id number.
        self.add_item(title=u'with accents',
                      album_id=2,
                      artist=u'Antonín Dvořák')
        self.add_item(title=u'without accents',
                      artist=u'Antonín Dvorak')
        self.add_item(title=u'with umlaut',
                      album_id=2,
                      artist=u'Brüggen')
        self.add_item(title=u'without umlaut or e',
                      artist=u'Bruggen')
        self.add_item(title=u'without umlaut with e',
                      artist=u'Brueggen')

    def tearDown(self):
        """Undo setUp so plugin and fixture state do not leak between tests."""
        self.unload_plugins()
        self.teardown_beets()

    def _matching_titles(self, query):
        """Return the titles of all library items matching `query`."""
        return [item.title for item in self.lib.items(query)]

    def test_search_normal_noaccent(self):
        """Normal search, no accents, not using bare-ASCII match.

        Finds just the unaccented entry.
        """
        titles = self._matching_titles(u'dvorak')

        self.assertEqual(len(titles), 1)
        self.assertEqual(titles, [u'without accents'])

    def test_search_normal_accent(self):
        """Normal search, with accents, not using bare-ASCII match.

        Finds just the accented entry.
        """
        titles = self._matching_titles(u'dvořák')

        self.assertEqual(len(titles), 1)
        self.assertEqual(titles, [u'with accents'])

    def test_search_bareasc_noaccent(self):
        """Bare-ASCII search, no accents.

        Finds both entries.
        """
        titles = self._matching_titles(u'#dvorak')

        self.assertEqual(len(titles), 2)
        self.assertEqual(
            set(titles),
            {u'without accents', u'with accents'}
        )

    def test_search_bareasc_accent(self):
        """Bare-ASCII search, with accents.

        Finds both entries.
        """
        titles = self._matching_titles(u'#dvořák')

        self.assertEqual(len(titles), 2)
        self.assertEqual(
            set(titles),
            {u'without accents', u'with accents'}
        )

    def test_search_bareasc_wrong_accent(self):
        """Bare-ASCII search, with incorrect accent.

        Finds both entries.
        """
        titles = self._matching_titles(u'#dvořäk')

        self.assertEqual(len(titles), 2)
        self.assertEqual(
            set(titles),
            {u'without accents', u'with accents'}
        )

    def test_search_bareasc_noumlaut(self):
        """Bare-ASCII search, with no umlaut.

        Finds entry with 'u' not 'ue', although German speaker would
        normally replace ü with ue.

        This is expected behaviour for this simple plugin.
        """
        titles = self._matching_titles(u'#Bruggen')

        self.assertEqual(len(titles), 2)
        self.assertEqual(
            set(titles),
            {u'without umlaut or e', u'with umlaut'}
        )

    def test_search_bareasc_umlaut(self):
        """Bare-ASCII search, with umlaut.

        Finds entry with 'u' not 'ue', although German speaker would
        normally replace ü with ue.

        This is expected behaviour for this simple plugin.
        """
        titles = self._matching_titles(u'#Brüggen')

        self.assertEqual(len(titles), 2)
        self.assertEqual(
            set(titles),
            {u'without umlaut or e', u'with umlaut'}
        )

    def test_bareasc_list_output(self):
        """Bare-ASCII version of list command - check output."""
        with capture_stdout() as output:
            self.run_command('bareasc', 'with accents')

        self.assertIn('Antonin Dvorak', output.getvalue())

    def test_bareasc_format_output(self):
        """Bare-ASCII version of list -f command - check output."""
        with capture_stdout() as output:
            self.run_command('bareasc', 'with accents',
                             '-f', '$artist:: $title')

        self.assertEqual('Antonin Dvorak:: with accents\n',
                         output.getvalue())
def suite():
    """Build a test suite from the tests defined in this module."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)
# When executed directly as a script, run the tests collected by suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')