cleanup and docs for regular expression queries

2026-02-07 07:54:43 +01:00 · 2012-05-01 20:03:01 -07:00 · 2012-05-01 20:03:01 -07:00 · 104aec3c2f
commit 104aec3c2f
parent a6e6da245a
4 changed files with 85 additions and 51 deletions
--- a/beets/library.py
+++ b/beets/library.py
@ -425,7 +425,7 @@ class RegexpQuery(FieldQuery):

    def match(self, item):
        value = getattr(item, self.field) or ''
-        return self.regexp.match(value) is not None
+        return self.regexp.search(value) is not None

 class BooleanQuery(MatchQuery):
    """Matches a boolean field. Pattern should either be a boolean or a
@ -476,24 +476,28 @@ class CollectionQuery(Query):
            subvals += subq_subvals
        clause = (' ' + joiner + ' ').join(clause_parts)
        return clause, subvals
-    
-    # regular expression for _parse_query_part, below
-    _pq_regex = re.compile(# non-grouping optional segment for the keyword
-                           r'(?:'
-                                r'(\S+?)'   # the keyword
-                                r'(?<!\\):' # unescaped :
-                           r')?'
-                           r'((?<!\\):?)'  # unescaped : for regexps
-                           r'(.+)',        # the term itself
-                           re.I)            # case-insensitive
+
+    # Regular expression for _parse_query_part, below.
+    _pq_regex = re.compile(
+        # Non-capturing optional segment for the keyword.
+        r'(?:'
+            r'(\S+?)'    # The field key.
+            r'(?<!\\):'  # Unescaped :
+        r')?'
+
+        r'((?<!\\):?)'   # Unescaped : indicating a regex.
+        r'(.+)',         # The term itself.
+
+        re.I  # Case-insensitive.
+    )
    @classmethod
    def _parse_query_part(cls, part):
        """Takes a query in the form of a key/value pair separated by a
        colon. An additional colon before the value indicates that the
-        value is a regular expression.
-        Returns tuple (key, term, is_regexp) where key is None if
-        the search term has no key and is_regexp indicates whether term
-        is a regular expression or not.
+        value is a regular expression. Returns tuple (key, term,
+        is_regexp) where key is None if the search term has no key and
+        is_regexp indicates whether term is a regular expression or an
+        ordinary substring match.

        For instance,
        parse_query('stapler') == (None, 'stapler', false)
@ -507,10 +511,15 @@ class CollectionQuery(Query):
        part = part.strip()
        match = cls._pq_regex.match(part)
        if match:
-            return match.group(1), match.group(3).replace(r'\:', ':'), match.group(2)==':'
+            return (
+                match.group(1),  # Key.
+                match.group(3).replace(r'\:', ':'),  # Term.
+                match.group(2) == ':',  # Regular expression.
+            )

    @classmethod
-    def from_strings(cls, query_parts, default_fields=None, all_keys=ITEM_KEYS):
+    def from_strings(cls, query_parts, default_fields=None,
+                     all_keys=ITEM_KEYS):
        """Creates a query from a list of strings in the format used by
        _parse_query_part. If default_fields are specified, they are the
        fields to be searched by unqualified search terms. Otherwise,
@ -522,30 +531,40 @@ class CollectionQuery(Query):
            if not res:
                continue
            key, pattern, is_regexp = res
-            if key is None: # No key specified.
+
+            # No key specified.
+            if key is None:
                if os.sep in pattern and 'path' in all_keys:
                    # This looks like a path.
                    subqueries.append(PathQuery(pattern))
                else:
                    # Match any field.
                    if is_regexp:
-                        subqueries.append(
-                            AnyRegexpQuery(pattern, default_fields))
+                        subq = AnyRegexpQuery(pattern, default_fields)
                    else:
-                        subqueries.append(
-                            AnySubstringQuery(pattern, default_fields))
-            elif key.lower() == 'comp': # a boolean field
+                        subq = AnySubstringQuery(pattern, default_fields)
+                    subqueries.append(subq)
+
+            # A boolean field.
+            elif key.lower() == 'comp':
                subqueries.append(BooleanQuery(key.lower(), pattern))
+
+            # Path field.
            elif key.lower() == 'path' and 'path' in all_keys:
                subqueries.append(PathQuery(pattern))
-            elif key.lower() in all_keys: # ignore unrecognized keys
+
+            # Other (recognized) field.
+            elif key.lower() in all_keys:
                if is_regexp:
                    subqueries.append(RegexpQuery(key.lower(), pattern))
                else:
                    subqueries.append(SubstringQuery(key.lower(), pattern))
+
+            # Singleton query (not a real field).
            elif key.lower() == 'singleton':
                subqueries.append(SingletonQuery(util.str2bool(pattern)))
-        if not subqueries: # no terms in query
+
+        if not subqueries:  # No terms in query.
            subqueries = [TrueQuery()]
        return cls(subqueries)

@ -890,19 +909,21 @@ class Library(BaseLibrary):

        self.timeout = timeout
        self.conn = sqlite3.connect(self.path, timeout)
+        # This way we can access our SELECT results like dictionaries.
        self.conn.row_factory = sqlite3.Row
-            # this way we can access our SELECT results like dictionaries

        # Add REGEXP function to SQLite queries.
-        def regexp(expr, item):
-            if item == None:
+        def regexp(expr, val):
+            if val is None or expr is None:
                return False
+            if not isinstance(val, basestring):
+                val = unicode(val)
            try:
-                reg = re.compile(expr)
-                res = reg.search(item)
-                return res is not None
-            except:
+                res = re.search(expr, val)
+            except re.error:
+                # Invalid regular expression.
                return False
+            return res is not None
        self.conn.create_function("REGEXP", 2, regexp)

        self._make_table('items', item_fields)
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -14,6 +14,9 @@ Changelog
  The new ``%aunique{}`` template function, which is included in the default
  path formats, ensures that Crystal Castles' albums will be placed into
  different directories. See :ref:`aunique` for details.
+* Beets queries can now use **regular expressions**. Use an additional ``:`` in
+  your query to enable regex matching. See :ref:`regex` for the full details.
+  Thanks to Matteo Mecucci.
 * Artist **sort names** are now fetched from MusicBrainz. There are two new data
  fields, ``artist_sort`` and ``albumartist_sort``, that contain sortable artist
  names like "Beatles, The". These fields are also used to sort albums and items
--- a/docs/reference/query.rst
+++ b/docs/reference/query.rst
@ -79,6 +79,33 @@ backslashes are not part of beets' syntax; I'm just using the escaping
 functionality of my shell (bash or zsh, for instance) to pass ``the rebel`` as a
 single argument instead of two.

+.. _regex:
+
+Regular Expressions
+-------------------
+
+While ordinary keywords perform simple substring matches, beets also supports
+regular expression matching for more advanced queries. To run a regex query, use
+an additional ``:`` between the field name and the expression::
+
+    $ beet list 'artist::Ann(a|ie)'
+
+That query finds songs by Anna Calvi and Annie but not Annuals. Similarly, this
+query prints the path to any file in my library that's missing a track title::
+
+    $ beet list -p title::^$
+
+To search *all* fields using a regular expression, just prefix the expression
+with a single ``:``, like so::
+
+    $ beet list :Ho[pm]eless
+
+Regular expressions are case-sensitive and build on `Python's built-in
+implementation`_. See Python's documentation for specifics on regex syntax.
+
+.. _Python's built-in implementation: http://docs.python.org/library/re.html
+
+
 Path Queries
 ------------

@ -97,19 +124,3 @@ equivalent::
 Note that this only matches items that are *already in your library*, so a path
 query won't necessarily find *all* the audio files in a directory---just the
 ones you've already added to your beets library.
-
-Future Work
-----------
-
-Here are a few things that the query syntax should eventually support but aren't
-yet implemented. Please drop me a line if you have other ideas.
-
-* "Null" queries. It's currently impossible to query for items that have an
-  empty artist. Perhaps the syntax should look like ``artist:NULL`` or
-  ``artist:EMPTY``.
-
-* Regular expressions. Beets queries are based on simple case-insensitive
-  substring matching, but regexes might be useful occasionally as well. Maybe
-  the syntax should look something like ``re:artist:^.*$`` or, perhaps,
-  ``artist:/^.*$/``. Having regular expressions could help with null queries
-  (above): ``re:artist:^$``.
--- a/test/test_query.py
+++ b/test/test_query.py
@ -72,7 +72,7 @@ class AnySubstringQueryTest(unittest.TestCase):
    def test_restriction_completeness(self):
        q = beets.library.AnySubstringQuery('title', ['title'])
        self.assertEqual(self.lib.items(q).next().title, 'the title')
-        
+
    def test_restriction_soundness(self):
        q = beets.library.AnySubstringQuery('title', ['artist'])
        self.assertRaises(StopIteration, self.lib.items(q).next)
@ -98,7 +98,6 @@ class AnyRegexpQueryTest(unittest.TestCase):
        q = beets.library.AnyRegexpQuery(r'the ti$', ['title'])
        self.assertRaises(StopIteration, self.lib.items(q).next)

-
 # Convenient asserts for matching items.
 class AssertsMixin(object):
    def assert_matched(self, result_iterator, title):
@ -111,7 +110,7 @@ class AssertsMixin(object):
        self.assert_matched(result_iterator, 'Lovers Who Uncover')
        self.assert_matched(result_iterator, 'Boracay')
        self.assert_done(result_iterator)
-    
+
 class GetTest(unittest.TestCase, AssertsMixin):
    def setUp(self):
        self.lib = beets.library.Library(