beets/beets/importer/tasks.py

# This file is part of beets.
# Copyright 2016, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

from __future__ import annotations

import logging
import os
import re
import shutil
import time
from collections import defaultdict
from collections.abc import Callable, Iterable, Sequence
from enum import Enum
from tempfile import mkdtemp
from typing import TYPE_CHECKING, Any

import mediafile

from beets import autotag, config, library, plugins, util
from beets.dbcore.query import PathQuery

from .state import ImportState

if TYPE_CHECKING:
    from beets.autotag.match import Recommendation

    from .session import ImportSession

# Global logger.
log = logging.getLogger("beets")


SINGLE_ARTIST_THRESH = 0.25

# Usually flexible attributes are preserved (i.e., not updated) during
# reimports. The following two lists (globally) change this behaviour for
# certain fields. To alter these lists only when a specific plugin is in use,
# something like this can be used within that plugin's code:
#
# from beets import importer
# def extend_reimport_fresh_fields_item():
#     importer.REIMPORT_FRESH_FIELDS_ITEM.extend(['tidal_track_popularity']
# )
REIMPORT_FRESH_FIELDS_ITEM = [
    "data_source",
    "bandcamp_album_id",
    "spotify_album_id",
    "deezer_album_id",
    "beatport_album_id",
    "tidal_album_id",
    "data_url",
]
REIMPORT_FRESH_FIELDS_ALBUM = [*REIMPORT_FRESH_FIELDS_ITEM, "media"]

# Global logger.
log = logging.getLogger("beets")


class ImportAbortError(Exception):
    """Raised when the user aborts the tagging operation."""

    pass


class Action(Enum):
    """Enumeration of possible actions for an import task."""

    SKIP = "SKIP"
    ASIS = "ASIS"
    TRACKS = "TRACKS"
    APPLY = "APPLY"
    ALBUMS = "ALBUMS"
    RETAG = "RETAG"
    # The RETAG action represents "don't apply any match, but do record
    # new metadata". It's not reachable via the standard command prompt but
    # can be used by plugins.


class BaseImportTask:
    """An abstract base class for importer tasks.

    Tasks flow through the importer pipeline. Each stage can update
    them."""

    toppath: util.PathBytes | None
    paths: list[util.PathBytes]
    items: list[library.Item]

    def __init__(
        self,
        toppath: util.PathBytes | None,
        paths: Iterable[util.PathBytes] | None,
        items: Iterable[library.Item] | None,
    ):
        """Create a task. The primary fields that define a task are:

        * `toppath`: The user-specified base directory that contains the
          music for this task. If the task has *no* user-specified base
          (for example, when importing based on an -L query), this can
          be None. This is used for tracking progress and history.
        * `paths`: A list of *specific* paths where the music for this task
          came from. These paths can be directories, when their entire
          contents are being imported, or files, when the task comprises
          individual tracks. This is used for progress/history tracking and
          for displaying the task to the user.
        * `items`: A list of `Item` objects representing the music being
          imported.

        These fields should not change after initialization.
        """
        self.toppath = toppath
        self.paths = list(paths) if paths is not None else []
        self.items = list(items) if items is not None else []


class ImportTask(BaseImportTask):
    """Represents a single set of items to be imported along with its
    intermediate state. May represent an album or a single item.

    The import session and stages call the following methods in the
    given order.

    * `lookup_candidates()` Sets the `common_artist`, `common_album`,
      `candidates`, and `rec` attributes. `candidates` is a list of
      `AlbumMatch` objects.

    * `choose_match()` Uses the session to set the `match` attribute
      from the `candidates` list.

    * `find_duplicates()` Returns a list of albums from `lib` with the
      same artist and album name as the task.

    * `apply_metadata()` Sets the attributes of the items from the
      task's `match` attribute.

    * `add()` Add the imported items and album to the database.

    * `manipulate_files()` Copy, move, and write files depending on the
      session configuration.

    * `set_fields()` Sets the fields given at CLI or configuration to
      the specified values.

    * `finalize()` Update the import progress and cleanup the file
      system.
    """

    choice_flag: Action | None = None
    match: autotag.AlbumMatch | autotag.TrackMatch | None = None

    # Keep track of the current task item
    cur_album: str | None = None
    cur_artist: str | None = None
    candidates: Sequence[autotag.AlbumMatch | autotag.TrackMatch] = []
    rec: Recommendation | None = None

    def __init__(
        self,
        toppath: util.PathBytes | None,
        paths: Iterable[util.PathBytes] | None,
        items: Iterable[library.Item] | None,
    ):
        super().__init__(toppath, paths, items)
        self.should_remove_duplicates = False
        self.should_merge_duplicates = False
        self.is_album = True

    def set_choice(
        self, choice: Action | autotag.AlbumMatch | autotag.TrackMatch
    ):
        """Given an AlbumMatch or TrackMatch object or an action constant,
        indicates that an action has been selected for this task.

        Album and trackmatch are implemented as tuples, so we can't
        use isinstance to check for them.
        """
        # Not part of the task structure:
        assert choice != Action.APPLY  # Only used internally.

        if choice in (
            Action.SKIP,
            Action.ASIS,
            Action.TRACKS,
            Action.ALBUMS,
            Action.RETAG,
        ):
            # TODO: redesign to stricten the type
            self.choice_flag = choice  # type: ignore[assignment]
            self.match = None
        else:
            self.choice_flag = Action.APPLY  # Implicit choice.
            self.match = choice  # type: ignore[assignment]

    def save_progress(self):
        """Updates the progress state to indicate that this album has
        finished.
        """
        if self.toppath:
            ImportState().progress_add(self.toppath, *self.paths)

    def save_history(self):
        """Save the directory in the history for incremental imports."""
        ImportState().history_add(self.paths)

    # Logical decisions.

    @property
    def apply(self):
        return self.choice_flag == Action.APPLY

    @property
    def skip(self):
        return self.choice_flag == Action.SKIP

    # Convenient data.

    def chosen_info(self):
        """Return a dictionary of metadata about the current choice.
        May only be called when the choice flag is ASIS or RETAG
        (in which case the data comes from the files' current metadata)
        or APPLY (in which case the data comes from the choice).
        """
        if self.choice_flag in (Action.ASIS, Action.RETAG):
            likelies, consensus = util.get_most_common_tags(self.items)
            return likelies
        elif self.choice_flag is Action.APPLY and self.match:
            return self.match.info.copy()
        assert False

    def imported_items(self):
        """Return a list of Items that should be added to the library.

        If the tasks applies an album match the method only returns the
        matched items.
        """
        if self.choice_flag in (Action.ASIS, Action.RETAG):
            return self.items
        elif self.choice_flag == Action.APPLY and isinstance(
            self.match, autotag.AlbumMatch
        ):
            return self.match.items
        else:
            assert False

    def apply_metadata(self):
        """Copy metadata from match info to the items."""
        if config["import"]["from_scratch"]:
            for item in self.match.items:
                item.clear()

        autotag.apply_metadata(self.match.info, self.match.item_info_pairs)

    def duplicate_items(self, lib: library.Library):
        duplicate_items = []
        for album in self.find_duplicates(lib):
            duplicate_items += album.items()
        return duplicate_items

    def remove_duplicates(self, lib: library.Library):
        duplicate_items = self.duplicate_items(lib)
        log.debug("removing {} old duplicated items", len(duplicate_items))
        for item in duplicate_items:
            item.remove()
            if lib.directory in util.ancestry(item.path):
                log.debug("deleting duplicate {.filepath}", item)
                util.remove(item.path)
                util.prune_dirs(os.path.dirname(item.path), lib.directory)

    def set_fields(self, lib: library.Library):
        """Sets the fields given at CLI or configuration to the specified
        values, for both the album and all its items.
        """
        items = self.imported_items()
        for field, view in config["import"]["set_fields"].items():
            value = str(view.get())
            log.debug(
                "Set field {}={} for {}",
                field,
                value,
                util.displayable_path(self.paths),
            )
            self.album.set_parse(field, format(self.album, value))
            for item in items:
                item.set_parse(field, format(item, value))
        with lib.transaction():
            for item in items:
                item.store()
            self.album.store()

    def finalize(self, session: ImportSession):
        """Save progress, clean up files, and emit plugin event."""
        # Update progress.
        if session.want_resume:
            self.save_progress()
        if session.config["incremental"] and not (
            # Should we skip recording to incremental list?
            self.skip and session.config["incremental_skip_later"]
        ):
            self.save_history()

        self.cleanup(
            copy=session.config["copy"],
            delete=session.config["delete"],
            move=session.config["move"],
        )

        if not self.skip:
            self._emit_imported(session.lib)

    def cleanup(self, copy=False, delete=False, move=False):
        """Remove and prune imported paths."""
        # Do not delete any files or prune directories when skipping.
        if self.skip:
            return

        items = self.imported_items()

        # When copying and deleting originals, delete old files.
        if copy and delete:
            new_paths = [os.path.realpath(item.path) for item in items]
            for old_path in self.old_paths:
                # Only delete files that were actually copied.
                if old_path not in new_paths:
                    util.remove(old_path, False)
                    self.prune(old_path)

        # When moving, prune empty directories containing the original files.
        elif move:
            for old_path in self.old_paths:
                self.prune(old_path)

    def _emit_imported(self, lib: library.Library):
        plugins.send("album_imported", lib=lib, album=self.album)

    def handle_created(self, session: ImportSession):
        """Send the `import_task_created` event for this task. Return a list of
        tasks that should continue through the pipeline. By default, this is a
        list containing only the task itself, but plugins can replace the task
        with new ones.
        """
        tasks = plugins.send("import_task_created", session=session, task=self)
        if not tasks:
            tasks = [self]
        else:
            # The plugins gave us a list of lists of tasks. Flatten it.
            tasks = [t for inner in tasks for t in inner]
        return tasks

    def lookup_candidates(self, search_ids: list[str]) -> None:
        """Retrieve and store candidates for this album.

        If User-specified ``search_ids`` list is not empty, the lookup is
        restricted to only those IDs.
        """
        self.cur_artist, self.cur_album, (self.candidates, self.rec) = (
            autotag.tag_album(self.items, search_ids=search_ids)
        )

    def find_duplicates(self, lib: library.Library) -> list[library.Album]:
        """Return a list of albums from `lib` with the same artist and
        album name as the task.
        """
        info = self.chosen_info()
        info["albumartist"] = info["artist"]

        if info["artist"] is None:
            # As-is import with no artist. Skip check.
            return []

        # Construct a query to find duplicates with this metadata. We
        # use a temporary Album object to generate any computed fields.
        tmp_album = library.Album(lib, **info)
        keys: list[str] = config["import"]["duplicate_keys"][
            "album"
        ].as_str_seq()
        dup_query = tmp_album.duplicates_query(keys)

        # Don't count albums with the same files as duplicates.
        task_paths = {i.path for i in self.items if i}

        duplicates = []
        for album in lib.albums(dup_query):
            # Check whether the album paths are all present in the task
            # i.e. album is being completely re-imported by the task,
            # in which case it is not a duplicate (will be replaced).
            album_paths = {i.path for i in album.items()}
            if not (album_paths <= task_paths):
                duplicates.append(album)

        return duplicates

    def align_album_level_fields(self):
        """Make some album fields equal across `self.items`. For the
        RETAG action, we assume that the responsible for returning it
        (ie. a plugin) always ensures that the first item contains
        valid data on the relevant fields.
        """
        changes = {}

        if self.choice_flag == Action.ASIS:
            # Taking metadata "as-is". Guess whether this album is VA.
            plur_albumartist, freq = util.plurality(
                [i.albumartist or i.artist for i in self.items]
            )
            if freq == len(self.items) or (
                freq > 1
                and float(freq) / len(self.items) >= SINGLE_ARTIST_THRESH
            ):
                # Single-artist album.
                changes["albumartist"] = plur_albumartist
                changes["comp"] = False
            else:
                # VA.
                changes["albumartist"] = config["va_name"].as_str()
                changes["comp"] = True

        elif self.choice_flag in (Action.APPLY, Action.RETAG):
            # Applying autotagged metadata. Just get AA from the first
            # item.
            if not self.items[0].albumartist:
                changes["albumartist"] = self.items[0].artist
            if not self.items[0].albumartists:
                changes["albumartists"] = self.items[0].artists
            if not self.items[0].mb_albumartistid:
                changes["mb_albumartistid"] = self.items[0].mb_artistid
            if not self.items[0].mb_albumartistids:
                changes["mb_albumartistids"] = self.items[0].mb_artistids

        # Apply new metadata.
        for item in self.items:
            item.update(changes)

    def manipulate_files(
        self,
        session: ImportSession,
        operation: util.MoveOperation | None = None,
        write=False,
    ):
        """Copy, move, link, hardlink or reflink (depending on `operation`)
        the files as well as write metadata.

        `operation` should be an instance of `util.MoveOperation`.

        If `write` is `True` metadata is written to the files.
        # TODO: Introduce a MoveOperation.NONE or SKIP
        """

        items = self.imported_items()
        # Save the original paths of all items for deletion and pruning
        # in the next step (finalization).
        self.old_paths: list[util.PathBytes] = [item.path for item in items]
        for item in items:
            if operation is not None:
                # In copy and link modes, treat re-imports specially:
                # move in-library files. (Out-of-library files are
                # copied/moved as usual).
                old_path = item.path
                if (
                    operation != util.MoveOperation.MOVE
                    and self.replaced_items[item]
                    and session.lib.directory in util.ancestry(old_path)
                ):
                    item.move()
                    # We moved the item, so remove the
                    # now-nonexistent file from old_paths.
                    self.old_paths.remove(old_path)
                else:
                    # A normal import. Just copy files and keep track of
                    # old paths.
                    item.move(operation)

            if write and (self.apply or self.choice_flag == Action.RETAG):
                item.try_write()

        with session.lib.transaction():
            for item in self.imported_items():
                item.store()

        plugins.send("import_task_files", session=session, task=self)

    def add(self, lib: library.Library):
        """Add the items as an album to the library and remove replaced items."""
        self.align_album_level_fields()
        with lib.transaction():
            self.record_replaced(lib)
            self.remove_replaced(lib)

            self.album = lib.add_album(self.imported_items())
            if self.choice_flag == Action.APPLY and isinstance(
                self.match, autotag.AlbumMatch
            ):
                # Copy album flexible fields to the DB
                # TODO: change the flow so we create the `Album` object earlier,
                #   and we can move this into `self.apply_metadata`, just like
                #   is done for tracks.
                autotag.apply_album_metadata(self.match.info, self.album)
                self.album.store()

            self.reimport_metadata(lib)

    def record_replaced(self, lib: library.Library):
        """Records the replaced items and albums in the `replaced_items`
        and `replaced_albums` dictionaries.
        """
        self.replaced_items = defaultdict(list)
        self.replaced_albums: dict[util.PathBytes, library.Album] = (
            defaultdict()
        )
        replaced_album_ids = set()
        for item in self.imported_items():
            dup_items = list(lib.items(query=PathQuery("path", item.path)))
            self.replaced_items[item] = dup_items
            for dup_item in dup_items:
                if (
                    not dup_item.album_id
                    or dup_item.album_id in replaced_album_ids
                ):
                    continue
                replaced_album = dup_item._cached_album
                if replaced_album:
                    replaced_album_ids.add(dup_item.album_id)
                    self.replaced_albums[replaced_album.path] = replaced_album

    def reimport_metadata(self, lib: library.Library):
        """For reimports, preserves metadata for reimported items and
        albums.
        """

        def _reduce_and_log(new_obj, existing_fields, overwrite_keys):
            """Some flexible attributes should be overwritten (rather than
            preserved) on reimports; Copies existing_fields, logs and removes
            entries that should not be preserved and returns a dict containing
            those fields left to actually be preserved.
            """
            noun = "album" if isinstance(new_obj, library.Album) else "item"
            existing_fields = dict(existing_fields)
            overwritten_fields = [
                k
                for k in existing_fields
                if k in overwrite_keys
                and new_obj.get(k)
                and existing_fields.get(k) != new_obj.get(k)
            ]
            if overwritten_fields:
                log.debug(
                    "Reimported {0} {1.id}. Not preserving flexible attributes {2}. "
                    "Path: {1.filepath}",
                    noun,
                    new_obj,
                    overwritten_fields,
                )
                for key in overwritten_fields:
                    del existing_fields[key]
            return existing_fields

        if self.is_album:
            replaced_album = self.replaced_albums.get(self.album.path)
            if replaced_album:
                album_fields = _reduce_and_log(
                    self.album,
                    replaced_album._values_flex,
                    REIMPORT_FRESH_FIELDS_ALBUM,
                )
                self.album.added = replaced_album.added
                self.album.update(album_fields)
                self.album.artpath = replaced_album.artpath
                self.album.store()
                log.debug(
                    "Reimported album {0.album.id}. Preserving attribute ['added']. "
                    "Path: {0.album.filepath}",
                    self,
                )
                log.debug(
                    "Reimported album {0.album.id}. Preserving flexible"
                    " attributes {1}. Path: {0.album.filepath}",
                    self,
                    list(album_fields.keys()),
                )

        for item in self.imported_items():
            dup_items = self.replaced_items[item]
            for dup_item in dup_items:
                if dup_item.added and dup_item.added != item.added:
                    item.added = dup_item.added
                    log.debug(
                        "Reimported item {0.id}. Preserving attribute ['added']. "
                        "Path: {0.filepath}",
                        item,
                    )
                item_fields = _reduce_and_log(
                    item, dup_item._values_flex, REIMPORT_FRESH_FIELDS_ITEM
                )
                item.update(item_fields)
                log.debug(
                    "Reimported item {0.id}. Preserving flexible attributes {1}. "
                    "Path: {0.filepath}",
                    item,
                    list(item_fields.keys()),
                )
                item.store()

    def remove_replaced(self, lib):
        """Removes all the items from the library that have the same
        path as an item from this task.
        """
        for item in self.imported_items():
            for dup_item in self.replaced_items[item]:
                log.debug("Replacing item {.id}: {.filepath}", dup_item, item)
                dup_item.remove()
        log.debug(
            "{} of {} items replaced",
            sum(bool(v) for v in self.replaced_items.values()),
            len(self.imported_items()),
        )

    def choose_match(self, session):
        """Ask the session which match should apply and apply it."""
        choice = session.choose_match(self)
        self.set_choice(choice)
        session.log_choice(self)

    def reload(self):
        """Reload albums and items from the database."""
        for item in self.imported_items():
            item.load()
        self.album.load()

    # Utilities.

    def prune(self, filename):
        """Prune any empty directories above the given file. If this
        task has no `toppath` or the file path provided is not within
        the `toppath`, then this function has no effect. Similarly, if
        the file still exists, no pruning is performed, so it's safe to
        call when the file in question may not have been removed.
        """
        if self.toppath and not os.path.exists(util.syspath(filename)):
            util.prune_dirs(
                os.path.dirname(filename),
                self.toppath,
                clutter=config["clutter"].as_str_seq(),
            )


class SingletonImportTask(ImportTask):
    """ImportTask for a single track that is not associated to an album."""

    def __init__(self, toppath: util.PathBytes | None, item: library.Item):
        super().__init__(toppath, [item.path], [item])
        self.item = item
        self.is_album = False
        self.paths = [item.path]

    def chosen_info(self):
        """Return a dictionary of metadata about the current choice.
        May only be called when the choice flag is ASIS or RETAG
        (in which case the data comes from the files' current metadata)
        or APPLY (in which case the data comes from the choice).
        """
        assert self.choice_flag in (Action.ASIS, Action.RETAG, Action.APPLY)
        if self.choice_flag in (Action.ASIS, Action.RETAG):
            return dict(self.item)
        elif self.choice_flag is Action.APPLY:
            return self.match.info.copy()

    def imported_items(self):
        return [self.item]

    def apply_metadata(self):
        autotag.apply_item_metadata(self.item, self.match.info)

    def _emit_imported(self, lib):
        for item in self.imported_items():
            plugins.send("item_imported", lib=lib, item=item)

    def lookup_candidates(self, search_ids: list[str]) -> None:
        self.candidates, self.rec = autotag.tag_item(
            self.item, search_ids=search_ids
        )

    def find_duplicates(self, lib: library.Library) -> list[library.Item]:  # type: ignore[override] # Need splitting Singleton and Album tasks into separate classes
        """Return a list of items from `lib` that have the same artist
        and title as the task.
        """
        info = self.chosen_info()

        # Query for existing items using the same metadata. We use a
        # temporary `Item` object to generate any computed fields.
        tmp_item = library.Item(lib, **info)
        keys: list[str] = config["import"]["duplicate_keys"][
            "item"
        ].as_str_seq()
        dup_query = tmp_item.duplicates_query(keys)

        found_items = []
        for other_item in lib.items(dup_query):
            # Existing items not considered duplicates.
            if other_item.path != self.item.path:
                found_items.append(other_item)
        return found_items

    duplicate_items = find_duplicates

    def add(self, lib):
        with lib.transaction():
            self.record_replaced(lib)
            self.remove_replaced(lib)
            lib.add(self.item)
            self.reimport_metadata(lib)

    def infer_album_fields(self):
        raise NotImplementedError

    def choose_match(self, session: ImportSession):
        """Ask the session which match should apply and apply it."""
        choice = session.choose_item(self)
        self.set_choice(choice)
        session.log_choice(self)

    def reload(self):
        self.item.load()

    def set_fields(self, lib):
        """Sets the fields given at CLI or configuration to the specified
        values, for the singleton item.
        """
        for field, view in config["import"]["set_fields"].items():
            value = str(view.get())
            log.debug(
                "Set field {}={} for {}",
                field,
                value,
                util.displayable_path(self.paths),
            )
            self.item.set_parse(field, format(self.item, value))
        self.item.store()


# FIXME The inheritance relationships are inverted. This is why there
# are so many methods which pass. More responsibility should be delegated to
# the BaseImportTask class.
class SentinelImportTask(ImportTask):
    """A sentinel task marks the progress of an import and does not
    import any items itself.

    If only `toppath` is set the task indicates the end of a top-level
    directory import. If the `paths` argument is also given, the task
    indicates the progress in the `toppath` import.
    """

    def __init__(self, toppath, paths):
        super().__init__(toppath, paths, ())
        # TODO Remove the remaining attributes eventually
        self.should_remove_duplicates = False
        self.is_album = True
        self.choice_flag = None

    def save_history(self):
        pass

    def save_progress(self):
        if not self.paths:
            # "Done" sentinel.
            ImportState().progress_reset(self.toppath)
        elif self.toppath:
            # "Directory progress" sentinel for singletons
            super().save_progress()

    @property
    def skip(self) -> bool:
        return True

    def set_choice(self, choice):
        raise NotImplementedError

    def cleanup(self, copy=False, delete=False, move=False):
        pass

    def _emit_imported(self, lib):
        pass


ArchiveHandler = tuple[
    Callable[[util.StrPath], bool], Callable[[util.StrPath], Any]
]


class ArchiveImportTask(SentinelImportTask):
    """An import task that represents the processing of an archive.

    `toppath` must be a `zip`, `tar`, or `rar` archive. Archive tasks
    serve two purposes:
    - First, it will unarchive the files to a temporary directory and
      return it. The client should read tasks from the resulting
      directory and send them through the pipeline.
    - Second, it will clean up the temporary directory when it proceeds
      through the pipeline. The client should send the archive task
      after sending the rest of the music tasks to make this work.
    """

    def __init__(self, toppath):
        super().__init__(toppath, ())
        self.extracted = False

    @classmethod
    def is_archive(cls, path):
        """Returns true if the given path points to an archive that can
        be handled.
        """
        if not os.path.isfile(path):
            return False

        for path_test, _ in cls.handlers:
            if path_test(os.fsdecode(path)):
                return True
        return False

    @util.cached_classproperty
    def handlers(cls) -> list[ArchiveHandler]:
        """Returns a list of archive handlers.

        Each handler is a `(path_test, ArchiveClass)` tuple. `path_test`
        is a function that returns `True` if the given path can be
        handled by `ArchiveClass`. `ArchiveClass` is a class that
        implements the same interface as `tarfile.TarFile`.
        """
        _handlers: list[ArchiveHandler] = []
        from zipfile import ZipFile, is_zipfile

        _handlers.append((is_zipfile, ZipFile))
        import tarfile

        _handlers.append((tarfile.is_tarfile, tarfile.open))
        try:
            from rarfile import RarFile, is_rarfile
        except ImportError:
            pass
        else:
            _handlers.append((is_rarfile, RarFile))
        try:
            from py7zr import SevenZipFile, is_7zfile
        except ImportError:
            pass
        else:
            _handlers.append((is_7zfile, SevenZipFile))

        return _handlers

    def cleanup(self, copy=False, delete=False, move=False):
        """Removes the temporary directory the archive was extracted to."""
        if self.extracted and self.toppath:
            log.debug(
                "Removing extracted directory: {}",
                util.displayable_path(self.toppath),
            )
            shutil.rmtree(util.syspath(self.toppath))

    def extract(self):
        """Extracts the archive to a temporary directory and sets
        `toppath` to that directory.
        """
        assert self.toppath is not None, "toppath must be set"

        for path_test, handler_class in self.handlers:
            if path_test(os.fsdecode(self.toppath)):
                break
        else:
            raise ValueError(f"No handler found for archive: {self.toppath}")
        extract_to = mkdtemp()
        archive = handler_class(os.fsdecode(self.toppath), mode="r")
        try:
            archive.extractall(extract_to)

            # Adjust the files' mtimes to match the information from the
            # archive. Inspired by: https://stackoverflow.com/q/9813243
            for f in archive.infolist():
                # The date_time will need to adjusted otherwise
                # the item will have the current date_time of extraction.
                # The (0, 0, -1) is added to date_time because the
                # function time.mktime expects a 9-element tuple.
                # The -1 indicates that the DST flag is unknown.
                date_time = time.mktime(f.date_time + (0, 0, -1))
                fullpath = os.path.join(extract_to, f.filename)
                os.utime(fullpath, (date_time, date_time))

        finally:
            archive.close()
        self.extracted = True
        self.toppath = extract_to


class ImportTaskFactory:
    """Generate album and singleton import tasks for all media files
    indicated by a path.
    """

    def __init__(self, toppath: util.PathBytes, session: ImportSession):
        """Create a new task factory.

        `toppath` is the user-specified path to search for music to
        import. `session` is the `ImportSession`, which controls how
        tasks are read from the directory.
        """
        self.toppath = toppath
        self.session = session
        self.skipped = 0  # Skipped due to incremental/resume.
        self.imported = 0  # "Real" tasks created.
        self.is_archive = ArchiveImportTask.is_archive(util.syspath(toppath))

    def tasks(self) -> Iterable[ImportTask]:
        """Yield all import tasks for music found in the user-specified
        path `self.toppath`. Any necessary sentinel tasks are also
        produced.

        During generation, update `self.skipped` and `self.imported`
        with the number of tasks that were not produced (due to
        incremental mode or resumed imports) and the number of concrete
        tasks actually produced, respectively.

        If `self.toppath` is an archive, it is adjusted to point to the
        extracted data.
        """
        # Check whether this is an archive.
        archive_task: ArchiveImportTask | None = None
        if self.is_archive:
            archive_task = self.unarchive()
            if not archive_task:
                return

        # Search for music in the directory.
        for dirs, paths in self.paths():
            if self.session.config["singletons"]:
                for path in paths:
                    tasks = self._create(self.singleton(path))
                    yield from tasks
                yield self.sentinel(dirs)

            else:
                tasks = self._create(self.album(paths, dirs))
                yield from tasks

        # Produce the final sentinel for this toppath to indicate that
        # it is finished. This is usually just a SentinelImportTask, but
        # for archive imports, send the archive task instead (to remove
        # the extracted directory).
        yield archive_task or self.sentinel()

    def _create(self, task: ImportTask | None):
        """Handle a new task to be emitted by the factory.

        Emit the `import_task_created` event and increment the
        `imported` count if the task is not skipped. Return the same
        task. If `task` is None, do nothing.
        """
        if task:
            tasks = task.handle_created(self.session)
            self.imported += len(tasks)
            return tasks
        return []

    def paths(self):
        """Walk `self.toppath` and yield `(dirs, files)` pairs where
        `files` are individual music files and `dirs` the set of
        containing directories where the music was found.

        This can either be a recursive search in the ordinary case, a
        single track when `toppath` is a file, a single directory in
        `flat` mode.
        """
        if not os.path.isdir(util.syspath(self.toppath)):
            yield [self.toppath], [self.toppath]
        elif self.session.config["flat"]:
            paths = []
            for dirs, paths_in_dir in albums_in_dir(self.toppath):
                paths += paths_in_dir
            yield [self.toppath], paths
        else:
            for dirs, paths in albums_in_dir(self.toppath):
                yield dirs, paths

    def singleton(self, path: util.PathBytes):
        """Return a `SingletonImportTask` for the music file."""
        if self.session.already_imported(self.toppath, [path]):
            log.debug(
                "Skipping previously-imported path: {}",
                util.displayable_path(path),
            )
            self.skipped += 1
            return None

        item = self.read_item(path)
        if item:
            return SingletonImportTask(self.toppath, item)
        else:
            return None

    def album(self, paths: Iterable[util.PathBytes], dirs=None):
        """Return a `ImportTask` with all media files from paths.

        `dirs` is a list of parent directories used to record already
        imported albums.
        """

        if dirs is None:
            dirs = list({os.path.dirname(p) for p in paths})

        if self.session.already_imported(self.toppath, dirs):
            log.debug(
                "Skipping previously-imported path: {}",
                util.displayable_path(dirs),
            )
            self.skipped += 1
            return None

        items: list[library.Item] = [
            item for item in map(self.read_item, paths) if item
        ]

        if len(items) > 0:
            return ImportTask(self.toppath, dirs, items)
        else:
            return None

    def sentinel(self, paths: Iterable[util.PathBytes] | None = None):
        """Return a `SentinelImportTask` indicating the end of a
        top-level directory import.
        """
        return SentinelImportTask(self.toppath, paths)

    def unarchive(self):
        """Extract the archive for this `toppath`.

        Extract the archive to a new directory, adjust `toppath` to
        point to the extracted directory, and return an
        `ArchiveImportTask`. If extraction fails, return None.
        """
        assert self.is_archive

        if not (self.session.config["move"] or self.session.config["copy"]):
            log.warning(
                "Archive importing requires either "
                "'copy' or 'move' to be enabled."
            )
            return

        log.debug("Extracting archive: {}", util.displayable_path(self.toppath))
        archive_task = ArchiveImportTask(self.toppath)
        try:
            archive_task.extract()
        except Exception as exc:
            log.error("extraction failed: {}", exc)
            return

        # Now read albums from the extracted directory.
        self.toppath = archive_task.toppath
        log.debug("Archive extracted to: {.toppath}", self)
        return archive_task

    def read_item(self, path: util.PathBytes):
        """Return an `Item` read from the path.

        If an item cannot be read, return `None` instead and log an
        error.
        """
        try:
            return library.Item.from_path(path)
        except library.ReadError as exc:
            if isinstance(exc.reason, mediafile.FileTypeError):
                # Silently ignore non-music files.
                pass
            elif isinstance(exc.reason, mediafile.UnreadableFileError):
                log.warning("unreadable file: {}", util.displayable_path(path))
            else:
                log.error(
                    "error reading {}: {}", util.displayable_path(path), exc
                )


MULTIDISC_MARKERS = (rb"dis[ck]", rb"cd")
MULTIDISC_PAT_FMT = rb"^(.*%s[\W_]*)\d"


def is_subdir_of_any_in_list(path, dirs):
    """Returns True if path os a subdirectory of any directory in dirs
    (a list). In other case, returns False.
    """
    ancestors = util.ancestry(path)
    return any(d in ancestors for d in dirs)


def albums_in_dir(path: util.PathBytes):
    """Recursively searches the given directory and returns an iterable
    of (paths, items) where paths is a list of directories and items is
    a list of Items that is probably an album. Specifically, any folder
    containing any media files is an album.
    """
    collapse_paths: list[util.PathBytes] = []
    collapse_items: list[util.PathBytes] = []
    collapse_pat = None

    ignore: list[str] = config["ignore"].as_str_seq()
    ignore_hidden: bool = config["ignore_hidden"].get(bool)

    for root, dirs, files in util.sorted_walk(
        path, ignore=ignore, ignore_hidden=ignore_hidden, logger=log
    ):
        items = [os.path.join(root, f) for f in files]
        # If we're currently collapsing the constituent directories in a
        # multi-disc album, check whether we should continue collapsing
        # and add the current directory. If so, just add the directory
        # and move on to the next directory. If not, stop collapsing.
        if collapse_paths:
            if (is_subdir_of_any_in_list(root, collapse_paths)) or (
                collapse_pat and collapse_pat.match(os.path.basename(root))
            ):
                # Still collapsing.
                collapse_paths.append(root)
                collapse_items += items
                continue
            else:
                # Collapse finished. Yield the collapsed directory and
                # proceed to process the current one.
                if collapse_items:
                    yield collapse_paths, collapse_items
                collapse_pat, collapse_paths, collapse_items = None, [], []

        # Check whether this directory looks like the *first* directory
        # in a multi-disc sequence. There are two indicators: the file
        # is named like part of a multi-disc sequence (e.g., "Title Disc
        # 1") or it contains no items but only directories that are
        # named in this way.
        start_collapsing = False
        for marker in MULTIDISC_MARKERS:
            # We're using replace on %s due to lack of .format() on bytestrings
            p = MULTIDISC_PAT_FMT.replace(b"%s", marker)
            marker_pat = re.compile(p, re.I)
            match = marker_pat.match(os.path.basename(root))

            # Is this directory the root of a nested multi-disc album?
            if dirs and not items:
                # Check whether all subdirectories have the same prefix.
                start_collapsing = True
                subdir_pat = None
                for subdir in dirs:
                    subdir = util.bytestring_path(subdir)
                    # The first directory dictates the pattern for
                    # the remaining directories.
                    if not subdir_pat:
                        match = marker_pat.match(subdir)
                        if match:
                            match_group = re.escape(match.group(1))
                            subdir_pat = re.compile(
                                b"".join([b"^", match_group, rb"\d"]), re.I
                            )
                        else:
                            start_collapsing = False
                            break

                    # Subsequent directories must match the pattern.
                    elif not subdir_pat.match(subdir):
                        start_collapsing = False
                        break

                # If all subdirectories match, don't check other
                # markers.
                if start_collapsing:
                    break

            # Is this directory the first in a flattened multi-disc album?
            elif match:
                start_collapsing = True
                # Set the current pattern to match directories with the same
                # prefix as this one, followed by a digit.
                collapse_pat = re.compile(
                    b"".join([b"^", re.escape(match.group(1)), rb"\d"]), re.I
                )
                break

        # If either of the above heuristics indicated that this is the
        # beginning of a multi-disc album, initialize the collapsed
        # directory and item lists and check the next directory.
        if start_collapsing:
            # Start collapsing; continue to the next iteration.
            collapse_paths = [root]
            collapse_items = items
            continue

        # If it's nonempty, yield it.
        if items:
            yield [root], items

    # Clear out any unfinished collapse.
    if collapse_paths and collapse_items:
        yield collapse_paths, collapse_items