stash/pkg/match/path_semantic_test.go
abdusalam.dihan cd64433dc5 Speed up file-based auto-tag
Replaces the per-file SQL QueryForAutoTag prefilter with an in-memory
2-rune prefix index over performers/studios/tags, preloaded once at job
start. Also:

  - runs file processing through job.TaskQueue so scenes/images/
    galleries tag in parallel instead of one file at a time
  - keyset-paginates the query loop so batch N+1 doesn't pay the
    O(offset) scan past large tables
  - bulk-loads studio/tag aliases via a new optional AllAliasLoader
    interface, avoiding N+1 GetAliases calls during preload
  - caches compiled name regexps (same candidate names repeat across
    thousands of files)
  - hoists strings.ToLower(path) and allASCII(path) out of the per-
    candidate match loop
  - opens a fresh write txn per applied match instead of holding one
    for every tagger phase

Tagger gains *AtPath methods that own the cache + txn manager, letting
the task code stay slim.
2026-04-19 22:22:37 +01:00

426 lines
13 KiB
Go

package match
import (
"context"
"slices"
"testing"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/models/mocks"
"github.com/stretchr/testify/mock"
)
// Path-matching semantic tests that lock in the behavior of
// PathTo{Performers,Studio,Tags} via the generated testify mocks in
// pkg/models/mocks. These are the regression guard when the candidate-
// lookup strategy changes (e.g., replacing the SQL prefilter with an
// in-memory matcher): each case runs against both cache=nil and a
// preloaded cache, asserting identical output.
// --- mock setup helpers ---
// preloadFilter matches the filter PreloadX passes: IgnoreAutoTag=false.
// singleLetterFilter matches the filter the single-letter-cache path
// passes: a regex in Name. Keeping them disjoint means testify will
// route each Query call to the right stub regardless of declaration
// order.
// performerPreloadFilter returns a testify argument matcher for the
// performer filter used by the preload query: IgnoreAutoTag is explicitly
// false and no Name criterion is present.
//
// Requiring Name to be nil keeps this matcher mutually exclusive with
// performerSingleLetterFilter (which requires Name to be set). Without it, a
// filter carrying both fields would satisfy both matchers and be served by
// whichever Query stub was registered first.
func performerPreloadFilter() interface{} {
	return mock.MatchedBy(func(f *models.PerformerFilterType) bool {
		if f == nil || f.Name != nil {
			return false
		}
		return f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
	})
}
func performerSingleLetterFilter() interface{} {
return mock.MatchedBy(func(f *models.PerformerFilterType) bool {
return f != nil && f.Name != nil
})
}
// studioPreloadFilter returns a testify argument matcher for the studio
// filter used by the preload query: IgnoreAutoTag is explicitly false and no
// Name criterion is present.
//
// Requiring Name to be nil keeps this matcher mutually exclusive with
// studioSingleLetterFilter (which requires Name to be set). Without it, a
// filter carrying both fields would satisfy both matchers and be served by
// whichever Query stub was registered first.
func studioPreloadFilter() interface{} {
	return mock.MatchedBy(func(f *models.StudioFilterType) bool {
		if f == nil || f.Name != nil {
			return false
		}
		return f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
	})
}
func studioSingleLetterFilter() interface{} {
return mock.MatchedBy(func(f *models.StudioFilterType) bool {
return f != nil && f.Name != nil
})
}
// tagPreloadFilter returns a testify argument matcher for the tag filter
// used by the preload query: IgnoreAutoTag is explicitly false and no Name
// criterion is present.
//
// Requiring Name to be nil keeps this matcher mutually exclusive with
// tagSingleLetterFilter (which requires Name to be set). Without it, a
// filter carrying both fields would satisfy both matchers and be served by
// whichever Query stub was registered first.
func tagPreloadFilter() interface{} {
	return mock.MatchedBy(func(f *models.TagFilterType) bool {
		if f == nil || f.Name != nil {
			return false
		}
		return f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
	})
}
func tagSingleLetterFilter() interface{} {
return mock.MatchedBy(func(f *models.TagFilterType) bool {
return f != nil && f.Name != nil
})
}
// primePerformerMock registers optional expectations on m so the same mock
// can back both lookup strategies exercised by the tests:
//
//   - QueryForAutoTag returns every performer whose IgnoreAutoTag is false;
//   - Query with the preload filter returns that same set plus its count;
//   - Query with the single-letter filter returns no rows.
//
// Every expectation is marked Maybe() because which of them fire depends on
// whether the test passes a cache.
func primePerformerMock(m *mocks.PerformerReaderWriter, performers []*models.Performer) {
	var eligible []*models.Performer
	for _, perf := range performers {
		if perf.IgnoreAutoTag {
			continue
		}
		eligible = append(eligible, perf)
	}

	total := len(eligible)
	m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(eligible, nil).Maybe()
	m.On("Query", mock.Anything, performerPreloadFilter(), mock.Anything).Return(eligible, total, nil).Maybe()
	m.On("Query", mock.Anything, performerSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()
}
// primeStudioMock registers optional expectations on m for the studio
// lookups used by the tests:
//
//   - QueryForAutoTag returns every studio whose IgnoreAutoTag is false;
//   - Query with the preload filter returns that same set plus its count;
//   - Query with the single-letter filter returns no rows;
//   - GetAliases for each supplied studio ID (ignored ones included)
//     returns aliases[ID], which is nil when the map has no entry.
//
// Every expectation is marked Maybe(), so unused stubs do not fail a test.
func primeStudioMock(m *mocks.StudioReaderWriter, studios []*models.Studio, aliases map[int][]string) {
	var eligible []*models.Studio
	for _, studio := range studios {
		if studio.IgnoreAutoTag {
			continue
		}
		eligible = append(eligible, studio)
	}

	total := len(eligible)
	m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(eligible, nil).Maybe()
	m.On("Query", mock.Anything, studioPreloadFilter(), mock.Anything).Return(eligible, total, nil).Maybe()
	m.On("Query", mock.Anything, studioSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()

	for _, studio := range studios {
		id := studio.ID
		m.On("GetAliases", mock.Anything, id).Return(aliases[id], nil).Maybe()
	}
}
// primeTagMock registers optional expectations on m for the tag lookups
// used by the tests:
//
//   - QueryForAutoTag returns every tag whose IgnoreAutoTag is false;
//   - Query with the preload filter returns that same set plus its count;
//   - Query with the single-letter filter returns no rows;
//   - GetAliases for each supplied tag ID (ignored ones included) returns
//     aliases[ID], which is nil when the map has no entry.
//
// Every expectation is marked Maybe(), so unused stubs do not fail a test.
func primeTagMock(m *mocks.TagReaderWriter, tags []*models.Tag, aliases map[int][]string) {
	var eligible []*models.Tag
	for _, tag := range tags {
		if tag.IgnoreAutoTag {
			continue
		}
		eligible = append(eligible, tag)
	}

	total := len(eligible)
	m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(eligible, nil).Maybe()
	m.On("Query", mock.Anything, tagPreloadFilter(), mock.Anything).Return(eligible, total, nil).Maybe()
	m.On("Query", mock.Anything, tagSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()

	for _, tag := range tags {
		id := tag.ID
		m.On("GetAliases", mock.Anything, id).Return(aliases[id], nil).Maybe()
	}
}
// --- helpers ---
// perfIDs returns the IDs of ps in ascending order, so results can be
// compared without depending on the order in which matches were returned.
// The result is always a non-nil slice, even when ps is empty.
func perfIDs(ps []*models.Performer) []int {
	out := make([]int, len(ps))
	for i := range ps {
		out[i] = ps[i].ID
	}
	slices.Sort(out)
	return out
}
// tagIDs returns the IDs of ts in ascending order, so results can be
// compared without depending on the order in which matches were returned.
// The result is always a non-nil slice, even when ts is empty.
func tagIDs(ts []*models.Tag) []int {
	out := make([]int, len(ts))
	for i := range ts {
		out[i] = ts[i].ID
	}
	slices.Sort(out)
	return out
}
// --- tests ---
// TestPathToPerformers_Semantics pins the matching behaviour of
// PathToPerformers. Every case runs twice: once with a nil cache and once
// with a cache populated through PreloadPerformers, and both runs must
// produce the same sorted set of performer IDs.
func TestPathToPerformers_Semantics(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	fixtures := []*models.Performer{
		{ID: 1, Name: "alice smith"},
		{ID: 2, Name: "bob jones"},
		{ID: 3, Name: "伏字"},
		{ID: 4, Name: "ignored person", IgnoreAutoTag: true},
		// "ali" is a substring of "alice" and must NOT match "alice smith.jpg".
		{ID: 5, Name: "ali"},
	}
	db := mocks.NewDatabase()
	primePerformerMock(db.Performer, fixtures)

	type testCase struct {
		name    string
		path    string
		wantIDs []int
	}
	cases := []testCase{
		{name: "plain name match", path: "/media/alice smith.jpg", wantIDs: []int{1}},
		{name: "separator variants", path: "/media/alice.smith.jpg", wantIDs: []int{1}},
		{name: "separator variants 2", path: "/media/alice_smith.jpg", wantIDs: []int{1}},
		{name: "multiple matches", path: "/media/alice smith and bob jones.jpg", wantIDs: []int{1, 2}},
		{name: "case insensitive", path: "/media/ALICE SMITH.jpg", wantIDs: []int{1}},
		{name: "unicode", path: "/media/伏字.jpg", wantIDs: []int{3}},
		{name: "ignore_auto_tag skipped", path: "/media/ignored person.jpg", wantIDs: nil},
		{name: "no substring match", path: "/media/alicent.jpg", wantIDs: nil},
		// Performer "ali" (ID 5) must not be reported here.
		{name: "short name does NOT match inside longer", path: "/media/alice smith.jpg", wantIDs: []int{1}},
		{name: "short name matches exact", path: "/media/ali.jpg", wantIDs: []int{5}},
		{name: "no match", path: "/media/nobody here.jpg", wantIDs: nil},
	}

	// verify runs one lookup with the given cache (nil means no cache) and
	// compares the sorted result IDs against want.
	verify := func(t *testing.T, path string, want []int, cache *Cache) {
		matched, err := PathToPerformers(ctx, path, db.Performer, cache, false)
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		gotIDs := perfIDs(matched)
		if !slices.Equal(gotIDs, want) {
			t.Errorf("got %v, want %v", gotIDs, want)
		}
	}

	for _, tc := range cases {
		t.Run(tc.name+"/no-preload", func(t *testing.T) {
			verify(t, tc.path, tc.wantIDs, nil)
		})
		t.Run(tc.name+"/preloaded", func(t *testing.T) {
			cache := &Cache{}
			if err := cache.PreloadPerformers(ctx, db.Performer); err != nil {
				t.Fatalf("preload: %v", err)
			}
			verify(t, tc.path, tc.wantIDs, cache)
		})
	}
}
// TestPathToStudio_Semantics pins the matching behaviour of PathToStudio.
// Every case runs twice: once with a nil cache and once with a cache
// populated through PreloadStudios. A wantID of 0 means "no studio matched".
func TestPathToStudio_Semantics(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	fixtures := []*models.Studio{
		{ID: 1, Name: "first studio"},
		{ID: 2, Name: "second"},
		{ID: 3, Name: "third", IgnoreAutoTag: true},
	}
	studioAliases := map[int][]string{2: {"second alias"}}
	db := mocks.NewDatabase()
	primeStudioMock(db.Studio, fixtures, studioAliases)

	type testCase struct {
		name   string
		path   string
		wantID int // 0 == no match
	}
	cases := []testCase{
		{name: "primary name", path: "/first studio/scene.mp4", wantID: 1},
		{name: "alias matches", path: "/second alias/scene.mp4", wantID: 2},
		{name: "ignore_auto_tag studio skipped", path: "/third/scene.mp4", wantID: 0},
		{name: "multiple matches - rightmost wins", path: "/first studio/second/scene.mp4", wantID: 2},
		{name: "no match", path: "/unknown/scene.mp4", wantID: 0},
	}

	// verify runs one lookup with the given cache (nil means no cache) and
	// compares the matched studio's ID, or 0 when nothing matched, to want.
	verify := func(t *testing.T, path string, want int, cache *Cache) {
		studio, err := PathToStudio(ctx, path, db.Studio, cache, false)
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		gotID := 0
		if studio != nil {
			gotID = studio.ID
		}
		if gotID != want {
			t.Errorf("got %d, want %d", gotID, want)
		}
	}

	for _, tc := range cases {
		t.Run(tc.name+"/no-preload", func(t *testing.T) {
			verify(t, tc.path, tc.wantID, nil)
		})
		t.Run(tc.name+"/preloaded", func(t *testing.T) {
			cache := &Cache{}
			if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
				t.Fatalf("preload: %v", err)
			}
			verify(t, tc.path, tc.wantID, cache)
		})
	}
}
// TestPathToTags_Semantics pins the matching behaviour of PathToTags. Every
// case runs twice: once with a nil cache and once with a cache populated
// through PreloadTags, and both runs must produce the same sorted tag IDs.
func TestPathToTags_Semantics(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	fixtures := []*models.Tag{
		{ID: 1, Name: "anime"},
		{ID: 2, Name: "docs"},
		{ID: 3, Name: "skip me", IgnoreAutoTag: true},
	}
	tagAliases := map[int][]string{2: {"documentary"}}
	db := mocks.NewDatabase()
	primeTagMock(db.Tag, fixtures, tagAliases)

	type testCase struct {
		name    string
		path    string
		wantIDs []int
	}
	cases := []testCase{
		{name: "name match", path: "/media/anime/x.mp4", wantIDs: []int{1}},
		{name: "alias match", path: "/media/documentary/x.mp4", wantIDs: []int{2}},
		{name: "multiple matches", path: "/media/anime-documentary/x.mp4", wantIDs: []int{1, 2}},
		{name: "ignore_auto_tag skipped", path: "/media/skip me/x.mp4", wantIDs: nil},
		{name: "no match", path: "/media/comedy/x.mp4", wantIDs: nil},
	}

	// verify runs one lookup with the given cache (nil means no cache) and
	// compares the sorted result IDs against want.
	verify := func(t *testing.T, path string, want []int, cache *Cache) {
		matched, err := PathToTags(ctx, path, db.Tag, cache, false)
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		gotIDs := tagIDs(matched)
		if !slices.Equal(gotIDs, want) {
			t.Errorf("got %v, want %v", gotIDs, want)
		}
	}

	for _, tc := range cases {
		t.Run(tc.name+"/no-preload", func(t *testing.T) {
			verify(t, tc.path, tc.wantIDs, nil)
		})
		t.Run(tc.name+"/preloaded", func(t *testing.T) {
			cache := &Cache{}
			if err := cache.PreloadTags(ctx, db.Tag); err != nil {
				t.Fatalf("preload: %v", err)
			}
			verify(t, tc.path, tc.wantIDs, cache)
		})
	}
}
// A performer whose name begins with a one-character word (e.g. "X Man")
// cannot be found through the 2-rune prefix lookup, because getPathWords
// drops 1-char words. Preload therefore has to keep such performers in the
// alwaysCheck list so they are still regex-tested against every path.
func TestPathToPerformers_SingleLetterFirstWord(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	fixtures := []*models.Performer{
		{ID: 1, Name: "X Man"},
		{ID: 2, Name: "alice smith"},
	}
	db := mocks.NewDatabase()
	primePerformerMock(db.Performer, fixtures)

	cache := &Cache{}
	if err := cache.PreloadPerformers(ctx, db.Performer); err != nil {
		t.Fatal(err)
	}

	matched, err := PathToPerformers(ctx, "/media/X Man.mp4", db.Performer, cache, false)
	if err != nil {
		t.Fatal(err)
	}

	ids := perfIDs(matched)
	if !slices.Equal(ids, []int{1}) {
		t.Errorf("expected [1], got %v", ids)
	}
}
// A studio must be reachable through an alias even when that alias shares
// no prefix with the studio's primary name: "Acme Corp" with alias
// "Widgets Inc" has to match a path containing "Widgets Inc".
func TestPathToStudio_AliasPrefixDistinctFromName(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	acme := &models.Studio{ID: 1, Name: "Acme Corp"}
	studioAliases := map[int][]string{1: {"Widgets Inc"}}
	db := mocks.NewDatabase()
	primeStudioMock(db.Studio, []*models.Studio{acme}, studioAliases)

	cache := &Cache{}
	if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
		t.Fatal(err)
	}

	got, err := PathToStudio(ctx, "/media/Widgets Inc/scene.mp4", db.Studio, cache, false)
	if err != nil {
		t.Fatal(err)
	}

	if matched := got != nil && got.ID == 1; !matched {
		t.Errorf("expected studio 1, got %v", got)
	}
}
// Same for tags.
func TestPathToTags_AliasPrefixDistinctFromName(t *testing.T) {
t.Parallel()
ctx := context.Background()
db := mocks.NewDatabase()
primeTagMock(db.Tag, []*models.Tag{{ID: 1, Name: "documentary"}}, map[int][]string{1: {"film"}})
cache := &Cache{}
if err := cache.PreloadTags(ctx, db.Tag); err != nil {
t.Fatal(err)
}
got, err := PathToTags(ctx, "/media/film/x.mp4", db.Tag, cache, false)
if err != nil {
t.Fatal(err)
}
if ids := tagIDs(got); !slices.Equal(ids, []int{1}) {
t.Errorf("expected [1], got %v", ids)
}
}
// A studio whose primary name and aliases all fall under the same index
// prefix must appear in that prefix bucket exactly once: indexing several
// names of one studio must not add duplicate entries to the bucket.
func TestPathToStudio_MultipleAliasesDedup(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	studio := &models.Studio{ID: 1, Name: "Primary Name"}
	studioAliases := map[int][]string{1: {"Primary Nickname", "Primary Alt"}}
	db := mocks.NewDatabase()
	primeStudioMock(db.Studio, []*models.Studio{studio}, studioAliases)

	cache := &Cache{}
	if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
		t.Fatal(err)
	}

	// "Primary Name", "Primary Nickname" and "Primary Alt" all share the
	// prefix "pr", so the single studio is offered to that bucket three
	// times; it must be stored only once.
	if got := len(cache.studioByPrefix["pr"]); got != 1 {
		t.Errorf("bucket 'pr' should have 1 entry, got %d", got)
	}
}
// Equivalence test: PathToPerformers must return the same performers whether
// it is called with a nil cache or with a Cache. The Cache used here starts
// empty (it is not preloaded) and is shared across all paths, one of which is
// repeated. This is the invariant any caching-based optimization must
// preserve.
func TestPathToPerformers_CachedVsUncached(t *testing.T) {
	t.Parallel()
	ctx := context.Background()

	fixtures := []*models.Performer{
		{ID: 1, Name: "alice smith"},
		{ID: 2, Name: "bob jones"},
		{ID: 3, Name: "charlie"},
		{ID: 4, Name: "david wong"},
	}
	db := mocks.NewDatabase()
	primePerformerMock(db.Performer, fixtures)

	paths := []string{
		"/media/alice smith.jpg",
		"/media/bob_jones.jpg",
		"/media/alice smith and charlie.jpg",
		"/media/nobody.jpg",
		"/media/alice smith.jpg", // repeat: cached regex should not change outcome
	}

	// outcome holds the sorted IDs from both lookups for a single path.
	type outcome struct {
		uncached []int
		cached   []int
	}
	results := make([]outcome, 0, len(paths))

	cache := &Cache{}
	for _, path := range paths {
		plain, err := PathToPerformers(ctx, path, db.Performer, nil, false)
		if err != nil {
			t.Fatal(err)
		}
		viaCache, err := PathToPerformers(ctx, path, db.Performer, cache, false)
		if err != nil {
			t.Fatal(err)
		}
		results = append(results, outcome{
			uncached: perfIDs(plain),
			cached:   perfIDs(viaCache),
		})
	}

	// Compare only after every lookup has succeeded.
	for i, res := range results {
		if slices.Equal(res.uncached, res.cached) {
			continue
		}
		t.Errorf("path %q: no-cache %v vs cached %v", paths[i], res.uncached, res.cached)
	}
}