mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
Replaces the per-file SQL QueryForAutoTag prefilter with an in-memory
2-rune prefix index over performers/studios/tags, preloaded once at job
start. Also:
- runs file processing through job.TaskQueue so scenes/images/
galleries tag in parallel instead of one file at a time
- keyset-paginates the query loop so batch N+1 doesn't pay the
O(offset) scan past large tables
- bulk-loads studio/tag aliases via a new optional AllAliasLoader
interface, avoiding N+1 GetAliases calls during preload
- caches compiled name regexps (same candidate names repeat across
thousands of files)
- hoists strings.ToLower(path) and allASCII(path) out of the per-
candidate match loop
- opens a fresh write txn per applied match instead of holding one
for every tagger phase
Tagger gains *AtPath methods that own the cache + txn manager, letting
the task code stay slim.
426 lines
13 KiB
Go
426 lines
13 KiB
Go
package match
|
|
|
|
import (
|
|
"context"
|
|
"slices"
|
|
"testing"
|
|
|
|
"github.com/stashapp/stash/pkg/models"
|
|
"github.com/stashapp/stash/pkg/models/mocks"
|
|
"github.com/stretchr/testify/mock"
|
|
)
|
|
|
|
// Path-matching semantic tests that lock in the behavior of
|
|
// PathTo{Performers,Studio,Tags} via the generated testify mocks in
|
|
// pkg/models/mocks. These are the regression guard when the candidate-
|
|
// lookup strategy changes (e.g., replacing the SQL prefilter with an
|
|
// in-memory matcher): each case runs against both cache=nil and a
|
|
// preloaded cache, asserting identical output.
|
|
|
|
// --- mock setup helpers ---
|
|
|
|
// preloadFilter matches the filter PreloadX passes: IgnoreAutoTag=false.
|
|
// singleLetterFilter matches the filter the single-letter-cache path
|
|
// passes: a regex in Name. Keeping them disjoint means testify will
|
|
// route each Query call to the right stub regardless of declaration
|
|
// order.
|
|
func performerPreloadFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.PerformerFilterType) bool {
|
|
return f != nil && f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
|
|
})
|
|
}
|
|
func performerSingleLetterFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.PerformerFilterType) bool {
|
|
return f != nil && f.Name != nil
|
|
})
|
|
}
|
|
func studioPreloadFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.StudioFilterType) bool {
|
|
return f != nil && f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
|
|
})
|
|
}
|
|
func studioSingleLetterFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.StudioFilterType) bool {
|
|
return f != nil && f.Name != nil
|
|
})
|
|
}
|
|
func tagPreloadFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.TagFilterType) bool {
|
|
return f != nil && f.IgnoreAutoTag != nil && !*f.IgnoreAutoTag
|
|
})
|
|
}
|
|
func tagSingleLetterFilter() interface{} {
|
|
return mock.MatchedBy(func(f *models.TagFilterType) bool {
|
|
return f != nil && f.Name != nil
|
|
})
|
|
}
|
|
|
|
// primePerformerMock sets up a PerformerReaderWriter to serve both the
|
|
// no-preload path (QueryForAutoTag returns all non-ignored; single-letter
|
|
// Query returns nothing) and the preload path (Query with IgnoreAutoTag
|
|
// filter returns all non-ignored). All expectations are .Maybe() because
|
|
// which ones fire depends on whether the test passes a cache.
|
|
func primePerformerMock(m *mocks.PerformerReaderWriter, performers []*models.Performer) {
|
|
var nonIgnored []*models.Performer
|
|
for _, p := range performers {
|
|
if !p.IgnoreAutoTag {
|
|
nonIgnored = append(nonIgnored, p)
|
|
}
|
|
}
|
|
m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(nonIgnored, nil).Maybe()
|
|
m.On("Query", mock.Anything, performerPreloadFilter(), mock.Anything).Return(nonIgnored, len(nonIgnored), nil).Maybe()
|
|
m.On("Query", mock.Anything, performerSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()
|
|
}
|
|
|
|
func primeStudioMock(m *mocks.StudioReaderWriter, studios []*models.Studio, aliases map[int][]string) {
|
|
var nonIgnored []*models.Studio
|
|
for _, s := range studios {
|
|
if !s.IgnoreAutoTag {
|
|
nonIgnored = append(nonIgnored, s)
|
|
}
|
|
}
|
|
m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(nonIgnored, nil).Maybe()
|
|
m.On("Query", mock.Anything, studioPreloadFilter(), mock.Anything).Return(nonIgnored, len(nonIgnored), nil).Maybe()
|
|
m.On("Query", mock.Anything, studioSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()
|
|
for _, s := range studios {
|
|
m.On("GetAliases", mock.Anything, s.ID).Return(aliases[s.ID], nil).Maybe()
|
|
}
|
|
}
|
|
|
|
func primeTagMock(m *mocks.TagReaderWriter, tags []*models.Tag, aliases map[int][]string) {
|
|
var nonIgnored []*models.Tag
|
|
for _, t := range tags {
|
|
if !t.IgnoreAutoTag {
|
|
nonIgnored = append(nonIgnored, t)
|
|
}
|
|
}
|
|
m.On("QueryForAutoTag", mock.Anything, mock.Anything).Return(nonIgnored, nil).Maybe()
|
|
m.On("Query", mock.Anything, tagPreloadFilter(), mock.Anything).Return(nonIgnored, len(nonIgnored), nil).Maybe()
|
|
m.On("Query", mock.Anything, tagSingleLetterFilter(), mock.Anything).Return(nil, 0, nil).Maybe()
|
|
for _, t := range tags {
|
|
m.On("GetAliases", mock.Anything, t.ID).Return(aliases[t.ID], nil).Maybe()
|
|
}
|
|
}
|
|
|
|
// --- helpers ---
|
|
|
|
func perfIDs(ps []*models.Performer) []int {
|
|
ids := make([]int, 0, len(ps))
|
|
for _, p := range ps {
|
|
ids = append(ids, p.ID)
|
|
}
|
|
slices.Sort(ids)
|
|
return ids
|
|
}
|
|
|
|
func tagIDs(ts []*models.Tag) []int {
|
|
ids := make([]int, 0, len(ts))
|
|
for _, t := range ts {
|
|
ids = append(ids, t.ID)
|
|
}
|
|
slices.Sort(ids)
|
|
return ids
|
|
}
|
|
|
|
// --- tests ---
|
|
|
|
func TestPathToPerformers_Semantics(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
|
|
alice := &models.Performer{ID: 1, Name: "alice smith"}
|
|
bob := &models.Performer{ID: 2, Name: "bob jones"}
|
|
unicodeP := &models.Performer{ID: 3, Name: "伏字"}
|
|
ignored := &models.Performer{ID: 4, Name: "ignored person", IgnoreAutoTag: true}
|
|
substr := &models.Performer{ID: 5, Name: "ali"} // substring of "alice" - should NOT match "alice smith.jpg"
|
|
|
|
performers := []*models.Performer{alice, bob, unicodeP, ignored, substr}
|
|
db := mocks.NewDatabase()
|
|
primePerformerMock(db.Performer, performers)
|
|
|
|
tests := []struct {
|
|
name string
|
|
path string
|
|
wantIDs []int
|
|
}{
|
|
{"plain name match", "/media/alice smith.jpg", []int{1}},
|
|
{"separator variants", "/media/alice.smith.jpg", []int{1}},
|
|
{"separator variants 2", "/media/alice_smith.jpg", []int{1}},
|
|
{"multiple matches", "/media/alice smith and bob jones.jpg", []int{1, 2}},
|
|
{"case insensitive", "/media/ALICE SMITH.jpg", []int{1}},
|
|
{"unicode", "/media/伏字.jpg", []int{3}},
|
|
{"ignore_auto_tag skipped", "/media/ignored person.jpg", nil},
|
|
{"no substring match", "/media/alicent.jpg", nil},
|
|
{"short name does NOT match inside longer", "/media/alice smith.jpg", []int{1}}, // 'ali' should not match
|
|
{"short name matches exact", "/media/ali.jpg", []int{5}},
|
|
{"no match", "/media/nobody here.jpg", nil},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name+"/no-preload", func(t *testing.T) {
|
|
got, err := PathToPerformers(ctx, tt.path, db.Performer, nil, false)
|
|
if err != nil {
|
|
t.Fatalf("unexpected error: %v", err)
|
|
}
|
|
if gotIDs := perfIDs(got); !slices.Equal(gotIDs, tt.wantIDs) {
|
|
t.Errorf("got %v, want %v", gotIDs, tt.wantIDs)
|
|
}
|
|
})
|
|
t.Run(tt.name+"/preloaded", func(t *testing.T) {
|
|
cache := &Cache{}
|
|
if err := cache.PreloadPerformers(ctx, db.Performer); err != nil {
|
|
t.Fatalf("preload: %v", err)
|
|
}
|
|
got, err := PathToPerformers(ctx, tt.path, db.Performer, cache, false)
|
|
if err != nil {
|
|
t.Fatalf("unexpected error: %v", err)
|
|
}
|
|
if gotIDs := perfIDs(got); !slices.Equal(gotIDs, tt.wantIDs) {
|
|
t.Errorf("got %v, want %v", gotIDs, tt.wantIDs)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestPathToStudio_Semantics(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
|
|
s1 := &models.Studio{ID: 1, Name: "first studio"}
|
|
s2 := &models.Studio{ID: 2, Name: "second"}
|
|
s3 := &models.Studio{ID: 3, Name: "third", IgnoreAutoTag: true}
|
|
|
|
studios := []*models.Studio{s1, s2, s3}
|
|
aliases := map[int][]string{2: {"second alias"}}
|
|
db := mocks.NewDatabase()
|
|
primeStudioMock(db.Studio, studios, aliases)
|
|
|
|
tests := []struct {
|
|
name string
|
|
path string
|
|
wantID int // 0 == no match
|
|
}{
|
|
{"primary name", "/first studio/scene.mp4", 1},
|
|
{"alias matches", "/second alias/scene.mp4", 2},
|
|
{"ignore_auto_tag studio skipped", "/third/scene.mp4", 0},
|
|
{"multiple matches - rightmost wins", "/first studio/second/scene.mp4", 2},
|
|
{"no match", "/unknown/scene.mp4", 0},
|
|
}
|
|
|
|
runCase := func(t *testing.T, path string, wantID int, cache *Cache) {
|
|
got, err := PathToStudio(ctx, path, db.Studio, cache, false)
|
|
if err != nil {
|
|
t.Fatalf("unexpected error: %v", err)
|
|
}
|
|
var gotID int
|
|
if got != nil {
|
|
gotID = got.ID
|
|
}
|
|
if gotID != wantID {
|
|
t.Errorf("got %d, want %d", gotID, wantID)
|
|
}
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name+"/no-preload", func(t *testing.T) {
|
|
runCase(t, tt.path, tt.wantID, nil)
|
|
})
|
|
t.Run(tt.name+"/preloaded", func(t *testing.T) {
|
|
cache := &Cache{}
|
|
if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
|
|
t.Fatalf("preload: %v", err)
|
|
}
|
|
runCase(t, tt.path, tt.wantID, cache)
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestPathToTags_Semantics(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
|
|
t1 := &models.Tag{ID: 1, Name: "anime"}
|
|
t2 := &models.Tag{ID: 2, Name: "docs"}
|
|
t3 := &models.Tag{ID: 3, Name: "skip me", IgnoreAutoTag: true}
|
|
|
|
tags := []*models.Tag{t1, t2, t3}
|
|
aliases := map[int][]string{2: {"documentary"}}
|
|
db := mocks.NewDatabase()
|
|
primeTagMock(db.Tag, tags, aliases)
|
|
|
|
tests := []struct {
|
|
name string
|
|
path string
|
|
wantIDs []int
|
|
}{
|
|
{"name match", "/media/anime/x.mp4", []int{1}},
|
|
{"alias match", "/media/documentary/x.mp4", []int{2}},
|
|
{"multiple matches", "/media/anime-documentary/x.mp4", []int{1, 2}},
|
|
{"ignore_auto_tag skipped", "/media/skip me/x.mp4", nil},
|
|
{"no match", "/media/comedy/x.mp4", nil},
|
|
}
|
|
|
|
runCase := func(t *testing.T, path string, wantIDs []int, cache *Cache) {
|
|
got, err := PathToTags(ctx, path, db.Tag, cache, false)
|
|
if err != nil {
|
|
t.Fatalf("unexpected error: %v", err)
|
|
}
|
|
if gotIDs := tagIDs(got); !slices.Equal(gotIDs, wantIDs) {
|
|
t.Errorf("got %v, want %v", gotIDs, wantIDs)
|
|
}
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name+"/no-preload", func(t *testing.T) {
|
|
runCase(t, tt.path, tt.wantIDs, nil)
|
|
})
|
|
t.Run(tt.name+"/preloaded", func(t *testing.T) {
|
|
cache := &Cache{}
|
|
if err := cache.PreloadTags(ctx, db.Tag); err != nil {
|
|
t.Fatalf("preload: %v", err)
|
|
}
|
|
runCase(t, tt.path, tt.wantIDs, cache)
|
|
})
|
|
}
|
|
}
|
|
|
|
// Performer whose name starts with a single-letter word (e.g., "X Man")
|
|
// can't be reached via 2-rune prefix lookup (getPathWords drops 1-char
|
|
// words). The preload must put them in the alwaysCheck list so they're
|
|
// still regex-tested.
|
|
func TestPathToPerformers_SingleLetterFirstWord(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
xman := &models.Performer{ID: 1, Name: "X Man"}
|
|
other := &models.Performer{ID: 2, Name: "alice smith"}
|
|
|
|
db := mocks.NewDatabase()
|
|
primePerformerMock(db.Performer, []*models.Performer{xman, other})
|
|
|
|
cache := &Cache{}
|
|
if err := cache.PreloadPerformers(ctx, db.Performer); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
got, err := PathToPerformers(ctx, "/media/X Man.mp4", db.Performer, cache, false)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if ids := perfIDs(got); !slices.Equal(ids, []int{1}) {
|
|
t.Errorf("expected [1], got %v", ids)
|
|
}
|
|
}
|
|
|
|
// A studio whose name shares no prefix with its aliases must be reachable
|
|
// by alias prefix. "Acme Corp" with alias "Widgets Inc" must match a path
|
|
// containing "widgets inc".
|
|
func TestPathToStudio_AliasPrefixDistinctFromName(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
s := &models.Studio{ID: 1, Name: "Acme Corp"}
|
|
|
|
db := mocks.NewDatabase()
|
|
primeStudioMock(db.Studio, []*models.Studio{s}, map[int][]string{1: {"Widgets Inc"}})
|
|
|
|
cache := &Cache{}
|
|
if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
got, err := PathToStudio(ctx, "/media/Widgets Inc/scene.mp4", db.Studio, cache, false)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if got == nil || got.ID != 1 {
|
|
t.Errorf("expected studio 1, got %v", got)
|
|
}
|
|
}
|
|
|
|
// Same for tags.
|
|
func TestPathToTags_AliasPrefixDistinctFromName(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
|
|
db := mocks.NewDatabase()
|
|
primeTagMock(db.Tag, []*models.Tag{{ID: 1, Name: "documentary"}}, map[int][]string{1: {"film"}})
|
|
|
|
cache := &Cache{}
|
|
if err := cache.PreloadTags(ctx, db.Tag); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
got, err := PathToTags(ctx, "/media/film/x.mp4", db.Tag, cache, false)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if ids := tagIDs(got); !slices.Equal(ids, []int{1}) {
|
|
t.Errorf("expected [1], got %v", ids)
|
|
}
|
|
}
|
|
|
|
// Two aliases on the same studio with different prefixes should each
|
|
// reach the studio. Index bucket must dedupe inside the bucket.
|
|
func TestPathToStudio_MultipleAliasesDedup(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
s := &models.Studio{ID: 1, Name: "Primary Name"}
|
|
|
|
db := mocks.NewDatabase()
|
|
primeStudioMock(db.Studio, []*models.Studio{s}, map[int][]string{1: {"Primary Nickname", "Primary Alt"}})
|
|
|
|
cache := &Cache{}
|
|
if err := cache.PreloadStudios(ctx, db.Studio); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
// Studio "Primary Name" and both aliases all share prefix "pr".
|
|
// The bucket should contain it exactly once.
|
|
if got := len(cache.studioByPrefix["pr"]); got != 1 {
|
|
t.Errorf("bucket 'pr' should have 1 entry, got %d", got)
|
|
}
|
|
}
|
|
|
|
// Equivalence test: the function must return the same result regardless of
|
|
// whether a match.Cache is passed in. This is the invariant that any
|
|
// caching-based optimization must preserve.
|
|
func TestPathToPerformers_CachedVsUncached(t *testing.T) {
|
|
t.Parallel()
|
|
ctx := context.Background()
|
|
|
|
perfs := []*models.Performer{
|
|
{ID: 1, Name: "alice smith"},
|
|
{ID: 2, Name: "bob jones"},
|
|
{ID: 3, Name: "charlie"},
|
|
{ID: 4, Name: "david wong"},
|
|
}
|
|
db := mocks.NewDatabase()
|
|
primePerformerMock(db.Performer, perfs)
|
|
|
|
paths := []string{
|
|
"/media/alice smith.jpg",
|
|
"/media/bob_jones.jpg",
|
|
"/media/alice smith and charlie.jpg",
|
|
"/media/nobody.jpg",
|
|
"/media/alice smith.jpg", // repeat: cached regex should not change outcome
|
|
}
|
|
|
|
var noCache, withCache [][]int
|
|
cache := &Cache{}
|
|
for _, p := range paths {
|
|
uc, err := PathToPerformers(ctx, p, db.Performer, nil, false)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
wc, err := PathToPerformers(ctx, p, db.Performer, cache, false)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
noCache = append(noCache, perfIDs(uc))
|
|
withCache = append(withCache, perfIDs(wc))
|
|
}
|
|
|
|
for i := range paths {
|
|
if !slices.Equal(noCache[i], withCache[i]) {
|
|
t.Errorf("path %q: no-cache %v vs cached %v", paths[i], noCache[i], withCache[i])
|
|
}
|
|
}
|
|
}
|