stash/pkg/match/cache_test.go
abdusalam.dihan cd64433dc5 Speed up file-based auto-tag
Replaces the per-file SQL QueryForAutoTag prefilter with an in-memory
2-rune prefix index over performers/studios/tags, preloaded once at job
start. Also:

  - runs file processing through job.TaskQueue so scenes/images/
    galleries tag in parallel instead of one file at a time
  - keyset-paginates the query loop so batch N+1 doesn't pay the
    O(offset) scan past large tables
  - bulk-loads studio/tag aliases via a new optional AllAliasLoader
    interface, avoiding N+1 GetAliases calls during preload
  - caches compiled name regexps (same candidate names repeat across
    thousands of files)
  - hoists strings.ToLower(path) and allASCII(path) out of the per-
    candidate match loop
  - opens a fresh write txn per applied match instead of holding one
    for every tagger phase

Tagger gains *AtPath methods that own the cache + txn manager, letting
the task code stay slim.
2026-04-19 22:22:37 +01:00

204 lines
5.9 KiB
Go

package match
import (
"context"
"slices"
"testing"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/models/mocks"
)
// TestFirstTwoRunesLower runs firstTwoRunesLower over a table of inputs
// (ASCII, accented, CJK, mixed case, and inputs shorter than two runes) and
// compares each result with the expected prefix.
func TestFirstTwoRunesLower(t *testing.T) {
	t.Parallel()

	type testCase struct {
		in   string
		want string
	}
	cases := []testCase{
		{in: "alice smith", want: "al"},
		{in: "ALICE", want: "al"},
		{in: "Àbc", want: "àb"},
		{in: "伏字 name", want: "伏字"},
		{in: "ab", want: "ab"},
		{in: "a", want: ""},       // one rune only: expect no prefix
		{in: "", want: ""},        // empty input: expect no prefix
		{in: "X Man", want: "x "}, // the space is kept as the second rune
	}

	for _, tc := range cases {
		// The raw input doubles as the subtest name.
		t.Run(tc.in, func(t *testing.T) {
			t.Parallel()

			got := firstTwoRunesLower(tc.in)
			if got == tc.want {
				return
			}
			t.Errorf("firstTwoRunesLower(%q) = %q, want %q", tc.in, got, tc.want)
		})
	}
}
// TestCacheNameRegexpCaches checks three properties of Cache.nameRegexp:
// repeated identical calls return the same regexp, flipping the boolean flag
// returns a different one, and a nil *Cache receiver still returns a non-nil
// regexp.
func TestCacheNameRegexpCaches(t *testing.T) {
	t.Parallel()

	const name = "alice smith"
	cache := &Cache{}

	unicodeRe := cache.nameRegexp(name, true)
	if again := cache.nameRegexp(name, true); again != unicodeRe {
		t.Error("expected cached regexp to be reused across calls")
	}

	// Same name with the flag flipped must not resolve to the same entry.
	if asciiRe := cache.nameRegexp(name, false); asciiRe == unicodeRe {
		t.Error("expected ASCII and unicode variants to be distinct cached entries")
	}

	// Calling through a nil receiver must still yield a usable regexp.
	var missing *Cache
	if missing.nameRegexp(name, true) == nil {
		t.Error("nil cache should still return a regexp")
	}
}
func TestPreloadPerformersBuildsIndex(t *testing.T) {
t.Parallel()
alice := &models.Performer{ID: 1, Name: "Alice Smith"}
bob := &models.Performer{ID: 2, Name: "bob jones"}
xman := &models.Performer{ID: 3, Name: "X Man"}
ignored := &models.Performer{ID: 4, Name: "ignored", IgnoreAutoTag: true}
performers := []*models.Performer{alice, bob, xman, ignored}
db := mocks.NewDatabase()
primePerformerMock(db.Performer, performers)
c := &Cache{}
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
t.Fatalf("PreloadPerformers: %v", err)
}
// allPerformers excludes IgnoreAutoTag=true.
if got := len(c.allPerformers); got != 3 {
t.Errorf("allPerformers len = %d, want 3 (ignored must be excluded)", got)
}
// Prefix "al" -> alice, "bo" -> bob, "x " -> xman.
assertBucket := func(prefix string, wantIDs []int) {
t.Helper()
var gotIDs []int
for _, p := range c.performerByPrefix[prefix] {
gotIDs = append(gotIDs, p.ID)
}
slices.Sort(gotIDs)
if !slices.Equal(gotIDs, wantIDs) {
t.Errorf("bucket %q = %v, want %v", prefix, gotIDs, wantIDs)
}
}
assertBucket("al", []int{1})
assertBucket("bo", []int{2})
assertBucket("x ", []int{3})
// Single-letter-first-word performer must also be in alwaysCheck.
var alwaysIDs []int
for _, p := range c.performerAlwaysCheck {
alwaysIDs = append(alwaysIDs, p.ID)
}
if !slices.Equal(alwaysIDs, []int{3}) {
t.Errorf("alwaysCheck IDs = %v, want [3]", alwaysIDs)
}
// Idempotent: second call is a no-op.
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
t.Fatalf("second PreloadPerformers: %v", err)
}
if got := len(c.allPerformers); got != 3 {
t.Errorf("after idempotent call allPerformers len = %d, want 3", got)
}
}
// TestPreloadStudiosIndexesAliasPrefixes preloads one studio named
// "Acme Corp" with the alias "Widgets Inc" (plus an IgnoreAutoTag studio) and
// expects the studio to be reachable under both the "ac" and "wi" prefix
// buckets, with the ignored studio left out of allStudios.
func TestPreloadStudiosIndexesAliasPrefixes(t *testing.T) {
	t.Parallel()

	// The name and the alias start with different 2-rune prefixes, so each
	// must be indexed on its own.
	studios := []*models.Studio{
		{ID: 1, Name: "Acme Corp"},
		{ID: 2, Name: "ignored", IgnoreAutoTag: true},
	}
	aliases := map[int][]string{1: {"Widgets Inc"}}

	store := mocks.NewDatabase()
	primeStudioMock(store.Studio, studios, aliases)

	cache := &Cache{}
	if err := cache.PreloadStudios(context.Background(), store.Studio); err != nil {
		t.Fatalf("PreloadStudios: %v", err)
	}

	if n := len(cache.allStudios); n != 1 {
		t.Errorf("allStudios len = %d, want 1 (ignored must be excluded)", n)
	}

	// Reached through the studio name.
	byName := cache.studioByPrefix["ac"]
	if len(byName) != 1 || byName[0].Studio.ID != 1 {
		t.Errorf("bucket 'ac' should hold studio 1, got %+v", byName)
	}

	// Reached through the alias.
	byAlias := cache.studioByPrefix["wi"]
	if len(byAlias) != 1 || byAlias[0].Studio.ID != 1 {
		t.Errorf("bucket 'wi' should hold studio 1, got %+v", byAlias)
	}
}
func TestPreloadStudiosDedupsSharedPrefix(t *testing.T) {
t.Parallel()
// Name and two aliases all share prefix "pr"; the bucket must contain
// the studio exactly once.
s := &models.Studio{ID: 1, Name: "Primary"}
db := mocks.NewDatabase()
primeStudioMock(db.Studio, []*models.Studio{s}, map[int][]string{1: {"Primary Nick", "Primary Alt"}})
c := &Cache{}
if err := c.PreloadStudios(context.Background(), db.Studio); err != nil {
t.Fatal(err)
}
if got := len(c.studioByPrefix["pr"]); got != 1 {
t.Errorf("bucket 'pr' should have 1 entry, got %d", got)
}
}
// TestPreloadTagsIndexesAliasPrefixes preloads one tag named "documentary"
// with the alias "film" and expects the tag to be reachable under both the
// "do" bucket (from its name) and the "fi" bucket (from its alias).
//
// Failure messages include the actual bucket contents and the preload error
// carries a context prefix, matching the sibling studio preload test.
func TestPreloadTagsIndexesAliasPrefixes(t *testing.T) {
	t.Parallel()

	db := mocks.NewDatabase()
	primeTagMock(db.Tag, []*models.Tag{{ID: 1, Name: "documentary"}}, map[int][]string{1: {"film"}})

	c := &Cache{}
	if err := c.PreloadTags(context.Background(), db.Tag); err != nil {
		t.Fatalf("PreloadTags: %v", err)
	}

	// Reached through the tag name.
	if got := c.tagByPrefix["do"]; len(got) != 1 || got[0].Tag.ID != 1 {
		t.Errorf("bucket 'do' should hold tag 1, got %+v", got)
	}

	// Reached through the alias.
	if got := c.tagByPrefix["fi"]; len(got) != 1 || got[0].Tag.ID != 1 {
		t.Errorf("bucket 'fi' should hold tag 1 (via alias), got %+v", got)
	}
}
func TestCandidateLookupDedupesAcrossPathWords(t *testing.T) {
t.Parallel()
// A performer with name "alabama" falls in bucket "al". If a path has
// two words that both map to bucket "al" (e.g., from separate tokens),
// the candidate must appear exactly once.
p := &models.Performer{ID: 1, Name: "alabama"}
db := mocks.NewDatabase()
primePerformerMock(db.Performer, []*models.Performer{p})
c := &Cache{}
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
t.Fatal(err)
}
got := c.performerCandidates([]string{"al", "AL", "al"}) // same bucket three times
if len(got) != 1 {
t.Errorf("expected 1 candidate after dedup, got %d: %v", len(got), got)
}
}