mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
Replaces the per-file SQL QueryForAutoTag prefilter with an in-memory
2-rune prefix index over performers/studios/tags, preloaded once at job
start. Also:

- runs file processing through job.TaskQueue so scenes/images/
  galleries are tagged in parallel instead of one file at a time
- keyset-paginates the query loop so batch N+1 doesn't pay the
  O(offset) scan past large tables
- bulk-loads studio/tag aliases via a new optional AllAliasLoader
  interface, avoiding N+1 GetAliases calls during preload
- caches compiled name regexps (same candidate names repeat across
  thousands of files)
- hoists strings.ToLower(path) and allASCII(path) out of the per-
  candidate match loop
- opens a fresh write txn per applied match instead of holding one
  for every tagger phase

Tagger gains *AtPath methods that own the cache + txn manager, letting
the task code stay slim.
204 lines
5.9 KiB
Go
204 lines
5.9 KiB
Go
package match
|
|
|
|
import (
|
|
"context"
|
|
"slices"
|
|
"testing"
|
|
|
|
"github.com/stashapp/stash/pkg/models"
|
|
"github.com/stashapp/stash/pkg/models/mocks"
|
|
)
|
|
|
|
func TestFirstTwoRunesLower(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tests := []struct {
|
|
in string
|
|
want string
|
|
}{
|
|
{"alice smith", "al"},
|
|
{"ALICE", "al"},
|
|
{"Àbc", "àb"},
|
|
{"伏字 name", "伏字"},
|
|
{"ab", "ab"},
|
|
{"a", ""}, // single rune -> no prefix
|
|
{"", ""}, // empty -> no prefix
|
|
{"X Man", "x "}, // space is preserved in 2-rune prefix
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.in, func(t *testing.T) {
|
|
t.Parallel()
|
|
if got := firstTwoRunesLower(tt.in); got != tt.want {
|
|
t.Errorf("firstTwoRunesLower(%q) = %q, want %q", tt.in, got, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestCacheNameRegexpCaches(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
c := &Cache{}
|
|
r1 := c.nameRegexp("alice smith", true)
|
|
r2 := c.nameRegexp("alice smith", true)
|
|
if r1 != r2 {
|
|
t.Error("expected cached regexp to be reused across calls")
|
|
}
|
|
|
|
// Different useUnicode flag -> different cached regexp.
|
|
r3 := c.nameRegexp("alice smith", false)
|
|
if r3 == r1 {
|
|
t.Error("expected ASCII and unicode variants to be distinct cached entries")
|
|
}
|
|
|
|
// Nil cache must still return a valid regexp, just uncached.
|
|
var nilCache *Cache
|
|
if got := nilCache.nameRegexp("alice smith", true); got == nil {
|
|
t.Error("nil cache should still return a regexp")
|
|
}
|
|
}
|
|
|
|
func TestPreloadPerformersBuildsIndex(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
alice := &models.Performer{ID: 1, Name: "Alice Smith"}
|
|
bob := &models.Performer{ID: 2, Name: "bob jones"}
|
|
xman := &models.Performer{ID: 3, Name: "X Man"}
|
|
ignored := &models.Performer{ID: 4, Name: "ignored", IgnoreAutoTag: true}
|
|
|
|
performers := []*models.Performer{alice, bob, xman, ignored}
|
|
db := mocks.NewDatabase()
|
|
primePerformerMock(db.Performer, performers)
|
|
|
|
c := &Cache{}
|
|
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
|
|
t.Fatalf("PreloadPerformers: %v", err)
|
|
}
|
|
|
|
// allPerformers excludes IgnoreAutoTag=true.
|
|
if got := len(c.allPerformers); got != 3 {
|
|
t.Errorf("allPerformers len = %d, want 3 (ignored must be excluded)", got)
|
|
}
|
|
|
|
// Prefix "al" -> alice, "bo" -> bob, "x " -> xman.
|
|
assertBucket := func(prefix string, wantIDs []int) {
|
|
t.Helper()
|
|
var gotIDs []int
|
|
for _, p := range c.performerByPrefix[prefix] {
|
|
gotIDs = append(gotIDs, p.ID)
|
|
}
|
|
slices.Sort(gotIDs)
|
|
if !slices.Equal(gotIDs, wantIDs) {
|
|
t.Errorf("bucket %q = %v, want %v", prefix, gotIDs, wantIDs)
|
|
}
|
|
}
|
|
assertBucket("al", []int{1})
|
|
assertBucket("bo", []int{2})
|
|
assertBucket("x ", []int{3})
|
|
|
|
// Single-letter-first-word performer must also be in alwaysCheck.
|
|
var alwaysIDs []int
|
|
for _, p := range c.performerAlwaysCheck {
|
|
alwaysIDs = append(alwaysIDs, p.ID)
|
|
}
|
|
if !slices.Equal(alwaysIDs, []int{3}) {
|
|
t.Errorf("alwaysCheck IDs = %v, want [3]", alwaysIDs)
|
|
}
|
|
|
|
// Idempotent: second call is a no-op.
|
|
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
|
|
t.Fatalf("second PreloadPerformers: %v", err)
|
|
}
|
|
if got := len(c.allPerformers); got != 3 {
|
|
t.Errorf("after idempotent call allPerformers len = %d, want 3", got)
|
|
}
|
|
}
|
|
|
|
func TestPreloadStudiosIndexesAliasPrefixes(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Name "Acme" shares no prefix with alias "Widgets" — both must be
|
|
// reachable by their own 2-rune prefix.
|
|
s := &models.Studio{ID: 1, Name: "Acme Corp"}
|
|
ignored := &models.Studio{ID: 2, Name: "ignored", IgnoreAutoTag: true}
|
|
|
|
db := mocks.NewDatabase()
|
|
primeStudioMock(db.Studio, []*models.Studio{s, ignored}, map[int][]string{1: {"Widgets Inc"}})
|
|
|
|
c := &Cache{}
|
|
if err := c.PreloadStudios(context.Background(), db.Studio); err != nil {
|
|
t.Fatalf("PreloadStudios: %v", err)
|
|
}
|
|
|
|
if got := len(c.allStudios); got != 1 {
|
|
t.Errorf("allStudios len = %d, want 1 (ignored must be excluded)", got)
|
|
}
|
|
|
|
// "ac" bucket has the studio (via name), "wi" bucket has it (via alias).
|
|
if len(c.studioByPrefix["ac"]) != 1 || c.studioByPrefix["ac"][0].Studio.ID != 1 {
|
|
t.Errorf("bucket 'ac' should hold studio 1, got %+v", c.studioByPrefix["ac"])
|
|
}
|
|
if len(c.studioByPrefix["wi"]) != 1 || c.studioByPrefix["wi"][0].Studio.ID != 1 {
|
|
t.Errorf("bucket 'wi' should hold studio 1, got %+v", c.studioByPrefix["wi"])
|
|
}
|
|
}
|
|
|
|
func TestPreloadStudiosDedupsSharedPrefix(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Name and two aliases all share prefix "pr"; the bucket must contain
|
|
// the studio exactly once.
|
|
s := &models.Studio{ID: 1, Name: "Primary"}
|
|
db := mocks.NewDatabase()
|
|
primeStudioMock(db.Studio, []*models.Studio{s}, map[int][]string{1: {"Primary Nick", "Primary Alt"}})
|
|
|
|
c := &Cache{}
|
|
if err := c.PreloadStudios(context.Background(), db.Studio); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if got := len(c.studioByPrefix["pr"]); got != 1 {
|
|
t.Errorf("bucket 'pr' should have 1 entry, got %d", got)
|
|
}
|
|
}
|
|
|
|
func TestPreloadTagsIndexesAliasPrefixes(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
db := mocks.NewDatabase()
|
|
primeTagMock(db.Tag, []*models.Tag{{ID: 1, Name: "documentary"}}, map[int][]string{1: {"film"}})
|
|
|
|
c := &Cache{}
|
|
if err := c.PreloadTags(context.Background(), db.Tag); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
if len(c.tagByPrefix["do"]) != 1 || c.tagByPrefix["do"][0].Tag.ID != 1 {
|
|
t.Errorf("bucket 'do' should hold tag 1")
|
|
}
|
|
if len(c.tagByPrefix["fi"]) != 1 || c.tagByPrefix["fi"][0].Tag.ID != 1 {
|
|
t.Errorf("bucket 'fi' should hold tag 1 (via alias)")
|
|
}
|
|
}
|
|
|
|
func TestCandidateLookupDedupesAcrossPathWords(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// A performer with name "alabama" falls in bucket "al". If a path has
|
|
// two words that both map to bucket "al" (e.g., from separate tokens),
|
|
// the candidate must appear exactly once.
|
|
p := &models.Performer{ID: 1, Name: "alabama"}
|
|
db := mocks.NewDatabase()
|
|
primePerformerMock(db.Performer, []*models.Performer{p})
|
|
|
|
c := &Cache{}
|
|
if err := c.PreloadPerformers(context.Background(), db.Performer); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
got := c.performerCandidates([]string{"al", "AL", "al"}) // same bucket three times
|
|
if len(got) != 1 {
|
|
t.Errorf("expected 1 candidate after dedup, got %d: %v", len(got), got)
|
|
}
|
|
}
|