stash/pkg/match/cache.go
abdusalam.dihan 93a05fce8f Use sync.Map instead of LRU for the per-job regexp cache
The preceding commit added lru.Cache for the compiled-regexp cache to
match the style in pkg/sqlite/regex.go. That file's use case is
different: a small bounded cache serving a read-dominated workload. The
auto-tag regexp cache is job-scoped (so eviction buys us nothing) and
hit by every worker on every candidate (so the LRU's per-Get mutex
becomes contention, measurable under the parallel worker pool).

sync.Map's read-optimised path avoids the contention without changing
any observable behavior. Kept as a separate commit so it can be
reverted independently if upstream prefers the LRU approach — the
first commit stands on its own either way.
2026-04-19 22:22:37 +01:00

463 lines
13 KiB
Go

package match
import (
"context"
"regexp"
"strings"
"sync"
"github.com/stashapp/stash/pkg/models"
)
// singleFirstCharacterRegex matches a string that begins with exactly one
// Unicode letter (\p{L}) immediately followed by one of the separator
// characters '.', '-', '_' or space, i.e. a name whose first "word" is a
// single letter. It is used both as a query value for regex criteria and,
// compiled, for in-memory checks.
const singleFirstCharacterRegex = `^[\p{L}][.\-_ ]`
// singleFirstCharacterRE is singleFirstCharacterRegex compiled once at
// package initialisation.
var singleFirstCharacterRE = regexp.MustCompile(singleFirstCharacterRegex)
// firstTwoRunesLower returns the lowercased form of the first two runes of
// s, or "" when s holds fewer than two runes. The result is the key format
// used by the prefix indexes in this file.
//
// Only the leading two runes are decoded and lowercased; the remainder of
// s is never inspected.
func firstTwoRunesLower(s string) string {
	var head [2]rune
	filled := 0
	for _, r := range s {
		head[filled] = r
		filled++
		if filled == len(head) {
			return strings.ToLower(string(head[:]))
		}
	}
	// Ran out of input before collecting two runes.
	return ""
}
// performerCandidates collects the preloaded performers worth regex-checking
// for the supplied path words: every performer indexed under the lowercased
// form of a path word, followed by every entry in performerAlwaysCheck
// (performers whose name starts with a single-letter word, which a 2-rune
// prefix lookup cannot reach). This is the in-memory counterpart of the SQL
// `name LIKE 'xx%' OR name LIKE 'yy%'` prefilter.
//
// Each performer appears at most once, in first-encounter order. Returns nil
// when both the prefix index and the always-check list are empty.
func (c *Cache) performerCandidates(pathWords []string) []*models.Performer {
	if len(c.performerByPrefix)+len(c.performerAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := 2 * len(pathWords)
	visited := make(map[int]bool, sizeHint)
	candidates := make([]*models.Performer, 0, sizeHint)

	// collect appends the not-yet-seen members of group, keyed by ID.
	collect := func(group []*models.Performer) {
		for _, perf := range group {
			if visited[perf.ID] {
				continue
			}
			visited[perf.ID] = true
			candidates = append(candidates, perf)
		}
	}

	for _, word := range pathWords {
		collect(c.performerByPrefix[strings.ToLower(word)])
	}
	collect(c.performerAlwaysCheck)

	return candidates
}
// studioCandidates collects the preloaded studios worth regex-checking for
// the supplied path words: every studio indexed under the lowercased form of
// a path word, followed by every entry in studioAlwaysCheck.
//
// Each studio appears at most once (deduplicated by Studio.ID), in
// first-encounter order. Returns nil when both the prefix index and the
// always-check list are empty.
func (c *Cache) studioCandidates(pathWords []string) []cachedStudio {
	if len(c.studioByPrefix)+len(c.studioAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := 2 * len(pathWords)
	visited := make(map[int]bool, sizeHint)
	candidates := make([]cachedStudio, 0, sizeHint)

	// collect appends the not-yet-seen members of group, keyed by studio ID.
	collect := func(group []cachedStudio) {
		for _, entry := range group {
			if visited[entry.Studio.ID] {
				continue
			}
			visited[entry.Studio.ID] = true
			candidates = append(candidates, entry)
		}
	}

	for _, word := range pathWords {
		collect(c.studioByPrefix[strings.ToLower(word)])
	}
	collect(c.studioAlwaysCheck)

	return candidates
}
// tagCandidates collects the preloaded tags worth regex-checking for the
// supplied path words: every tag indexed under the lowercased form of a path
// word, followed by every entry in tagAlwaysCheck.
//
// Each tag appears at most once (deduplicated by Tag.ID), in first-encounter
// order. Returns nil when both the prefix index and the always-check list
// are empty.
func (c *Cache) tagCandidates(pathWords []string) []cachedTag {
	if len(c.tagByPrefix)+len(c.tagAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := 2 * len(pathWords)
	visited := make(map[int]bool, sizeHint)
	candidates := make([]cachedTag, 0, sizeHint)

	// collect appends the not-yet-seen members of group, keyed by tag ID.
	collect := func(group []cachedTag) {
		for _, entry := range group {
			if visited[entry.Tag.ID] {
				continue
			}
			visited[entry.Tag.ID] = true
			candidates = append(candidates, entry)
		}
	}

	for _, word := range pathWords {
		collect(c.tagByPrefix[strings.ToLower(word)])
	}
	collect(c.tagAlwaysCheck)

	return candidates
}
// Cache is used to cache queries that should not change across an autotag
// process.
//
// The lazily-run single-letter queries are guarded by sync.Once and the
// regexp cache is a sync.Map, so those paths may be used from multiple
// goroutines.
// NOTE(review): the Preload* methods assign the preload/index fields
// without any lock; presumably they must complete before the cache is
// shared between workers — confirm against the callers.
type Cache struct {
// The *Once fields make the corresponding getSingleLetter* query run at
// most once per Cache; the *Err fields hold the error from that single
// run (nil on success) and are returned on every subsequent call.
performersOnce sync.Once
performersErr error
studiosOnce sync.Once
studiosErr error
tagsOnce sync.Once
tagsErr error
// Results of the getSingleLetter* queries. Stored as a non-nil empty
// slice when the query succeeded but matched nothing.
singleCharPerformers []*models.Performer
singleCharStudios []*models.Studio
singleCharTags []*models.Tag
// Preloaded candidate sets. When populated (via PreloadX), the
// PathTo* functions skip the per-path QueryForAutoTag DB roundtrip
// and consult the in-memory prefix index instead. Nil means
// "not preloaded, fall back to the old SQL-prefilter path".
allPerformers []*models.Performer
allStudios []cachedStudio
allTags []cachedTag
// Prefix indexes built at preload time. Map key is the first two
// lowercased runes of name (or alias, for studios/tags). The
// *AlwaysCheck slices hold entries whose first "word" is a
// single letter — they wouldn't be reached by 2-rune path word
// lookup, so they must always be checked (mirroring the existing
// single-letter regex query).
performerByPrefix map[string][]*models.Performer
performerAlwaysCheck []*models.Performer
studioByPrefix map[string][]cachedStudio
studioAlwaysCheck []cachedStudio
tagByPrefix map[string][]cachedTag
tagAlwaysCheck []cachedTag
// regexpCache maps regexpCacheKey → *regexp.Regexp. sync.Map rather
// than the hashicorp LRU used in pkg/sqlite/regex.go: this cache is
// job-scoped (so LRU's eviction buys us nothing) and is hit by every
// worker on every candidate (so a single-mutex Get becomes the
// bottleneck). sync.Map's read-optimised path sidesteps that.
regexpCache sync.Map
}
// cachedStudio bundles a studio with its aliases so PathToStudio can match
// against both without an N+1 GetAliases query.
type cachedStudio struct {
// Studio is the studio record as returned by the preload query.
Studio *models.Studio
// Aliases holds the aliases looked up for Studio.ID at preload time;
// nil when none were found.
Aliases []string
}
// cachedTag bundles a tag with its aliases so PathToTags can match against
// both without an N+1 GetAliases query.
type cachedTag struct {
// Tag is the tag record as returned by the preload query.
Tag *models.Tag
// Aliases holds the aliases looked up for Tag.ID at preload time; nil
// when none were found.
Aliases []string
}
// PreloadPerformers queries performers with IgnoreAutoTag=false and
// PerPage=-1 (presumably "no page limit" — confirm against the query layer),
// stores them on the cache, and builds the 2-rune name-prefix index plus the
// always-check list of performers whose name starts with a single-letter
// word. Subsequent PathToPerformers calls can then skip the per-path
// QueryForAutoTag and the per-candidate regex when no prefix matches.
//
// The call is a no-op once performers have been preloaded (allPerformers is
// non-nil; an empty result is stored as a non-nil empty slice for that
// reason). A query error is returned unchanged and leaves the cache
// untouched.
//
// NOTE(review): fields are written without locking; presumably this runs
// before the cache is shared between goroutines.
func (c *Cache) PreloadPerformers(ctx context.Context, reader models.PerformerAutoTagQueryer) error {
	if c.allPerformers != nil {
		return nil
	}

	ignored, pageSize := false, -1
	filter := &models.PerformerFilterType{IgnoreAutoTag: &ignored}
	paging := &models.FindFilterType{PerPage: &pageSize}

	loaded, _, err := reader.Query(ctx, filter, paging)
	if err != nil {
		return err
	}
	if loaded == nil {
		// Keep the "already preloaded" marker non-nil even with no rows.
		loaded = []*models.Performer{}
	}

	c.allPerformers = loaded
	c.performerByPrefix = make(map[string][]*models.Performer, len(loaded))

	for _, perf := range loaded {
		prefix := firstTwoRunesLower(perf.Name)
		if prefix != "" {
			c.performerByPrefix[prefix] = append(c.performerByPrefix[prefix], perf)
		}
		if singleFirstCharacterRE.MatchString(perf.Name) {
			c.performerAlwaysCheck = append(c.performerAlwaysCheck, perf)
		}
	}

	return nil
}
// loadAllAliases returns a map from entity id to its aliases.
//
// When reader also implements models.AllAliasLoader, the result of its
// GetAllAliases call is returned as-is (one bulk call instead of an N+1
// per-id roundtrip); ids is not consulted on that path. Otherwise
// GetAliases is called once per id, and only ids with at least one alias get
// a map entry. The first error encountered is returned with a nil map.
func loadAllAliases(ctx context.Context, reader models.AliasLoader, ids []int) (map[int][]string, error) {
	bulk, isBulk := reader.(models.AllAliasLoader)
	if isBulk {
		return bulk.GetAllAliases(ctx)
	}

	byID := make(map[int][]string, len(ids))
	for i := range ids {
		id := ids[i]
		aliases, err := reader.GetAliases(ctx, id)
		switch {
		case err != nil:
			return nil, err
		case len(aliases) > 0:
			byID[id] = aliases
		}
	}

	return byID, nil
}
// PreloadStudios queries studios with IgnoreAutoTag=false and PerPage=-1
// (presumably "no page limit" — confirm against the query layer), loads
// their aliases, stores the studio+alias bundles on the cache, and builds a
// 2-rune prefix index over names AND aliases (mirroring the SQL LEFT JOIN on
// studio_aliases). Studios whose name or any alias starts with a
// single-letter word are additionally recorded in studioAlwaysCheck.
//
// The call is a no-op once studios have been preloaded (allStudios is
// non-nil). Query or alias-loading errors are returned unchanged and leave
// the cache untouched.
//
// NOTE(review): fields are written without locking; presumably this runs
// before the cache is shared between goroutines.
func (c *Cache) PreloadStudios(ctx context.Context, reader models.StudioAutoTagQueryer) error {
	if c.allStudios != nil {
		return nil
	}

	ignored, pageSize := false, -1
	studios, _, err := reader.Query(ctx,
		&models.StudioFilterType{IgnoreAutoTag: &ignored},
		&models.FindFilterType{PerPage: &pageSize},
	)
	if err != nil {
		return err
	}

	ids := make([]int, 0, len(studios))
	for _, studio := range studios {
		ids = append(ids, studio.ID)
	}

	aliasesByID, err := loadAllAliases(ctx, reader, ids)
	if err != nil {
		return err
	}

	c.studioByPrefix = make(map[string][]cachedStudio, len(studios))
	// Tracks which studio ids are already filed under each prefix.
	filed := make(map[string]map[int]bool)
	// make guarantees a non-nil slice, so an empty result still marks the
	// cache as preloaded.
	bundles := make([]cachedStudio, 0, len(studios))

	for _, studio := range studios {
		entry := cachedStudio{Studio: studio, Aliases: aliasesByID[studio.ID]}
		bundles = append(bundles, entry)

		c.indexByPrefix(studio.ID, studio.Name, entry.Aliases, filed, func(prefix string) {
			c.studioByPrefix[prefix] = append(c.studioByPrefix[prefix], entry)
		})

		if hasSingleFirstChar(studio.Name, entry.Aliases) {
			c.studioAlwaysCheck = append(c.studioAlwaysCheck, entry)
		}
	}

	c.allStudios = bundles
	return nil
}
// PreloadTags queries tags with IgnoreAutoTag=false and PerPage=-1
// (presumably "no page limit" — confirm against the query layer), loads
// their aliases, stores the tag+alias bundles on the cache, and builds a
// 2-rune prefix index over names AND aliases. Tags whose name or any alias
// starts with a single-letter word are additionally recorded in
// tagAlwaysCheck.
//
// The call is a no-op once tags have been preloaded (allTags is non-nil).
// Query or alias-loading errors are returned unchanged and leave the cache
// untouched.
//
// NOTE(review): fields are written without locking; presumably this runs
// before the cache is shared between goroutines.
func (c *Cache) PreloadTags(ctx context.Context, reader models.TagAutoTagQueryer) error {
	if c.allTags != nil {
		return nil
	}

	ignored, pageSize := false, -1
	tags, _, err := reader.Query(ctx,
		&models.TagFilterType{IgnoreAutoTag: &ignored},
		&models.FindFilterType{PerPage: &pageSize},
	)
	if err != nil {
		return err
	}

	ids := make([]int, 0, len(tags))
	for _, tag := range tags {
		ids = append(ids, tag.ID)
	}

	aliasesByID, err := loadAllAliases(ctx, reader, ids)
	if err != nil {
		return err
	}

	c.tagByPrefix = make(map[string][]cachedTag, len(tags))
	// Tracks which tag ids are already filed under each prefix.
	filed := make(map[string]map[int]bool)
	// make guarantees a non-nil slice, so an empty result still marks the
	// cache as preloaded.
	bundles := make([]cachedTag, 0, len(tags))

	for _, tag := range tags {
		entry := cachedTag{Tag: tag, Aliases: aliasesByID[tag.ID]}
		bundles = append(bundles, entry)

		c.indexByPrefix(tag.ID, tag.Name, entry.Aliases, filed, func(prefix string) {
			c.tagByPrefix[prefix] = append(c.tagByPrefix[prefix], entry)
		})

		if hasSingleFirstChar(tag.Name, entry.Aliases) {
			c.tagAlwaysCheck = append(c.tagAlwaysCheck, entry)
		}
	}

	c.allTags = bundles
	return nil
}
// indexByPrefix invokes add once for every distinct 2-rune prefix found
// among name and aliases, skipping any prefix under which id has already
// been recorded in seen. This keeps an entity from being filed twice in the
// same bucket when its name and an alias (or two aliases) share a prefix.
//
// seen maps prefix → set of ids already filed there; it is updated in place
// and is meant to be shared across calls for the same index. Labels shorter
// than two runes yield no prefix and are ignored. name is processed first,
// then aliases in order.
func (c *Cache) indexByPrefix(id int, name string, aliases []string, seen map[string]map[int]bool, add func(prefix string)) {
	record := func(prefix string) {
		if prefix == "" {
			return
		}
		owners := seen[prefix]
		if owners == nil {
			owners = make(map[int]bool)
			seen[prefix] = owners
		}
		if owners[id] {
			return
		}
		owners[id] = true
		add(prefix)
	}

	record(firstTwoRunesLower(name))
	for i := range aliases {
		record(firstTwoRunesLower(aliases[i]))
	}
}
// hasSingleFirstChar reports whether name, or any of aliases, matches
// singleFirstCharacterRE — i.e. begins with a single letter followed by one
// of the separator characters. name is tested first; aliases are tested in
// order and the scan stops at the first match.
func hasSingleFirstChar(name string, aliases []string) bool {
	matched := singleFirstCharacterRE.MatchString(name)
	for i := 0; !matched && i < len(aliases); i++ {
		matched = singleFirstCharacterRE.MatchString(aliases[i])
	}
	return matched
}
// regexpCacheKey is the key under which Cache.nameRegexp stores a compiled
// regexp: the pair of arguments that was passed to nameToRegexp.
type regexpCacheKey struct {
// name is the entity name (or alias) the regexp was built from.
name string
// useUnicode is the flag forwarded to nameToRegexp for this entry.
useUnicode bool
}
// nameRegexp returns the compiled regexp for (name, useUnicode), building it
// with nameToRegexp on first use and serving it from c.regexpCache
// afterwards, so repeated autotag calls across many files don't pay the
// compile cost each time.
//
// A nil receiver is allowed: the regexp is then compiled on every call and
// nothing is cached. When two goroutines miss on the same key at once, both
// compile, but LoadOrStore ensures every caller gets the instance that was
// stored first.
func (c *Cache) nameRegexp(name string, useUnicode bool) *regexp.Regexp {
	if c == nil {
		return nameToRegexp(name, useUnicode)
	}

	cacheKey := regexpCacheKey{name: name, useUnicode: useUnicode}
	cached, hit := c.regexpCache.Load(cacheKey)
	if !hit {
		cached, _ = c.regexpCache.LoadOrStore(cacheKey, nameToRegexp(name, useUnicode))
	}
	return cached.(*regexp.Regexp)
}
// getSingleLetterPerformers returns all performers whose names start with a
// single-character word (names matching singleFirstCharacterRegex).
// The autotag query splits the path into two-character words to query
// against, which means performers with a single-letter first word could
// otherwise be missed.
//
// This query is expensive, so it is run once and cached if a cache is
// provided. With a nil cache a throwaway Cache is used and the query runs on
// every call. Because the query is guarded by sync.Once, a failure is cached
// too: the same error is returned on every later call with that cache.
// On success with no matches a non-nil empty slice is returned.
func getSingleLetterPerformers(ctx context.Context, c *Cache, reader models.PerformerAutoTagQueryer) ([]*models.Performer, error) {
	if c == nil {
		c = &Cache{}
	}

	load := func() {
		pageSize := -1
		filter := &models.PerformerFilterType{
			Name: &models.StringCriterionInput{
				Value:    singleFirstCharacterRegex,
				Modifier: models.CriterionModifierMatchesRegex,
			},
		}

		found, _, err := reader.Query(ctx, filter, &models.FindFilterType{PerPage: &pageSize})
		if err != nil {
			c.performersErr = err
			return
		}
		if len(found) == 0 {
			found = make([]*models.Performer, 0)
		}
		c.singleCharPerformers = found
	}
	c.performersOnce.Do(load)

	return c.singleCharPerformers, c.performersErr
}
// getSingleLetterStudios returns all studios whose names start with a
// single-character word (names matching singleFirstCharacterRegex).
// See getSingleLetterPerformers for the rationale and caching behaviour: the
// query runs at most once per cache, and both its result and its error are
// remembered.
//
// NOTE(review): only the studio name is queried here; unlike
// getSingleLetterTags, aliases are not part of the filter — confirm this is
// intended.
func getSingleLetterStudios(ctx context.Context, c *Cache, reader models.StudioAutoTagQueryer) ([]*models.Studio, error) {
	if c == nil {
		c = &Cache{}
	}

	load := func() {
		pageSize := -1
		filter := &models.StudioFilterType{
			Name: &models.StringCriterionInput{
				Value:    singleFirstCharacterRegex,
				Modifier: models.CriterionModifierMatchesRegex,
			},
		}

		found, _, err := reader.Query(ctx, filter, &models.FindFilterType{PerPage: &pageSize})
		if err != nil {
			c.studiosErr = err
			return
		}
		if len(found) == 0 {
			found = make([]*models.Studio, 0)
		}
		c.singleCharStudios = found
	}
	c.studiosOnce.Do(load)

	return c.singleCharStudios, c.studiosErr
}
// getSingleLetterTags returns all tags whose name OR aliases start with a
// single-character word (matching singleFirstCharacterRegex).
// See getSingleLetterPerformers for the rationale and caching behaviour: the
// query runs at most once per cache, and both its result and its error are
// remembered.
func getSingleLetterTags(ctx context.Context, c *Cache, reader models.TagAutoTagQueryer) ([]*models.Tag, error) {
	if c == nil {
		c = &Cache{}
	}

	load := func() {
		pageSize := -1
		// Alternative branch of the filter: match on aliases instead of name.
		byAlias := &models.TagFilterType{
			Aliases: &models.StringCriterionInput{
				Value:    singleFirstCharacterRegex,
				Modifier: models.CriterionModifierMatchesRegex,
			},
		}
		filter := &models.TagFilterType{
			Name: &models.StringCriterionInput{
				Value:    singleFirstCharacterRegex,
				Modifier: models.CriterionModifierMatchesRegex,
			},
			OperatorFilter: models.OperatorFilter[models.TagFilterType]{
				Or: byAlias,
			},
		}

		found, _, err := reader.Query(ctx, filter, &models.FindFilterType{PerPage: &pageSize})
		if err != nil {
			c.tagsErr = err
			return
		}
		if len(found) == 0 {
			found = make([]*models.Tag, 0)
		}
		c.singleCharTags = found
	}
	c.tagsOnce.Do(load)

	return c.singleCharTags, c.tagsErr
}