mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
The preceding commit added lru.Cache for the compiled-regexp cache to match the style in pkg/sqlite/regex.go. That file's use case is different: a small bounded cache serving a read-dominated workload. The auto-tag regexp cache is job-scoped (so eviction buys us nothing) and hit by every worker on every candidate (so the LRU's per-Get mutex becomes contention, measurable under the parallel worker pool). sync.Map's read-optimised path avoids the contention without changing any observable behavior. Kept as a separate commit so it can be reverted independently if upstream prefers the LRU approach — the first commit stands on its own either way.
463 lines
13 KiB
Go
463 lines
13 KiB
Go
package match
|
|
|
|
import (
	"context"
	"regexp"
	"strings"
	"sync"
	"unicode"
	"unicode/utf8"

	"github.com/stashapp/stash/pkg/models"
)
|
|
|
|
// singleFirstCharacterRegex matches names whose first word is a single
// letter followed by a separator (".", "-", "_" or space) — e.g. "J. Doe".
// Such names can't be found via the 2-rune prefix index, so they are
// queried/checked separately.
const singleFirstCharacterRegex = `^[\p{L}][.\-_ ]`

// singleFirstCharacterRE is the compiled form, used for the in-memory
// always-check classification at preload time.
var singleFirstCharacterRE = regexp.MustCompile(singleFirstCharacterRegex)
|
|
|
|
// firstTwoRunesLower returns the first two runes of s, lowercased. Returns
// "" if s has fewer than two runes. Mirrors what getPathWords produces for
// path words, so the two can be compared as index keys.
//
// Decodes only the first two runes instead of lowercasing the whole string
// and building a full []rune copy: this runs for every name and alias at
// preload time, and strings.ToLower itself lowercases runes one at a time
// with unicode.ToLower, so the result is identical.
func firstTwoRunesLower(s string) string {
	r0, n0 := utf8.DecodeRuneInString(s)
	if n0 == 0 {
		// Empty string.
		return ""
	}
	r1, n1 := utf8.DecodeRuneInString(s[n0:])
	if n1 == 0 {
		// Only one rune.
		return ""
	}
	return string(unicode.ToLower(r0)) + string(unicode.ToLower(r1))
}
|
|
|
|
// performerCandidates returns the set of preloaded performers that should
|
|
// be regex-checked for the given path words. Mirrors the SQL
|
|
// `name LIKE 'xx%' OR name LIKE 'yy%'` prefilter, plus always-check
|
|
// performers whose name begins with a single-letter word (which the 2-rune
|
|
// prefix lookup can't reach).
|
|
func (c *Cache) performerCandidates(pathWords []string) []*models.Performer {
|
|
if len(c.performerByPrefix) == 0 && len(c.performerAlwaysCheck) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[int]bool, len(pathWords)*2)
|
|
out := make([]*models.Performer, 0, len(pathWords)*2)
|
|
for _, w := range pathWords {
|
|
key := strings.ToLower(w)
|
|
for _, p := range c.performerByPrefix[key] {
|
|
if !seen[p.ID] {
|
|
seen[p.ID] = true
|
|
out = append(out, p)
|
|
}
|
|
}
|
|
}
|
|
for _, p := range c.performerAlwaysCheck {
|
|
if !seen[p.ID] {
|
|
seen[p.ID] = true
|
|
out = append(out, p)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (c *Cache) studioCandidates(pathWords []string) []cachedStudio {
|
|
if len(c.studioByPrefix) == 0 && len(c.studioAlwaysCheck) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[int]bool, len(pathWords)*2)
|
|
out := make([]cachedStudio, 0, len(pathWords)*2)
|
|
for _, w := range pathWords {
|
|
key := strings.ToLower(w)
|
|
for _, s := range c.studioByPrefix[key] {
|
|
if !seen[s.Studio.ID] {
|
|
seen[s.Studio.ID] = true
|
|
out = append(out, s)
|
|
}
|
|
}
|
|
}
|
|
for _, s := range c.studioAlwaysCheck {
|
|
if !seen[s.Studio.ID] {
|
|
seen[s.Studio.ID] = true
|
|
out = append(out, s)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (c *Cache) tagCandidates(pathWords []string) []cachedTag {
|
|
if len(c.tagByPrefix) == 0 && len(c.tagAlwaysCheck) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[int]bool, len(pathWords)*2)
|
|
out := make([]cachedTag, 0, len(pathWords)*2)
|
|
for _, w := range pathWords {
|
|
key := strings.ToLower(w)
|
|
for _, t := range c.tagByPrefix[key] {
|
|
if !seen[t.Tag.ID] {
|
|
seen[t.Tag.ID] = true
|
|
out = append(out, t)
|
|
}
|
|
}
|
|
}
|
|
for _, t := range c.tagAlwaysCheck {
|
|
if !seen[t.Tag.ID] {
|
|
seen[t.Tag.ID] = true
|
|
out = append(out, t)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// Cache is used to cache queries that should not change across an autotag
// process. Safe for concurrent use by multiple goroutines.
//
// NOTE(review): the Preload* methods write the preloaded fields without
// synchronization — they appear intended to run before concurrent workers
// start; confirm callers uphold that ordering.
type Cache struct {
	// One-time guards for the lazy single-letter queries run by
	// getSingleLetterPerformers/Studios/Tags; each query runs at most
	// once per Cache, and its error (if any) is remembered alongside
	// and returned on every subsequent call.
	performersOnce sync.Once
	performersErr  error
	studiosOnce    sync.Once
	studiosErr     error
	tagsOnce       sync.Once
	tagsErr        error

	// Cached results of the single-letter-name queries. Non-nil
	// (possibly empty) once the corresponding Once has fired without
	// error.
	singleCharPerformers []*models.Performer
	singleCharStudios    []*models.Studio
	singleCharTags       []*models.Tag

	// Preloaded candidate sets. When populated (via PreloadX), the
	// PathTo* functions skip the per-path QueryForAutoTag DB roundtrip
	// and consult the in-memory prefix index instead. Nil means
	// "not preloaded, fall back to the old SQL-prefilter path".
	allPerformers []*models.Performer
	allStudios    []cachedStudio
	allTags       []cachedTag

	// Prefix indexes built at preload time. Map key is the first two
	// lowercased runes of name (or alias, for studios/tags). The
	// alwaysCandidate slice holds entries whose first "word" is a
	// single letter — they wouldn't be reached by 2-rune path word
	// lookup, so they must always be checked (mirroring the existing
	// single-letter regex query).
	performerByPrefix    map[string][]*models.Performer
	performerAlwaysCheck []*models.Performer
	studioByPrefix       map[string][]cachedStudio
	studioAlwaysCheck    []cachedStudio
	tagByPrefix          map[string][]cachedTag
	tagAlwaysCheck       []cachedTag

	// regexpCache maps regexpCacheKey → *regexp.Regexp. sync.Map rather
	// than the hashicorp LRU used in pkg/sqlite/regex.go: this cache is
	// job-scoped (so LRU's eviction buys us nothing) and is hit by every
	// worker on every candidate (so a single-mutex Get becomes the
	// bottleneck). sync.Map's read-optimised path sidesteps that.
	regexpCache sync.Map
}
|
|
|
|
// cachedStudio bundles a studio with its aliases so PathToStudio can match
// against both without an N+1 GetAliases query.
type cachedStudio struct {
	// Studio is the preloaded studio record.
	Studio *models.Studio
	// Aliases are the studio's aliases as loaded at preload time; may
	// be nil when the studio has none.
	Aliases []string
}
|
|
|
|
// cachedTag bundles a tag with its aliases so PathToTags can match against
// both without an N+1 GetAliases query.
type cachedTag struct {
	// Tag is the preloaded tag record.
	Tag *models.Tag
	// Aliases are the tag's aliases as loaded at preload time; may be
	// nil when the tag has none.
	Aliases []string
}
|
|
|
|
// PreloadPerformers loads all non-ignored performers into the cache and
|
|
// builds a 2-rune prefix index so subsequent PathToPerformers calls can
|
|
// skip both the per-path QueryForAutoTag and the per-candidate regex
|
|
// when no prefix matches.
|
|
func (c *Cache) PreloadPerformers(ctx context.Context, reader models.PerformerAutoTagQueryer) error {
|
|
if c.allPerformers != nil {
|
|
return nil
|
|
}
|
|
ignoreAutoTag := false
|
|
perPage := -1
|
|
perfs, _, err := reader.Query(ctx, &models.PerformerFilterType{
|
|
IgnoreAutoTag: &ignoreAutoTag,
|
|
}, &models.FindFilterType{PerPage: &perPage})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if perfs == nil {
|
|
perfs = []*models.Performer{}
|
|
}
|
|
c.allPerformers = perfs
|
|
|
|
c.performerByPrefix = make(map[string][]*models.Performer, len(perfs))
|
|
for _, p := range perfs {
|
|
if prefix := firstTwoRunesLower(p.Name); prefix != "" {
|
|
c.performerByPrefix[prefix] = append(c.performerByPrefix[prefix], p)
|
|
}
|
|
if singleFirstCharacterRE.MatchString(p.Name) {
|
|
c.performerAlwaysCheck = append(c.performerAlwaysCheck, p)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// loadAllAliases loads aliases for the given ids. Uses the reader's bulk
|
|
// GetAllAliases method when available (avoiding the N+1 per-id roundtrip);
|
|
// otherwise falls back to per-id GetAliases.
|
|
func loadAllAliases(ctx context.Context, reader models.AliasLoader, ids []int) (map[int][]string, error) {
|
|
if bulk, ok := reader.(models.AllAliasLoader); ok {
|
|
return bulk.GetAllAliases(ctx)
|
|
}
|
|
ret := make(map[int][]string, len(ids))
|
|
for _, id := range ids {
|
|
a, err := reader.GetAliases(ctx, id)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(a) > 0 {
|
|
ret[id] = a
|
|
}
|
|
}
|
|
return ret, nil
|
|
}
|
|
|
|
// PreloadStudios loads all non-ignored studios plus their aliases into the
|
|
// cache and builds a 2-rune prefix index (over names AND aliases, mirroring
|
|
// the SQL LEFT JOIN on studio_aliases).
|
|
func (c *Cache) PreloadStudios(ctx context.Context, reader models.StudioAutoTagQueryer) error {
|
|
if c.allStudios != nil {
|
|
return nil
|
|
}
|
|
ignoreAutoTag := false
|
|
perPage := -1
|
|
studios, _, err := reader.Query(ctx, &models.StudioFilterType{
|
|
IgnoreAutoTag: &ignoreAutoTag,
|
|
}, &models.FindFilterType{PerPage: &perPage})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ids := make([]int, len(studios))
|
|
for i, s := range studios {
|
|
ids[i] = s.ID
|
|
}
|
|
aliasesByID, err := loadAllAliases(ctx, reader, ids)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
out := make([]cachedStudio, len(studios))
|
|
c.studioByPrefix = make(map[string][]cachedStudio, len(studios))
|
|
seenPerPrefix := make(map[string]map[int]bool)
|
|
for i, s := range studios {
|
|
aliases := aliasesByID[s.ID]
|
|
cs := cachedStudio{Studio: s, Aliases: aliases}
|
|
out[i] = cs
|
|
|
|
c.indexByPrefix(s.ID, s.Name, aliases, seenPerPrefix, func(prefix string) {
|
|
c.studioByPrefix[prefix] = append(c.studioByPrefix[prefix], cs)
|
|
})
|
|
if hasSingleFirstChar(s.Name, aliases) {
|
|
c.studioAlwaysCheck = append(c.studioAlwaysCheck, cs)
|
|
}
|
|
}
|
|
c.allStudios = out
|
|
return nil
|
|
}
|
|
|
|
// PreloadTags loads all non-ignored tags plus their aliases into the cache
|
|
// and builds a 2-rune prefix index (over names AND aliases).
|
|
func (c *Cache) PreloadTags(ctx context.Context, reader models.TagAutoTagQueryer) error {
|
|
if c.allTags != nil {
|
|
return nil
|
|
}
|
|
ignoreAutoTag := false
|
|
perPage := -1
|
|
tags, _, err := reader.Query(ctx, &models.TagFilterType{
|
|
IgnoreAutoTag: &ignoreAutoTag,
|
|
}, &models.FindFilterType{PerPage: &perPage})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ids := make([]int, len(tags))
|
|
for i, t := range tags {
|
|
ids[i] = t.ID
|
|
}
|
|
aliasesByID, err := loadAllAliases(ctx, reader, ids)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
out := make([]cachedTag, len(tags))
|
|
c.tagByPrefix = make(map[string][]cachedTag, len(tags))
|
|
seenPerPrefix := make(map[string]map[int]bool)
|
|
for i, t := range tags {
|
|
aliases := aliasesByID[t.ID]
|
|
ct := cachedTag{Tag: t, Aliases: aliases}
|
|
out[i] = ct
|
|
|
|
c.indexByPrefix(t.ID, t.Name, aliases, seenPerPrefix, func(prefix string) {
|
|
c.tagByPrefix[prefix] = append(c.tagByPrefix[prefix], ct)
|
|
})
|
|
if hasSingleFirstChar(t.Name, aliases) {
|
|
c.tagAlwaysCheck = append(c.tagAlwaysCheck, ct)
|
|
}
|
|
}
|
|
c.allTags = out
|
|
return nil
|
|
}
|
|
|
|
// indexByPrefix records the entity under every distinct 2-rune prefix of
|
|
// its name/aliases (deduping so a name+alias that share a prefix bucket
|
|
// only add the entity once).
|
|
func (c *Cache) indexByPrefix(id int, name string, aliases []string, seen map[string]map[int]bool, add func(prefix string)) {
|
|
emit := func(s string) {
|
|
prefix := firstTwoRunesLower(s)
|
|
if prefix == "" {
|
|
return
|
|
}
|
|
if seen[prefix] == nil {
|
|
seen[prefix] = make(map[int]bool)
|
|
}
|
|
if !seen[prefix][id] {
|
|
seen[prefix][id] = true
|
|
add(prefix)
|
|
}
|
|
}
|
|
emit(name)
|
|
for _, a := range aliases {
|
|
emit(a)
|
|
}
|
|
}
|
|
|
|
func hasSingleFirstChar(name string, aliases []string) bool {
|
|
if singleFirstCharacterRE.MatchString(name) {
|
|
return true
|
|
}
|
|
for _, a := range aliases {
|
|
if singleFirstCharacterRE.MatchString(a) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// regexpCacheKey identifies a compiled regexp in Cache.regexpCache. Both
// fields matter: the same name compiles to a different pattern depending
// on the useUnicode flag (see nameToRegexp).
type regexpCacheKey struct {
	// name is the raw (uncompiled) entity name.
	name string
	// useUnicode selects the unicode-aware variant of the pattern.
	useUnicode bool
}
|
|
|
|
// nameRegexp returns a compiled regexp for the given name, caching the
|
|
// result so repeated autotag calls across many files don't pay the
|
|
// compile cost each time.
|
|
func (c *Cache) nameRegexp(name string, useUnicode bool) *regexp.Regexp {
|
|
if c == nil {
|
|
return nameToRegexp(name, useUnicode)
|
|
}
|
|
|
|
key := regexpCacheKey{name: name, useUnicode: useUnicode}
|
|
if r, ok := c.regexpCache.Load(key); ok {
|
|
return r.(*regexp.Regexp)
|
|
}
|
|
r := nameToRegexp(name, useUnicode)
|
|
actual, _ := c.regexpCache.LoadOrStore(key, r)
|
|
return actual.(*regexp.Regexp)
|
|
}
|
|
|
|
// getSingleLetterPerformers returns all performers with names that start with single character words.
|
|
// The autotag query splits the words into two-character words to query
|
|
// against. This means that performers with single-letter words in their names could potentially
|
|
// be missed.
|
|
// This query is expensive, so it's queried once and cached, if the cache if provided.
|
|
func getSingleLetterPerformers(ctx context.Context, c *Cache, reader models.PerformerAutoTagQueryer) ([]*models.Performer, error) {
|
|
if c == nil {
|
|
c = &Cache{}
|
|
}
|
|
|
|
c.performersOnce.Do(func() {
|
|
pp := -1
|
|
performers, _, err := reader.Query(ctx, &models.PerformerFilterType{
|
|
Name: &models.StringCriterionInput{
|
|
Value: singleFirstCharacterRegex,
|
|
Modifier: models.CriterionModifierMatchesRegex,
|
|
},
|
|
}, &models.FindFilterType{
|
|
PerPage: &pp,
|
|
})
|
|
|
|
if err != nil {
|
|
c.performersErr = err
|
|
return
|
|
}
|
|
|
|
if len(performers) == 0 {
|
|
c.singleCharPerformers = make([]*models.Performer, 0)
|
|
} else {
|
|
c.singleCharPerformers = performers
|
|
}
|
|
})
|
|
|
|
return c.singleCharPerformers, c.performersErr
|
|
}
|
|
|
|
// getSingleLetterStudios returns all studios with names that start with single character words.
|
|
// See getSingleLetterPerformers for details.
|
|
func getSingleLetterStudios(ctx context.Context, c *Cache, reader models.StudioAutoTagQueryer) ([]*models.Studio, error) {
|
|
if c == nil {
|
|
c = &Cache{}
|
|
}
|
|
|
|
c.studiosOnce.Do(func() {
|
|
pp := -1
|
|
studios, _, err := reader.Query(ctx, &models.StudioFilterType{
|
|
Name: &models.StringCriterionInput{
|
|
Value: singleFirstCharacterRegex,
|
|
Modifier: models.CriterionModifierMatchesRegex,
|
|
},
|
|
}, &models.FindFilterType{
|
|
PerPage: &pp,
|
|
})
|
|
|
|
if err != nil {
|
|
c.studiosErr = err
|
|
return
|
|
}
|
|
|
|
if len(studios) == 0 {
|
|
c.singleCharStudios = make([]*models.Studio, 0)
|
|
} else {
|
|
c.singleCharStudios = studios
|
|
}
|
|
})
|
|
|
|
return c.singleCharStudios, c.studiosErr
|
|
}
|
|
|
|
// getSingleLetterTags returns all tags with names that start with single character words.
|
|
// See getSingleLetterPerformers for details.
|
|
func getSingleLetterTags(ctx context.Context, c *Cache, reader models.TagAutoTagQueryer) ([]*models.Tag, error) {
|
|
if c == nil {
|
|
c = &Cache{}
|
|
}
|
|
|
|
c.tagsOnce.Do(func() {
|
|
pp := -1
|
|
tags, _, err := reader.Query(ctx, &models.TagFilterType{
|
|
Name: &models.StringCriterionInput{
|
|
Value: singleFirstCharacterRegex,
|
|
Modifier: models.CriterionModifierMatchesRegex,
|
|
},
|
|
OperatorFilter: models.OperatorFilter[models.TagFilterType]{
|
|
Or: &models.TagFilterType{
|
|
Aliases: &models.StringCriterionInput{
|
|
Value: singleFirstCharacterRegex,
|
|
Modifier: models.CriterionModifierMatchesRegex,
|
|
},
|
|
},
|
|
},
|
|
}, &models.FindFilterType{
|
|
PerPage: &pp,
|
|
})
|
|
|
|
if err != nil {
|
|
c.tagsErr = err
|
|
return
|
|
}
|
|
|
|
if len(tags) == 0 {
|
|
c.singleCharTags = make([]*models.Tag, 0)
|
|
} else {
|
|
c.singleCharTags = tags
|
|
}
|
|
})
|
|
|
|
return c.singleCharTags, c.tagsErr
|
|
}
|