package match

import (
	"context"
	"regexp"
	"strings"
	"sync"

	"github.com/stashapp/stash/pkg/models"
)

// singleFirstCharacterRegex matches a string whose first character is a
// single letter immediately followed by '.', '-', '_' or a space.
const singleFirstCharacterRegex = `^[\p{L}][.\-_ ]`

// singleFirstCharacterRE is the compiled form of singleFirstCharacterRegex,
// built once at package scope.
var singleFirstCharacterRE = regexp.MustCompile(singleFirstCharacterRegex)

// firstTwoRunesLower lowercases s and returns its first two runes as a
// string, or "" when s holds fewer than two runes. The result is the key
// format used by the prefix indexes; it is meant to line up with the
// two-rune path words produced by getPathWords (defined elsewhere).
func firstTwoRunesLower(s string) string {
	runes := []rune(strings.ToLower(s))
	if len(runes) >= 2 {
		return string(runes[:2])
	}
	return ""
}

// performerCandidates collects the preloaded performers worth regex-checking
// for the given path words: every performer indexed under a lowercased path
// word, followed by every performer in the always-check list. Each performer
// appears at most once. It stands in for the SQL
// `name LIKE 'xx%' OR name LIKE 'yy%'` prefilter; the always-check list
// covers names that start with a single-letter word, which a two-rune prefix
// lookup cannot reach.
//
// Returns nil when neither the prefix index nor the always-check list has
// any entries.
func (c *Cache) performerCandidates(pathWords []string) []*models.Performer {
	if len(c.performerByPrefix) == 0 && len(c.performerAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := len(pathWords) * 2
	visited := make(map[int]bool, sizeHint)
	candidates := make([]*models.Performer, 0, sizeHint)

	for _, word := range pathWords {
		for _, performer := range c.performerByPrefix[strings.ToLower(word)] {
			if visited[performer.ID] {
				continue
			}
			visited[performer.ID] = true
			candidates = append(candidates, performer)
		}
	}

	for _, performer := range c.performerAlwaysCheck {
		if visited[performer.ID] {
			continue
		}
		visited[performer.ID] = true
		candidates = append(candidates, performer)
	}

	return candidates
}

// studioCandidates is the studio counterpart of performerCandidates: it
// returns, without duplicates, the cached studios indexed under any
// lowercased path word followed by the always-check studios. Returns nil
// when both the prefix index and the always-check list are empty.
func (c *Cache) studioCandidates(pathWords []string) []cachedStudio {
	if len(c.studioByPrefix) == 0 && len(c.studioAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := len(pathWords) * 2
	visited := make(map[int]bool, sizeHint)
	candidates := make([]cachedStudio, 0, sizeHint)

	for _, word := range pathWords {
		for _, studio := range c.studioByPrefix[strings.ToLower(word)] {
			if visited[studio.Studio.ID] {
				continue
			}
			visited[studio.Studio.ID] = true
			candidates = append(candidates, studio)
		}
	}

	for _, studio := range c.studioAlwaysCheck {
		if visited[studio.Studio.ID] {
			continue
		}
		visited[studio.Studio.ID] = true
		candidates = append(candidates, studio)
	}

	return candidates
}

// tagCandidates is the tag counterpart of performerCandidates: it returns,
// without duplicates, the cached tags indexed under any lowercased path word
// followed by the always-check tags. Returns nil when both the prefix index
// and the always-check list are empty.
func (c *Cache) tagCandidates(pathWords []string) []cachedTag {
	if len(c.tagByPrefix) == 0 && len(c.tagAlwaysCheck) == 0 {
		return nil
	}

	sizeHint := len(pathWords) * 2
	visited := make(map[int]bool, sizeHint)
	candidates := make([]cachedTag, 0, sizeHint)

	for _, word := range pathWords {
		for _, tag := range c.tagByPrefix[strings.ToLower(word)] {
			if visited[tag.Tag.ID] {
				continue
			}
			visited[tag.Tag.ID] = true
			candidates = append(candidates, tag)
		}
	}

	for _, tag := range c.tagAlwaysCheck {
		if visited[tag.Tag.ID] {
			continue
		}
		visited[tag.Tag.ID] = true
		candidates = append(candidates, tag)
	}

	return candidates
}

// Cache is used to cache queries that should not change across an autotag
// process. Safe for concurrent use by multiple goroutines.
//
// NOTE(review): the Preload* methods assign the preload fields without any
// locking, so they presumably have to finish before the cache is shared
// between goroutines — confirm against the callers.
type Cache struct {
	// One-shot guards and captured errors for the single-letter queries.
	performersOnce sync.Once
	performersErr  error
	studiosOnce    sync.Once
	studiosErr     error
	tagsOnce       sync.Once
	tagsErr        error

	// Results of the single-letter queries, filled in under the Once
	// guards above.
	singleCharPerformers []*models.Performer
	singleCharStudios    []*models.Studio
	singleCharTags       []*models.Tag

	// Preloaded candidate sets. When populated (via PreloadX), the
	// PathTo* functions skip the per-path QueryForAutoTag DB roundtrip
	// and consult the in-memory prefix index instead. Nil means
	// "not preloaded, fall back to the old SQL-prefilter path".
	allPerformers []*models.Performer
	allStudios    []cachedStudio
	allTags       []cachedTag

	// Prefix indexes built at preload time. Map key is the first two
	// lowercased runes of name (or alias, for studios/tags). The
	// *AlwaysCheck slices hold entries whose first "word" is a single
	// letter — they wouldn't be reached by 2-rune path word lookup, so
	// they must always be checked (mirroring the existing single-letter
	// regex query).
	performerByPrefix    map[string][]*models.Performer
	performerAlwaysCheck []*models.Performer
	studioByPrefix       map[string][]cachedStudio
	studioAlwaysCheck    []cachedStudio
	tagByPrefix          map[string][]cachedTag
	tagAlwaysCheck       []cachedTag

	// regexpCache maps regexpCacheKey → *regexp.Regexp. sync.Map rather
	// than the hashicorp LRU used in pkg/sqlite/regex.go: this cache is
	// job-scoped (so LRU's eviction buys us nothing) and is hit by every
	// worker on every candidate (so a single-mutex Get becomes the
	// bottleneck). sync.Map's read-optimised path sidesteps that.
	regexpCache sync.Map
}

// cachedStudio pairs a studio with its aliases so that PathToStudio can
// match against both without issuing a GetAliases query per studio.
type cachedStudio struct {
	Studio  *models.Studio
	Aliases []string
}

// cachedTag pairs a tag with its aliases so that PathToTags can match
// against both without issuing a GetAliases query per tag.
type cachedTag struct {
	Tag     *models.Tag
	Aliases []string
}

// PreloadPerformers queries every performer with IgnoreAutoTag == false
// (PerPage -1), stores the result on the cache and builds the two-rune
// name-prefix index plus the always-check list of performers whose name
// matches singleFirstCharacterRE.
//
// The call is a no-op once allPerformers is non-nil; an empty query result
// is stored as an empty, non-nil slice so it also counts as preloaded.
// Any error from reader.Query is returned unchanged and leaves the cache
// untouched.
func (c *Cache) PreloadPerformers(ctx context.Context, reader models.PerformerAutoTagQueryer) error {
	if c.allPerformers != nil {
		return nil
	}

	var (
		ignoreAutoTag = false
		perPage       = -1
	)
	loaded, _, err := reader.Query(ctx,
		&models.PerformerFilterType{IgnoreAutoTag: &ignoreAutoTag},
		&models.FindFilterType{PerPage: &perPage},
	)
	if err != nil {
		return err
	}
	// Normalise nil to empty so the "already preloaded" check above works
	// even when there are no performers.
	if loaded == nil {
		loaded = []*models.Performer{}
	}

	c.allPerformers = loaded
	c.performerByPrefix = make(map[string][]*models.Performer, len(loaded))
	for _, performer := range loaded {
		if key := firstTwoRunesLower(performer.Name); key != "" {
			c.performerByPrefix[key] = append(c.performerByPrefix[key], performer)
		}
		if singleFirstCharacterRE.MatchString(performer.Name) {
			c.performerAlwaysCheck = append(c.performerAlwaysCheck, performer)
		}
	}

	return nil
}

// loadAllAliases returns aliases keyed by entity id. When reader also
// implements models.AllAliasLoader, its GetAllAliases result is returned
// directly (one call instead of one per id); otherwise GetAliases is called
// for each id in ids and only ids with at least one alias get a map entry.
func loadAllAliases(ctx context.Context, reader models.AliasLoader, ids []int) (map[int][]string, error) { if bulk, ok := reader.(models.AllAliasLoader); ok { return bulk.GetAllAliases(ctx) } ret := make(map[int][]string, len(ids)) for _, id := range ids { a, err := reader.GetAliases(ctx, id) if err != nil { return nil, err } if len(a) > 0 { ret[id] = a } } return ret, nil } // PreloadStudios loads all non-ignored studios plus their aliases into the // cache and builds a 2-rune prefix index (over names AND aliases, mirroring // the SQL LEFT JOIN on studio_aliases). func (c *Cache) PreloadStudios(ctx context.Context, reader models.StudioAutoTagQueryer) error { if c.allStudios != nil { return nil } ignoreAutoTag := false perPage := -1 studios, _, err := reader.Query(ctx, &models.StudioFilterType{ IgnoreAutoTag: &ignoreAutoTag, }, &models.FindFilterType{PerPage: &perPage}) if err != nil { return err } ids := make([]int, len(studios)) for i, s := range studios { ids[i] = s.ID } aliasesByID, err := loadAllAliases(ctx, reader, ids) if err != nil { return err } out := make([]cachedStudio, len(studios)) c.studioByPrefix = make(map[string][]cachedStudio, len(studios)) seenPerPrefix := make(map[string]map[int]bool) for i, s := range studios { aliases := aliasesByID[s.ID] cs := cachedStudio{Studio: s, Aliases: aliases} out[i] = cs c.indexByPrefix(s.ID, s.Name, aliases, seenPerPrefix, func(prefix string) { c.studioByPrefix[prefix] = append(c.studioByPrefix[prefix], cs) }) if hasSingleFirstChar(s.Name, aliases) { c.studioAlwaysCheck = append(c.studioAlwaysCheck, cs) } } c.allStudios = out return nil } // PreloadTags loads all non-ignored tags plus their aliases into the cache // and builds a 2-rune prefix index (over names AND aliases). 
func (c *Cache) PreloadTags(ctx context.Context, reader models.TagAutoTagQueryer) error { if c.allTags != nil { return nil } ignoreAutoTag := false perPage := -1 tags, _, err := reader.Query(ctx, &models.TagFilterType{ IgnoreAutoTag: &ignoreAutoTag, }, &models.FindFilterType{PerPage: &perPage}) if err != nil { return err } ids := make([]int, len(tags)) for i, t := range tags { ids[i] = t.ID } aliasesByID, err := loadAllAliases(ctx, reader, ids) if err != nil { return err } out := make([]cachedTag, len(tags)) c.tagByPrefix = make(map[string][]cachedTag, len(tags)) seenPerPrefix := make(map[string]map[int]bool) for i, t := range tags { aliases := aliasesByID[t.ID] ct := cachedTag{Tag: t, Aliases: aliases} out[i] = ct c.indexByPrefix(t.ID, t.Name, aliases, seenPerPrefix, func(prefix string) { c.tagByPrefix[prefix] = append(c.tagByPrefix[prefix], ct) }) if hasSingleFirstChar(t.Name, aliases) { c.tagAlwaysCheck = append(c.tagAlwaysCheck, ct) } } c.allTags = out return nil } // indexByPrefix records the entity under every distinct 2-rune prefix of // its name/aliases (deduping so a name+alias that share a prefix bucket // only add the entity once). func (c *Cache) indexByPrefix(id int, name string, aliases []string, seen map[string]map[int]bool, add func(prefix string)) { emit := func(s string) { prefix := firstTwoRunesLower(s) if prefix == "" { return } if seen[prefix] == nil { seen[prefix] = make(map[int]bool) } if !seen[prefix][id] { seen[prefix][id] = true add(prefix) } } emit(name) for _, a := range aliases { emit(a) } } func hasSingleFirstChar(name string, aliases []string) bool { if singleFirstCharacterRE.MatchString(name) { return true } for _, a := range aliases { if singleFirstCharacterRE.MatchString(a) { return true } } return false } type regexpCacheKey struct { name string useUnicode bool } // nameRegexp returns a compiled regexp for the given name, caching the // result so repeated autotag calls across many files don't pay the // compile cost each time. 
func (c *Cache) nameRegexp(name string, useUnicode bool) *regexp.Regexp { if c == nil { return nameToRegexp(name, useUnicode) } key := regexpCacheKey{name: name, useUnicode: useUnicode} if r, ok := c.regexpCache.Load(key); ok { return r.(*regexp.Regexp) } r := nameToRegexp(name, useUnicode) actual, _ := c.regexpCache.LoadOrStore(key, r) return actual.(*regexp.Regexp) } // getSingleLetterPerformers returns all performers with names that start with single character words. // The autotag query splits the words into two-character words to query // against. This means that performers with single-letter words in their names could potentially // be missed. // This query is expensive, so it's queried once and cached, if the cache if provided. func getSingleLetterPerformers(ctx context.Context, c *Cache, reader models.PerformerAutoTagQueryer) ([]*models.Performer, error) { if c == nil { c = &Cache{} } c.performersOnce.Do(func() { pp := -1 performers, _, err := reader.Query(ctx, &models.PerformerFilterType{ Name: &models.StringCriterionInput{ Value: singleFirstCharacterRegex, Modifier: models.CriterionModifierMatchesRegex, }, }, &models.FindFilterType{ PerPage: &pp, }) if err != nil { c.performersErr = err return } if len(performers) == 0 { c.singleCharPerformers = make([]*models.Performer, 0) } else { c.singleCharPerformers = performers } }) return c.singleCharPerformers, c.performersErr } // getSingleLetterStudios returns all studios with names that start with single character words. // See getSingleLetterPerformers for details. 
func getSingleLetterStudios(ctx context.Context, c *Cache, reader models.StudioAutoTagQueryer) ([]*models.Studio, error) { if c == nil { c = &Cache{} } c.studiosOnce.Do(func() { pp := -1 studios, _, err := reader.Query(ctx, &models.StudioFilterType{ Name: &models.StringCriterionInput{ Value: singleFirstCharacterRegex, Modifier: models.CriterionModifierMatchesRegex, }, }, &models.FindFilterType{ PerPage: &pp, }) if err != nil { c.studiosErr = err return } if len(studios) == 0 { c.singleCharStudios = make([]*models.Studio, 0) } else { c.singleCharStudios = studios } }) return c.singleCharStudios, c.studiosErr } // getSingleLetterTags returns all tags with names that start with single character words. // See getSingleLetterPerformers for details. func getSingleLetterTags(ctx context.Context, c *Cache, reader models.TagAutoTagQueryer) ([]*models.Tag, error) { if c == nil { c = &Cache{} } c.tagsOnce.Do(func() { pp := -1 tags, _, err := reader.Query(ctx, &models.TagFilterType{ Name: &models.StringCriterionInput{ Value: singleFirstCharacterRegex, Modifier: models.CriterionModifierMatchesRegex, }, OperatorFilter: models.OperatorFilter[models.TagFilterType]{ Or: &models.TagFilterType{ Aliases: &models.StringCriterionInput{ Value: singleFirstCharacterRegex, Modifier: models.CriterionModifierMatchesRegex, }, }, }, }, &models.FindFilterType{ PerPage: &pp, }) if err != nil { c.tagsErr = err return } if len(tags) == 0 { c.singleCharTags = make([]*models.Tag, 0) } else { c.singleCharTags = tags } }) return c.singleCharTags, c.tagsErr }