stash/pkg/match/path.go
abdusalam.dihan cd64433dc5 Speed up file-based auto-tag
Replaces the per-file SQL QueryForAutoTag prefilter with an in-memory
2-rune prefix index over performers/studios/tags, preloaded once at job
start. Also:

  - runs file processing through job.TaskQueue so scenes/images/
    galleries tag in parallel instead of one file at a time
  - keyset-paginates the query loop so batch N+1 doesn't pay the
    O(offset) scan past large tables
  - bulk-loads studio/tag aliases via a new optional AllAliasLoader
    interface, avoiding N+1 GetAliases calls during preload
  - caches compiled name regexps (same candidate names repeat across
    thousands of files)
  - hoists strings.ToLower(path) and allASCII(path) out of the per-
    candidate match loop
  - opens a fresh write txn per applied match instead of holding one
    for every tagger phase

Tagger gains *AtPath methods that own the cache + txn manager, letting
the task code stay slim.
2026-04-19 22:22:37 +01:00

544 lines
14 KiB
Go

// Package match provides functions for matching paths to models.
package match
import (
"context"
"fmt"
"path/filepath"
"regexp"
"strings"
"unicode"
"unicode/utf8"
"github.com/stashapp/stash/pkg/gallery"
"github.com/stashapp/stash/pkg/image"
"github.com/stashapp/stash/pkg/models"
"github.com/stashapp/stash/pkg/scene"
"github.com/stashapp/stash/pkg/sliceutil"
)
// Regexp fragments shared by the path-matching helpers in this file.
const (
// separatorChars lists the characters allowed between the words of a name.
// It is spliced into a character class (`[` + separatorChars + `]`), which
// is why the hyphen is escaped.
separatorChars = `.\-_ `
// separatorPattern matches a run of one or more underscores or characters
// that are not a unicode letter, word character or digit. getPathWords
// replaces every such run with a single space.
separatorPattern = `(?:_|[^\p{L}\w\d])+`
// reNotLetterWordUnicode matches a single character that is not a unicode
// letter, word character or digit. nameToRegexp uses it as the name
// boundary when useUnicode is true.
reNotLetterWordUnicode = `[^\p{L}\w\d]`
// reNotLetterWord is the variant of reNotLetterWordUnicode without \p{L},
// used by nameToRegexp when useUnicode is false.
reNotLetterWord = `[^\w\d]`
)
// separatorRE is separatorPattern compiled once at package initialisation.
var separatorRE = regexp.MustCompile(separatorPattern)
// getPathQueryRegex builds the regexp source string used to pre-filter paths
// by name in a query. Regexp metacharacters in name are escaped, each space
// in name may be matched by any number of separator characters, and the whole
// name must be delimited by the start/end of the string, an underscore, or a
// character that is neither a unicode letter nor a digit.
func getPathQueryRegex(name string) string {
	const (
		separator     = `[` + separatorChars + `]`
		leadingBound  = `(?:^|_|[^\p{L}\d])`
		trailingBound = `(?:$|_|[^\p{L}\d])`
	)

	// escape first so that the separator class inserted below is not quoted
	escaped := regexp.QuoteMeta(name)
	body := strings.ReplaceAll(escaped, " ", separator+"*")

	return leadingBound + body + trailingBound
}
// getPathWords breaks path into words on separatorRE and returns the first
// two runes of every word that is longer than one rune, de-duplicated through
// sliceutil.AppendUnique. When trimExt is true the file extension is removed
// from path before splitting.
func getPathWords(path string, trimExt bool) []string {
	if trimExt {
		// trimming an empty extension leaves the path untouched
		path = strings.TrimSuffix(path, filepath.Ext(path))
	}

	// collapse every run of separator characters into a single space
	spaced := separatorRE.ReplaceAllString(path, " ")

	var prefixes []string
	for _, word := range strings.Split(spaced, " ") {
		// empty and single letter words are dropped
		if utf8.RuneCountInString(word) <= 1 {
			continue
		}

		// #1450 - the criteria are deliberately loose so that a name such as
		// "foo bar" can still be found in a path containing "foobar". The
		// candidates are post-matched afterwards, so only the first two
		// characters of each word are kept.
		// #2293 - slice by rune rather than by byte, otherwise a multi-byte
		// character could be cut in half and corrupt the result.
		prefix := string([]rune(word)[:2])
		prefixes = sliceutil.AppendUnique(prefixes, prefix)
	}

	return prefixes
}
// allASCII reports whether every byte of s is within the ASCII range.
// An empty string is considered all-ASCII.
//
// Adapted from https://stackoverflow.com/a/53069799
func allASCII(s string) bool {
	for _, b := range []byte(s) {
		if b > unicode.MaxASCII {
			return false
		}
	}

	return true
}
// nameMatchesPath returns the index in the path for the right-most match.
// Returns -1 if not found.
func nameMatchesPath(name, path string) int {
// #2363 - optimisation: only use unicode character regexp if path contains
// unicode characters
re := nameToRegexp(name, !allASCII(path))
return regexpMatchesPath(re, path)
}
// pathMatcher holds per-path precomputed values so they aren't recomputed
// for every candidate name. `allASCII` and `strings.ToLower(path)` were
// running once per (candidate, file) pair before; under a worker pool with
// thousands of candidates per file that was the dominant allocation.
//
// Values are built with newPathMatcher.
type pathMatcher struct {
// loweredPath is the lower-cased path that candidate names are matched
// against.
loweredPath string
// useUnicode is true when the path contains at least one non-ASCII byte;
// it is passed on to the cache when requesting a name regexp.
useUnicode bool
// cache supplies the name regexps used by match.
cache *Cache
}
// newPathMatcher precomputes the values that are needed to match any number
// of candidate names against path: the lower-cased path and whether the path
// contains non-ASCII characters. cache is stored as given and is used by
// match to obtain regexps.
func newPathMatcher(path string, cache *Cache) pathMatcher {
	var pm pathMatcher

	pm.cache = cache
	pm.useUnicode = !allASCII(path)
	pm.loweredPath = strings.ToLower(path)

	return pm
}
// match returns the start index of the right-most match of name within the
// lower-cased path, or -1 when name does not match.
//
// The regexp comes from the cache's nameRegexp method so that, per the
// original design note, a name is compiled once per autotag run instead of
// once per file.
//
// NOTE(review): m.cache is whatever the caller handed to newPathMatcher and
// the PathTo* callers accept a nil cache - confirm that Cache.nameRegexp
// tolerates a nil receiver.
func (m *pathMatcher) match(name string) int {
	re := m.cache.nameRegexp(name, m.useUnicode)

	matches := re.FindAllStringIndex(m.loweredPath, -1)
	if len(matches) == 0 {
		return -1
	}

	// only the last (right-most) match is of interest
	last := matches[len(matches)-1]
	return last[0]
}
// nameToRegexp compiles the regexp used to find name within a lower-cased
// path. Regexp metacharacters in name are escaped, the name is lower-cased,
// and every space in it may be matched by any number of separator characters.
// The name must be delimited on both sides by the start/end of the string, an
// underscore, or a non-word character.
//
// Set useUnicode to true if the regexp is to be used on any strings with
// unicode characters.
func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
	const separator = `[` + separatorChars + `]`

	// performance optimisation: only use \p{L} if useUnicode is true
	boundary := reNotLetterWord
	if useUnicode {
		boundary = reNotLetterWordUnicode
	}

	// escape specific regex characters before inserting the separator class
	escaped := strings.ToLower(regexp.QuoteMeta(name))
	body := strings.ReplaceAll(escaped, " ", separator+"*")

	return regexp.MustCompile(`(?:^|_|` + boundary + `)` + body + `(?:$|_|` + boundary + `)`)
}
// regexpMatchesPath lower-cases path and returns the start index of the
// right-most match of r within it, or -1 when r does not match at all.
func regexpMatchesPath(r *regexp.Regexp, path string) int {
	matches := r.FindAllStringIndex(strings.ToLower(path), -1)
	if len(matches) == 0 {
		return -1
	}

	last := matches[len(matches)-1]
	return last[0]
}
// getPerformers returns the candidate performers for the given path words:
// the result of performerReader.QueryForAutoTag followed by the performers
// returned by getSingleLetterPerformers. The first error encountered is
// returned as-is.
func getPerformers(ctx context.Context, words []string, performerReader models.PerformerAutoTagQueryer, cache *Cache) ([]*models.Performer, error) {
	queried, err := performerReader.QueryForAutoTag(ctx, words)
	if err != nil {
		return nil, err
	}

	singleLetter, err := getSingleLetterPerformers(ctx, cache, performerReader)
	if err != nil {
		return nil, err
	}

	queried = append(queried, singleLetter...)
	return queried, nil
}
// PathToPerformers returns the performers whose name matches the given path.
//
// If the cache holds a preloaded performer set (cache.allPerformers is
// non-nil), candidates are taken from the cache's in-memory index keyed by
// the path words - this is the route used by the bulk file-based auto-tag
// job. In every other case (for example the built-in scraper, which handles
// one scene per request) candidates are fetched per call through
// reader.QueryForAutoTag.
//
// Each candidate is then confirmed by matching its name against the path.
func PathToPerformers(ctx context.Context, path string, reader models.PerformerAutoTagQueryer, cache *Cache, trimExt bool) ([]*models.Performer, error) {
	words := getPathWords(path, trimExt)

	var candidates []*models.Performer
	if cache != nil && cache.allPerformers != nil {
		candidates = cache.performerCandidates(words)
	} else {
		queried, err := getPerformers(ctx, words, reader, cache)
		if err != nil {
			return nil, err
		}
		candidates = queried
	}

	matcher := newPathMatcher(path, cache)

	var ret []*models.Performer
	for _, candidate := range candidates {
		matched := matcher.match(candidate.Name) != -1

		// TODO - disabled alias matching until we can get finer
		// control over the matching. To re-enable:
		//   - uncomment this block (fallback path)
		//   - have Cache.PreloadPerformers load aliases (e.g. via
		//     loadAllAliases, as PreloadStudios/PreloadTags do) and
		//     iterate them here in the preloaded path too
		// if !matched {
		// 	if err := candidate.LoadAliases(ctx, reader); err != nil {
		// 		return nil, err
		// 	}
		// 	for _, alias := range candidate.Aliases.List() {
		// 		if matcher.match(alias) != -1 {
		// 			matched = true
		// 			break
		// 		}
		// 	}
		// }

		if matched {
			ret = append(ret, candidate)
		}
	}

	return ret, nil
}
// getStudios returns the candidate studios for the given path words: the
// result of reader.QueryForAutoTag followed by the studios returned by
// getSingleLetterStudios. The first error encountered is returned as-is.
func getStudios(ctx context.Context, words []string, reader models.StudioAutoTagQueryer, cache *Cache) ([]*models.Studio, error) {
	queried, err := reader.QueryForAutoTag(ctx, words)
	if err != nil {
		return nil, err
	}

	singleLetter, err := getSingleLetterStudios(ctx, cache, reader)
	if err != nil {
		return nil, err
	}

	queried = append(queried, singleLetter...)
	return queried, nil
}
// PathToStudio returns the studio whose name or one of whose aliases matches
// the given path, or nil when nothing matches. When several names match, the
// one whose match starts furthest to the right wins; on equal positions the
// earlier candidate is kept.
//
// If the cache holds a preloaded studio set (cache.allStudios is non-nil),
// candidates and their aliases come from the cache. Otherwise candidates are
// fetched through reader.QueryForAutoTag and aliases through
// reader.GetAliases, one call per candidate.
func PathToStudio(ctx context.Context, path string, reader models.StudioAutoTagQueryer, cache *Cache, trimExt bool) (*models.Studio, error) {
	matcher := newPathMatcher(path, cache)
	words := getPathWords(path, trimExt)

	var best *models.Studio
	bestIndex := -1

	// consider records studio as the current winner if name matches further
	// right than anything seen so far. A failed match yields -1, which can
	// never exceed bestIndex.
	consider := func(studio *models.Studio, name string) {
		if idx := matcher.match(name); idx > bestIndex {
			best = studio
			bestIndex = idx
		}
	}

	if cache != nil && cache.allStudios != nil {
		// preloaded route: names and aliases are already in memory
		for _, candidate := range cache.studioCandidates(words) {
			consider(candidate.Studio, candidate.Studio.Name)
			for _, alias := range candidate.Aliases {
				consider(candidate.Studio, alias)
			}
		}

		return best, nil
	}

	// fallback route: query candidates and load aliases on demand
	studios, err := getStudios(ctx, words, reader, cache)
	if err != nil {
		return nil, err
	}

	for _, studio := range studios {
		consider(studio, studio.Name)

		aliases, err := reader.GetAliases(ctx, studio.ID)
		if err != nil {
			return nil, err
		}

		for _, alias := range aliases {
			consider(studio, alias)
		}
	}

	return best, nil
}
// getTags returns the candidate tags for the given path words: the result of
// reader.QueryForAutoTag followed by the tags returned by
// getSingleLetterTags. The first error encountered is returned as-is.
func getTags(ctx context.Context, words []string, reader models.TagAutoTagQueryer, cache *Cache) ([]*models.Tag, error) {
	queried, err := reader.QueryForAutoTag(ctx, words)
	if err != nil {
		return nil, err
	}

	singleLetter, err := getSingleLetterTags(ctx, cache, reader)
	if err != nil {
		return nil, err
	}

	queried = append(queried, singleLetter...)
	return queried, nil
}
// PathToTags returns the tags whose name or one of whose aliases matches the
// given path. A tag is returned at most once per candidate, and its aliases
// are only examined when its name does not match.
//
// If the cache holds a preloaded tag set (cache.allTags is non-nil),
// candidates and their aliases come from the cache. Otherwise candidates are
// fetched through reader.QueryForAutoTag and aliases through
// reader.GetAliases.
func PathToTags(ctx context.Context, path string, reader models.TagAutoTagQueryer, cache *Cache, trimExt bool) ([]*models.Tag, error) {
	matcher := newPathMatcher(path, cache)
	words := getPathWords(path, trimExt)

	var ret []*models.Tag

	if cache != nil && cache.allTags != nil {
		// preloaded route: names and aliases are already in memory
		for _, candidate := range cache.tagCandidates(words) {
			matched := matcher.match(candidate.Tag.Name) != -1
			if !matched {
				for _, alias := range candidate.Aliases {
					if matcher.match(alias) != -1 {
						matched = true
						break
					}
				}
			}

			if matched {
				ret = append(ret, candidate.Tag)
			}
		}

		return ret, nil
	}

	// fallback route: query candidates and load aliases on demand
	tags, err := getTags(ctx, words, reader, cache)
	if err != nil {
		return nil, err
	}

	for _, tag := range tags {
		if matcher.match(tag.Name) != -1 {
			ret = append(ret, tag)
			continue
		}

		// the name did not match, so the aliases have to be fetched
		aliases, err := reader.GetAliases(ctx, tag.ID)
		if err != nil {
			return nil, err
		}

		for _, alias := range aliases {
			if matcher.match(alias) != -1 {
				ret = append(ret, tag)
				break
			}
		}
	}

	return ret, nil
}
// PathToScenesFn calls fn for every scene whose path matches name.
//
// Scenes are queried with a filter that requires Organized to be false, a
// case-insensitive regex match of the path against getPathQueryRegex(name),
// and the condition returned by scene.PathsFilter(paths). Every scene
// returned is then re-checked against the regexp from nameToRegexp before fn
// is invoked.
//
// Results are fetched in batches of 1000 sorted by ascending ID. After the
// first batch, each query is restricted to IDs greater than the last ID seen,
// so later batches do not need an offset.
//
// Iteration stops at the first error. Errors from the query and from fn are
// both wrapped with %w, so callers can inspect them with errors.Is/As.
func PathToScenesFn(ctx context.Context, name string, paths []string, sceneReader models.SceneQueryer, fn func(ctx context.Context, scene *models.Scene) error) error {
	regex := getPathQueryRegex(name)
	organized := false
	filter := models.SceneFilterType{
		Path: &models.StringCriterionInput{
			Value:    "(?i)" + regex,
			Modifier: models.CriterionModifierMatchesRegex,
		},
		Organized: &organized,
	}

	filter.And = scene.PathsFilter(paths)

	// paths may have unicode characters
	const useUnicode = true
	// the pattern depends only on name, so compile it once rather than once
	// per batch
	r := nameToRegexp(name, useUnicode)

	// do in batches
	pp := 1000
	sort := "id"
	sortDir := models.SortDirectionEnumAsc
	lastID := 0

	for {
		if lastID != 0 {
			// continue after the last scene of the previous batch
			filter.ID = &models.IntCriterionInput{
				Value:    lastID,
				Modifier: models.CriterionModifierGreaterThan,
			}
		}

		scenes, err := scene.Query(ctx, sceneReader, &filter, &models.FindFilterType{
			PerPage:   &pp,
			Sort:      &sort,
			Direction: &sortDir,
		})
		if err != nil {
			return fmt.Errorf("error querying scenes with regex '%s': %w", regex, err)
		}

		for _, p := range scenes {
			if regexpMatchesPath(r, p.Path) != -1 {
				if err := fn(ctx, p); err != nil {
					return fmt.Errorf("processing scene %s: %w", p.GetTitle(), err)
				}
			}
		}

		// a short batch means there is nothing left to fetch
		if len(scenes) < pp {
			break
		}

		lastID = scenes[len(scenes)-1].ID
	}

	return nil
}
// PathToImagesFn calls fn for every image whose path matches name.
//
// Images are queried with a filter that requires Organized to be false, a
// case-insensitive regex match of the path against getPathQueryRegex(name),
// and the condition returned by image.PathsFilter(paths). Every image
// returned is then re-checked against the regexp from nameToRegexp before fn
// is invoked.
//
// Results are fetched in batches of 1000 sorted by ascending ID. After the
// first batch, each query is restricted to IDs greater than the last ID seen,
// so later batches do not need an offset.
//
// Iteration stops at the first error. Errors from the query and from fn are
// both wrapped with %w, so callers can inspect them with errors.Is/As.
func PathToImagesFn(ctx context.Context, name string, paths []string, imageReader models.ImageQueryer, fn func(ctx context.Context, scene *models.Image) error) error {
	regex := getPathQueryRegex(name)
	organized := false
	filter := models.ImageFilterType{
		Path: &models.StringCriterionInput{
			Value:    "(?i)" + regex,
			Modifier: models.CriterionModifierMatchesRegex,
		},
		Organized: &organized,
	}

	filter.And = image.PathsFilter(paths)

	// paths may have unicode characters
	const useUnicode = true
	// the pattern depends only on name, so compile it once rather than once
	// per batch
	r := nameToRegexp(name, useUnicode)

	// do in batches
	pp := 1000
	sort := "id"
	sortDir := models.SortDirectionEnumAsc
	lastID := 0

	for {
		if lastID != 0 {
			// continue after the last image of the previous batch
			filter.ID = &models.IntCriterionInput{
				Value:    lastID,
				Modifier: models.CriterionModifierGreaterThan,
			}
		}

		images, err := image.Query(ctx, imageReader, &filter, &models.FindFilterType{
			PerPage:   &pp,
			Sort:      &sort,
			Direction: &sortDir,
		})
		if err != nil {
			return fmt.Errorf("error querying images with regex '%s': %w", regex, err)
		}

		for _, p := range images {
			if regexpMatchesPath(r, p.Path) != -1 {
				if err := fn(ctx, p); err != nil {
					return fmt.Errorf("processing image %s: %w", p.GetTitle(), err)
				}
			}
		}

		// a short batch means there is nothing left to fetch
		if len(images) < pp {
			break
		}

		lastID = images[len(images)-1].ID
	}

	return nil
}
// PathToGalleriesFn calls fn for every gallery whose path matches name.
//
// Galleries are queried with a filter that requires Organized to be false, a
// case-insensitive regex match of the path against getPathQueryRegex(name),
// and the condition returned by gallery.PathsFilter(paths). Every gallery
// returned is then re-checked against the regexp from nameToRegexp before fn
// is invoked; galleries with an empty Path are skipped.
//
// Results are fetched in batches of 1000 sorted by ascending ID. After the
// first batch, each query is restricted to IDs greater than the last ID seen,
// so later batches do not need an offset.
//
// Iteration stops at the first error. Errors from the query and from fn are
// both wrapped with %w, so callers can inspect them with errors.Is/As.
func PathToGalleriesFn(ctx context.Context, name string, paths []string, galleryReader models.GalleryQueryer, fn func(ctx context.Context, scene *models.Gallery) error) error {
	regex := getPathQueryRegex(name)
	organized := false
	filter := models.GalleryFilterType{
		Path: &models.StringCriterionInput{
			Value:    "(?i)" + regex,
			Modifier: models.CriterionModifierMatchesRegex,
		},
		Organized: &organized,
	}

	filter.And = gallery.PathsFilter(paths)

	// paths may have unicode characters
	const useUnicode = true
	// the pattern depends only on name, so compile it once rather than once
	// per batch
	r := nameToRegexp(name, useUnicode)

	// do in batches
	pp := 1000
	sort := "id"
	sortDir := models.SortDirectionEnumAsc
	lastID := 0

	for {
		if lastID != 0 {
			// continue after the last gallery of the previous batch
			filter.ID = &models.IntCriterionInput{
				Value:    lastID,
				Modifier: models.CriterionModifierGreaterThan,
			}
		}

		galleries, _, err := galleryReader.Query(ctx, &filter, &models.FindFilterType{
			PerPage:   &pp,
			Sort:      &sort,
			Direction: &sortDir,
		})
		if err != nil {
			return fmt.Errorf("error querying galleries with regex '%s': %w", regex, err)
		}

		for _, p := range galleries {
			path := p.Path
			if path != "" && regexpMatchesPath(r, path) != -1 {
				if err := fn(ctx, p); err != nil {
					return fmt.Errorf("processing gallery %s: %w", p.GetTitle(), err)
				}
			}
		}

		// a short batch means there is nothing left to fetch
		if len(galleries) < pp {
			break
		}

		lastID = galleries[len(galleries)-1].ID
	}

	return nil
}