Remove single unicode character from autotag query (#2363)

* Remove single unicode character from autotag query
* Compile regex once where possible
* Fix CPU profiling
* Only match unicode characters if in path
2026-03-17 02:23:19 +01:00 · 2022-03-07 13:26:24 +11:00 · 2022-03-07 13:26:24 +11:00 · 18665863d6
commit 18665863d6
parent 0737ca953d
3 changed files with 79 additions and 20 deletions
--- a/main.go
+++ b/main.go
@ -35,10 +35,11 @@ func main() {
 	manager.Initialize()
 	api.Start(uiBox, loginUIBox)

-	// stop any profiling at exit
-	defer pprof.StopCPUProfile()
 	blockForever()

+	// stop any profiling at exit
+	pprof.StopCPUProfile()
+
 	manager.GetInstance().Shutdown(0)
 }

--- a/pkg/match/path.go
+++ b/pkg/match/path.go
@ -5,6 +5,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
+	"unicode"

 	"github.com/stashapp/stash/pkg/gallery"
 	"github.com/stashapp/stash/pkg/image"
@ -12,7 +13,12 @@ import (
 	"github.com/stashapp/stash/pkg/scene"
 )

-const separatorChars = `.\-_ `
+const (
+	separatorChars = `.\-_ `
+
+	reNotLetterWordUnicode = `[^\p{L}\w\d]`
+	reNotLetterWord        = `[^\w\d]`
+)

 func getPathQueryRegex(name string) string {
 	// escape specific regex characters
@ -68,22 +74,22 @@ func getPathWords(path string) []string {
 	return ret
 }

+// https://stackoverflow.com/a/53069799
+func allASCII(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] > unicode.MaxASCII {
+			return false
+		}
+	}
+	return true
+}
+
 // nameMatchesPath returns the index in the path for the right-most match.
 // Returns -1 if not found.
 func nameMatchesPath(name, path string) int {
-	// escape specific regex characters
-	name = regexp.QuoteMeta(name)
-
-	name = strings.ToLower(name)
-	path = strings.ToLower(path)
-
-	// handle path separators
-	const separator = `[` + separatorChars + `]`
-
-	reStr := strings.ReplaceAll(name, " ", separator+"*")
-	reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
-
-	re := regexp.MustCompile(reStr)
+	// #2363 - optimisation: only use unicode character regexp if path contains
+	// unicode characters
+	re := nameToRegexp(name, !allASCII(path))
 	found := re.FindAllStringIndex(path, -1)

 	if found == nil {
@ -93,6 +99,39 @@ func nameMatchesPath(name, path string) int {
 	return found[len(found)-1][0]
 }

+// nameToRegexp compiles a regexp pattern to match paths from the given name.
+// Set useUnicode to true if this regexp is to be used on any strings with unicode characters.
+func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
+	// escape specific regex characters
+	name = regexp.QuoteMeta(name)
+
+	name = strings.ToLower(name)
+
+	// handle path separators
+	const separator = `[` + separatorChars + `]`
+
+	// performance optimisation: only use \p{L} if useUnicode is true
+	notWord := reNotLetterWord
+	if useUnicode {
+		notWord = reNotLetterWordUnicode
+	}
+
+	reStr := strings.ReplaceAll(name, " ", separator+"*")
+	reStr = `(?:^|_|` + notWord + `)` + reStr + `(?:$|_|` + notWord + `)`
+
+	re := regexp.MustCompile(reStr)
+	return re
+}
+
+func regexpMatchesPath(r *regexp.Regexp, path string) int {
+	path = strings.ToLower(path)
+	found := r.FindAllStringIndex(path, -1)
+	if found == nil {
+		return -1
+	}
+	return found[len(found)-1][0]
+}
+
 func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) {
 	words := getPathWords(path)
 	performers, err := performerReader.QueryForAutoTag(words)
@ -208,8 +247,13 @@ func PathToScenes(name string, paths []string, sceneReader models.SceneReader) (
 	}

 	var ret []*models.Scene
+
+	// paths may have unicode characters
+	const useUnicode = true
+
+	r := nameToRegexp(name, useUnicode)
 	for _, p := range scenes {
-		if nameMatchesPath(name, p.Path) != -1 {
+		if regexpMatchesPath(r, p.Path) != -1 {
 			ret = append(ret, p)
 		}
 	}
@ -240,8 +284,13 @@ func PathToImages(name string, paths []string, imageReader models.ImageReader) (
 	}

 	var ret []*models.Image
+
+	// paths may have unicode characters
+	const useUnicode = true
+
+	r := nameToRegexp(name, useUnicode)
 	for _, p := range images {
-		if nameMatchesPath(name, p.Path) != -1 {
+		if regexpMatchesPath(r, p.Path) != -1 {
 			ret = append(ret, p)
 		}
 	}
@ -272,8 +321,13 @@ func PathToGalleries(name string, paths []string, galleryReader models.GalleryRe
 	}

 	var ret []*models.Gallery
+
+	// paths may have unicode characters
+	const useUnicode = true
+
+	r := nameToRegexp(name, useUnicode)
 	for _, p := range gallerys {
-		if nameMatchesPath(name, p.Path.String) != -1 {
+		if regexpMatchesPath(r, p.Path.String) != -1 {
 			ret = append(ret, p)
 		}
 	}
--- a/pkg/sqlite/performer.go
+++ b/pkg/sqlite/performer.go
@ -21,7 +21,11 @@ WHERE performers_tags.tag_id = ?
 GROUP BY performers_tags.performer_id
 `

-const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
+// KNOWN ISSUE: using \p{L} to find single unicode character names results in
+// very slow queries.
+// A suggested solution is to cache single-character names and exclude them
+// from the autotag query.
+const singleFirstCharacterRegex = `^[\w][.\-_ ]`

 type performerQueryBuilder struct {
 	repository