Remove single unicode character from autotag query (#2363)

* Remove single unicode character from autotag query
* Compile regex once where possible
* Fix CPU profiling
* Only match unicode characters if in path
This commit is contained in:
WithoutPants 2022-03-07 13:26:24 +11:00 committed by GitHub
parent 0737ca953d
commit 18665863d6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 79 additions and 20 deletions

View file

@ -35,10 +35,11 @@ func main() {
manager.Initialize()
api.Start(uiBox, loginUIBox)
// stop any profiling at exit
defer pprof.StopCPUProfile()
blockForever()
// stop any profiling at exit
pprof.StopCPUProfile()
manager.GetInstance().Shutdown(0)
}

View file

@ -5,6 +5,7 @@ import (
"path/filepath"
"regexp"
"strings"
"unicode"
"github.com/stashapp/stash/pkg/gallery"
"github.com/stashapp/stash/pkg/image"
@ -12,7 +13,12 @@ import (
"github.com/stashapp/stash/pkg/scene"
)
const separatorChars = `.\-_ `
const (
// separatorChars lists the characters treated as separators between the
// words of a name: dot, hyphen (escaped), underscore and space. It is
// written so that it can be spliced directly into a regexp character class.
separatorChars = `.\-_ `
// reNotLetterWordUnicode matches a single character that is not a unicode
// letter (\p{L}), a word character (\w) or a digit (\d).
reNotLetterWordUnicode = `[^\p{L}\w\d]`
// reNotLetterWord is the variant of reNotLetterWordUnicode without \p{L},
// matching a single character that is not a word character (\w) or a
// digit (\d).
reNotLetterWord = `[^\w\d]`
)
func getPathQueryRegex(name string) string {
// escape specific regex characters
@ -68,22 +74,22 @@ func getPathWords(path string) []string {
return ret
}
// allASCII reports whether every byte of s is within the ASCII range, i.e.
// no byte exceeds unicode.MaxASCII. The empty string is reported as ASCII.
//
// Adapted from https://stackoverflow.com/a/53069799
func allASCII(s string) bool {
	for _, b := range []byte(s) {
		if b > unicode.MaxASCII {
			return false
		}
	}

	return true
}
// nameMatchesPath returns the index in the path for the right-most match.
// Returns -1 if not found.
func nameMatchesPath(name, path string) int {
// escape specific regex characters
name = regexp.QuoteMeta(name)
name = strings.ToLower(name)
path = strings.ToLower(path)
// handle path separators
const separator = `[` + separatorChars + `]`
reStr := strings.ReplaceAll(name, " ", separator+"*")
reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
re := regexp.MustCompile(reStr)
// #2363 - optimisation: only use unicode character regexp if path contains
// unicode characters
re := nameToRegexp(name, !allASCII(path))
found := re.FindAllStringIndex(path, -1)
if found == nil {
@ -93,6 +99,39 @@ func nameMatchesPath(name, path string) int {
return found[len(found)-1][0]
}
// nameToRegexp compiles a regexp pattern to match paths from the given name.
// Set useUnicode to true if this regexp is to be used on any strings with unicode characters.
func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
// escape specific regex characters
name = regexp.QuoteMeta(name)
name = strings.ToLower(name)
// handle path separators
const separator = `[` + separatorChars + `]`
// performance optimisation: only use \p{L} is useUnicode is true
notWord := reNotLetterWord
if useUnicode {
notWord = reNotLetterWordUnicode
}
reStr := strings.ReplaceAll(name, " ", separator+"*")
reStr = `(?:^|_|` + notWord + `)` + reStr + `(?:$|_|` + notWord + `)`
re := regexp.MustCompile(reStr)
return re
}
// regexpMatchesPath lower-cases path and returns the start index of the
// right-most match of r within it. The index refers to the lower-cased path.
// Returns -1 if r does not match.
func regexpMatchesPath(r *regexp.Regexp, path string) int {
	matches := r.FindAllStringIndex(strings.ToLower(path), -1)
	if len(matches) == 0 {
		return -1
	}

	// each match is a [start, end] pair; take the start of the last one
	last := matches[len(matches)-1]
	return last[0]
}
func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) {
words := getPathWords(path)
performers, err := performerReader.QueryForAutoTag(words)
@ -208,8 +247,13 @@ func PathToScenes(name string, paths []string, sceneReader models.SceneReader) (
}
var ret []*models.Scene
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range scenes {
if nameMatchesPath(name, p.Path) != -1 {
if regexpMatchesPath(r, p.Path) != -1 {
ret = append(ret, p)
}
}
@ -240,8 +284,13 @@ func PathToImages(name string, paths []string, imageReader models.ImageReader) (
}
var ret []*models.Image
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range images {
if nameMatchesPath(name, p.Path) != -1 {
if regexpMatchesPath(r, p.Path) != -1 {
ret = append(ret, p)
}
}
@ -272,8 +321,13 @@ func PathToGalleries(name string, paths []string, galleryReader models.GalleryRe
}
var ret []*models.Gallery
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range gallerys {
if nameMatchesPath(name, p.Path.String) != -1 {
if regexpMatchesPath(r, p.Path.String) != -1 {
ret = append(ret, p)
}
}

View file

@ -21,7 +21,11 @@ WHERE performers_tags.tag_id = ?
GROUP BY performers_tags.performer_id
`
const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
// KNOWN ISSUE: using \p{L} to find single unicode character names results in
// very slow queries.
// The suggested solution is to cache single-character names and not include
// them in the autotag query.
const singleFirstCharacterRegex = `^[\w][.\-_ ]`
type performerQueryBuilder struct {
repository