mirror of
https://github.com/stashapp/stash.git
synced 2025-12-09 18:04:33 +01:00
Remove single unicode character from autotag query (#2363)
* Remove single unicode character from autotag query * Compile regex once where possible * Fix CPU profiling * Only match unicode characters if in path
This commit is contained in:
parent
0737ca953d
commit
18665863d6
3 changed files with 79 additions and 20 deletions
5
main.go
5
main.go
|
|
@ -35,10 +35,11 @@ func main() {
|
|||
manager.Initialize()
|
||||
api.Start(uiBox, loginUIBox)
|
||||
|
||||
// stop any profiling at exit
|
||||
defer pprof.StopCPUProfile()
|
||||
blockForever()
|
||||
|
||||
// stop any profiling at exit
|
||||
pprof.StopCPUProfile()
|
||||
|
||||
manager.GetInstance().Shutdown(0)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"github.com/stashapp/stash/pkg/gallery"
|
||||
"github.com/stashapp/stash/pkg/image"
|
||||
|
|
@ -12,7 +13,12 @@ import (
|
|||
"github.com/stashapp/stash/pkg/scene"
|
||||
)
|
||||
|
||||
const separatorChars = `.\-_ `
|
||||
const (
|
||||
separatorChars = `.\-_ `
|
||||
|
||||
reNotLetterWordUnicode = `[^\p{L}\w\d]`
|
||||
reNotLetterWord = `[^\w\d]`
|
||||
)
|
||||
|
||||
func getPathQueryRegex(name string) string {
|
||||
// escape specific regex characters
|
||||
|
|
@ -68,22 +74,22 @@ func getPathWords(path string) []string {
|
|||
return ret
|
||||
}
|
||||
|
||||
// https://stackoverflow.com/a/53069799
|
||||
func allASCII(s string) bool {
|
||||
for i := 0; i < len(s); i++ {
|
||||
if s[i] > unicode.MaxASCII {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// nameMatchesPath returns the index in the path for the right-most match.
|
||||
// Returns -1 if not found.
|
||||
func nameMatchesPath(name, path string) int {
|
||||
// escape specific regex characters
|
||||
name = regexp.QuoteMeta(name)
|
||||
|
||||
name = strings.ToLower(name)
|
||||
path = strings.ToLower(path)
|
||||
|
||||
// handle path separators
|
||||
const separator = `[` + separatorChars + `]`
|
||||
|
||||
reStr := strings.ReplaceAll(name, " ", separator+"*")
|
||||
reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
|
||||
|
||||
re := regexp.MustCompile(reStr)
|
||||
// #2363 - optimisation: only use unicode character regexp if path contains
|
||||
// unicode characters
|
||||
re := nameToRegexp(name, !allASCII(path))
|
||||
found := re.FindAllStringIndex(path, -1)
|
||||
|
||||
if found == nil {
|
||||
|
|
@ -93,6 +99,39 @@ func nameMatchesPath(name, path string) int {
|
|||
return found[len(found)-1][0]
|
||||
}
|
||||
|
||||
// nameToRegexp compiles a regexp pattern to match paths from the given name.
|
||||
// Set useUnicode to true if this regexp is to be used on any strings with unicode characters.
|
||||
func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
|
||||
// escape specific regex characters
|
||||
name = regexp.QuoteMeta(name)
|
||||
|
||||
name = strings.ToLower(name)
|
||||
|
||||
// handle path separators
|
||||
const separator = `[` + separatorChars + `]`
|
||||
|
||||
// performance optimisation: only use \p{L} if useUnicode is true
|
||||
notWord := reNotLetterWord
|
||||
if useUnicode {
|
||||
notWord = reNotLetterWordUnicode
|
||||
}
|
||||
|
||||
reStr := strings.ReplaceAll(name, " ", separator+"*")
|
||||
reStr = `(?:^|_|` + notWord + `)` + reStr + `(?:$|_|` + notWord + `)`
|
||||
|
||||
re := regexp.MustCompile(reStr)
|
||||
return re
|
||||
}
|
||||
|
||||
func regexpMatchesPath(r *regexp.Regexp, path string) int {
|
||||
path = strings.ToLower(path)
|
||||
found := r.FindAllStringIndex(path, -1)
|
||||
if found == nil {
|
||||
return -1
|
||||
}
|
||||
return found[len(found)-1][0]
|
||||
}
|
||||
|
||||
func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) {
|
||||
words := getPathWords(path)
|
||||
performers, err := performerReader.QueryForAutoTag(words)
|
||||
|
|
@ -208,8 +247,13 @@ func PathToScenes(name string, paths []string, sceneReader models.SceneReader) (
|
|||
}
|
||||
|
||||
var ret []*models.Scene
|
||||
|
||||
// paths may have unicode characters
|
||||
const useUnicode = true
|
||||
|
||||
r := nameToRegexp(name, useUnicode)
|
||||
for _, p := range scenes {
|
||||
if nameMatchesPath(name, p.Path) != -1 {
|
||||
if regexpMatchesPath(r, p.Path) != -1 {
|
||||
ret = append(ret, p)
|
||||
}
|
||||
}
|
||||
|
|
@ -240,8 +284,13 @@ func PathToImages(name string, paths []string, imageReader models.ImageReader) (
|
|||
}
|
||||
|
||||
var ret []*models.Image
|
||||
|
||||
// paths may have unicode characters
|
||||
const useUnicode = true
|
||||
|
||||
r := nameToRegexp(name, useUnicode)
|
||||
for _, p := range images {
|
||||
if nameMatchesPath(name, p.Path) != -1 {
|
||||
if regexpMatchesPath(r, p.Path) != -1 {
|
||||
ret = append(ret, p)
|
||||
}
|
||||
}
|
||||
|
|
@ -272,8 +321,13 @@ func PathToGalleries(name string, paths []string, galleryReader models.GalleryRe
|
|||
}
|
||||
|
||||
var ret []*models.Gallery
|
||||
|
||||
// paths may have unicode characters
|
||||
const useUnicode = true
|
||||
|
||||
r := nameToRegexp(name, useUnicode)
|
||||
for _, p := range gallerys {
|
||||
if nameMatchesPath(name, p.Path.String) != -1 {
|
||||
if regexpMatchesPath(r, p.Path.String) != -1 {
|
||||
ret = append(ret, p)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,7 +21,11 @@ WHERE performers_tags.tag_id = ?
|
|||
GROUP BY performers_tags.performer_id
|
||||
`
|
||||
|
||||
const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
|
||||
// KNOWN ISSUE: using \p{L} to find single unicode character names results in
|
||||
// very slow queries.
|
||||
// A suggested solution is to cache single-character names and not include them
|
||||
// in the autotag query.
|
||||
const singleFirstCharacterRegex = `^[\w][.\-_ ]`
|
||||
|
||||
type performerQueryBuilder struct {
|
||||
repository
|
||||
|
|
|
|||
Loading…
Reference in a new issue