Handle unicode characters in autotag (#2336)

2026-04-27 09:21:07 +02:00 · 2022-02-28 13:12:43 +11:00 · 2022-02-28 13:12:43 +11:00 · 1ab5be162e
commit 1ab5be162e
parent c5c94e783e
6 changed files with 41 additions and 11 deletions
--- a/pkg/match/path.go
+++ b/pkg/match/path.go
@ -22,6 +22,13 @@ func getPathQueryRegex(name string) string {
 	const separator = `[` + separatorChars + `]`

 	ret := strings.ReplaceAll(name, " ", separator+"*")
+
+	// \p{L} is specifically omitted here because of the performance hit when
+	// including it. It does mean that paths where the name is bounded by
+	// unicode letters will be returned. However, the results should be tested
+	// by nameMatchesPath which does include \p{L}. The improvement in query
+	// performance should be outweigh the performance hit of testing any extra
+	// results.
 	ret = `(?:^|_|[^\w\d])` + ret + `(?:$|_|[^\w\d])`
 	return ret
 }
@ -36,7 +43,7 @@ func getPathWords(path string) []string {
 	}

 	// handle path separators
-	const separator = `(?:_|[^\w\d])+`
+	const separator = `(?:_|[^\p{L}\w\d])+`
 	re := regexp.MustCompile(separator)
 	retStr = re.ReplaceAllString(retStr, " ")

@ -52,7 +59,9 @@ func getPathWords(path string) []string {
 			// we post-match afterwards, so we can afford to be a little loose
 			// with the query
 			// just use the first two characters
-			ret = append(ret, w[0:2])
+			// #2293 - need to convert to unicode runes for the substring, otherwise
+			// the resulting string is corrupted.
+			ret = append(ret, string([]rune(w)[0:2]))
 		}
 	}

@ -72,7 +81,7 @@ func nameMatchesPath(name, path string) int {
 	const separator = `[` + separatorChars + `]`

 	reStr := strings.ReplaceAll(name, " ", separator+"*")
-	reStr = `(?:^|_|[^\w\d])` + reStr + `(?:$|_|[^\w\d])`
+	reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`

 	re := regexp.MustCompile(reStr)
 	found := re.FindAllStringIndex(path, -1)
--- a/pkg/match/path_test.go
+++ b/pkg/match/path_test.go
@ -4,71 +4,90 @@ import "testing"

 func Test_nameMatchesPath(t *testing.T) {
 	const name = "first last"
+	const unicodeName = "伏字"

 	tests := []struct {
-		name string
-		path string
-		want int
+		testName string
+		name     string
+		path     string
+		want     int
 	}{
 		{
 			"exact",
 			name,
+			name,
 			0,
 		},
 		{
 			"partial",
+			name,
 			"first",
 			-1,
 		},
 		{
 			"separator",
+			name,
 			"first.last",
 			0,
 		},
 		{
 			"separator",
+			name,
 			"first-last",
 			0,
 		},
 		{
 			"separator",
+			name,
 			"first_last",
 			0,
 		},
 		{
 			"separators",
+			name,
 			"first.-_ last",
 			0,
 		},
 		{
 			"within string",
+			name,
 			"before_first last/after",
 			6,
 		},
 		{
 			"not within string",
+			name,
 			"beforefirst last/after",
 			-1,
 		},
 		{
 			"not within string",
+			name,
 			"before/first lastafter",
 			-1,
 		},
 		{
 			"not within string",
+			name,
 			"first last1",
 			-1,
 		},
 		{
 			"not within string",
+			name,
 			"1first last",
 			-1,
 		},
+		{
+			"unicode",
+			unicodeName,
+			unicodeName,
+			0,
+		},
 	}
 	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			if got := nameMatchesPath(name, tt.path); got != tt.want {
+		t.Run(tt.testName, func(t *testing.T) {
+			if got := nameMatchesPath(tt.name, tt.path); got != tt.want {
 				t.Errorf("nameMatchesPath() = %v, want %v", got, tt.want)
 			}
 		})
--- a/pkg/sqlite/performer.go
+++ b/pkg/sqlite/performer.go
@ -21,6 +21,8 @@ WHERE performers_tags.tag_id = ?
 GROUP BY performers_tags.performer_id
 `

+const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
+
 type performerQueryBuilder struct {
 	repository
 }
@ -184,7 +186,7 @@ func (qb *performerQueryBuilder) QueryForAutoTag(words []string) ([]*models.Perf
 	var args []interface{}

 	whereClauses = append(whereClauses, "name regexp ?")
-	args = append(args, "^[\\w][.\\-_ ]")
+	args = append(args, singleFirstCharacterRegex)

 	for _, w := range words {
 		whereClauses = append(whereClauses, "name like ?")
--- a/pkg/sqlite/repository.go
+++ b/pkg/sqlite/repository.go
@ -171,6 +171,8 @@ func (r *repository) runSumQuery(query string, args []interface{}) (float64, err
 }

 func (r *repository) queryFunc(query string, args []interface{}, single bool, f func(rows *sqlx.Rows) error) error {
+	logger.Tracef("SQL: %s, args: %v", query, args)
+
 	rows, err := r.tx.Queryx(query, args...)

 	if err != nil && !errors.Is(err, sql.ErrNoRows) {
--- a/pkg/sqlite/studio.go
+++ b/pkg/sqlite/studio.go
@ -145,7 +145,6 @@ func (qb *studioQueryBuilder) QueryForAutoTag(words []string) ([]*models.Studio,
 	var args []interface{}

 	// always include names that begin with a single character
-	singleFirstCharacterRegex := "^[\\w][.\\-_ ]"
 	whereClauses = append(whereClauses, "studios.name regexp ? OR COALESCE(studio_aliases.alias, '') regexp ?")
 	args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex)

--- a/pkg/sqlite/tag.go
+++ b/pkg/sqlite/tag.go
@ -236,7 +236,6 @@ func (qb *tagQueryBuilder) QueryForAutoTag(words []string) ([]*models.Tag, error
 	var args []interface{}

 	// always include names that begin with a single character
-	singleFirstCharacterRegex := "^[\\w][.\\-_ ]"
 	whereClauses = append(whereClauses, "tags.name regexp ? OR COALESCE(tag_aliases.alias, '') regexp ?")
 	args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex)