From 1ab5be162e72655b57c231ec6b0374f52a74bc92 Mon Sep 17 00:00:00 2001 From: WithoutPants <53250216+WithoutPants@users.noreply.github.com> Date: Mon, 28 Feb 2022 13:12:43 +1100 Subject: [PATCH] Handle unicode characters in autotag (#2336) --- pkg/match/path.go | 15 ++++++++++++--- pkg/match/path_test.go | 29 ++++++++++++++++++++++++----- pkg/sqlite/performer.go | 4 +++- pkg/sqlite/repository.go | 2 ++ pkg/sqlite/studio.go | 1 - pkg/sqlite/tag.go | 1 - 6 files changed, 41 insertions(+), 11 deletions(-) diff --git a/pkg/match/path.go b/pkg/match/path.go index 9dd4bdec0..e80601fcd 100644 --- a/pkg/match/path.go +++ b/pkg/match/path.go @@ -22,6 +22,13 @@ func getPathQueryRegex(name string) string { const separator = `[` + separatorChars + `]` ret := strings.ReplaceAll(name, " ", separator+"*") + + // \p{L} is specifically omitted here because of the performance hit when + // including it. It does mean that paths where the name is bounded by + // unicode letters will be returned. However, the results should be tested + // by nameMatchesPath which does include \p{L}. The improvement in query + // performance should be outweigh the performance hit of testing any extra + // results. ret = `(?:^|_|[^\w\d])` + ret + `(?:$|_|[^\w\d])` return ret } @@ -36,7 +43,7 @@ func getPathWords(path string) []string { } // handle path separators - const separator = `(?:_|[^\w\d])+` + const separator = `(?:_|[^\p{L}\w\d])+` re := regexp.MustCompile(separator) retStr = re.ReplaceAllString(retStr, " ") @@ -52,7 +59,9 @@ func getPathWords(path string) []string { // we post-match afterwards, so we can afford to be a little loose // with the query // just use the first two characters - ret = append(ret, w[0:2]) + // #2293 - need to convert to unicode runes for the substring, otherwise + // the resulting string is corrupted. + ret = append(ret, string([]rune(w)[0:2])) } } @@ -72,7 +81,7 @@ func nameMatchesPath(name, path string) int { const separator = `[` + separatorChars + `]` reStr := strings.ReplaceAll(name, " ", separator+"*") - reStr = `(?:^|_|[^\w\d])` + reStr + `(?:$|_|[^\w\d])` + reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])` re := regexp.MustCompile(reStr) found := re.FindAllStringIndex(path, -1) diff --git a/pkg/match/path_test.go b/pkg/match/path_test.go index f2818a801..c162b8feb 100644 --- a/pkg/match/path_test.go +++ b/pkg/match/path_test.go @@ -4,71 +4,90 @@ import "testing" func Test_nameMatchesPath(t *testing.T) { const name = "first last" + const unicodeName = "伏字" tests := []struct { - name string - path string - want int + testName string + name string + path string + want int }{ { "exact", name, + name, 0, }, { "partial", + name, "first", -1, }, { "separator", + name, "first.last", 0, }, { "separator", + name, "first-last", 0, }, { "separator", + name, "first_last", 0, }, { "separators", + name, "first.-_ last", 0, }, { "within string", + name, "before_first last/after", 6, }, { "not within string", + name, "beforefirst last/after", -1, }, { "not within string", + name, "before/first lastafter", -1, }, { "not within string", + name, "first last1", -1, }, { "not within string", + name, "1first last", -1, }, + { + "unicode", + unicodeName, + unicodeName, + 0, + }, } for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := nameMatchesPath(name, tt.path); got != tt.want { + t.Run(tt.testName, func(t *testing.T) { + if got := nameMatchesPath(tt.name, tt.path); got != tt.want { t.Errorf("nameMatchesPath() = %v, want %v", got, tt.want) } }) diff --git a/pkg/sqlite/performer.go b/pkg/sqlite/performer.go index d824dc37e..8a67db052 100644 --- a/pkg/sqlite/performer.go +++ b/pkg/sqlite/performer.go @@ -21,6 +21,8 @@ WHERE performers_tags.tag_id = ? GROUP BY performers_tags.performer_id ` +const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]` + type performerQueryBuilder struct { repository } @@ -184,7 +186,7 @@ func (qb *performerQueryBuilder) QueryForAutoTag(words []string) ([]*models.Perf var args []interface{} whereClauses = append(whereClauses, "name regexp ?") - args = append(args, "^[\\w][.\\-_ ]") + args = append(args, singleFirstCharacterRegex) for _, w := range words { whereClauses = append(whereClauses, "name like ?") diff --git a/pkg/sqlite/repository.go b/pkg/sqlite/repository.go index 49329f13c..b9c8d22bb 100644 --- a/pkg/sqlite/repository.go +++ b/pkg/sqlite/repository.go @@ -171,6 +171,8 @@ func (r *repository) runSumQuery(query string, args []interface{}) (float64, err } func (r *repository) queryFunc(query string, args []interface{}, single bool, f func(rows *sqlx.Rows) error) error { + logger.Tracef("SQL: %s, args: %v", query, args) + rows, err := r.tx.Queryx(query, args...) if err != nil && !errors.Is(err, sql.ErrNoRows) { diff --git a/pkg/sqlite/studio.go b/pkg/sqlite/studio.go index 91e6c63ea..6eac885cd 100644 --- a/pkg/sqlite/studio.go +++ b/pkg/sqlite/studio.go @@ -145,7 +145,6 @@ func (qb *studioQueryBuilder) QueryForAutoTag(words []string) ([]*models.Studio, var args []interface{} // always include names that begin with a single character - singleFirstCharacterRegex := "^[\\w][.\\-_ ]" whereClauses = append(whereClauses, "studios.name regexp ? OR COALESCE(studio_aliases.alias, '') regexp ?") args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex) diff --git a/pkg/sqlite/tag.go b/pkg/sqlite/tag.go index c5f3858d7..57514d751 100644 --- a/pkg/sqlite/tag.go +++ b/pkg/sqlite/tag.go @@ -236,7 +236,6 @@ func (qb *tagQueryBuilder) QueryForAutoTag(words []string) ([]*models.Tag, error var args []interface{} // always include names that begin with a single character - singleFirstCharacterRegex := "^[\\w][.\\-_ ]" whereClauses = append(whereClauses, "tags.name regexp ? OR COALESCE(tag_aliases.alias, '') regexp ?") args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex)