From aaf2ba4cf69a3645570d0768ea577599011f5a99 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:33:41 +0300 Subject: [PATCH 1/9] feat(schema): add scene_filter to findDuplicateScenes Expands the findDuplicateScenes query to accept a full SceneFilterType. This enables filtering out specific directories or tags from the duplication matching process. --- graphql/schema/schema.graphql | 1 + 1 file changed, 1 insertion(+) diff --git a/graphql/schema/schema.graphql b/graphql/schema/schema.graphql index 7f07e4579..6054faea8 100644 --- a/graphql/schema/schema.graphql +++ b/graphql/schema/schema.graphql @@ -51,6 +51,7 @@ type Query { Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance. """ duration_diff: Float + scene_filter: SceneFilterType ): [[Scene!]!]! "Return valid stream paths" From 78ed5ea5133414b89282bab766784fda4d0b4ef7 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:34:49 +0300 Subject: [PATCH 2/9] feat(backend): update SceneReaderWriter interface to accept filter Modifies the FindDuplicates signature to take a SceneFilterType pointer, allowing the underlying repository to filter the pool of scenes before duplicate checking. 
--- pkg/models/repository_scene.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/models/repository_scene.go b/pkg/models/repository_scene.go index 6b795c3af..fb1c32974 100644 --- a/pkg/models/repository_scene.go +++ b/pkg/models/repository_scene.go @@ -27,7 +27,7 @@ type SceneFinder interface { FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error) FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error) FindByGroupID(ctx context.Context, groupID int) ([]*Scene, error) - FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error) + FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *SceneFilterType) ([][]*Scene, error) } // SceneQueryer provides methods to query scenes. From 16faed14d413f69cec1e6a70c65816cd03d7c166 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:35:11 +0300 Subject: [PATCH 3/9] test(backend): update SceneReaderWriter mock Updates the mock definition to match the new FindDuplicates method signature. 
--- pkg/models/mocks/SceneReaderWriter.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/models/mocks/SceneReaderWriter.go b/pkg/models/mocks/SceneReaderWriter.go index 0053ad6f8..28ff84495 100644 --- a/pkg/models/mocks/SceneReaderWriter.go +++ b/pkg/models/mocks/SceneReaderWriter.go @@ -664,13 +664,13 @@ func (_m *SceneReaderWriter) FindByPrimaryFileID(ctx context.Context, fileID mod return r0, r1 } -// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff -func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { - ret := _m.Called(ctx, distance, durationDiff) +// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff, filter +func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) { + ret := _m.Called(ctx, distance, durationDiff, filter) var r0 [][]*models.Scene - if rf, ok := ret.Get(0).(func(context.Context, int, float64) [][]*models.Scene); ok { - r0 = rf(ctx, distance, durationDiff) + if rf, ok := ret.Get(0).(func(context.Context, int, float64, *models.SceneFilterType) [][]*models.Scene); ok { + r0 = rf(ctx, distance, durationDiff, filter) } else { if ret.Get(0) != nil { r0 = ret.Get(0).([][]*models.Scene) @@ -678,8 +678,8 @@ func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, d } var r1 error - if rf, ok := ret.Get(1).(func(context.Context, int, float64) error); ok { - r1 = rf(ctx, distance, durationDiff) + if rf, ok := ret.Get(1).(func(context.Context, int, float64, *models.SceneFilterType) error); ok { + r1 = rf(ctx, distance, durationDiff, filter) } else { r1 = ret.Error(1) } From c7f1d5612fd1bc0f2299a0e94d1f30afec2a6d70 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:35:27 +0300 Subject: [PATCH 4/9] feat(api): pass scene filter to 
duplicate checking repository Modifies the FindDuplicateScenes GraphQL resolver to pass the newly added scene_filter schema argument into the database repository layer. --- internal/api/resolver_query_find_scene.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/api/resolver_query_find_scene.go b/internal/api/resolver_query_find_scene.go index 135ec43b7..5b4f4d614 100644 --- a/internal/api/resolver_query_find_scene.go +++ b/internal/api/resolver_query_find_scene.go @@ -227,7 +227,7 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models. return ret, nil } -func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) { +func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64, sceneFilter *models.SceneFilterType) (ret [][]*models.Scene, err error) { dist := 0 durDiff := -1. if distance != nil { @@ -237,7 +237,7 @@ func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durDiff = *durationDiff } if err := r.withReadTxn(ctx, func(ctx context.Context) error { - ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff) + ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff, sceneFilter) return err }); err != nil { return nil, err From 864412c73505514da8bcff200e5ff3004b41c44c Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:36:13 +0300 Subject: [PATCH 5/9] feat(sqlite): dynamically build FindDuplicates query Refactors the FindDuplicates implementation to use the internal qb.makeQuery tool instead of static raw SQL. This enables the duplicate checker to utilize the provided SceneFilterType, natively supporting advanced filtering like path exclusions. 
--- pkg/sqlite/scene.go | 62 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index c2093431d..a700e9da3 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -1430,11 +1430,58 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St return sceneRepository.stashIDs.get(ctx, sceneID) } -func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { +func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) { var dupeIds [][]int + + query, err := qb.makeQuery(ctx, filter, nil) + if err != nil { + return nil, err + } + + // Add necessary joins for duplicate checking + query.addJoins( + join{ + table: scenesFilesTable, + onClause: "scenes.id = scenes_files.scene_id", + }, + join{ + table: fileTable, + onClause: "scenes_files.file_id = files.id", + }, + join{ + table: fingerprintTable, + onClause: "scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash'", + }, + join{ + table: videoFileTable, + onClause: "files.id = video_files.file_id", + }, + ) + if distance == 0 { + query.columns = []string{ + "scenes.id as scene_id", + "video_files.duration as file_duration", + "files.size as file_size", + "files_fingerprints.fingerprint as phash", + "abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff", + } + + sqlStr := query.toSQL(false) + + finalQuery := ` +SELECT GROUP_CONCAT(DISTINCT scene_id) as ids +FROM (` + sqlStr + `) +WHERE durationDiff <= ? + OR ? 
< 0 +GROUP BY phash +HAVING COUNT(phash) > 1 + AND COUNT(DISTINCT scene_id) > 1 +ORDER BY SUM(file_size) DESC; +` var ids []string - if err := dbWrapper.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil { + args := append(query.allArgs(), durationDiff, durationDiff) + if err := dbWrapper.Select(ctx, &ids, finalQuery, args...); err != nil { return nil, err } @@ -1452,9 +1499,18 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration } } } else { + query.columns = []string{ + "scenes.id as id", + "files_fingerprints.fingerprint as phash", + "video_files.duration as duration", + } + query.sortAndPagination = " ORDER BY files.size DESC" + + sqlStr := query.toSQL(true) + var hashes []*utils.Phash - if err := sceneRepository.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error { + if err := sceneRepository.queryFunc(ctx, sqlStr, query.allArgs(), false, func(rows *sqlx.Rows) error { phash := utils.Phash{ Bucket: -1, Duration: -1, From f87f9aa2ff654881386c2b04324898b3f0fb9024 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 20:36:40 +0300 Subject: [PATCH 6/9] test(sqlite): update duplicate checking tests Updates the unit tests to pass a nil scene filter, matching the new FindDuplicates method signature. --- pkg/sqlite/scene_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/sqlite/scene_test.go b/pkg/sqlite/scene_test.go index 67bf227a2..bbea297dd 100644 --- a/pkg/sqlite/scene_test.go +++ b/pkg/sqlite/scene_test.go @@ -4631,7 +4631,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) { withRollbackTxn(func(ctx context.Context) error { distance := 0 durationDiff := -1. 
- got, err := qb.FindDuplicates(ctx, distance, durationDiff) + got, err := qb.FindDuplicates(ctx, distance, durationDiff, nil) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil @@ -4641,7 +4641,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) { distance = 1 durationDiff = -1. - got, err = qb.FindDuplicates(ctx, distance, durationDiff) + got, err = qb.FindDuplicates(ctx, distance, durationDiff, nil) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil From 8274d5f3afb299c473dd0daa8a754c5a9e1a3605 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 22:53:38 +0300 Subject: [PATCH 7/9] fix(sqlite): fix formatting in FindDuplicates --- pkg/sqlite/scene.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index a700e9da3..db8e10ddc 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -1466,9 +1466,9 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration "files_fingerprints.fingerprint as phash", "abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff", } - + sqlStr := query.toSQL(false) - + finalQuery := ` SELECT GROUP_CONCAT(DISTINCT scene_id) as ids FROM (` + sqlStr + `) From 18be6a76077be8e35a512fb88c5478c3bd8086b1 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 22:56:12 +0300 Subject: [PATCH 8/9] fix(sqlite): remove unused queries --- pkg/sqlite/scene.go | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index db8e10ddc..acaf5c5ab 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -41,41 +41,6 @@ const ( sceneCoverBlobColumn = "cover_blob" ) -var findExactDuplicateQuery = ` -SELECT GROUP_CONCAT(DISTINCT scene_id) as ids -FROM ( - SELECT scenes.id as scene_id - , video_files.duration as file_duration - , files.size 
as file_size - , files_fingerprints.fingerprint as phash - , abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff - FROM scenes - INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) - INNER JOIN files ON (scenes_files.file_id = files.id) - INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') - INNER JOIN video_files ON (files.id == video_files.file_id) -) -WHERE durationDiff <= ?1 - OR ?1 < 0 -- Always TRUE if the parameter is negative. - -- That will disable the durationDiff checking. -GROUP BY phash -HAVING COUNT(phash) > 1 - AND COUNT(DISTINCT scene_id) > 1 -ORDER BY SUM(file_size) DESC; -` - -var findAllPhashesQuery = ` -SELECT scenes.id as id - , files_fingerprints.fingerprint as phash - , video_files.duration as duration -FROM scenes -INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) -INNER JOIN files ON (scenes_files.file_id = files.id) -INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') -INNER JOIN video_files ON (files.id == video_files.file_id) -ORDER BY files.size DESC; -` - type sceneRow struct { ID int `db:"id" goqu:"skipinsert"` Title zero.String `db:"title"` From 765a6f6fc4d5ca4b362176408d525c0b92b7bcc5 Mon Sep 17 00:00:00 2001 From: Slick Daddy Date: Sun, 3 May 2026 23:04:59 +0300 Subject: [PATCH 9/9] fix(sqlite): filter out NULL phash values in FindDuplicates --- pkg/sqlite/scene.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index acaf5c5ab..4b41910ad 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -1437,13 +1437,15 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration finalQuery := ` SELECT GROUP_CONCAT(DISTINCT scene_id) as ids FROM (` + sqlStr + `) -WHERE durationDiff <= ? - OR ? 
< 0 +WHERE phash IS NOT NULL + AND (durationDiff <= ? + OR ? < 0) GROUP BY phash HAVING COUNT(phash) > 1 AND COUNT(DISTINCT scene_id) > 1 ORDER BY SUM(file_size) DESC; ` + var ids []string args := append(query.allArgs(), durationDiff, durationDiff) if err := dbWrapper.Select(ctx, &ids, finalQuery, args...); err != nil {