diff --git a/graphql/schema/schema.graphql b/graphql/schema/schema.graphql index 7f07e4579..6054faea8 100644 --- a/graphql/schema/schema.graphql +++ b/graphql/schema/schema.graphql @@ -51,6 +51,7 @@ type Query { Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance. """ duration_diff: Float + scene_filter: SceneFilterType ): [[Scene!]!]! "Return valid stream paths" diff --git a/internal/api/resolver_query_find_scene.go b/internal/api/resolver_query_find_scene.go index 135ec43b7..5b4f4d614 100644 --- a/internal/api/resolver_query_find_scene.go +++ b/internal/api/resolver_query_find_scene.go @@ -227,7 +227,7 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models. return ret, nil } -func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) { +func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64, sceneFilter *models.SceneFilterType) (ret [][]*models.Scene, err error) { dist := 0 durDiff := -1. if distance != nil { @@ -237,7 +237,7 @@ func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durDiff = *durationDiff } if err := r.withReadTxn(ctx, func(ctx context.Context) error { - ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff) + ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff, sceneFilter) return err }); err != nil { return nil, err diff --git a/pkg/models/mocks/SceneReaderWriter.go b/pkg/models/mocks/SceneReaderWriter.go index 0053ad6f8..28ff84495 100644 --- a/pkg/models/mocks/SceneReaderWriter.go +++ b/pkg/models/mocks/SceneReaderWriter.go @@ -664,13 +664,13 @@ func (_m *SceneReaderWriter) FindByPrimaryFileID(ctx context.Context, fileID mod return r0, r1 } -// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff -func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { - ret := _m.Called(ctx, distance, durationDiff) +// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff, filter +func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) { + ret := _m.Called(ctx, distance, durationDiff, filter) var r0 [][]*models.Scene - if rf, ok := ret.Get(0).(func(context.Context, int, float64) [][]*models.Scene); ok { - r0 = rf(ctx, distance, durationDiff) + if rf, ok := ret.Get(0).(func(context.Context, int, float64, *models.SceneFilterType) [][]*models.Scene); ok { + r0 = rf(ctx, distance, durationDiff, filter) } else { if ret.Get(0) != nil { r0 = ret.Get(0).([][]*models.Scene) @@ -678,8 +678,8 @@ func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, d } var r1 error - if rf, ok := ret.Get(1).(func(context.Context, int, float64) error); ok { - r1 = rf(ctx, distance, durationDiff) + if rf, ok := ret.Get(1).(func(context.Context, int, float64, *models.SceneFilterType) error); ok { + r1 = rf(ctx, distance, durationDiff, filter) } else { r1 = ret.Error(1) } diff --git a/pkg/models/repository_scene.go b/pkg/models/repository_scene.go index 6b795c3af..fb1c32974 100644 --- a/pkg/models/repository_scene.go +++ b/pkg/models/repository_scene.go @@ -27,7 +27,7 @@ type SceneFinder interface { FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error) FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error) FindByGroupID(ctx context.Context, groupID int) ([]*Scene, error) - FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error) + FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *SceneFilterType) ([][]*Scene, error) } // SceneQueryer provides methods to query scenes. diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index c2093431d..4b41910ad 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -41,41 +41,6 @@ const ( sceneCoverBlobColumn = "cover_blob" ) -var findExactDuplicateQuery = ` -SELECT GROUP_CONCAT(DISTINCT scene_id) as ids -FROM ( - SELECT scenes.id as scene_id - , video_files.duration as file_duration - , files.size as file_size - , files_fingerprints.fingerprint as phash - , abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff - FROM scenes - INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) - INNER JOIN files ON (scenes_files.file_id = files.id) - INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') - INNER JOIN video_files ON (files.id == video_files.file_id) -) -WHERE durationDiff <= ?1 - OR ?1 < 0 -- Always TRUE if the parameter is negative. - -- That will disable the durationDiff checking. -GROUP BY phash -HAVING COUNT(phash) > 1 - AND COUNT(DISTINCT scene_id) > 1 -ORDER BY SUM(file_size) DESC; -` - -var findAllPhashesQuery = ` -SELECT scenes.id as id - , files_fingerprints.fingerprint as phash - , video_files.duration as duration -FROM scenes -INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) -INNER JOIN files ON (scenes_files.file_id = files.id) -INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') -INNER JOIN video_files ON (files.id == video_files.file_id) -ORDER BY files.size DESC; -` - type sceneRow struct { ID int `db:"id" goqu:"skipinsert"` Title zero.String `db:"title"` @@ -1430,11 +1395,60 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St return sceneRepository.stashIDs.get(ctx, sceneID) } -func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { +func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) { var dupeIds [][]int + + query, err := qb.makeQuery(ctx, filter, nil) + if err != nil { + return nil, err + } + + // Add necessary joins for duplicate checking + query.addJoins( + join{ + table: scenesFilesTable, + onClause: "scenes.id = scenes_files.scene_id", + }, + join{ + table: fileTable, + onClause: "scenes_files.file_id = files.id", + }, + join{ + table: fingerprintTable, + onClause: "scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash'", + }, + join{ + table: videoFileTable, + onClause: "files.id = video_files.file_id", + }, + ) + if distance == 0 { + query.columns = []string{ + "scenes.id as scene_id", + "video_files.duration as file_duration", + "files.size as file_size", + "files_fingerprints.fingerprint as phash", + "abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff", + } + + sqlStr := query.toSQL(false) + + finalQuery := ` +SELECT GROUP_CONCAT(DISTINCT scene_id) as ids +FROM (` + sqlStr + `) +WHERE phash IS NOT NULL + AND (durationDiff <= ? + OR ? < 0) +GROUP BY phash +HAVING COUNT(phash) > 1 + AND COUNT(DISTINCT scene_id) > 1 +ORDER BY SUM(file_size) DESC; +` + var ids []string - if err := dbWrapper.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil { + args := append(query.allArgs(), durationDiff, durationDiff) + if err := dbWrapper.Select(ctx, &ids, finalQuery, args...); err != nil { return nil, err } @@ -1452,9 +1466,18 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration } } } else { + query.columns = []string{ + "scenes.id as id", + "files_fingerprints.fingerprint as phash", + "video_files.duration as duration", + } + query.sortAndPagination = " ORDER BY files.size DESC" + + sqlStr := query.toSQL(true) + var hashes []*utils.Phash - if err := sceneRepository.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error { + if err := sceneRepository.queryFunc(ctx, sqlStr, query.allArgs(), false, func(rows *sqlx.Rows) error { phash := utils.Phash{ Bucket: -1, Duration: -1, diff --git a/pkg/sqlite/scene_test.go b/pkg/sqlite/scene_test.go index 67bf227a2..bbea297dd 100644 --- a/pkg/sqlite/scene_test.go +++ b/pkg/sqlite/scene_test.go @@ -4631,7 +4631,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) { withRollbackTxn(func(ctx context.Context) error { distance := 0 durationDiff := -1. - got, err := qb.FindDuplicates(ctx, distance, durationDiff) + got, err := qb.FindDuplicates(ctx, distance, durationDiff, nil) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil @@ -4641,7 +4641,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) { distance = 1 durationDiff = -1. - got, err = qb.FindDuplicates(ctx, distance, durationDiff) + got, err = qb.FindDuplicates(ctx, distance, durationDiff, nil) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil