This commit is contained in:
Slick Daddy 2026-05-05 08:03:26 -05:00 committed by GitHub
commit 3a3d2b3575
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 74 additions and 50 deletions

View file

@ -51,6 +51,7 @@ type Query {
Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance.
"""
duration_diff: Float
scene_filter: SceneFilterType
): [[Scene!]!]!
"Return valid stream paths"

View file

@ -227,7 +227,7 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models.
return ret, nil
}
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) {
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64, sceneFilter *models.SceneFilterType) (ret [][]*models.Scene, err error) {
dist := 0
durDiff := -1.
if distance != nil {
@ -237,7 +237,7 @@ func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int,
durDiff = *durationDiff
}
if err := r.withReadTxn(ctx, func(ctx context.Context) error {
ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff)
ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff, sceneFilter)
return err
}); err != nil {
return nil, err

View file

@ -664,13 +664,13 @@ func (_m *SceneReaderWriter) FindByPrimaryFileID(ctx context.Context, fileID mod
return r0, r1
}
// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
ret := _m.Called(ctx, distance, durationDiff)
// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff, filter
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) {
ret := _m.Called(ctx, distance, durationDiff, filter)
var r0 [][]*models.Scene
if rf, ok := ret.Get(0).(func(context.Context, int, float64) [][]*models.Scene); ok {
r0 = rf(ctx, distance, durationDiff)
if rf, ok := ret.Get(0).(func(context.Context, int, float64, *models.SceneFilterType) [][]*models.Scene); ok {
r0 = rf(ctx, distance, durationDiff, filter)
} else {
if ret.Get(0) != nil {
r0 = ret.Get(0).([][]*models.Scene)
@ -678,8 +678,8 @@ func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, d
}
var r1 error
if rf, ok := ret.Get(1).(func(context.Context, int, float64) error); ok {
r1 = rf(ctx, distance, durationDiff)
if rf, ok := ret.Get(1).(func(context.Context, int, float64, *models.SceneFilterType) error); ok {
r1 = rf(ctx, distance, durationDiff, filter)
} else {
r1 = ret.Error(1)
}

View file

@ -27,7 +27,7 @@ type SceneFinder interface {
FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error)
FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error)
FindByGroupID(ctx context.Context, groupID int) ([]*Scene, error)
FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error)
FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *SceneFilterType) ([][]*Scene, error)
}
// SceneQueryer provides methods to query scenes.

View file

@ -41,41 +41,6 @@ const (
sceneCoverBlobColumn = "cover_blob"
)
// findExactDuplicateQuery finds groups of scenes whose files share an
// identical phash fingerprint. The inner SELECT computes, per file, the
// absolute difference between its duration and the longest duration within
// the same phash group (via a window function); the outer query keeps only
// groups whose durations are within ?1 seconds of each other (or all groups
// when ?1 is negative), requires more than one distinct scene per phash, and
// returns each group as a comma-separated list of distinct scene ids ordered
// by total file size, largest first.
// NOTE(review): the `==` in the video_files join is SQLite's nonstandard
// equality spelling; fine for SQLite, would need `=` elsewhere.
var findExactDuplicateQuery = `
SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
FROM (
SELECT scenes.id as scene_id
, video_files.duration as file_duration
, files.size as file_size
, files_fingerprints.fingerprint as phash
, abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff
FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
INNER JOIN video_files ON (files.id == video_files.file_id)
)
WHERE durationDiff <= ?1
OR ?1 < 0 -- Always TRUE if the parameter is negative.
-- That will disable the durationDiff checking.
GROUP BY phash
HAVING COUNT(phash) > 1
AND COUNT(DISTINCT scene_id) > 1
ORDER BY SUM(file_size) DESC;
`
// findAllPhashesQuery returns the scene id, phash fingerprint, and video
// duration for every scene file that has a phash, ordered by file size
// descending. Used for distance-based (non-exact) duplicate matching, where
// grouping happens in Go rather than in SQL.
// NOTE(review): uses SQLite's nonstandard `==` in the video_files join.
var findAllPhashesQuery = `
SELECT scenes.id as id
, files_fingerprints.fingerprint as phash
, video_files.duration as duration
FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
INNER JOIN video_files ON (files.id == video_files.file_id)
ORDER BY files.size DESC;
`
type sceneRow struct {
ID int `db:"id" goqu:"skipinsert"`
Title zero.String `db:"title"`
@ -1430,11 +1395,60 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St
return sceneRepository.stashIDs.get(ctx, sceneID)
}
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64, filter *models.SceneFilterType) ([][]*models.Scene, error) {
var dupeIds [][]int
query, err := qb.makeQuery(ctx, filter, nil)
if err != nil {
return nil, err
}
// Add necessary joins for duplicate checking
query.addJoins(
join{
table: scenesFilesTable,
onClause: "scenes.id = scenes_files.scene_id",
},
join{
table: fileTable,
onClause: "scenes_files.file_id = files.id",
},
join{
table: fingerprintTable,
onClause: "scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash'",
},
join{
table: videoFileTable,
onClause: "files.id = video_files.file_id",
},
)
if distance == 0 {
query.columns = []string{
"scenes.id as scene_id",
"video_files.duration as file_duration",
"files.size as file_size",
"files_fingerprints.fingerprint as phash",
"abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff",
}
sqlStr := query.toSQL(false)
finalQuery := `
SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
FROM (` + sqlStr + `)
WHERE phash IS NOT NULL
AND (durationDiff <= ?
OR ? < 0)
GROUP BY phash
HAVING COUNT(phash) > 1
AND COUNT(DISTINCT scene_id) > 1
ORDER BY SUM(file_size) DESC;
`
var ids []string
if err := dbWrapper.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil {
args := append(query.allArgs(), durationDiff, durationDiff)
if err := dbWrapper.Select(ctx, &ids, finalQuery, args...); err != nil {
return nil, err
}
@ -1452,9 +1466,18 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration
}
}
} else {
query.columns = []string{
"scenes.id as id",
"files_fingerprints.fingerprint as phash",
"video_files.duration as duration",
}
query.sortAndPagination = " ORDER BY files.size DESC"
sqlStr := query.toSQL(true)
var hashes []*utils.Phash
if err := sceneRepository.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error {
if err := sceneRepository.queryFunc(ctx, sqlStr, query.allArgs(), false, func(rows *sqlx.Rows) error {
phash := utils.Phash{
Bucket: -1,
Duration: -1,

View file

@ -4631,7 +4631,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
withRollbackTxn(func(ctx context.Context) error {
distance := 0
durationDiff := -1.
got, err := qb.FindDuplicates(ctx, distance, durationDiff)
got, err := qb.FindDuplicates(ctx, distance, durationDiff, nil)
if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil
@ -4641,7 +4641,7 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
distance = 1
durationDiff = -1.
got, err = qb.FindDuplicates(ctx, distance, durationDiff)
got, err = qb.FindDuplicates(ctx, distance, durationDiff, nil)
if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil