diff --git a/pkg/sqlite/image.go b/pkg/sqlite/image.go
index ec180e05f..2ccbc6e03 100644
--- a/pkg/sqlite/image.go
+++ b/pkg/sqlite/image.go
@@ -1099,16 +1099,10 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
 var findExactImageDuplicateQuery = `
 SELECT GROUP_CONCAT(DISTINCT image_id) as ids
-FROM (
-	SELECT images.id as image_id
-	, files_fingerprints.fingerprint as phash
-	FROM images
-	JOIN images_files ON images.id = images_files.image_id
-	JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
-	WHERE files_fingerprints.type = 'phash'
-)
-GROUP BY phash
-HAVING COUNT(phash) > 1
-	AND COUNT(DISTINCT image_id) > 1;
+FROM images_files
+JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
+WHERE files_fingerprints.type = 'phash'
+GROUP BY fingerprint
+HAVING COUNT(DISTINCT image_id) > 1;
 `
 
 func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
@@ -1160,13 +1154,40 @@ func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
 		dupeIds = utils.FindDuplicates(hashes, distance, -1)
 	}
 
-	var result [][]*models.Image
+	var allIds []int
 	for _, comp := range dupeIds {
-		if images, err := qb.FindMany(ctx, comp); err == nil {
-			if len(images) > 1 {
-				result = append(result, images)
-			}
-		}
+		allIds = append(allIds, comp...)
 	}
 
+	if len(allIds) == 0 {
+		return nil, nil
+	}
+
+	// Fetch every candidate image in a single query instead of one
+	// query per duplicate group.
+	allImages, err := qb.FindMany(ctx, allIds)
+	if err != nil {
+		return nil, err
+	}
+
+	// Index by ID so that group reconstruction does not depend on
+	// FindMany preserving the order or cardinality of the requested ids.
+	imageByID := make(map[int]*models.Image, len(allImages))
+	for _, img := range allImages {
+		imageByID[img.ID] = img
+	}
+
+	var result [][]*models.Image
+	for _, comp := range dupeIds {
+		group := make([]*models.Image, 0, len(comp))
+		for _, id := range comp {
+			if img, ok := imageByID[id]; ok {
+				group = append(group, img)
+			}
+		}
+		// Only groups with at least two resolvable images are duplicates.
+		if len(group) > 1 {
+			result = append(result, group)
+		}
+	}
 	return result, nil
diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go
index c2093431d..fef08dd38 100644
--- a/pkg/sqlite/scene.go
+++ b/pkg/sqlite/scene.go
@@ -1472,11 +1472,41 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration
 		dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
 	}
 
+	var allIds []int
+	for _, comp := range dupeIds {
+		allIds = append(allIds, comp...)
+	}
+
+	if len(allIds) == 0 {
+		return nil, nil
+	}
+
+	// Fetch every candidate scene in a single query instead of one
+	// query per duplicate group.
+	allScenes, err := qb.FindMany(ctx, allIds)
+	if err != nil {
+		return nil, err
+	}
+
+	// Index by ID so that group reconstruction does not depend on
+	// FindMany preserving the order or cardinality of the requested ids.
+	sceneByID := make(map[int]*models.Scene, len(allScenes))
+	for _, scene := range allScenes {
+		sceneByID[scene.ID] = scene
+	}
+
 	var duplicates [][]*models.Scene
-	for _, sceneIds := range dupeIds {
-		if scenes, err := qb.FindMany(ctx, sceneIds); err == nil {
-			duplicates = append(duplicates, scenes)
-		}
+	for _, comp := range dupeIds {
+		group := make([]*models.Scene, 0, len(comp))
+		for _, id := range comp {
+			if scene, ok := sceneByID[id]; ok {
+				group = append(group, scene)
+			}
+		}
+		// Only groups with at least two resolvable scenes are duplicates.
+		if len(group) > 1 {
+			duplicates = append(duplicates, group)
+		}
 	}
 
 	sortByPath(duplicates)
diff --git a/pkg/utils/phash.go b/pkg/utils/phash.go
index 77fded911..9d6053c2d 100644
--- a/pkg/utils/phash.go
+++ b/pkg/utils/phash.go
@@ -2,6 +2,7 @@ package utils
 
 import (
 	"math"
+	"math/bits"
 	"strconv"
 
 	"github.com/corona10/goimagehash"
@@ -17,21 +18,34 @@ type Phash struct {
 }
 
 func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
+	// Pre-calculate hash values to avoid allocations and method calls in the inner loop
+	uintHashes := make([]uint64, len(hashes))
+	for i, h := range hashes {
+		uintHashes[i] = uint64(h.Hash)
+	}
+
 	for i, subject := range hashes {
-		subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash)
-		for j, neighbor := range hashes {
-			if i != j && subject.ID != neighbor.ID {
-				neighbourDurationDistance := 0.
+		subjectHash := uintHashes[i]
+		for j := i + 1; j < len(hashes); j++ {
+			neighbor := hashes[j]
+			if subject.ID == neighbor.ID {
+				continue
+			}
+
+			// Check duration if applicable (for scenes)
+			if durationDiff >= 0 {
 				if subject.Duration > 0 && neighbor.Duration > 0 {
-					neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration)
-				}
-				if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
-					neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
-					neighborDistance, _ := subjectHash.Distance(neighborHash)
-					if neighborDistance <= distance {
-						subject.Neighbors = append(subject.Neighbors, j)
+					if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
+						continue
 					}
 				}
 			}
+
+			neighborHash := uintHashes[j]
+			// Hamming distance using native bit counting
+			if bits.OnesCount64(subjectHash^neighborHash) <= distance {
+				subject.Neighbors = append(subject.Neighbors, j)
+				neighbor.Neighbors = append(neighbor.Neighbors, i)
+			}
 		}
 	}