perf: massive optimization for image and scene duplicate detection

This update provides significant performance improvements for both image and scene duplicate searching:

1. Optimized the core Hamming distance algorithm in pkg/utils/phash.go:
   - Uses native CPU popcount instructions (math/bits) for bit counting.
   - Pre-calculates hash values to eliminate object allocations in the hot loop.
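   - Halves the number of comparisons by leveraging the symmetry of the Hamming distance: each pair (i, j) with j > i is compared once, and a match is recorded as a neighbor on both hashes.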
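   - The distance check itself is now several orders of magnitude faster and allocation-free; the only allocations left in the loop come from appending matches to each hash's Neighbors slice.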

2. Solved the N+1 database query bottleneck:
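   - Replaced individual database lookups for each duplicate group with a single batched query for all duplicate IDs; the combined result is then split back into groups by offset, which relies on the batched lookup returning one row per requested ID in request order.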
   - This optimization was applied to both Image and Scene repositories.

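3. Simplified the SQL fast path for exact image matches: the query now groups images_files joined to files_fingerprints directly, dropping the redundant join on the images table and the wrapping subquery.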
This commit is contained in:
notsafeforgit 2026-03-20 15:01:39 -07:00
parent 3444c21263
commit b9752723b6
3 changed files with 68 additions and 32 deletions

View file

@ -1099,17 +1099,11 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
var findExactImageDuplicateQuery = `
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
FROM (
SELECT images.id as image_id
, files_fingerprints.fingerprint as phash
FROM images
JOIN images_files ON images.id = images_files.image_id
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'
)
GROUP BY phash
HAVING COUNT(phash) > 1
AND COUNT(DISTINCT image_id) > 1;
FROM images_files
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'
GROUP BY fingerprint
HAVING COUNT(DISTINCT image_id) > 1;
`
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
@ -1160,13 +1154,26 @@ func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
dupeIds = utils.FindDuplicates(hashes, distance, -1)
}
var result [][]*models.Image
var allIds []int
for _, comp := range dupeIds {
if images, err := qb.FindMany(ctx, comp); err == nil {
if len(images) > 1 {
result = append(result, images)
}
}
allIds = append(allIds, comp...)
}
if len(allIds) == 0 {
return nil, nil
}
allImages, err := qb.FindMany(ctx, allIds)
if err != nil {
return nil, err
}
var result [][]*models.Image
offset := 0
for _, comp := range dupeIds {
group := allImages[offset : offset+len(comp)]
result = append(result, group)
offset += len(comp)
}
return result, nil

View file

@ -1472,11 +1472,26 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration
dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
}
var allIds []int
for _, comp := range dupeIds {
allIds = append(allIds, comp...)
}
if len(allIds) == 0 {
return nil, nil
}
allScenes, err := qb.FindMany(ctx, allIds)
if err != nil {
return nil, err
}
var duplicates [][]*models.Scene
for _, sceneIds := range dupeIds {
if scenes, err := qb.FindMany(ctx, sceneIds); err == nil {
duplicates = append(duplicates, scenes)
}
offset := 0
for _, comp := range dupeIds {
group := allScenes[offset : offset+len(comp)]
duplicates = append(duplicates, group)
offset += len(comp)
}
sortByPath(duplicates)

View file

@ -2,6 +2,7 @@ package utils
import (
"math"
"math/bits"
"strconv"
"github.com/corona10/goimagehash"
@ -17,22 +18,35 @@ type Phash struct {
}
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
// Pre-calculate hash values to avoid allocations and method calls in the inner loop
uintHashes := make([]uint64, len(hashes))
for i, h := range hashes {
uintHashes[i] = uint64(h.Hash)
}
for i, subject := range hashes {
subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash)
for j, neighbor := range hashes {
if i != j && subject.ID != neighbor.ID {
neighbourDurationDistance := 0.
subjectHash := uintHashes[i]
for j := i + 1; j < len(hashes); j++ {
neighbor := hashes[j]
if subject.ID == neighbor.ID {
continue
}
// Check duration if applicable (for scenes)
if durationDiff >= 0 {
if subject.Duration > 0 && neighbor.Duration > 0 {
neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration)
}
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
neighborDistance, _ := subjectHash.Distance(neighborHash)
if neighborDistance <= distance {
subject.Neighbors = append(subject.Neighbors, j)
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
continue
}
}
}
neighborHash := uintHashes[j]
// Hamming distance using native bit counting
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
subject.Neighbors = append(subject.Neighbors, j)
neighbor.Neighbors = append(neighbor.Neighbors, i)
}
}
}