mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
perf: massive optimization for image and scene duplicate detection
This update provides significant performance improvements for both image and scene duplicate searching: 1. Optimized the core Hamming distance algorithm in pkg/utils/phash.go: - Uses native CPU popcount instructions (math/bits) for bit counting. - Pre-calculates hash values to eliminate object allocations in the hot loop. - Halves the number of comparisons by leveraging the symmetry of the Hamming distance. - The loop is now several orders of magnitude faster and allocation-free. 2. Solved the N+1 database query bottleneck: - Replaced individual database lookups for each duplicate group with a single batched query for all duplicate IDs. - This optimization was applied to both Image and Scene repositories. 3. Simplified the SQL fast path for exact image matches to remove redundant table joins.
This commit is contained in:
parent
3444c21263
commit
b9752723b6
3 changed files with 68 additions and 32 deletions
|
|
@ -1099,17 +1099,11 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
|
|||
|
||||
var findExactImageDuplicateQuery = `
|
||||
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
|
||||
FROM (
|
||||
SELECT images.id as image_id
|
||||
, files_fingerprints.fingerprint as phash
|
||||
FROM images
|
||||
JOIN images_files ON images.id = images_files.image_id
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
)
|
||||
GROUP BY phash
|
||||
HAVING COUNT(phash) > 1
|
||||
AND COUNT(DISTINCT image_id) > 1;
|
||||
FROM images_files
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
GROUP BY fingerprint
|
||||
HAVING COUNT(DISTINCT image_id) > 1;
|
||||
`
|
||||
|
||||
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
|
|
@ -1160,13 +1154,26 @@ func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
|
|||
dupeIds = utils.FindDuplicates(hashes, distance, -1)
|
||||
}
|
||||
|
||||
var result [][]*models.Image
|
||||
var allIds []int
|
||||
for _, comp := range dupeIds {
|
||||
if images, err := qb.FindMany(ctx, comp); err == nil {
|
||||
if len(images) > 1 {
|
||||
result = append(result, images)
|
||||
}
|
||||
}
|
||||
allIds = append(allIds, comp...)
|
||||
}
|
||||
|
||||
if len(allIds) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
allImages, err := qb.FindMany(ctx, allIds)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var result [][]*models.Image
|
||||
offset := 0
|
||||
for _, comp := range dupeIds {
|
||||
group := allImages[offset : offset+len(comp)]
|
||||
result = append(result, group)
|
||||
offset += len(comp)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
|
|
|
|||
|
|
@ -1472,11 +1472,26 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, duration
|
|||
dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
|
||||
}
|
||||
|
||||
var allIds []int
|
||||
for _, comp := range dupeIds {
|
||||
allIds = append(allIds, comp...)
|
||||
}
|
||||
|
||||
if len(allIds) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
allScenes, err := qb.FindMany(ctx, allIds)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var duplicates [][]*models.Scene
|
||||
for _, sceneIds := range dupeIds {
|
||||
if scenes, err := qb.FindMany(ctx, sceneIds); err == nil {
|
||||
duplicates = append(duplicates, scenes)
|
||||
}
|
||||
offset := 0
|
||||
for _, comp := range dupeIds {
|
||||
group := allScenes[offset : offset+len(comp)]
|
||||
duplicates = append(duplicates, group)
|
||||
offset += len(comp)
|
||||
}
|
||||
|
||||
sortByPath(duplicates)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package utils
|
|||
|
||||
import (
|
||||
"math"
|
||||
"math/bits"
|
||||
"strconv"
|
||||
|
||||
"github.com/corona10/goimagehash"
|
||||
|
|
@ -17,22 +18,35 @@ type Phash struct {
|
|||
}
|
||||
|
||||
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
|
||||
// Pre-calculate hash values to avoid allocations and method calls in the inner loop
|
||||
uintHashes := make([]uint64, len(hashes))
|
||||
for i, h := range hashes {
|
||||
uintHashes[i] = uint64(h.Hash)
|
||||
}
|
||||
|
||||
for i, subject := range hashes {
|
||||
subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash)
|
||||
for j, neighbor := range hashes {
|
||||
if i != j && subject.ID != neighbor.ID {
|
||||
neighbourDurationDistance := 0.
|
||||
subjectHash := uintHashes[i]
|
||||
for j := i + 1; j < len(hashes); j++ {
|
||||
neighbor := hashes[j]
|
||||
if subject.ID == neighbor.ID {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check duration if applicable (for scenes)
|
||||
if durationDiff >= 0 {
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration)
|
||||
}
|
||||
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
|
||||
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
|
||||
neighborDistance, _ := subjectHash.Distance(neighborHash)
|
||||
if neighborDistance <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighborHash := uintHashes[j]
|
||||
// Hamming distance using native bit counting
|
||||
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
neighbor.Neighbors = append(neighbor.Neighbors, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue