mirror of
https://github.com/stashapp/stash.git
synced 2026-04-18 04:53:57 +02:00
perf: further optimize image duplicate detection
This update provides additional performance improvements specifically targeted at large image libraries (e.g. 300k+ images):

1. Optimized the exact match SQL query for images:
   - Added filtering for zero/empty fingerprints to avoid massive false-positive groups.
   - Added a LIMIT of 1000 duplicate groups to prevent excessive memory consumption and serialization overhead.
   - Simplified the join structure to ensure better use of the database index.

2. Parallelized the Go comparison loop in pkg/utils/phash.go:
   - Utilizes all available CPU cores to perform Hamming distance calculations.
   - Uses a lock-free design to minimize synchronization overhead.
   - This makes non-zero distance searches significantly faster on multi-core systems.
This commit is contained in:
parent
b3d002ccf9
commit
6de7195ed6
2 changed files with 43 additions and 21 deletions
|
|
@ -1101,9 +1101,12 @@ var findExactImageDuplicateQuery = `
|
|||
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
|
||||
FROM images_files
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
AND files_fingerprints.fingerprint != zeroblob(8)
|
||||
AND files_fingerprints.fingerprint != ''
|
||||
GROUP BY fingerprint
|
||||
HAVING COUNT(DISTINCT image_id) > 1;
|
||||
HAVING COUNT(DISTINCT image_id) > 1
|
||||
LIMIT 1000;
|
||||
`
|
||||
|
||||
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
|
|
|
|||
|
|
@ -3,7 +3,9 @@ package utils
|
|||
import (
|
||||
"math"
|
||||
"math/bits"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
"github.com/stashapp/stash/pkg/sliceutil"
|
||||
)
|
||||
|
|
@ -23,32 +25,49 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
|
|||
uintHashes[i] = uint64(h.Hash)
|
||||
}
|
||||
|
||||
for i, subject := range hashes {
|
||||
subjectHash := uintHashes[i]
|
||||
for j := i + 1; j < len(hashes); j++ {
|
||||
neighbor := hashes[j]
|
||||
if subject.ID == neighbor.ID {
|
||||
continue
|
||||
}
|
||||
numHashes := len(hashes)
|
||||
numWorkers := runtime.GOMAXPROCS(0)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numWorkers)
|
||||
|
||||
// Check duration if applicable (for scenes)
|
||||
if durationDiff >= 0 {
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
|
||||
// Distribute work among workers
|
||||
for w := 0; w < numWorkers; w++ {
|
||||
go func(workerID int) {
|
||||
defer wg.Done()
|
||||
for i := workerID; i < numHashes; i += numWorkers {
|
||||
subject := hashes[i]
|
||||
subjectHash := uintHashes[i]
|
||||
|
||||
for j := 0; j < numHashes; j++ {
|
||||
if i == j {
|
||||
continue
|
||||
}
|
||||
neighbor := hashes[j]
|
||||
if subject.ID == neighbor.ID {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check duration if applicable (for scenes)
|
||||
if durationDiff >= 0 {
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighborHash := uintHashes[j]
|
||||
// Hamming distance using native bit counting
|
||||
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighborHash := uintHashes[j]
|
||||
// Hamming distance using native bit counting
|
||||
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
neighbor.Neighbors = append(neighbor.Neighbors, i)
|
||||
}
|
||||
}
|
||||
}(w)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
var buckets [][]int
|
||||
for _, subject := range hashes {
|
||||
if len(subject.Neighbors) > 0 && subject.Bucket == -1 {
|
||||
|
|
|
|||
Loading…
Reference in a new issue