stash/pkg/utils/phash.go
notsafeforgit 6de7195ed6 perf: further optimize image duplicate detection
This update provides additional performance improvements specifically targeted at large image libraries (e.g. 300k+ images):

1. Optimized the exact match SQL query for images:
   - Added filtering for zero/empty fingerprints to avoid massive false-positive groups.
   - Added a LIMIT of 1000 duplicate groups to prevent excessive memory consumption and serialization overhead.
   - Simplified the join structure to ensure better use of the database index.

2. Parallelized the Go comparison loop in pkg/utils/phash.go:
   - Utilizes all available CPU cores to perform Hamming distance calculations.
   - Uses a lock-free design to minimize synchronization overhead.
   - This makes non-zero distance searches significantly faster on multi-core systems.
2026-03-20 15:26:18 -07:00

110 lines
2.4 KiB
Go

package utils
import (
"math"
"math/bits"
"runtime"
"strconv"
"sync"
"github.com/stashapp/stash/pkg/sliceutil"
)
// Phash is one perceptual-hash record as consumed by FindDuplicates.
//
// The db tags map ID, Hash and Duration to the columns id, phash and
// duration. Neighbors and Bucket carry no tag; they are working state that
// FindDuplicates and findNeighbors read and write while grouping.
type Phash struct {
// ID is the identifier reported in the groups returned by FindDuplicates.
// Two records with the same ID are never linked to each other directly.
ID int `db:"id"`
// Hash is the 64-bit hash, stored signed. FindDuplicates reinterprets it as
// uint64 and compares records by the number of differing bits.
Hash int64 `db:"phash"`
// Duration takes part in matching only when it is positive on both records
// being compared. NOTE(review): unit is presumably seconds — confirm
// against the query that fills this struct.
Duration float64 `db:"duration"`
// Neighbors holds indexes into the slice passed to FindDuplicates (not IDs)
// of the records found to match this one. FindDuplicates appends to it.
Neighbors []int
// Bucket is the group number assigned during grouping. Only records whose
// Bucket is -1 are grouped, so callers must set it to -1 beforehand.
Bucket int
}
// FindDuplicates groups perceptual hashes that are close to one another and
// returns one slice of IDs per group.
//
// Two entries are linked as neighbors when all of the following hold:
//   - they sit at different positions in hashes and have different IDs;
//   - their hashes differ in at most distance bits (Hamming distance);
//   - if durationDiff is non-negative and both durations are positive, the
//     durations differ by at most durationDiff. A negative durationDiff
//     disables the duration filter.
//
// Groups are built by following neighbor links transitively, so two members
// of a group may be further apart than distance as long as a chain of
// neighbors joins them. Groups with fewer than two IDs are not returned; the
// result is nil when there are no groups.
//
// The function mutates its input: the indexes (positions in hashes, not IDs)
// of each entry's neighbors are appended to its Neighbors field, and Bucket
// is set on every entry that is placed in a group. Only entries whose Bucket
// is -1 on entry are grouped, so callers must initialise Bucket to -1.
//
// Every entry is compared against every other entry, so the work is
// quadratic in len(hashes). It is spread across at most GOMAXPROCS
// goroutines, all of which have finished before the function returns.
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
	numHashes := len(hashes)

	// Convert the hashes to uint64 once so the inner loop reads them from a
	// flat slice.
	uintHashes := make([]uint64, numHashes)
	for i, h := range hashes {
		uintHashes[i] = uint64(h.Hash)
	}

	// Never start more workers than there are subjects to process; for empty
	// input this starts no goroutines at all.
	numWorkers := runtime.GOMAXPROCS(0)
	if numWorkers > numHashes {
		numWorkers = numHashes
	}

	// The duration filter is either on or off for the whole run.
	checkDuration := durationDiff >= 0

	var wg sync.WaitGroup
	wg.Add(numWorkers)
	for w := 0; w < numWorkers; w++ {
		go func(workerID int) {
			defer wg.Done()

			// Worker k owns subjects k, k+numWorkers, k+2*numWorkers, ...
			// A worker writes only the Neighbors field of the subjects it
			// owns and otherwise just reads, so no locking is required.
			for i := workerID; i < numHashes; i += numWorkers {
				subject := hashes[i]
				subjectID := subject.ID
				subjectDuration := subject.Duration
				subjectHash := uintHashes[i]

				// Collect matches locally and store them once per subject so
				// the hot loop does not write to the shared Phash values.
				found := subject.Neighbors

				for j, neighborHash := range uintHashes {
					if i == j {
						continue
					}

					neighbor := hashes[j]
					if neighbor.ID == subjectID {
						continue
					}

					// Durations only disqualify a pair when both are known
					// (positive); this matters for scenes.
					if checkDuration && subjectDuration > 0 && neighbor.Duration > 0 &&
						math.Abs(subjectDuration-neighbor.Duration) > durationDiff {
						continue
					}

					// Hamming distance: count the bits in which the two
					// hashes differ.
					if bits.OnesCount64(subjectHash^neighborHash) <= distance {
						found = append(found, j)
					}
				}

				subject.Neighbors = found
			}
		}(w)
	}
	wg.Wait()

	// Turn the neighbor links into groups. Each ungrouped entry that has at
	// least one neighbor seeds a new group, and findNeighbors pulls in
	// everything reachable from it.
	var buckets [][]int
	for _, subject := range hashes {
		if len(subject.Neighbors) == 0 || subject.Bucket != -1 {
			continue
		}

		bucket := len(buckets)
		ids := []int{subject.ID}
		subject.Bucket = bucket
		findNeighbors(bucket, subject.Neighbors, hashes, &ids)

		if len(ids) > 1 {
			buckets = append(buckets, ids)
		}
	}

	return buckets
}
// findNeighbors claims for bucket every hash that can be reached from the
// given neighbor indexes by following Neighbors links and whose Bucket is
// still -1. Each claimed hash has its Bucket set to bucket and its ID added
// to ids through sliceutil.AppendUnique.
//
// The walk is depth-first: after claiming a hash, that hash's own Neighbors
// are explored fully before the walk moves on to the next index of the list
// it came from. Hashes whose Bucket is not -1 are skipped, together with
// anything only reachable through them.
func findNeighbors(bucket int, neighbors []int, hashes []*Phash, ids *[]int) {
	// cursor records how far one neighbor list has been consumed, so the
	// walk can resume it after finishing a deeper list.
	type cursor struct {
		indexes []int
		next    int
	}

	pending := []cursor{{indexes: neighbors}}
	for len(pending) > 0 {
		top := &pending[len(pending)-1]
		if top.next >= len(top.indexes) {
			// This list is exhausted; go back to the one that led here.
			pending = pending[:len(pending)-1]
			continue
		}

		idx := top.indexes[top.next]
		top.next++

		candidate := hashes[idx]
		if candidate.Bucket != -1 {
			continue
		}

		candidate.Bucket = bucket
		*ids = sliceutil.AppendUnique(*ids, candidate.ID)

		// top is not used past this point: the append may move pending.
		pending = append(pending, cursor{indexes: candidate.Neighbors})
	}
}
// PhashToString formats phash as a hexadecimal string without a prefix or
// leading zeros.
func PhashToString(phash int64) string {
	const hexBase = 16

	// Format the unsigned bit pattern so that negative values come out as
	// their 64-bit two's-complement digits instead of with a minus sign.
	bitPattern := uint64(phash)
	return strconv.FormatUint(bitPattern, hexBase)
}
// StringToPhash parses s as an unsigned 64-bit hexadecimal number and returns
// its bit pattern as an int64, so values with the top bit set come back
// negative. It is the inverse of PhashToString.
//
// If s cannot be parsed, the result is 0 together with the error reported by
// strconv.ParseUint.
func StringToPhash(s string) (int64, error) {
	const (
		hexBase  = 16
		hashBits = 64
	)

	bitPattern, parseErr := strconv.ParseUint(s, hexBase, hashBits)
	if parseErr != nil {
		return 0, parseErr
	}

	return int64(bitPattern), nil
}