mirror of
https://github.com/stashapp/stash.git
synced 2026-04-18 04:53:57 +02:00
perf: further optimize image duplicate detection
This update provides additional performance improvements specifically targeted at large image libraries (e.g. 300k+ images):

1. Optimized the exact match SQL query for images:
   - Added filtering for zero/empty fingerprints to avoid massive false-positive groups.
   - Added a LIMIT of 1000 duplicate groups to prevent excessive memory consumption and serialization overhead.
   - Simplified the join structure to ensure better use of the database index.

2. Parallelized the Go comparison loop in pkg/utils/phash.go:
   - Utilizes all available CPU cores to perform Hamming distance calculations.
   - Uses a lock-free design to minimize synchronization overhead.
   - This makes non-zero distance searches significantly faster on multi-core systems.
This commit is contained in:
parent
b3d002ccf9
commit
6de7195ed6
2 changed files with 43 additions and 21 deletions
|
|
@ -1101,9 +1101,12 @@ var findExactImageDuplicateQuery = `
|
|||
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
|
||||
FROM images_files
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
AND files_fingerprints.fingerprint != zeroblob(8)
|
||||
AND files_fingerprints.fingerprint != ''
|
||||
GROUP BY fingerprint
|
||||
HAVING COUNT(DISTINCT image_id) > 1;
|
||||
HAVING COUNT(DISTINCT image_id) > 1
|
||||
LIMIT 1000;
|
||||
`
|
||||
|
||||
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
|
|
|
|||
|
|
@ -3,7 +3,9 @@ package utils
|
|||
import (
|
||||
"math"
|
||||
"math/bits"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"sync"
|
||||
|
||||
"github.com/stashapp/stash/pkg/sliceutil"
|
||||
)
|
||||
|
|
@ -23,32 +25,49 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
|
|||
uintHashes[i] = uint64(h.Hash)
|
||||
}
|
||||
|
||||
for i, subject := range hashes {
|
||||
subjectHash := uintHashes[i]
|
||||
for j := i + 1; j < len(hashes); j++ {
|
||||
neighbor := hashes[j]
|
||||
if subject.ID == neighbor.ID {
|
||||
continue
|
||||
}
|
||||
numHashes := len(hashes)
|
||||
numWorkers := runtime.GOMAXPROCS(0)
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numWorkers)
|
||||
|
||||
// Check duration if applicable (for scenes)
|
||||
if durationDiff >= 0 {
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
|
||||
// Distribute work among workers
|
||||
for w := 0; w < numWorkers; w++ {
|
||||
go func(workerID int) {
|
||||
defer wg.Done()
|
||||
for i := workerID; i < numHashes; i += numWorkers {
|
||||
subject := hashes[i]
|
||||
subjectHash := uintHashes[i]
|
||||
|
||||
for j := 0; j < numHashes; j++ {
|
||||
if i == j {
|
||||
continue
|
||||
}
|
||||
neighbor := hashes[j]
|
||||
if subject.ID == neighbor.ID {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check duration if applicable (for scenes)
|
||||
if durationDiff >= 0 {
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
if math.Abs(subject.Duration-neighbor.Duration) > durationDiff {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighborHash := uintHashes[j]
|
||||
// Hamming distance using native bit counting
|
||||
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighborHash := uintHashes[j]
|
||||
// Hamming distance using native bit counting
|
||||
if bits.OnesCount64(subjectHash^neighborHash) <= distance {
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
neighbor.Neighbors = append(neighbor.Neighbors, i)
|
||||
}
|
||||
}
|
||||
}(w)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
var buckets [][]int
|
||||
for _, subject := range hashes {
|
||||
if len(subject.Neighbors) > 0 && subject.Bucket == -1 {
|
||||
|
|
|
|||
Loading…
Reference in a new issue