perf: eliminate O(N^2) bottlenecks in image and scene duplicate detection

This update resolves major performance regressions when processing large libraries:

1. Optimized FindMany in both the Image and Scene stores to use map-based ID lookups. Previously, these functions called slices.Index inside a loop over the query results, resulting in O(N^2) complexity. On a library with 300k items, this caused the server to hang indefinitely.

2. Refined the exact-duplicate image SQL query to match the scene checker's level of optimization. It now joins the files table and orders results by total duplicate file size, so the most impactful duplicate groups are shown first.

3. Removed the temporary LIMIT 1000 from the image duplicate query now that the algorithmic bottlenecks have been resolved.
This commit is contained in:
notsafeforgit 2026-03-20 15:29:04 -07:00
parent 15c6dd5575
commit 34821a5d40
2 changed files with 29 additions and 13 deletions

View file

@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"path/filepath"
"slices"
"strconv"
"strings"
@ -412,6 +411,11 @@ func (qb *ImageStore) Find(ctx context.Context, id int) (*models.Image, error) {
func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image, error) {
images := make([]*models.Image, len(ids))
idToIndex := make(map[int]int, len(ids))
for i, id := range ids {
idToIndex[id] = i
}
if err := batchExec(ids, defaultBatchSize, func(batch []int) error {
q := qb.selectDataset().Prepared(true).Where(qb.table().Col(idColumn).In(batch))
unsorted, err := qb.getMany(ctx, q)
@ -420,8 +424,9 @@ func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image,
}
for _, s := range unsorted {
i := slices.Index(ids, s.ID)
images[i] = s
if i, ok := idToIndex[s.ID]; ok {
images[i] = s
}
}
return nil
@ -1099,14 +1104,20 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
var findExactImageDuplicateQuery = `
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
FROM images_files
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'
AND files_fingerprints.fingerprint != zeroblob(8)
AND files_fingerprints.fingerprint != ''
GROUP BY fingerprint
FROM (
SELECT images_files.image_id
, files.size as file_size
, files_fingerprints.fingerprint as phash
FROM images_files
JOIN files ON images_files.file_id = files.id
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'
AND files_fingerprints.fingerprint != zeroblob(8)
AND files_fingerprints.fingerprint != ''
)
GROUP BY phash
HAVING COUNT(DISTINCT image_id) > 1
LIMIT 1000;
ORDER BY SUM(file_size) DESC;
`
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {

View file

@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"
@ -533,9 +532,15 @@ func (qb *SceneStore) FindMany(ctx context.Context, ids []int) ([]*models.Scene,
return nil, err
}
idToIndex := make(map[int]int, len(ids))
for i, id := range ids {
idToIndex[id] = i
}
for _, s := range unsorted {
i := slices.Index(ids, s.ID)
scenes[i] = s
if i, ok := idToIndex[s.ID]; ok {
scenes[i] = s
}
}
for i := range scenes {