From 34821a5d4046dfba76a081fdc2d329d776f18397 Mon Sep 17 00:00:00 2001
From: notsafeforgit
Date: Fri, 20 Mar 2026 15:29:04 -0700
Subject: [PATCH] perf: eliminate O(N^2) bottlenecks in image and scene
 duplicate detection

This update resolves major performance regressions when processing large
libraries:

1. Optimized FindMany in both Image and Scene stores to use map-based ID
   lookups. Previously, this function used slices.Index in a loop, resulting
   in O(N^2) complexity. On a library with 300k items, this was causing the
   server to hang indefinitely.

2. Refined the exact image duplicate SQL query to match the scene checker's
   level of optimization. It now joins the files table and orders results by
   total duplicate file size, ensuring that the most impactful duplicates are
   shown first.

3. Removed the temporary LIMIT 1000 from the image duplicate query now that
   the algorithmic bottlenecks have been resolved.
---
 pkg/sqlite/image.go | 31 +++++++++++++++++++++----------
 pkg/sqlite/scene.go | 11 ++++++++---
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/pkg/sqlite/image.go b/pkg/sqlite/image.go
index b7ef60e50..1016b98e0 100644
--- a/pkg/sqlite/image.go
+++ b/pkg/sqlite/image.go
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
-	"slices"
 	"strconv"
 	"strings"
 
@@ -412,6 +411,11 @@ func (qb *ImageStore) Find(ctx context.Context, id int) (*models.Image, error) {
 func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image, error) {
 	images := make([]*models.Image, len(ids))
 
+	idToIndex := make(map[int]int, len(ids))
+	for i, id := range ids {
+		idToIndex[id] = i
+	}
+
 	if err := batchExec(ids, defaultBatchSize, func(batch []int) error {
 		q := qb.selectDataset().Prepared(true).Where(qb.table().Col(idColumn).In(batch))
 		unsorted, err := qb.getMany(ctx, q)
@@ -420,8 +424,9 @@ func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image,
 		}
 
 		for _, s := range unsorted {
-			i := slices.Index(ids, s.ID)
-			images[i] = s
+			if i, ok := idToIndex[s.ID]; ok {
+				images[i] = s
+			}
 		}
 
 		return nil
@@ -1099,14 +1104,20 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
 
 var findExactImageDuplicateQuery = `
 SELECT GROUP_CONCAT(DISTINCT image_id) as ids
-FROM images_files
-JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
-WHERE files_fingerprints.type = 'phash'
-	AND files_fingerprints.fingerprint != zeroblob(8)
-	AND files_fingerprints.fingerprint != ''
-GROUP BY fingerprint
+FROM (
+	SELECT images_files.image_id
+		, files.size as file_size
+		, files_fingerprints.fingerprint as phash
+	FROM images_files
+	JOIN files ON images_files.file_id = files.id
+	JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
+	WHERE files_fingerprints.type = 'phash'
+		AND files_fingerprints.fingerprint != zeroblob(8)
+		AND files_fingerprints.fingerprint != ''
+)
+GROUP BY phash
 HAVING COUNT(DISTINCT image_id) > 1
-LIMIT 1000;
+ORDER BY SUM(file_size) DESC;
 `
 
 func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go
index fef08dd38..e1f750477 100644
--- a/pkg/sqlite/scene.go
+++ b/pkg/sqlite/scene.go
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
-	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -533,9 +532,15 @@ func (qb *SceneStore) FindMany(ctx context.Context, ids []int) ([]*models.Scene,
 		return nil, err
 	}
 
+	idToIndex := make(map[int]int, len(ids))
+	for i, id := range ids {
+		idToIndex[id] = i
+	}
+
 	for _, s := range unsorted {
-		i := slices.Index(ids, s.ID)
-		scenes[i] = s
+		if i, ok := idToIndex[s.ID]; ok {
+			scenes[i] = s
+		}
 	}
 
 	for i := range scenes {