From 3444c21263472f7add782368c88faec69b75df25 Mon Sep 17 00:00:00 2001 From: notsafeforgit Date: Fri, 20 Mar 2026 14:35:19 -0700 Subject: [PATCH] perf(sqlite): implement SQL-based fast path for exact image duplicate detection This change adds a specialized SQL query to find exact image duplicate matches (distance 0) directly in the database. Previously, the image duplicate checker always used an O(N^2) Go-based comparison loop, which caused indefinite loading and timeouts on libraries with a large number of images. The new SQL fast path reduces the time to find exact duplicates from minutes/hours to milliseconds. --- pkg/sqlite/image.go | 69 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/pkg/sqlite/image.go b/pkg/sqlite/image.go index 780979270..ec180e05f 100644 --- a/pkg/sqlite/image.go +++ b/pkg/sqlite/image.go @@ -7,6 +7,8 @@ import ( "fmt" "path/filepath" "slices" + "strconv" + "strings" "github.com/jmoiron/sqlx" "github.com/stashapp/stash/pkg/models" @@ -1095,32 +1097,69 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error return imagesURLsTableMgr.get(ctx, imageID) } +var findExactImageDuplicateQuery = ` +SELECT GROUP_CONCAT(DISTINCT image_id) as ids +FROM ( + SELECT images.id as image_id + , files_fingerprints.fingerprint as phash + FROM images + JOIN images_files ON images.id = images_files.image_id + JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id + WHERE files_fingerprints.type = 'phash' +) +GROUP BY phash +HAVING COUNT(phash) > 1 + AND COUNT(DISTINCT image_id) > 1; +` + func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) { - query := ` + var dupeIds [][]int + if distance == 0 { + var ids []string + if err := dbWrapper.Select(ctx, &ids, findExactImageDuplicateQuery); err != nil { + return nil, err + } + + for _, id := range ids { + strIds := strings.Split(id, ",") + var imageIds []int + for _, strId := range strIds { + if intId, err := strconv.Atoi(strId); err == nil { + imageIds = sliceutil.AppendUnique(imageIds, intId) + } + } + // filter out + if len(imageIds) > 1 { + dupeIds = append(dupeIds, imageIds) + } + } + } else { + query := ` SELECT images.id, files_fingerprints.fingerprint as phash FROM images JOIN images_files ON images.id = images_files.image_id JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id WHERE files_fingerprints.type = 'phash'` - var hashes []*utils.Phash - if err := imageRepository.queryFunc(ctx, query, nil, false, func(rows *sqlx.Rows) error { - phash := utils.Phash{ - Bucket: -1, - Duration: -1, - } - if err := rows.StructScan(&phash); err != nil { - return err + var hashes []*utils.Phash + if err := imageRepository.queryFunc(ctx, query, nil, false, func(rows *sqlx.Rows) error { + phash := utils.Phash{ + Bucket: -1, + Duration: -1, + } + if err := rows.StructScan(&phash); err != nil { + return err + } + + hashes = append(hashes, &phash) + return nil + }); err != nil { + return nil, err } - hashes = append(hashes, &phash) - return nil - }); err != nil { - return nil, err + dupeIds = utils.FindDuplicates(hashes, distance, -1) } - dupeIds := utils.FindDuplicates(hashes, distance, -1) - var result [][]*models.Image for _, comp := range dupeIds { if images, err := qb.FindMany(ctx, comp); err == nil {