mirror of
https://github.com/stashapp/stash.git
synced 2026-05-09 05:05:29 +02:00
perf: eliminate O(N^2) bottlenecks in image and scene duplicate detection
This update resolves major performance regressions when processing large libraries:

1. Optimized FindMany in both Image and Scene stores to use map-based ID lookups. Previously, this function used slices.Index in a loop, resulting in O(N^2) complexity. On a library with 300k items, this was causing the server to hang indefinitely.

2. Refined the exact image duplicate SQL query to match the scene checker's level of optimization. It now joins the files table and orders results by total duplicate file size, ensuring that the most impactful duplicates are shown first.

3. Removed the temporary LIMIT 1000 from the image duplicate query now that the algorithmic bottlenecks have been resolved.
This commit is contained in:
parent
15c6dd5575
commit
34821a5d40
2 changed files with 29 additions and 13 deletions
|
|
@ -6,7 +6,6 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
|
|
@ -412,6 +411,11 @@ func (qb *ImageStore) Find(ctx context.Context, id int) (*models.Image, error) {
|
|||
func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image, error) {
|
||||
images := make([]*models.Image, len(ids))
|
||||
|
||||
idToIndex := make(map[int]int, len(ids))
|
||||
for i, id := range ids {
|
||||
idToIndex[id] = i
|
||||
}
|
||||
|
||||
if err := batchExec(ids, defaultBatchSize, func(batch []int) error {
|
||||
q := qb.selectDataset().Prepared(true).Where(qb.table().Col(idColumn).In(batch))
|
||||
unsorted, err := qb.getMany(ctx, q)
|
||||
|
|
@ -420,8 +424,9 @@ func (qb *ImageStore) FindMany(ctx context.Context, ids []int) ([]*models.Image,
|
|||
}
|
||||
|
||||
for _, s := range unsorted {
|
||||
i := slices.Index(ids, s.ID)
|
||||
images[i] = s
|
||||
if i, ok := idToIndex[s.ID]; ok {
|
||||
images[i] = s
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
|
|
@ -1099,14 +1104,20 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
|
|||
|
||||
var findExactImageDuplicateQuery = `
|
||||
SELECT GROUP_CONCAT(DISTINCT image_id) as ids
|
||||
FROM images_files
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
AND files_fingerprints.fingerprint != zeroblob(8)
|
||||
AND files_fingerprints.fingerprint != ''
|
||||
GROUP BY fingerprint
|
||||
FROM (
|
||||
SELECT images_files.image_id
|
||||
, files.size as file_size
|
||||
, files_fingerprints.fingerprint as phash
|
||||
FROM images_files
|
||||
JOIN files ON images_files.file_id = files.id
|
||||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'
|
||||
AND files_fingerprints.fingerprint != zeroblob(8)
|
||||
AND files_fingerprints.fingerprint != ''
|
||||
)
|
||||
GROUP BY phash
|
||||
HAVING COUNT(DISTINCT image_id) > 1
|
||||
LIMIT 1000;
|
||||
ORDER BY SUM(file_size) DESC;
|
||||
`
|
||||
|
||||
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
|
@ -533,9 +532,15 @@ func (qb *SceneStore) FindMany(ctx context.Context, ids []int) ([]*models.Scene,
|
|||
return nil, err
|
||||
}
|
||||
|
||||
idToIndex := make(map[int]int, len(ids))
|
||||
for i, id := range ids {
|
||||
idToIndex[id] = i
|
||||
}
|
||||
|
||||
for _, s := range unsorted {
|
||||
i := slices.Index(ids, s.ID)
|
||||
scenes[i] = s
|
||||
if i, ok := idToIndex[s.ID]; ok {
|
||||
scenes[i] = s
|
||||
}
|
||||
}
|
||||
|
||||
for i := range scenes {
|
||||
|
|
|
|||
Loading…
Reference in a new issue