feat: Implement Image Duplicate Checker

This change introduces a new tool to identify duplicate images based on their perceptual hash (phash). It includes:
- Backend implementation for phash distance comparison and grouping.
- GraphQL schema updates and API resolvers.
- Frontend UI for the Image Duplicate Checker tool.
- Mock support (`FindDuplicates` on the `ImageReaderWriter` mock) so the image search and duplicate detection logic can be unit tested.
This commit is contained in:
notsafeforgit 2026-03-13 15:23:02 -07:00
parent 2da8074316
commit 2fb31cfff2
11 changed files with 288 additions and 2 deletions

View file

@ -53,6 +53,9 @@ type Query {
duration_diff: Float
): [[Scene!]!]!
"Find duplicate images"
findDuplicateImages(distance: Int! = 0): [[Image!]!]!
"Return valid stream paths"
sceneStreams(id: ID): [SceneStreamEndpoint!]!

View file

@ -134,3 +134,7 @@ func (r *queryResolver) AllImages(ctx context.Context) (ret []*models.Image, err
return ret, nil
}
// FindDuplicateImages resolves the findDuplicateImages query by delegating to
// the image repository. Each inner slice of the result is one group of images
// considered duplicates of each other at the given distance.
//
// NOTE(review): the call is made directly on the repository with the incoming
// ctx; confirm whether it needs to run inside a read transaction like other
// resolvers in this file may do.
func (r *queryResolver) FindDuplicateImages(ctx context.Context, distance int) ([][]*models.Image, error) {
	groups, err := r.repository.Image.FindDuplicates(ctx, distance)
	return groups, err
}

View file

@ -370,6 +370,29 @@ func (_m *ImageReaderWriter) FindByZipFileID(ctx context.Context, zipFileID mode
return r0, r1
}
// FindDuplicates provides a mock function with given fields: ctx, distance
//
// Either return value registered on the mock may be a concrete value or a
// function with the same parameters as this method; functions are invoked
// with the received arguments to compute the value.
func (_m *ImageReaderWriter) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
	ret := _m.Called(ctx, distance)

	var r0 [][]*models.Image
	switch first := ret.Get(0).(type) {
	case func(context.Context, int) [][]*models.Image:
		r0 = first(ctx, distance)
	case nil:
		// No value configured: leave r0 as nil.
	default:
		// Asserting here keeps the panic on an unexpected type.
		r0 = first.([][]*models.Image)
	}

	errFn, isFn := ret.Get(1).(func(context.Context, int) error)
	if !isFn {
		return r0, ret.Error(1)
	}

	return r0, errFn(ctx, distance)
}
// FindMany provides a mock function with given fields: ctx, ids
func (_m *ImageReaderWriter) FindMany(ctx context.Context, ids []int) ([]*models.Image, error) {
ret := _m.Called(ctx, ids)

View file

@ -19,6 +19,7 @@ type ImageFinder interface {
FindByZipFileID(ctx context.Context, zipFileID FileID) ([]*Image, error)
FindByGalleryID(ctx context.Context, galleryID int) ([]*Image, error)
FindByGalleryIDIndex(ctx context.Context, galleryID int, index uint) (*Image, error)
FindDuplicates(ctx context.Context, distance int) ([][]*Image, error)
}
// ImageQueryer provides methods to query images.

View file

@ -7,6 +7,7 @@ import (
"fmt"
"math/bits"
"path/filepath"
"slices"
"strconv"
"github.com/jmoiron/sqlx"
"github.com/stashapp/stash/pkg/models"
@ -1093,3 +1094,112 @@ func (qb *ImageStore) UpdateTags(ctx context.Context, imageID int, tagIDs []int)
func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error) {
return imagesURLsTableMgr.get(ctx, imageID)
}
// FindDuplicates returns groups of images whose perceptual hashes are within
// distance of each other. Each inner slice is one group of duplicates.
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
	groups, err := qb.findPhashMatches(ctx, distance)
	return groups, err
}
// parsedImagePhash pairs an image ID with its perceptual hash decoded into a
// 64-bit integer.
type parsedImagePhash struct {
	id   int
	hash uint64
}

// findPhashMatches loads every image phash fingerprint, groups images whose
// hashes are within distance bits of each other (Hamming distance), and
// returns the groups with their image objects loaded.
//
// Matching is transitive: if A matches B and B matches C, all three end up in
// the same group even when A and C are further apart than distance. Groups
// are ordered by their lowest image ID and the images within a group are
// ordered by ID, so repeated calls return results in the same order.
//
// A negative distance matches nothing. Errors from the fingerprint query, from
// loading an image, or from a cancelled ctx are returned to the caller.
func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*models.Image, error) {
	query := `
SELECT images.id, files_fingerprints.fingerprint as phash
FROM images
JOIN images_files ON images.id = images_files.image_id
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'`

	type imagePhashRow struct {
		ID    int    `db:"id"`
		PHash string `db:"phash"`
	}

	// NOTE(review): this relies on queryStruct scanning every result row into
	// the slice - confirm against its implementation.
	var rows []imagePhashRow
	if err := imageRepository.queryStruct(ctx, query, nil, &rows); err != nil {
		return nil, err
	}

	// NOTE(review): fingerprints are assumed to be stored as hexadecimal text;
	// confirm against how phash fingerprints are written. Values that do not
	// parse are skipped so one bad row cannot fail the whole search.
	hashes := make([]parsedImagePhash, 0, len(rows))
	for _, row := range rows {
		hash, err := strconv.ParseUint(row.PHash, 16, 64)
		if err != nil {
			continue
		}
		hashes = append(hashes, parsedImagePhash{id: row.ID, hash: hash})
	}

	idGroups, err := groupImagesByPhashDistance(ctx, hashes, distance)
	if err != nil {
		return nil, err
	}

	var result [][]*models.Image
	for _, ids := range idGroups {
		group := make([]*models.Image, 0, len(ids))
		for _, id := range ids {
			img, err := qb.Find(ctx, id)
			if err != nil {
				return nil, fmt.Errorf("finding image %d: %w", id, err)
			}
			// An image may have disappeared since the hashes were read.
			if img != nil {
				group = append(group, img)
			}
		}
		// Only report groups that still contain at least two images.
		if len(group) > 1 {
			result = append(result, group)
		}
	}

	return result, nil
}

// groupImagesByPhashDistance partitions image IDs into groups of two or more
// whose hashes are connected by pairwise Hamming distances of at most
// distance. Groups are ordered by their lowest ID; IDs within a group ascend.
func groupImagesByPhashDistance(ctx context.Context, hashes []parsedImagePhash, distance int) ([][]int, error) {
	// A Hamming distance is never negative, so nothing can match.
	if distance < 0 {
		return nil, nil
	}

	// parent is a union-find forest keyed by image ID. Only IDs that took
	// part in at least one match are ever inserted.
	parent := make(map[int]int)
	find := func(id int) int {
		if _, ok := parent[id]; !ok {
			parent[id] = id
		}
		for parent[id] != id {
			// Path halving keeps the trees shallow.
			parent[id] = parent[parent[id]]
			id = parent[id]
		}
		return id
	}
	union := func(a, b int) {
		if ra, rb := find(a), find(b); ra != rb {
			parent[ra] = rb
		}
	}

	if distance == 0 {
		// Exact matches only need bucketing by hash value, which avoids the
		// quadratic pairwise scan.
		firstWithHash := make(map[uint64]int, len(hashes))
		for _, h := range hashes {
			if id, ok := firstWithHash[h.hash]; ok {
				union(id, h.id)
			} else {
				firstWithHash[h.hash] = h.id
			}
		}
	} else {
		for i := range hashes {
			// The pairwise scan is O(N^2); stop early if the caller gave up.
			if err := ctx.Err(); err != nil {
				return nil, err
			}
			for j := i + 1; j < len(hashes); j++ {
				if bits.OnesCount64(hashes[i].hash^hashes[j].hash) <= distance {
					union(hashes[i].id, hashes[j].id)
				}
			}
		}
	}

	// Walk the IDs in ascending order so that both the groups and their
	// members come out in a stable order regardless of map iteration order.
	ids := make([]int, 0, len(parent))
	for id := range parent {
		ids = append(ids, id)
	}
	slices.Sort(ids)

	members := make(map[int][]int)
	var roots []int
	for _, id := range ids {
		root := find(id)
		if _, seen := members[root]; !seen {
			roots = append(roots, root)
		}
		members[root] = append(members[root], id)
	}

	groups := make([][]int, 0, len(roots))
	for _, root := range roots {
		// An image that only matched itself (e.g. via two of its own files)
		// is not a duplicate group.
		if len(members[root]) > 1 {
			groups = append(groups, members[root])
		}
	}

	return groups, nil
}

View file

@ -35,3 +35,9 @@ query FindImage($id: ID!, $checksum: String) {
...ImageData
}
}
# Fetches groups of duplicate images. The result is a list of groups; the
# ImageData selection applies to every image inside each group.
# $distance is optional here; the value is passed straight through to the
# findDuplicateImages field.
query FindDuplicateImages($distance: Int) {
findDuplicateImages(distance: $distance) {
...ImageData
}
}

View file

@ -82,6 +82,9 @@ const SceneFilenameParser = lazyComponent(
const SceneDuplicateChecker = lazyComponent(
() => import("./components/SceneDuplicateChecker/SceneDuplicateChecker")
);
// Image duplicate checker page, loaded through a dynamic import() wrapped in
// lazyComponent in the same way as the scene duplicate checker above.
const ImageDuplicateChecker = lazyComponent(
() => import("./components/ImageDuplicateChecker/ImageDuplicateChecker")
);
const appleRendering = isPlatformUniquelyRenderedByApple();
@ -269,6 +272,10 @@ export const App: React.FC = () => {
path="/sceneDuplicateChecker"
component={SceneDuplicateChecker}
/>
<Route
path="/imageDuplicateChecker"
component={ImageDuplicateChecker}
/>
<Route path="/setup" component={Setup} />
<Route path="/migrate" component={Migrate} />
<PluginRoutes />

View file

@ -0,0 +1,114 @@
import React, { useState } from "react";
import { Button, Form, Spinner } from "react-bootstrap";
import { FormattedMessage } from "react-intl";
import { useFindDuplicateImagesQuery } from "src/core/generated-graphql";
import { PatchContainerComponent } from "src/patch";
// Container wrapping the whole tool UI, registered under the name
// "ImageDuplicateCheckerSection".
// NOTE(review): presumably this lets plugins patch the section via src/patch;
// confirm against PatchContainerComponent.
const ImageDuplicateCheckerSection = PatchContainerComponent(
"ImageDuplicateCheckerSection"
);
// Upper bound for the distance input; mirrors the `max` attribute on the field.
const MAX_PHASH_DISTANCE = 10;

/**
 * Tool page that searches for duplicate images by perceptual hash distance
 * and renders each group of duplicates as a row of thumbnails.
 *
 * A search is only issued when the form is submitted (Search button or Enter);
 * editing the distance field alone never triggers a request.
 */
const ImageDuplicateChecker: React.FC = () => {
  // Value currently shown in the input field.
  const [distance, setDistance] = useState(0);
  // Distance used by the last submitted search; undefined until the first
  // search. Keeping it separate from `distance` stops the query from re-running
  // every time the input changes.
  const [searchDistance, setSearchDistance] = useState<number | undefined>(
    undefined
  );
  // Tracks an explicit refetch, which does not flip `loading` by itself.
  const [isSearching, setIsSearching] = useState(false);

  const hasSearched = searchDistance !== undefined;

  const { data, loading, error, refetch } = useFindDuplicateImagesQuery({
    variables: { distance: searchDistance ?? 0 },
    skip: !hasSearched,
    fetchPolicy: "network-only",
  });

  const busy = isSearching || loading;

  const handleSearch = () => {
    if (busy) return;

    if (searchDistance !== distance) {
      // Changing the query variables (or leaving the skipped state) makes the
      // hook run the query; no explicit refetch is needed.
      setSearchDistance(distance);
      return;
    }

    // Same distance as last time: force a fresh request.
    setIsSearching(true);
    refetch()
      // Failures are surfaced through `error` from the hook.
      .catch(() => undefined)
      .finally(() => setIsSearching(false));
  };

  const handleSubmit = (e: React.FormEvent<HTMLFormElement>) => {
    // Prevent the browser's native form submission, which would reload the page.
    e.preventDefault();
    handleSearch();
  };

  const handleDistanceChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const parsed = parseInt(e.target.value, 10);
    // Empty or invalid input falls back to 0; typed values are clamped to the
    // range the field advertises.
    const clamped = Number.isNaN(parsed)
      ? 0
      : Math.min(MAX_PHASH_DISTANCE, Math.max(0, parsed));
    setDistance(clamped);
  };

  const results = data?.findDuplicateImages ?? [];

  return (
    <div className="row image-duplicate-checker">
      <div className="col-md-12">
        <ImageDuplicateCheckerSection>
          <h3>
            <FormattedMessage id="config.tools.image_duplicate_checker" />
          </h3>
          <Form
            className="d-flex align-items-end mb-4"
            onSubmit={handleSubmit}
          >
            <Form.Group controlId="distanceInput" className="mb-0 me-3">
              <Form.Label>PHash Distance</Form.Label>
              <Form.Control
                type="number"
                value={distance}
                min={0}
                max={MAX_PHASH_DISTANCE}
                onChange={handleDistanceChange}
              />
              <Form.Text className="text-muted">
                Distance 0 means exact matches.
              </Form.Text>
            </Form.Group>
            <Button variant="primary" type="submit" disabled={busy}>
              {busy ? <Spinner animation="border" size="sm" /> : "Search"}
            </Button>
          </Form>
          {error && (
            <div className="text-danger mb-4">Error: {error.message}</div>
          )}
          {hasSearched && !busy && !error && results.length === 0 && (
            <p>No duplicates found.</p>
          )}
          {results.map((group, index) => {
            // A group with fewer than two images is not a duplicate group.
            if (!group || group.length < 2) return null;
            return (
              <div
                key={index}
                className="duplicate-group mb-4 pb-4 border-bottom"
              >
                <h5>Group {index + 1}</h5>
                <div className="d-flex flex-wrap gap-3">
                  {group.map((img) => (
                    <div key={img.id} className="border p-2 rounded">
                      <img
                        src={img.paths.thumbnail || ""}
                        alt={img.title || img.id}
                        style={{
                          maxWidth: "200px",
                          maxHeight: "200px",
                          objectFit: "contain",
                        }}
                      />
                      <div
                        className="mt-2 text-center text-truncate"
                        style={{ maxWidth: "200px" }}
                        title={img.title || img.id}
                      >
                        {img.title || img.id}
                      </div>
                    </div>
                  ))}
                </div>
              </div>
            );
          })}
        </ImageDuplicateCheckerSection>
      </div>
    </div>
  );
};
export default ImageDuplicateChecker;

View file

@ -48,6 +48,20 @@ export const SettingsToolsPanel: React.FC = () => {
/>
</SettingsToolsSection>
</SettingSection>
<SettingSection headingID="config.tools.image_tools">
<SettingsToolsSection>
<Setting
heading={
<Link to="/imageDuplicateChecker">
<Button>
<FormattedMessage id="config.tools.image_duplicate_checker" />
</Button>
</Link>
}
/>
</SettingsToolsSection>
</SettingSection>
</>
);
};

View file

@ -643,7 +643,9 @@
"whitespace_chars": "Whitespace characters",
"whitespace_chars_desc": "These characters will be replaced with whitespace in the title"
},
"scene_tools": "Scene Tools"
"scene_tools": "Scene Tools",
"image_tools": "Image Tools",
"image_duplicate_checker": "Image Duplicate Checker"
},
"ui": {
"abbreviate_counters": {

View file

@ -9,7 +9,9 @@
"tools": {
"scene_filename_parser": {
"ignore_organized": "Ignore organized scenes"
}
},
"image_tools": "Image Tools",
"image_duplicate_checker": "Image Duplicate Checker"
},
"ui": {
"custom_locales": {