mirror of
https://github.com/stashapp/stash.git
synced 2026-04-19 13:31:15 +02:00
feat: improve Image Duplicate Checker implementation
This change unifies the duplicate detection logic by leveraging the shared phash utility. It also enhances the UI with:
- Pagination for large result sets.
- Sorting duplicate groups by total file size.
- A more detailed table view with image thumbnails, paths, and dimensions.
- Consistency with the existing Scene Duplicate Checker tool.
This commit is contained in:
parent
3b1fccb010
commit
af75c8c1b4
3 changed files with 200 additions and 171 deletions
|
|
@ -1096,10 +1096,6 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
|
|||
}
|
||||
|
||||
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
return qb.findPhashMatches(ctx, distance)
|
||||
}
|
||||
|
||||
func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*models.Image, error) {
|
||||
query := `
|
||||
SELECT images.id, files_fingerprints.fingerprint as phash
|
||||
FROM images
|
||||
|
|
@ -1107,88 +1103,20 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*
|
|||
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
|
||||
WHERE files_fingerprints.type = 'phash'`
|
||||
|
||||
type ImagePhash struct {
|
||||
ID int `db:"id"`
|
||||
PHash string `db:"phash"`
|
||||
}
|
||||
|
||||
var hashes []ImagePhash
|
||||
var hashes []*utils.Phash
|
||||
err := imageRepository.queryStruct(ctx, query, nil, &hashes)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Parse hashes
|
||||
type ParsedPhash struct {
|
||||
ID int
|
||||
PHash uint64
|
||||
}
|
||||
var parsedHashes []ParsedPhash
|
||||
for _, h := range hashes {
|
||||
val, parseErr := strconv.ParseUint(h.PHash, 16, 64)
|
||||
if parseErr == nil {
|
||||
parsedHashes = append(parsedHashes, ParsedPhash{ID: h.ID, PHash: val})
|
||||
}
|
||||
h.Bucket = -1
|
||||
}
|
||||
|
||||
// Helper for Popcount
|
||||
popcount := func(x uint64) int {
|
||||
x -= (x >> 1) & 0x5555555555555555
|
||||
x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
|
||||
x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
|
||||
return int((x * 0x0101010101010101) >> 56)
|
||||
}
|
||||
dupeIds := utils.FindDuplicates(hashes, distance, -1)
|
||||
|
||||
// Adjacency list for connected components
|
||||
adj := make(map[int][]int)
|
||||
nodes := make(map[int]bool)
|
||||
|
||||
// O(N^2) comparison in memory
|
||||
for i := 0; i < len(parsedHashes); i++ {
|
||||
for j := i + 1; j < len(parsedHashes); j++ {
|
||||
diff := popcount(parsedHashes[i].PHash ^ parsedHashes[j].PHash)
|
||||
if diff <= distance {
|
||||
id1 := parsedHashes[i].ID
|
||||
id2 := parsedHashes[j].ID
|
||||
adj[id1] = append(adj[id1], id2)
|
||||
adj[id2] = append(adj[id2], id1)
|
||||
nodes[id1] = true
|
||||
nodes[id2] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find connected components
|
||||
visited := make(map[int]bool)
|
||||
var components [][]int
|
||||
|
||||
for node := range nodes {
|
||||
if !visited[node] {
|
||||
var component []int
|
||||
queue := []int{node}
|
||||
visited[node] = true
|
||||
|
||||
for len(queue) > 0 {
|
||||
curr := queue[0]
|
||||
queue = queue[1:]
|
||||
component = append(component, curr)
|
||||
|
||||
for _, neighbor := range adj[curr] {
|
||||
if !visited[neighbor] {
|
||||
visited[neighbor] = true
|
||||
queue = append(queue, neighbor)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(component) > 1 {
|
||||
components = append(components, component)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch actual image objects
|
||||
var result [][]*models.Image
|
||||
for _, comp := range components {
|
||||
for _, comp := range dupeIds {
|
||||
var group []*models.Image
|
||||
for _, id := range comp {
|
||||
img, err := qb.Find(ctx, id)
|
||||
|
|
@ -1203,3 +1131,4 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*
|
|||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,27 +9,27 @@ import (
|
|||
)
|
||||
|
||||
type Phash struct {
|
||||
SceneID int `db:"id"`
|
||||
Hash int64 `db:"phash"`
|
||||
ID int `db:"id"`
|
||||
Hash int64 `db:"phash"`
|
||||
Duration float64 `db:"duration"`
|
||||
Neighbors []int
|
||||
Bucket int
|
||||
}
|
||||
|
||||
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
|
||||
for i, scene := range hashes {
|
||||
sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash)
|
||||
for i, subject := range hashes {
|
||||
subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash)
|
||||
for j, neighbor := range hashes {
|
||||
if i != j && scene.SceneID != neighbor.SceneID {
|
||||
if i != j && subject.ID != neighbor.ID {
|
||||
neighbourDurationDistance := 0.
|
||||
if scene.Duration > 0 && neighbor.Duration > 0 {
|
||||
neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration)
|
||||
if subject.Duration > 0 && neighbor.Duration > 0 {
|
||||
neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration)
|
||||
}
|
||||
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
|
||||
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
|
||||
neighborDistance, _ := sceneHash.Distance(neighborHash)
|
||||
neighborDistance, _ := subjectHash.Distance(neighborHash)
|
||||
if neighborDistance <= distance {
|
||||
scene.Neighbors = append(scene.Neighbors, j)
|
||||
subject.Neighbors = append(subject.Neighbors, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -37,15 +37,15 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
|
|||
}
|
||||
|
||||
var buckets [][]int
|
||||
for _, scene := range hashes {
|
||||
if len(scene.Neighbors) > 0 && scene.Bucket == -1 {
|
||||
for _, subject := range hashes {
|
||||
if len(subject.Neighbors) > 0 && subject.Bucket == -1 {
|
||||
bucket := len(buckets)
|
||||
scenes := []int{scene.SceneID}
|
||||
scene.Bucket = bucket
|
||||
findNeighbors(bucket, scene.Neighbors, hashes, &scenes)
|
||||
ids := []int{subject.ID}
|
||||
subject.Bucket = bucket
|
||||
findNeighbors(bucket, subject.Neighbors, hashes, &ids)
|
||||
|
||||
if len(scenes) > 1 {
|
||||
buckets = append(buckets, scenes)
|
||||
if len(ids) > 1 {
|
||||
buckets = append(buckets, ids)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -53,13 +53,13 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
|
|||
return buckets
|
||||
}
|
||||
|
||||
func findNeighbors(bucket int, neighbors []int, hashes []*Phash, scenes *[]int) {
|
||||
func findNeighbors(bucket int, neighbors []int, hashes []*Phash, ids *[]int) {
|
||||
for _, id := range neighbors {
|
||||
hash := hashes[id]
|
||||
if hash.Bucket == -1 {
|
||||
hash.Bucket = bucket
|
||||
*scenes = sliceutil.AppendUnique(*scenes, hash.SceneID)
|
||||
findNeighbors(bucket, hash.Neighbors, hashes, scenes)
|
||||
*ids = sliceutil.AppendUnique(*ids, hash.ID)
|
||||
findNeighbors(bucket, hash.Neighbors, hashes, ids)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,21 +1,40 @@
|
|||
import React, { useState } from "react";
|
||||
import { Button, Form, Spinner } from "react-bootstrap";
|
||||
import { FormattedMessage } from "react-intl";
|
||||
import React, { useMemo, useState } from "react";
|
||||
import {
|
||||
Button,
|
||||
Form,
|
||||
Spinner,
|
||||
Table,
|
||||
Row,
|
||||
Col,
|
||||
Card,
|
||||
} from "react-bootstrap";
|
||||
import { FormattedMessage, useIntl } from "react-intl";
|
||||
import { useFindDuplicateImagesQuery } from "src/core/generated-graphql";
|
||||
import * as GQL from "src/core/generated-graphql";
|
||||
import { PatchContainerComponent } from "src/patch";
|
||||
import { LoadingIndicator } from "../Shared/LoadingIndicator";
|
||||
import { ErrorMessage } from "../Shared/ErrorMessage";
|
||||
import { FileSize } from "../Shared/FileSize";
|
||||
import { Pagination } from "src/components/List/Pagination";
|
||||
import { useHistory } from "react-router-dom";
|
||||
|
||||
const ImageDuplicateCheckerSection = PatchContainerComponent(
|
||||
"ImageDuplicateCheckerSection"
|
||||
);
|
||||
|
||||
const ImageDuplicateChecker: React.FC = () => {
|
||||
const [distance, setDistance] = useState(0);
|
||||
const intl = useIntl();
|
||||
const history = useHistory();
|
||||
const query = new URLSearchParams(history.location.search);
|
||||
const currentPage = Number.parseInt(query.get("page") ?? "1", 10);
|
||||
const pageSize = Number.parseInt(query.get("size") ?? "20", 10);
|
||||
const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10);
|
||||
|
||||
const [isSearching, setIsSearching] = useState(false);
|
||||
const [hasSearched, setHasSearched] = useState(false);
|
||||
|
||||
// We lazily fetch the query only when "Search" is clicked
|
||||
const { data, loading, error, refetch } = useFindDuplicateImagesQuery({
|
||||
variables: { distance },
|
||||
variables: { distance: hashDistance },
|
||||
skip: !hasSearched,
|
||||
fetchPolicy: "network-only",
|
||||
});
|
||||
|
|
@ -23,90 +42,171 @@ const ImageDuplicateChecker: React.FC = () => {
|
|||
const handleSearch = () => {
|
||||
setIsSearching(true);
|
||||
setHasSearched(true);
|
||||
refetch({ distance }).finally(() => setIsSearching(false));
|
||||
refetch({ distance: hashDistance }).finally(() => setIsSearching(false));
|
||||
};
|
||||
|
||||
const results = data?.findDuplicateImages ?? [];
|
||||
const getGroupTotalSize = (group: GQL.ImageDataFragment[]) => {
|
||||
return group.reduce((groupTotal, img) => {
|
||||
const imgTotal = img.visual_files.reduce(
|
||||
(fileTotal, file) => fileTotal + (file.size ?? 0),
|
||||
0
|
||||
);
|
||||
return groupTotal + imgTotal;
|
||||
}, 0);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="row image-duplicate-checker">
|
||||
<div className="col-md-12">
|
||||
<ImageDuplicateCheckerSection>
|
||||
<h3>
|
||||
<FormattedMessage id="config.tools.image_duplicate_checker" />
|
||||
</h3>
|
||||
<Form className="d-flex align-items-end mb-4">
|
||||
<Form.Group controlId="distanceInput" className="mb-0 me-3">
|
||||
<Form.Label>PHash Distance</Form.Label>
|
||||
<Form.Control
|
||||
type="number"
|
||||
value={distance}
|
||||
min={0}
|
||||
max={10}
|
||||
onChange={(e) => setDistance(parseInt(e.target.value) || 0)}
|
||||
/>
|
||||
<Form.Text className="text-muted">
|
||||
Distance 0 means exact matches.
|
||||
</Form.Text>
|
||||
</Form.Group>
|
||||
const allGroups = useMemo(() => {
|
||||
const groups = data?.findDuplicateImages ?? [];
|
||||
return [...groups].sort((a, b) => {
|
||||
return getGroupTotalSize(b) - getGroupTotalSize(a);
|
||||
});
|
||||
}, [data?.findDuplicateImages]);
|
||||
|
||||
<Button
|
||||
variant="primary"
|
||||
onClick={handleSearch}
|
||||
disabled={isSearching || loading}
|
||||
>
|
||||
{isSearching || loading ? (
|
||||
<Spinner animation="border" size="sm" />
|
||||
) : (
|
||||
"Search"
|
||||
)}
|
||||
</Button>
|
||||
</Form>
|
||||
const pagedGroups = useMemo(() => {
|
||||
const start = (currentPage - 1) * pageSize;
|
||||
return allGroups.slice(start, start + pageSize);
|
||||
}, [allGroups, currentPage, pageSize]);
|
||||
|
||||
{error && (
|
||||
<div className="text-danger mb-4">Error: {error.message}</div>
|
||||
)}
|
||||
if (error) return <ErrorMessage error={error.message} />;
|
||||
|
||||
{hasSearched && !loading && !error && results.length === 0 && (
|
||||
<p>No duplicates found.</p>
|
||||
)}
|
||||
|
||||
{results.map((group, index) => {
|
||||
if (!group || group.length < 2) return null;
|
||||
return (
|
||||
<div
|
||||
key={index}
|
||||
className="duplicate-group mb-4 pb-4 border-bottom"
|
||||
>
|
||||
<h5>Group {index + 1}</h5>
|
||||
{/* ImageList requires an array of items with proper types. We map it nicely. */}
|
||||
<div className="d-flex flex-wrap gap-3">
|
||||
{group.map((img) => (
|
||||
<div key={img.id} className="border p-2 rounded">
|
||||
const renderGroup = (group: GQL.ImageDataFragment[], index: number) => {
|
||||
const groupIndex = (currentPage - 1) * pageSize + index + 1;
|
||||
return (
|
||||
<Card key={groupIndex} className="mb-4">
|
||||
<Card.Header className="d-flex justify-content-between align-items-center">
|
||||
<h5>Group {groupIndex}</h5>
|
||||
<span className="text-muted">
|
||||
Total Size: <FileSize size={getGroupTotalSize(group)} />
|
||||
</span>
|
||||
</Card.Header>
|
||||
<Card.Body>
|
||||
<Table striped bordered hover responsive size="sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th style={{ width: "150px" }}>Image</th>
|
||||
<th>Details</th>
|
||||
<th style={{ width: "120px" }}>Size</th>
|
||||
<th style={{ width: "150px" }}>Dimensions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{group.map((img) => {
|
||||
const file = img.visual_files[0];
|
||||
return (
|
||||
<tr key={img.id}>
|
||||
<td>
|
||||
<img
|
||||
src={img.paths.thumbnail || ""}
|
||||
alt={img.title || img.id}
|
||||
style={{
|
||||
maxWidth: "200px",
|
||||
maxHeight: "200px",
|
||||
maxWidth: "120px",
|
||||
maxHeight: "120px",
|
||||
objectFit: "contain",
|
||||
}}
|
||||
/>
|
||||
<div
|
||||
className="mt-2 text-center text-truncate"
|
||||
style={{ maxWidth: "200px" }}
|
||||
title={img.title || img.id}
|
||||
>
|
||||
{img.title || img.id}
|
||||
</td>
|
||||
<td>
|
||||
<div className="fw-bold">{img.title || "(No Title)"}</div>
|
||||
<div className="text-muted small text-truncate" style={{ maxWidth: "400px" }}>
|
||||
{img.visual_files[0]?.path}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</ImageDuplicateCheckerSection>
|
||||
</div>
|
||||
<div className="mt-1 small">ID: {img.id}</div>
|
||||
</td>
|
||||
<td>
|
||||
<FileSize size={file?.size ?? 0} />
|
||||
</td>
|
||||
<td>
|
||||
{file?.__typename === "ImageFile" || file?.__typename === "VideoFile" ? (
|
||||
<>
|
||||
{file.width} x {file.height}
|
||||
</>
|
||||
) : (
|
||||
"N/A"
|
||||
)}
|
||||
</td>
|
||||
</tr>
|
||||
);
|
||||
})}
|
||||
</tbody>
|
||||
</Table>
|
||||
</Card.Body>
|
||||
</Card>
|
||||
);
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="container-fluid py-4">
|
||||
<ImageDuplicateCheckerSection>
|
||||
<Row className="mb-4">
|
||||
<Col>
|
||||
<h3>
|
||||
<FormattedMessage id="config.tools.image_duplicate_checker" />
|
||||
</h3>
|
||||
</Col>
|
||||
</Row>
|
||||
|
||||
<Form className="bg-light p-3 rounded mb-4 shadow-sm">
|
||||
<Row className="align-items-end">
|
||||
<Col md={3}>
|
||||
<Form.Group controlId="distanceInput">
|
||||
<Form.Label>PHash Distance</Form.Label>
|
||||
<Form.Control
|
||||
type="number"
|
||||
value={hashDistance}
|
||||
min={0}
|
||||
max={10}
|
||||
onChange={(e) => {
|
||||
const val = parseInt(e.target.value) || 0;
|
||||
query.set("distance", val.toString());
|
||||
history.push({ search: query.toString() });
|
||||
}}
|
||||
/>
|
||||
<Form.Text className="text-muted small">
|
||||
0 = exact matches.
|
||||
</Form.Text>
|
||||
</Form.Group>
|
||||
</Col>
|
||||
<Col md={2}>
|
||||
<Button
|
||||
variant="primary"
|
||||
className="w-100"
|
||||
onClick={handleSearch}
|
||||
disabled={isSearching || loading}
|
||||
>
|
||||
{isSearching || loading ? (
|
||||
<Spinner animation="border" size="sm" />
|
||||
) : (
|
||||
"Search"
|
||||
)}
|
||||
</Button>
|
||||
</Col>
|
||||
</Row>
|
||||
</Form>
|
||||
|
||||
{loading && <LoadingIndicator />}
|
||||
|
||||
{hasSearched && !loading && !error && allGroups.length === 0 && (
|
||||
<div className="text-center py-5 border rounded bg-light">
|
||||
<p className="mb-0">No duplicates found with the current distance.</p>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{pagedGroups.map((group, index) => renderGroup(group, index))}
|
||||
|
||||
{allGroups.length > pageSize && (
|
||||
<div className="d-flex justify-content-center mt-4">
|
||||
<Pagination
|
||||
currentPage={currentPage}
|
||||
totalItems={allGroups.length}
|
||||
pageSize={pageSize}
|
||||
onChangePage={(page) => {
|
||||
query.set("page", page.toString());
|
||||
history.push({ search: query.toString() });
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</ImageDuplicateCheckerSection>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in a new issue