feat: improve Image Duplicate Checker implementation

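This change unifies the duplicate detection logic: the image store now calls the shared phash utility (`utils.FindDuplicates`) instead of its own popcount and connected-components code, and the utility's `Phash.SceneID` field is renamed to `ID` so the same struct can describe images as well as scenes. It also enhances the UI with: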
- Pagination for large result sets.
- Sorting duplicate groups by total file size.
- A more detailed table view with image thumbnails, paths, and dimensions.
- Consistency with the existing Scene Duplicate Checker tool.
This commit is contained in:
notsafeforgit 2026-03-13 15:35:29 -07:00
parent 3b1fccb010
commit af75c8c1b4
3 changed files with 200 additions and 171 deletions

View file

@ -1096,10 +1096,6 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error
}
// FindDuplicates returns groups of images linked by phash matches at or below
// distance. It is the exported entry point only: the fingerprint lookup and
// the grouping are performed by findPhashMatches, whose result and error are
// passed through unchanged.
func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) {
	groups, err := qb.findPhashMatches(ctx, distance)
	return groups, err
}
func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*models.Image, error) {
query := `
SELECT images.id, files_fingerprints.fingerprint as phash
FROM images
@ -1107,88 +1103,20 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*
JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id
WHERE files_fingerprints.type = 'phash'`
type ImagePhash struct {
ID int `db:"id"`
PHash string `db:"phash"`
}
var hashes []ImagePhash
var hashes []*utils.Phash
err := imageRepository.queryStruct(ctx, query, nil, &hashes)
if err != nil {
return nil, err
}
// Parse hashes
type ParsedPhash struct {
ID int
PHash uint64
}
var parsedHashes []ParsedPhash
for _, h := range hashes {
val, parseErr := strconv.ParseUint(h.PHash, 16, 64)
if parseErr == nil {
parsedHashes = append(parsedHashes, ParsedPhash{ID: h.ID, PHash: val})
}
h.Bucket = -1
}
// Helper for Popcount
popcount := func(x uint64) int {
x -= (x >> 1) & 0x5555555555555555
x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333)
x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f
return int((x * 0x0101010101010101) >> 56)
}
dupeIds := utils.FindDuplicates(hashes, distance, -1)
// Adjacency list for connected components
adj := make(map[int][]int)
nodes := make(map[int]bool)
// O(N^2) comparison in memory
for i := 0; i < len(parsedHashes); i++ {
for j := i + 1; j < len(parsedHashes); j++ {
diff := popcount(parsedHashes[i].PHash ^ parsedHashes[j].PHash)
if diff <= distance {
id1 := parsedHashes[i].ID
id2 := parsedHashes[j].ID
adj[id1] = append(adj[id1], id2)
adj[id2] = append(adj[id2], id1)
nodes[id1] = true
nodes[id2] = true
}
}
}
// Find connected components
visited := make(map[int]bool)
var components [][]int
for node := range nodes {
if !visited[node] {
var component []int
queue := []int{node}
visited[node] = true
for len(queue) > 0 {
curr := queue[0]
queue = queue[1:]
component = append(component, curr)
for _, neighbor := range adj[curr] {
if !visited[neighbor] {
visited[neighbor] = true
queue = append(queue, neighbor)
}
}
}
if len(component) > 1 {
components = append(components, component)
}
}
}
// Fetch actual image objects
var result [][]*models.Image
for _, comp := range components {
for _, comp := range dupeIds {
var group []*models.Image
for _, id := range comp {
img, err := qb.Find(ctx, id)
@ -1203,3 +1131,4 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*
return result, nil
}

View file

@ -9,27 +9,27 @@ import (
)
// Phash is one perceptual-hash record processed by FindDuplicates.
// The identifier, hash and duration are scanned from the "id", "phash" and
// "duration" result columns respectively. A Duration that is not greater
// than zero is treated as unknown: FindDuplicates only computes a duration
// difference when both sides have a positive Duration.
type Phash struct {
SceneID int `db:"id"`
Hash int64 `db:"phash"`
ID int `db:"id"`
Hash int64 `db:"phash"`
Duration float64 `db:"duration"`
// Neighbors holds positions in the slice passed to FindDuplicates (not IDs)
// of the other entries found to be within the requested distance.
Neighbors []int
// Bucket is the index of the duplicate group this entry was assigned to.
// -1 means "not assigned yet"; FindDuplicates only starts or extends a
// group from entries whose Bucket is -1, so callers must set it to -1
// before calling (the zero value 0 would read as already assigned).
Bucket int
}
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
for i, scene := range hashes {
sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash)
for i, subject := range hashes {
subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash)
for j, neighbor := range hashes {
if i != j && scene.SceneID != neighbor.SceneID {
if i != j && subject.ID != neighbor.ID {
neighbourDurationDistance := 0.
if scene.Duration > 0 && neighbor.Duration > 0 {
neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration)
if subject.Duration > 0 && neighbor.Duration > 0 {
neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration)
}
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
neighborDistance, _ := sceneHash.Distance(neighborHash)
neighborDistance, _ := subjectHash.Distance(neighborHash)
if neighborDistance <= distance {
scene.Neighbors = append(scene.Neighbors, j)
subject.Neighbors = append(subject.Neighbors, j)
}
}
}
@ -37,15 +37,15 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
}
var buckets [][]int
for _, scene := range hashes {
if len(scene.Neighbors) > 0 && scene.Bucket == -1 {
for _, subject := range hashes {
if len(subject.Neighbors) > 0 && subject.Bucket == -1 {
bucket := len(buckets)
scenes := []int{scene.SceneID}
scene.Bucket = bucket
findNeighbors(bucket, scene.Neighbors, hashes, &scenes)
ids := []int{subject.ID}
subject.Bucket = bucket
findNeighbors(bucket, subject.Neighbors, hashes, &ids)
if len(scenes) > 1 {
buckets = append(buckets, scenes)
if len(ids) > 1 {
buckets = append(buckets, ids)
}
}
}
@ -53,13 +53,13 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int
return buckets
}
// findNeighbors recursively collects the members of one duplicate group.
// neighbors contains positions in hashes (not IDs). Every referenced entry
// whose Bucket is still -1 is claimed for bucket, its identifier is appended
// to the accumulator slice (without duplicates), and the walk continues
// through that entry's own Neighbors. Entries that already have a bucket are
// skipped, which is what terminates the recursion on cyclic neighbor links.
func findNeighbors(bucket int, neighbors []int, hashes []*Phash, scenes *[]int) {
func findNeighbors(bucket int, neighbors []int, hashes []*Phash, ids *[]int) {
for _, id := range neighbors {
// id is an index into hashes, not a scene/image ID.
hash := hashes[id]
if hash.Bucket == -1 {
// Mark before recursing so the entry is not visited again.
hash.Bucket = bucket
*scenes = sliceutil.AppendUnique(*scenes, hash.SceneID)
findNeighbors(bucket, hash.Neighbors, hashes, scenes)
*ids = sliceutil.AppendUnique(*ids, hash.ID)
findNeighbors(bucket, hash.Neighbors, hashes, ids)
}
}
}

View file

@ -1,21 +1,40 @@
import React, { useState } from "react";
import { Button, Form, Spinner } from "react-bootstrap";
import { FormattedMessage } from "react-intl";
import React, { useMemo, useState } from "react";
import {
Button,
Form,
Spinner,
Table,
Row,
Col,
Card,
} from "react-bootstrap";
import { FormattedMessage, useIntl } from "react-intl";
import { useFindDuplicateImagesQuery } from "src/core/generated-graphql";
import * as GQL from "src/core/generated-graphql";
import { PatchContainerComponent } from "src/patch";
import { LoadingIndicator } from "../Shared/LoadingIndicator";
import { ErrorMessage } from "../Shared/ErrorMessage";
import { FileSize } from "../Shared/FileSize";
import { Pagination } from "src/components/List/Pagination";
import { useHistory } from "react-router-dom";
const ImageDuplicateCheckerSection = PatchContainerComponent(
"ImageDuplicateCheckerSection"
);
const ImageDuplicateChecker: React.FC = () => {
const [distance, setDistance] = useState(0);
const intl = useIntl();
const history = useHistory();
const query = new URLSearchParams(history.location.search);
const currentPage = Number.parseInt(query.get("page") ?? "1", 10);
const pageSize = Number.parseInt(query.get("size") ?? "20", 10);
const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10);
const [isSearching, setIsSearching] = useState(false);
const [hasSearched, setHasSearched] = useState(false);
// We lazily fetch the query only when "Search" is clicked
const { data, loading, error, refetch } = useFindDuplicateImagesQuery({
variables: { distance },
variables: { distance: hashDistance },
skip: !hasSearched,
fetchPolicy: "network-only",
});
@ -23,90 +42,171 @@ const ImageDuplicateChecker: React.FC = () => {
// Click handler for the "Search" button. Flags the search as in progress,
// sets hasSearched (the query above is configured with skip: !hasSearched),
// and refetches with the current distance. isSearching is cleared once the
// refetch promise settles, whether it resolved or rejected.
const handleSearch = () => {
setIsSearching(true);
setHasSearched(true);
refetch({ distance }).finally(() => setIsSearching(false));
refetch({ distance: hashDistance }).finally(() => setIsSearching(false));
};
const results = data?.findDuplicateImages ?? [];
// Returns the combined size of a duplicate group: for each image, the sizes of
// all of its visual_files are added up, and the per-image totals are summed.
// A file whose size is null or undefined contributes 0.
const getGroupTotalSize = (group: GQL.ImageDataFragment[]) => {
  let total = 0;
  for (const image of group) {
    let imageTotal = 0;
    for (const visualFile of image.visual_files) {
      imageTotal = imageTotal + (visualFile.size ?? 0);
    }
    total = total + imageTotal;
  }
  return total;
};
return (
<div className="row image-duplicate-checker">
<div className="col-md-12">
<ImageDuplicateCheckerSection>
<h3>
<FormattedMessage id="config.tools.image_duplicate_checker" />
</h3>
<Form className="d-flex align-items-end mb-4">
<Form.Group controlId="distanceInput" className="mb-0 me-3">
<Form.Label>PHash Distance</Form.Label>
<Form.Control
type="number"
value={distance}
min={0}
max={10}
onChange={(e) => setDistance(parseInt(e.target.value) || 0)}
/>
<Form.Text className="text-muted">
Distance 0 means exact matches.
</Form.Text>
</Form.Group>
const allGroups = useMemo(() => {
const groups = data?.findDuplicateImages ?? [];
return [...groups].sort((a, b) => {
return getGroupTotalSize(b) - getGroupTotalSize(a);
});
}, [data?.findDuplicateImages]);
<Button
variant="primary"
onClick={handleSearch}
disabled={isSearching || loading}
>
{isSearching || loading ? (
<Spinner animation="border" size="sm" />
) : (
"Search"
)}
</Button>
</Form>
const pagedGroups = useMemo(() => {
const start = (currentPage - 1) * pageSize;
return allGroups.slice(start, start + pageSize);
}, [allGroups, currentPage, pageSize]);
{error && (
<div className="text-danger mb-4">Error: {error.message}</div>
)}
if (error) return <ErrorMessage error={error.message} />;
{hasSearched && !loading && !error && results.length === 0 && (
<p>No duplicates found.</p>
)}
{results.map((group, index) => {
if (!group || group.length < 2) return null;
return (
<div
key={index}
className="duplicate-group mb-4 pb-4 border-bottom"
>
<h5>Group {index + 1}</h5>
{/* ImageList requires an array of items with proper types. We map it nicely. */}
<div className="d-flex flex-wrap gap-3">
{group.map((img) => (
<div key={img.id} className="border p-2 rounded">
const renderGroup = (group: GQL.ImageDataFragment[], index: number) => {
const groupIndex = (currentPage - 1) * pageSize + index + 1;
return (
<Card key={groupIndex} className="mb-4">
<Card.Header className="d-flex justify-content-between align-items-center">
<h5>Group {groupIndex}</h5>
<span className="text-muted">
Total Size: <FileSize size={getGroupTotalSize(group)} />
</span>
</Card.Header>
<Card.Body>
<Table striped bordered hover responsive size="sm">
<thead>
<tr>
<th style={{ width: "150px" }}>Image</th>
<th>Details</th>
<th style={{ width: "120px" }}>Size</th>
<th style={{ width: "150px" }}>Dimensions</th>
</tr>
</thead>
<tbody>
{group.map((img) => {
const file = img.visual_files[0];
return (
<tr key={img.id}>
<td>
<img
src={img.paths.thumbnail || ""}
alt={img.title || img.id}
style={{
maxWidth: "200px",
maxHeight: "200px",
maxWidth: "120px",
maxHeight: "120px",
objectFit: "contain",
}}
/>
<div
className="mt-2 text-center text-truncate"
style={{ maxWidth: "200px" }}
title={img.title || img.id}
>
{img.title || img.id}
</td>
<td>
<div className="fw-bold">{img.title || "(No Title)"}</div>
<div className="text-muted small text-truncate" style={{ maxWidth: "400px" }}>
{img.visual_files[0]?.path}
</div>
</div>
))}
</div>
</div>
);
})}
</ImageDuplicateCheckerSection>
</div>
<div className="mt-1 small">ID: {img.id}</div>
</td>
<td>
<FileSize size={file?.size ?? 0} />
</td>
<td>
{file?.__typename === "ImageFile" || file?.__typename === "VideoFile" ? (
<>
{file.width} x {file.height}
</>
) : (
"N/A"
)}
</td>
</tr>
);
})}
</tbody>
</Table>
</Card.Body>
</Card>
);
};
return (
<div className="container-fluid py-4">
<ImageDuplicateCheckerSection>
<Row className="mb-4">
<Col>
<h3>
<FormattedMessage id="config.tools.image_duplicate_checker" />
</h3>
</Col>
</Row>
<Form className="bg-light p-3 rounded mb-4 shadow-sm">
<Row className="align-items-end">
<Col md={3}>
<Form.Group controlId="distanceInput">
<Form.Label>PHash Distance</Form.Label>
<Form.Control
type="number"
value={hashDistance}
min={0}
max={10}
onChange={(e) => {
const val = parseInt(e.target.value) || 0;
query.set("distance", val.toString());
history.push({ search: query.toString() });
}}
/>
<Form.Text className="text-muted small">
0 = exact matches.
</Form.Text>
</Form.Group>
</Col>
<Col md={2}>
<Button
variant="primary"
className="w-100"
onClick={handleSearch}
disabled={isSearching || loading}
>
{isSearching || loading ? (
<Spinner animation="border" size="sm" />
) : (
"Search"
)}
</Button>
</Col>
</Row>
</Form>
{loading && <LoadingIndicator />}
{hasSearched && !loading && !error && allGroups.length === 0 && (
<div className="text-center py-5 border rounded bg-light">
<p className="mb-0">No duplicates found with the current distance.</p>
</div>
)}
{pagedGroups.map((group, index) => renderGroup(group, index))}
{allGroups.length > pageSize && (
<div className="d-flex justify-content-center mt-4">
<Pagination
currentPage={currentPage}
totalItems={allGroups.length}
pageSize={pageSize}
onChangePage={(page) => {
query.set("page", page.toString());
history.push({ search: query.toString() });
}}
/>
</div>
)}
</ImageDuplicateCheckerSection>
</div>
);
};