diff --git a/pkg/sqlite/image.go b/pkg/sqlite/image.go index 28ee5e49a..726683a57 100644 --- a/pkg/sqlite/image.go +++ b/pkg/sqlite/image.go @@ -1096,10 +1096,6 @@ func (qb *ImageStore) GetURLs(ctx context.Context, imageID int) ([]string, error } func (qb *ImageStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Image, error) { - return qb.findPhashMatches(ctx, distance) -} - -func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]*models.Image, error) { query := ` SELECT images.id, files_fingerprints.fingerprint as phash FROM images @@ -1107,88 +1103,20 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]* JOIN files_fingerprints ON images_files.file_id = files_fingerprints.file_id WHERE files_fingerprints.type = 'phash'` - type ImagePhash struct { - ID int `db:"id"` - PHash string `db:"phash"` - } - - var hashes []ImagePhash + var hashes []*utils.Phash err := imageRepository.queryStruct(ctx, query, nil, &hashes) if err != nil { return nil, err } - // Parse hashes - type ParsedPhash struct { - ID int - PHash uint64 - } - var parsedHashes []ParsedPhash for _, h := range hashes { - val, parseErr := strconv.ParseUint(h.PHash, 16, 64) - if parseErr == nil { - parsedHashes = append(parsedHashes, ParsedPhash{ID: h.ID, PHash: val}) - } + h.Bucket = -1 } - // Helper for Popcount - popcount := func(x uint64) int { - x -= (x >> 1) & 0x5555555555555555 - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333) - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f - return int((x * 0x0101010101010101) >> 56) - } + dupeIds := utils.FindDuplicates(hashes, distance, -1) - // Adjacency list for connected components - adj := make(map[int][]int) - nodes := make(map[int]bool) - - // O(N^2) comparison in memory - for i := 0; i < len(parsedHashes); i++ { - for j := i + 1; j < len(parsedHashes); j++ { - diff := popcount(parsedHashes[i].PHash ^ parsedHashes[j].PHash) - if diff <= distance { - id1 := parsedHashes[i].ID - id2 := parsedHashes[j].ID - adj[id1] = append(adj[id1], id2) - adj[id2] = append(adj[id2], id1) - nodes[id1] = true - nodes[id2] = true - } - } - } - - // Find connected components - visited := make(map[int]bool) - var components [][]int - - for node := range nodes { - if !visited[node] { - var component []int - queue := []int{node} - visited[node] = true - - for len(queue) > 0 { - curr := queue[0] - queue = queue[1:] - component = append(component, curr) - - for _, neighbor := range adj[curr] { - if !visited[neighbor] { - visited[neighbor] = true - queue = append(queue, neighbor) - } - } - } - if len(component) > 1 { - components = append(components, component) - } - } - } - - // Fetch actual image objects var result [][]*models.Image - for _, comp := range components { + for _, comp := range dupeIds { var group []*models.Image for _, id := range comp { img, err := qb.Find(ctx, id) @@ -1203,3 +1131,4 @@ func (qb *ImageStore) findPhashMatches(ctx context.Context, distance int) ([][]* return result, nil } + diff --git a/pkg/utils/phash.go b/pkg/utils/phash.go index 413293c65..36075742b 100644 --- a/pkg/utils/phash.go +++ b/pkg/utils/phash.go @@ -9,27 +9,27 @@ import ( ) type Phash struct { - SceneID int `db:"id"` - Hash int64 `db:"phash"` + ID int `db:"id"` + Hash int64 `db:"phash"` Duration float64 `db:"duration"` Neighbors []int Bucket int } func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int { - for i, scene := range hashes { - sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash) + for i, subject := range hashes { + subjectHash := goimagehash.NewImageHash(uint64(subject.Hash), goimagehash.PHash) for j, neighbor := range hashes { - if i != j && scene.SceneID != neighbor.SceneID { + if i != j && subject.ID != neighbor.ID { neighbourDurationDistance := 0. - if scene.Duration > 0 && neighbor.Duration > 0 { - neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration) + if subject.Duration > 0 && neighbor.Duration > 0 { + neighbourDurationDistance = math.Abs(subject.Duration - neighbor.Duration) } if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) { neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash) - neighborDistance, _ := sceneHash.Distance(neighborHash) + neighborDistance, _ := subjectHash.Distance(neighborHash) if neighborDistance <= distance { - scene.Neighbors = append(scene.Neighbors, j) + subject.Neighbors = append(subject.Neighbors, j) } } } @@ -37,15 +37,15 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int } var buckets [][]int - for _, scene := range hashes { - if len(scene.Neighbors) > 0 && scene.Bucket == -1 { + for _, subject := range hashes { + if len(subject.Neighbors) > 0 && subject.Bucket == -1 { bucket := len(buckets) - scenes := []int{scene.SceneID} - scene.Bucket = bucket - findNeighbors(bucket, scene.Neighbors, hashes, &scenes) + ids := []int{subject.ID} + subject.Bucket = bucket + findNeighbors(bucket, subject.Neighbors, hashes, &ids) - if len(scenes) > 1 { - buckets = append(buckets, scenes) + if len(ids) > 1 { + buckets = append(buckets, ids) } } } @@ -53,13 +53,13 @@ func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int return buckets } -func findNeighbors(bucket int, neighbors []int, hashes []*Phash, scenes *[]int) { +func findNeighbors(bucket int, neighbors []int, hashes []*Phash, ids *[]int) { for _, id := range neighbors { hash := hashes[id] if hash.Bucket == -1 { hash.Bucket = bucket - *scenes = sliceutil.AppendUnique(*scenes, hash.SceneID) - findNeighbors(bucket, hash.Neighbors, hashes, scenes) + *ids = sliceutil.AppendUnique(*ids, hash.ID) + findNeighbors(bucket, hash.Neighbors, hashes, ids) } } } diff --git a/ui/v2.5/src/components/ImageDuplicateChecker/ImageDuplicateChecker.tsx b/ui/v2.5/src/components/ImageDuplicateChecker/ImageDuplicateChecker.tsx index 7ff3b3b26..f78141335 100644 --- a/ui/v2.5/src/components/ImageDuplicateChecker/ImageDuplicateChecker.tsx +++ b/ui/v2.5/src/components/ImageDuplicateChecker/ImageDuplicateChecker.tsx @@ -1,21 +1,40 @@ -import React, { useState } from "react"; -import { Button, Form, Spinner } from "react-bootstrap"; -import { FormattedMessage } from "react-intl"; +import React, { useMemo, useState } from "react"; +import { + Button, + Form, + Spinner, + Table, + Row, + Col, + Card, +} from "react-bootstrap"; +import { FormattedMessage, useIntl } from "react-intl"; import { useFindDuplicateImagesQuery } from "src/core/generated-graphql"; +import * as GQL from "src/core/generated-graphql"; import { PatchContainerComponent } from "src/patch"; +import { LoadingIndicator } from "../Shared/LoadingIndicator"; +import { ErrorMessage } from "../Shared/ErrorMessage"; +import { FileSize } from "../Shared/FileSize"; +import { Pagination } from "src/components/List/Pagination"; +import { useHistory } from "react-router-dom"; const ImageDuplicateCheckerSection = PatchContainerComponent( "ImageDuplicateCheckerSection" ); const ImageDuplicateChecker: React.FC = () => { - const [distance, setDistance] = useState(0); + const intl = useIntl(); + const history = useHistory(); + const query = new URLSearchParams(history.location.search); + const currentPage = Number.parseInt(query.get("page") ?? "1", 10); + const pageSize = Number.parseInt(query.get("size") ?? "20", 10); + const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10); + const [isSearching, setIsSearching] = useState(false); const [hasSearched, setHasSearched] = useState(false); - // We lazily fetch the query only when "Search" is clicked const { data, loading, error, refetch } = useFindDuplicateImagesQuery({ - variables: { distance }, + variables: { distance: hashDistance }, skip: !hasSearched, fetchPolicy: "network-only", }); @@ -23,90 +42,171 @@ const ImageDuplicateChecker: React.FC = () => { const handleSearch = () => { setIsSearching(true); setHasSearched(true); - refetch({ distance }).finally(() => setIsSearching(false)); + refetch({ distance: hashDistance }).finally(() => setIsSearching(false)); }; - const results = data?.findDuplicateImages ?? []; + const getGroupTotalSize = (group: GQL.ImageDataFragment[]) => { + return group.reduce((groupTotal, img) => { + const imgTotal = img.visual_files.reduce( + (fileTotal, file) => fileTotal + (file.size ?? 0), + 0 + ); + return groupTotal + imgTotal; + }, 0); + }; - return ( -
-
- -

- -

-
- - PHash Distance - setDistance(parseInt(e.target.value) || 0)} - /> - - Distance 0 means exact matches. - - + const allGroups = useMemo(() => { + const groups = data?.findDuplicateImages ?? []; + return [...groups].sort((a, b) => { + return getGroupTotalSize(b) - getGroupTotalSize(a); + }); + }, [data?.findDuplicateImages]); - -
+ const pagedGroups = useMemo(() => { + const start = (currentPage - 1) * pageSize; + return allGroups.slice(start, start + pageSize); + }, [allGroups, currentPage, pageSize]); - {error && ( -
Error: {error.message}
- )} + if (error) return ; - {hasSearched && !loading && !error && results.length === 0 && ( -

No duplicates found.

- )} - - {results.map((group, index) => { - if (!group || group.length < 2) return null; - return ( -
-
Group {index + 1}
- {/* ImageList requires an array of items with proper types. We map it nicely. */} -
- {group.map((img) => ( -
+ const renderGroup = (group: GQL.ImageDataFragment[], index: number) => { + const groupIndex = (currentPage - 1) * pageSize + index + 1; + return ( + + +
Group {groupIndex}
+ + Total Size: + +
+ + + + + + + + + + + + {group.map((img) => { + const file = img.visual_files[0]; + return ( + + + + + + + ); + })} + +
ImageDetailsSizeDimensions
{img.title -
- {img.title || img.id} +
+
{img.title || "(No Title)"}
+
+ {img.visual_files[0]?.path}
- - ))} - - - ); - })} - - +
ID: {img.id}
+
+ + + {file?.__typename === "ImageFile" || file?.__typename === "VideoFile" ? ( + <> + {file.width} x {file.height} + + ) : ( + "N/A" + )} +
+
+
+ ); + }; + + return ( +
+ + + +

+ +

+ +
+ +
+ + + + PHash Distance + { + const val = parseInt(e.target.value) || 0; + query.set("distance", val.toString()); + history.push({ search: query.toString() }); + }} + /> + + 0 = exact matches. + + + + + + + +
+ + {loading && } + + {hasSearched && !loading && !error && allGroups.length === 0 && ( +
+

No duplicates found with the current distance.

+
+ )} + + {pagedGroups.map((group, index) => renderGroup(group, index))} + + {allGroups.length > pageSize && ( +
+ { + query.set("page", page.toString()); + history.push({ search: query.toString() }); + }} + /> +
+ )} +
); };