Add option to ignore zip contents during clean (#6700)

* Add option to ignore zip file contents while cleaning. Speeds up the clean process with the assumption that files within zip files are not deleted. * Add UI for new option
2026-04-17 12:31:44 +02:00 · 2026-03-18 15:58:32 +11:00 · 2026-03-18 15:58:32 +11:00 · 93fbb4be80
commit 93fbb4be80
parent f7b66c7ff9
12 changed files with 61 additions and 23 deletions
--- a/graphql/schema/types/metadata.graphql
+++ b/graphql/schema/types/metadata.graphql
@ -131,6 +131,14 @@ type ScanMetadataOptions {
 input CleanMetadataInput {
  paths: [String!]

+  """
+  Don't check zip file contents when determining whether to clean a file.
+  This can significantly speed up the clean process, but will potentially miss removed files within zip files.
+  Where users do not modify zip file contents directly, this should be safe to use.
+  Defaults to false.
+  """
+  ignoreZipFileContents: Boolean
+
  "Do a dry run. Don't delete any files"
  dryRun: Boolean!
 }
--- a/internal/manager/manager_tasks.go
+++ b/internal/manager/manager_tasks.go
@ -314,6 +314,8 @@ type CleanMetadataInput struct {
 	Paths []string `json:"paths"`
 	// Do a dry run. Don't delete any files
 	DryRun bool `json:"dryRun"`
+
+	IgnoreZipFileContents bool `json:"ignoreZipFileContents"`
 }

 func (s *Manager) Clean(ctx context.Context, input CleanMetadataInput) int {
--- a/internal/manager/task_clean.go
+++ b/internal/manager/task_clean.go
@ -40,9 +40,10 @@ func (j *cleanJob) Execute(ctx context.Context, progress *job.Progress) error {
 	}

 	j.cleaner.Clean(ctx, file.CleanOptions{
-		Paths:      j.input.Paths,
-		DryRun:     j.input.DryRun,
-		PathFilter: newCleanFilter(instance.Config),
+		Paths:                 j.input.Paths,
+		DryRun:                j.input.DryRun,
+		IgnoreZipFileContents: j.input.IgnoreZipFileContents,
+		PathFilter:            newCleanFilter(instance.Config),
 	}, progress)

 	if job.IsCancelled(ctx) {
--- a/pkg/file/clean.go
+++ b/pkg/file/clean.go
@ -33,6 +33,11 @@ type cleanJob struct {
 type CleanOptions struct {
 	Paths []string

+	// IgnoreZipFileContents will skip checking the contents of zip files when determining whether to clean a file.
+	// This can significantly speed up the clean process, but will potentially miss removed files within zip files.
+	// Where users do not modify zip file contents directly, this should be safe to use.
+	IgnoreZipFileContents bool
+
 	// Do a dry run. Don't delete any files
 	DryRun bool

@ -174,13 +179,16 @@ func (j *cleanJob) assessFiles(ctx context.Context, toDelete *deleteSet) error {

 	more := true
 	r := j.Repository
+
+	includeZipContents := !j.options.IgnoreZipFileContents
+
 	if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
 		for more {
 			if job.IsCancelled(ctx) {
 				return nil
 			}

-			files, err := r.File.FindAllInPaths(ctx, j.options.Paths, batchSize, offset)
+			files, err := r.File.FindAllInPaths(ctx, j.options.Paths, includeZipContents, batchSize, offset)
 			if err != nil {
 				return fmt.Errorf("error querying for files: %w", err)
 			}
@ -258,6 +266,8 @@ func (j *cleanJob) assessFolders(ctx context.Context, toDelete *deleteSet) error
 	offset := 0
 	progress := j.progress

+	includeZipContents := !j.options.IgnoreZipFileContents
+
 	more := true
 	r := j.Repository
 	if err := r.WithReadTxn(ctx, func(ctx context.Context) error {
@ -266,7 +276,7 @@ func (j *cleanJob) assessFolders(ctx context.Context, toDelete *deleteSet) error
 				return nil
 			}

-			folders, err := r.Folder.FindAllInPaths(ctx, j.options.Paths, batchSize, offset)
+			folders, err := r.Folder.FindAllInPaths(ctx, j.options.Paths, includeZipContents, batchSize, offset)
 			if err != nil {
 				return fmt.Errorf("error querying for folders: %w", err)
 			}
--- a/pkg/models/mocks/FileReaderWriter.go
+++ b/pkg/models/mocks/FileReaderWriter.go
@ -153,13 +153,13 @@ func (_m *FileReaderWriter) FindAllByPath(ctx context.Context, path string, case
 	return r0, r1
 }

-// FindAllInPaths provides a mock function with given fields: ctx, p, limit, offset
-func (_m *FileReaderWriter) FindAllInPaths(ctx context.Context, p []string, limit int, offset int) ([]models.File, error) {
-	ret := _m.Called(ctx, p, limit, offset)
+// FindAllInPaths provides a mock function with given fields: ctx, p, includeZipContents, limit, offset
+func (_m *FileReaderWriter) FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit int, offset int) ([]models.File, error) {
+	ret := _m.Called(ctx, p, includeZipContents, limit, offset)

 	var r0 []models.File
-	if rf, ok := ret.Get(0).(func(context.Context, []string, int, int) []models.File); ok {
-		r0 = rf(ctx, p, limit, offset)
+	if rf, ok := ret.Get(0).(func(context.Context, []string, bool, int, int) []models.File); ok {
+		r0 = rf(ctx, p, includeZipContents, limit, offset)
 	} else {
 		if ret.Get(0) != nil {
 			r0 = ret.Get(0).([]models.File)
@ -167,8 +167,8 @@ func (_m *FileReaderWriter) FindAllInPaths(ctx context.Context, p []string, limi
 	}

 	var r1 error
-	if rf, ok := ret.Get(1).(func(context.Context, []string, int, int) error); ok {
-		r1 = rf(ctx, p, limit, offset)
+	if rf, ok := ret.Get(1).(func(context.Context, []string, bool, int, int) error); ok {
+		r1 = rf(ctx, p, includeZipContents, limit, offset)
 	} else {
 		r1 = ret.Error(1)
 	}
--- a/pkg/models/mocks/FolderReaderWriter.go
+++ b/pkg/models/mocks/FolderReaderWriter.go
@ -86,13 +86,13 @@ func (_m *FolderReaderWriter) Find(ctx context.Context, id models.FolderID) (*mo
 	return r0, r1
 }

-// FindAllInPaths provides a mock function with given fields: ctx, p, limit, offset
-func (_m *FolderReaderWriter) FindAllInPaths(ctx context.Context, p []string, limit int, offset int) ([]*models.Folder, error) {
-	ret := _m.Called(ctx, p, limit, offset)
+// FindAllInPaths provides a mock function with given fields: ctx, p, includeZipContents, limit, offset
+func (_m *FolderReaderWriter) FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit int, offset int) ([]*models.Folder, error) {
+	ret := _m.Called(ctx, p, includeZipContents, limit, offset)

 	var r0 []*models.Folder
-	if rf, ok := ret.Get(0).(func(context.Context, []string, int, int) []*models.Folder); ok {
-		r0 = rf(ctx, p, limit, offset)
+	if rf, ok := ret.Get(0).(func(context.Context, []string, bool, int, int) []*models.Folder); ok {
+		r0 = rf(ctx, p, includeZipContents, limit, offset)
 	} else {
 		if ret.Get(0) != nil {
 			r0 = ret.Get(0).([]*models.Folder)
@ -100,8 +100,8 @@ func (_m *FolderReaderWriter) FindAllInPaths(ctx context.Context, p []string, li
 	}

 	var r1 error
-	if rf, ok := ret.Get(1).(func(context.Context, []string, int, int) error); ok {
-		r1 = rf(ctx, p, limit, offset)
+	if rf, ok := ret.Get(1).(func(context.Context, []string, bool, int, int) error); ok {
+		r1 = rf(ctx, p, includeZipContents, limit, offset)
 	} else {
 		r1 = ret.Error(1)
 	}
--- a/pkg/models/repository_file.go
+++ b/pkg/models/repository_file.go
@ -14,7 +14,7 @@ type FileGetter interface {
 type FileFinder interface {
 	FileGetter
 	FindAllByPath(ctx context.Context, path string, caseSensitive bool) ([]File, error)
-	FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]File, error)
+	FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit, offset int) ([]File, error)
 	FindByPath(ctx context.Context, path string, caseSensitive bool) (File, error)
 	FindByFingerprint(ctx context.Context, fp Fingerprint) ([]File, error)
 	FindByZipFileID(ctx context.Context, zipFileID FileID) ([]File, error)
--- a/pkg/models/repository_folder.go
+++ b/pkg/models/repository_folder.go
@ -11,7 +11,7 @@ type FolderGetter interface {
 // FolderFinder provides methods to find folders.
 type FolderFinder interface {
 	FolderGetter
-	FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]*Folder, error)
+	FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit, offset int) ([]*Folder, error)
 	FindByPath(ctx context.Context, path string, caseSensitive bool) (*Folder, error)
 	FindByZipFileID(ctx context.Context, zipFileID FileID) ([]*Folder, error)
 	FindByParentFolderID(ctx context.Context, parentFolderID FolderID) ([]*Folder, error)
--- a/pkg/sqlite/file.go
+++ b/pkg/sqlite/file.go
@ -695,7 +695,7 @@ func (qb *FileStore) allInPaths(q *goqu.SelectDataset, p []string) *goqu.SelectD
 // FindAllByPaths returns the all files that are within any of the given paths.
 // Returns all if limit is < 0.
 // Returns all files if p is empty.
-func (qb *FileStore) FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]models.File, error) {
+func (qb *FileStore) FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit, offset int) ([]models.File, error) {
 	table := qb.table()
 	folderTable := folderTableMgr.table

@ -706,6 +706,10 @@ func (qb *FileStore) FindAllInPaths(ctx context.Context, p []string, limit, offs

 	q = qb.allInPaths(q, p)

+	if !includeZipContents {
+		q = q.Where(table.Col("zip_file_id").IsNull())
+	}
+
 	if limit > -1 {
 		q = q.Limit(uint(limit))
 	}
--- a/pkg/sqlite/folder.go
+++ b/pkg/sqlite/folder.go
@ -427,10 +427,14 @@ func (qb *FolderStore) allInPaths(q *goqu.SelectDataset, p []string) *goqu.Selec
 // FindAllInPaths returns the all folders that are or are within any of the given paths.
 // Returns all if limit is < 0.
 // Returns all folders if p is empty.
-func (qb *FolderStore) FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]*models.Folder, error) {
+func (qb *FolderStore) FindAllInPaths(ctx context.Context, p []string, includeZipContents bool, limit, offset int) ([]*models.Folder, error) {
 	q := qb.selectDataset().Prepared(true)
 	q = qb.allInPaths(q, p)

+	if !includeZipContents {
+		q = q.Where(qb.table().Col("zip_file_id").IsNull())
+	}
+
 	if limit > -1 {
 		q = q.Limit(uint(limit))
 	}
--- a/ui/v2.5/src/components/Settings/Tasks/DataManagementTasks.tsx
+++ b/ui/v2.5/src/components/Settings/Tasks/DataManagementTasks.tsx
@ -145,6 +145,13 @@ const CleanOptions: React.FC<ICleanOptions> = ({

  return (
    <>
+      <BooleanSetting
+        id="clean-ignore-zip-contents"
+        checked={options.ignoreZipFileContents ?? false}
+        headingID="config.tasks.clean_ignore_zip_contents"
+        subHeadingID="config.tasks.clean_ignore_zip_contents_desc"
+        onChange={(v) => setOptions({ ignoreZipFileContents: v })}
+      />
      <BooleanSetting
        id="clean-dryrun"
        checked={options.dryRun}
--- a/ui/v2.5/src/locales/en-GB.json
+++ b/ui/v2.5/src/locales/en-GB.json
@ -527,6 +527,8 @@
        "zip": "SQLite database file and blob files will be zipped into a single file, with the filename {filename_format}"
      },
      "cleanup_desc": "Check for missing files and remove them from the database. This is a destructive action.",
+      "clean_ignore_zip_contents": "Ignore zip file contents",
+      "clean_ignore_zip_contents_desc": "Faster but will miss files removed inside zip files. Safe to enable if you don't delete files within zips.",
      "clean_generated": {
        "blob_files": "Blob files",
        "description": "Removes generated files without a corresponding database entry.",