feat(book analyzer): partial handling of archives with errors

Entries of zip/rar archives which cannot be extracted will be skipped (closes #57). Move image detection from the extractors to BookAnalyzer. Rename the archive package to mediacontainer.
2025-12-21 16:03:03 +01:00 · 2020-01-15 15:29:05 +08:00 · 2020-01-15 15:29:05 +08:00 · 2605b1d943
commit 2605b1d943
parent a7548e298a
13 changed files with 169 additions and 134 deletions
--- a/komga/src/main/kotlin/org/gotson/komga/domain/model/MediaContainerEntry.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/domain/model/MediaContainerEntry.kt
@ -0,0 +1,7 @@
+package org.gotson.komga.domain.model
+
+class MediaContainerEntry(
+  val name: String,
+  val mediaType: String? = null,
+  val comment: String? = null
+)
--- a/komga/src/main/kotlin/org/gotson/komga/domain/service/BookAnalyzer.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/domain/service/BookAnalyzer.kt
@ -4,12 +4,13 @@ import mu.KotlinLogging
 import net.coobird.thumbnailator.Thumbnails
 import net.greypanther.natsort.CaseInsensitiveSimpleNaturalComparator
 import org.gotson.komga.domain.model.Book
+import org.gotson.komga.domain.model.BookPage
 import org.gotson.komga.domain.model.Media
 import org.gotson.komga.domain.model.MediaNotReadyException
-import org.gotson.komga.infrastructure.archive.ContentDetector
-import org.gotson.komga.infrastructure.archive.PdfExtractor
-import org.gotson.komga.infrastructure.archive.RarExtractor
-import org.gotson.komga.infrastructure.archive.ZipExtractor
+import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
+import org.gotson.komga.infrastructure.mediacontainer.PdfExtractor
+import org.gotson.komga.infrastructure.mediacontainer.RarExtractor
+import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
 import org.springframework.stereotype.Service
 import java.io.ByteArrayOutputStream
 import java.util.*
@ -43,13 +44,31 @@ class BookAnalyzer(
    if (!supportedMediaTypes.keys.contains(mediaType))
      return Media(mediaType = mediaType, status = Media.Status.UNSUPPORTED, comment = "Media type $mediaType is not supported")

-    val pages = try {
-      supportedMediaTypes.getValue(mediaType).getPagesList(book.path()).sortedWith(compareBy(natSortComparator) { it.fileName })
+    val entries = try {
+      supportedMediaTypes.getValue(mediaType).getEntries(book.path())
    } catch (ex: Exception) {
      logger.error(ex) { "Error while analyzing book: $book" }
      return Media(mediaType = mediaType, status = Media.Status.ERROR, comment = ex.message)
    }

+    val (pages, others) = entries
+      .partition { entry ->
+        entry.mediaType?.let { contentDetector.isImage(it) } ?: false
+      }.let { (images, others) ->
+        Pair(
+          images
+            .map { BookPage(it.name, it.mediaType!!) }
+            .sortedWith(compareBy(natSortComparator) { it.fileName }),
+          others
+        )
+      }
+
+    val entriesErrorSummary = others
+      .filter { it.mediaType.isNullOrBlank() }
+      .map { it.name }
+      .ifEmpty { null }
+      ?.joinToString(prefix = "Some entries could not be analyzed: [", postfix = "]") { it }
+
    if (pages.isEmpty()) {
      logger.warn { "Book $book does not contain any pages" }
      return Media(mediaType = mediaType, status = Media.Status.ERROR, comment = "Book does not contain any pages")
@ -59,7 +78,7 @@ class BookAnalyzer(
    logger.info { "Trying to generate cover for book: $book" }
    val thumbnail = generateThumbnail(book, mediaType, pages.first().fileName)

-    return Media(mediaType = mediaType, status = Media.Status.READY, pages = pages, thumbnail = thumbnail)
+    return Media(mediaType = mediaType, status = Media.Status.READY, pages = pages, thumbnail = thumbnail, comment = entriesErrorSummary)
  }

  @Throws(MediaNotReadyException::class)
@ -84,7 +103,7 @@ class BookAnalyzer(
  private fun generateThumbnail(book: Book, mediaType: String, entry: String): ByteArray? =
    try {
      ByteArrayOutputStream().use {
-        supportedMediaTypes.getValue(mediaType).getPageStream(book.path(), entry).let { cover ->
+        supportedMediaTypes.getValue(mediaType).getEntryStream(book.path(), entry).let { cover ->
          Thumbnails.of(cover.inputStream())
            .size(thumbnailSize, thumbnailSize)
            .outputFormat(thumbnailFormat)
@ -114,6 +133,6 @@ class BookAnalyzer(
      throw IndexOutOfBoundsException("Page $number does not exist")
    }

-    return supportedMediaTypes.getValue(book.media.mediaType!!).getPageStream(book.path(), book.media.pages[number - 1].fileName)
+    return supportedMediaTypes.getValue(book.media.mediaType!!).getEntryStream(book.path(), book.media.pages[number - 1].fileName)
  }
 }
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/ArchiveExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/ArchiveExtractor.kt
@ -1,9 +0,0 @@
-package org.gotson.komga.infrastructure.archive
-
-import org.gotson.komga.domain.model.BookPage
-import java.nio.file.Path
-
-abstract class ArchiveExtractor {
-  abstract fun getPagesList(path: Path): List<BookPage>
-  abstract fun getPageStream(path: Path, entryName: String): ByteArray
-}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt
@ -1,42 +0,0 @@
-package org.gotson.komga.infrastructure.archive
-
-import org.apache.pdfbox.pdmodel.PDDocument
-import org.apache.pdfbox.rendering.ImageType
-import org.apache.pdfbox.rendering.PDFRenderer
-import org.gotson.komga.domain.model.BookPage
-import org.springframework.stereotype.Service
-import java.io.ByteArrayOutputStream
-import java.nio.file.Files
-import java.nio.file.Path
-import javax.imageio.ImageIO
-
-@Service
-class PdfExtractor : ArchiveExtractor() {
-
-  private val mediaType = "image/jpeg"
-  private val imageIOFormat = "jpeg"
-  private val resolution = 1536F
-
-  override fun getPagesList(path: Path): List<BookPage> =
-      Files.newInputStream(path).use { inputStream ->
-        PDDocument.load(inputStream).use { pdf ->
-          (0 until pdf.numberOfPages).map { index ->
-            BookPage(index.toString(), mediaType)
-          }
-        }
-      }
-
-  override fun getPageStream(path: Path, entryName: String): ByteArray =
-      Files.newInputStream(path).use { inputStream ->
-        PDDocument.load(inputStream).use { pdf ->
-          val pageNumber = entryName.toInt()
-          val page = pdf.getPage(pageNumber)
-          val scale = resolution / minOf(page.cropBox.width, page.cropBox.height)
-          val image = PDFRenderer(pdf).renderImage(pageNumber, scale, ImageType.RGB)
-          ByteArrayOutputStream().use { out ->
-            ImageIO.write(image, imageIOFormat, out)
-            out.toByteArray()
-          }
-        }
-      }
-}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/RarExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/RarExtractor.kt
@ -1,32 +0,0 @@
-package org.gotson.komga.infrastructure.archive
-
-import com.github.junrar.Archive
-import org.gotson.komga.domain.model.BookPage
-import org.springframework.stereotype.Service
-import java.nio.file.Files
-import java.nio.file.Path
-
-@Service
-class RarExtractor(
-    private val contentDetector: ContentDetector
-) : ArchiveExtractor() {
-
-  override fun getPagesList(path: Path): List<BookPage> =
-      Archive(Files.newInputStream(path)).use { rar ->
-        rar.fileHeaders
-            .filter { !it.isDirectory }
-            .map {
-              BookPage(
-                  it.fileNameString,
-                  contentDetector.detectMediaType(rar.getInputStream(it))
-              )
-            }
-            .filter { contentDetector.isImage(it.mediaType) }
-      }
-
-  override fun getPageStream(path: Path, entryName: String): ByteArray =
-      Archive(Files.newInputStream(path)).use { rar ->
-        val header = rar.fileHeaders.find { it.fileNameString == entryName }
-        rar.getInputStream(header).readBytes()
-      }
-}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/ZipExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/ZipExtractor.kt
@ -1,30 +0,0 @@
-package org.gotson.komga.infrastructure.archive
-
-import org.apache.commons.compress.archivers.zip.ZipFile
-import org.gotson.komga.domain.model.BookPage
-import org.springframework.stereotype.Service
-import java.nio.file.Path
-
-@Service
-class ZipExtractor(
-    private val contentDetector: ContentDetector
-) : ArchiveExtractor() {
-
-  override fun getPagesList(path: Path): List<BookPage> =
-      ZipFile(path.toFile()).use { zip ->
-        zip.entries.toList()
-            .filter { !it.isDirectory }
-            .map {
-              BookPage(
-                  it.name,
-                  contentDetector.detectMediaType(zip.getInputStream(it))
-              )
-            }
-            .filter { contentDetector.isImage(it.mediaType) }
-      }
-
-  override fun getPageStream(path: Path, entryName: String): ByteArray =
-      ZipFile(path.toFile()).use {
-        it.getInputStream(it.getEntry(entryName)).readBytes()
-      }
-}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/ContentDetector.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/ContentDetector.kt
@ -1,4 +1,4 @@
-package org.gotson.komga.infrastructure.archive
+package org.gotson.komga.infrastructure.mediacontainer

 import mu.KotlinLogging
 import org.apache.tika.config.TikaConfig
@ -36,4 +36,4 @@ class ContentDetector(

  fun isImage(mediaType: String): Boolean =
      mediaType.startsWith("image/")
-}
+}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/MediaContainerExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/MediaContainerExtractor.kt
@ -0,0 +1,9 @@
+package org.gotson.komga.infrastructure.mediacontainer
+
+import org.gotson.komga.domain.model.MediaContainerEntry
+import java.nio.file.Path
+
+abstract class MediaContainerExtractor {
+  abstract fun getEntries(path: Path): List<MediaContainerEntry>
+  abstract fun getEntryStream(path: Path, entryName: String): ByteArray
+}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/PdfExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/PdfExtractor.kt
@ -0,0 +1,42 @@
+package org.gotson.komga.infrastructure.mediacontainer
+
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.rendering.ImageType
+import org.apache.pdfbox.rendering.PDFRenderer
+import org.gotson.komga.domain.model.MediaContainerEntry
+import org.springframework.stereotype.Service
+import java.io.ByteArrayOutputStream
+import java.nio.file.Files
+import java.nio.file.Path
+import javax.imageio.ImageIO
+
+@Service
+class PdfExtractor : MediaContainerExtractor() {
+
+  private val mediaType = "image/jpeg"
+  private val imageIOFormat = "jpeg"
+  private val resolution = 1536F
+
+  override fun getEntries(path: Path): List<MediaContainerEntry> =
+    Files.newInputStream(path).use { inputStream ->
+      PDDocument.load(inputStream).use { pdf ->
+        (0 until pdf.numberOfPages).map { index ->
+          MediaContainerEntry(index.toString(), mediaType)
+        }
+      }
+    }
+
+  override fun getEntryStream(path: Path, entryName: String): ByteArray =
+    Files.newInputStream(path).use { inputStream ->
+      PDDocument.load(inputStream).use { pdf ->
+        val pageNumber = entryName.toInt()
+        val page = pdf.getPage(pageNumber)
+        val scale = resolution / minOf(page.cropBox.width, page.cropBox.height)
+        val image = PDFRenderer(pdf).renderImage(pageNumber, scale, ImageType.RGB)
+        ByteArrayOutputStream().use { out ->
+          ImageIO.write(image, imageIOFormat, out)
+          out.toByteArray()
+        }
+        }
+      }
+}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/RarExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/RarExtractor.kt
@ -0,0 +1,36 @@
+package org.gotson.komga.infrastructure.mediacontainer
+
+import com.github.junrar.Archive
+import mu.KotlinLogging
+import org.gotson.komga.domain.model.MediaContainerEntry
+import org.springframework.stereotype.Service
+import java.nio.file.Files
+import java.nio.file.Path
+
+private val logger = KotlinLogging.logger {}
+
+@Service
+class RarExtractor(
+  private val contentDetector: ContentDetector
+) : MediaContainerExtractor() {
+
+  override fun getEntries(path: Path): List<MediaContainerEntry> =
+    Archive(Files.newInputStream(path)).use { rar ->
+      rar.fileHeaders
+        .filter { !it.isDirectory }
+        .map {
+          try {
+            MediaContainerEntry(name = it.fileNameString, mediaType = contentDetector.detectMediaType(rar.getInputStream(it)))
+          } catch (e: Exception) {
+            logger.warn(e) { "Could not analyze entry: ${it.fileNameString}" }
+            MediaContainerEntry(name = it.fileNameString, comment = e.message)
+          }
+        }
+    }
+
+  override fun getEntryStream(path: Path, entryName: String): ByteArray =
+    Archive(Files.newInputStream(path)).use { rar ->
+      val header = rar.fileHeaders.find { it.fileNameString == entryName }
+      rar.getInputStream(header).readBytes()
+    }
+}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/TikaConfiguration.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/TikaConfiguration.kt
@ -1,4 +1,4 @@
-package org.gotson.komga.infrastructure.archive
+package org.gotson.komga.infrastructure.mediacontainer

 import org.apache.tika.config.TikaConfig
 import org.springframework.context.annotation.Bean
@ -9,4 +9,4 @@ class TikaConfiguration {

  @Bean
  fun tika() = TikaConfig()
-}
+}
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/ZipExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/mediacontainer/ZipExtractor.kt
@ -0,0 +1,34 @@
+package org.gotson.komga.infrastructure.mediacontainer
+
+import mu.KotlinLogging
+import org.apache.commons.compress.archivers.zip.ZipFile
+import org.gotson.komga.domain.model.MediaContainerEntry
+import org.springframework.stereotype.Service
+import java.nio.file.Path
+
+private val logger = KotlinLogging.logger {}
+
+@Service
+class ZipExtractor(
+  private val contentDetector: ContentDetector
+) : MediaContainerExtractor() {
+
+  override fun getEntries(path: Path): List<MediaContainerEntry> =
+    ZipFile(path.toFile()).use { zip ->
+      zip.entries.toList()
+        .filter { !it.isDirectory }
+        .map {
+          try {
+            MediaContainerEntry(name = it.name, mediaType = contentDetector.detectMediaType(zip.getInputStream(it)))
+          } catch (e: Exception) {
+            logger.warn(e) { "Could not analyze entry: ${it.name}" }
+            MediaContainerEntry(name = it.name, comment = e.message)
+          }
+        }
+    }
+
+  override fun getEntryStream(path: Path, entryName: String): ByteArray =
+    ZipFile(path.toFile()).use {
+      it.getInputStream(it.getEntry(entryName)).readBytes()
+    }
+}
--- a/komga/src/test/kotlin/org/gotson/komga/domain/service/BookAnalyzerTest.kt
+++ b/komga/src/test/kotlin/org/gotson/komga/domain/service/BookAnalyzerTest.kt
@ -4,12 +4,12 @@ import io.mockk.every
 import io.mockk.mockk
 import io.mockk.slot
 import org.assertj.core.api.Assertions.assertThat
+import org.gotson.komga.domain.model.MediaContainerEntry
 import org.gotson.komga.domain.model.makeBook
-import org.gotson.komga.domain.model.makeBookPage
-import org.gotson.komga.infrastructure.archive.ContentDetector
-import org.gotson.komga.infrastructure.archive.PdfExtractor
-import org.gotson.komga.infrastructure.archive.RarExtractor
-import org.gotson.komga.infrastructure.archive.ZipExtractor
+import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
+import org.gotson.komga.infrastructure.mediacontainer.PdfExtractor
+import org.gotson.komga.infrastructure.mediacontainer.RarExtractor
+import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
 import org.junit.jupiter.api.Test

 class BookAnalyzerTest {
@ -25,13 +25,14 @@ class BookAnalyzerTest {
    // given
    val book = makeBook("book")
    every { mockContent.detectMediaType(book.path()) } returns "application/zip"
+    every { mockContent.isImage(any()) } returns true

-    val unorderedPages = listOf("08", "01", "02").map { makeBookPage(it) }
-    every { mockZip.getPagesList(book.path()) } returns unorderedPages
+    val unorderedPages = listOf("08", "01", "02").map { MediaContainerEntry(it, "image/png") }
+    every { mockZip.getEntries(book.path()) } returns unorderedPages

    //when
    val thumbnailFile = slot<String>()
-    every { mockZip.getPageStream(book.path(), capture(thumbnailFile)) } returns ByteArray(1)
+    every { mockZip.getEntryStream(book.path(), capture(thumbnailFile)) } returns ByteArray(1)
    bookAnalyzer.analyze(book)

    // then