feat(book analyzer): partial handling of archives with errors

entries of zip/rar archives which cannot be extracted will be skipped (closes #57)
move images detection from extractors to BookAnalyzer
rename archive package to mediacontainer
This commit is contained in:
Gauthier Roebroeck 2020-01-15 15:29:05 +08:00
parent a7548e298a
commit 2605b1d943
13 changed files with 169 additions and 134 deletions

View file

@ -0,0 +1,7 @@
package org.gotson.komga.domain.model
/**
 * A single entry listed from a media container, such as a file inside a zip or rar archive,
 * or a page of a PDF document.
 *
 * Declared as a `data class` so that entries compare by value and render readably in logs
 * and test failure messages.
 *
 * @property name name of the entry inside its container.
 * @property mediaType media type of the entry, or `null` when it could not be determined.
 * @property comment optional note about the entry; the zip and rar extractors store the
 * error message here when an entry could not be analyzed.
 */
data class MediaContainerEntry(
  val name: String,
  val mediaType: String? = null,
  val comment: String? = null
)

View file

@ -4,12 +4,13 @@ import mu.KotlinLogging
import net.coobird.thumbnailator.Thumbnails
import net.greypanther.natsort.CaseInsensitiveSimpleNaturalComparator
import org.gotson.komga.domain.model.Book
import org.gotson.komga.domain.model.BookPage
import org.gotson.komga.domain.model.Media
import org.gotson.komga.domain.model.MediaNotReadyException
import org.gotson.komga.infrastructure.archive.ContentDetector
import org.gotson.komga.infrastructure.archive.PdfExtractor
import org.gotson.komga.infrastructure.archive.RarExtractor
import org.gotson.komga.infrastructure.archive.ZipExtractor
import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
import org.gotson.komga.infrastructure.mediacontainer.PdfExtractor
import org.gotson.komga.infrastructure.mediacontainer.RarExtractor
import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
import org.springframework.stereotype.Service
import java.io.ByteArrayOutputStream
import java.util.*
@ -43,13 +44,31 @@ class BookAnalyzer(
if (!supportedMediaTypes.keys.contains(mediaType))
return Media(mediaType = mediaType, status = Media.Status.UNSUPPORTED, comment = "Media type $mediaType is not supported")
val pages = try {
supportedMediaTypes.getValue(mediaType).getPagesList(book.path()).sortedWith(compareBy(natSortComparator) { it.fileName })
val entries = try {
supportedMediaTypes.getValue(mediaType).getEntries(book.path())
} catch (ex: Exception) {
logger.error(ex) { "Error while analyzing book: $book" }
return Media(mediaType = mediaType, status = Media.Status.ERROR, comment = ex.message)
}
val (pages, others) = entries
.partition { entry ->
entry.mediaType?.let { contentDetector.isImage(it) } ?: false
}.let { (images, others) ->
Pair(
images
.map { BookPage(it.name, it.mediaType!!) }
.sortedWith(compareBy(natSortComparator) { it.fileName }),
others
)
}
val entriesErrorSummary = others
.filter { it.mediaType.isNullOrBlank() }
.map { it.name }
.ifEmpty { null }
?.joinToString(prefix = "Some entries could not be analyzed: [", postfix = "]") { it }
if (pages.isEmpty()) {
logger.warn { "Book $book does not contain any pages" }
return Media(mediaType = mediaType, status = Media.Status.ERROR, comment = "Book does not contain any pages")
@ -59,7 +78,7 @@ class BookAnalyzer(
logger.info { "Trying to generate cover for book: $book" }
val thumbnail = generateThumbnail(book, mediaType, pages.first().fileName)
return Media(mediaType = mediaType, status = Media.Status.READY, pages = pages, thumbnail = thumbnail)
return Media(mediaType = mediaType, status = Media.Status.READY, pages = pages, thumbnail = thumbnail, comment = entriesErrorSummary)
}
@Throws(MediaNotReadyException::class)
@ -84,7 +103,7 @@ class BookAnalyzer(
private fun generateThumbnail(book: Book, mediaType: String, entry: String): ByteArray? =
try {
ByteArrayOutputStream().use {
supportedMediaTypes.getValue(mediaType).getPageStream(book.path(), entry).let { cover ->
supportedMediaTypes.getValue(mediaType).getEntryStream(book.path(), entry).let { cover ->
Thumbnails.of(cover.inputStream())
.size(thumbnailSize, thumbnailSize)
.outputFormat(thumbnailFormat)
@ -114,6 +133,6 @@ class BookAnalyzer(
throw IndexOutOfBoundsException("Page $number does not exist")
}
return supportedMediaTypes.getValue(book.media.mediaType!!).getPageStream(book.path(), book.media.pages[number - 1].fileName)
return supportedMediaTypes.getValue(book.media.mediaType!!).getEntryStream(book.path(), book.media.pages[number - 1].fileName)
}
}

View file

@ -1,9 +0,0 @@
package org.gotson.komga.infrastructure.archive
import org.gotson.komga.domain.model.BookPage
import java.nio.file.Path
/**
 * Common contract for the extractors of this package (PDF, rar and zip), which list the
 * pages of a book file and read the content of a single page.
 */
abstract class ArchiveExtractor {
/**
 * Lists the pages contained in the file located at [path].
 *
 * @param path location of the book file to inspect.
 * @return the pages found, as [BookPage]s.
 */
abstract fun getPagesList(path: Path): List<BookPage>
/**
 * Reads the content of one entry of the file located at [path].
 *
 * @param path location of the book file to read from.
 * @param entryName name of the entry to read; the PDF implementation expects a page index
 * in decimal form.
 * @return the content of the entry as a byte array.
 */
abstract fun getPageStream(path: Path, entryName: String): ByteArray
}

View file

@ -1,42 +0,0 @@
package org.gotson.komga.infrastructure.archive
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.ImageType
import org.apache.pdfbox.rendering.PDFRenderer
import org.gotson.komga.domain.model.BookPage
import org.springframework.stereotype.Service
import java.io.ByteArrayOutputStream
import java.nio.file.Files
import java.nio.file.Path
import javax.imageio.ImageIO
/**
 * Presents a PDF document as a list of pages, each page being rendered to a JPEG image
 * with PDFBox when its content is requested.
 */
@Service
class PdfExtractor : ArchiveExtractor() {

  private val mediaType = "image/jpeg"
  private val imageIOFormat = "jpeg"
  private val resolution = 1536F

  /**
   * Lists one [BookPage] per page of the PDF at [path].
   *
   * Pages are named after their zero-based index and all carry the JPEG media type.
   */
  override fun getPagesList(path: Path): List<BookPage> {
    return Files.newInputStream(path).use { source ->
      PDDocument.load(source).use { document ->
        val pageIndices = 0 until document.numberOfPages
        pageIndices.map { pageIndex -> BookPage(pageIndex.toString(), mediaType) }
      }
    }
  }

  /**
   * Renders one page of the PDF at [path] as a JPEG image.
   *
   * @param entryName zero-based page index in decimal form, as produced by [getPagesList].
   * @return the encoded image bytes.
   */
  override fun getPageStream(path: Path, entryName: String): ByteArray {
    return Files.newInputStream(path).use { source ->
      PDDocument.load(source).use { document ->
        val pageIndex = entryName.toInt()
        val cropBox = document.getPage(pageIndex).cropBox
        // Scale so that the shorter side of the crop box maps to `resolution`.
        val shortestSide = minOf(cropBox.width, cropBox.height)
        val rendered = PDFRenderer(document).renderImage(pageIndex, resolution / shortestSide, ImageType.RGB)
        ByteArrayOutputStream().use { buffer ->
          ImageIO.write(rendered, imageIOFormat, buffer)
          buffer.toByteArray()
        }
      }
    }
  }
}

View file

@ -1,32 +0,0 @@
package org.gotson.komga.infrastructure.archive
import com.github.junrar.Archive
import org.gotson.komga.domain.model.BookPage
import org.springframework.stereotype.Service
import java.nio.file.Files
import java.nio.file.Path
/**
 * Lists and reads the image pages of a rar archive using junrar.
 *
 * The file stream is opened in its own `use` block so that it is closed even when the
 * [Archive] constructor itself fails (for instance on an archive junrar cannot open).
 */
@Service
class RarExtractor(
  private val contentDetector: ContentDetector
) : ArchiveExtractor() {

  /**
   * Lists the image entries of the rar archive at [path].
   *
   * Directories are ignored, the media type of every remaining entry is detected from its
   * content, and only entries recognised as images are returned.
   */
  override fun getPagesList(path: Path): List<BookPage> =
    Files.newInputStream(path).use { stream ->
      Archive(stream).use { rar ->
        rar.fileHeaders
          .filter { !it.isDirectory }
          .map { header ->
            BookPage(
              header.fileNameString,
              contentDetector.detectMediaType(rar.getInputStream(header))
            )
          }
          .filter { contentDetector.isImage(it.mediaType) }
      }
    }

  /**
   * Reads the full content of the entry named [entryName] from the rar archive at [path].
   *
   * @throws NoSuchElementException if the archive has no entry with that name.
   */
  override fun getPageStream(path: Path, entryName: String): ByteArray =
    Files.newInputStream(path).use { stream ->
      Archive(stream).use { rar ->
        // Fail with an explicit message instead of handing a null header to junrar.
        val header = rar.fileHeaders.find { it.fileNameString == entryName }
          ?: throw NoSuchElementException("Entry not found in archive: $entryName")
        rar.getInputStream(header).readBytes()
      }
    }
}

View file

@ -1,30 +0,0 @@
package org.gotson.komga.infrastructure.archive
import org.apache.commons.compress.archivers.zip.ZipFile
import org.gotson.komga.domain.model.BookPage
import org.springframework.stereotype.Service
import java.nio.file.Path
/**
 * Lists and reads the image pages of a zip archive using Apache Commons Compress.
 */
@Service
class ZipExtractor(
  private val contentDetector: ContentDetector
) : ArchiveExtractor() {

  /**
   * Lists the image entries of the zip archive at [path].
   *
   * Directories are ignored, the media type of every remaining entry is detected from its
   * content, and only entries recognised as images are returned.
   */
  override fun getPagesList(path: Path): List<BookPage> =
    ZipFile(path.toFile()).use { zip ->
      zip.entries.toList()
        .filter { !it.isDirectory }
        .map { entry ->
          BookPage(
            entry.name,
            contentDetector.detectMediaType(zip.getInputStream(entry))
          )
        }
        .filter { contentDetector.isImage(it.mediaType) }
    }

  /**
   * Reads the full content of the entry named [entryName] from the zip archive at [path].
   *
   * @throws NoSuchElementException if the archive has no entry with that name.
   */
  override fun getPageStream(path: Path, entryName: String): ByteArray =
    ZipFile(path.toFile()).use { zip ->
      // getEntry yields null for an unknown name; report that explicitly rather than
      // letting a null propagate into getInputStream/readBytes.
      val entry = zip.getEntry(entryName)
        ?: throw NoSuchElementException("Entry not found in archive: $entryName")
      zip.getInputStream(entry).use { it.readBytes() }
    }
}

View file

@ -1,4 +1,4 @@
package org.gotson.komga.infrastructure.archive
package org.gotson.komga.infrastructure.mediacontainer
import mu.KotlinLogging
import org.apache.tika.config.TikaConfig
@ -36,4 +36,4 @@ class ContentDetector(
/**
 * Tells whether [mediaType] denotes an image, i.e. whether it begins with `image/`.
 *
 * The comparison is case-sensitive.
 */
fun isImage(mediaType: String): Boolean {
  val imagePrefix = "image/"
  return mediaType.startsWith(imagePrefix)
}
}
}

View file

@ -0,0 +1,9 @@
package org.gotson.komga.infrastructure.mediacontainer
import org.gotson.komga.domain.model.MediaContainerEntry
import java.nio.file.Path
/**
 * Common contract for the extractors of this package (PDF, rar and zip), which list the
 * entries of a media container file and read the content of a single entry.
 */
abstract class MediaContainerExtractor {
/**
 * Lists the entries contained in the file located at [path].
 *
 * @param path location of the container file to inspect.
 * @return the entries found, as [MediaContainerEntry] instances.
 */
abstract fun getEntries(path: Path): List<MediaContainerEntry>
/**
 * Reads the content of one entry of the file located at [path].
 *
 * @param path location of the container file to read from.
 * @param entryName name of the entry to read; the PDF implementation expects a page index
 * in decimal form.
 * @return the content of the entry as a byte array.
 */
abstract fun getEntryStream(path: Path, entryName: String): ByteArray
}

View file

@ -0,0 +1,42 @@
package org.gotson.komga.infrastructure.mediacontainer
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.ImageType
import org.apache.pdfbox.rendering.PDFRenderer
import org.gotson.komga.domain.model.MediaContainerEntry
import org.springframework.stereotype.Service
import java.io.ByteArrayOutputStream
import java.nio.file.Files
import java.nio.file.Path
import javax.imageio.ImageIO
/**
 * Presents a PDF document as a media container whose entries are its pages, each page
 * being rendered to a JPEG image with PDFBox when its content is requested.
 */
@Service
class PdfExtractor : MediaContainerExtractor() {

  private val mediaType = "image/jpeg"
  private val imageIOFormat = "jpeg"
  private val resolution = 1536F

  /**
   * Lists one [MediaContainerEntry] per page of the PDF at [path].
   *
   * Entries are named after the zero-based page index and all carry the JPEG media type.
   */
  override fun getEntries(path: Path): List<MediaContainerEntry> {
    return Files.newInputStream(path).use { source ->
      PDDocument.load(source).use { document ->
        val pageIndices = 0 until document.numberOfPages
        pageIndices.map { pageIndex -> MediaContainerEntry(pageIndex.toString(), mediaType) }
      }
    }
  }

  /**
   * Renders one page of the PDF at [path] as a JPEG image.
   *
   * @param entryName zero-based page index in decimal form, as produced by [getEntries].
   * @return the encoded image bytes.
   */
  override fun getEntryStream(path: Path, entryName: String): ByteArray {
    return Files.newInputStream(path).use { source ->
      PDDocument.load(source).use { document ->
        val pageIndex = entryName.toInt()
        val cropBox = document.getPage(pageIndex).cropBox
        // Scale so that the shorter side of the crop box maps to `resolution`.
        val shortestSide = minOf(cropBox.width, cropBox.height)
        val rendered = PDFRenderer(document).renderImage(pageIndex, resolution / shortestSide, ImageType.RGB)
        ByteArrayOutputStream().use { buffer ->
          ImageIO.write(rendered, imageIOFormat, buffer)
          buffer.toByteArray()
        }
      }
    }
  }
}

View file

@ -0,0 +1,36 @@
package org.gotson.komga.infrastructure.mediacontainer
import com.github.junrar.Archive
import mu.KotlinLogging
import org.gotson.komga.domain.model.MediaContainerEntry
import org.springframework.stereotype.Service
import java.nio.file.Files
import java.nio.file.Path
private val logger = KotlinLogging.logger {}
/**
 * Lists and reads the entries of a rar archive using junrar.
 *
 * The file stream is opened in its own `use` block so that it is closed even when the
 * [Archive] constructor itself fails (for instance on an archive junrar cannot open).
 */
@Service
class RarExtractor(
  private val contentDetector: ContentDetector
) : MediaContainerExtractor() {

  /**
   * Lists the non-directory entries of the rar archive at [path].
   *
   * The media type of each entry is detected from its content. An entry whose analysis
   * throws is still returned, with a `null` media type and the error message as comment,
   * so that one broken entry does not fail the whole archive.
   */
  override fun getEntries(path: Path): List<MediaContainerEntry> =
    Files.newInputStream(path).use { stream ->
      Archive(stream).use { rar ->
        rar.fileHeaders
          .filter { !it.isDirectory }
          .map { header ->
            try {
              MediaContainerEntry(name = header.fileNameString, mediaType = contentDetector.detectMediaType(rar.getInputStream(header)))
            } catch (e: Exception) {
              logger.warn(e) { "Could not analyze entry: ${header.fileNameString}" }
              MediaContainerEntry(name = header.fileNameString, comment = e.message)
            }
          }
      }
    }

  /**
   * Reads the full content of the entry named [entryName] from the rar archive at [path].
   *
   * @throws NoSuchElementException if the archive has no entry with that name.
   */
  override fun getEntryStream(path: Path, entryName: String): ByteArray =
    Files.newInputStream(path).use { stream ->
      Archive(stream).use { rar ->
        // Fail with an explicit message instead of handing a null header to junrar.
        val header = rar.fileHeaders.find { it.fileNameString == entryName }
          ?: throw NoSuchElementException("Entry not found in archive: $entryName")
        rar.getInputStream(header).readBytes()
      }
    }
}

View file

@ -1,4 +1,4 @@
package org.gotson.komga.infrastructure.archive
package org.gotson.komga.infrastructure.mediacontainer
import org.apache.tika.config.TikaConfig
import org.springframework.context.annotation.Bean
@ -9,4 +9,4 @@ class TikaConfiguration {
/**
 * Declares a [TikaConfig] built with its no-argument constructor as a bean.
 */
@Bean
fun tika(): TikaConfig {
  return TikaConfig()
}
}
}

View file

@ -0,0 +1,34 @@
package org.gotson.komga.infrastructure.mediacontainer
import mu.KotlinLogging
import org.apache.commons.compress.archivers.zip.ZipFile
import org.gotson.komga.domain.model.MediaContainerEntry
import org.springframework.stereotype.Service
import java.nio.file.Path
private val logger = KotlinLogging.logger {}
/**
 * Lists and reads the entries of a zip archive using Apache Commons Compress.
 */
@Service
class ZipExtractor(
  private val contentDetector: ContentDetector
) : MediaContainerExtractor() {

  /**
   * Lists the non-directory entries of the zip archive at [path].
   *
   * The media type of each entry is detected from its content. An entry whose analysis
   * throws is still returned, with a `null` media type and the error message as comment,
   * so that one broken entry does not fail the whole archive.
   */
  override fun getEntries(path: Path): List<MediaContainerEntry> =
    ZipFile(path.toFile()).use { zip ->
      zip.entries.toList()
        .filter { !it.isDirectory }
        .map { entry ->
          try {
            MediaContainerEntry(name = entry.name, mediaType = contentDetector.detectMediaType(zip.getInputStream(entry)))
          } catch (e: Exception) {
            logger.warn(e) { "Could not analyze entry: ${entry.name}" }
            MediaContainerEntry(name = entry.name, comment = e.message)
          }
        }
    }

  /**
   * Reads the full content of the entry named [entryName] from the zip archive at [path].
   *
   * @throws NoSuchElementException if the archive has no entry with that name.
   */
  override fun getEntryStream(path: Path, entryName: String): ByteArray =
    ZipFile(path.toFile()).use { zip ->
      // getEntry yields null for an unknown name; report that explicitly rather than
      // letting a null propagate into getInputStream/readBytes.
      val entry = zip.getEntry(entryName)
        ?: throw NoSuchElementException("Entry not found in archive: $entryName")
      zip.getInputStream(entry).use { it.readBytes() }
    }
}

View file

@ -4,12 +4,12 @@ import io.mockk.every
import io.mockk.mockk
import io.mockk.slot
import org.assertj.core.api.Assertions.assertThat
import org.gotson.komga.domain.model.MediaContainerEntry
import org.gotson.komga.domain.model.makeBook
import org.gotson.komga.domain.model.makeBookPage
import org.gotson.komga.infrastructure.archive.ContentDetector
import org.gotson.komga.infrastructure.archive.PdfExtractor
import org.gotson.komga.infrastructure.archive.RarExtractor
import org.gotson.komga.infrastructure.archive.ZipExtractor
import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
import org.gotson.komga.infrastructure.mediacontainer.PdfExtractor
import org.gotson.komga.infrastructure.mediacontainer.RarExtractor
import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
import org.junit.jupiter.api.Test
class BookAnalyzerTest {
@ -25,13 +25,14 @@ class BookAnalyzerTest {
// given
val book = makeBook("book")
every { mockContent.detectMediaType(book.path()) } returns "application/zip"
every { mockContent.isImage(any()) } returns true
val unorderedPages = listOf("08", "01", "02").map { makeBookPage(it) }
every { mockZip.getPagesList(book.path()) } returns unorderedPages
val unorderedPages = listOf("08", "01", "02").map { MediaContainerEntry(it, "image/png") }
every { mockZip.getEntries(book.path()) } returns unorderedPages
//when
val thumbnailFile = slot<String>()
every { mockZip.getPageStream(book.path(), capture(thumbnailFile)) } returns ByteArray(1)
every { mockZip.getEntryStream(book.path(), capture(thumbnailFile)) } returns ByteArray(1)
bookAnalyzer.analyze(book)
// then