From dcdae8f3e7908050bb78544d52fc1cd471cec3ac Mon Sep 17 00:00:00 2001
From: Gauthier Roebroeck
Date: Fri, 6 Sep 2019 14:27:45 +0800
Subject: [PATCH] new version of the PdfExtractor that properly handles multiple images on one page

---
 .../infrastructure/archive/PdfExtractor.kt    | 223 ++----------------
 1 file changed, 27 insertions(+), 196 deletions(-)

diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt
index b3ed5f96a..a01d2740e 100644
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/archive/PdfExtractor.kt
@@ -1,215 +1,46 @@
 package org.gotson.komga.infrastructure.archive
 
-import mu.KotlinLogging
-import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine
-import org.apache.pdfbox.cos.COSName
-import org.apache.pdfbox.io.IOUtils
 import org.apache.pdfbox.pdmodel.PDDocument
-import org.apache.pdfbox.pdmodel.PDPage
-import org.apache.pdfbox.pdmodel.font.PDFont
-import org.apache.pdfbox.pdmodel.graphics.color.PDColor
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB
-import org.apache.pdfbox.pdmodel.graphics.color.PDPattern
-import org.apache.pdfbox.pdmodel.graphics.image.PDImage
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
-import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern
-import org.apache.pdfbox.util.Matrix
-import org.apache.pdfbox.util.Vector
-import org.apache.tika.config.TikaConfig
+import org.apache.pdfbox.rendering.ImageType
+import org.apache.pdfbox.rendering.PDFRenderer
 import org.gotson.komga.domain.model.BookPage
 import org.springframework.stereotype.Service
-import java.awt.geom.Point2D
 import java.io.ByteArrayOutputStream
-import java.net.URLConnection
 import java.nio.file.Files
 import java.nio.file.Path
 import javax.imageio.ImageIO
 
-private val logger = KotlinLogging.logger {}
-
-/**
- * Largely inspired by https://github.com/apache/pdfbox/blob/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractImages.java
- */
 @Service
-class PdfExtractor(
-    private val tika: TikaConfig
-) : ArchiveExtractor() {
+class PdfExtractor : ArchiveExtractor() {
 
-  private val JPEG = listOf(
-      COSName.DCT_DECODE.name,
-      COSName.DCT_DECODE_ABBREVIATION.name
-  )
+  private val mediaType = "image/jpeg"
+  private val imageIOFormat = "jpeg"
+  // Target length for the page's shorter crop-box side; the render scale is derived from it.
+  private val resolution = 1536F
 
-  override fun getPagesList(path: Path): List<BookPage> {
-    return PDDocument.load(Files.newInputStream(path)).use { pdf ->
-      pdf.pages.mapIndexed { index, page ->
-        val extractor = ImageGraphicsEngine(page, false)
-        extractor.run()
-        BookPage(index.toString(), extractor.mediaType)
-      }
-    }
-  }
-
-  override fun getPageStream(path: Path, entryName: String): ByteArray {
-    return PDDocument.load(Files.newInputStream(path)).use { pdf ->
-      val extractor = ImageGraphicsEngine(pdf.pages[entryName.toInt()], true)
-      extractor.run()
-      extractor.buffer
-    }
-  }
-
-  private inner class ImageGraphicsEngine(
-      page: PDPage,
-      val extractImage: Boolean
-  ) : PDFGraphicsStreamEngine(page) {
-
-    lateinit var buffer: ByteArray
-    lateinit var mediaType: String
-
-    fun run() {
-      val p = getPage()
-      processPage(p)
-      val res = p.resources
-      for (name in res.extGStateNames) {
-        val softMask = res.getExtGState(name).softMask
-        if (softMask != null) {
-          val group = softMask.group
-          if (group != null) {
-            processSoftMask(group)
-          }
+  /**
+   * Lists one [BookPage] per page of the PDF at [path]; each entry name is the page's 0-based index.
+   */
+  override fun getPagesList(path: Path): List<BookPage> =
+      PDDocument.load(Files.newInputStream(path)).use { pdf ->
+        // Page indexes are 0-based, so the range must stop before numberOfPages.
+        (0 until pdf.numberOfPages).map { index ->
+          BookPage(index.toString(), mediaType)
         }
       }
-    }
 
-    // find out if it is a tiling pattern, then process that one
-    private fun processColor(color: PDColor) {
-      if (color.colorSpace is PDPattern) {
-        val pattern = color.colorSpace as PDPattern
-        val abstractPattern = pattern.getPattern(color)
-        if (abstractPattern is PDTilingPattern) {
-          processTilingPattern(abstractPattern, null, null)
+  /**
+   * Renders the page at 0-based index [entryName] as an RGB image and returns it encoded as JPEG.
+   */
+  override fun getPageStream(path: Path, entryName: String): ByteArray =
+      PDDocument.load(Files.newInputStream(path)).use { pdf ->
+        val pageNumber = entryName.toInt()
+        val page = pdf.getPage(pageNumber)
+        val scale = resolution / minOf(page.cropBox.width, page.cropBox.height)
+        val image = PDFRenderer(pdf).renderImage(pageNumber, scale, ImageType.RGB)
+        ByteArrayOutputStream().use { out ->
+          ImageIO.write(image, imageIOFormat, out)
+          out.toByteArray()
         }
       }
-    }
-
-    override fun drawImage(pdImage: PDImage) {
-      if (pdImage is PDImageXObject) {
-        if (pdImage.isStencil()) {
-          processColor(graphicsState.nonStrokingColor)
-        }
-      }
-      writeToBuffer(pdImage, extractImage)
-    }
-
-    override fun fillAndStrokePath(windingRule: Int) {
-      processColor(graphicsState.nonStrokingColor)
-    }
-
-    override fun fillPath(windingRule: Int) {
-      processColor(graphicsState.nonStrokingColor)
-    }
-
-    override fun strokePath() {
-      processColor(graphicsState.nonStrokingColor)
-    }
-
-    override fun showGlyph(textRenderingMatrix: Matrix,
-                           font: PDFont,
-                           code: Int,
-                           unicode: String,
-                           displacement: Vector) {
-      val renderingMode = graphicsState.textState.renderingMode
-      if (renderingMode.isFill) {
-        processColor(graphicsState.nonStrokingColor)
-      }
-      if (renderingMode.isStroke) {
-        processColor(graphicsState.strokingColor)
-      }
-    }
-
-    override fun shadingFill(shadingName: COSName?) {}
-    override fun clip(windingRule: Int) {}
-    override fun endPath() {}
-    override fun closePath() {}
-    override fun getCurrentPoint(): Point2D = Point2D.Float(0F, 0F)
-    override fun moveTo(x: Float, y: Float) {}
-    override fun lineTo(x: Float, y: Float) {}
-    override fun appendRectangle(p0: Point2D?, p1: Point2D?, p2: Point2D?, p3: Point2D?) {}
-    override fun curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float) {}
-
-    private fun writeToBuffer(pdImage: PDImage, extractImage: Boolean) {
-      var suffix: String? = pdImage.suffix
-      logger.trace { "PDF image suffix: $suffix" }
-      if (suffix == null || suffix == "jb2") {
-        suffix = "png"
-      } else if (suffix == "jpx") {
-        // use jp2 suffix for file because jpx not known by windows
-        suffix = "jp2"
-      }
-      logger.trace { "PDF image computed suffix: $suffix" }
-
-      ByteArrayOutputStream().use { out ->
-        val image = pdImage.image
-        if (image != null) {
-          when (suffix) {
-            "jpg" -> {
-              mediaType = "image/jpeg"
-              val colorSpaceName = pdImage.colorSpace.name
-              if (!hasMasks(pdImage) && (PDDeviceGray.INSTANCE.name == colorSpaceName || PDDeviceRGB.INSTANCE.name == colorSpaceName)) {
-                // RGB or Gray colorspace: get and write the unmodified JPEG stream
-                if (extractImage) {
-                  logger.debug { "RGB or Gray colorspace, get the unmodified JPEG stream" }
-                  val data = pdImage.createInputStream(JPEG)
-                  IOUtils.copy(data, out)
-                  IOUtils.closeQuietly(data)
-                }
-              } else {
-                // for CMYK and other "unusual" colorspaces, the JPEG will be converted
-                if (extractImage) {
-                  logger.debug { "CMYK or other colorspace, converting to JPEG" }
-                  ImageIO.write(image, suffix, out)
-                }
-              }
-            }
-            "jp2" -> {
-              mediaType = "image/jp2"
-              val colorSpaceName = pdImage.colorSpace.name
-              if (!hasMasks(pdImage) && (PDDeviceGray.INSTANCE.name == colorSpaceName || PDDeviceRGB.INSTANCE.name == colorSpaceName)) {
-                // RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
-                if (extractImage) {
-                  logger.debug { "RGB or Gray colorspace, get the unmodified JPEG2000 stream" }
-                  val data = pdImage.createInputStream(listOf(COSName.JPX_DECODE.name))
-                  IOUtils.copy(data, out)
-                  IOUtils.closeQuietly(data)
-                }
-              } else {
-                // for CMYK and other "unusual" colorspaces, the image will be converted
-                // ImageIOUtil.writeImage(image, "jpeg2000", out)
-                if (extractImage) {
-                  logger.debug { "CMYK or other colorspace, converting to JPEG2000" }
-                  ImageIO.write(image, "jpeg2000", out)
-                }
-              }
-            }
-            else -> {
-              mediaType = URLConnection.guessContentTypeFromName("file.$suffix") ?: "application/octet-stream"
-              if (extractImage) {
-                logger.debug { "Converting to $suffix" }
-                ImageIO.write(image, suffix, out)
-              }
-            }
-          }
-        }
-        buffer = out.toByteArray()
-      }
-    }
-
-  }
-
-  private fun hasMasks(pdImage: PDImage): Boolean {
-    return if (pdImage is PDImageXObject) {
-      pdImage.mask != null || pdImage.softMask != null
-    } else false
-  }
 }
\ No newline at end of file