fix: better handling of broken epub

Closes: #1844
This commit is contained in:
Gauthier Roebroeck 2025-01-23 11:37:28 +08:00
parent 214f687c2e
commit 42047cdafb
5 changed files with 27 additions and 21 deletions

View file

@ -20,7 +20,7 @@ data class EpubPackage(
// Opens the file at this path as a zip archive, resolves the OPF package document path via getPackagePath(),
// parses that document with Jsoup's XML parser, and passes the resulting EpubPackage to `block`,
// returning whatever `block` returns.
// NOTE(review): this is a diff hunk whose +/- markers were lost. Of the two `val opfDoc` lines below,
// the first is the removed line and the second is its replacement.
inline fun <R> Path.epub(block: (EpubPackage) -> R): R =
ZipFile.builder().setPath(this).use { zip ->
val opfFile = zip.getPackagePath()
// Removed side: the entry stream is used directly, with no handling for an entry that is not in the archive.
val opfDoc = zip.getEntryInputStream(opfFile).use { Jsoup.parse(it, null, "", Parser.xmlParser()) }
// Added side: a null stream (entry not found in the zip) is reported as MediaUnsupportedException.
val opfDoc = zip.getEntryInputStream(opfFile)?.use { Jsoup.parse(it, null, "", Parser.xmlParser()) } ?: throw MediaUnsupportedException("Could not open OPF resource")
// Parent directory of the OPF path inside the archive.
// NOTE(review): Path.parent is null when opfFile has no directory component; confirm EpubPackage accepts that.
val opfDir = Paths.get(opfFile).parent
block(EpubPackage(zip, opfDoc, opfDir, opfDoc.getManifest()))
}
@ -30,9 +30,9 @@ inline fun <R> Path.epub(block: (EpubPackage) -> R): R =
*/
fun ZipFile.getPackagePath(): String =
// NOTE(review): diff hunk without +/- markers. The three plain `.` calls below are the removed lines,
// and the three `?.` calls that follow them are their replacements.
getEntryInputStream("META-INF/container.xml")
.use { Jsoup.parse(it, null, "") }
.getElementsByTag("rootfile")
.first()
?.use { Jsoup.parse(it, null, "") }
?.getElementsByTag("rootfile")
?.first()
// Reads the `full-path` attribute of the first `rootfile` element. A null anywhere earlier in the
// safe-call chain (including a missing META-INF/container.xml entry, not only a missing rootfile tag)
// falls through to the exception below.
?.attr("full-path") ?: throw MediaUnsupportedException("META-INF/container.xml does not contain rootfile tag")
/**
@ -41,7 +41,7 @@ fun ZipFile.getPackagePath(): String =
fun getPackageFileContent(path: Path): String? =
ZipFile.builder().setPath(path).use { zip ->
try {
zip.getEntryInputStream(zip.getPackagePath()).reader().use { it.readText() }
zip.getEntryInputStream(zip.getPackagePath())?.reader()?.use { it.readText() }
} catch (e: Exception) {
null
}

View file

@ -70,10 +70,12 @@ class EpubExtractor(
val href = coverManifestItem.href
val mediaType = coverManifestItem.mediaType
val coverPath = normalizeHref(opfDir, href)
TypedBytes(
zip.getEntryBytes(coverPath),
mediaType,
)
zip.getEntryBytes(coverPath)?.let { coverBytes ->
TypedBytes(
coverBytes,
mediaType,
)
}
} else {
null
}
@ -151,7 +153,7 @@ class EpubExtractor(
.map { it.attr("idref") }
.mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
.map { pagePath ->
val doc = epub.zip.getEntryInputStream(pagePath).use { Jsoup.parse(it, null, "") }
val doc = epub.zip.getEntryInputStream(pagePath)?.use { Jsoup.parse(it, null, "") } ?: return@map emptyList()
// if a page has text over the threshold then the book is not divina compatible
if (doc.body().text().length > letterCountThreshold) return emptyList()
@ -217,8 +219,8 @@ class EpubExtractor(
val readingOrder = resources.filter { it.subType == MediaFile.SubType.EPUB_PAGE }
readingOrder.forEach { mediaFile ->
val doc = epub.zip.getEntryInputStream(mediaFile.fileName).use { Jsoup.parse(it, null, "") }
if (!doc.getElementsByClass("koboSpan").isNullOrEmpty()) return true
val doc = epub.zip.getEntryInputStream(mediaFile.fileName)?.use { Jsoup.parse(it, null, "") }
if (!doc?.getElementsByClass("koboSpan").isNullOrEmpty()) return true
}
} catch (e: Exception) {
logger.warn(e) { "Error while checking if EPUB is KEPUB" }
@ -257,7 +259,7 @@ class EpubExtractor(
val koboPositions =
when {
isFixedLayout -> emptyMap()
isKepub -> computePositionsFromKoboSpan(readingOrder) { filename -> epub.zip.getEntryInputStream(filename).use { it.readBytes().decodeToString() } }
isKepub -> computePositionsFromKoboSpan(readingOrder) { filename -> epub.zip.getEntryInputStream(filename).use { it?.readBytes()?.decodeToString() } }
kepubConverter.isAvailable -> {
try {
val kepub =
@ -327,12 +329,12 @@ class EpubExtractor(
*/
private fun computePositionsFromKoboSpan(
readingOrder: List<MediaFile>,
resourceSupplier: (String) -> String,
): Map<String, List<Pair<String, Float>>> =
resourceSupplier: (String) -> String?,
): Map<String, List<Pair<String, Float>>?> =
readingOrder.associate { file ->
val doc = Jsoup.parse(resourceSupplier(file.fileName), Parser.htmlParser().setTrackPosition(true))
val doc = resourceSupplier(file.fileName)?.let { resource -> Jsoup.parse(resource, Parser.htmlParser().setTrackPosition(true)) }
file.fileName to
doc.select("span.koboSpan").mapNotNull { koboSpan ->
doc?.select("span.koboSpan")?.mapNotNull { koboSpan ->
val id = koboSpan.id()
if (!id.isNullOrBlank()) {
// progression is built from the position in the file of each koboSpan, divided by the file size

View file

@ -11,7 +11,9 @@ import kotlin.io.path.Path
// Returns the content of the first manifest item whose `properties` contain "nav",
// or null when the manifest has no such item.
// NOTE(review): diff hunk without +/- markers. The single-line `ResourceContent(...)` call is the removed
// line, and the three-line `zip.getEntryBytes(href)?...let { ... }` expression is its replacement.
fun EpubPackage.getNavResource(): ResourceContent? =
manifest.values.firstOrNull { it.properties.contains("nav") }?.let { nav ->
// Resolve the manifest href against the OPF directory to get the entry name inside the zip.
val href = normalizeHref(opfDir, nav.href)
// Removed side: decodes the entry bytes unconditionally.
ResourceContent(Path(href), zip.getEntryBytes(href).decodeToString())
// Added side: when the entry is absent from the zip, getEntryBytes yields null and so does this function.
zip.getEntryBytes(href)?.decodeToString()?.let { navContent ->
ResourceContent(Path(href), navContent)
}
}
fun processNav(

View file

@ -13,7 +13,9 @@ private val possibleNcxItemIds = listOf("toc", "ncx", "ncxtoc")
// Returns the content of the NCX manifest item, or null when none is found.
// The item is the first one with media type "application/x-dtbncx+xml"; failing that,
// the first one whose id is listed in possibleNcxItemIds.
// NOTE(review): diff hunk without +/- markers. The single-line `ResourceContent(...)` call is the removed
// line, and the three-line `zip.getEntryBytes(href)?...let { ... }` expression is its replacement.
fun EpubPackage.getNcxResource(): ResourceContent? =
(manifest.values.firstOrNull { it.mediaType == "application/x-dtbncx+xml" } ?: manifest.values.firstOrNull { possibleNcxItemIds.contains(it.id) })?.let { ncx ->
// Resolve the manifest href against the OPF directory to get the entry name inside the zip.
val href = normalizeHref(opfDir, ncx.href)
// Removed side: decodes the entry bytes unconditionally.
ResourceContent(Path(href), zip.getEntryBytes(href).decodeToString())
// Added side: when the entry is absent from the zip, getEntryBytes yields null and so does this function.
zip.getEntryBytes(href)?.decodeToString()?.let { ncxContent ->
ResourceContent(Path(href), ncxContent)
}
}
fun processNcx(

View file

@ -7,9 +7,9 @@ import java.nio.file.Path
// Builds the ZipFile with get() and hands it to `block` through Kotlin's `use`,
// so the archive is closed once `block` completes or throws. Returns the result of `block`.
inline fun <R> ZipFile.Builder.use(block: (ZipFile) -> R) = this.get().use(block)
// Opens an input stream on the zip entry named `entryName`. The stream is not closed here; callers close it.
// NOTE(review): diff hunk without +/- markers. The first declaration (non-null InputStream) is the removed
// line; the second is its replacement, which returns null when getEntry finds no entry with that name.
fun ZipFile.getEntryInputStream(entryName: String): InputStream = this.getInputStream(this.getEntry(entryName))
fun ZipFile.getEntryInputStream(entryName: String): InputStream? = this.getEntry(entryName)?.let { entry -> this.getInputStream(entry) }
// Reads the whole zip entry named `entryName` into a byte array, closing the entry stream via `use`.
// NOTE(review): diff hunk without +/- markers. The first declaration (non-null ByteArray) is the removed
// line; the second is its replacement, which returns null when getEntry finds no entry with that name.
fun ZipFile.getEntryBytes(entryName: String): ByteArray = this.getInputStream(this.getEntry(entryName)).use { it.readBytes() }
fun ZipFile.getEntryBytes(entryName: String): ByteArray? = this.getEntry(entryName)?.let { entry -> this.getInputStream(entry).use { it.readBytes() } }
fun getZipEntryBytes(
path: Path,