fix(epub): some EPUB files would not be parsed correctly

This commit is contained in:
Gauthier Roebroeck 2020-04-10 11:49:39 +08:00
parent a4f5015435
commit f0ff785d66

View file

@@ -18,12 +18,10 @@ class EpubExtractor(contentDetector: ContentDetector) : ZipExtractor(contentDete
override fun getEntries(path: Path): List<MediaContainerEntry> { override fun getEntries(path: Path): List<MediaContainerEntry> {
ZipFile(path.toFile()).use { zip -> ZipFile(path.toFile()).use { zip ->
try { try {
val packagePath = zip.getEntry("META-INF/container.xml").let { entry -> val packagePath = getPackagePath(zip)
val container = zip.getInputStream(entry).use { Jsoup.parse(it, null, "") }
container.getElementsByTag("rootfile").first().attr("full-path")
}
val opf = zip.getInputStream(zip.getEntry(packagePath)).use { Jsoup.parse(it, null, "") } val opf = zip.getInputStream(zip.getEntry(packagePath)).use { Jsoup.parse(it, null, "") }
val opfPath = Paths.get(packagePath).parentOrEmpty()
val manifest = opf.select("manifest > item") val manifest = opf.select("manifest > item")
.associate { it.attr("id") to ManifestItem(it.attr("id"), it.attr("href"), it.attr("media-type")) } .associate { it.attr("id") to ManifestItem(it.attr("id"), it.attr("href"), it.attr("media-type")) }
@@ -32,15 +30,17 @@ class EpubExtractor(contentDetector: ContentDetector) : ZipExtractor(contentDete
.mapNotNull { manifest[it] } .mapNotNull { manifest[it] }
.map { it.href } .map { it.href }
val images = pages.flatMap { pagePath -> val images = pages
val doc = zip.getInputStream(zip.getEntry(pagePath)).use { Jsoup.parse(it, null, "") } .map { opfPath.resolve(it).normalize() }
doc.getElementsByTag("img") .flatMap { pagePath ->
.map { it.attr("src") } val doc = zip.getInputStream(zip.getEntry(pagePath.toString())).use { Jsoup.parse(it, null, "") }
.map { Paths.get(pagePath).parent?.resolve(it).toString() } doc.getElementsByTag("img")
} .map { it.attr("src") }
.map { pagePath.parentOrEmpty().resolve(it).normalize() }
}
return images.map { image -> return images.map { image ->
MediaContainerEntry(image, manifest.values.first { it.href == image }.mediaType) MediaContainerEntry(image.toString(), manifest.values.first { it.href == opfPath.relativize(image).toString() }.mediaType)
} }
} catch (e: Exception) { } catch (e: Exception) {
logger.error(e) { "File is not a proper Epub, treating it as a zip file" } logger.error(e) { "File is not a proper Epub, treating it as a zip file" }