fix: epub parsing namespace issue

This commit is contained in:
Gauthier Roebroeck 2021-06-22 15:54:51 +08:00
parent 55ec5a3478
commit 7a566326b0
3 changed files with 40 additions and 7 deletions

View file

@ -51,16 +51,19 @@ class EpubMetadataProvider(
val description = opf.selectFirst("metadata > dc|description")?.text()?.let { Jsoup.clean(it, Whitelist.none()) }?.ifBlank { null }
val date = opf.selectFirst("metadata > dc|date")?.text()?.let { parseDate(it) }
val creatorRefines = opf.select("metadata > meta[property=role][scheme=marc:relators]")
val authorRoles = (
opf.select("metadata > *|meta[property=role][scheme=marc:relators]") +
opf.select("metadata > meta[property=role][scheme=marc:relators]")
)
.associate { it.attr("refines").removePrefix("#") to it.text() }
val authors = opf.select("metadata > dc|creator")
.mapNotNull { el ->
val name = el.text()?.trim()
if (name.isNullOrBlank()) null
else {
val opfRole = el.attr("opf|role").ifBlank { null }
val opfRole = el.attr("opf:role").ifBlank { null }
val id = el.attr("id").ifBlank { null }
val refineRole = creatorRefines[id]?.ifBlank { null }
val refineRole = authorRoles[id]?.ifBlank { null }
Author(name, relators[opfRole ?: refineRole] ?: "writer")
}
}
@ -86,7 +89,10 @@ class EpubMetadataProvider(
epubExtractor.getPackageFile(book.book.path)?.let { packageFile ->
val opf = Jsoup.parse(packageFile)
val series = opf.selectFirst("metadata > *|meta[property=belongs-to-collection]")?.text()?.ifBlank { null }
val series = (
opf.selectFirst("metadata > meta[property=belongs-to-collection]")
?: opf.selectFirst("metadata > *|meta[property=belongs-to-collection]")
)?.text()?.ifBlank { null }
val publisher = opf.selectFirst("metadata > dc|publisher")?.text()?.ifBlank { null }
val language = opf.selectFirst("metadata > dc|language")?.text()?.ifBlank { null }
val genre = opf.selectFirst("metadata > dc|subject")?.text()?.ifBlank { null }

View file

@ -3,13 +3,17 @@ package org.gotson.komga.infrastructure.metadata.epub
import io.mockk.every
import io.mockk.mockk
import org.apache.commons.validator.routines.ISBNValidator
import org.apache.tika.config.TikaConfig
import org.assertj.core.api.Assertions.assertThat
import org.gotson.komga.domain.model.Author
import org.gotson.komga.domain.model.BookWithMedia
import org.gotson.komga.domain.model.Media
import org.gotson.komga.domain.model.SeriesMetadata
import org.gotson.komga.domain.model.makeBook
import org.gotson.komga.infrastructure.image.ImageAnalyzer
import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
import org.gotson.komga.infrastructure.mediacontainer.EpubExtractor
import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
import org.junit.jupiter.api.Nested
import org.junit.jupiter.api.Test
import org.springframework.core.io.ClassPathResource
@ -21,6 +25,10 @@ class EpubMetadataProviderTest {
private val isbnValidator = ISBNValidator(true)
private val epubMetadataProvider = EpubMetadataProvider(mockExtractor, isbnValidator)
// Concrete collaborators (as opposed to mockExtractor) so that a test can run
// the provider against an actual epub file instead of a stubbed package file.
private val contentDetector = ContentDetector(TikaConfig())
private val imageAnalyzer = ImageAnalyzer()
private val epubMetadataProviderProper = EpubMetadataProvider(
  EpubExtractor(
    ZipExtractor(contentDetector, imageAnalyzer),
    contentDetector,
    imageAnalyzer,
  ),
  ISBNValidator(true),
)
private val book = makeBook("book")
private val media = Media(
status = Media.Status.READY,
@ -42,13 +50,32 @@ class EpubMetadataProviderTest {
assertThat(summary).isEqualTo("Bereits im ersten Band \"Panik im Paradies\" machen die drei berühmten Detektive ihrem Namen alle Ehre. Eigentlich haben sie ja gerade Ferien. Doch dann treffen sie auf diesen schrulligen Kapitän Larsson, der sich einen kleinen Privatzoo mit exotischen Tieren hält. Als plötzlich alle Tiere an rätselhaften Infektionen erkranken und die Besucher ausbleiben, werden Justus, Peter und Bob neugierig. Schon bald merken sie, daß da jemand ein düsteres Geheimnis hütet...")
assertThat(releaseDate).isEqualTo(LocalDate.of(1999, 7, 31))
assertThat(authors).containsExactlyInAnyOrder(
Author("Blanck, Ulf", "writer"),
Author("Editor, The", "editor"),
Author("Ulf Blanck", "writer"),
Author("The Editor", "editor"),
)
assertThat(isbn).isEqualTo("9783440077894")
}
}
@Test
fun `given real epub 3 when getting book metadata then metadata patch is valid`() {
  // Uses the provider wired with a real extractor, reading an epub from the test classpath.
  val resource = ClassPathResource("epub/The Incomplete Theft - Ralph Burke.epub")
  val bookWithMedia = BookWithMedia(makeBook("Epub", url = resource.url), media)

  // A null patch fails the test right here, before any assertion runs.
  val metadata = epubMetadataProviderProper.getBookMetadataFromBook(bookWithMedia)!!

  assertThat(metadata.title).isEqualTo("The Incomplete Theft")
  assertThat(metadata.summary).isNull()
  assertThat(metadata.releaseDate).isEqualTo(LocalDate.of(2021, 6, 20))
  assertThat(metadata.authors).containsExactlyInAnyOrder(Author("Ralph Burke", "writer"))
  assertThat(metadata.isbn).isNull()
}
@Test
fun `given epub 2 opf when getting book metadata then metadata patch is valid`() {
val opf = ClassPathResource("epub/1979.opf")
@ -62,7 +89,7 @@ class EpubMetadataProviderTest {
assertThat(releaseDate).isEqualTo(LocalDate.of(101, 1, 1))
assertThat(authors).containsExactlyInAnyOrder(
Author("Kracht, Christian", "writer"),
Author("Editor, The", "editor"),
Author("The Editor", "editor"),
)
assertThat(isbn).isNull()
}