fix: epub parsing namespace issue

This commit is contained in:
Gauthier Roebroeck 2021-06-22 15:54:51 +08:00
parent 55ec5a3478
commit 7a566326b0
3 changed files with 40 additions and 7 deletions

View file

@ -51,16 +51,19 @@ class EpubMetadataProvider(
val description = opf.selectFirst("metadata > dc|description")?.text()?.let { Jsoup.clean(it, Whitelist.none()) }?.ifBlank { null } val description = opf.selectFirst("metadata > dc|description")?.text()?.let { Jsoup.clean(it, Whitelist.none()) }?.ifBlank { null }
val date = opf.selectFirst("metadata > dc|date")?.text()?.let { parseDate(it) } val date = opf.selectFirst("metadata > dc|date")?.text()?.let { parseDate(it) }
val creatorRefines = opf.select("metadata > meta[property=role][scheme=marc:relators]") val authorRoles = (
opf.select("metadata > *|meta[property=role][scheme=marc:relators]") +
opf.select("metadata > meta[property=role][scheme=marc:relators]")
)
.associate { it.attr("refines").removePrefix("#") to it.text() } .associate { it.attr("refines").removePrefix("#") to it.text() }
val authors = opf.select("metadata > dc|creator") val authors = opf.select("metadata > dc|creator")
.mapNotNull { el -> .mapNotNull { el ->
val name = el.text()?.trim() val name = el.text()?.trim()
if (name.isNullOrBlank()) null if (name.isNullOrBlank()) null
else { else {
val opfRole = el.attr("opf|role").ifBlank { null } val opfRole = el.attr("opf:role").ifBlank { null }
val id = el.attr("id").ifBlank { null } val id = el.attr("id").ifBlank { null }
val refineRole = creatorRefines[id]?.ifBlank { null } val refineRole = authorRoles[id]?.ifBlank { null }
Author(name, relators[opfRole ?: refineRole] ?: "writer") Author(name, relators[opfRole ?: refineRole] ?: "writer")
} }
} }
@ -86,7 +89,10 @@ class EpubMetadataProvider(
epubExtractor.getPackageFile(book.book.path)?.let { packageFile -> epubExtractor.getPackageFile(book.book.path)?.let { packageFile ->
val opf = Jsoup.parse(packageFile) val opf = Jsoup.parse(packageFile)
val series = opf.selectFirst("metadata > *|meta[property=belongs-to-collection]")?.text()?.ifBlank { null } val series = (
opf.selectFirst("metadata > meta[property=belongs-to-collection]")
?: opf.selectFirst("metadata > *|meta[property=belongs-to-collection]")
)?.text()?.ifBlank { null }
val publisher = opf.selectFirst("metadata > dc|publisher")?.text()?.ifBlank { null } val publisher = opf.selectFirst("metadata > dc|publisher")?.text()?.ifBlank { null }
val language = opf.selectFirst("metadata > dc|language")?.text()?.ifBlank { null } val language = opf.selectFirst("metadata > dc|language")?.text()?.ifBlank { null }
val genre = opf.selectFirst("metadata > dc|subject")?.text()?.ifBlank { null } val genre = opf.selectFirst("metadata > dc|subject")?.text()?.ifBlank { null }

View file

@ -3,13 +3,17 @@ package org.gotson.komga.infrastructure.metadata.epub
import io.mockk.every import io.mockk.every
import io.mockk.mockk import io.mockk.mockk
import org.apache.commons.validator.routines.ISBNValidator import org.apache.commons.validator.routines.ISBNValidator
import org.apache.tika.config.TikaConfig
import org.assertj.core.api.Assertions.assertThat import org.assertj.core.api.Assertions.assertThat
import org.gotson.komga.domain.model.Author import org.gotson.komga.domain.model.Author
import org.gotson.komga.domain.model.BookWithMedia import org.gotson.komga.domain.model.BookWithMedia
import org.gotson.komga.domain.model.Media import org.gotson.komga.domain.model.Media
import org.gotson.komga.domain.model.SeriesMetadata import org.gotson.komga.domain.model.SeriesMetadata
import org.gotson.komga.domain.model.makeBook import org.gotson.komga.domain.model.makeBook
import org.gotson.komga.infrastructure.image.ImageAnalyzer
import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
import org.gotson.komga.infrastructure.mediacontainer.EpubExtractor import org.gotson.komga.infrastructure.mediacontainer.EpubExtractor
import org.gotson.komga.infrastructure.mediacontainer.ZipExtractor
import org.junit.jupiter.api.Nested import org.junit.jupiter.api.Nested
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.springframework.core.io.ClassPathResource import org.springframework.core.io.ClassPathResource
@ -21,6 +25,10 @@ class EpubMetadataProviderTest {
private val isbnValidator = ISBNValidator(true) private val isbnValidator = ISBNValidator(true)
private val epubMetadataProvider = EpubMetadataProvider(mockExtractor, isbnValidator) private val epubMetadataProvider = EpubMetadataProvider(mockExtractor, isbnValidator)
private val contentDetector = ContentDetector(TikaConfig())
private val imageAnalyzer = ImageAnalyzer()
private val epubMetadataProviderProper = EpubMetadataProvider(EpubExtractor(ZipExtractor(contentDetector, imageAnalyzer), contentDetector, imageAnalyzer), ISBNValidator(true))
private val book = makeBook("book") private val book = makeBook("book")
private val media = Media( private val media = Media(
status = Media.Status.READY, status = Media.Status.READY,
@ -42,13 +50,32 @@ class EpubMetadataProviderTest {
assertThat(summary).isEqualTo("Bereits im ersten Band \"Panik im Paradies\" machen die drei berühmten Detektive ihrem Namen alle Ehre. Eigentlich haben sie ja gerade Ferien. Doch dann treffen sie auf diesen schrulligen Kapitän Larsson, der sich einen kleinen Privatzoo mit exotischen Tieren hält. Als plötzlich alle Tiere an rätselhaften Infektionen erkranken und die Besucher ausbleiben, werden Justus, Peter und Bob neugierig. Schon bald merken sie, daß da jemand ein düsteres Geheimnis hütet...") assertThat(summary).isEqualTo("Bereits im ersten Band \"Panik im Paradies\" machen die drei berühmten Detektive ihrem Namen alle Ehre. Eigentlich haben sie ja gerade Ferien. Doch dann treffen sie auf diesen schrulligen Kapitän Larsson, der sich einen kleinen Privatzoo mit exotischen Tieren hält. Als plötzlich alle Tiere an rätselhaften Infektionen erkranken und die Besucher ausbleiben, werden Justus, Peter und Bob neugierig. Schon bald merken sie, daß da jemand ein düsteres Geheimnis hütet...")
assertThat(releaseDate).isEqualTo(LocalDate.of(1999, 7, 31)) assertThat(releaseDate).isEqualTo(LocalDate.of(1999, 7, 31))
assertThat(authors).containsExactlyInAnyOrder( assertThat(authors).containsExactlyInAnyOrder(
Author("Blanck, Ulf", "writer"), Author("Ulf Blanck", "writer"),
Author("Editor, The", "editor"), Author("The Editor", "editor"),
) )
assertThat(isbn).isEqualTo("9783440077894") assertThat(isbn).isEqualTo("9783440077894")
} }
} }
@Test
fun `given real epub 3 when getting book metadata then metadata patch is valid`() {
  // Reads an actual EPUB from the test classpath through the provider that is
  // built on a real EpubExtractor, rather than going through the mocked extractor.
  val resource = ClassPathResource("epub/The Incomplete Theft - Ralph Burke.epub")
  val bookWithMedia = BookWithMedia(makeBook("Epub", url = resource.url), media)

  // The provider must yield a patch for this file; a null result fails the test here.
  val patch = epubMetadataProviderProper.getBookMetadataFromBook(bookWithMedia)!!

  assertThat(patch.title).isEqualTo("The Incomplete Theft")
  assertThat(patch.summary).isNull()
  assertThat(patch.releaseDate).isEqualTo(LocalDate.of(2021, 6, 20))
  assertThat(patch.authors).containsExactlyInAnyOrder(Author("Ralph Burke", "writer"))
  assertThat(patch.isbn).isNull()
}
@Test @Test
fun `given epub 2 opf when getting book metadata then metadata patch is valid`() { fun `given epub 2 opf when getting book metadata then metadata patch is valid`() {
val opf = ClassPathResource("epub/1979.opf") val opf = ClassPathResource("epub/1979.opf")
@ -62,7 +89,7 @@ class EpubMetadataProviderTest {
assertThat(releaseDate).isEqualTo(LocalDate.of(101, 1, 1)) assertThat(releaseDate).isEqualTo(LocalDate.of(101, 1, 1))
assertThat(authors).containsExactlyInAnyOrder( assertThat(authors).containsExactlyInAnyOrder(
Author("Kracht, Christian", "writer"), Author("Kracht, Christian", "writer"),
Author("Editor, The", "editor"), Author("The Editor", "editor"),
) )
assertThat(isbn).isNull() assertThat(isbn).isNull()
} }