fix: ignore xml namespace in EPUB opf file

Closes: #2043
This commit is contained in:
Gauthier Roebroeck 2025-08-20 14:30:33 +08:00
parent 4e7c49d5d8
commit 3ab21ff6aa
5 changed files with 171 additions and 29 deletions

View file

@ -60,7 +60,7 @@ class EpubExtractor(
manifest.values.firstOrNull { it.properties.contains("cover-image") }
?: // EPUB 2 - get cover from meta element with name="cover"
opfDoc
.selectFirst("metadata > meta[name=cover]")
.selectFirst("*|metadata > *|meta[name=cover]")
?.attr("content")
?.ifBlank { null }
?.let { manifest[it] }
@ -84,7 +84,7 @@ class EpubExtractor(
fun getResources(epub: EpubPackage): List<MediaFile> {
val spine =
epub.opfDoc
.select("spine > itemref")
.select("*|spine > *|itemref")
.map { it.attr("idref") }
.mapNotNull { epub.manifest[it] }
@ -126,7 +126,7 @@ class EpubExtractor(
run {
val spine =
epub.opfDoc
.select("spine > itemref")
.select("*|spine > *|itemref")
.map { it.attr("idref") }
.mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
@ -137,7 +137,7 @@ class EpubExtractor(
val pagesWithImages =
epub.opfDoc
.select("spine > itemref")
.select("*|spine > *|itemref")
.map { it.attr("idref") }
.mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
.map { pagePath ->
@ -219,7 +219,7 @@ class EpubExtractor(
fun computePageCount(epub: EpubPackage): Int {
val spine =
epub.opfDoc
.select("spine > itemref")
.select("*|spine > *|itemref")
.map { it.attr("idref") }
.mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
@ -230,8 +230,8 @@ class EpubExtractor(
}
fun isFixedLayout(epub: EpubPackage) =
epub.opfDoc.selectFirst("metadata > *|meta[property=rendition:layout]")?.text() == "pre-paginated" ||
epub.opfDoc.selectFirst("metadata > *|meta[name=fixed-layout]")?.attr("content") == "true"
epub.opfDoc.selectFirst("*|metadata > *|meta[property=rendition:layout]")?.text() == "pre-paginated" ||
epub.opfDoc.selectFirst("*|metadata > *|meta[name=fixed-layout]")?.attr("content") == "true"
fun computePositions(
epub: EpubPackage,

View file

@ -8,7 +8,7 @@ import java.nio.file.Paths
import kotlin.io.path.invariantSeparatorsPathString
fun Document.getManifest() =
select("manifest > item").associate {
select("*|manifest > *|item").associate {
it.attr("id") to
ManifestItem(
it.attr("id"),
@ -36,8 +36,8 @@ fun processOpfGuide(
opf: Document,
opfDir: Path?,
): List<EpubTocEntry> {
val guide = opf.selectFirst("guide") ?: return emptyList()
return guide.select("reference").map { ref ->
val guide = opf.selectFirst("*|guide") ?: return emptyList()
return guide.select("*|reference").map { ref ->
EpubTocEntry(
ref.attr("title"),
ref.attr("href").ifBlank { null }?.let { normalizeHref(opfDir, URLDecoder.decode(it, Charsets.UTF_8)) },

View file

@ -51,22 +51,22 @@ class EpubMetadataProvider(
getPackageFileContent(book.book.path)?.let { packageFile ->
val opf = Jsoup.parse(packageFile, "", Parser.xmlParser())
val title = opf.selectFirst("metadata > dc|title")?.text()?.ifBlank { null }
val title = opf.selectFirst("*|metadata > *|title")?.text()?.ifBlank { null }
val description =
opf
.selectFirst("metadata > dc|description")
.selectFirst("*|metadata > *|description")
?.text()
?.let { Jsoup.clean(it, Safelist.none()) }
?.ifBlank { null }
val date = opf.selectFirst("metadata > dc|date")?.text()?.let { parseDate(it) }
val date = opf.selectFirst("*|metadata > *|date")?.text()?.let { parseDate(it) }
val authorRoles =
opf
.select("metadata > *|meta[property=role][scheme=marc:relators]")
.select("*|metadata > *|meta[property=role][scheme=marc:relators]")
.associate { it.attr("refines").removePrefix("#") to it.text() }
val authors =
opf
.select("metadata > dc|creator")
.select("*|metadata > *|creator")
.mapNotNull { el ->
val name = el.text().trim()
if (name.isBlank()) {
@ -81,16 +81,16 @@ class EpubMetadataProvider(
val isbn =
opf
.select("metadata > dc|identifier")
.select("*|metadata > *|identifier")
.map { it.text().lowercase().removePrefix("isbn:") }
.firstNotNullOfOrNull { isbnValidator.validate(it) }
val seriesIndex =
opf
.selectFirst("metadata > *|meta[property=belongs-to-collection]")
.selectFirst("*|metadata > *|meta[property=belongs-to-collection]")
?.attr("id")
?.let { id ->
opf.selectFirst("metadata > *|meta[refines=#$id][property=group-position]")
opf.selectFirst("*|metadata > *|meta[refines=#$id][property=group-position]")
}?.text()
return BookMetadataPatch(
@ -116,18 +116,18 @@ class EpubMetadataProvider(
getPackageFileContent(book.book.path)?.let { packageFile ->
val opf = Jsoup.parse(packageFile, "", Parser.xmlParser())
val series = opf.selectFirst("metadata > *|meta[property=belongs-to-collection]")?.text()?.ifBlank { null }
val publisher = opf.selectFirst("metadata > dc|publisher")?.text()?.ifBlank { null }
val language = opf.selectFirst("metadata > dc|language")?.text()?.ifBlank { null }
val series = opf.selectFirst("*|metadata > *|meta[property=belongs-to-collection]")?.text()?.ifBlank { null }
val publisher = opf.selectFirst("*|metadata > *|publisher")?.text()?.ifBlank { null }
val language = opf.selectFirst("*|metadata > *|language")?.text()?.ifBlank { null }
val genres =
opf
.select("metadata > dc|subject")
.select("*|metadata > *|subject")
.mapNotNull { it.text().trim().ifBlank { null } }
.toSet()
.ifEmpty { null }
val direction =
opf.getElementsByTag("spine").first()?.attr("page-progression-direction")?.let {
opf.selectFirst("*|spine")?.attr("page-progression-direction")?.let {
when (it) {
"rtl" -> SeriesMetadata.ReadingDirection.RIGHT_TO_LEFT
"ltr" -> SeriesMetadata.ReadingDirection.LEFT_TO_RIGHT

View file

@ -14,6 +14,8 @@ import org.gotson.komga.infrastructure.mediacontainer.epub.getPackageFileContent
import org.junit.jupiter.api.AfterEach
import org.junit.jupiter.api.Nested
import org.junit.jupiter.api.Test
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.ValueSource
import org.springframework.core.io.ClassPathResource
import java.time.LocalDate
@ -37,9 +39,15 @@ class EpubMetadataProviderTest {
@Nested
inner class Book {
@Test
fun `given epub 3 opf when getting book metadata then metadata patch is valid`() {
val opf = ClassPathResource("epub/Panik im Paradies.opf")
@ParameterizedTest
@ValueSource(
strings = [
"epub/Panik im Paradies.opf",
"epub/Panik im Paradies - namespace.opf",
],
)
fun `given epub 3 opf when getting book metadata then metadata patch is valid`(opfFile: String) {
val opf = ClassPathResource(opfFile)
mockkStatic(::getPackageFileContent)
every { getPackageFileContent(any()) } returns opf.file.readText()
@ -128,9 +136,15 @@ class EpubMetadataProviderTest {
@Nested
inner class Series {
@Test
fun `given epub 3 opf when getting series metadata then metadata patch is valid`() {
val opf = ClassPathResource("epub/Panik im Paradies.opf")
@ParameterizedTest
@ValueSource(
strings = [
"epub/Panik im Paradies.opf",
"epub/Panik im Paradies - namespace.opf",
],
)
fun `given epub 3 opf when getting series metadata then metadata patch is valid`(opfFile: String) {
val opf = ClassPathResource(opfFile)
mockkStatic(::getPackageFileContent)
every { getPackageFileContent(any()) } returns opf.file.readText()

View file

@ -0,0 +1,128 @@
<!--
OPF package document in which every element carries an explicit namespace
prefix: "myopf" is bound to the OPF namespace and "mydc" to Dublin Core.
NOTE(review): presumably the "Panik im Paradies - namespace.opf" fixture used
by the parameterized metadata provider tests; confirm against the file name.
-->
<myopf:package xmlns:myopf="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uuid_id" prefix="calibre: https://calibre-ebook.com">
<!-- Metadata: Dublin Core fields (mydc:*) followed by meta elements; a meta with a "refines" attribute points at the id of the element it qualifies. -->
<myopf:metadata xmlns:mydc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
<mydc:title id="id">Panik im Paradies</mydc:title>
<mydc:creator id="id-1">Ulf Blanck</mydc:creator>
<mydc:creator id="id-3">The Editor</mydc:creator>
<mydc:identifier>goodreads:222735</mydc:identifier>
<mydc:identifier>isbn:9783440077894</mydc:identifier>
<mydc:identifier>calibre:255</mydc:identifier>
<mydc:identifier>uuid:499def46-39dc-4e79-b474-d0ec12ea5dc5</mydc:identifier>
<mydc:identifier id="uuid_id">uuid:499def46-39dc-4e79-b474-d0ec12ea5dc5</mydc:identifier>
<mydc:language>de</mydc:language>
<mydc:date>1999-07-31T16:00:00+00:00</mydc:date>
<mydc:description>&lt;div&gt;
&lt;p&gt;Bereits im ersten Band "Panik im Paradies" machen die drei berühmten Detektive ihrem Namen alle Ehre. Eigentlich haben sie ja gerade Ferien. Doch dann treffen sie auf diesen schrulligen Kapitän Larsson, der sich einen kleinen Privatzoo mit exotischen Tieren hält. Als plötzlich alle Tiere an rätselhaften Infektionen erkranken und die Besucher ausbleiben, werden Justus, Peter und Bob neugierig. Schon bald merken sie, daß da jemand ein düsteres Geheimnis hütet...&lt;/p&gt;&lt;/div&gt;</mydc:description>
<mydc:publisher>Kosmos</mydc:publisher>
<mydc:subject>Kinder- und Jugendbücher</mydc:subject>
<myopf:meta refines="#id" property="title-type">main</myopf:meta>
<myopf:meta refines="#id" property="file-as">Panik im Paradies</myopf:meta>
<myopf:meta name="cover" content="cover"/>
<myopf:meta property="calibre:timestamp" scheme="dcterms:W3CDTF">2020-08-09T08:40:58Z</myopf:meta>
<myopf:meta property="dcterms:modified" scheme="dcterms:W3CDTF">2021-06-19T08:20:33Z</myopf:meta>
<myopf:meta refines="#id-1" property="role" scheme="marc:relators">aut</myopf:meta>
<myopf:meta refines="#id-1" property="file-as">Blanck, Ulf</myopf:meta>
<myopf:meta refines="#id-3" property="role" scheme="marc:relators">edt</myopf:meta>
<myopf:meta refines="#id-3" property="file-as">Editor, The</myopf:meta>
<myopf:meta property="calibre:rating">6</myopf:meta>
<myopf:meta property="belongs-to-collection" id="id-2">Die drei ??? Kids</myopf:meta>
<myopf:meta refines="#id-2" property="collection-type">series</myopf:meta>
<myopf:meta refines="#id-2" property="group-position">1.5</myopf:meta>
<myopf:meta property="calibre:author_link_map">{"Ulf Blanck": ""}</myopf:meta>
</myopf:metadata>
<!-- Manifest: one item per resource, each with an id, an href and a media-type; some also carry a properties attribute (e.g. "nav", "cover-image"). -->
<myopf:manifest>
<myopf:item id="titlepage" href="titlepage.xhtml" media-type="application/xhtml+xml" properties="svg calibre:title-page"/>
<myopf:item id="TableOfContents_html" href="OPS/TableOfContents.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0001_html" href="OPS/section-0001.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0002_html" href="OPS/section-0002.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0003_html" href="OPS/section-0003.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0004_html" href="OPS/section-0004.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0005_html" href="OPS/section-0005.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0006_html" href="OPS/section-0006.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0007_html" href="OPS/section-0007.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0008_html" href="OPS/section-0008.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0009_html" href="OPS/section-0009.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0010_html" href="OPS/section-0010.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0011_html" href="OPS/section-0011.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0012_html" href="OPS/section-0012.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0013_html" href="OPS/section-0013.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0014_html" href="OPS/section-0014.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0015_html" href="OPS/section-0015.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0016_html" href="OPS/section-0016.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0017_html" href="OPS/section-0017.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0018_html" href="OPS/section-0018.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0019_html" href="OPS/section-0019.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0020_html" href="OPS/section-0020.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0021_html" href="OPS/section-0021.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0022_html" href="OPS/section-0022.html" media-type="application/xhtml+xml"/>
<myopf:item id="section-0023_html" href="OPS/section-0023.html" media-type="application/xhtml+xml"/>
<myopf:item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<myopf:item id="page_css" href="page_styles.css" media-type="text/css"/>
<myopf:item id="css" href="stylesheet.css" media-type="text/css"/>
<myopf:item id="cover" href="cover.jpeg" media-type="image/jpeg" properties="cover-image"/>
<myopf:item id="image0_jpg" href="OPS/image0.jpg" media-type="image/jpeg"/>
<myopf:item id="image1_jpg" href="OPS/image1.jpg" media-type="image/jpeg"/>
<myopf:item id="image10_jpg" href="OPS/image10.jpg" media-type="image/jpeg"/>
<myopf:item id="image11_jpg" href="OPS/image11.jpg" media-type="image/jpeg"/>
<myopf:item id="image12_jpg" href="OPS/image12.jpg" media-type="image/jpeg"/>
<myopf:item id="image13_jpg" href="OPS/image13.jpg" media-type="image/jpeg"/>
<myopf:item id="image14_jpg" href="OPS/image14.jpg" media-type="image/jpeg"/>
<myopf:item id="image15_jpg" href="OPS/image15.jpg" media-type="image/jpeg"/>
<myopf:item id="image16_jpg" href="OPS/image16.jpg" media-type="image/jpeg"/>
<myopf:item id="image17_jpg" href="OPS/image17.jpg" media-type="image/jpeg"/>
<myopf:item id="image18_jpg" href="OPS/image18.jpg" media-type="image/jpeg"/>
<myopf:item id="image19_jpg" href="OPS/image19.jpg" media-type="image/jpeg"/>
<myopf:item id="image2_jpg" href="OPS/image2.jpg" media-type="image/jpeg"/>
<myopf:item id="image20_jpg" href="OPS/image20.jpg" media-type="image/jpeg"/>
<myopf:item id="image21_jpg" href="OPS/image21.jpg" media-type="image/jpeg"/>
<myopf:item id="image22_jpg" href="OPS/image22.jpg" media-type="image/jpeg"/>
<myopf:item id="image23_jpg" href="OPS/image23.jpg" media-type="image/jpeg"/>
<myopf:item id="image24_jpg" href="OPS/image24.jpg" media-type="image/jpeg"/>
<myopf:item id="image25_jpg" href="OPS/image25.jpg" media-type="image/jpeg"/>
<myopf:item id="image26_jpg" href="OPS/image26.jpg" media-type="image/jpeg"/>
<myopf:item id="image27_jpg" href="OPS/image27.jpg" media-type="image/jpeg"/>
<myopf:item id="image28_jpg" href="OPS/image28.jpg" media-type="image/jpeg"/>
<myopf:item id="image29_jpg" href="OPS/image29.jpg" media-type="image/jpeg"/>
<myopf:item id="image3_jpg" href="OPS/image3.jpg" media-type="image/jpeg"/>
<myopf:item id="image30_jpg" href="OPS/image30.jpg" media-type="image/jpeg"/>
<myopf:item id="image31_jpg" href="OPS/image31.jpg" media-type="image/jpeg"/>
<myopf:item id="image32_jpg" href="OPS/image32.jpg" media-type="image/jpeg"/>
<myopf:item id="image33_jpg" href="OPS/image33.jpg" media-type="image/jpeg"/>
<myopf:item id="image34_jpg" href="OPS/image34.jpg" media-type="image/jpeg"/>
<myopf:item id="image35_jpg" href="OPS/image35.jpg" media-type="image/jpeg"/>
<myopf:item id="image4_jpg" href="OPS/image4.jpg" media-type="image/jpeg"/>
<myopf:item id="image5_jpg" href="OPS/image5.jpg" media-type="image/jpeg"/>
<myopf:item id="image6_jpg" href="OPS/image6.jpg" media-type="image/jpeg"/>
<myopf:item id="image7_jpg" href="OPS/image7.jpg" media-type="image/jpeg"/>
<myopf:item id="image8_jpg" href="OPS/image8.jpg" media-type="image/jpeg"/>
<myopf:item id="image9_jpg" href="OPS/image9.jpg" media-type="image/jpeg"/>
</myopf:manifest>
<!-- Spine: itemref entries whose idref values match item ids declared in the manifest above; page-progression-direction is set to "rtl". -->
<myopf:spine page-progression-direction="rtl">
<myopf:itemref idref="titlepage"/>
<myopf:itemref idref="TableOfContents_html"/>
<myopf:itemref idref="section-0001_html"/>
<myopf:itemref idref="section-0002_html"/>
<myopf:itemref idref="section-0003_html"/>
<myopf:itemref idref="section-0004_html"/>
<myopf:itemref idref="section-0005_html"/>
<myopf:itemref idref="section-0006_html"/>
<myopf:itemref idref="section-0007_html"/>
<myopf:itemref idref="section-0008_html"/>
<myopf:itemref idref="section-0009_html"/>
<myopf:itemref idref="section-0010_html"/>
<myopf:itemref idref="section-0011_html"/>
<myopf:itemref idref="section-0012_html"/>
<myopf:itemref idref="section-0013_html"/>
<myopf:itemref idref="section-0014_html"/>
<myopf:itemref idref="section-0015_html"/>
<myopf:itemref idref="section-0016_html"/>
<myopf:itemref idref="section-0017_html"/>
<myopf:itemref idref="section-0018_html"/>
<myopf:itemref idref="section-0019_html"/>
<myopf:itemref idref="section-0020_html"/>
<myopf:itemref idref="section-0021_html"/>
<myopf:itemref idref="section-0022_html"/>
<myopf:itemref idref="section-0023_html"/>
</myopf:spine>
</myopf:package>