From 6e0c51ed1d88417b78cb95c01ca17102806726bb Mon Sep 17 00:00:00 2001
From: Gauthier Roebroeck
Date: Wed, 15 Sep 2021 15:12:00 +0800
Subject: [PATCH] feat: index ngrams to allow partial search

---
Review notes (text between the "---" line and the first "diff --git" line
is commentary only; it is not part of the commit message or of the changes):

* Indexing and searching now use different analyzers. LuceneConfiguration
  exposes two beans, indexAnalyzer() (MultiLingualNGramAnalyzer) and
  searchAnalyzer() (MultiLingualAnalyzer); LuceneHelper passes the former to
  IndexWriterConfig and the latter to MultiFieldQueryParser.
* KomgaProperties.Lucene.IndexAnalyzer adds minGram (default 3), maxGram
  (default 10) and preserveOriginal (default true). minGram and maxGram are
  each annotated @get:Positive; this patch adds no check that
  minGram <= maxGram.
* MultiLingualNGramAnalyzer chains StandardTokenizer -> CJKWidthFilter ->
  LowerCaseFilter -> CJKBigramFilter -> NGramTokenFilter ->
  ASCIIFoldingFilter, i.e. n-grams are cut before ASCII folding is applied.
  NOTE(review): confirm this order is intended; folding first would n-gram
  the folded form instead.
* NOTE(review): documents indexed before this change were presumably
  analyzed without n-grams; confirm whether a reindex is required.
* The getTokens() test helper moves from a private member of
  MultilingualAnalyzerTest to a top-level extension in the test file Utils.kt.

 .../configuration/KomgaProperties.kt          | 12 +++
 .../search/LuceneConfiguration.kt             |  8 +-
 .../infrastructure/search/LuceneHelper.kt     |  7 +-
 .../search/MultiLingualAnalyzer.kt            |  2 +-
 .../search/MultiLingualNGramAnalyzer.kt       | 23 ++++++
 .../search/MultilingualAnalyzerTest.kt        | 16 ----
 .../search/MultilingualNGramAnalyzerTest.kt   | 81 +++++++++++++++++++
 .../komga/infrastructure/search/Utils.kt      | 18 +++++
 8 files changed, 146 insertions(+), 21 deletions(-)
 create mode 100644 komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualNGramAnalyzer.kt
 create mode 100644 komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualNGramAnalyzerTest.kt
 create mode 100644 komga/src/test/kotlin/org/gotson/komga/infrastructure/search/Utils.kt

diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/configuration/KomgaProperties.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/configuration/KomgaProperties.kt
index 342a834e3..ed3c64165 100644
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/configuration/KomgaProperties.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/configuration/KomgaProperties.kt
@@ -55,5 +55,17 @@ class KomgaProperties {
   class Lucene {
     @get:NotBlank
     var dataDirectory: String = ""
+
+    var indexAnalyzer = IndexAnalyzer()
+
+    class IndexAnalyzer {
+      @get:Positive
+      var minGram: Int = 3
+
+      @get:Positive
+      var maxGram: Int = 10
+
+      var preserveOriginal: Boolean = true
+    }
   }
 }
diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneConfiguration.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneConfiguration.kt
index 5ea382814..50b4bdbf7 100644
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneConfiguration.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneConfiguration.kt
@@ -16,7 +16,13 @@ class LuceneConfiguration(
 ) {
 
   @Bean
-  fun analyzer() =
+  fun indexAnalyzer() =
+    with(komgaProperties.lucene.indexAnalyzer) {
+      MultiLingualNGramAnalyzer(minGram, maxGram, preserveOriginal).apply { version = Version.LUCENE_8_9_0 }
+    }
+
+  @Bean
+  fun searchAnalyzer() =
     MultiLingualAnalyzer().apply { version = Version.LUCENE_8_9_0 }
 
   @Bean
diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneHelper.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneHelper.kt
index f3714b01f..188c3a966 100644
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneHelper.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/LuceneHelper.kt
@@ -21,10 +21,11 @@ private val logger = KotlinLogging.logger {}
 @Component
 class LuceneHelper(
   private val directory: Directory,
-  private val analyzer: Analyzer,
+  private val indexAnalyzer: Analyzer,
+  private val searchAnalyzer: Analyzer,
 ) {
 
-  fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(analyzer))
+  fun getIndexWriter() = IndexWriter(directory, IndexWriterConfig(indexAnalyzer))
 
   fun getIndexReader(): DirectoryReader = DirectoryReader.open(directory)
 
@@ -33,7 +34,7 @@ class LuceneHelper(
   fun searchEntitiesIds(searchTerm: String?, entity: LuceneEntity): List<String>? {
     return if (!searchTerm.isNullOrBlank()) {
       try {
-        val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, analyzer).apply {
+        val fieldsQuery = MultiFieldQueryParser(entity.defaultFields, searchAnalyzer).apply {
           defaultOperator = QueryParser.Operator.AND
         }.parse(searchTerm)
 
diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualAnalyzer.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualAnalyzer.kt
index b13607b78..7dc43c5e4 100644
--- a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualAnalyzer.kt
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualAnalyzer.kt
@@ -9,7 +9,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
 import org.apache.lucene.analysis.standard.StandardTokenizer
 
-class MultiLingualAnalyzer : Analyzer() {
+open class MultiLingualAnalyzer : Analyzer() {
   override fun createComponents(fieldName: String): TokenStreamComponents {
     val source: Tokenizer = StandardTokenizer()
     // run the widthfilter first before bigramming, it sometimes combines characters.
diff --git a/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualNGramAnalyzer.kt b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualNGramAnalyzer.kt
new file mode 100644
index 000000000..080b2f55e
--- /dev/null
+++ b/komga/src/main/kotlin/org/gotson/komga/infrastructure/search/MultiLingualNGramAnalyzer.kt
@@ -0,0 +1,23 @@
+package org.gotson.komga.infrastructure.search
+
+import org.apache.lucene.analysis.LowerCaseFilter
+import org.apache.lucene.analysis.TokenStream
+import org.apache.lucene.analysis.Tokenizer
+import org.apache.lucene.analysis.cjk.CJKBigramFilter
+import org.apache.lucene.analysis.cjk.CJKWidthFilter
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
+import org.apache.lucene.analysis.ngram.NGramTokenFilter
+import org.apache.lucene.analysis.standard.StandardTokenizer
+
+class MultiLingualNGramAnalyzer(val minGram: Int, val maxGram: Int, val preserveOriginal: Boolean) : MultiLingualAnalyzer() {
+  override fun createComponents(fieldName: String): TokenStreamComponents {
+    val source: Tokenizer = StandardTokenizer()
+    // run the widthfilter first before bigramming, it sometimes combines characters.
+    var filter: TokenStream = CJKWidthFilter(source)
+    filter = LowerCaseFilter(filter)
+    filter = CJKBigramFilter(filter)
+    filter = NGramTokenFilter(filter, minGram, maxGram, preserveOriginal)
+    filter = ASCIIFoldingFilter(filter)
+    return TokenStreamComponents(source, filter)
+  }
+}
diff --git a/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualAnalyzerTest.kt b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualAnalyzerTest.kt
index 27df215df..0c5dedde7 100644
--- a/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualAnalyzerTest.kt
+++ b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualAnalyzerTest.kt
@@ -1,6 +1,5 @@
 package org.gotson.komga.infrastructure.search
 
-import org.apache.lucene.analysis.Analyzer
 import org.assertj.core.api.Assertions.assertThat
 import org.junit.jupiter.api.Test
 
@@ -8,21 +7,6 @@ class MultilingualAnalyzerTest {
 
   private val analyzer = MultiLingualAnalyzer()
 
-  private fun Analyzer.getTokens(text: String): List<String> {
-    val tokenStream = tokenStream("text", text)
-
-    val tokens = mutableListOf<String>()
-    tokenStream.use { ts ->
-      ts.reset()
-      while (ts.incrementToken()) {
-        ts.reflectWith { _, key, value -> if (key == "term") tokens += value.toString() }
-      }
-      ts.end()
-    }
-
-    return tokens
-  }
-
   @Test
   fun `english text`() {
     // given
diff --git a/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualNGramAnalyzerTest.kt b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualNGramAnalyzerTest.kt
new file mode 100644
index 000000000..44421aa75
--- /dev/null
+++ b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/MultilingualNGramAnalyzerTest.kt
@@ -0,0 +1,81 @@
+package org.gotson.komga.infrastructure.search
+
+import org.assertj.core.api.Assertions.assertThat
+import org.junit.jupiter.api.Test
+
+class MultilingualNGramAnalyzerTest {
+
+  @Test
+  fun `single letter`() {
+    // given
+    val text = "J"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, false).getTokens(text)
+    val tokensPreserveOriginal = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokensPreserveOriginal).containsExactly("j")
+    assertThat(tokens).isEmpty()
+  }
+
+  @Test
+  fun `chinese mixed`() {
+    // given
+    val text = "[不道德公會][河添太一 ][東立]Vol.04-搬运"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokens).containsExactly("不道", "道德", "德公", "公會", "河添", "添太", "太一", "東立", "vol", "04", "搬运")
+  }
+
+  @Test
+  fun `chinese only`() {
+    // given
+    val text = "不道德公會河添太一東立搬运"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokens).containsExactly("不道", "道德", "德公", "公會", "會河", "河添", "添太", "太一", "一東", "東立", "立搬", "搬运")
+  }
+
+  @Test
+  fun `hiragana only`() {
+    // given
+    val text = "探偵はもう、死んでいる。"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokens).containsExactly("探偵", "偵は", "はも", "もう", "死ん", "んで", "でい", "いる")
+  }
+
+  @Test
+  fun `katakana only`() {
+    // given
+    val text = "ワンパンマン"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokens).containsExactly("ワン", "ンパ", "パン", "ンマ", "マン")
+  }
+
+  @Test
+  fun `korean only`() {
+    // given
+    val text = "고교생을 환불해 주세요"
+
+    // when
+    val tokens = MultiLingualNGramAnalyzer(3, 8, true).getTokens(text)
+
+    // then
+    assertThat(tokens).containsExactly("고교", "교생", "생을", "환불", "불해", "주세", "세요")
+  }
+}
diff --git a/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/Utils.kt b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/Utils.kt
new file mode 100644
index 000000000..3d282d1bd
--- /dev/null
+++ b/komga/src/test/kotlin/org/gotson/komga/infrastructure/search/Utils.kt
@@ -0,0 +1,18 @@
+package org.gotson.komga.infrastructure.search
+
+import org.apache.lucene.analysis.Analyzer
+
+fun Analyzer.getTokens(text: String): List<String> {
+  val tokenStream = tokenStream("text", text)
+
+  val tokens = mutableListOf<String>()
+  tokenStream.use { ts ->
+    ts.reset()
+    while (ts.incrementToken()) {
+      ts.reflectWith { _, key, value -> if (key == "term") tokens += value.toString() }
+    }
+    ts.end()
+  }
+
+  return tokens
+}