perf(komga): better handling of Lucene index when reading and updating

This commit is contained in:
Gauthier Roebroeck 2023-10-12 15:54:09 +08:00
parent e7fcf23e56
commit 487b43967d
7 changed files with 124 additions and 46 deletions

View file

@ -110,6 +110,9 @@ class KomgaProperties {
var indexAnalyzer = IndexAnalyzer()
@DurationUnit(ChronoUnit.SECONDS)
var commitDelay: Duration = Duration.ofSeconds(2)
class IndexAnalyzer {
@get:Positive
var minGram: Int = 3

View file

@ -15,45 +15,48 @@ import org.apache.lucene.queryparser.classic.ParseException
import org.apache.lucene.queryparser.classic.QueryParser
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.BooleanQuery
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.SearcherFactory
import org.apache.lucene.search.SearcherManager
import org.apache.lucene.search.TermQuery
import org.apache.lucene.store.Directory
import org.springframework.beans.factory.annotation.Value
import org.springframework.scheduling.TaskScheduler
import org.springframework.stereotype.Component
import java.time.Duration
import java.time.ZonedDateTime
import java.util.concurrent.ScheduledFuture
private val logger = KotlinLogging.logger {}
@Component
class LuceneHelper(
private val directory: Directory,
private val indexAnalyzer: Analyzer,
private val searchAnalyzer: Analyzer,
private val taskScheduler: TaskScheduler,
indexAnalyzer: Analyzer,
@Value("#{@komgaProperties.lucene.commitDelay}")
private val commitDelay: Duration,
) {
private fun getIndexWriterConfig() = IndexWriterConfig(indexAnalyzer)
fun getIndexWriter() = IndexWriter(directory, getIndexWriterConfig())
fun getIndexReader(): DirectoryReader = DirectoryReader.open(directory)
private val indexWriterConfig = IndexWriterConfig(indexAnalyzer)
private val indexWriter: IndexWriter = IndexWriter(directory, indexWriterConfig)
private val searcherManager = SearcherManager(indexWriter, SearcherFactory())
fun indexExists(): Boolean = DirectoryReader.indexExists(directory)
fun setIndexVersion(version: Int) {
getIndexWriter().use { indexWriter ->
val doc = Document().apply {
add(StringField("index_version", version.toString(), Field.Store.YES))
add(StringField("type", "index_version", Field.Store.NO))
}
indexWriter.updateDocument(Term("type", "index_version"), doc)
val doc = Document().apply {
add(StringField("index_version", version.toString(), Field.Store.YES))
add(StringField("type", "index_version", Field.Store.NO))
}
updateDocument(Term("type", "index_version"), doc)
logger.info { "Lucene index version: ${getIndexVersion()}" }
}
fun getIndexVersion(): Int =
getIndexReader().use { index ->
val searcher = IndexSearcher(index)
val topDocs = searcher.search(TermQuery(Term("type", "index_version")), 1)
topDocs.scoreDocs.map { searcher.storedFields().document(it.doc)["index_version"] }.firstOrNull()?.toIntOrNull() ?: 1
}
fun getIndexVersion(): Int {
val searcher = searcherManager.acquire()
val topDocs = searcher.search(TermQuery(Term("type", "index_version")), 1)
return topDocs.scoreDocs.map { searcher.storedFields().document(it.doc)["index_version"] }.firstOrNull()?.toIntOrNull() ?: 1
}
fun searchEntitiesIds(searchTerm: String?, entity: LuceneEntity): List<String>? {
return if (!searchTerm.isNullOrBlank()) {
@ -69,11 +72,9 @@ class LuceneHelper(
.add(typeQuery, BooleanClause.Occur.MUST)
.build()
getIndexReader().use { index ->
val searcher = IndexSearcher(index)
val topDocs = searcher.search(booleanQuery, index.numDocs())
topDocs.scoreDocs.map { searcher.storedFields().document(it.doc)[entity.id] }
}
val searcher = searcherManager.acquire()
val topDocs = searcher.search(booleanQuery, Int.MAX_VALUE)
topDocs.scoreDocs.map { searcher.storedFields().document(it.doc)[entity.id] }
} catch (e: ParseException) {
emptyList()
} catch (e: Exception) {
@ -84,7 +85,40 @@ class LuceneHelper(
}
fun upgradeIndex() {
IndexUpgrader(directory, getIndexWriterConfig(), true).upgrade()
IndexUpgrader(directory, indexWriterConfig, true).upgrade()
logger.info { "Lucene index upgraded" }
}
fun addDocument(doc: Document) {
indexWriter.addDocument(doc)
commitAndMaybeRefresh()
}
fun addDocuments(docs: Iterable<Document>) {
indexWriter.addDocuments(docs)
commitAndMaybeRefresh()
}
fun updateDocument(term: Term, doc: Document) {
indexWriter.updateDocument(term, doc)
commitAndMaybeRefresh()
}
fun deleteDocuments(term: Term) {
indexWriter.deleteDocuments(term)
commitAndMaybeRefresh()
}
@Volatile
private var commitFuture: ScheduledFuture<*>? = null
private val commitRunnable = Runnable {
indexWriter.commit()
searcherManager.maybeRefresh()
}
private fun commitAndMaybeRefresh() {
if (commitFuture == null || commitFuture!!.isDone)
commitFuture = taskScheduler.schedule(commitRunnable, ZonedDateTime.now().plus(commitDelay).toInstant())
}
}

View file

@ -66,19 +66,17 @@ class SearchIndexLifecycle(
val pages = ceil(count.toDouble() / batchSize).toInt()
logger.info { "Number of entities: $count" }
luceneHelper.getIndexWriter().use { indexWriter ->
measureTime {
indexWriter.deleteDocuments(Term(LuceneEntity.TYPE, entity.type))
measureTime {
luceneHelper.deleteDocuments(Term(LuceneEntity.TYPE, entity.type))
(0 until pages).forEach { page ->
logger.info { "Processing page ${page + 1} of $pages ($batchSize elements)" }
val entityDocs = provider(PageRequest.of(page, batchSize)).content
.mapNotNull { toDoc(it) }
indexWriter.addDocuments(entityDocs)
}
}.also { duration ->
logger.info { "Wrote ${entity.name} index in $duration" }
(0 until pages).forEach { page ->
logger.info { "Processing page ${page + 1} of $pages ($batchSize elements)" }
val entityDocs = provider(PageRequest.of(page, batchSize)).content
.mapNotNull { toDoc(it) }
luceneHelper.addDocuments(entityDocs)
}
}.also { duration ->
logger.info { "Wrote ${entity.name} index in $duration" }
}
}
@ -111,20 +109,14 @@ class SearchIndexLifecycle(
} else this.toDocument()
private fun addEntity(doc: Document) {
luceneHelper.getIndexWriter().use { indexWriter ->
indexWriter.addDocument(doc)
}
luceneHelper.addDocument(doc)
}
private fun updateEntity(entity: LuceneEntity, entityId: String, newDoc: Document) {
luceneHelper.getIndexWriter().use { indexWriter ->
indexWriter.updateDocument(Term(entity.id, entityId), newDoc)
}
luceneHelper.updateDocument(Term(entity.id, entityId), newDoc)
}
private fun deleteEntity(entity: LuceneEntity, entityId: String) {
luceneHelper.getIndexWriter().use { indexWriter ->
indexWriter.deleteDocuments(Term(entity.id, entityId))
}
luceneHelper.deleteDocuments(Term(entity.id, entityId))
}
}

View file

@ -400,6 +400,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -426,6 +427,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -486,6 +488,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -518,6 +521,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -547,6 +551,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -576,6 +581,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val foundGeneric = bookDtoDao.findAll(
@ -613,6 +619,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -641,6 +648,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -665,6 +673,7 @@ class BookDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -692,6 +701,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -717,6 +727,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -744,6 +755,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -770,6 +782,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(
@ -809,6 +822,7 @@ class BookDtoDaoTest(
)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = bookDtoDao.findAll(

View file

@ -281,6 +281,7 @@ class SeriesDtoDaoTest(
seriesLifecycle.createSeries(makeSeries("Batman", library.id))
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -305,6 +306,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -329,6 +331,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -353,6 +356,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -377,6 +381,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -401,6 +406,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -431,6 +437,7 @@ class SeriesDtoDaoTest(
seriesMetadataLifecycle.aggregateMetadata(series)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val foundByBookTag = seriesDtoDao.findAll(
@ -498,6 +505,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -522,6 +530,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -554,6 +563,7 @@ class SeriesDtoDaoTest(
}
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -587,6 +597,7 @@ class SeriesDtoDaoTest(
seriesMetadataLifecycle.aggregateMetadata(series)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val foundGeneric = seriesDtoDao.findAll(
@ -631,6 +642,7 @@ class SeriesDtoDaoTest(
seriesMetadataLifecycle.aggregateMetadata(series)
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(
@ -651,6 +663,7 @@ class SeriesDtoDaoTest(
seriesLifecycle.createSeries(makeSeries("Batman and Robin", library.id))
searchIndexLifecycle.rebuildIndex()
Thread.sleep(100) // index rebuild is done asynchronously, and need a slight delay to be updated
// when
val found = seriesDtoDao.findAll(

View file

@ -97,6 +97,7 @@ class SearchIndexLifecycleTest(
fun `given empty index when adding an entity then it is added to the index`() {
val series = seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
seriesLifecycle.addBooks(series, listOf(makeBook("book", seriesId = series.id, libraryId = library.id)))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
val found = luceneHelper.searchEntitiesIds("book", LuceneEntity.Book)
@ -109,6 +110,7 @@ class SearchIndexLifecycleTest(
val series = seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
val book = makeBook("book", seriesId = series.id, libraryId = library.id)
seriesLifecycle.addBooks(series, listOf(book))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("book", LuceneEntity.Book).let { found ->
assertThat(found).isNotNull
@ -119,6 +121,7 @@ class SearchIndexLifecycleTest(
bookMetadataRepository.update(it.copy(title = "updated"))
}
mockEventPublisher.publishEvent(DomainEvent.BookUpdated(book))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("book", LuceneEntity.Book).let { found ->
assertThat(found).isNotNull
@ -135,6 +138,7 @@ class SearchIndexLifecycleTest(
val series = seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
val book = makeBook("book", seriesId = series.id, libraryId = library.id)
seriesLifecycle.addBooks(series, listOf(book))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("book", LuceneEntity.Book).let { found ->
assertThat(found).isNotNull
@ -142,6 +146,7 @@ class SearchIndexLifecycleTest(
}
bookLifecycle.deleteOne(book)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("book", LuceneEntity.Book).let { found ->
assertThat(found).isNotNull
@ -155,6 +160,7 @@ class SearchIndexLifecycleTest(
@Test
fun `given empty index when adding an entity then it is added to the index`() {
seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
val found = luceneHelper.searchEntitiesIds("series", LuceneEntity.Series)
@ -165,6 +171,7 @@ class SearchIndexLifecycleTest(
@Test
fun `given an entity when updating then it is updated in the index`() {
val series = seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("series", LuceneEntity.Series).let { found ->
assertThat(found).isNotNull
@ -175,6 +182,7 @@ class SearchIndexLifecycleTest(
seriesMetadataRepository.update(it.copy(title = "updated", titleSort = "updated"))
}
mockEventPublisher.publishEvent(DomainEvent.SeriesUpdated(series))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("series", LuceneEntity.Series).let { found ->
assertThat(found).isNotNull
@ -189,6 +197,7 @@ class SearchIndexLifecycleTest(
@Test
fun `given an entity when deleting then it is removed from the index`() {
val series = seriesLifecycle.createSeries(makeSeries("Series", libraryId = library.id))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("series", LuceneEntity.Series).let { found ->
assertThat(found).isNotNull
@ -196,6 +205,7 @@ class SearchIndexLifecycleTest(
}
seriesLifecycle.deleteMany(listOf(series))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("series", LuceneEntity.Series).let { found ->
assertThat(found).isNotNull
@ -210,6 +220,7 @@ class SearchIndexLifecycleTest(
fun `given empty index when adding an entity then it is added to the index`() {
val collection = SeriesCollection("collection")
collectionLifecycle.addCollection(collection)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
val found = luceneHelper.searchEntitiesIds("collection", LuceneEntity.Collection)
@ -221,6 +232,7 @@ class SearchIndexLifecycleTest(
fun `given an entity when updating then it is updated in the index`() {
val collection = SeriesCollection("collection")
collectionLifecycle.addCollection(collection)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("collection", LuceneEntity.Collection).let { found ->
assertThat(found).isNotNull
@ -231,6 +243,7 @@ class SearchIndexLifecycleTest(
collectionRepository.update(it.copy(name = "updated"))
}
mockEventPublisher.publishEvent(DomainEvent.CollectionUpdated(collection))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("collection", LuceneEntity.Collection).let { found ->
assertThat(found).isNotNull
@ -246,6 +259,7 @@ class SearchIndexLifecycleTest(
fun `given an entity when deleting then it is removed from the index`() {
val collection = SeriesCollection("collection")
collectionLifecycle.addCollection(collection)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("collection", LuceneEntity.Collection).let { found ->
assertThat(found).isNotNull
@ -253,6 +267,7 @@ class SearchIndexLifecycleTest(
}
collectionLifecycle.deleteCollection(collection)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("collection", LuceneEntity.Collection).let { found ->
assertThat(found).isNotNull
@ -267,6 +282,7 @@ class SearchIndexLifecycleTest(
fun `given empty index when adding an entity then it is added to the index`() {
val readList = ReadList("readlist")
readListLifecycle.addReadList(readList)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
val found = luceneHelper.searchEntitiesIds("readlist", LuceneEntity.ReadList)
@ -278,6 +294,7 @@ class SearchIndexLifecycleTest(
fun `given an entity when updating then it is updated in the index`() {
val readList = ReadList("readlist")
readListLifecycle.addReadList(readList)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("readlist", LuceneEntity.ReadList).let { found ->
assertThat(found).isNotNull
@ -288,6 +305,7 @@ class SearchIndexLifecycleTest(
readListRepository.update(it.copy(name = "updated"))
}
mockEventPublisher.publishEvent(DomainEvent.ReadListUpdated(readList))
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("readlist", LuceneEntity.ReadList).let { found ->
assertThat(found).isNotNull
@ -303,6 +321,7 @@ class SearchIndexLifecycleTest(
fun `given an entity when deleting then it is removed from the index`() {
val readList = ReadList("readlist")
readListLifecycle.addReadList(readList)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("readlist", LuceneEntity.ReadList).let { found ->
assertThat(found).isNotNull
@ -310,6 +329,7 @@ class SearchIndexLifecycleTest(
}
readListLifecycle.deleteReadList(readList)
Thread.sleep(100) // search index update is asynchronous, and need a slight delay to be processed
luceneHelper.searchEntitiesIds("readlist", LuceneEntity.ReadList).let { found ->
assertThat(found).isNotNull

View file

@ -3,6 +3,8 @@ application.version: TESTING
komga:
database:
file: ":memory:"
lucene:
commit-delay: 0s
spring:
flyway: