feat(api): make pre-paginated epub containing only images compatible with divina profile

This commit is contained in:
Gauthier Roebroeck 2023-12-14 16:05:36 +08:00
parent 5b75345be7
commit c2a4d1713d
10 changed files with 96 additions and 14 deletions

View file

@ -0,0 +1,2 @@
-- Adds a per-media flag recording whether an epub can also be served through the divina profile.
-- NOT NULL with DEFAULT 0 means rows that already exist get 0 (false) when the column is added.
alter table MEDIA
add column EPUB_DIVINA_COMPATIBLE boolean NOT NULL DEFAULT 0;

View file

@ -11,6 +11,7 @@ data class Media(
val comment: String? = null,
val extension: MediaExtension? = null,
val bookId: String = "",
val epubDivinaCompatible: Boolean = false,
override val createdDate: LocalDateTime = LocalDateTime.now(),
override val lastModifiedDate: LocalDateTime = createdDate,
) : Auditable {

View file

@ -72,7 +72,7 @@ class BookAnalyzer(
when (mediaType.profile) {
MediaProfile.DIVINA -> analyzeDivina(book, mediaType, analyzeDimensions)
MediaProfile.PDF -> analyzePdf(book, analyzeDimensions)
MediaProfile.EPUB -> analyzeEpub(book)
MediaProfile.EPUB -> analyzeEpub(book, analyzeDimensions)
}.copy(mediaType = mediaType.type)
} catch (ade: AccessDeniedException) {
logger.error(ade) { "Error while analyzing book: $book" }
@ -123,12 +123,14 @@ class BookAnalyzer(
return Media(status = Media.Status.READY, pages = pages, pageCount = pages.size, files = files, comment = entriesErrorSummary)
}
private fun analyzeEpub(book: Book): Media {
val manifest = epubExtractor.getManifest(book.path)
private fun analyzeEpub(book: Book, analyzeDimensions: Boolean): Media {
val manifest = epubExtractor.getManifest(book.path, analyzeDimensions)
return Media(
status = Media.Status.READY,
pages = manifest.divinaPages,
files = manifest.resources,
pageCount = manifest.pageCount,
epubDivinaCompatible = manifest.divinaPages.isNotEmpty(),
extension = MediaExtensionEpub(
toc = manifest.toc,
landmarks = manifest.landmarks,
@ -203,7 +205,9 @@ class BookAnalyzer(
return when (book.media.profile) {
MediaProfile.DIVINA -> divinaExtractors.getValue(book.media.mediaType!!).getEntryStream(book.book.path, book.media.pages[number - 1].fileName)
MediaProfile.PDF -> pdfExtractor.getPageContentAsImage(book.book.path, number).bytes
MediaProfile.EPUB -> throw MediaUnsupportedException("Epub profile does not support getting page content")
MediaProfile.EPUB ->
if (book.media.epubDivinaCompatible) epubExtractor.getEntryStream(book.book.path, book.media.pages[number - 1].fileName)
else throw MediaUnsupportedException("Epub profile does not support getting page content")
null -> throw MediaNotReadyException()
}
}

View file

@ -43,6 +43,9 @@ class KomgaProperties {
@Positive
var pageHashing: Int = 3
@Positive
var epubDivinaLetterCountThreshold: Int = 15
@Deprecated("Moved to server settings since 1.5.0")
var rememberMe = RememberMe()

View file

@ -447,6 +447,7 @@ class BookDtoDao(
mediaType = mediaType ?: "",
pagesCount = pageCount.toInt(),
comment = comment ?: "",
epubDivinaCompatible = epubDivinaCompatible,
)
private fun BookMetadataRecord.toDto(authors: List<AuthorDto>, tags: Set<String>, links: List<WebLinkDto>) =

View file

@ -49,6 +49,7 @@ class MediaDao(
m.COMMENT,
m.PAGE_COUNT,
m.EXTENSION_CLASS,
m.EPUB_DIVINA_COMPATIBLE,
*p.fields(),
)
@ -133,9 +134,10 @@ class MediaDao(
m.MEDIA_TYPE,
m.COMMENT,
m.PAGE_COUNT,
m.EPUB_DIVINA_COMPATIBLE,
m.EXTENSION_CLASS,
m.EXTENSION_VALUE_BLOB,
).values(null as String?, null, null, null, null, null, null),
).values(null as String?, null, null, null, null, null, null, null),
).also { step ->
chunk.forEach { media ->
step.bind(
@ -144,6 +146,7 @@ class MediaDao(
media.mediaType,
media.comment,
media.pageCount,
media.epubDivinaCompatible,
media.extension?.let { if (it is ProxyExtension) null else it::class.qualifiedName },
media.extension?.let { if (it is ProxyExtension) null else mapper.serializeJsonGz(it) },
)
@ -227,6 +230,7 @@ class MediaDao(
.set(m.MEDIA_TYPE, media.mediaType)
.set(m.COMMENT, media.comment)
.set(m.PAGE_COUNT, media.pageCount)
.set(m.EPUB_DIVINA_COMPATIBLE, media.epubDivinaCompatible)
.apply {
if (media.extension != null && media.extension !is ProxyExtension) {
set(m.EXTENSION_CLASS, media.extension::class.qualifiedName)
@ -277,6 +281,7 @@ class MediaDao(
extension = ProxyExtension.of(extensionClass),
comment = comment,
bookId = bookId,
epubDivinaCompatible = epubDivinaCompatible,
createdDate = createdDate.toCurrentTimeZone(),
lastModifiedDate = lastModifiedDate.toCurrentTimeZone(),
)

View file

@ -1,18 +1,32 @@
package org.gotson.komga.infrastructure.mediacontainer.epub
import mu.KotlinLogging
import org.apache.commons.compress.archivers.ArchiveEntry
import org.apache.commons.compress.archivers.zip.ZipFile
import org.gotson.komga.domain.model.BookPage
import org.gotson.komga.domain.model.EpubTocEntry
import org.gotson.komga.domain.model.MediaFile
import org.gotson.komga.domain.model.R2Locator
import org.gotson.komga.domain.model.TypedBytes
import org.gotson.komga.infrastructure.image.ImageAnalyzer
import org.gotson.komga.infrastructure.mediacontainer.ContentDetector
import org.jsoup.Jsoup
import org.springframework.beans.factory.annotation.Value
import org.springframework.stereotype.Service
import java.nio.file.Path
import kotlin.io.path.Path
import kotlin.io.path.invariantSeparatorsPathString
import kotlin.math.ceil
import kotlin.math.roundToInt
private val logger = KotlinLogging.logger {}
@Service
class EpubExtractor {
class EpubExtractor(
private val contentDetector: ContentDetector,
private val imageAnalyzer: ImageAnalyzer,
@Value("#{@komgaProperties.epubDivinaLetterCountThreshold}") private val letterCountThreshold: Int,
) {
/**
* Retrieves a specific entry by name from the zip archive
@ -44,18 +58,20 @@ class EpubExtractor {
} else null
}
/**
 * Builds the [EpubManifest] describing the epub located at [path].
 *
 * The resources, the fixed-layout flag and the page count are computed first
 * because they are reused as inputs: `computePositions` takes the resources and
 * the fixed-layout flag, and `getDivinaPages` takes the fixed-layout flag and
 * the page count.
 *
 * @param path location of the epub file
 * @param analyzeDimensions passed through to `getDivinaPages`
 * @return the manifest, including the divina pages list computed by `getDivinaPages`
 */
fun getManifest(path: Path): EpubManifest =
fun getManifest(path: Path, analyzeDimensions: Boolean): EpubManifest =
path.epub { epub ->
val resources = getResources(epub)
val isFixedLayout = isFixedLayout(epub)
val pageCount = computePageCount(epub)
EpubManifest(
resources = resources,
toc = getToc(epub),
landmarks = getLandmarks(epub),
pageList = getPageList(epub),
pageCount = computePageCount(epub),
pageCount = pageCount,
isFixedLayout = isFixedLayout,
positions = computePositions(resources, isFixedLayout),
divinaPages = getDivinaPages(epub, isFixedLayout, pageCount, analyzeDimensions),
)
}
@ -84,6 +100,52 @@ class EpubExtractor {
}
}
/**
 * Tries to interpret a fixed-layout epub as a plain sequence of images (divina).
 *
 * The book qualifies only when all of the following hold:
 * - it is fixed layout,
 * - no spine page contains more body text than [letterCountThreshold] characters,
 * - the number of resolved spine pages equals [pageCount],
 * - the spine pages reference exactly [pageCount] images in total (`<img src>` or
 *   `<svg><image xlink:href>`),
 * - every referenced image is declared in the manifest with an image media type
 *   and is present in the archive.
 *
 * @param epub the opened epub package
 * @param isFixedLayout whether the epub is fixed layout; when false nothing is analyzed
 * @param pageCount the expected number of pages, and thus of images
 * @param analyzeDimensions when true, each image is read to determine its dimension
 * @return one [BookPage] per image in spine order, or an empty list when the book does
 *   not qualify or an error occurs during analysis
 */
private fun getDivinaPages(epub: EpubPackage, isFixedLayout: Boolean, pageCount: Int, analyzeDimensions: Boolean): List<BookPage> {
  if (!isFixedLayout) return emptyList()

  try {
    val pagesWithImages = epub.opfDoc.select("spine > itemref")
      .map { it.attr("idref") }
      .mapNotNull { idref -> epub.manifest[idref]?.href?.let { normalizeHref(epub.opfDir, it) } }
      .map { pagePath ->
        // getEntry returns null when the manifest points to a file that is not in the archive:
        // treat that as "not compatible" explicitly instead of relying on a downstream exception
        val pageEntry = epub.zip.getEntry(pagePath)
        if (pageEntry == null) {
          logger.debug { "Spine page not found in archive, epub is not divina compatible: $pagePath" }
          return emptyList()
        }
        val doc = epub.zip.getInputStream(pageEntry)?.use { Jsoup.parse(it, null, "") } ?: return emptyList()

        // if a page has text over the threshold then the book is not divina compatible
        if (doc.body().text().length > letterCountThreshold) return emptyList()

        val img = doc.getElementsByTag("img")
          .map { it.attr("src") } // get the src, which can be a relative path

        val svg = doc.select("svg > image[xlink:href]")
          .map { it.attr("xlink:href") } // get the source, which can be a relative path

        // resolve it against the page folder
        (img + svg).map { (Path(pagePath).parent ?: Path("")).resolve(it).normalize().invariantSeparatorsPathString }
      }

    if (pagesWithImages.size != pageCount) return emptyList()
    val imagesPath = pagesWithImages.flatten()
    if (imagesPath.size != pageCount) return emptyList()

    val divinaPages = imagesPath.mapNotNull { imagePath ->
      val mediaType = epub.manifest.values.firstOrNull { normalizeHref(epub.opfDir, it.href) == imagePath }?.mediaType ?: return@mapNotNull null
      // cheap check first: no need to touch the archive for non-image resources
      if (!contentDetector.isImage(mediaType)) return@mapNotNull null
      // a missing image is dropped here, which makes the size check below reject the book
      val zipEntry = epub.zip.getEntry(imagePath) ?: return@mapNotNull null

      val dimension =
        if (analyzeDimensions) epub.zip.getInputStream(zipEntry).use { imageAnalyzer.getDimension(it) }
        else null
      val fileSize = if (zipEntry.size == ArchiveEntry.SIZE_UNKNOWN) null else zipEntry.size
      BookPage(fileName = imagePath, mediaType = mediaType, dimension = dimension, fileSize = fileSize)
    }

    // every page must have produced a usable image, otherwise the book is not compatible
    if (divinaPages.size != pageCount) return emptyList()
    return divinaPages
  } catch (e: Exception) {
    // best effort: any unexpected failure simply means the divina profile is not offered
    logger.warn(e) { "Error while getting divina pages" }
    return emptyList()
  }
}
private fun computePageCount(epub: EpubPackage): Int {
val spine = epub.opfDoc.select("spine > itemref")
.map { it.attr("idref") }

View file

@ -1,5 +1,6 @@
package org.gotson.komga.infrastructure.mediacontainer.epub
import org.gotson.komga.domain.model.BookPage
import org.gotson.komga.domain.model.EpubTocEntry
import org.gotson.komga.domain.model.MediaFile
import org.gotson.komga.domain.model.R2Locator
@ -12,4 +13,5 @@ data class EpubManifest(
val pageCount: Int,
val isFixedLayout: Boolean,
val positions: List<R2Locator>,
val divinaPages: List<BookPage>,
)

View file

@ -227,14 +227,15 @@ class WebPubGenerator(
/**
 * Builds the WebPub links for this book, in order: the self manifest link, an
 * optional divina manifest link, and the acquisition link pointing to the book file.
 *
 * The divina manifest link is added when the media profile is PDF, or when it is
 * EPUB and `media.epubDivinaCompatible` is true.
 *
 * @param uriBuilder base URI builder; each link is built from `cloneBuilder()` of it
 * @return the list of links for this book
 */
private fun BookDto.toWPLinkDtos(uriBuilder: UriComponentsBuilder): List<WPLinkDto> {
val komgaMediaType = KomgaMediaType.fromMediaType(media.mediaType)
return listOfNotNull(
return buildList {
// most appropriate manifest
WPLinkDto(rel = OpdsLinkRel.SELF, href = uriBuilder.cloneBuilder().path("books/$id/manifest").toUriString(), type = mediaProfileToWebPub(komgaMediaType?.profile)),
// PDF is also available under the Divina profile
if (komgaMediaType?.profile == MediaProfile.PDF) WPLinkDto(href = uriBuilder.cloneBuilder().path("books/$id/manifest/divina").toUriString(), type = MEDIATYPE_DIVINA_JSON_VALUE) else null,
add(WPLinkDto(rel = OpdsLinkRel.SELF, href = uriBuilder.cloneBuilder().path("books/$id/manifest").toUriString(), type = mediaProfileToWebPub(komgaMediaType?.profile)))
// PDF is also available under the Divina profile / EPUB that are Divina compatible
if (komgaMediaType?.profile == MediaProfile.PDF || (komgaMediaType?.profile == MediaProfile.EPUB && media.epubDivinaCompatible))
add(WPLinkDto(href = uriBuilder.cloneBuilder().path("books/$id/manifest/divina").toUriString(), type = MEDIATYPE_DIVINA_JSON_VALUE))
// main acquisition link
WPLinkDto(rel = OpdsLinkRel.ACQUISITION, type = komgaMediaType?.exportType ?: media.mediaType, href = uriBuilder.cloneBuilder().path("books/$id/file").toUriString()),
)
add(WPLinkDto(rel = OpdsLinkRel.ACQUISITION, type = komgaMediaType?.exportType ?: media.mediaType, href = uriBuilder.cloneBuilder().path("books/$id/file").toUriString()))
}
}
private fun mediaProfileToWebPub(profile: MediaProfile?): String = when (profile) {

View file

@ -39,6 +39,7 @@ data class MediaDto(
val mediaType: String,
val pagesCount: Int,
val comment: String,
val epubDivinaCompatible: Boolean,
) {
val mediaProfile: String by lazy { MediaType.fromMediaType(mediaType)?.profile?.name ?: "" }
}