Files
reader-api/app/epub_parser.py
T
virtus 6b927f20e1
Build and Push Reader API Image / docker (push) Successful in 17s
refactor: replace Next.js server with Uvicorn for backend, remove backfill script
2026-05-01 19:20:30 +07:00

39 lines
1.0 KiB
Python

from __future__ import annotations
from pathlib import Path
from typing import Any
import html2text
from ebooklib import ITEM_DOCUMENT
from ebooklib import epub as epublib
def _html_to_text(html_content: str) -> str:
    """Render an HTML string as text using ``html2text``.

    The converter is configured to ignore links and images, keep emphasis
    markup, and use ``body_width = 0`` (html2text's setting for disabling
    line wrapping). Leading and trailing whitespace is stripped from the
    rendered result.

    Args:
        html_content: The HTML markup to convert.

    Returns:
        The converted text, stripped; an empty string when nothing remains.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0
    converter.ignore_emphasis = False
    converter.ignore_images = True
    converter.ignore_links = True
    rendered = converter.handle(html_content)
    return rendered.strip()
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
    """Read an EPUB file and return one record per non-empty document item.

    Every item of type ``ITEM_DOCUMENT`` is decoded as UTF-8 and converted
    to text with ``_html_to_text``. Items whose converted text is empty are
    skipped and do not consume a chapter number.

    Args:
        epub_path: Location of the EPUB file to read.

    Returns:
        A list of dicts, in the order the book yields its document items,
        each with the keys:
            "number":  1-based position among the kept items.
            "title":   the item's name, or "Chapter <number>" if it is empty.
            "content": the decoded markup of the item.
            "txt":     the text produced by ``_html_to_text``.
    """
    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
    chapters: list[dict[str, Any]] = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        # Undecodable bytes are substituted instead of raising.
        markup = document.get_content().decode("utf-8", errors="replace")
        text = _html_to_text(markup)
        if text:
            # Numbering follows the kept items only, so skipped (empty)
            # documents leave no gaps.
            number = len(chapters) + 1
            record: dict[str, Any] = {
                "number": number,
                "title": document.get_name() or f"Chapter {number}",
                "content": markup,
                "txt": text,
            }
            chapters.append(record)
    return chapters