39 lines
1.0 KiB
Python
39 lines
1.0 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import html2text
|
|
from ebooklib import ITEM_DOCUMENT
|
|
from ebooklib import epub as epublib
|
|
|
|
|
|
def _html_to_text(html_content: str) -> str:
    """Convert an HTML string to text with ``html2text`` and strip the result.

    A fresh ``HTML2Text`` converter is created for each call and configured
    with links and images ignored, emphasis markup retained, and
    ``body_width`` set to 0.

    Args:
        html_content: The HTML markup to convert.

    Returns:
        The converter's output with leading and trailing whitespace removed.
    """
    converter = html2text.HTML2Text()
    # These options are independent of one another; order does not matter.
    converter.body_width = 0
    converter.ignore_emphasis = False
    converter.ignore_images = True
    converter.ignore_links = True

    rendered = converter.handle(html_content)
    return rendered.strip()
|
|
|
|
|
|
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
    """Read an EPUB file and return one record per non-empty document item.

    Every item of type ``ITEM_DOCUMENT`` is decoded as UTF-8 (undecodable
    bytes are replaced) and converted to text with ``_html_to_text``. Items
    whose converted text is empty are skipped and do not consume a number.

    Args:
        epub_path: Location of the EPUB file to read.

    Returns:
        A list of dicts, in the order the items are yielded by the book,
        each with the keys:

        * ``"number"`` - 1-based position among the kept items.
        * ``"title"`` - the item's ``get_name()`` value, or
          ``"Chapter <number>"`` when that value is falsy.
        * ``"content"`` - the decoded markup of the item.
        * ``"txt"`` - the text produced by ``_html_to_text``.
    """
    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})

    chapters: list[dict[str, Any]] = []
    for document in book.get_items_of_type(ITEM_DOCUMENT):
        raw_markup = document.get_content().decode("utf-8", errors="replace")
        plain_text = _html_to_text(raw_markup)
        if plain_text:
            # Numbering follows the kept items only, so skipped (empty)
            # documents never leave gaps in the sequence.
            number = len(chapters) + 1
            chapters.append(
                {
                    "number": number,
                    "title": document.get_name() or f"Chapter {number}",
                    "content": raw_markup,
                    "txt": plain_text,
                }
            )
    return chapters
|