feat(epub): implement tolerant EPUB reader and refactor reading functions

- Introduced a new _TolerantEpubReader class to handle missing files in the EPUB manifest gracefully.
- Added a read_epub_safe function to replace direct calls to epublib.read_epub, enhancing error handling.
- Updated the build_merged_html_from_epub and build_chapters_from_epub functions to use the new reading method.
2026-06-03 11:20:52 +07:00
parent bddd592146
commit 51b200caf4
1 changed files with 22 additions and 2 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import html as html_lib
 import posixpath as zip_path
 import re
 from pathlib import Path
 from typing import Any
@@ -9,6 +10,25 @@ import html2text
 from ebooklib import ITEM_DOCUMENT
 from ebooklib import epub as epublib
 class _TolerantEpubReader(epublib.EpubReader):
    """EPUB reader that ignores manifest entries whose file is not in the archive.

    A manifest item pointing at a missing file is read as empty bytes rather
    than letting the lookup error propagate.
    """

    def read_file(self, name):
        """Return the bytes stored under *name*, or ``b""`` when it is missing.

        The path is normalized with ``posixpath.normpath`` before the lookup;
        a ``KeyError`` raised by the archive read becomes an empty payload.
        """
        member = zip_path.normpath(name)
        try:
            payload = self.zf.read(member)
        except KeyError:
            # Entry is listed but not present in the archive: treat as empty.
            payload = b""
        return payload
 def read_epub_safe(epub_path: Path, *, ignore_ncx: bool = False) -> epublib.EpubBook:
    """Load the EPUB at *epub_path* using the tolerant reader.

    Args:
        epub_path: Location of the EPUB file; passed to the reader as a string.
        ignore_ncx: Value forwarded to the reader under the ``"ignore_ncx"``
            option key.

    Returns:
        The book object produced by the reader's ``load()`` call, returned
        after ``process()`` has been run on the reader.
    """
    tolerant_reader = _TolerantEpubReader(
        str(epub_path),
        {"ignore_ncx": ignore_ncx},
    )
    loaded_book = tolerant_reader.load()
    tolerant_reader.process()
    return loaded_book
 _CHAPTER_MARKER_TEXT_RE = re.compile(
    r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
    re.IGNORECASE,
@@ -29,7 +49,7 @@ def _html_to_text(html_content: str) -> str:
 def build_merged_html_from_epub(epub_path: Path) -> str:
-    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
+    book = read_epub_safe(epub_path)
    parts: list[str] = []
    for item in book.get_items_of_type(ITEM_DOCUMENT):
        content = item.get_content().decode("utf-8", errors="replace")
@@ -39,7 +59,7 @@ def build_merged_html_from_epub(epub_path: Path) -> str:
 def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
-    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
+    book = read_epub_safe(epub_path)
    out: list[dict[str, Any]] = []
    idx = 1
    for item in book.get_items_of_type(ITEM_DOCUMENT):