feat(epub): implement tolerant EPUB reader and refactor reading functions
Build and Push Reader API Image / docker (push) Successful in 1m12s

- Introduced a new _TolerantEpubReader class to handle missing files in the EPUB manifest gracefully.
- Added read_epub_safe function to replace direct calls to epublib.read_epub, enhancing error handling.
- Updated build_merged_html_from_epub and build_chapters_from_epub functions to utilize the new reading method.
This commit is contained in:
2026-06-03 11:20:52 +07:00
parent bddd592146
commit 51b200caf4
+22 -2
View File
@@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import html as html_lib import html as html_lib
import posixpath as zip_path
import re import re
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -9,6 +10,25 @@ import html2text
from ebooklib import ITEM_DOCUMENT from ebooklib import ITEM_DOCUMENT
from ebooklib import epub as epublib from ebooklib import epub as epublib
class _TolerantEpubReader(epublib.EpubReader):
    """EPUB reader that skips manifest entries pointing to files missing from the archive.

    A read for a name that is not present yields empty bytes instead of
    raising ``KeyError``.
    """

    def read_file(self, name):
        """Return the bytes stored under ``name``, or ``b""`` when there is no such entry.

        The name is normalised with POSIX path rules before the lookup.
        """
        # NOTE(review): the fallback applies to every read, not only manifest
        # items -- confirm that a missing required package file still surfaces
        # as a clear error further down the loading path.
        member = zip_path.normpath(name)
        try:
            payload = self.zf.read(member)
        except KeyError:
            # No entry under this name: fall back to empty content.
            payload = b""
        return payload
def read_epub_safe(epub_path: Path, *, ignore_ncx: bool = False) -> epublib.EpubBook:
    """Load an EPUB through the tolerant reader.

    Args:
        epub_path: Location of the EPUB; handed to the reader as a string.
        ignore_ncx: Forwarded to the reader as its ``ignore_ncx`` option.

    Returns:
        The book object produced by the reader's ``load()`` call.
    """
    tolerant = _TolerantEpubReader(str(epub_path), {"ignore_ncx": ignore_ncx})
    loaded = tolerant.load()
    # process() runs after load(); the book returned is the one load() produced.
    tolerant.process()
    return loaded
_CHAPTER_MARKER_TEXT_RE = re.compile( _CHAPTER_MARKER_TEXT_RE = re.compile(
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+", r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
re.IGNORECASE, re.IGNORECASE,
@@ -29,7 +49,7 @@ def _html_to_text(html_content: str) -> str:
def build_merged_html_from_epub(epub_path: Path) -> str: def build_merged_html_from_epub(epub_path: Path) -> str:
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False}) book = read_epub_safe(epub_path)
parts: list[str] = [] parts: list[str] = []
for item in book.get_items_of_type(ITEM_DOCUMENT): for item in book.get_items_of_type(ITEM_DOCUMENT):
content = item.get_content().decode("utf-8", errors="replace") content = item.get_content().decode("utf-8", errors="replace")
@@ -39,7 +59,7 @@ def build_merged_html_from_epub(epub_path: Path) -> str:
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]: def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False}) book = read_epub_safe(epub_path)
out: list[dict[str, Any]] = [] out: list[dict[str, Any]] = []
idx = 1 idx = 1
for item in book.get_items_of_type(ITEM_DOCUMENT): for item in book.get_items_of_type(ITEM_DOCUMENT):