feat(epub): implement tolerant EPUB reader and refactor reading functions
Build and Push Reader API Image / docker (push) Successful in 1m12s
Build and Push Reader API Image / docker (push) Successful in 1m12s
- Introduced a new _TolerantEpubReader class that gracefully handles manifest entries whose files are missing from the EPUB archive.
- Added a read_epub_safe function that replaces direct calls to epublib.read_epub, improving error handling.
- Updated the build_merged_html_from_epub and build_chapters_from_epub functions to use the new reading method.
This commit is contained in:
+22
-2
@@ -1,6 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import html as html_lib
|
import html as html_lib
|
||||||
|
import posixpath as zip_path
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -9,6 +10,25 @@ import html2text
|
|||||||
from ebooklib import ITEM_DOCUMENT
|
from ebooklib import ITEM_DOCUMENT
|
||||||
from ebooklib import epub as epublib
|
from ebooklib import epub as epublib
|
||||||
|
|
||||||
|
|
||||||
|
class _TolerantEpubReader(epublib.EpubReader):
    """EPUB reader that ignores manifest entries pointing to files absent from the archive.

    A missing member is returned as empty bytes instead of raising, so one
    dangling manifest entry does not abort loading the whole book. The
    container descriptor and the OPF package file are exempt: without them
    there is nothing to load, so their absence is still reported.
    """

    # Archive path of the container descriptor that locates the OPF file.
    _CONTAINER_PATH = "META-INF/container.xml"

    def read_file(self, name):
        """Return the bytes of archive member *name*, or ``b""`` if it is missing.

        Args:
            name: Path of the member inside the archive; it is normalised
                with ``posixpath.normpath`` before the lookup.

        Returns:
            The member's raw bytes, or ``b""`` when the archive has no such
            member and the member is not a structural file.

        Raises:
            KeyError: If the missing member is the container descriptor or
                the OPF package file.
        """
        name = zip_path.normpath(name)
        try:
            return self.zf.read(name)
        except KeyError:
            # NOTE(review): assumes the base reader stores the OPF path in
            # ``self.opf_file`` once the container is parsed - confirm
            # against the installed ebooklib version. ``getattr`` keeps this
            # safe if the attribute is absent.
            opf_file = getattr(self, "opf_file", "")
            is_opf = bool(opf_file) and name == zip_path.normpath(opf_file)
            if is_opf or name == self._CONTAINER_PATH:
                # Empty bytes here would only resurface later as an opaque
                # XML parse failure; let the original error propagate.
                raise
            return b""
|
||||||
|
|
||||||
|
|
||||||
|
def read_epub_safe(epub_path: Path, *, ignore_ncx: bool = False) -> epublib.EpubBook:
    """Read the EPUB at *epub_path* using ``_TolerantEpubReader``.

    Args:
        epub_path: Location of the EPUB to open; passed to the reader as a
            string.
        ignore_ncx: Value forwarded to the reader under the ``"ignore_ncx"``
            option key.

    Returns:
        The book object produced by the reader's ``load()`` call, returned
        after ``process()`` has run.
    """
    tolerant_reader = _TolerantEpubReader(
        str(epub_path),
        {"ignore_ncx": ignore_ncx},
    )
    loaded_book = tolerant_reader.load()
    # process() runs after load() and before the book is handed back.
    tolerant_reader.process()
    return loaded_book
|
||||||
|
|
||||||
_CHAPTER_MARKER_TEXT_RE = re.compile(
|
_CHAPTER_MARKER_TEXT_RE = re.compile(
|
||||||
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
|
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
@@ -29,7 +49,7 @@ def _html_to_text(html_content: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def build_merged_html_from_epub(epub_path: Path) -> str:
|
def build_merged_html_from_epub(epub_path: Path) -> str:
|
||||||
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
|
book = read_epub_safe(epub_path)
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
for item in book.get_items_of_type(ITEM_DOCUMENT):
|
for item in book.get_items_of_type(ITEM_DOCUMENT):
|
||||||
content = item.get_content().decode("utf-8", errors="replace")
|
content = item.get_content().decode("utf-8", errors="replace")
|
||||||
@@ -39,7 +59,7 @@ def build_merged_html_from_epub(epub_path: Path) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
|
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
|
||||||
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
|
book = read_epub_safe(epub_path)
|
||||||
out: list[dict[str, Any]] = []
|
out: list[dict[str, Any]] = []
|
||||||
idx = 1
|
idx = 1
|
||||||
for item in book.get_items_of_type(ITEM_DOCUMENT):
|
for item in book.get_items_of_type(ITEM_DOCUMENT):
|
||||||
|
|||||||
Reference in New Issue
Block a user