from __future__ import annotations
import html as html_lib
import posixpath as zip_path
import re
from pathlib import Path
from typing import Any
import html2text
from ebooklib import ITEM_DOCUMENT
from ebooklib import epub as epublib
class _TolerantEpubReader(epublib.EpubReader):
    """EPUB reader that skips manifest entries pointing at files missing from the archive."""

    def read_file(self, name):
        """Return the bytes stored under ``name``, or ``b""`` if the archive has no such member."""
        member = zip_path.normpath(name)
        try:
            payload = self.zf.read(member)
        except KeyError:
            # The manifest references a file the zip does not contain:
            # hand back empty content instead of aborting the whole load.
            payload = b""
        return payload
def read_epub_safe(epub_path: Path, *, ignore_ncx: bool = False) -> epublib.EpubBook:
    """Load an EPUB through the tolerant reader and return the parsed book.

    Args:
        epub_path: Location of the EPUB file; passed to the reader as a string.
        ignore_ncx: Forwarded to the reader as its ``ignore_ncx`` option.

    Returns:
        The book object produced by the reader's ``load()`` call, after
        ``process()`` has been run on the reader.
    """
    tolerant_reader = _TolerantEpubReader(str(epub_path), {"ignore_ncx": ignore_ncx})
    loaded_book = tolerant_reader.load()
    tolerant_reader.process()
    return loaded_book
_CHAPTER_MARKER_TEXT_RE = re.compile(
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
re.IGNORECASE,
)
def html_to_text(html_content: str) -> str:
    """Render HTML as text with html2text and trim surrounding whitespace.

    Links and images are ignored, emphasis markup is kept, and the body
    width is set to 0.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0  # html2text: 0 turns off line wrapping
    converter.ignore_emphasis = False
    converter.ignore_images = True
    converter.ignore_links = True
    rendered = converter.handle(html_content)
    return rendered.strip()
def _html_to_text(html_content: str) -> str:
    """Private alias that forwards to ``html_to_text``."""
    plain_text = html_to_text(html_content)
    return plain_text
def build_merged_html_from_epub(epub_path: Path) -> str:
    """Concatenate the HTML of every document item in the EPUB.

    Each item is decoded as UTF-8 (undecodable bytes are replaced), items
    that are empty or whitespace-only are dropped, and the rest are joined
    with newlines.
    """
    documents = read_epub_safe(epub_path).get_items_of_type(ITEM_DOCUMENT)
    decoded = (
        document.get_content().decode("utf-8", errors="replace")
        for document in documents
    )
    return "\n".join(markup for markup in decoded if markup.strip())
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
    """Turn each non-empty document item of the EPUB into one chapter dict.

    Returns:
        A list of dicts with ``number`` (1-based, counting only kept items),
        ``title`` (the item name, or ``"Chapter N"`` when the name is empty),
        ``content`` (decoded HTML) and ``txt`` (text rendering). Items whose
        text rendering is empty are skipped.
    """
    chapters: list[dict[str, Any]] = []
    for document in read_epub_safe(epub_path).get_items_of_type(ITEM_DOCUMENT):
        markup = document.get_content().decode("utf-8", errors="replace")
        plain = _html_to_text(markup)
        if plain:
            # Numbering follows the kept chapters, not the raw item index.
            position = len(chapters) + 1
            chapters.append(
                {
                    "number": position,
                    "title": document.get_name() or f"Chapter {position}",
                    "content": markup,
                    "txt": plain,
                }
            )
    return chapters
def count_html_tag_opens(html: str, tag: str) -> int:
    """Count the opening ``<tag`` occurrences in ``html``.

    The tag name is stripped and lower-cased, matching is case-insensitive,
    and a word boundary after the name keeps ``<p`` from counting ``<pre``.
    """
    opener = re.compile("<" + re.escape(tag.strip().lower()) + r"\b", re.IGNORECASE)
    return sum(1 for _ in opener.finditer(html))
def _strip_tags_to_text(fragment: str) -> str:
return html_lib.unescape(re.sub(r"<[^>]+>", " ", fragment or "")).strip()
def _title_from_tag_opening(opening_attrs: str, fragment: str, tag: str) -> str:
tag_re = re.escape(tag)
for attr in ("title", "alt"):
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
if match:
title = html_lib.unescape(match.group(1)).strip()
if title and len(title) <= 160:
return title
for attr in ("id", "name"):
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
if match:
title = html_lib.unescape(match.group(1)).strip()
if title and not title.startswith("#") and len(title) <= 160:
return title
close_match = re.search(
rf"<{tag_re}\b[^>]*>(.*?){tag_re}>",
fragment,
flags=re.IGNORECASE | re.DOTALL,
)
if not close_match:
return ""
inner = _strip_tags_to_text(close_match.group(1))
if inner and len(inner) <= 160:
return inner
return ""
def _anchor_seems_chapter_marker(opening_attrs: str, inner_text: str) -> bool:
text = (inner_text or "").strip()
if text and _CHAPTER_MARKER_TEXT_RE.search(text):
return True
attrs = opening_attrs or ""
if re.search(r'\bhref\s*=\s*["\'][^"\']*\.xhtml', attrs, flags=re.IGNORECASE):
return True
if re.search(
r'\b(?:id|name)\s*=\s*["\'][^"\']*(?:chuong|chương|chapter|ch\d|c\d|hoi|hồi)',
attrs,
flags=re.IGNORECASE,
):
return True
# TOC / nav links thường có text ngắn.
if text and len(text) <= 120:
return True
return False
def _derive_simple_chapter_title(txt: str, number: int) -> str:
for line in (txt or "").splitlines():
cleaned = line.strip()
if cleaned:
return cleaned[:160]
return f"Chương {number}"
def extract_chapters_by_html_tag(
    epub_path: Path,
    tag: str,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Split an EPUB into chapters at every opening ``<tag>``.

    All document items are merged into one HTML string, which is cut at the
    start of each opening tag; a chapter runs up to the next opener, or to
    the end of the merged HTML for the last one.

    Args:
        epub_path: Path to the EPUB file.
        tag: Tag name to split on (for example ``"h2"`` or ``"a"``).
            Surrounding whitespace and letter case are ignored. An empty
            name yields no chapters.

    Returns:
        ``(chapters, stats)``. Each chapter is a dict with ``number``,
        ``title``, ``raw_html`` and ``txt``. ``stats`` holds ``tagOpens``
        (openers found), ``tagOpensFiltered`` (anchors discarded by the
        chapter-marker heuristic) and ``tagOpensUsed`` (openers kept as
        split points).
    """
    merged_html = build_merged_html_from_epub(epub_path)
    stats = {"tagOpens": 0, "tagOpensUsed": 0, "tagOpensFiltered": 0}
    if not merged_html.strip():
        return [], stats

    tag_name = tag.strip().lower()
    if not tag_name:
        # An empty name would compile to a pattern matching every element.
        return [], stats

    tag_re = re.escape(tag_name)
    opener_re = re.compile(rf"<({tag_re})\b([^>]*)>", re.IGNORECASE)
    # Closing tags must include the leading "</"; matching only "tag>" would
    # also hit ordinary text and leave a dangling "</" in the captured HTML.
    closer_re = re.compile(rf"</{tag_re}\s*>", re.IGNORECASE)
    tag_only_re = re.compile(
        rf"<{tag_re}\b[^>]*>\s*(?:</{tag_re}\s*>\s*)?",
        re.IGNORECASE | re.DOTALL,
    )

    matches = list(opener_re.finditer(merged_html))
    stats["tagOpens"] = len(matches)
    if not matches:
        return [], stats

    # With more than 300 anchors, splitting on every one is too noisy; keep
    # only those that look like chapter markers.
    if tag_name == "a" and len(matches) > 300:
        filtered: list[re.Match[str]] = []
        for match in matches:
            # Inspect at most 800 characters after the opener for its text.
            lookahead = merged_html[match.end() : match.end() + 800]
            close = closer_re.search(lookahead)
            inner_html = lookahead[: close.start()] if close else lookahead
            inner_text = _strip_tags_to_text(inner_html)
            if _anchor_seems_chapter_marker(match.group(2) or "", inner_text):
                filtered.append(match)
        # If the heuristic rejects everything, fall back to all anchors.
        if filtered:
            stats["tagOpensFiltered"] = len(matches) - len(filtered)
            matches = filtered

    chapters: list[dict[str, Any]] = []
    for index, match in enumerate(matches):
        end = matches[index + 1].start() if index + 1 < len(matches) else len(merged_html)
        raw_html = merged_html[match.start() : end].strip()
        if not raw_html:
            continue
        txt = html_to_text(raw_html)
        inline_title = _title_from_tag_opening(match.group(2) or "", raw_html, tag_name)
        # Skip empty anchors that have no title and no following content.
        if not txt.strip() and not inline_title and tag_only_re.fullmatch(raw_html):
            continue
        number = len(chapters) + 1
        chapters.append(
            {
                "number": number,
                "title": inline_title or _derive_simple_chapter_title(txt, number),
                "raw_html": raw_html,
                "txt": txt,
            }
        )
    stats["tagOpensUsed"] = len(matches)
    return chapters, stats