feat(auth, epub): enhance Google token verification and EPUB chapter extraction
Build and Push Reader API Image / docker (push) Successful in 14s
Build and Push Reader API Image / docker (push) Successful in 14s
- Added Google token verification logic to improve security and ensure that only valid tokens are processed.
- Introduced functions for extracting chapters from EPUB files based on HTML tags, including support for chapter markers.
- Updated `.env.example` to include configuration for an OpenAI-compatible router.
- Refactored existing functions for better readability and maintainability.
This commit is contained in:
+154
-1
@@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import html as html_lib
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
@@ -7,8 +9,13 @@ import html2text
|
||||
from ebooklib import ITEM_DOCUMENT
|
||||
from ebooklib import epub as epublib
|
||||
|
||||
_CHAPTER_MARKER_TEXT_RE = re.compile(
|
||||
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def _html_to_text(html_content: str) -> str:
|
||||
|
||||
def html_to_text(html_content: str) -> str:
|
||||
h = html2text.HTML2Text()
|
||||
h.ignore_links = True
|
||||
h.ignore_images = True
|
||||
@@ -17,6 +24,20 @@ def _html_to_text(html_content: str) -> str:
|
||||
return h.handle(html_content).strip()
|
||||
|
||||
|
||||
def _html_to_text(html_content: str) -> str:
    """Forward to :func:`html_to_text` and return its result unchanged.

    NOTE(review): presumably kept so existing callers of the private name keep
    working after the public ``html_to_text`` was introduced -- confirm.
    """
    converted = html_to_text(html_content)
    return converted
|
||||
|
||||
|
||||
def build_merged_html_from_epub(epub_path: Path) -> str:
    """Concatenate the markup of every document item in an EPUB.

    Each ``ITEM_DOCUMENT`` item is decoded as UTF-8 (undecodable bytes are
    replaced), items whose content is only whitespace are dropped, and the
    remaining documents are joined with a single newline.

    NOTE(review): documents appear in whatever order ``get_items_of_type``
    yields them -- confirm that this matches the book's reading order.
    """
    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
    decoded_documents = (
        item.get_content().decode("utf-8", errors="replace")
        for item in book.get_items_of_type(ITEM_DOCUMENT)
    )
    return "\n".join(doc for doc in decoded_documents if doc.strip())
|
||||
|
||||
|
||||
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
|
||||
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
|
||||
out: list[dict[str, Any]] = []
|
||||
@@ -36,3 +57,135 @@ def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
|
||||
)
|
||||
idx += 1
|
||||
return out
|
||||
|
||||
|
||||
def count_html_tag_opens(html: str, tag: str) -> int:
    """Count the opening tags named *tag* in *html*, ignoring letter case.

    Args:
        html: Markup to scan.
        tag: Element name (surrounding whitespace is ignored), e.g. ``"h2"``.

    Returns:
        The number of ``<tag`` openers found. Closing tags and longer names
        that merely start with *tag* (``<pre`` for ``p``) are not counted.
        Returns 0 when *tag* is blank or *html* is empty.
    """
    tag_name = tag.strip().lower()
    # A blank name would compile to "<" + word boundary and count every element.
    if not tag_name or not html:
        return 0
    opener_re = re.compile(rf"<{re.escape(tag_name)}\b", re.IGNORECASE)
    return sum(1 for _ in opener_re.finditer(html))
|
||||
|
||||
|
||||
def _strip_tags_to_text(fragment: str) -> str:
|
||||
return html_lib.unescape(re.sub(r"<[^>]+>", " ", fragment or "")).strip()
|
||||
|
||||
|
||||
def _title_from_tag_opening(opening_attrs: str, fragment: str, tag: str) -> str:
|
||||
tag_re = re.escape(tag)
|
||||
for attr in ("title", "alt"):
|
||||
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
|
||||
if match:
|
||||
title = html_lib.unescape(match.group(1)).strip()
|
||||
if title and len(title) <= 160:
|
||||
return title
|
||||
for attr in ("id", "name"):
|
||||
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
|
||||
if match:
|
||||
title = html_lib.unescape(match.group(1)).strip()
|
||||
if title and not title.startswith("#") and len(title) <= 160:
|
||||
return title
|
||||
close_match = re.search(
|
||||
rf"<{tag_re}\b[^>]*>(.*?)</{tag_re}>",
|
||||
fragment,
|
||||
flags=re.IGNORECASE | re.DOTALL,
|
||||
)
|
||||
if not close_match:
|
||||
return ""
|
||||
inner = _strip_tags_to_text(close_match.group(1))
|
||||
if inner and len(inner) <= 160:
|
||||
return inner
|
||||
return ""
|
||||
|
||||
|
||||
def _anchor_seems_chapter_marker(opening_attrs: str, inner_text: str) -> bool:
|
||||
text = (inner_text or "").strip()
|
||||
if text and _CHAPTER_MARKER_TEXT_RE.search(text):
|
||||
return True
|
||||
attrs = opening_attrs or ""
|
||||
if re.search(r'\bhref\s*=\s*["\'][^"\']*\.xhtml', attrs, flags=re.IGNORECASE):
|
||||
return True
|
||||
if re.search(
|
||||
r'\b(?:id|name)\s*=\s*["\'][^"\']*(?:chuong|chương|chapter|ch\d|c\d|hoi|hồi)',
|
||||
attrs,
|
||||
flags=re.IGNORECASE,
|
||||
):
|
||||
return True
|
||||
# TOC / nav links thường có text ngắn.
|
||||
if text and len(text) <= 120:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _derive_simple_chapter_title(txt: str, number: int) -> str:
|
||||
for line in (txt or "").splitlines():
|
||||
cleaned = line.strip()
|
||||
if cleaned:
|
||||
return cleaned[:160]
|
||||
return f"Chương {number}"
|
||||
|
||||
|
||||
def extract_chapters_by_html_tag(
    epub_path: Path,
    tag: str,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Split an EPUB into chapters at every opening ``<tag ...>``.

    The merged markup of the book is cut at each opener of *tag*; every slice
    runs from one opener up to the next one (or to the end of the markup).
    Markup located before the first opener is not part of any chapter.

    When *tag* is ``a`` and more than 300 openers are found, the openers are
    first narrowed to those that ``_anchor_seems_chapter_marker`` accepts;
    if that would leave none, the unfiltered list is kept.

    Args:
        epub_path: Path of the EPUB file to read.
        tag: Element name to split on (whitespace and letter case ignored).

    Returns:
        ``(chapters, stats)``. Each chapter is a dict with the keys
        ``number`` (1-based, consecutive), ``title``, ``raw_html`` and
        ``txt``. ``stats`` holds ``tagOpens`` (openers found),
        ``tagOpensFiltered`` (openers dropped by the anchor heuristic) and
        ``tagOpensUsed`` (openers kept as split points; this can exceed
        ``len(chapters)`` because empty anchors are skipped). A blank *tag*
        or an EPUB without markup yields ``([], stats)`` with all zeros.
    """
    anchor_filter_threshold = 300  # only filter <a> when there are this many
    anchor_lookahead_chars = 800  # window scanned for an anchor's closing tag

    merged_html = build_merged_html_from_epub(epub_path)
    stats = {"tagOpens": 0, "tagOpensUsed": 0, "tagOpensFiltered": 0}
    if not merged_html.strip():
        return [], stats

    tag_name = tag.strip().lower()
    if not tag_name:
        # A blank name would make the opener pattern match every element and
        # turn each one into a bogus chapter boundary.
        return [], stats

    tag_re = re.escape(tag_name)
    opener_re = re.compile(rf"<({tag_re})\b([^>]*)>", re.IGNORECASE)
    matches = list(opener_re.finditer(merged_html))
    stats["tagOpens"] = len(matches)
    if not matches:
        return [], stats

    if tag_name == "a" and len(matches) > anchor_filter_threshold:
        closer_re = re.compile(rf"</{tag_re}>", re.IGNORECASE)
        filtered: list[re.Match[str]] = []
        for match in matches:
            attrs = match.group(2) or ""
            rest = merged_html[match.end() : match.end() + anchor_lookahead_chars]
            close = closer_re.search(rest)
            inner_html = rest[: close.start()] if close else rest
            if _anchor_seems_chapter_marker(attrs, _strip_tags_to_text(inner_html)):
                filtered.append(match)
        # Keep the unfiltered list when the heuristic rejects everything.
        if filtered:
            stats["tagOpensFiltered"] = len(matches) - len(filtered)
            matches = filtered

    # Matches a slice consisting solely of the opener and, optionally, its
    # immediate closing tag.
    tag_only_re = re.compile(
        rf"<{tag_re}\b[^>]*>\s*(?:</{tag_re}>\s*)?",
        re.IGNORECASE | re.DOTALL,
    )

    starts = [match.start() for match in matches]
    ends = starts[1:] + [len(merged_html)]

    chapters: list[dict[str, Any]] = []
    for match, start, end in zip(matches, starts, ends):
        raw_html = merged_html[start:end].strip()
        if not raw_html:
            continue

        opening_attrs = match.group(2) or ""
        txt = html_to_text(raw_html)
        inline_title = _title_from_tag_opening(opening_attrs, raw_html, tag_name)

        # Skip empty anchors that have neither a title nor any following content.
        if not txt.strip() and not inline_title and tag_only_re.fullmatch(raw_html):
            continue

        number = len(chapters) + 1
        chapters.append(
            {
                "number": number,
                "title": inline_title or _derive_simple_chapter_title(txt, number),
                "raw_html": raw_html,
                "txt": txt,
            }
        )

    stats["tagOpensUsed"] = len(matches)
    return chapters, stats
|
||||
|
||||
Reference in New Issue
Block a user