feat(auth, epub): enhance Google token verification and EPUB chapter extraction
Build and Push Reader API Image / docker (push) Successful in 14s

- Added Google token verification logic to improve security and ensure valid tokens are processed.
- Introduced functions for extracting chapters from EPUB files based on HTML tags, including support for chapter markers.
- Updated `.env.example` to include configuration for an OpenAI-compatible router.
- Refactored existing functions for better readability and maintainability.
This commit is contained in:
2026-05-19 00:15:20 +07:00
parent 611213ae5a
commit bddd592146
4 changed files with 754 additions and 68 deletions
+5
View File
@@ -16,3 +16,8 @@ CORS_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
# Environment label
APP_ENV=development
# OpenAI-compatible router (9router / OpenRouter) cho AI gợi ý EPUB import
# Liệt kê model: GET {ROUTER_BASE_URL}/models
ROUTER_BASE_URL=http://192.168.100.146:20128/v1
ROUTER_API_KEY=
+101
View File
@@ -1,10 +1,13 @@
from __future__ import annotations
import datetime as dt
import logging
import os
from typing import Any
from fastapi import Depends, HTTPException, Request
from google.auth.transport import requests as google_requests
from google.oauth2 import id_token as google_id_token
from jose import JWTError, jwt
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
@@ -12,6 +15,8 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import get_db_session
logger = logging.getLogger(__name__)
SESSION_COOKIE_KEYS = [
"next-auth.session-token",
"__Secure-next-auth.session-token",
@@ -21,6 +26,102 @@ SESSION_COOKIE_KEYS = [
]
ACCESS_TOKEN_TTL_SECONDS = 7 * 24 * 60 * 60
GOOGLE_TOKEN_CLOCK_SKEW_SECONDS = 60
def _google_token_audiences_to_try(token: str) -> list[str | None]:
    """Build the ordered list of audiences to attempt verification with.

    Configured client ids come first, followed by the ``aud`` / ``azp``
    values read from the token's *unverified* claims. Blank and duplicate
    values are dropped. When nothing could be collected the result is
    ``[None]``.
    """
    ordered: list[str | None] = []
    known: set[str] = set()

    def remember(candidate: str | None) -> None:
        # ``None`` is tracked separately because it cannot be stripped.
        if candidate is None:
            if None not in ordered:
                ordered.append(None)
            return
        trimmed = candidate.strip()
        if trimmed and trimmed not in known:
            known.add(trimmed)
            ordered.append(trimmed)

    for configured in settings.google_client_id_list:
        remember(configured)

    try:
        unverified = jwt.get_unverified_claims(token)
        for claim_name in ("aud", "azp"):
            claim_value = unverified.get(claim_name)
            if isinstance(claim_value, str):
                remember(claim_value)
            elif isinstance(claim_value, list):
                for entry in claim_value:
                    if isinstance(entry, str):
                        remember(entry)
    except Exception:
        # Best effort: a token whose claims cannot be read contributes nothing.
        pass

    return ordered or [None]
def verify_google_id_token(raw_token: str) -> dict[str, Any]:
    """Verify a Google ID token and return its claims.

    The token is checked with ``google_id_token.verify_oauth2_token`` once per
    candidate audience from ``_google_token_audiences_to_try``. The first
    attempt that verifies and passes the allow-list check wins.

    Args:
        raw_token: The ID token string as received from the client.

    Returns:
        The claims dict returned by ``verify_oauth2_token``.

    Raises:
        HTTPException: 400 when the value does not have three dot-separated
            segments; 503 when the last error text looks like a
            connectivity/TLS failure; 401 for every other rejection.
    """
    token = raw_token.strip()
    # A compact JWT has exactly two dots (three segments).
    if token.count(".") != 2:
        raise HTTPException(status_code=400, detail="googleIdToken must be a JWT")
    request = google_requests.Request()
    last_exc: Exception | None = None
    for audience in _google_token_audiences_to_try(token):
        try:
            id_info = google_id_token.verify_oauth2_token(
                token,
                request,
                audience,
                clock_skew_in_seconds=GOOGLE_TOKEN_CLOCK_SKEW_SECONDS,
            )
            aud = id_info.get("aud")
            allowed = set(settings.google_client_id_list)
            # The allow-list is enforced only when client ids are configured.
            if allowed:
                aud_values: set[str] = set()
                if isinstance(aud, str):
                    aud_values.add(aud)
                elif isinstance(aud, list):
                    aud_values.update(str(item) for item in aud)
                # ``azp`` is accepted as an alternative to ``aud`` for the check.
                azp = id_info.get("azp")
                if isinstance(azp, str):
                    aud_values.add(azp)
                if aud_values.isdisjoint(allowed):
                    last_exc = ValueError(f"token audience not allowed: {aud_values}")
                    continue
            return id_info
        except Exception as exc:
            # Remember the failure and try the next candidate audience.
            last_exc = exc
            continue
    # Every attempt failed: log unverified claims for diagnostics (never the token itself).
    try:
        claims = jwt.get_unverified_claims(token)
        logger.warning(
            "google id token rejected len=%s iss=%s aud=%s azp=%s exp=%s err=%s",
            len(token),
            claims.get("iss"),
            claims.get("aud"),
            claims.get("azp"),
            claims.get("exp"),
            last_exc,
        )
    except Exception:
        logger.warning("google id token rejected len=%s err=%s", len(token), last_exc)
    # Classify by substring of the last error text: transport-looking failures become 503.
    err_text = str(last_exc or "").lower()
    if any(x in err_text for x in ("certificate", "connection", "timeout", "urlopen", "ssl", "network")):
        raise HTTPException(
            status_code=503,
            detail="Unable to verify Google token (reader-api cannot reach googleapis.com)",
        ) from last_exc
    raise HTTPException(status_code=401, detail="Invalid Google token") from last_exc
def _jwt_secret() -> str:
+154 -1
View File
@@ -1,5 +1,7 @@
from __future__ import annotations
import html as html_lib
import re
from pathlib import Path
from typing import Any
@@ -7,8 +9,13 @@ import html2text
from ebooklib import ITEM_DOCUMENT
from ebooklib import epub as epublib
# Matches a chapter-style marker: a heading keyword (English or Vietnamese,
# accented or unaccented) followed by optional whitespace and a number.
# "chương" is spelled with "ư" (not "u"), so it needs its own alternative;
# the mixed "chuơng" spelling is kept for backward compatibility.
_CHAPTER_MARKER_TEXT_RE = re.compile(
    r"(?:ch(?:ương|uơng|uong)?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
    re.IGNORECASE,
)
def _html_to_text(html_content: str) -> str:
def html_to_text(html_content: str) -> str:
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
@@ -17,6 +24,20 @@ def _html_to_text(html_content: str) -> str:
return h.handle(html_content).strip()
def _html_to_text(html_content: str) -> str:
    """Forward to :func:`html_to_text` under the underscore-prefixed name.

    Presumably kept so existing callers of the private name keep working.
    """
    return html_to_text(html_content)
def build_merged_html_from_epub(epub_path: Path) -> str:
    """Concatenate the markup of every document item in an EPUB.

    Each ``ITEM_DOCUMENT`` item is decoded as UTF-8 (undecodable bytes are
    replaced), whitespace-only documents are dropped, and the remaining
    documents are joined with a newline in the order the book yields them.
    """
    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
    decoded_documents = (
        document.get_content().decode("utf-8", errors="replace")
        for document in book.get_items_of_type(ITEM_DOCUMENT)
    )
    return "\n".join(markup for markup in decoded_documents if markup.strip())
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
out: list[dict[str, Any]] = []
@@ -36,3 +57,135 @@ def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
)
idx += 1
return out
def count_html_tag_opens(html: str, tag: str) -> int:
    """Count opening occurrences of ``<tag`` in ``html`` (case-insensitive).

    Args:
        html: Markup to scan.
        tag: Tag name; surrounding whitespace and letter case are ignored.

    Returns:
        The number of ``<tag`` openers. ``0`` when ``tag`` or ``html`` is
        empty; without that guard a blank tag would build the pattern ``<\\b``
        and count every opening tag in the document.
    """
    tag_name = (tag or "").strip().lower()
    if not tag_name or not html:
        return 0
    # ``\b`` keeps "<a" from matching longer names such as "<abbr".
    return len(re.findall(rf"<{re.escape(tag_name)}\b", html, flags=re.IGNORECASE))
def _strip_tags_to_text(fragment: str) -> str:
return html_lib.unescape(re.sub(r"<[^>]+>", " ", fragment or "")).strip()
def _title_from_tag_opening(opening_attrs: str, fragment: str, tag: str) -> str:
    """Derive a chapter title from a tag's attributes or its inner text.

    Lookup order: ``title`` / ``alt`` attribute, then ``id`` / ``name``
    attribute (values starting with ``#`` are ignored), then the text between
    the first ``<tag ...>`` and ``</tag>`` in ``fragment``. Candidates longer
    than 160 characters are rejected.

    Args:
        opening_attrs: Raw attribute text of the opening tag.
        fragment: Markup that starts at the opening tag.
        tag: Tag name used to locate the closing tag.

    Returns:
        The title, or ``""`` when no usable candidate exists.
    """
    attrs = opening_attrs or ""

    def attr_value(name: str) -> str:
        # The lookbehind anchors the attribute name so "title" does not match
        # inside "data-title"/"subtitle" and "id" does not match inside
        # "data-id"/"valid".
        found = re.search(
            rf'(?<![\w-]){name}\s*=\s*["\']([^"\']+)["\']',
            attrs,
            flags=re.IGNORECASE,
        )
        return html_lib.unescape(found.group(1)).strip() if found else ""

    for attr in ("title", "alt"):
        title = attr_value(attr)
        if title and len(title) <= 160:
            return title
    for attr in ("id", "name"):
        title = attr_value(attr)
        if title and not title.startswith("#") and len(title) <= 160:
            return title

    tag_re = re.escape(tag)
    close_match = re.search(
        rf"<{tag_re}\b[^>]*>(.*?)</{tag_re}>",
        fragment,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if not close_match:
        return ""
    inner = _strip_tags_to_text(close_match.group(1))
    if inner and len(inner) <= 160:
        return inner
    return ""
def _anchor_seems_chapter_marker(opening_attrs: str, inner_text: str) -> bool:
    """Heuristically decide whether an anchor marks a chapter.

    An anchor qualifies when its text matches the chapter-marker pattern, its
    ``href`` points at an ``.xhtml`` file, its ``id``/``name`` contains a
    chapter-like token, or it simply has non-empty text of at most 120
    characters.
    """
    label = (inner_text or "").strip()
    attr_text = opening_attrs or ""

    if label and _CHAPTER_MARKER_TEXT_RE.search(label):
        return True

    links_to_document = re.search(
        r'\bhref\s*=\s*["\'][^"\']*\.xhtml',
        attr_text,
        flags=re.IGNORECASE,
    )
    if links_to_document:
        return True

    chapter_like_anchor = re.search(
        r'\b(?:id|name)\s*=\s*["\'][^"\']*(?:chuong|chương|chapter|ch\d|c\d|hoi|hồi)',
        attr_text,
        flags=re.IGNORECASE,
    )
    if chapter_like_anchor:
        return True

    # TOC / nav links usually have short text.
    return bool(label) and len(label) <= 120
def _derive_simple_chapter_title(txt: str, number: int) -> str:
    """Use the first non-blank line of ``txt`` (max 160 chars) as the title.

    Falls back to ``"Chương <number>"`` when the text has no non-blank line.
    """
    stripped_lines = (candidate.strip() for candidate in (txt or "").splitlines())
    first_non_blank = next((candidate for candidate in stripped_lines if candidate), None)
    if first_non_blank is None:
        return f"Chương {number}"
    return first_non_blank[:160]
def extract_chapters_by_html_tag(
    epub_path: Path,
    tag: str,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Split chapters at every opening `<tag ...>` tag. Returns (chapters, stats).

    The EPUB's document items are merged into one HTML string and cut at each
    opening occurrence of ``tag``; each slice runs up to the next used opener
    (or the end of the document).

    Returns:
        ``chapters``: dicts with ``number``, ``title``, ``raw_html``, ``txt``.
        ``stats``: ``tagOpens`` (all openers found), ``tagOpensFiltered``
        (anchors discarded by the chapter-marker heuristic) and
        ``tagOpensUsed`` (openers left after filtering, which can exceed the
        number of chapters when empty slices are skipped).
    """
    merged_html = build_merged_html_from_epub(epub_path)
    stats = {"tagOpens": 0, "tagOpensUsed": 0, "tagOpensFiltered": 0}
    if not merged_html.strip():
        return [], stats
    tag_name = tag.strip().lower()
    tag_re = re.escape(tag_name)
    # Group 1 is the tag name, group 2 the raw attribute text.
    opener_re = re.compile(rf"<({tag_re})\b([^>]*)>", re.IGNORECASE)
    matches = list(opener_re.finditer(merged_html))
    stats["tagOpens"] = len(matches)
    if not matches:
        return [], stats
    # With very many anchors, keep only those that look like chapter markers.
    if tag_name == "a" and len(matches) > 300:
        filtered: list[re.Match[str]] = []
        for match in matches:
            attrs = match.group(2) or ""
            # Only the 800 characters after the opener are inspected for the anchor text.
            rest = merged_html[match.end() : match.end() + 800]
            close = re.search(rf"</{tag_re}>", rest, flags=re.IGNORECASE)
            inner_html = rest[: close.start()] if close else rest
            inner_text = _strip_tags_to_text(inner_html)
            if _anchor_seems_chapter_marker(attrs, inner_text):
                filtered.append(match)
        # If the heuristic rejects everything, fall back to the unfiltered matches.
        if filtered:
            stats["tagOpensFiltered"] = len(matches) - len(filtered)
            matches = filtered
    chapters: list[dict[str, Any]] = []
    for index, match in enumerate(matches):
        start = match.start()
        end = matches[index + 1].start() if index + 1 < len(matches) else len(merged_html)
        raw_html = merged_html[start:end].strip()
        if not raw_html:
            continue
        opening_attrs = match.group(2) or ""
        txt = html_to_text(raw_html)
        inline_title = _title_from_tag_opening(opening_attrs, raw_html, tag_name)
        # Numbering follows emitted chapters, so skipped slices leave no gaps.
        number = len(chapters) + 1
        title = inline_title or _derive_simple_chapter_title(txt, number)
        # Skip an empty anchor that has no title and no content following it.
        if not txt.strip() and not inline_title:
            tag_only = re.fullmatch(
                rf"<{tag_re}\b[^>]*>\s*(?:</{tag_re}>\s*)?",
                raw_html,
                flags=re.IGNORECASE | re.DOTALL,
            )
            if tag_only:
                continue
        chapters.append(
            {
                "number": number,
                "title": title,
                "raw_html": raw_html,
                "txt": txt,
            }
        )
    stats["tagOpensUsed"] = len(matches)
    return chapters, stats
+493 -66
View File
@@ -4,7 +4,9 @@ import asyncio
import base64
import datetime as dt
import hashlib
import html as html_lib
import json
import logging
import os
import random
import re
@@ -24,17 +26,17 @@ from fastapi import Body, Depends, FastAPI, File, Form, HTTPException, Query, Re
from fastapi.middleware.cors import CORSMiddleware
import httpx
from fastapi.responses import Response
from google.auth.transport import requests as google_requests
from google.oauth2 import id_token as google_id_token
from pydantic import BaseModel, Field
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user
from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user, verify_google_id_token
from app.config import settings
from app.database import get_db_session
from app.storage import storage
logger = logging.getLogger(__name__)
# Giới hạn chương EPUB chỉ khi client gửi `enforceMaxChapters=true` (import nhiều / batch).
MOD_EPUB_MAX_CHAPTERS = 4000
@@ -1673,18 +1675,19 @@ async def mod_delete_chapter(
@app.post("/api/mod/chuong/bulk-delete")
async def mod_bulk_delete_chapters(
payload: ModChapterBulkDeletePayload,
payload: dict[str, Any] = Body(...),
db: AsyncSession = Depends(get_db_session),
user: dict = Depends(require_current_user),
):
if user.get("role") not in ("MOD", "ADMIN"):
raise HTTPException(status_code=403, detail="Forbidden")
from_num = min(payload.fromNumber, payload.toNumber)
to_num = max(payload.fromNumber, payload.toNumber)
parsed = ModChapterBulkDeletePayload.model_validate(payload)
from_num = min(parsed.fromNumber, parsed.toNumber)
to_num = max(parsed.fromNumber, parsed.toNumber)
ids = (
await db.execute(
text('SELECT id FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num'),
{"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num},
{"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num},
)
).mappings().all()
chapter_ids = [str(r["id"]) for r in ids]
@@ -1693,14 +1696,63 @@ async def mod_bulk_delete_chapters(
deleted_count = (
await db.execute(
text('DELETE FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num RETURNING id'),
{"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num},
{"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num},
)
).mappings().all()
await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": payload.novelId})
await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": parsed.novelId})
await db.commit()
return {"deletedCount": len(deleted_count)}
@app.post("/api/mod/chuong/normalize-titles/preview")
async def mod_normalize_chapter_titles_preview(
    payload: dict[str, Any] = Body(...),
    db: AsyncSession = Depends(get_db_session),
    user: dict = Depends(require_current_user),
):
    """Preview title changes inferred from chapter content (no writes).

    Scans every chapter of the novel in ascending number order and lists the
    chapters whose inferred title differs from the stored one.

    Returns:
        ``novelId``, ``scannedCount`` (chapters read), ``changeCount`` and
        ``items`` (``id``, ``number``, ``currentTitle``, ``suggestedTitle``).

    Raises:
        HTTPException: 403 for non MOD/ADMIN users, 422 when the body does not
            match ``ModNormalizeTitlesPreviewPayload``, 400 for a blank
            ``novelId``.
    """
    # Local import keeps this block self-contained.
    from pydantic import ValidationError

    if user.get("role") not in ("MOD", "ADMIN"):
        raise HTTPException(status_code=403, detail="Forbidden")

    # The body arrives as a plain dict, so validation failures are raised here
    # rather than by FastAPI; map them to 422 instead of letting them become 500.
    try:
        parsed = ModNormalizeTitlesPreviewPayload.model_validate(payload)
    except ValidationError as exc:
        raise HTTPException(status_code=422, detail=exc.errors(include_url=False)) from exc

    novel_id = parsed.novelId.strip()
    if not novel_id:
        raise HTTPException(status_code=400, detail="novelId is required")

    rows = (
        await db.execute(
            text('SELECT id, number, title FROM "ChapterMeta" WHERE "novelId" = :novel_id ORDER BY number ASC'),
            {"novel_id": novel_id},
        )
    ).mappings().all()

    items: list[dict[str, Any]] = []
    for row in rows:
        number = int(row.get("number") or 0)
        current_title = str(row.get("title") or "").strip()
        # Decide eligibility before loading the chapter body: non-generic
        # titles are never overwritten in this mode, so their content is not needed.
        if parsed.overwriteGenericOnly and not _is_generic_chapter_title(current_title, number):
            continue
        chapter_id = str(row["id"])
        content = await _resolve_chapter_content(chapter_id, db) or ""
        suggested_title = _infer_chapter_title_from_content(content, number, current_title).strip()
        if not suggested_title or suggested_title == current_title:
            continue
        items.append(
            {
                "id": chapter_id,
                "number": number,
                "currentTitle": current_title,
                "suggestedTitle": suggested_title,
            }
        )

    return {
        "novelId": novel_id,
        "scannedCount": len(rows),
        "changeCount": len(items),
        "items": items,
    }
@app.put("/api/mod/chuong/optimize")
async def mod_optimize_chapters(
payload: dict[str, Any] = Body(...),
@@ -1866,6 +1918,7 @@ async def mod_epub_upload(
preview: str | None = Form(default=None),
splitMode: str | None = Form(default=None),
chapterRegex: str | None = Form(default=None),
chapterTag: str | None = Form(default=None),
title: str | None = Form(default=None),
originalTitle: str | None = Form(default=None),
authorName: str | None = Form(default=None),
@@ -1891,11 +1944,12 @@ async def mod_epub_upload(
tmp_path = Path(tmp.name)
try:
mode = "regex" if (splitMode or "").lower() == "regex" else "toc"
mode = _resolve_epub_split_mode(splitMode)
pattern = (chapterRegex or "").strip() or None
effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None
source_sections = _extract_epub_chapters(tmp_path)
sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections
chapters = _epub_extract_with_mode(tmp_path, mode, pattern)
chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag)
epub_meta = _extract_epub_metadata(tmp_path)
inferred_title = str(epub_meta.get("title") or Path(file.filename or "novel").stem)
inferred_author = str(epub_meta.get("author") or "Unknown")
@@ -1938,7 +1992,8 @@ async def mod_epub_upload(
"coverPreviewDataUrl": cover_data_url_b,
"parserInfo": {
"splitMode": mode,
"chapterRegexUsed": pattern,
"chapterRegexUsed": pattern if mode == "regex" else None,
"chapterTagUsed": effective_tag if mode == "tag" else None,
"sourceSections": len(source_sections),
"sectionsAfterFilter": len(sections_after_filter),
"sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)),
@@ -1999,7 +2054,8 @@ async def mod_epub_upload(
"coverPreviewDataUrl": cover_preview_data_url,
"parserInfo": {
"splitMode": mode,
"chapterRegexUsed": pattern,
"chapterRegexUsed": pattern if mode == "regex" else None,
"chapterTagUsed": effective_tag if mode == "tag" else None,
"sourceSections": len(source_sections),
"sectionsAfterFilter": len(sections_after_filter),
"sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)),
@@ -2590,6 +2646,11 @@ class ModChapterOptimizePayload(BaseModel):
updates: list[ModChapterOptimizeItem]
class ModNormalizeTitlesPreviewPayload(BaseModel):
    """Request body for ``POST /api/mod/chuong/normalize-titles/preview``."""

    # Id of the novel whose chapters are scanned.
    novelId: str
    # When true (the default), only chapters whose current title is generic
    # (per `_is_generic_chapter_title`) are proposed for renaming.
    overwriteGenericOnly: bool = True
class ModChapterGlobalReplacePayload(BaseModel):
novelId: str
action: str
@@ -2628,33 +2689,114 @@ def _asset_file_sha256(path: Path) -> str:
return h.hexdigest()
def _derive_chapter_title(txt: str, fallback: str, number: int) -> str:
lines = [line.strip().lstrip("#").strip() for line in txt.splitlines() if line.strip()]
chapter_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+", re.IGNORECASE)
chapter_num_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+", re.IGNORECASE)
# Heading keywords in unaccented form. The patterns built from this prefix are
# applied to `_norm_title(...)` output in this module — presumably
# diacritic-folded text; confirm against `_norm_title`.
_CHAPTER_HEADING_PREFIX = r"(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)"
# Keyword + number, then a separator (punctuation or whitespace) and at least
# one more character, i.e. a heading that carries a subtitle on the same line.
_CHAPTER_WITH_SUBTITLE_RE = re.compile(
    rf"^{_CHAPTER_HEADING_PREFIX}\s*\d+(?:[\.:\-\)]\s*|\s+).+",
    re.IGNORECASE,
)
# Keyword + number and nothing else besides an optional trailing ":", "-" or ".".
# Group 1 is the number.
_CHAPTER_NUM_ONLY_RE = re.compile(
    rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)\s*[:\-\.]?\s*$",
    re.IGNORECASE,
)
# Keyword + number at the start of the line, regardless of what follows.
_CHAPTER_NUM_PREFIX_RE = re.compile(rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)", re.IGNORECASE)
# Accented variant applied to the raw (non-normalized) line; group 1 is the
# subtitle that follows the chapter number.
_CHAPTER_INLINE_SUBTITLE_RE = re.compile(
    r"^(?:Chương|Ch\.?|Chapter|Hồi|Quyển|Phần|Tập)\s*\d+(?:[\.:\-\)]\s*|\s+)(.+)$",
    re.IGNORECASE,
)
for line in lines[:12]:
def _looks_like_body_paragraph(line: str) -> bool:
    """Guess whether ``line`` is narrative body text rather than a heading.

    A line counts as body text when it is blank, longer than 200 characters,
    longer than 90 characters and ending with sentence punctuation, has ten or
    more words, or starts with a common narrative opener and has at least
    eight words.
    """
    s = (line or "").strip()
    if not s:
        return True
    if len(s) > 200:
        return True
    # The suffix tuple must not contain "": every string ends with the empty
    # string, which would turn this into a plain ``len(s) > 90`` test.
    if len(s) > 90 and s.endswith((".", "!", "?")):
        return True
    word_count = len(s.split())
    if word_count >= 10:
        return True
    starts_like_narration = re.match(
        r"^(đoàn|hắn|nàng|anh|cô|tôi|người|sau khi|khi đó|trong|ngoài|bên|cả|một|hai|ba)\s",
        s.lower(),
    )
    return bool(starts_like_narration) and word_count >= 8
def _is_plausible_subtitle_line(line: str) -> bool:
    """Return True when ``line`` could serve as a chapter subtitle.

    The stripped line must be 2–160 characters long, must not itself be a
    chapter heading (with or without subtitle), must not look like body text
    and must not contain an http(s) URL.
    """
    candidate = line.strip()
    if not 2 <= len(candidate) <= 160:
        return False
    folded = _norm_title(candidate)
    looks_like_heading = _CHAPTER_WITH_SUBTITLE_RE.match(folded) or _CHAPTER_NUM_ONLY_RE.match(folded)
    if looks_like_heading or _looks_like_body_paragraph(candidate):
        return False
    return re.search(r"https?://", candidate, re.IGNORECASE) is None
def _is_generic_chapter_title(title: str, number: int) -> bool:
    """Return True when ``title`` is empty or is just "Chapter <number>".

    An empty title is always generic. For a non-positive ``number`` a
    non-empty title is never generic. Otherwise the title is generic when it
    is "Chương N", "Ch. N" or "Chapter N" (case-insensitive), or when its
    normalized form equals that of "Chương N".
    """
    current = (title or "").strip()
    if not current:
        return True
    n = int(number or 0)
    if n <= 0:
        return False
    generic_forms = (
        rf"Chương\s*{n}\s*",
        rf"Ch\.?\s*{n}\s*",
        rf"Chapter\s*{n}\s*",
    )
    if any(re.fullmatch(form, current, re.IGNORECASE) for form in generic_forms):
        return True
    return _norm_title(current) == _norm_title(f"Chương {n}")
def _infer_chapter_title_from_content(txt: str, number: int, fallback: str = "") -> str:
lines = [line.strip().lstrip("#").strip() for line in (txt or "").splitlines() if line.strip()]
for idx, line in enumerate(lines[:15]):
normalized = _norm_title(line)
if not normalized:
continue
if chapter_re.match(normalized):
return line
if chapter_num_re.match(normalized):
return line
inline = _CHAPTER_INLINE_SUBTITLE_RE.match(line.strip())
if inline:
subtitle = (inline.group(1) or "").strip()
if subtitle:
return subtitle
if _CHAPTER_WITH_SUBTITLE_RE.match(normalized):
return line.strip()
if _CHAPTER_NUM_PREFIX_RE.match(normalized):
if _CHAPTER_NUM_ONLY_RE.match(normalized):
if idx + 1 < len(lines):
next_line = lines[idx + 1].strip()
if _is_plausible_subtitle_line(next_line):
return next_line
return line.strip()
return line.strip()
if lines:
first = lines[0]
if len(first) <= 160 and len(first.split()) >= 3:
# Prefer human-readable first heading over EPUB internal filename.
if "/" in fallback or fallback.lower().endswith(".xhtml"):
first = lines[0].strip()
if _is_plausible_subtitle_line(first):
if "/" in (fallback or "") or str(fallback or "").lower().endswith(".xhtml"):
return first
if len(first.split()) >= 2:
return first
if fallback and "/" not in fallback and not fallback.lower().endswith(".xhtml"):
return fallback
cleaned_fallback = (fallback or "").strip()
if cleaned_fallback and "/" not in cleaned_fallback and not cleaned_fallback.lower().endswith(".xhtml"):
if not _is_generic_chapter_title(cleaned_fallback, number):
return cleaned_fallback
return f"Chương {number}"
def _derive_chapter_title(txt: str, fallback: str, number: int) -> str:
    """Forward to `_infer_chapter_title_from_content`.

    Note the argument order differs: this wrapper takes ``(txt, fallback,
    number)`` and passes them on as ``(txt, number, fallback)``.
    """
    return _infer_chapter_title_from_content(txt, number, fallback)
def _extract_title_chapter_number(title: str) -> int | None:
normalized = _norm_title(title or "")
if not normalized:
@@ -2783,6 +2925,25 @@ def _filter_toc_chapters(chapters: list[dict[str, Any]]) -> list[dict[str, Any]]
return out
def _resolve_epub_split_mode(split_mode: str | None) -> str:
raw = (split_mode or "toc").strip().lower()
if raw == "regex":
return "regex"
if raw in {"tag", "html_tag", "html-tag", "htmltag"}:
return "tag"
return "toc"
def _normalize_chapter_html_tag(tag: str | None) -> str:
cleaned = (tag or "a").strip().lower()
if not re.fullmatch(r"[a-z][a-z0-9]*", cleaned):
raise HTTPException(
status_code=400,
detail="chapterTag must be a simple HTML tag name (letters/digits), e.g. a, h2",
)
return cleaned
def _extract_epub_chapters_by_regex(epub_path: Path, chapter_start_pattern: str) -> list[dict[str, Any]]:
chapters = _extract_epub_chapters(epub_path)
pattern = chapter_start_pattern.strip()
@@ -2852,7 +3013,12 @@ def _chapter_preview_samples(chapters: list[dict[str, Any]], sample_size: int =
return out
def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_pattern: str | None) -> list[dict[str, Any]]:
def _epub_extract_with_mode(
epub_path: Path,
split_mode: str,
chapter_start_pattern: str | None,
chapter_tag: str | None = None,
) -> list[dict[str, Any]]:
if split_mode == "regex":
default_vi_regex = r"^\s*(?:[#>*\-\[]\s*)*(?:ch(?:u\.?|ương|uong)?|chapter|hồi|hoi|quyển|quyen|phần|phan|tập|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+$"
effective_pattern = chapter_start_pattern or default_vi_regex
@@ -2860,6 +3026,30 @@ def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_patt
return _normalize_chapter_sequence(_extract_epub_chapters_by_regex(epub_path, effective_pattern))
except re.error as exc:
raise HTTPException(status_code=400, detail=f"Invalid chapterStartPattern: {exc}") from exc
if split_mode == "tag":
from app.epub_parser import build_merged_html_from_epub, extract_chapters_by_html_tag
effective_tag = _normalize_chapter_html_tag(chapter_tag)
merged, tag_stats = extract_chapters_by_html_tag(epub_path, effective_tag)
if not merged:
merged_html = build_merged_html_from_epub(epub_path)
tag_opens = int(tag_stats.get("tagOpens") or 0)
if not merged_html.strip():
detail = "EPUB không có nội dung HTML trong các file document."
elif tag_opens == 0:
detail = (
f"Không tìm thấy thẻ <{effective_tag}> trong EPUB. "
f"Thử thẻ khác (h2, h1, p) hoặc chế độ TOC/Regex."
)
else:
filtered = int(tag_stats.get("tagOpensFiltered") or 0)
extra = f" (đã lọc bỏ {filtered} thẻ <a> không giống mục chương)" if filtered else ""
detail = (
f"Tìm thấy {tag_opens} thẻ <{effective_tag}>{extra} "
f"nhưng không tạo được chương có nội dung. Thử thẻ khác hoặc TOC/Regex."
)
raise HTTPException(status_code=400, detail=detail)
return _normalize_chapter_sequence(merged)
return _normalize_chapter_sequence(_extract_epub_chapters(epub_path))
@@ -3346,6 +3536,236 @@ def _map_genres_to_existing(candidates: list[str], existing_genres: list[str], *
_ROUTER_MODEL_CACHE: dict[str, Any] = {"expires_at": 0.0, "models": []}
# Maximum number of model ids returned by `_router_pick_models_from_candidates`.
_ROUTER_PICK_LIMIT = 8
# Per-family cap for the first selection pass. The caps add up to more than
# `_ROUTER_PICK_LIMIT`, so families late in the pick order can be truncated away.
_ROUTER_FAMILY_PICK_LIMITS: dict[str, int] = {
    "openai": 3,
    "deepseek": 4,
    "claude": 2,
    "gemini": 2,
    "other": 2,
}
# Order in which families are visited when picking models.
_ROUTER_FAMILY_PICK_ORDER: tuple[str, ...] = ("openai", "deepseek", "claude", "gemini", "other")
def _router_model_family(model_id: str) -> str:
    """Classify a router model id into a provider family.

    Checks run in a fixed order (openai, deepseek, claude, gemini) on the
    lower-cased id; the first hit wins and anything else is ``"other"``.
    """
    lowered = model_id.lower()

    def mentions(*needles: str) -> bool:
        return any(needle in lowered for needle in needles)

    if mentions("gpt") or lowered.startswith("openai/"):
        return "openai"
    if mentions("deepseek", "/ds/") or lowered.startswith("ds/"):
        return "deepseek"
    if mentions("claude", "anthropic"):
        return "claude"
    if mentions("gemini", "google"):
        return "gemini"
    return "other"
def _router_pick_models_from_candidates(candidates: list[tuple[int, str]]) -> list[str]:
    """Choose up to ``_ROUTER_PICK_LIMIT`` model ids from scored candidates.

    First pass: visit families in ``_ROUTER_FAMILY_PICK_ORDER`` and take each
    family's best models (highest score first, id as tie-breaker) up to its
    cap. Second pass: if there is still room, fill with the best remaining
    candidates overall. Duplicates are never added.
    """

    def rank(entry: tuple[int, str]) -> tuple[int, str]:
        return (-entry[0], entry[1])

    grouped: dict[str, list[tuple[int, str]]] = {}
    for entry in candidates:
        grouped.setdefault(_router_model_family(entry[1]), []).append(entry)

    chosen: list[str] = []
    for family_name in _ROUTER_FAMILY_PICK_ORDER:
        quota = _ROUTER_FAMILY_PICK_LIMITS.get(family_name, 1)
        ranked = sorted(grouped.get(family_name, []), key=rank)
        for _, candidate_id in ranked[:quota]:
            if candidate_id not in chosen:
                chosen.append(candidate_id)

    if len(chosen) < _ROUTER_PICK_LIMIT:
        for _, candidate_id in sorted(candidates, key=rank):
            if len(chosen) >= _ROUTER_PICK_LIMIT:
                break
            if candidate_id not in chosen:
                chosen.append(candidate_id)

    return chosen[:_ROUTER_PICK_LIMIT]
def _router_model_priority_score(model_id: str) -> int:
    """Return the preference score of a model id (higher is preferred).

    Checked in order: "gpt-5.5" (1000), "gpt-5" (900), any deepseek-family id
    (850), "claude" (700), "gemini" (650); everything else scores 100.
    """
    lowered = model_id.lower()
    for marker, score in (("gpt-5.5", 1000), ("gpt-5", 900)):
        if marker in lowered:
            return score
    if _router_model_family(model_id) == "deepseek":
        return 850
    for marker, score in (("claude", 700), ("gemini", 650)):
        if marker in lowered:
            return score
    return 100
def _router_parse_http_json(raw: str) -> Any:
    """Decode a router HTTP body as JSON, tolerating trailing noise.

    Everything from the first ``data: [DONE]`` sentinel onwards is discarded.
    If the remainder is not a single JSON document, the first JSON value at
    the start of the text is returned instead.

    Raises:
        ValueError: when the body is empty (``json.JSONDecodeError`` when no
            JSON value can be decoded).
    """
    body = (raw or "").strip()
    if not body:
        raise ValueError("empty router response body")
    json_part, sentinel, _ = body.partition("data: [DONE]")
    if sentinel:
        body = json_part.rstrip()
    try:
        return json.loads(body)
    except json.JSONDecodeError:
        return json.JSONDecoder().raw_decode(body)[0]
def _router_collect_sse_payloads(raw: str) -> list[dict[str, Any]]:
    """Collect the JSON objects carried by ``data:`` lines of an SSE body.

    Lines without the ``data:`` prefix, empty payloads, the ``[DONE]``
    sentinel, undecodable JSON and non-object JSON values are all skipped.
    """
    collected: list[dict[str, Any]] = []
    for raw_line in raw.splitlines():
        event_line = raw_line.strip()
        if not event_line.startswith("data:"):
            continue
        data = event_line[len("data:"):].strip()
        if data in ("", "[DONE]"):
            continue
        try:
            decoded = json.loads(data)
        except json.JSONDecodeError:
            continue
        if isinstance(decoded, dict):
            collected.append(decoded)
    return collected
def _router_merge_streaming_completion(payloads: list[dict[str, Any]]) -> dict[str, Any]:
    """Fold streamed completion chunks into one non-streaming-style response.

    For every choice of every chunk, ``content`` and ``reasoning_content`` are
    taken from ``delta`` (falling back to ``message`` when the delta has no
    such key) and concatenated in arrival order. The result always carries a
    single assistant message; ``reasoning_content`` is present only when some
    chunk supplied it.
    """
    fragments: dict[str, list[str]] = {"content": [], "reasoning_content": []}
    for payload in payloads:
        for choice in payload.get("choices") or []:
            delta = choice.get("delta") or {}
            message = choice.get("message") or {}
            for field, pieces in fragments.items():
                piece = delta.get(field)
                if piece is None:
                    piece = message.get(field)
                if piece:
                    pieces.append(str(piece))

    assistant_message: dict[str, Any] = {"role": "assistant", "content": ""}
    for field, pieces in fragments.items():
        if pieces:
            assistant_message[field] = "".join(pieces)
    return {"choices": [{"message": assistant_message}]}
def _router_parse_completion_body(raw: str, *, model_id: str) -> dict[str, Any]:
    """Turn a router completion body (plain JSON or SSE stream) into a dict.

    Bodies that contain ``data:`` lines are treated as a stream and merged;
    if no usable SSE payload is found, or the body is not a stream, it is
    parsed as regular JSON.

    Raises:
        ValueError: when the body is empty or the parsed JSON is not an object.
    """
    body = (raw or "").strip()
    if not body:
        raise ValueError("empty router response body")
    looks_streamed = body.startswith("data:") or "\ndata:" in body
    if looks_streamed:
        events = _router_collect_sse_payloads(body)
        if events:
            return _router_merge_streaming_completion(events)
    parsed = _router_parse_http_json(body)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"router response is not an object for model={model_id}")
def _router_strip_json_fences(text: str) -> str:
    """Remove a surrounding Markdown code fence (optionally tagged ``json``).

    Text that does not start with a fence is only stripped of surrounding
    whitespace.
    """
    body = text.strip()
    if not body.startswith("```"):
        return body
    body = re.sub(r"^```(?:json)?\s*", "", body, flags=re.IGNORECASE)
    body = re.sub(r"\s*```$", "", body)
    return body.strip()
def _router_parse_json_object(text: str) -> dict[str, Any] | None:
    """Extract a JSON object from model output, or return ``None``.

    After stripping code fences, three strategies are tried in order: parse
    the whole text, decode the first JSON value at the start of the text,
    then parse the span from the first ``{`` to the last ``}``. The first
    strategy that decodes decides the result: a dict is returned, any other
    JSON value yields ``None``.
    """
    candidate = _router_strip_json_fences(text)
    if not candidate:
        return None

    direct_attempts = (
        lambda: json.loads(candidate),
        lambda: json.JSONDecoder().raw_decode(candidate)[0],
    )
    for attempt in direct_attempts:
        try:
            value = attempt()
        except json.JSONDecodeError:
            continue
        return value if isinstance(value, dict) else None

    braces = re.search(r"\{[\s\S]*\}", candidate)
    if braces is None:
        return None
    try:
        value = json.loads(braces.group(0))
    except json.JSONDecodeError:
        return None
    return value if isinstance(value, dict) else None
def _router_normalize_message_content(content: Any) -> str:
    """Flatten a message ``content`` value into plain stripped text.

    ``None`` becomes ``""`` and strings are stripped. For lists, string items
    and the ``text`` of dict items (typed ``"text"`` or simply carrying a
    ``text`` key) are stripped, empty pieces are dropped and the rest is
    joined with newlines. Any other value is converted with ``str``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return str(content).strip()

    fragments: list[str] = []
    for part in content:
        if isinstance(part, str):
            fragment = part.strip()
        elif isinstance(part, dict) and (part.get("type") == "text" or "text" in part):
            fragment = str(part.get("text") or "").strip()
        else:
            fragment = ""
        if fragment:
            fragments.append(fragment)
    return "\n".join(fragments).strip()
def _router_extract_assistant_content(completion: dict[str, Any], model_id: str) -> str:
    """Return the assistant text of a completion, with per-family fallbacks.

    The normalized ``message.content`` of the first choice is used when
    non-empty. Otherwise, for deepseek-family models a JSON object is looked
    for in ``reasoning_content``, and for gemini-family models a ``parts``
    list is flattened. Returns ``""`` when nothing usable is found.
    """
    # Only the first choice is inspected; missing or empty structures fall back to {}.
    choice = (completion.get("choices") or [{}])[0] or {}
    message = choice.get("message") or {}
    family = _router_model_family(model_id)
    content = _router_normalize_message_content(message.get("content"))
    if content:
        return content
    if family == "deepseek":
        reasoning = str(message.get("reasoning_content") or "").strip()
        if reasoning:
            # First try the whole reasoning text, then only its last 4000 characters.
            parsed = _router_parse_json_object(reasoning)
            if parsed:
                return json.dumps(parsed, ensure_ascii=False)
            tail = reasoning[-4000:]
            parsed = _router_parse_json_object(tail)
            if parsed:
                return json.dumps(parsed, ensure_ascii=False)
    if family == "gemini":
        parts = message.get("parts")
        if isinstance(parts, list):
            return _router_normalize_message_content(parts)
    return ""
def _router_parse_suggest_result(completion: dict[str, Any], model_id: str) -> dict[str, Any] | None:
    """Extract the suggestion JSON object from a completion.

    Returns ``None`` when the completion has no assistant text or that text
    does not contain a non-empty JSON object.
    """
    assistant_text = _router_extract_assistant_content(completion, model_id)
    if not assistant_text:
        return None
    return _router_parse_json_object(assistant_text) or None
def _normalize_vietnamese_novel_status(raw: str | None) -> str:
@@ -3379,30 +3799,20 @@ async def _router_pick_models() -> list[str]:
headers=headers,
)
response.raise_for_status()
for item in (response.json().get("data") or []):
models_payload = _router_parse_http_json(response.text)
for item in (models_payload.get("data") or []):
model_id = str(item.get("id") or "").strip()
if not model_id:
continue
low = model_id.lower()
if any(x in low for x in ["vision", "image", "audio", "realtime", "embedding", "moderation"]):
continue
score = 0
if "gpt-5.5" in low:
score += 1000
elif "gpt-5" in low:
score += 900
elif "claude" in low:
score += 700
elif "gemini" in low:
score += 650
else:
score += 100
candidates.append((score, model_id))
except Exception:
candidates.append((_router_model_priority_score(model_id), model_id))
except Exception as exc:
logger.warning("router models list failed: %s", exc)
candidates = []
candidates.sort(key=lambda x: x[0], reverse=True)
picked = [m for _, m in candidates[:6]]
picked = _router_pick_models_from_candidates(candidates)
_ROUTER_MODEL_CACHE["models"] = picked
_ROUTER_MODEL_CACHE["expires_at"] = now + 600
return picked
@@ -3479,6 +3889,7 @@ async def _router_ai_suggest(
for model_id in models:
payload = dict(base_payload)
payload["model"] = model_id
family = _router_model_family(model_id)
try:
async with httpx.AsyncClient(timeout=45.0) as client:
response = await client.post(
@@ -3486,10 +3897,24 @@ async def _router_ai_suggest(
headers=headers,
json=payload,
)
response.raise_for_status()
data = response.json()
content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
parsed = json.loads(content) if isinstance(content, str) else {}
if response.status_code >= 400:
logger.info(
"router ai-suggest skip model=%s family=%s status=%s body=%s",
model_id,
family,
response.status_code,
(response.text or "")[:240],
)
continue
completion = _router_parse_completion_body(response.text, model_id=model_id)
parsed = _router_parse_suggest_result(completion, model_id)
if not parsed:
logger.info(
"router ai-suggest skip model=%s family=%s reason=unparseable_content",
model_id,
family,
)
continue
raw_genres = [str(g).strip() for g in (parsed.get("genres") or []) if str(g).strip()][:6]
genres = _map_genres_to_existing(raw_genres, existing_genres, limit=6)
short_description = str(parsed.get("shortDescription") or "").strip()
@@ -3500,6 +3925,13 @@ async def _router_ai_suggest(
confidence = 0.0
confidence = max(0.0, min(1.0, confidence))
if not short_description or not genres:
logger.info(
"router ai-suggest skip model=%s family=%s reason=empty_fields genres=%s desc_len=%s",
model_id,
family,
len(genres),
len(short_description),
)
continue
return {
"suggestedGenres": genres,
@@ -3508,7 +3940,13 @@ async def _router_ai_suggest(
"model": model_id,
"suggestedStatus": novel_status,
}
except Exception:
except Exception as exc:
logger.info(
"router ai-suggest skip model=%s family=%s reason=exception err=%s",
model_id,
family,
exc,
)
continue
return None
@@ -3570,6 +4008,7 @@ async def mod_epub_ai_suggest(
file: UploadFile = File(...),
splitMode: str | None = Form(default=None),
chapterRegex: str | None = Form(default=None),
chapterTag: str | None = Form(default=None),
title: str | None = Form(default=None),
authorName: str | None = Form(default=None),
db: AsyncSession = Depends(get_db_session),
@@ -3587,11 +4026,12 @@ async def mod_epub_ai_suggest(
tmp_path = Path(tmp.name)
try:
mode = "regex" if (splitMode or "").lower() == "regex" else "toc"
mode = _resolve_epub_split_mode(splitMode)
pattern = (chapterRegex or "").strip() or None
effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None
source_sections = _extract_epub_chapters(tmp_path)
sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections
chapters = _epub_extract_with_mode(tmp_path, mode, pattern)
chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag)
meta = _extract_epub_metadata(tmp_path)
resolved_title = " ".join((title or str(meta.get("title") or tmp_path.stem)).split()).strip() or tmp_path.stem
resolved_author = " ".join((authorName or str(meta.get("author") or "Unknown")).split()).strip() or "Unknown"
@@ -4198,20 +4638,7 @@ async def mobile_login(payload: MobileLoginPayload, db: AsyncSession = Depends(g
if not payload.googleIdToken.strip():
raise HTTPException(status_code=400, detail="googleIdToken is required")
allowed_client_ids = settings.google_client_id_list
try:
id_info = google_id_token.verify_oauth2_token(
payload.googleIdToken,
google_requests.Request(),
None,
)
except Exception as exc:
raise HTTPException(status_code=401, detail="Invalid Google token") from exc
aud = (id_info.get("aud") or "").strip()
if allowed_client_ids and aud not in set(allowed_client_ids):
raise HTTPException(status_code=401, detail="Invalid Google token audience")
id_info = verify_google_id_token(payload.googleIdToken)
email = id_info.get("email")
if not email: