feat(auth, epub): enhance Google token verification and EPUB chapter extraction
Build and Push Reader API Image / docker (push) Successful in 14s

- Added Google token verification logic to improve security and ensure valid tokens are processed.
- Introduced functions for extracting chapters from EPUB files based on HTML tags, including support for chapter markers.
- Updated `.env.example` to include configuration for an OpenAI-compatible router.
- Refactored existing functions for better readability and maintainability.
This commit is contained in:
2026-05-19 00:15:20 +07:00
parent 611213ae5a
commit bddd592146
4 changed files with 754 additions and 68 deletions
+5
View File
@@ -16,3 +16,8 @@ CORS_ORIGINS=http://localhost:3000,http://127.0.0.1:3000
# Environment label # Environment label
APP_ENV=development APP_ENV=development
# OpenAI-compatible router (9router / OpenRouter) used for AI suggestions during EPUB import
# List available models: GET {ROUTER_BASE_URL}/models
ROUTER_BASE_URL=http://192.168.100.146:20128/v1
ROUTER_API_KEY=
+101
View File
@@ -1,10 +1,13 @@
from __future__ import annotations from __future__ import annotations
import datetime as dt import datetime as dt
import logging
import os import os
from typing import Any from typing import Any
from fastapi import Depends, HTTPException, Request from fastapi import Depends, HTTPException, Request
from google.auth.transport import requests as google_requests
from google.oauth2 import id_token as google_id_token
from jose import JWTError, jwt from jose import JWTError, jwt
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
@@ -12,6 +15,8 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings from app.config import settings
from app.database import get_db_session from app.database import get_db_session
logger = logging.getLogger(__name__)
SESSION_COOKIE_KEYS = [ SESSION_COOKIE_KEYS = [
"next-auth.session-token", "next-auth.session-token",
"__Secure-next-auth.session-token", "__Secure-next-auth.session-token",
@@ -21,6 +26,102 @@ SESSION_COOKIE_KEYS = [
] ]
ACCESS_TOKEN_TTL_SECONDS = 7 * 24 * 60 * 60 ACCESS_TOKEN_TTL_SECONDS = 7 * 24 * 60 * 60
# Clock-skew allowance, in seconds, handed to Google's verifier as `clock_skew_in_seconds`.
GOOGLE_TOKEN_CLOCK_SKEW_SECONDS = 60
def _google_token_audiences_to_try(token: str) -> list[str | None]:
    """Return the ordered, de-duplicated audiences to attempt verification with.

    Configured client IDs come first, followed by the ``aud`` / ``azp`` values
    read from the token's *unverified* claims. Values are stripped and blank
    ones dropped. When nothing usable is found the result is ``[None]``.
    """
    # A dict keeps first-seen order and gives de-duplication for free.
    ordered: dict[str | None, None] = {}

    def remember(candidate: str | None) -> None:
        if candidate is not None:
            candidate = candidate.strip()
            if not candidate:
                return
        ordered.setdefault(candidate, None)

    for configured_id in settings.google_client_id_list:
        remember(configured_id)

    try:
        unverified = jwt.get_unverified_claims(token)
        for claim_name in ("aud", "azp"):
            claim_value = unverified.get(claim_name)
            if isinstance(claim_value, str):
                remember(claim_value)
            elif isinstance(claim_value, list):
                for entry in claim_value:
                    if isinstance(entry, str):
                        remember(entry)
    except Exception:
        # Best effort only: an undecodable token simply contributes no extra
        # audiences; the caller's verification step reports the real failure.
        pass

    return list(ordered) or [None]
def verify_google_id_token(raw_token: str) -> dict[str, Any]:
    """Verify a Google ID token and return its claims.

    Each audience from ``_google_token_audiences_to_try`` is tried in turn.
    When client IDs are configured, the verified ``aud`` / ``azp`` values must
    intersect them. With no configured client IDs, any token that verifies is
    accepted.

    Raises:
        HTTPException: 400 if the value is not shaped like a JWT, 503 if the
            last error text hints at a network/TLS problem, 401 otherwise.
    """
    token = raw_token.strip()
    if token.count(".") != 2:
        raise HTTPException(status_code=400, detail="googleIdToken must be a JWT")

    transport = google_requests.Request()
    failure: Exception | None = None

    for audience in _google_token_audiences_to_try(token):
        try:
            id_info = google_id_token.verify_oauth2_token(
                token,
                transport,
                audience,
                clock_skew_in_seconds=GOOGLE_TOKEN_CLOCK_SKEW_SECONDS,
            )
            aud = id_info.get("aud")
            allowed = set(settings.google_client_id_list)
            if not allowed:
                return id_info

            presented: set[str] = set()
            if isinstance(aud, str):
                presented.add(aud)
            elif isinstance(aud, list):
                presented.update(str(item) for item in aud)
            azp = id_info.get("azp")
            if isinstance(azp, str):
                presented.add(azp)

            if not presented.isdisjoint(allowed):
                return id_info
            # Signature was fine but the token was issued for someone else.
            failure = ValueError(f"token audience not allowed: {presented}")
        except Exception as exc:
            failure = exc

    # Every attempt failed: log what can be read from the token for diagnostics.
    token_length = len(token)
    try:
        claims = jwt.get_unverified_claims(token)
        logger.warning(
            "google id token rejected len=%s iss=%s aud=%s azp=%s exp=%s err=%s",
            token_length,
            claims.get("iss"),
            claims.get("aud"),
            claims.get("azp"),
            claims.get("exp"),
            failure,
        )
    except Exception:
        logger.warning("google id token rejected len=%s err=%s", token_length, failure)

    reason = str(failure or "").lower()
    network_hints = ("certificate", "connection", "timeout", "urlopen", "ssl", "network")
    if any(hint in reason for hint in network_hints):
        raise HTTPException(
            status_code=503,
            detail="Unable to verify Google token (reader-api cannot reach googleapis.com)",
        ) from failure
    raise HTTPException(status_code=401, detail="Invalid Google token") from failure
def _jwt_secret() -> str: def _jwt_secret() -> str:
+154 -1
View File
@@ -1,5 +1,7 @@
from __future__ import annotations from __future__ import annotations
import html as html_lib
import re
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -7,8 +9,13 @@ import html2text
from ebooklib import ITEM_DOCUMENT from ebooklib import ITEM_DOCUMENT
from ebooklib import epub as epublib from ebooklib import epub as epublib
_CHAPTER_MARKER_TEXT_RE = re.compile(
r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+",
re.IGNORECASE,
)
def _html_to_text(html_content: str) -> str:
def html_to_text(html_content: str) -> str:
h = html2text.HTML2Text() h = html2text.HTML2Text()
h.ignore_links = True h.ignore_links = True
h.ignore_images = True h.ignore_images = True
@@ -17,6 +24,20 @@ def _html_to_text(html_content: str) -> str:
return h.handle(html_content).strip() return h.handle(html_content).strip()
def _html_to_text(html_content: str) -> str:
    # Thin delegate to html_to_text; presumably kept so callers of the old
    # private name keep working (verify against call sites).
    return html_to_text(html_content)
def build_merged_html_from_epub(epub_path: Path) -> str:
    """Concatenate the HTML of every document item in the EPUB.

    Items are decoded as UTF-8 (undecodable bytes are replaced), items that are
    blank after decoding are skipped, and the rest are joined with newlines.
    """
    book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
    decoded_documents = (
        item.get_content().decode("utf-8", errors="replace")
        for item in book.get_items_of_type(ITEM_DOCUMENT)
    )
    return "\n".join(document for document in decoded_documents if document.strip())
def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]: def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False}) book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False})
out: list[dict[str, Any]] = [] out: list[dict[str, Any]] = []
@@ -36,3 +57,135 @@ def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]:
) )
idx += 1 idx += 1
return out return out
def count_html_tag_opens(html: str, tag: str) -> int:
    """Count opening tags ``<tag ...>`` in *html*, case-insensitively.

    Args:
        html: Markup to scan.
        tag: Tag name; surrounding whitespace and letter case are ignored.

    Returns:
        Number of occurrences of ``<tag`` followed by a word boundary.
        A blank *tag* yields 0 (it would otherwise match every tag).
    """
    tag_name = tag.strip().lower()
    if not tag_name:
        # An empty name would compile to r"<\b" and count *all* tags.
        return 0
    opener = re.compile(rf"<{re.escape(tag_name)}\b", re.IGNORECASE)
    return sum(1 for _ in opener.finditer(html or ""))
def _strip_tags_to_text(fragment: str) -> str:
return html_lib.unescape(re.sub(r"<[^>]+>", " ", fragment or "")).strip()
def _title_from_tag_opening(opening_attrs: str, fragment: str, tag: str) -> str:
tag_re = re.escape(tag)
for attr in ("title", "alt"):
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
if match:
title = html_lib.unescape(match.group(1)).strip()
if title and len(title) <= 160:
return title
for attr in ("id", "name"):
match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE)
if match:
title = html_lib.unescape(match.group(1)).strip()
if title and not title.startswith("#") and len(title) <= 160:
return title
close_match = re.search(
rf"<{tag_re}\b[^>]*>(.*?)</{tag_re}>",
fragment,
flags=re.IGNORECASE | re.DOTALL,
)
if not close_match:
return ""
inner = _strip_tags_to_text(close_match.group(1))
if inner and len(inner) <= 160:
return inner
return ""
def _anchor_seems_chapter_marker(opening_attrs: str, inner_text: str) -> bool:
text = (inner_text or "").strip()
if text and _CHAPTER_MARKER_TEXT_RE.search(text):
return True
attrs = opening_attrs or ""
if re.search(r'\bhref\s*=\s*["\'][^"\']*\.xhtml', attrs, flags=re.IGNORECASE):
return True
if re.search(
r'\b(?:id|name)\s*=\s*["\'][^"\']*(?:chuong|chương|chapter|ch\d|c\d|hoi|hồi)',
attrs,
flags=re.IGNORECASE,
):
return True
# TOC / nav links thường có text ngắn.
if text and len(text) <= 120:
return True
return False
def _derive_simple_chapter_title(txt: str, number: int) -> str:
for line in (txt or "").splitlines():
cleaned = line.strip()
if cleaned:
return cleaned[:160]
return f"Chương {number}"
def extract_chapters_by_html_tag(
    epub_path: Path,
    tag: str,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Split the EPUB into chapters at every opening ``<tag ...>``.

    Returns ``(chapters, stats)``. Each chapter is a dict with ``number``,
    ``title``, ``raw_html`` and ``txt``; ``stats`` carries ``tagOpens``,
    ``tagOpensUsed`` and ``tagOpensFiltered``. Content before the first
    matching tag is not part of any chapter.
    """
    merged_html = build_merged_html_from_epub(epub_path)
    stats = {"tagOpens": 0, "tagOpensUsed": 0, "tagOpensFiltered": 0}
    if not merged_html.strip():
        return [], stats

    tag_name = tag.strip().lower()
    tag_re = re.escape(tag_name)
    openers = list(re.finditer(rf"<({tag_re})\b([^>]*)>", merged_html, flags=re.IGNORECASE))
    stats["tagOpens"] = len(openers)
    if not openers:
        return [], stats

    # With very many anchors, keep only those that look like chapter markers.
    if tag_name == "a" and len(openers) > 300:
        closer_re = re.compile(rf"</{tag_re}>", re.IGNORECASE)

        def looks_like_marker(opener: re.Match[str]) -> bool:
            # Only the 800 characters after the opening tag are inspected.
            lookahead = merged_html[opener.end() : opener.end() + 800]
            closer = closer_re.search(lookahead)
            inner_html = lookahead[: closer.start()] if closer else lookahead
            return _anchor_seems_chapter_marker(
                opener.group(2) or "",
                _strip_tags_to_text(inner_html),
            )

        kept = [opener for opener in openers if looks_like_marker(opener)]
        if kept:  # if the filter removes everything, fall back to all anchors
            stats["tagOpensFiltered"] = len(openers) - len(kept)
            openers = kept

    tag_only_re = re.compile(
        rf"<{tag_re}\b[^>]*>\s*(?:</{tag_re}>\s*)?",
        re.IGNORECASE | re.DOTALL,
    )
    # Each chapter runs from its opener up to the next opener (or end of document).
    boundaries = [opener.start() for opener in openers[1:]] + [len(merged_html)]

    chapters: list[dict[str, Any]] = []
    for opener, end in zip(openers, boundaries):
        raw_html = merged_html[opener.start() : end].strip()
        if not raw_html:
            continue
        txt = html_to_text(raw_html)
        inline_title = _title_from_tag_opening(opener.group(2) or "", raw_html, tag_name)
        # Skip an empty anchor that has neither a title nor any following content.
        if not txt.strip() and not inline_title and tag_only_re.fullmatch(raw_html):
            continue
        number = len(chapters) + 1
        chapters.append(
            {
                "number": number,
                "title": inline_title or _derive_simple_chapter_title(txt, number),
                "raw_html": raw_html,
                "txt": txt,
            }
        )

    stats["tagOpensUsed"] = len(openers)
    return chapters, stats
+493 -66
View File
@@ -4,7 +4,9 @@ import asyncio
import base64 import base64
import datetime as dt import datetime as dt
import hashlib import hashlib
import html as html_lib
import json import json
import logging
import os import os
import random import random
import re import re
@@ -24,17 +26,17 @@ from fastapi import Body, Depends, FastAPI, File, Form, HTTPException, Query, Re
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
import httpx import httpx
from fastapi.responses import Response from fastapi.responses import Response
from google.auth.transport import requests as google_requests
from google.oauth2 import id_token as google_id_token
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user, verify_google_id_token
from app.config import settings from app.config import settings
from app.database import get_db_session from app.database import get_db_session
from app.storage import storage from app.storage import storage
logger = logging.getLogger(__name__)
# Giới hạn chương EPUB chỉ khi client gửi `enforceMaxChapters=true` (import nhiều / batch). # Giới hạn chương EPUB chỉ khi client gửi `enforceMaxChapters=true` (import nhiều / batch).
MOD_EPUB_MAX_CHAPTERS = 4000 MOD_EPUB_MAX_CHAPTERS = 4000
@@ -1673,18 +1675,19 @@ async def mod_delete_chapter(
@app.post("/api/mod/chuong/bulk-delete") @app.post("/api/mod/chuong/bulk-delete")
async def mod_bulk_delete_chapters( async def mod_bulk_delete_chapters(
payload: ModChapterBulkDeletePayload, payload: dict[str, Any] = Body(...),
db: AsyncSession = Depends(get_db_session), db: AsyncSession = Depends(get_db_session),
user: dict = Depends(require_current_user), user: dict = Depends(require_current_user),
): ):
if user.get("role") not in ("MOD", "ADMIN"): if user.get("role") not in ("MOD", "ADMIN"):
raise HTTPException(status_code=403, detail="Forbidden") raise HTTPException(status_code=403, detail="Forbidden")
from_num = min(payload.fromNumber, payload.toNumber) parsed = ModChapterBulkDeletePayload.model_validate(payload)
to_num = max(payload.fromNumber, payload.toNumber) from_num = min(parsed.fromNumber, parsed.toNumber)
to_num = max(parsed.fromNumber, parsed.toNumber)
ids = ( ids = (
await db.execute( await db.execute(
text('SELECT id FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num'), text('SELECT id FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num'),
{"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num}, {"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num},
) )
).mappings().all() ).mappings().all()
chapter_ids = [str(r["id"]) for r in ids] chapter_ids = [str(r["id"]) for r in ids]
@@ -1693,14 +1696,63 @@ async def mod_bulk_delete_chapters(
deleted_count = ( deleted_count = (
await db.execute( await db.execute(
text('DELETE FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num RETURNING id'), text('DELETE FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num RETURNING id'),
{"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num}, {"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num},
) )
).mappings().all() ).mappings().all()
await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": payload.novelId}) await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": parsed.novelId})
await db.commit() await db.commit()
return {"deletedCount": len(deleted_count)} return {"deletedCount": len(deleted_count)}
@app.post("/api/mod/chuong/normalize-titles/preview")
async def mod_normalize_chapter_titles_preview(
    payload: dict[str, Any] = Body(...),
    db: AsyncSession = Depends(get_db_session),
    user: dict = Depends(require_current_user),
):
    # Dry run: list chapters of a novel whose title would change if it were
    # re-inferred from the chapter content. Nothing is written.
    # Responds 403 unless the user role is MOD or ADMIN, 400 for a blank novelId.
    if user.get("role") not in ("MOD", "ADMIN"):
        raise HTTPException(status_code=403, detail="Forbidden")
    parsed = ModNormalizeTitlesPreviewPayload.model_validate(payload)
    novel_id = parsed.novelId.strip()
    if not novel_id:
        raise HTTPException(status_code=400, detail="novelId is required")

    rows = (
        await db.execute(
            text('SELECT id, number, title FROM "ChapterMeta" WHERE "novelId" = :novel_id ORDER BY number ASC'),
            {"novel_id": novel_id},
        )
    ).mappings().all()

    items: list[dict[str, Any]] = []
    for row in rows:
        chapter_id = str(row["id"])
        number = int(row.get("number") or 0)
        current_title = str(row.get("title") or "").strip()

        # Decide from metadata alone first: a chapter with a non-generic title
        # can never be reported in generic-only mode, so skip loading its content.
        if parsed.overwriteGenericOnly and not _is_generic_chapter_title(current_title, number):
            continue

        content = await _resolve_chapter_content(chapter_id, db) or ""
        suggested_title = _infer_chapter_title_from_content(content, number, current_title).strip()
        if not suggested_title or suggested_title == current_title:
            continue

        items.append(
            {
                "id": chapter_id,
                "number": number,
                "currentTitle": current_title,
                "suggestedTitle": suggested_title,
            }
        )

    return {
        "novelId": novel_id,
        "scannedCount": len(rows),
        "changeCount": len(items),
        "items": items,
    }
@app.put("/api/mod/chuong/optimize") @app.put("/api/mod/chuong/optimize")
async def mod_optimize_chapters( async def mod_optimize_chapters(
payload: dict[str, Any] = Body(...), payload: dict[str, Any] = Body(...),
@@ -1866,6 +1918,7 @@ async def mod_epub_upload(
preview: str | None = Form(default=None), preview: str | None = Form(default=None),
splitMode: str | None = Form(default=None), splitMode: str | None = Form(default=None),
chapterRegex: str | None = Form(default=None), chapterRegex: str | None = Form(default=None),
chapterTag: str | None = Form(default=None),
title: str | None = Form(default=None), title: str | None = Form(default=None),
originalTitle: str | None = Form(default=None), originalTitle: str | None = Form(default=None),
authorName: str | None = Form(default=None), authorName: str | None = Form(default=None),
@@ -1891,11 +1944,12 @@ async def mod_epub_upload(
tmp_path = Path(tmp.name) tmp_path = Path(tmp.name)
try: try:
mode = "regex" if (splitMode or "").lower() == "regex" else "toc" mode = _resolve_epub_split_mode(splitMode)
pattern = (chapterRegex or "").strip() or None pattern = (chapterRegex or "").strip() or None
effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None
source_sections = _extract_epub_chapters(tmp_path) source_sections = _extract_epub_chapters(tmp_path)
sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections
chapters = _epub_extract_with_mode(tmp_path, mode, pattern) chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag)
epub_meta = _extract_epub_metadata(tmp_path) epub_meta = _extract_epub_metadata(tmp_path)
inferred_title = str(epub_meta.get("title") or Path(file.filename or "novel").stem) inferred_title = str(epub_meta.get("title") or Path(file.filename or "novel").stem)
inferred_author = str(epub_meta.get("author") or "Unknown") inferred_author = str(epub_meta.get("author") or "Unknown")
@@ -1938,7 +1992,8 @@ async def mod_epub_upload(
"coverPreviewDataUrl": cover_data_url_b, "coverPreviewDataUrl": cover_data_url_b,
"parserInfo": { "parserInfo": {
"splitMode": mode, "splitMode": mode,
"chapterRegexUsed": pattern, "chapterRegexUsed": pattern if mode == "regex" else None,
"chapterTagUsed": effective_tag if mode == "tag" else None,
"sourceSections": len(source_sections), "sourceSections": len(source_sections),
"sectionsAfterFilter": len(sections_after_filter), "sectionsAfterFilter": len(sections_after_filter),
"sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)), "sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)),
@@ -1999,7 +2054,8 @@ async def mod_epub_upload(
"coverPreviewDataUrl": cover_preview_data_url, "coverPreviewDataUrl": cover_preview_data_url,
"parserInfo": { "parserInfo": {
"splitMode": mode, "splitMode": mode,
"chapterRegexUsed": pattern, "chapterRegexUsed": pattern if mode == "regex" else None,
"chapterTagUsed": effective_tag if mode == "tag" else None,
"sourceSections": len(source_sections), "sourceSections": len(source_sections),
"sectionsAfterFilter": len(sections_after_filter), "sectionsAfterFilter": len(sections_after_filter),
"sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)), "sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)),
@@ -2590,6 +2646,11 @@ class ModChapterOptimizePayload(BaseModel):
updates: list[ModChapterOptimizeItem] updates: list[ModChapterOptimizeItem]
class ModNormalizeTitlesPreviewPayload(BaseModel):
    # Request body validated by the normalize-titles preview endpoint.
    # Novel whose chapters are scanned; the endpoint rejects a blank value with 400.
    novelId: str
    # When true, only chapters whose current title is generic are reported.
    overwriteGenericOnly: bool = True
class ModChapterGlobalReplacePayload(BaseModel): class ModChapterGlobalReplacePayload(BaseModel):
novelId: str novelId: str
action: str action: str
@@ -2628,33 +2689,114 @@ def _asset_file_sha256(path: Path) -> str:
return h.hexdigest() return h.hexdigest()
def _derive_chapter_title(txt: str, fallback: str, number: int) -> str: _CHAPTER_HEADING_PREFIX = r"(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)"
lines = [line.strip().lstrip("#").strip() for line in txt.splitlines() if line.strip()] _CHAPTER_WITH_SUBTITLE_RE = re.compile(
chapter_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+", re.IGNORECASE) rf"^{_CHAPTER_HEADING_PREFIX}\s*\d+(?:[\.:\-\)]\s*|\s+).+",
chapter_num_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+", re.IGNORECASE) re.IGNORECASE,
)
_CHAPTER_NUM_ONLY_RE = re.compile(
rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)\s*[:\-\.]?\s*$",
re.IGNORECASE,
)
_CHAPTER_NUM_PREFIX_RE = re.compile(rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)", re.IGNORECASE)
_CHAPTER_INLINE_SUBTITLE_RE = re.compile(
r"^(?:Chương|Ch\.?|Chapter|Hồi|Quyển|Phần|Tập)\s*\d+(?:[\.:\-\)]\s*|\s+)(.+)$",
re.IGNORECASE,
)
for line in lines[:12]:
def _looks_like_body_paragraph(line: str) -> bool:
s = line.strip()
if not s:
return True
if len(s) > 200:
return True
if len(s) > 90 and s.endswith((".", "", "!", "?", "")):
return True
if len(s.split()) >= 10:
return True
low = s.lower()
if re.match(r"^(đoàn|hắn|nàng|anh|cô|tôi|người|sau khi|khi đó|trong|ngoài|bên|cả|một|hai|ba)\s", low):
if len(s.split()) >= 8:
return True
return False
def _is_plausible_subtitle_line(line: str) -> bool:
s = line.strip()
if not s or len(s) < 2 or len(s) > 160:
return False
normalized = _norm_title(s)
if _CHAPTER_WITH_SUBTITLE_RE.match(normalized) or _CHAPTER_NUM_ONLY_RE.match(normalized):
return False
if _looks_like_body_paragraph(s):
return False
if re.search(r"https?://", s, re.IGNORECASE):
return False
return True
def _is_generic_chapter_title(title: str, number: int) -> bool:
current = (title or "").strip()
if not current:
return True
n = int(number or 0)
if n <= 0:
return False
if re.fullmatch(rf"Chương\s*{n}\s*", current, re.IGNORECASE):
return True
if re.fullmatch(rf"Ch\.?\s*{n}\s*", current, re.IGNORECASE):
return True
if re.fullmatch(rf"Chapter\s*{n}\s*", current, re.IGNORECASE):
return True
return _norm_title(current) == _norm_title(f"Chương {n}")
def _infer_chapter_title_from_content(txt: str, number: int, fallback: str = "") -> str:
lines = [line.strip().lstrip("#").strip() for line in (txt or "").splitlines() if line.strip()]
for idx, line in enumerate(lines[:15]):
normalized = _norm_title(line) normalized = _norm_title(line)
if not normalized: if not normalized:
continue continue
if chapter_re.match(normalized):
return line inline = _CHAPTER_INLINE_SUBTITLE_RE.match(line.strip())
if chapter_num_re.match(normalized): if inline:
return line subtitle = (inline.group(1) or "").strip()
if subtitle:
return subtitle
if _CHAPTER_WITH_SUBTITLE_RE.match(normalized):
return line.strip()
if _CHAPTER_NUM_PREFIX_RE.match(normalized):
if _CHAPTER_NUM_ONLY_RE.match(normalized):
if idx + 1 < len(lines):
next_line = lines[idx + 1].strip()
if _is_plausible_subtitle_line(next_line):
return next_line
return line.strip()
return line.strip()
if lines: if lines:
first = lines[0] first = lines[0].strip()
if len(first) <= 160 and len(first.split()) >= 3: if _is_plausible_subtitle_line(first):
# Prefer human-readable first heading over EPUB internal filename. if "/" in (fallback or "") or str(fallback or "").lower().endswith(".xhtml"):
if "/" in fallback or fallback.lower().endswith(".xhtml"):
return first return first
if len(first.split()) >= 2:
return first return first
if fallback and "/" not in fallback and not fallback.lower().endswith(".xhtml"): cleaned_fallback = (fallback or "").strip()
return fallback if cleaned_fallback and "/" not in cleaned_fallback and not cleaned_fallback.lower().endswith(".xhtml"):
if not _is_generic_chapter_title(cleaned_fallback, number):
return cleaned_fallback
return f"Chương {number}" return f"Chương {number}"
def _derive_chapter_title(txt: str, fallback: str, number: int) -> str:
return _infer_chapter_title_from_content(txt, number, fallback)
def _extract_title_chapter_number(title: str) -> int | None: def _extract_title_chapter_number(title: str) -> int | None:
normalized = _norm_title(title or "") normalized = _norm_title(title or "")
if not normalized: if not normalized:
@@ -2783,6 +2925,25 @@ def _filter_toc_chapters(chapters: list[dict[str, Any]]) -> list[dict[str, Any]]
return out return out
def _resolve_epub_split_mode(split_mode: str | None) -> str:
raw = (split_mode or "toc").strip().lower()
if raw == "regex":
return "regex"
if raw in {"tag", "html_tag", "html-tag", "htmltag"}:
return "tag"
return "toc"
def _normalize_chapter_html_tag(tag: str | None) -> str:
cleaned = (tag or "a").strip().lower()
if not re.fullmatch(r"[a-z][a-z0-9]*", cleaned):
raise HTTPException(
status_code=400,
detail="chapterTag must be a simple HTML tag name (letters/digits), e.g. a, h2",
)
return cleaned
def _extract_epub_chapters_by_regex(epub_path: Path, chapter_start_pattern: str) -> list[dict[str, Any]]: def _extract_epub_chapters_by_regex(epub_path: Path, chapter_start_pattern: str) -> list[dict[str, Any]]:
chapters = _extract_epub_chapters(epub_path) chapters = _extract_epub_chapters(epub_path)
pattern = chapter_start_pattern.strip() pattern = chapter_start_pattern.strip()
@@ -2852,7 +3013,12 @@ def _chapter_preview_samples(chapters: list[dict[str, Any]], sample_size: int =
return out return out
def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_pattern: str | None) -> list[dict[str, Any]]: def _epub_extract_with_mode(
epub_path: Path,
split_mode: str,
chapter_start_pattern: str | None,
chapter_tag: str | None = None,
) -> list[dict[str, Any]]:
if split_mode == "regex": if split_mode == "regex":
default_vi_regex = r"^\s*(?:[#>*\-\[]\s*)*(?:ch(?:u\.?|ương|uong)?|chapter|hồi|hoi|quyển|quyen|phần|phan|tập|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+$" default_vi_regex = r"^\s*(?:[#>*\-\[]\s*)*(?:ch(?:u\.?|ương|uong)?|chapter|hồi|hoi|quyển|quyen|phần|phan|tập|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+$"
effective_pattern = chapter_start_pattern or default_vi_regex effective_pattern = chapter_start_pattern or default_vi_regex
@@ -2860,6 +3026,30 @@ def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_patt
return _normalize_chapter_sequence(_extract_epub_chapters_by_regex(epub_path, effective_pattern)) return _normalize_chapter_sequence(_extract_epub_chapters_by_regex(epub_path, effective_pattern))
except re.error as exc: except re.error as exc:
raise HTTPException(status_code=400, detail=f"Invalid chapterStartPattern: {exc}") from exc raise HTTPException(status_code=400, detail=f"Invalid chapterStartPattern: {exc}") from exc
if split_mode == "tag":
from app.epub_parser import build_merged_html_from_epub, extract_chapters_by_html_tag
effective_tag = _normalize_chapter_html_tag(chapter_tag)
merged, tag_stats = extract_chapters_by_html_tag(epub_path, effective_tag)
if not merged:
merged_html = build_merged_html_from_epub(epub_path)
tag_opens = int(tag_stats.get("tagOpens") or 0)
if not merged_html.strip():
detail = "EPUB không có nội dung HTML trong các file document."
elif tag_opens == 0:
detail = (
f"Không tìm thấy thẻ <{effective_tag}> trong EPUB. "
f"Thử thẻ khác (h2, h1, p) hoặc chế độ TOC/Regex."
)
else:
filtered = int(tag_stats.get("tagOpensFiltered") or 0)
extra = f" (đã lọc bỏ {filtered} thẻ <a> không giống mục chương)" if filtered else ""
detail = (
f"Tìm thấy {tag_opens} thẻ <{effective_tag}>{extra} "
f"nhưng không tạo được chương có nội dung. Thử thẻ khác hoặc TOC/Regex."
)
raise HTTPException(status_code=400, detail=detail)
return _normalize_chapter_sequence(merged)
return _normalize_chapter_sequence(_extract_epub_chapters(epub_path)) return _normalize_chapter_sequence(_extract_epub_chapters(epub_path))
@@ -3346,6 +3536,236 @@ def _map_genres_to_existing(candidates: list[str], existing_genres: list[str], *
_ROUTER_MODEL_CACHE: dict[str, Any] = {"expires_at": 0.0, "models": []} _ROUTER_MODEL_CACHE: dict[str, Any] = {"expires_at": 0.0, "models": []}
# Upper bound on how many model ids the picker returns.
_ROUTER_PICK_LIMIT = 8
# Per-family quota applied on the picker's first pass.
_ROUTER_FAMILY_PICK_LIMITS: dict[str, int] = {
    "openai": 3,
    "deepseek": 4,
    "claude": 2,
    "gemini": 2,
    "other": 2,
}
# Order in which families are visited when filling the pick list.
_ROUTER_FAMILY_PICK_ORDER: tuple[str, ...] = ("openai", "deepseek", "claude", "gemini", "other")
def _router_model_family(model_id: str) -> str:
low = model_id.lower()
if "gpt" in low or low.startswith("openai/"):
return "openai"
if "deepseek" in low or low.startswith("ds/") or "/ds/" in low:
return "deepseek"
if "claude" in low or "anthropic" in low:
return "claude"
if "gemini" in low or "google" in low:
return "gemini"
return "other"
def _router_pick_models_from_candidates(candidates: list[tuple[int, str]]) -> list[str]:
    """Choose up to ``_ROUTER_PICK_LIMIT`` model ids from scored candidates.

    First pass: walk the families in ``_ROUTER_FAMILY_PICK_ORDER`` and take
    each family's best models up to its quota. Second pass: if there is still
    room, top up from all candidates. "Best" means highest score, ties broken
    by model id.
    """

    def rank(entry: tuple[int, str]) -> tuple[int, str]:
        return (-entry[0], entry[1])

    grouped: dict[str, list[tuple[int, str]]] = {}
    for entry in candidates:
        grouped.setdefault(_router_model_family(entry[1]), []).append(entry)

    chosen: list[str] = []
    for family in _ROUTER_FAMILY_PICK_ORDER:
        quota = _ROUTER_FAMILY_PICK_LIMITS.get(family, 1)
        for _score, model_id in sorted(grouped.get(family, []), key=rank)[:quota]:
            if model_id not in chosen:
                chosen.append(model_id)

    if len(chosen) < _ROUTER_PICK_LIMIT:
        for _score, model_id in sorted(candidates, key=rank):
            if len(chosen) >= _ROUTER_PICK_LIMIT:
                break
            if model_id not in chosen:
                chosen.append(model_id)

    # The per-family quotas can add up to more than the overall limit.
    return chosen[:_ROUTER_PICK_LIMIT]
def _router_model_priority_score(model_id: str) -> int:
low = model_id.lower()
if "gpt-5.5" in low:
return 1000
if "gpt-5" in low:
return 900
if _router_model_family(model_id) == "deepseek":
return 850
if "claude" in low:
return 700
if "gemini" in low:
return 650
return 100
def _router_parse_http_json(raw: str) -> Any:
"""Parse OpenAI-compatible HTTP bodies (9router may append SSE sentinels)."""
text = (raw or "").strip()
if not text:
raise ValueError("empty router response body")
done_idx = text.find("data: [DONE]")
if done_idx != -1:
text = text[:done_idx].rstrip()
try:
return json.loads(text)
except json.JSONDecodeError:
decoder = json.JSONDecoder()
obj, _end = decoder.raw_decode(text)
return obj
def _router_collect_sse_payloads(raw: str) -> list[dict[str, Any]]:
payloads: list[dict[str, Any]] = []
for line in raw.splitlines():
line = line.strip()
if not line.startswith("data:"):
continue
chunk = line[5:].strip()
if not chunk or chunk == "[DONE]":
continue
try:
parsed = json.loads(chunk)
except json.JSONDecodeError:
continue
if isinstance(parsed, dict):
payloads.append(parsed)
return payloads
def _router_merge_streaming_completion(payloads: list[dict[str, Any]]) -> dict[str, Any]:
merged: dict[str, Any] = {"choices": [{"message": {"role": "assistant", "content": ""}}]}
content_parts: list[str] = []
reasoning_parts: list[str] = []
for payload in payloads:
for choice in payload.get("choices") or []:
delta = choice.get("delta") or {}
message = choice.get("message") or {}
for key, bucket in (
("content", content_parts),
("reasoning_content", reasoning_parts),
):
piece = delta.get(key)
if piece is None:
piece = message.get(key)
if piece:
bucket.append(str(piece))
if content_parts:
merged["choices"][0]["message"]["content"] = "".join(content_parts)
if reasoning_parts:
merged["choices"][0]["message"]["reasoning_content"] = "".join(reasoning_parts)
return merged
def _router_parse_completion_body(raw: str, *, model_id: str) -> dict[str, Any]:
text = (raw or "").strip()
if not text:
raise ValueError("empty router response body")
if text.startswith("data:") or "\ndata:" in text:
payloads = _router_collect_sse_payloads(text)
if payloads:
return _router_merge_streaming_completion(payloads)
data = _router_parse_http_json(text)
if not isinstance(data, dict):
raise ValueError(f"router response is not an object for model={model_id}")
return data
def _router_strip_json_fences(text: str) -> str:
stripped = text.strip()
if stripped.startswith("```"):
stripped = re.sub(r"^```(?:json)?\s*", "", stripped, flags=re.IGNORECASE)
stripped = re.sub(r"\s*```$", "", stripped)
return stripped.strip()
def _router_parse_json_object(text: str) -> dict[str, Any] | None:
    """Extract a JSON object from model output that may contain extra text.

    The text is first passed through ``_router_strip_json_fences``. Parsing
    then proceeds in two stages:

    1. Decode a JSON value at the very start of the text, tolerating
       trailing data. If that succeeds, the value is returned when it is an
       object; any other JSON type yields ``None``.
    2. Otherwise, try to decode an object starting at each ``{`` in turn and
       return the first one that decodes.

    Stage 2 replaces a greedy "first ``{`` to last ``}``" match, which lost a
    valid object whenever surrounding prose contained another brace (for
    example ``'use {x}: {"a": 1}'`` or ``'{"a": 1} and {more}'`` after a
    prefix). Every input the greedy match could parse still yields the same
    object here.

    Args:
        text: Raw model output.

    Returns:
        The decoded object, or ``None`` when no JSON object could be found.
    """
    candidate = _router_strip_json_fences(text)
    if not candidate:
        return None

    decoder = json.JSONDecoder()

    # Stage 1: a JSON document at position 0, possibly followed by extra data.
    try:
        leading, _end = decoder.raw_decode(candidate)
    except json.JSONDecodeError:
        pass
    else:
        return leading if isinstance(leading, dict) else None

    # Stage 2: scan for the first brace that opens a decodable object.
    # Position 0 was already tried above, so skip it when it is a brace.
    start = candidate.find("{", 1 if candidate.startswith("{") else 0)
    while start != -1:
        try:
            embedded, _end = decoder.raw_decode(candidate[start:])
        except json.JSONDecodeError:
            start = candidate.find("{", start + 1)
            continue
        # A value that starts with "{" decodes to a dict; the check is defensive.
        return embedded if isinstance(embedded, dict) else None
    return None
def _router_normalize_message_content(content: Any) -> str:
if content is None:
return ""
if isinstance(content, str):
return content.strip()
if isinstance(content, list):
parts: list[str] = []
for item in content:
if isinstance(item, str) and item.strip():
parts.append(item.strip())
elif isinstance(item, dict):
if item.get("type") == "text":
text = str(item.get("text") or "").strip()
if text:
parts.append(text)
elif "text" in item:
text = str(item.get("text") or "").strip()
if text:
parts.append(text)
return "\n".join(parts).strip()
return str(content).strip()
def _router_extract_assistant_content(completion: dict[str, Any], model_id: str) -> str:
    """Return the assistant's text from the first choice of *completion*.

    The normalized ``message.content`` is preferred. When it is empty, a
    family-specific fallback (family as reported by ``_router_model_family``)
    is tried:

    * ``"deepseek"``: look for a JSON object in ``reasoning_content``, first
      in the whole text and then in its last 4000 characters; a hit is
      returned re-serialized as JSON.
    * ``"gemini"``: normalize ``message.parts`` when it is a list.

    Args:
        completion: Completion object with an optional ``choices`` list.
        model_id: Model identifier used to determine the model family.

    Returns:
        The extracted text, or ``""`` when nothing usable was found.
    """
    choices = completion.get("choices") or [{}]
    first_choice = choices[0] or {}
    message = first_choice.get("message") or {}
    family = _router_model_family(model_id)

    primary = _router_normalize_message_content(message.get("content"))
    if primary:
        return primary

    if family == "deepseek":
        reasoning = str(message.get("reasoning_content") or "").strip()
        if reasoning:
            # Whole text first, then only the tail, where the answer may sit.
            for attempt in (reasoning, reasoning[-4000:]):
                found = _router_parse_json_object(attempt)
                if found:
                    return json.dumps(found, ensure_ascii=False)

    if family == "gemini":
        parts = message.get("parts")
        if isinstance(parts, list):
            return _router_normalize_message_content(parts)

    return ""
def _router_parse_suggest_result(completion: dict[str, Any], model_id: str) -> dict[str, Any] | None:
    """Decode the assistant's reply in *completion* as a JSON object.

    Args:
        completion: Completion object returned by the router.
        model_id: Model identifier, forwarded to the content extractor.

    Returns:
        The parsed object, or ``None`` when the reply is empty, is not
        parseable as a JSON object, or parses to an empty object.
    """
    reply = _router_extract_assistant_content(completion, model_id)
    if not reply:
        return None
    # An empty dict is treated the same as a failed parse.
    return _router_parse_json_object(reply) or None
def _normalize_vietnamese_novel_status(raw: str | None) -> str: def _normalize_vietnamese_novel_status(raw: str | None) -> str:
@@ -3379,30 +3799,20 @@ async def _router_pick_models() -> list[str]:
headers=headers, headers=headers,
) )
response.raise_for_status() response.raise_for_status()
for item in (response.json().get("data") or []): models_payload = _router_parse_http_json(response.text)
for item in (models_payload.get("data") or []):
model_id = str(item.get("id") or "").strip() model_id = str(item.get("id") or "").strip()
if not model_id: if not model_id:
continue continue
low = model_id.lower() low = model_id.lower()
if any(x in low for x in ["vision", "image", "audio", "realtime", "embedding", "moderation"]): if any(x in low for x in ["vision", "image", "audio", "realtime", "embedding", "moderation"]):
continue continue
score = 0 candidates.append((_router_model_priority_score(model_id), model_id))
if "gpt-5.5" in low: except Exception as exc:
score += 1000 logger.warning("router models list failed: %s", exc)
elif "gpt-5" in low:
score += 900
elif "claude" in low:
score += 700
elif "gemini" in low:
score += 650
else:
score += 100
candidates.append((score, model_id))
except Exception:
candidates = [] candidates = []
candidates.sort(key=lambda x: x[0], reverse=True) picked = _router_pick_models_from_candidates(candidates)
picked = [m for _, m in candidates[:6]]
_ROUTER_MODEL_CACHE["models"] = picked _ROUTER_MODEL_CACHE["models"] = picked
_ROUTER_MODEL_CACHE["expires_at"] = now + 600 _ROUTER_MODEL_CACHE["expires_at"] = now + 600
return picked return picked
@@ -3479,6 +3889,7 @@ async def _router_ai_suggest(
for model_id in models: for model_id in models:
payload = dict(base_payload) payload = dict(base_payload)
payload["model"] = model_id payload["model"] = model_id
family = _router_model_family(model_id)
try: try:
async with httpx.AsyncClient(timeout=45.0) as client: async with httpx.AsyncClient(timeout=45.0) as client:
response = await client.post( response = await client.post(
@@ -3486,10 +3897,24 @@ async def _router_ai_suggest(
headers=headers, headers=headers,
json=payload, json=payload,
) )
response.raise_for_status() if response.status_code >= 400:
data = response.json() logger.info(
content = data.get("choices", [{}])[0].get("message", {}).get("content", "") "router ai-suggest skip model=%s family=%s status=%s body=%s",
parsed = json.loads(content) if isinstance(content, str) else {} model_id,
family,
response.status_code,
(response.text or "")[:240],
)
continue
completion = _router_parse_completion_body(response.text, model_id=model_id)
parsed = _router_parse_suggest_result(completion, model_id)
if not parsed:
logger.info(
"router ai-suggest skip model=%s family=%s reason=unparseable_content",
model_id,
family,
)
continue
raw_genres = [str(g).strip() for g in (parsed.get("genres") or []) if str(g).strip()][:6] raw_genres = [str(g).strip() for g in (parsed.get("genres") or []) if str(g).strip()][:6]
genres = _map_genres_to_existing(raw_genres, existing_genres, limit=6) genres = _map_genres_to_existing(raw_genres, existing_genres, limit=6)
short_description = str(parsed.get("shortDescription") or "").strip() short_description = str(parsed.get("shortDescription") or "").strip()
@@ -3500,6 +3925,13 @@ async def _router_ai_suggest(
confidence = 0.0 confidence = 0.0
confidence = max(0.0, min(1.0, confidence)) confidence = max(0.0, min(1.0, confidence))
if not short_description or not genres: if not short_description or not genres:
logger.info(
"router ai-suggest skip model=%s family=%s reason=empty_fields genres=%s desc_len=%s",
model_id,
family,
len(genres),
len(short_description),
)
continue continue
return { return {
"suggestedGenres": genres, "suggestedGenres": genres,
@@ -3508,7 +3940,13 @@ async def _router_ai_suggest(
"model": model_id, "model": model_id,
"suggestedStatus": novel_status, "suggestedStatus": novel_status,
} }
except Exception: except Exception as exc:
logger.info(
"router ai-suggest skip model=%s family=%s reason=exception err=%s",
model_id,
family,
exc,
)
continue continue
return None return None
@@ -3570,6 +4008,7 @@ async def mod_epub_ai_suggest(
file: UploadFile = File(...), file: UploadFile = File(...),
splitMode: str | None = Form(default=None), splitMode: str | None = Form(default=None),
chapterRegex: str | None = Form(default=None), chapterRegex: str | None = Form(default=None),
chapterTag: str | None = Form(default=None),
title: str | None = Form(default=None), title: str | None = Form(default=None),
authorName: str | None = Form(default=None), authorName: str | None = Form(default=None),
db: AsyncSession = Depends(get_db_session), db: AsyncSession = Depends(get_db_session),
@@ -3587,11 +4026,12 @@ async def mod_epub_ai_suggest(
tmp_path = Path(tmp.name) tmp_path = Path(tmp.name)
try: try:
mode = "regex" if (splitMode or "").lower() == "regex" else "toc" mode = _resolve_epub_split_mode(splitMode)
pattern = (chapterRegex or "").strip() or None pattern = (chapterRegex or "").strip() or None
effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None
source_sections = _extract_epub_chapters(tmp_path) source_sections = _extract_epub_chapters(tmp_path)
sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections
chapters = _epub_extract_with_mode(tmp_path, mode, pattern) chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag)
meta = _extract_epub_metadata(tmp_path) meta = _extract_epub_metadata(tmp_path)
resolved_title = " ".join((title or str(meta.get("title") or tmp_path.stem)).split()).strip() or tmp_path.stem resolved_title = " ".join((title or str(meta.get("title") or tmp_path.stem)).split()).strip() or tmp_path.stem
resolved_author = " ".join((authorName or str(meta.get("author") or "Unknown")).split()).strip() or "Unknown" resolved_author = " ".join((authorName or str(meta.get("author") or "Unknown")).split()).strip() or "Unknown"
@@ -4198,20 +4638,7 @@ async def mobile_login(payload: MobileLoginPayload, db: AsyncSession = Depends(g
if not payload.googleIdToken.strip(): if not payload.googleIdToken.strip():
raise HTTPException(status_code=400, detail="googleIdToken is required") raise HTTPException(status_code=400, detail="googleIdToken is required")
allowed_client_ids = settings.google_client_id_list id_info = verify_google_id_token(payload.googleIdToken)
try:
id_info = google_id_token.verify_oauth2_token(
payload.googleIdToken,
google_requests.Request(),
None,
)
except Exception as exc:
raise HTTPException(status_code=401, detail="Invalid Google token") from exc
aud = (id_info.get("aud") or "").strip()
if allowed_client_ids and aud not in set(allowed_client_ids):
raise HTTPException(status_code=401, detail="Invalid Google token audience")
email = id_info.get("email") email = id_info.get("email")
if not email: if not email: