diff --git a/.env.example b/.env.example index de5dc3d..470850f 100644 --- a/.env.example +++ b/.env.example @@ -16,3 +16,8 @@ CORS_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 # Environment label APP_ENV=development + +# OpenAI-compatible router (9router / OpenRouter) cho AI gợi ý EPUB import +# Liệt kê model: GET {ROUTER_BASE_URL}/models +ROUTER_BASE_URL=http://192.168.100.146:20128/v1 +ROUTER_API_KEY= diff --git a/app/auth.py b/app/auth.py index 4d57881..952ac7c 100644 --- a/app/auth.py +++ b/app/auth.py @@ -1,10 +1,13 @@ from __future__ import annotations import datetime as dt +import logging import os from typing import Any from fastapi import Depends, HTTPException, Request +from google.auth.transport import requests as google_requests +from google.oauth2 import id_token as google_id_token from jose import JWTError, jwt from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession @@ -12,6 +15,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import settings from app.database import get_db_session +logger = logging.getLogger(__name__) + SESSION_COOKIE_KEYS = [ "next-auth.session-token", "__Secure-next-auth.session-token", @@ -21,6 +26,102 @@ SESSION_COOKIE_KEYS = [ ] ACCESS_TOKEN_TTL_SECONDS = 7 * 24 * 60 * 60 +GOOGLE_TOKEN_CLOCK_SKEW_SECONDS = 60 + + +def _google_token_audiences_to_try(token: str) -> list[str | None]: + audiences: list[str | None] = [] + seen: set[str] = set() + + def add(value: str | None) -> None: + if value is None: + if None not in audiences: + audiences.append(None) + return + cleaned = value.strip() + if not cleaned or cleaned in seen: + return + seen.add(cleaned) + audiences.append(cleaned) + + for client_id in settings.google_client_id_list: + add(client_id) + + try: + claims = jwt.get_unverified_claims(token) + for key in ("aud", "azp"): + raw = claims.get(key) + if isinstance(raw, str): + add(raw) + elif isinstance(raw, list): + for item in raw: + if isinstance(item, str): + add(item) + except Exception: + pass + + if not audiences: + audiences.append(None) + return audiences + + +def verify_google_id_token(raw_token: str) -> dict[str, Any]: + token = raw_token.strip() + if token.count(".") != 2: + raise HTTPException(status_code=400, detail="googleIdToken must be a JWT") + + request = google_requests.Request() + last_exc: Exception | None = None + + for audience in _google_token_audiences_to_try(token): + try: + id_info = google_id_token.verify_oauth2_token( + token, + request, + audience, + clock_skew_in_seconds=GOOGLE_TOKEN_CLOCK_SKEW_SECONDS, + ) + aud = id_info.get("aud") + allowed = set(settings.google_client_id_list) + if allowed: + aud_values: set[str] = set() + if isinstance(aud, str): + aud_values.add(aud) + elif isinstance(aud, list): + aud_values.update(str(item) for item in aud) + azp = id_info.get("azp") + if isinstance(azp, str): + aud_values.add(azp) + if aud_values.isdisjoint(allowed): + last_exc = ValueError(f"token audience not allowed: {aud_values}") + continue + return id_info + except Exception as exc: + last_exc = exc + continue + + try: + claims = jwt.get_unverified_claims(token) + logger.warning( + "google id token rejected len=%s iss=%s aud=%s azp=%s exp=%s err=%s", + len(token), + claims.get("iss"), + claims.get("aud"), + claims.get("azp"), + claims.get("exp"), + last_exc, + ) + except Exception: + logger.warning("google id token rejected len=%s err=%s", len(token), last_exc) + + err_text = str(last_exc or "").lower() + if any(x in err_text for x in ("certificate", "connection", "timeout", "urlopen", "ssl", "network")): + raise HTTPException( + status_code=503, + detail="Unable to verify Google token (reader-api cannot reach googleapis.com)", + ) from last_exc + + raise HTTPException(status_code=401, detail="Invalid Google token") from last_exc def _jwt_secret() -> str: diff --git a/app/epub_parser.py b/app/epub_parser.py index 5e1afe0..6711cda 100644 --- a/app/epub_parser.py +++ b/app/epub_parser.py @@ -1,5 +1,7 @@ from __future__ import annotations +import html as html_lib +import re from pathlib import Path from typing import Any @@ -7,8 +9,13 @@ import html2text from ebooklib import ITEM_DOCUMENT from ebooklib import epub as epublib +_CHAPTER_MARKER_TEXT_RE = re.compile( + r"(?:ch(?:u(?:ơng|ong))?|chapter|hồi|hoi|phần|phan|tập|tap|quyển|quyen)\s*\d+", + re.IGNORECASE, +) -def _html_to_text(html_content: str) -> str: + +def html_to_text(html_content: str) -> str: h = html2text.HTML2Text() h.ignore_links = True h.ignore_images = True @@ -17,6 +24,20 @@ def _html_to_text(html_content: str) -> str: return h.handle(html_content).strip() +def _html_to_text(html_content: str) -> str: + return html_to_text(html_content) + + +def build_merged_html_from_epub(epub_path: Path) -> str: + book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False}) + parts: list[str] = [] + for item in book.get_items_of_type(ITEM_DOCUMENT): + content = item.get_content().decode("utf-8", errors="replace") + if content.strip(): + parts.append(content) + return "\n".join(parts) + + def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]: book = epublib.read_epub(str(epub_path), options={"ignore_ncx": False}) out: list[dict[str, Any]] = [] @@ -36,3 +57,135 @@ def build_chapters_from_epub(epub_path: Path) -> list[dict[str, Any]]: ) idx += 1 return out + + +def count_html_tag_opens(html: str, tag: str) -> int: + tag_re = re.escape(tag.strip().lower()) + return len(re.findall(rf"<{tag_re}\b", html, flags=re.IGNORECASE)) + + +def _strip_tags_to_text(fragment: str) -> str: + return html_lib.unescape(re.sub(r"<[^>]+>", " ", fragment or "")).strip() + + +def _title_from_tag_opening(opening_attrs: str, fragment: str, tag: str) -> str: + tag_re = re.escape(tag) + for attr in ("title", "alt"): + match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE) + if match: + title = html_lib.unescape(match.group(1)).strip() + if title and len(title) <= 160: + return title + for attr in ("id", "name"): + match = re.search(rf'{attr}\s*=\s*["\']([^"\']+)["\']', opening_attrs, flags=re.IGNORECASE) + if match: + title = html_lib.unescape(match.group(1)).strip() + if title and not title.startswith("#") and len(title) <= 160: + return title + close_match = re.search( + rf"<{tag_re}\b[^>]*>(.*?)", + fragment, + flags=re.IGNORECASE | re.DOTALL, + ) + if not close_match: + return "" + inner = _strip_tags_to_text(close_match.group(1)) + if inner and len(inner) <= 160: + return inner + return "" + + +def _anchor_seems_chapter_marker(opening_attrs: str, inner_text: str) -> bool: + text = (inner_text or "").strip() + if text and _CHAPTER_MARKER_TEXT_RE.search(text): + return True + attrs = opening_attrs or "" + if re.search(r'\bhref\s*=\s*["\'][^"\']*\.xhtml', attrs, flags=re.IGNORECASE): + return True + if re.search( + r'\b(?:id|name)\s*=\s*["\'][^"\']*(?:chuong|chương|chapter|ch\d|c\d|hoi|hồi)', + attrs, + flags=re.IGNORECASE, + ): + return True + # TOC / nav links thường có text ngắn. + if text and len(text) <= 120: + return True + return False + + +def _derive_simple_chapter_title(txt: str, number: int) -> str: + for line in (txt or "").splitlines(): + cleaned = line.strip() + if cleaned: + return cleaned[:160] + return f"Chương {number}" + + +def extract_chapters_by_html_tag( + epub_path: Path, + tag: str, +) -> tuple[list[dict[str, Any]], dict[str, int]]: + """Tách chương tại mỗi thẻ mở ``. Trả về (chapters, stats).""" + merged_html = build_merged_html_from_epub(epub_path) + stats = {"tagOpens": 0, "tagOpensUsed": 0, "tagOpensFiltered": 0} + if not merged_html.strip(): + return [], stats + + tag_name = tag.strip().lower() + tag_re = re.escape(tag_name) + opener_re = re.compile(rf"<({tag_re})\b([^>]*)>", re.IGNORECASE) + matches = list(opener_re.finditer(merged_html)) + stats["tagOpens"] = len(matches) + if not matches: + return [], stats + + if tag_name == "a" and len(matches) > 300: + filtered: list[re.Match[str]] = [] + for match in matches: + attrs = match.group(2) or "" + rest = merged_html[match.end() : match.end() + 800] + close = re.search(rf"", rest, flags=re.IGNORECASE) + inner_html = rest[: close.start()] if close else rest + inner_text = _strip_tags_to_text(inner_html) + if _anchor_seems_chapter_marker(attrs, inner_text): + filtered.append(match) + if filtered: + stats["tagOpensFiltered"] = len(matches) - len(filtered) + matches = filtered + + chapters: list[dict[str, Any]] = [] + for index, match in enumerate(matches): + start = match.start() + end = matches[index + 1].start() if index + 1 < len(matches) else len(merged_html) + raw_html = merged_html[start:end].strip() + if not raw_html: + continue + + opening_attrs = match.group(2) or "" + txt = html_to_text(raw_html) + inline_title = _title_from_tag_opening(opening_attrs, raw_html, tag_name) + number = len(chapters) + 1 + title = inline_title or _derive_simple_chapter_title(txt, number) + + # Bỏ qua anchor rỗng không có tiêu đề và không có nội dung theo sau. + if not txt.strip() and not inline_title: + tag_only = re.fullmatch( + rf"<{tag_re}\b[^>]*>\s*(?:\s*)?", + raw_html, + flags=re.IGNORECASE | re.DOTALL, + ) + if tag_only: + continue + + chapters.append( + { + "number": number, + "title": title, + "raw_html": raw_html, + "txt": txt, + } + ) + + stats["tagOpensUsed"] = len(matches) + return chapters, stats diff --git a/app/main.py b/app/main.py index 2004389..e26122e 100644 --- a/app/main.py +++ b/app/main.py @@ -4,7 +4,9 @@ import asyncio import base64 import datetime as dt import hashlib +import html as html_lib import json +import logging import os import random import re @@ -24,17 +26,17 @@ from fastapi import Body, Depends, FastAPI, File, Form, HTTPException, Query, Re from fastapi.middleware.cors import CORSMiddleware import httpx from fastapi.responses import Response -from google.auth.transport import requests as google_requests -from google.oauth2 import id_token as google_id_token from pydantic import BaseModel, Field from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession -from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user +from app.auth import ACCESS_TOKEN_TTL_SECONDS, create_access_token, require_current_user, verify_google_id_token from app.config import settings from app.database import get_db_session from app.storage import storage +logger = logging.getLogger(__name__) + # Giới hạn chương EPUB chỉ khi client gửi `enforceMaxChapters=true` (import nhiều / batch). MOD_EPUB_MAX_CHAPTERS = 4000 @@ -1673,18 +1675,19 @@ async def mod_delete_chapter( @app.post("/api/mod/chuong/bulk-delete") async def mod_bulk_delete_chapters( - payload: ModChapterBulkDeletePayload, + payload: dict[str, Any] = Body(...), db: AsyncSession = Depends(get_db_session), user: dict = Depends(require_current_user), ): if user.get("role") not in ("MOD", "ADMIN"): raise HTTPException(status_code=403, detail="Forbidden") - from_num = min(payload.fromNumber, payload.toNumber) - to_num = max(payload.fromNumber, payload.toNumber) + parsed = ModChapterBulkDeletePayload.model_validate(payload) + from_num = min(parsed.fromNumber, parsed.toNumber) + to_num = max(parsed.fromNumber, parsed.toNumber) ids = ( await db.execute( text('SELECT id FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num'), - {"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num}, + {"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num}, ) ).mappings().all() chapter_ids = [str(r["id"]) for r in ids] @@ -1693,14 +1696,63 @@ async def mod_bulk_delete_chapters( deleted_count = ( await db.execute( text('DELETE FROM "ChapterMeta" WHERE "novelId" = :novel_id AND number BETWEEN :from_num AND :to_num RETURNING id'), - {"novel_id": payload.novelId, "from_num": from_num, "to_num": to_num}, + {"novel_id": parsed.novelId, "from_num": from_num, "to_num": to_num}, ) ).mappings().all() - await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": payload.novelId}) + await db.execute(text('UPDATE "Novel" SET "totalChapters" = (SELECT COUNT(*) FROM "ChapterMeta" WHERE "novelId" = :novel_id), "updatedAt" = NOW() WHERE id = :novel_id'), {"novel_id": parsed.novelId}) await db.commit() return {"deletedCount": len(deleted_count)} +@app.post("/api/mod/chuong/normalize-titles/preview") +async def mod_normalize_chapter_titles_preview( + payload: dict[str, Any] = Body(...), + db: AsyncSession = Depends(get_db_session), + user: dict = Depends(require_current_user), +): + if user.get("role") not in ("MOD", "ADMIN"): + raise HTTPException(status_code=403, detail="Forbidden") + + parsed = ModNormalizeTitlesPreviewPayload.model_validate(payload) + novel_id = parsed.novelId.strip() + if not novel_id: + raise HTTPException(status_code=400, detail="novelId is required") + + rows = ( + await db.execute( + text('SELECT id, number, title FROM "ChapterMeta" WHERE "novelId" = :novel_id ORDER BY number ASC'), + {"novel_id": novel_id}, + ) + ).mappings().all() + + items: list[dict[str, Any]] = [] + for row in rows: + chapter_id = str(row["id"]) + number = int(row.get("number") or 0) + current_title = str(row.get("title") or "").strip() + content = await _resolve_chapter_content(chapter_id, db) or "" + suggested_title = _infer_chapter_title_from_content(content, number, current_title).strip() + if not suggested_title or suggested_title == current_title: + continue + if parsed.overwriteGenericOnly and not _is_generic_chapter_title(current_title, number): + continue + items.append( + { + "id": chapter_id, + "number": number, + "currentTitle": current_title, + "suggestedTitle": suggested_title, + } + ) + + return { + "novelId": novel_id, + "scannedCount": len(rows), + "changeCount": len(items), + "items": items, + } + + @app.put("/api/mod/chuong/optimize") async def mod_optimize_chapters( payload: dict[str, Any] = Body(...), @@ -1866,6 +1918,7 @@ async def mod_epub_upload( preview: str | None = Form(default=None), splitMode: str | None = Form(default=None), chapterRegex: str | None = Form(default=None), + chapterTag: str | None = Form(default=None), title: str | None = Form(default=None), originalTitle: str | None = Form(default=None), authorName: str | None = Form(default=None), @@ -1891,11 +1944,12 @@ async def mod_epub_upload( tmp_path = Path(tmp.name) try: - mode = "regex" if (splitMode or "").lower() == "regex" else "toc" + mode = _resolve_epub_split_mode(splitMode) pattern = (chapterRegex or "").strip() or None + effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None source_sections = _extract_epub_chapters(tmp_path) sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections - chapters = _epub_extract_with_mode(tmp_path, mode, pattern) + chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag) epub_meta = _extract_epub_metadata(tmp_path) inferred_title = str(epub_meta.get("title") or Path(file.filename or "novel").stem) inferred_author = str(epub_meta.get("author") or "Unknown") @@ -1938,7 +1992,8 @@ async def mod_epub_upload( "coverPreviewDataUrl": cover_data_url_b, "parserInfo": { "splitMode": mode, - "chapterRegexUsed": pattern, + "chapterRegexUsed": pattern if mode == "regex" else None, + "chapterTagUsed": effective_tag if mode == "tag" else None, "sourceSections": len(source_sections), "sectionsAfterFilter": len(sections_after_filter), "sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)), @@ -1999,7 +2054,8 @@ async def mod_epub_upload( "coverPreviewDataUrl": cover_preview_data_url, "parserInfo": { "splitMode": mode, - "chapterRegexUsed": pattern, + "chapterRegexUsed": pattern if mode == "regex" else None, + "chapterTagUsed": effective_tag if mode == "tag" else None, "sourceSections": len(source_sections), "sectionsAfterFilter": len(sections_after_filter), "sectionsDroppedByFilter": max(0, len(source_sections) - len(sections_after_filter)), @@ -2590,6 +2646,11 @@ class ModChapterOptimizePayload(BaseModel): updates: list[ModChapterOptimizeItem] +class ModNormalizeTitlesPreviewPayload(BaseModel): + novelId: str + overwriteGenericOnly: bool = True + + class ModChapterGlobalReplacePayload(BaseModel): novelId: str action: str @@ -2628,33 +2689,114 @@ def _asset_file_sha256(path: Path) -> str: return h.hexdigest() -def _derive_chapter_title(txt: str, fallback: str, number: int) -> str: - lines = [line.strip().lstrip("#").strip() for line in txt.splitlines() if line.strip()] - chapter_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+", re.IGNORECASE) - chapter_num_re = re.compile(r"^(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)\s*\d+", re.IGNORECASE) +_CHAPTER_HEADING_PREFIX = r"(?:chuong|ch\.?|chapter|hoi|quyen|phan|tap)" +_CHAPTER_WITH_SUBTITLE_RE = re.compile( + rf"^{_CHAPTER_HEADING_PREFIX}\s*\d+(?:[\.:\-\)]\s*|\s+).+", + re.IGNORECASE, +) +_CHAPTER_NUM_ONLY_RE = re.compile( + rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)\s*[:\-\.]?\s*$", + re.IGNORECASE, +) +_CHAPTER_NUM_PREFIX_RE = re.compile(rf"^{_CHAPTER_HEADING_PREFIX}\s*(\d+)", re.IGNORECASE) +_CHAPTER_INLINE_SUBTITLE_RE = re.compile( + r"^(?:Chương|Ch\.?|Chapter|Hồi|Quyển|Phần|Tập)\s*\d+(?:[\.:\-\)]\s*|\s+)(.+)$", + re.IGNORECASE, +) - for line in lines[:12]: + +def _looks_like_body_paragraph(line: str) -> bool: + s = line.strip() + if not s: + return True + if len(s) > 200: + return True + if len(s) > 90 and s.endswith((".", "…", "!", "?", "。")): + return True + if len(s.split()) >= 10: + return True + low = s.lower() + if re.match(r"^(đoàn|hắn|nàng|anh|cô|tôi|người|sau khi|khi đó|trong|ngoài|bên|cả|một|hai|ba)\s", low): + if len(s.split()) >= 8: + return True + return False + + +def _is_plausible_subtitle_line(line: str) -> bool: + s = line.strip() + if not s or len(s) < 2 or len(s) > 160: + return False + normalized = _norm_title(s) + if _CHAPTER_WITH_SUBTITLE_RE.match(normalized) or _CHAPTER_NUM_ONLY_RE.match(normalized): + return False + if _looks_like_body_paragraph(s): + return False + if re.search(r"https?://", s, re.IGNORECASE): + return False + return True + + +def _is_generic_chapter_title(title: str, number: int) -> bool: + current = (title or "").strip() + if not current: + return True + n = int(number or 0) + if n <= 0: + return False + if re.fullmatch(rf"Chương\s*{n}\s*", current, re.IGNORECASE): + return True + if re.fullmatch(rf"Ch\.?\s*{n}\s*", current, re.IGNORECASE): + return True + if re.fullmatch(rf"Chapter\s*{n}\s*", current, re.IGNORECASE): + return True + return _norm_title(current) == _norm_title(f"Chương {n}") + + +def _infer_chapter_title_from_content(txt: str, number: int, fallback: str = "") -> str: + lines = [line.strip().lstrip("#").strip() for line in (txt or "").splitlines() if line.strip()] + + for idx, line in enumerate(lines[:15]): normalized = _norm_title(line) if not normalized: continue - if chapter_re.match(normalized): - return line - if chapter_num_re.match(normalized): - return line + + inline = _CHAPTER_INLINE_SUBTITLE_RE.match(line.strip()) + if inline: + subtitle = (inline.group(1) or "").strip() + if subtitle: + return subtitle + + if _CHAPTER_WITH_SUBTITLE_RE.match(normalized): + return line.strip() + + if _CHAPTER_NUM_PREFIX_RE.match(normalized): + if _CHAPTER_NUM_ONLY_RE.match(normalized): + if idx + 1 < len(lines): + next_line = lines[idx + 1].strip() + if _is_plausible_subtitle_line(next_line): + return next_line + return line.strip() + return line.strip() if lines: - first = lines[0] - if len(first) <= 160 and len(first.split()) >= 3: - # Prefer human-readable first heading over EPUB internal filename. - if "/" in fallback or fallback.lower().endswith(".xhtml"): + first = lines[0].strip() + if _is_plausible_subtitle_line(first): + if "/" in (fallback or "") or str(fallback or "").lower().endswith(".xhtml"): + return first + if len(first.split()) >= 2: return first - return first - if fallback and "/" not in fallback and not fallback.lower().endswith(".xhtml"): - return fallback + cleaned_fallback = (fallback or "").strip() + if cleaned_fallback and "/" not in cleaned_fallback and not cleaned_fallback.lower().endswith(".xhtml"): + if not _is_generic_chapter_title(cleaned_fallback, number): + return cleaned_fallback return f"Chương {number}" +def _derive_chapter_title(txt: str, fallback: str, number: int) -> str: + return _infer_chapter_title_from_content(txt, number, fallback) + + def _extract_title_chapter_number(title: str) -> int | None: normalized = _norm_title(title or "") if not normalized: @@ -2783,6 +2925,25 @@ def _filter_toc_chapters(chapters: list[dict[str, Any]]) -> list[dict[str, Any]] return out +def _resolve_epub_split_mode(split_mode: str | None) -> str: + raw = (split_mode or "toc").strip().lower() + if raw == "regex": + return "regex" + if raw in {"tag", "html_tag", "html-tag", "htmltag"}: + return "tag" + return "toc" + + +def _normalize_chapter_html_tag(tag: str | None) -> str: + cleaned = (tag or "a").strip().lower() + if not re.fullmatch(r"[a-z][a-z0-9]*", cleaned): + raise HTTPException( + status_code=400, + detail="chapterTag must be a simple HTML tag name (letters/digits), e.g. a, h2", + ) + return cleaned + + def _extract_epub_chapters_by_regex(epub_path: Path, chapter_start_pattern: str) -> list[dict[str, Any]]: chapters = _extract_epub_chapters(epub_path) pattern = chapter_start_pattern.strip() @@ -2852,7 +3013,12 @@ def _chapter_preview_samples(chapters: list[dict[str, Any]], sample_size: int = return out -def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_pattern: str | None) -> list[dict[str, Any]]: +def _epub_extract_with_mode( + epub_path: Path, + split_mode: str, + chapter_start_pattern: str | None, + chapter_tag: str | None = None, +) -> list[dict[str, Any]]: if split_mode == "regex": default_vi_regex = r"^\s*(?:[#>*\-\[]\s*)*(?:ch(?:u\.?|ương|uong)?|chapter|hồi|hoi|quyển|quyen|phần|phan|tập|tap)\s*\d+(?:[\.:\-\)]\s*|\s+).+$" effective_pattern = chapter_start_pattern or default_vi_regex @@ -2860,6 +3026,30 @@ def _epub_extract_with_mode(epub_path: Path, split_mode: str, chapter_start_patt return _normalize_chapter_sequence(_extract_epub_chapters_by_regex(epub_path, effective_pattern)) except re.error as exc: raise HTTPException(status_code=400, detail=f"Invalid chapterStartPattern: {exc}") from exc + if split_mode == "tag": + from app.epub_parser import build_merged_html_from_epub, extract_chapters_by_html_tag + + effective_tag = _normalize_chapter_html_tag(chapter_tag) + merged, tag_stats = extract_chapters_by_html_tag(epub_path, effective_tag) + if not merged: + merged_html = build_merged_html_from_epub(epub_path) + tag_opens = int(tag_stats.get("tagOpens") or 0) + if not merged_html.strip(): + detail = "EPUB không có nội dung HTML trong các file document." + elif tag_opens == 0: + detail = ( + f"Không tìm thấy thẻ <{effective_tag}> trong EPUB. " + f"Thử thẻ khác (h2, h1, p) hoặc chế độ TOC/Regex." + ) + else: + filtered = int(tag_stats.get("tagOpensFiltered") or 0) + extra = f" (đã lọc bỏ {filtered} thẻ không giống mục chương)" if filtered else "" + detail = ( + f"Tìm thấy {tag_opens} thẻ <{effective_tag}>{extra} " + f"nhưng không tạo được chương có nội dung. Thử thẻ khác hoặc TOC/Regex." + ) + raise HTTPException(status_code=400, detail=detail) + return _normalize_chapter_sequence(merged) return _normalize_chapter_sequence(_extract_epub_chapters(epub_path)) @@ -3346,6 +3536,236 @@ def _map_genres_to_existing(candidates: list[str], existing_genres: list[str], * _ROUTER_MODEL_CACHE: dict[str, Any] = {"expires_at": 0.0, "models": []} +_ROUTER_PICK_LIMIT = 8 +_ROUTER_FAMILY_PICK_LIMITS: dict[str, int] = { + "openai": 3, + "deepseek": 4, + "claude": 2, + "gemini": 2, + "other": 2, +} +_ROUTER_FAMILY_PICK_ORDER: tuple[str, ...] = ("openai", "deepseek", "claude", "gemini", "other") + + +def _router_model_family(model_id: str) -> str: + low = model_id.lower() + if "gpt" in low or low.startswith("openai/"): + return "openai" + if "deepseek" in low or low.startswith("ds/") or "/ds/" in low: + return "deepseek" + if "claude" in low or "anthropic" in low: + return "claude" + if "gemini" in low or "google" in low: + return "gemini" + return "other" + + +def _router_pick_models_from_candidates(candidates: list[tuple[int, str]]) -> list[str]: + by_family: dict[str, list[tuple[int, str]]] = {} + for score, model_id in candidates: + by_family.setdefault(_router_model_family(model_id), []).append((score, model_id)) + for family_models in by_family.values(): + family_models.sort(key=lambda x: (-x[0], x[1])) + + picked: list[str] = [] + for family in _ROUTER_FAMILY_PICK_ORDER: + limit = _ROUTER_FAMILY_PICK_LIMITS.get(family, 1) + for _score, model_id in by_family.get(family, [])[:limit]: + if model_id not in picked: + picked.append(model_id) + + if len(picked) < _ROUTER_PICK_LIMIT: + for _score, model_id in sorted(candidates, key=lambda x: (-x[0], x[1])): + if len(picked) >= _ROUTER_PICK_LIMIT: + break + if model_id not in picked: + picked.append(model_id) + return picked[:_ROUTER_PICK_LIMIT] + + +def _router_model_priority_score(model_id: str) -> int: + low = model_id.lower() + if "gpt-5.5" in low: + return 1000 + if "gpt-5" in low: + return 900 + if _router_model_family(model_id) == "deepseek": + return 850 + if "claude" in low: + return 700 + if "gemini" in low: + return 650 + return 100 + + +def _router_parse_http_json(raw: str) -> Any: + """Parse OpenAI-compatible HTTP bodies (9router may append SSE sentinels).""" + text = (raw or "").strip() + if not text: + raise ValueError("empty router response body") + + done_idx = text.find("data: [DONE]") + if done_idx != -1: + text = text[:done_idx].rstrip() + + try: + return json.loads(text) + except json.JSONDecodeError: + decoder = json.JSONDecoder() + obj, _end = decoder.raw_decode(text) + return obj + + +def _router_collect_sse_payloads(raw: str) -> list[dict[str, Any]]: + payloads: list[dict[str, Any]] = [] + for line in raw.splitlines(): + line = line.strip() + if not line.startswith("data:"): + continue + chunk = line[5:].strip() + if not chunk or chunk == "[DONE]": + continue + try: + parsed = json.loads(chunk) + except json.JSONDecodeError: + continue + if isinstance(parsed, dict): + payloads.append(parsed) + return payloads + + +def _router_merge_streaming_completion(payloads: list[dict[str, Any]]) -> dict[str, Any]: + merged: dict[str, Any] = {"choices": [{"message": {"role": "assistant", "content": ""}}]} + content_parts: list[str] = [] + reasoning_parts: list[str] = [] + for payload in payloads: + for choice in payload.get("choices") or []: + delta = choice.get("delta") or {} + message = choice.get("message") or {} + for key, bucket in ( + ("content", content_parts), + ("reasoning_content", reasoning_parts), + ): + piece = delta.get(key) + if piece is None: + piece = message.get(key) + if piece: + bucket.append(str(piece)) + if content_parts: + merged["choices"][0]["message"]["content"] = "".join(content_parts) + if reasoning_parts: + merged["choices"][0]["message"]["reasoning_content"] = "".join(reasoning_parts) + return merged + + +def _router_parse_completion_body(raw: str, *, model_id: str) -> dict[str, Any]: + text = (raw or "").strip() + if not text: + raise ValueError("empty router response body") + + if text.startswith("data:") or "\ndata:" in text: + payloads = _router_collect_sse_payloads(text) + if payloads: + return _router_merge_streaming_completion(payloads) + + data = _router_parse_http_json(text) + if not isinstance(data, dict): + raise ValueError(f"router response is not an object for model={model_id}") + return data + + +def _router_strip_json_fences(text: str) -> str: + stripped = text.strip() + if stripped.startswith("```"): + stripped = re.sub(r"^```(?:json)?\s*", "", stripped, flags=re.IGNORECASE) + stripped = re.sub(r"\s*```$", "", stripped) + return stripped.strip() + + +def _router_parse_json_object(text: str) -> dict[str, Any] | None: + candidate = _router_strip_json_fences(text) + if not candidate: + return None + try: + parsed = json.loads(candidate) + return parsed if isinstance(parsed, dict) else None + except json.JSONDecodeError: + pass + try: + decoder = json.JSONDecoder() + obj, _end = decoder.raw_decode(candidate) + return obj if isinstance(obj, dict) else None + except json.JSONDecodeError: + pass + match = re.search(r"\{[\s\S]*\}", candidate) + if not match: + return None + try: + obj = json.loads(match.group(0)) + return obj if isinstance(obj, dict) else None + except json.JSONDecodeError: + return None + + +def _router_normalize_message_content(content: Any) -> str: + if content is None: + return "" + if isinstance(content, str): + return content.strip() + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, str) and item.strip(): + parts.append(item.strip()) + elif isinstance(item, dict): + if item.get("type") == "text": + text = str(item.get("text") or "").strip() + if text: + parts.append(text) + elif "text" in item: + text = str(item.get("text") or "").strip() + if text: + parts.append(text) + return "\n".join(parts).strip() + return str(content).strip() + + +def _router_extract_assistant_content(completion: dict[str, Any], model_id: str) -> str: + choice = (completion.get("choices") or [{}])[0] or {} + message = choice.get("message") or {} + family = _router_model_family(model_id) + + content = _router_normalize_message_content(message.get("content")) + if content: + return content + + if family == "deepseek": + reasoning = str(message.get("reasoning_content") or "").strip() + if reasoning: + parsed = _router_parse_json_object(reasoning) + if parsed: + return json.dumps(parsed, ensure_ascii=False) + tail = reasoning[-4000:] + parsed = _router_parse_json_object(tail) + if parsed: + return json.dumps(parsed, ensure_ascii=False) + + if family == "gemini": + parts = message.get("parts") + if isinstance(parts, list): + return _router_normalize_message_content(parts) + + return "" + + +def _router_parse_suggest_result(completion: dict[str, Any], model_id: str) -> dict[str, Any] | None: + content = _router_extract_assistant_content(completion, model_id) + if not content: + return None + parsed = _router_parse_json_object(content) + if not parsed: + return None + return parsed def _normalize_vietnamese_novel_status(raw: str | None) -> str: @@ -3379,30 +3799,20 @@ async def _router_pick_models() -> list[str]: headers=headers, ) response.raise_for_status() - for item in (response.json().get("data") or []): + models_payload = _router_parse_http_json(response.text) + for item in (models_payload.get("data") or []): model_id = str(item.get("id") or "").strip() if not model_id: continue low = model_id.lower() if any(x in low for x in ["vision", "image", "audio", "realtime", "embedding", "moderation"]): continue - score = 0 - if "gpt-5.5" in low: - score += 1000 - elif "gpt-5" in low: - score += 900 - elif "claude" in low: - score += 700 - elif "gemini" in low: - score += 650 - else: - score += 100 - candidates.append((score, model_id)) - except Exception: + candidates.append((_router_model_priority_score(model_id), model_id)) + except Exception as exc: + logger.warning("router models list failed: %s", exc) candidates = [] - candidates.sort(key=lambda x: x[0], reverse=True) - picked = [m for _, m in candidates[:6]] + picked = _router_pick_models_from_candidates(candidates) _ROUTER_MODEL_CACHE["models"] = picked _ROUTER_MODEL_CACHE["expires_at"] = now + 600 return picked @@ -3479,6 +3889,7 @@ async def _router_ai_suggest( for model_id in models: payload = dict(base_payload) payload["model"] = model_id + family = _router_model_family(model_id) try: async with httpx.AsyncClient(timeout=45.0) as client: response = await client.post( @@ -3486,10 +3897,24 @@ async def _router_ai_suggest( headers=headers, json=payload, ) - response.raise_for_status() - data = response.json() - content = data.get("choices", [{}])[0].get("message", {}).get("content", "") - parsed = json.loads(content) if isinstance(content, str) else {} + if response.status_code >= 400: + logger.info( + "router ai-suggest skip model=%s family=%s status=%s body=%s", + model_id, + family, + response.status_code, + (response.text or "")[:240], + ) + continue + completion = _router_parse_completion_body(response.text, model_id=model_id) + parsed = _router_parse_suggest_result(completion, model_id) + if not parsed: + logger.info( + "router ai-suggest skip model=%s family=%s reason=unparseable_content", + model_id, + family, + ) + continue raw_genres = [str(g).strip() for g in (parsed.get("genres") or []) if str(g).strip()][:6] genres = _map_genres_to_existing(raw_genres, existing_genres, limit=6) short_description = str(parsed.get("shortDescription") or "").strip() @@ -3500,6 +3925,13 @@ async def _router_ai_suggest( confidence = 0.0 confidence = max(0.0, min(1.0, confidence)) if not short_description or not genres: + logger.info( + "router ai-suggest skip model=%s family=%s reason=empty_fields genres=%s desc_len=%s", + model_id, + family, + len(genres), + len(short_description), + ) continue return { "suggestedGenres": genres, @@ -3508,7 +3940,13 @@ async def _router_ai_suggest( "model": model_id, "suggestedStatus": novel_status, } - except Exception: + except Exception as exc: + logger.info( + "router ai-suggest skip model=%s family=%s reason=exception err=%s", + model_id, + family, + exc, + ) continue return None @@ -3570,6 +4008,7 @@ async def mod_epub_ai_suggest( file: UploadFile = File(...), splitMode: str | None = Form(default=None), chapterRegex: str | None = Form(default=None), + chapterTag: str | None = Form(default=None), title: str | None = Form(default=None), authorName: str | None = Form(default=None), db: AsyncSession = Depends(get_db_session), @@ -3587,11 +4026,12 @@ async def mod_epub_ai_suggest( tmp_path = Path(tmp.name) try: - mode = "regex" if (splitMode or "").lower() == "regex" else "toc" + mode = _resolve_epub_split_mode(splitMode) pattern = (chapterRegex or "").strip() or None + effective_tag = _normalize_chapter_html_tag(chapterTag) if mode == "tag" else None source_sections = _extract_epub_chapters(tmp_path) sections_after_filter = _filter_toc_chapters(source_sections) if mode == "toc" else source_sections - chapters = _epub_extract_with_mode(tmp_path, mode, pattern) + chapters = _epub_extract_with_mode(tmp_path, mode, pattern, effective_tag) meta = _extract_epub_metadata(tmp_path) resolved_title = " ".join((title or str(meta.get("title") or tmp_path.stem)).split()).strip() or tmp_path.stem resolved_author = " ".join((authorName or str(meta.get("author") or "Unknown")).split()).strip() or "Unknown" @@ -4198,20 +4638,7 @@ async def mobile_login(payload: MobileLoginPayload, db: AsyncSession = Depends(g if not payload.googleIdToken.strip(): raise HTTPException(status_code=400, detail="googleIdToken is required") - allowed_client_ids = settings.google_client_id_list - - try: - id_info = google_id_token.verify_oauth2_token( - payload.googleIdToken, - google_requests.Request(), - None, - ) - except Exception as exc: - raise HTTPException(status_code=401, detail="Invalid Google token") from exc - - aud = (id_info.get("aud") or "").strip() - if allowed_client_ids and aud not in set(allowed_client_ids): - raise HTTPException(status_code=401, detail="Invalid Google token audience") + id_info = verify_google_id_token(payload.googleIdToken) email = id_info.get("email") if not email: