9a3bb4b6ce
Build and Push Reader API Image / docker (push) Successful in 1m3s
feat(docker): configure NAS content and EPUB source directories in docker-compose feat(migrations): add tables for SourceAsset, ImportJob, ChapterContentRef, and AssetNovelMapping feat(scripts): create backfill script for populating ChapterContentRef from MongoDB chapters
119 lines
3.8 KiB
Python
119 lines
3.8 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
from bson import ObjectId
|
|
from sqlalchemy import text
|
|
|
|
from app.config import settings
|
|
from app.database import SessionLocal, mongo_db
|
|
from app.storage import storage
|
|
|
|
|
|
async def backfill(limit: int, dry_run: bool, after_id: str | None, state_file: str | None) -> None:
    """Backfill ``ChapterContentRef`` rows for one batch of Mongo chapters.

    Fetches up to ``limit`` documents from the ``chapters`` collection, in
    ascending ``_id`` order, whose ``content`` or ``contentHtml`` is a
    non-empty string. For every fetched chapter that has no
    ``ChapterContentRef`` row yet and whose stripped ``content`` is non-empty,
    the text and raw HTML are handed to ``storage.write_text`` under
    ``legacy/<chapterId>.txt`` / ``legacy/<chapterId>.raw.html`` and a row
    pointing at them (plus a SHA-256 of the text) is inserted. A summary dict
    is printed once the batch is done.

    Args:
        limit: Maximum number of chapters fetched in this batch.
        dry_run: When true, nothing is written to storage, the database or
            the state file; the counters are still computed.
        after_id: Optional ObjectId string; only chapters with a strictly
            greater ``_id`` are fetched.
        state_file: Optional path that receives ``{"afterId": <last _id>}``
            after a non-dry-run batch that fetched at least one chapter.
    """
    selector = {
        "$or": [
            {field: {"$exists": True, "$type": "string", "$ne": ""}}
            for field in ("content", "contentHtml")
        ]
    }
    if after_id:
        selector["_id"] = {"$gt": ObjectId(after_id)}

    cursor = mongo_db["chapters"].find(selector, {"content": 1, "contentHtml": 1})
    cursor = cursor.sort("_id", 1).limit(limit)
    chapters = await cursor.to_list(limit)

    # Both statements are loop-invariant, so build them once.
    lookup_stmt = text('SELECT "chapterId" FROM "ChapterContentRef" WHERE "chapterId" = :id LIMIT 1')
    insert_stmt = text(
        'INSERT INTO "ChapterContentRef" ("chapterId", "txtHref", "rawHtmlHref", "contentHash") '
        'VALUES (:chapter_id, :txt_href, :raw_href, :hash) '
        'ON CONFLICT ("chapterId") DO NOTHING'
    )

    mapped = 0
    skipped = 0
    async with SessionLocal() as session:
        for chapter in chapters:
            chapter_id = str(chapter.get("_id") or "")
            if not chapter_id:
                skipped += 1
                continue

            # Chapters that already have a reference row are left alone.
            lookup = await session.execute(lookup_stmt, {"id": chapter_id})
            if lookup.mappings().first():
                skipped += 1
                continue

            # Plain text comes from "content" only, so a chapter matched
            # solely through "contentHtml" is skipped here.
            plain_text = str(chapter.get("content") or "").strip()
            if not plain_text:
                skipped += 1
                continue
            html_source = str(chapter.get("contentHtml") or chapter.get("content") or "")

            txt_href = f"legacy/{chapter_id}.txt"
            raw_href = f"legacy/{chapter_id}.raw.html"
            digest = hashlib.sha256(plain_text.encode("utf-8")).hexdigest()

            if not dry_run:
                storage.write_text(txt_href, plain_text)
                storage.write_text(raw_href, html_source)
                await session.execute(
                    insert_stmt,
                    {
                        "chapter_id": chapter_id,
                        "txt_href": txt_href,
                        "raw_href": raw_href,
                        "hash": digest,
                    },
                )
            # Counted in dry-run mode as well, so the summary previews the work.
            mapped += 1

        if not dry_run:
            await session.commit()

    # The resume cursor is the last fetched _id, whether or not it was mapped.
    last_id = str(chapters[-1]["_id"]) if chapters else None
    summary = {
        "scanned": len(chapters),
        "mapped": mapped,
        "skipped": skipped,
        "dryRun": dry_run,
        "contentRoot": settings.nas_content_root,
        "nextAfterId": last_id,
    }
    if state_file and last_id and not dry_run:
        state_payload = json.dumps({"afterId": last_id}, ensure_ascii=True)
        Path(state_file).write_text(state_payload, encoding="utf-8")
    print(summary)
|
|
|
|
|
|
def main() -> None:
    """Parse command-line flags and run a single backfill batch.

    Flags:
        --limit: passed to ``backfill`` as ``limit`` (default 1000).
        --dry-run: passed to ``backfill`` as ``dry_run``.
        --after-id: resume cursor; blank means "not given".
        --state-file: JSON file holding ``{"afterId": ...}``. It is read
            only when ``--after-id`` is not given, and its path is always
            forwarded to ``backfill``.

    A missing state file means "start from the beginning". An unreadable or
    malformed state file is ignored as well, but a warning is printed to
    stderr so the restart does not go unnoticed.
    """
    import sys  # local import: only needed for the resume warnings below

    parser = argparse.ArgumentParser(description="Backfill ChapterContentRef from Mongo chapters")
    parser.add_argument("--limit", type=int, default=1000)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--after-id", type=str, default="")
    parser.add_argument("--state-file", type=str, default="")
    args = parser.parse_args()

    after_id = args.after_id.strip() or None
    state_file = args.state_file.strip() or None

    if state_file and not after_id:
        try:
            payload = json.loads(Path(state_file).read_text(encoding="utf-8"))
        except FileNotFoundError:
            # First run: there is no saved cursor yet.
            pass
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"warning: ignoring state file {state_file!r}: {exc}", file=sys.stderr)
        else:
            saved = payload.get("afterId") if isinstance(payload, dict) else None
            if isinstance(saved, str) and saved.strip():
                after_id = saved.strip()
            else:
                # Only a non-empty string can be a cursor; anything else would
                # fail later when it is converted to an ObjectId.
                print(
                    f"warning: state file {state_file!r} has no usable afterId; "
                    "starting from the beginning",
                    file=sys.stderr,
                )

    asyncio.run(backfill(limit=args.limit, dry_run=args.dry_run, after_id=after_id, state_file=state_file))
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|