feat(storage): implement NAS content storage with read/write capabilities
Build and Push Reader API Image / docker (push) Successful in 1m3s

feat(docker): configure NAS content and EPUB source directories in docker-compose

feat(migrations): add tables for SourceAsset, ImportJob, ChapterContentRef, and AssetNovelMapping

feat(scripts): create backfill script for populating ChapterContentRef from MongoDB chapters
This commit is contained in:
2026-04-30 01:53:52 +07:00
parent 82f502acd2
commit 9a3bb4b6ce
10 changed files with 1297 additions and 18 deletions
+118
View File
@@ -0,0 +1,118 @@
from __future__ import annotations
import argparse
import asyncio
import hashlib
import json
from pathlib import Path
from bson import ObjectId
from sqlalchemy import text
from app.config import settings
from app.database import SessionLocal, mongo_db
from app.storage import storage
async def backfill(limit: int, dry_run: bool, after_id: str | None, state_file: str | None) -> None:
    """Create ``ChapterContentRef`` rows for one batch of Mongo chapters.

    Reads up to ``limit`` documents from the ``chapters`` collection (ordered
    by ``_id``) that carry a non-empty string ``content`` or ``contentHtml``.
    For every chapter that has no ``ChapterContentRef`` row yet and whose
    stripped ``content`` is non-empty, the text and raw HTML are written via
    ``storage.write_text`` under ``legacy/`` and a reference row is inserted.
    A summary dict is printed at the end.

    Args:
        limit: Value passed to both the cursor ``limit`` and ``to_list``.
        dry_run: When true, nothing is written to storage, no row is
            inserted, nothing is committed and the state file is untouched;
            counters are still computed.
        after_id: If set, only chapters whose ``_id`` is greater than this
            ObjectId string are considered.
        state_file: If set, the last scanned ``_id`` is stored there as
            ``{"afterId": ...}`` after a non-dry run that scanned anything.
    """
    select_existing_sql = 'SELECT "chapterId" FROM "ChapterContentRef" WHERE "chapterId" = :id LIMIT 1'
    insert_ref_sql = (
        'INSERT INTO "ChapterContentRef" ("chapterId", "txtHref", "rawHtmlHref", "contentHash") '
        'VALUES (:chapter_id, :txt_href, :raw_href, :hash) '
        'ON CONFLICT ("chapterId") DO NOTHING'
    )

    # Match chapters where either body field is a non-empty string.
    mongo_filter = {
        "$or": [
            {field: {"$exists": True, "$type": "string", "$ne": ""}}
            for field in ("content", "contentHtml")
        ]
    }
    if after_id:
        mongo_filter["_id"] = {"$gt": ObjectId(after_id)}

    cursor = mongo_db["chapters"].find(mongo_filter, {"content": 1, "contentHtml": 1})
    cursor = cursor.sort("_id", 1).limit(limit)
    chapters = await cursor.to_list(limit)

    mapped = 0
    async with SessionLocal() as session:
        for chapter in chapters:
            chapter_id = str(chapter.get("_id") or "")
            if not chapter_id:
                continue

            lookup = await session.execute(text(select_existing_sql), {"id": chapter_id})
            if lookup.mappings().first():
                # A reference row already exists for this chapter.
                continue

            plain_text = str(chapter.get("content") or "").strip()
            if not plain_text:
                continue
            # Raw HTML falls back to the plain content when contentHtml is absent.
            html_source = str(chapter.get("contentHtml") or chapter.get("content") or "")

            params = {
                "chapter_id": chapter_id,
                "txt_href": f"legacy/{chapter_id}.txt",
                "raw_href": f"legacy/{chapter_id}.raw.html",
                "hash": hashlib.sha256(plain_text.encode("utf-8")).hexdigest(),
            }
            if not dry_run:
                storage.write_text(params["txt_href"], plain_text)
                storage.write_text(params["raw_href"], html_source)
                await session.execute(text(insert_ref_sql), params)
            mapped += 1

        if not dry_run:
            await session.commit()

    if chapters:
        last_id = str(chapters[-1]["_id"])
    else:
        last_id = None

    summary = {
        "scanned": len(chapters),
        "mapped": mapped,
        # Every scanned chapter is either mapped or skipped.
        "skipped": len(chapters) - mapped,
        "dryRun": dry_run,
        "contentRoot": settings.nas_content_root,
        "nextAfterId": last_id,
    }

    if state_file and last_id and not dry_run:
        checkpoint = json.dumps({"afterId": last_id}, ensure_ascii=True)
        Path(state_file).write_text(checkpoint, encoding="utf-8")
    print(summary)
def _load_resume_id(state_file: str) -> str | None:
    """Return the ``afterId`` checkpoint stored in *state_file*, if usable.

    A missing file yields ``None`` silently (first run). An unreadable file,
    invalid JSON, or an ``afterId`` that is not a non-empty string also yields
    ``None`` but emits a warning on stderr, so an accidental restart from the
    beginning of the collection is visible to the operator.
    """
    import sys  # local import: only needed for the warning path

    try:
        payload = json.loads(Path(state_file).read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"warning: ignoring unreadable state file {state_file!r}: {exc}", file=sys.stderr)
        return None

    stored = payload.get("afterId") if isinstance(payload, dict) else None
    if isinstance(stored, str) and stored.strip():
        return stored.strip()
    if stored:
        # A non-string id would otherwise crash later when converted to an ObjectId.
        print(f"warning: ignoring invalid afterId in state file {state_file!r}", file=sys.stderr)
    return None


def main() -> None:
    """Parse command-line options and run one backfill batch.

    An explicit ``--after-id`` takes precedence; otherwise, when
    ``--state-file`` is given, the resume point is read from that file.
    """
    parser = argparse.ArgumentParser(description="Backfill ChapterContentRef from Mongo chapters")
    parser.add_argument("--limit", type=int, default=1000, help="maximum number of chapters to scan in this run")
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="count chapters without writing content, inserting rows or updating the state file",
    )
    parser.add_argument("--after-id", type=str, default="", help="only scan chapters whose _id is greater than this ObjectId")
    parser.add_argument("--state-file", type=str, default="", help="JSON file used to store and resume from the last scanned _id")
    args = parser.parse_args()

    # Blank or whitespace-only values are treated as "not provided".
    after_id = args.after_id.strip() or None
    state_file = args.state_file.strip() or None
    if state_file and not after_id:
        after_id = _load_resume_id(state_file)

    asyncio.run(backfill(limit=args.limit, dry_run=args.dry_run, after_id=after_id, state_file=state_file))


if __name__ == "__main__":
    main()