Files
rose-ash/blog/scripts/final_ghost_sync.py
giles 382d1b7c7a
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m20s
Decouple blog models and BlogService from shared layer
Move Post/Author/Tag/PostAuthor/PostTag/PostUser models from
shared/models/ghost_content.py to blog/models/content.py so blog-domain
models no longer live in the shared layer. Replace the shared
SqlBlogService + BlogService protocol with a blog-local singleton
(blog_service), and switch entry_associations.py from direct DB access
to HTTP fetch_data("blog", "post-by-id") to respect the inter-service
boundary.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 13:28:11 +00:00

470 lines
17 KiB
Python

#!/usr/bin/env python3
"""Final Ghost → db_blog sync with HTML verification + author→user migration.
Run once before cutting over to native writes (Phase 1).
Usage:
cd blog && python -m scripts.final_ghost_sync
Requires GHOST_ADMIN_API_URL, GHOST_ADMIN_API_KEY, DATABASE_URL,
and DATABASE_URL_ACCOUNT env vars.
"""
from __future__ import annotations
import asyncio
import difflib
import logging
import os
import re
import sys
import httpx
from sqlalchemy import select, func, delete
# Ensure project root is importable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "shared"))
from shared.db.base import Base # noqa: E402
from shared.db.session import get_session, get_account_session, _engine # noqa: E402
from shared.infrastructure.ghost_admin_token import make_ghost_admin_jwt # noqa: E402
from blog.models.content import Post, Author, Tag, PostUser, PostAuthor # noqa: E402
from shared.models.user import User # noqa: E402
# Configure the root logger once for the whole run: INFO level, timestamped lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Script-wide logger used by every step below.
log = logging.getLogger("final_ghost_sync")
# Base URL of the Ghost Admin API; a missing variable raises KeyError at import time.
GHOST_ADMIN_API_URL = os.environ["GHOST_ADMIN_API_URL"]
def _auth_header() -> dict[str, str]:
    """Build the ``Authorization`` header carrying a freshly made Ghost admin JWT."""
    token = make_ghost_admin_jwt()
    return {"Authorization": f"Ghost {token}"}
def _slugify(name: str) -> str:
s = name.strip().lower()
s = re.sub(r"[^\w\s-]", "", s)
s = re.sub(r"[\s_]+", "-", s)
return s.strip("-")
# ---------------------------------------------------------------------------
# Ghost API fetch
# ---------------------------------------------------------------------------
async def _fetch_all(endpoint: str) -> list[dict]:
    """Download every record from one Ghost Admin API collection.

    ``endpoint`` serves both as the URL path segment and as the key under
    which the JSON response lists its items ("posts" or "pages"). Authors and
    tags are included and all body formats are requested. A non-success
    status is raised via ``raise_for_status()``; a response without the
    expected key yields an empty list.
    """
    query = (
        "?include=authors,tags&limit=all"
        "&formats=html,plaintext,mobiledoc,lexical"
    )
    request_url = f"{GHOST_ADMIN_API_URL}/{endpoint}/" + query
    async with httpx.AsyncClient(timeout=60) as http:
        response = await http.get(request_url, headers=_auth_header())
        response.raise_for_status()
        payload = response.json()
        return payload.get(endpoint, [])
async def fetch_all_ghost_content() -> list[dict]:
    """Return every Ghost post and page as a single list, posts first.

    Both collections are requested concurrently. Each page dict is tagged
    with ``"page": True`` so later steps can tell pages from posts.
    """
    posts, pages = await asyncio.gather(
        *(_fetch_all(collection) for collection in ("posts", "pages"))
    )
    for page in pages:
        page["page"] = True
    return [*posts, *pages]
# ---------------------------------------------------------------------------
# Author → User migration
# ---------------------------------------------------------------------------
async def migrate_authors_to_users(author_bucket: dict[str, dict]) -> dict[str, int]:
    """Map each Ghost author onto a User row in the account database.

    Authors are matched to users by stripped, lower-cased email. When no user
    exists one is created from the author's profile and flagged as admin;
    when one exists only its empty profile fields are filled in. Authors
    without an email are logged and left out of the result.

    Returns:
        Mapping of Ghost author id -> user id.
    """
    # Profile attributes that share a name between the Ghost payload and User.
    profile_fields = (
        "name", "bio", "profile_image", "cover_image",
        "website", "location", "facebook", "twitter",
    )

    def fallback_slug(author: dict, address: str) -> str:
        # Prefer Ghost's slug, then the name, then the email local part.
        return author.get("slug") or _slugify(
            author.get("name") or address.split("@")[0]
        )

    mapping: dict[str, int] = {}
    async with get_account_session() as sess, sess.begin():
        for author_id, author in author_bucket.items():
            email = (author.get("email") or "").strip().lower()
            if not email:
                log.warning(
                    "Author %s (%s) has no email — skipping user creation",
                    author_id, author.get("name"),
                )
                continue

            lookup = select(User).where(User.email == email)
            user = (await sess.execute(lookup)).scalar_one_or_none()

            if user is not None:
                # Existing account: never overwrite data, only fill gaps.
                if not user.slug:
                    user.slug = fallback_slug(author, email)
                for field in profile_fields:
                    if not getattr(user, field) and author.get(field):
                        setattr(user, field, author[field])
                await sess.flush()
                log.info("Updated user %d profile from author %s", user.id, author.get("name"))
            else:
                # No account yet: create one straight from the Ghost profile.
                user = User(
                    email=email,
                    slug=fallback_slug(author, email),
                    is_admin=True,  # Ghost authors are admins
                    **{field: author.get(field) for field in profile_fields},
                )
                sess.add(user)
                # Flush so the database assigns user.id before it is logged/mapped.
                await sess.flush()
                log.info("Created user %d for author %s (%s)", user.id, author.get("name"), email)

            mapping[author_id] = user.id
    return mapping
async def populate_post_users(
    data: list[dict],
    ghost_author_to_user: dict[str, int],
) -> int:
    """Rebuild the post_users links and primary ``user_id`` for synced posts.

    For every Ghost item that has a matching Post row (looked up by
    ``ghost_id``), the primary author's mapped user becomes ``post.user_id``,
    the post's existing PostUser rows are deleted, and one PostUser row is
    added per distinct mapped author, with ``sort_order`` taken from the
    author's position in the Ghost list.

    Returns:
        Number of PostUser rows added.
    """
    created = 0
    async with get_session() as sess, sess.begin():
        for ghost_post in data:
            lookup = select(Post).where(Post.ghost_id == ghost_post["id"])
            post = (await sess.execute(lookup)).scalar_one_or_none()
            if not post:
                continue

            primary = ghost_post.get("primary_author")
            if primary and primary["id"] in ghost_author_to_user:
                post.user_id = ghost_author_to_user[primary["id"]]

            # Start from a clean slate so re-running the script is safe.
            await sess.execute(delete(PostUser).where(PostUser.post_id == post.id))

            linked: set[int] = set()
            for position, author in enumerate(ghost_post.get("authors") or []):
                user_id = ghost_author_to_user.get(author["id"])
                # Skip unmapped authors and authors that resolve to an
                # already-linked user.
                if not user_id or user_id in linked:
                    continue
                linked.add(user_id)
                sess.add(PostUser(post_id=post.id, user_id=user_id, sort_order=position))
                created += 1
        await sess.flush()
    return created
# ---------------------------------------------------------------------------
# Content sync (reuse ghost_sync upsert logic)
# ---------------------------------------------------------------------------
async def run_sync() -> dict:
    """Pull everything from Ghost, upsert it, then link posts to users.

    Steps, in order: fetch posts and pages; upsert authors, tags and posts
    into the blog database in one transaction; map Ghost authors to User
    rows; rebuild the post_users links.

    Returns:
        Counts keyed by "posts", "pages", "authors", "tags", "users_mapped"
        and "post_users_rows".
    """
    from bp.blog.ghost.ghost_sync import (
        _upsert_author,
        _upsert_tag,
        _upsert_post,
    )

    log.info("Fetching all content from Ghost...")
    data = await fetch_all_ghost_content()
    log.info("Received %d posts/pages from Ghost", len(data))

    # Gather the distinct authors and tags referenced by any item, keyed by
    # Ghost id; the primary_* entry is written last and so wins on conflict.
    author_bucket: dict[str, dict] = {}
    tag_bucket: dict[str, dict] = {}
    for item in data:
        for author in item.get("authors") or []:
            author_bucket[author["id"]] = author
        primary_author = item.get("primary_author")
        if primary_author:
            author_bucket[primary_author["id"]] = primary_author
        for tag in item.get("tags") or []:
            tag_bucket[tag["id"]] = tag
        primary_tag = item.get("primary_tag")
        if primary_tag:
            tag_bucket[primary_tag["id"]] = primary_tag

    # Step 1: upsert content into db_blog using the existing Ghost sync helpers.
    async with get_session() as sess, sess.begin():
        author_map: dict[str, Author] = {}
        for ghost_author in author_bucket.values():
            author_map[ghost_author["id"]] = await _upsert_author(sess, ghost_author)
        log.info("Upserted %d authors (legacy table)", len(author_map))

        tag_map: dict[str, Tag] = {}
        for ghost_tag in tag_bucket.values():
            tag_map[ghost_tag["id"]] = await _upsert_tag(sess, ghost_tag)
        log.info("Upserted %d tags", len(tag_map))

        for ghost_post in data:
            await _upsert_post(sess, ghost_post, author_map, tag_map)
        log.info("Upserted %d posts/pages", len(data))

    # Step 2: make sure every Ghost author has a User in db_account.
    log.info("")
    log.info("--- Migrating Ghost authors → Users ---")
    ghost_author_to_user = await migrate_authors_to_users(author_bucket)
    log.info("Mapped %d Ghost authors to User records", len(ghost_author_to_user))

    # Step 3: rebuild post_users and set each post's primary user_id.
    log.info("")
    log.info("--- Populating post_users M2M ---")
    pu_rows = await populate_post_users(data, ghost_author_to_user)
    log.info("Created %d post_users rows", pu_rows)

    page_count = sum(1 for item in data if item.get("page"))
    return {
        "posts": len(data) - page_count,
        "pages": page_count,
        "authors": len(author_bucket),
        "tags": len(tag_bucket),
        "users_mapped": len(ghost_author_to_user),
        "post_users_rows": pu_rows,
    }
# ---------------------------------------------------------------------------
# HTML rendering verification
# ---------------------------------------------------------------------------
def _normalize_html(html: str | None) -> str:
if not html:
return ""
return re.sub(r"\s+", " ", html.strip())
def _diffable_lines(html: str) -> list[str]:
    """Split whitespace-normalized HTML into one line per tag boundary."""
    # Normalized HTML contains no newlines, so without this split the whole
    # document would be a single "line". Whitespace between tags is kept on
    # the preceding line so whitespace-only differences still show up.
    return re.sub(r">(\s*)<", r">\1\n<", html).splitlines()


async def verify_html_rendering() -> dict:
    """Re-render published posts from Lexical and compare with stored HTML.

    Every non-deleted, published post that has Lexical JSON is rendered with
    ``render_lexical`` and compared, after whitespace normalization, with the
    post's stored ``html``. Mismatches are logged as a unified diff, broken
    at tag boundaries and capped at 2000 characters. A post whose render
    raises is logged and counted as a diff rather than aborting the run.

    Returns:
        Dict with "checked" (posts rendered or attempted), "no_lexical"
        (posts skipped for lack of Lexical JSON) and "diffs" (mismatches
        plus render failures).
    """
    from bp.blog.ghost.lexical_renderer import render_lexical
    import json

    diffs_found = 0
    posts_checked = 0
    posts_no_lexical = 0
    async with get_session() as sess:
        result = await sess.execute(
            select(Post).where(
                Post.deleted_at.is_(None),
                Post.status == "published",
            )
        )
        posts = result.scalars().all()
        for post in posts:
            if not post.lexical:
                posts_no_lexical += 1
                continue
            posts_checked += 1
            try:
                rendered = render_lexical(json.loads(post.lexical))
            except Exception as e:
                # Deliberately broad: one unrenderable post must not stop the
                # verification of the remaining posts.
                log.error(
                    "Render failed for post %d (%s): %s",
                    post.id, post.slug, e,
                )
                diffs_found += 1
                continue
            ghost_html = _normalize_html(post.html)
            our_html = _normalize_html(rendered)
            if ghost_html == our_html:
                continue
            diffs_found += 1
            # lineterm="" because the input lines carry no trailing newline;
            # the pieces are joined with "\n" below.
            diff = difflib.unified_diff(
                _diffable_lines(ghost_html),
                _diffable_lines(our_html),
                fromfile=f"ghost/{post.slug}",
                tofile=f"rendered/{post.slug}",
                n=2,
                lineterm="",
            )
            diff_text = "\n".join(diff)
            if len(diff_text) > 2000:
                diff_text = diff_text[:2000] + "\n... (truncated)"
            log.warning(
                "HTML diff for post %d (%s):\n%s",
                post.id, post.slug, diff_text,
            )
    return {
        "checked": posts_checked,
        "no_lexical": posts_no_lexical,
        "diffs": diffs_found,
    }
# ---------------------------------------------------------------------------
# Verification queries
# ---------------------------------------------------------------------------
async def run_verification() -> dict:
    """Count non-deleted posts by several cutover-relevant conditions.

    Returns:
        Dict with "total_posts", "published_null_html",
        "published_null_lexical", "posts_with_user" and
        "posts_without_user". Every count excludes soft-deleted rows.
    """
    not_deleted = Post.deleted_at.is_(None)
    is_published = Post.status == "published"

    async with get_session() as sess:

        async def count_where(*conditions):
            # One COUNT(id) query restricted to non-deleted rows.
            query = select(func.count(Post.id)).where(not_deleted, *conditions)
            return await sess.scalar(query)

        return {
            "total_posts": await count_where(),
            "published_null_html": await count_where(is_published, Post.html.is_(None)),
            "published_null_lexical": await count_where(is_published, Post.lexical.is_(None)),
            "posts_with_user": await count_where(Post.user_id.isnot(None)),
            "posts_without_user": await count_where(Post.user_id.is_(None)),
        }
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def _run_cutover_checks() -> None:
    """Run the sync, the verification queries and the HTML check, logging a summary."""
    log.info("=" * 60)
    log.info("Final Ghost Sync — Cutover Preparation")
    log.info("=" * 60)

    # Step 1: Full sync + author→user migration
    log.info("")
    log.info("--- Step 1: Full sync from Ghost + author→user migration ---")
    stats = await run_sync()
    log.info(
        "Sync complete: %d posts, %d pages, %d authors, %d tags, %d users mapped",
        stats["posts"], stats["pages"], stats["authors"], stats["tags"],
        stats["users_mapped"],
    )

    # Step 2: Verification queries
    log.info("")
    log.info("--- Step 2: Verification queries ---")
    vq = await run_verification()
    log.info("Total non-deleted posts/pages: %d", vq["total_posts"])
    log.info("Published with NULL html: %d", vq["published_null_html"])
    log.info("Published with NULL lexical: %d", vq["published_null_lexical"])
    log.info("Posts with user_id: %d", vq["posts_with_user"])
    log.info("Posts WITHOUT user_id: %d", vq["posts_without_user"])
    if vq["published_null_html"] > 0:
        log.warning("WARN: Some published posts have no HTML!")
    if vq["published_null_lexical"] > 0:
        log.warning("WARN: Some published posts have no Lexical JSON!")
    if vq["posts_without_user"] > 0:
        log.warning("WARN: Some posts have no user_id — authors may lack email!")

    # Step 3: HTML rendering verification
    log.info("")
    log.info("--- Step 3: HTML rendering verification ---")
    html_stats = await verify_html_rendering()
    log.info(
        "Checked %d posts, %d with diffs, %d without lexical",
        html_stats["checked"], html_stats["diffs"], html_stats["no_lexical"],
    )
    if html_stats["diffs"] > 0:
        log.warning(
            "WARN: %d posts have HTML rendering differences — "
            "review diffs above before cutover",
            html_stats["diffs"],
        )

    # Summary
    log.info("")
    log.info("=" * 60)
    log.info("SUMMARY")
    log.info(" Posts synced: %d", stats["posts"])
    log.info(" Pages synced: %d", stats["pages"])
    log.info(" Authors synced: %d", stats["authors"])
    log.info(" Tags synced: %d", stats["tags"])
    log.info(" Users mapped: %d", stats["users_mapped"])
    log.info(" post_users rows: %d", stats["post_users_rows"])
    log.info(" HTML diffs: %d", html_stats["diffs"])
    log.info(" Published null HTML: %d", vq["published_null_html"])
    log.info(" Published null lex: %d", vq["published_null_lexical"])
    log.info(" Posts without user: %d", vq["posts_without_user"])
    log.info("=" * 60)

    # The go/no-go verdict looks at HTML diffs, missing HTML and missing
    # user_id; missing Lexical JSON is only warned about above.
    if (html_stats["diffs"] == 0
            and vq["published_null_html"] == 0
            and vq["posts_without_user"] == 0):
        log.info("All checks passed — safe to proceed with cutover.")
    else:
        log.warning("Review warnings above before proceeding.")


async def main():
    """Script entry point: run all cutover checks, then dispose the engine.

    The engine is disposed in a ``finally`` block so it is released even
    when one of the steps raises; the exception still propagates.
    """
    try:
        await _run_cutover_checks()
    finally:
        await _engine.dispose()
# Run the cutover check only when executed as a script (or via ``python -m``),
# not when the module is imported.
if __name__ == "__main__":
    asyncio.run(main())