"""Ghost content sync — blog-owned. Handles Ghost ↔ blog DB sync for content data only: posts, pages, authors, tags. All models live in db_blog. Membership sync (users, labels, newsletters, tiers, subscriptions) is handled by the account service — see account/services/ghost_membership.py. """ from __future__ import annotations import os import re import asyncio from datetime import datetime from html import escape as html_escape from typing import Dict, Any, Optional import httpx import nh3 from sqlalchemy import select, delete from sqlalchemy.ext.asyncio import AsyncSession from models.ghost_content import ( Post, Author, Tag, PostAuthor, PostTag ) from shared.infrastructure.data_client import fetch_data from shared.infrastructure.ghost_admin_token import make_ghost_admin_jwt GHOST_ADMIN_API_URL = os.environ["GHOST_ADMIN_API_URL"] from shared.browser.app.utils import utcnow def _sanitize_html(html: str | None) -> str | None: """Sanitize HTML content using nh3, allowing safe formatting tags.""" if not html: return html return nh3.clean( html, tags={ "a", "abbr", "acronym", "b", "blockquote", "br", "code", "div", "em", "figcaption", "figure", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i", "img", "li", "ol", "p", "pre", "span", "strong", "sub", "sup", "table", "tbody", "td", "th", "thead", "tr", "ul", "video", "source", "picture", "iframe", "audio", }, attributes={ "*": {"class", "id", "style"}, "a": {"href", "title", "target"}, "img": {"src", "alt", "title", "width", "height", "loading"}, "video": {"src", "controls", "width", "height", "poster"}, "audio": {"src", "controls"}, "source": {"src", "type"}, "iframe": {"src", "width", "height", "frameborder", "allowfullscreen"}, "td": {"colspan", "rowspan"}, "th": {"colspan", "rowspan"}, }, link_rel="noopener noreferrer", url_schemes={"http", "https", "mailto"}, ) def _auth_header() -> dict[str, str]: return {"Authorization": f"Ghost {make_ghost_admin_jwt()}"} def _iso(val: str | None) -> datetime | None: if not val: return None return datetime.fromisoformat(val.replace("Z", "+00:00")) # ===================== # CONTENT UPSERT HELPERS # ===================== async def _upsert_author(sess: AsyncSession, ga: Dict[str, Any]) -> Author: res = await sess.execute(select(Author).where(Author.ghost_id == ga["id"])) obj = res.scalar_one_or_none() if obj is None: obj = Author(ghost_id=ga["id"]) sess.add(obj) obj.deleted_at = None obj.slug = ga.get("slug") or obj.slug obj.name = ga.get("name") or obj.name obj.email = ga.get("email") or obj.email obj.profile_image = ga.get("profile_image") obj.cover_image = ga.get("cover_image") obj.bio = ga.get("bio") obj.website = ga.get("website") obj.location = ga.get("location") obj.facebook = ga.get("facebook") obj.twitter = ga.get("twitter") obj.created_at = _iso(ga.get("created_at")) or obj.created_at or utcnow() obj.updated_at = _iso(ga.get("updated_at")) or utcnow() await sess.flush() return obj async def _upsert_tag(sess: AsyncSession, gt: Dict[str, Any]) -> Tag: res = await sess.execute(select(Tag).where(Tag.ghost_id == gt["id"])) obj = res.scalar_one_or_none() if obj is None: obj = Tag(ghost_id=gt["id"]) sess.add(obj) obj.deleted_at = None obj.slug = gt.get("slug") or obj.slug obj.name = gt.get("name") or obj.name obj.description = gt.get("description") obj.visibility = gt.get("visibility") or obj.visibility obj.feature_image = gt.get("feature_image") obj.meta_title = gt.get("meta_title") obj.meta_description = gt.get("meta_description") obj.created_at = _iso(gt.get("created_at")) or obj.created_at or utcnow() obj.updated_at = _iso(gt.get("updated_at")) or utcnow() await sess.flush() return obj def _apply_ghost_fields(obj: Post, gp: Dict[str, Any], author_map: Dict[str, Author], tag_map: Dict[str, Tag]) -> None: """Apply Ghost API fields to a Post ORM object.""" obj.deleted_at = None obj.uuid = gp.get("uuid") or obj.uuid obj.slug = gp.get("slug") or obj.slug obj.title = gp.get("title") or obj.title obj.html = _sanitize_html(gp.get("html")) obj.plaintext = gp.get("plaintext") obj.mobiledoc = gp.get("mobiledoc") obj.lexical = gp.get("lexical") obj.feature_image = gp.get("feature_image") obj.feature_image_alt = gp.get("feature_image_alt") obj.feature_image_caption = _sanitize_html(gp.get("feature_image_caption")) obj.excerpt = gp.get("excerpt") obj.custom_excerpt = gp.get("custom_excerpt") obj.visibility = gp.get("visibility") or obj.visibility obj.status = gp.get("status") or obj.status obj.featured = bool(gp.get("featured") or False) obj.is_page = bool(gp.get("page") or False) obj.email_only = bool(gp.get("email_only") or False) obj.canonical_url = gp.get("canonical_url") obj.meta_title = gp.get("meta_title") obj.meta_description = gp.get("meta_description") obj.og_image = gp.get("og_image") obj.og_title = gp.get("og_title") obj.og_description = gp.get("og_description") obj.twitter_image = gp.get("twitter_image") obj.twitter_title = gp.get("twitter_title") obj.twitter_description = gp.get("twitter_description") obj.custom_template = gp.get("custom_template") obj.reading_time = gp.get("reading_time") obj.comment_id = gp.get("comment_id") obj.published_at = _iso(gp.get("published_at")) obj.updated_at = _iso(gp.get("updated_at")) or obj.updated_at or utcnow() obj.created_at = _iso(gp.get("created_at")) or obj.created_at or utcnow() pa = gp.get("primary_author") obj.primary_author_id = author_map[pa["id"].strip()].id if pa else None pt = gp.get("primary_tag") obj.primary_tag_id = tag_map[pt["id"].strip()].id if (pt and pt["id"] in tag_map) else None async def _resolve_user_id_by_email(email: str) -> Optional[int]: """Look up user_id from account service via HTTP (cross-domain safe).""" from shared.infrastructure.data_client import fetch_data result = await fetch_data( "account", "user-by-email", params={"email": email}, required=False, ) if result and isinstance(result, dict): return result.get("user_id") return None async def _upsert_post(sess: AsyncSession, gp: Dict[str, Any], author_map: Dict[str, Author], tag_map: Dict[str, Tag]) -> tuple[Post, str | None]: """Upsert a post. Returns (post, old_status) where old_status is None for new rows.""" from sqlalchemy.exc import IntegrityError res = await sess.execute(select(Post).where(Post.ghost_id == gp["id"])) obj = res.scalar_one_or_none() old_status = obj.status if obj is not None else None if obj is not None: _apply_ghost_fields(obj, gp, author_map, tag_map) await sess.flush() else: obj = Post(ghost_id=gp["id"]) try: async with sess.begin_nested(): sess.add(obj) _apply_ghost_fields(obj, gp, author_map, tag_map) await sess.flush() except IntegrityError: res = await sess.execute(select(Post).where(Post.ghost_id == gp["id"])) obj = res.scalar_one() _apply_ghost_fields(obj, gp, author_map, tag_map) await sess.flush() # Backfill user_id from primary author email via account service if obj.user_id is None and obj.primary_author_id is not None: pa_obj = author_map.get(gp.get("primary_author", {}).get("id", "")) if pa_obj and pa_obj.email: user_id = await _resolve_user_id_by_email(pa_obj.email) if user_id: obj.user_id = user_id await sess.flush() # Rebuild post_authors + post_tags with synchronize_session to keep # identity map consistent and prevent autoflush IntegrityError. old_autoflush = sess.autoflush sess.autoflush = False try: # Delete + re-add post_authors (dedup for Ghost duplicate authors) await sess.execute( delete(PostAuthor).where(PostAuthor.post_id == obj.id), execution_options={"synchronize_session": "fetch"}, ) seen_authors: set[int] = set() for idx, a in enumerate(gp.get("authors") or []): aa = author_map[a["id"]] if aa.id not in seen_authors: seen_authors.add(aa.id) sess.add(PostAuthor(post_id=obj.id, author_id=aa.id, sort_order=idx)) # Delete + re-add post_tags (dedup similarly) await sess.execute( delete(PostTag).where(PostTag.post_id == obj.id), execution_options={"synchronize_session": "fetch"}, ) seen_tags: set[int] = set() for idx, t in enumerate(gp.get("tags") or []): tt = tag_map[t["id"]] if tt.id not in seen_tags: seen_tags.add(tt.id) sess.add(PostTag(post_id=obj.id, tag_id=tt.id, sort_order=idx)) await sess.flush() finally: sess.autoflush = old_autoflush # Auto-create PageConfig for pages (blog owns this table — direct DB, # not via HTTP, since this may run during startup before the server is ready) if obj.is_page: from shared.models.page_config import PageConfig existing = (await sess.execute( select(PageConfig).where( PageConfig.container_type == "page", PageConfig.container_id == obj.id, ) )).scalar_one_or_none() if existing is None: sess.add(PageConfig( container_type="page", container_id=obj.id, features={}, )) await sess.flush() return obj, old_status def _build_ap_post_data(post, post_url: str, tag_objs: list) -> dict: """Build rich AP object_data for a blog post/page.""" parts: list[str] = [] if post.title: parts.append(f"
{html_escape(post.title)}
") body = post.plaintext or post.custom_excerpt or post.excerpt or "" if body: for para in body.split("\n\n"): para = para.strip() if para: parts.append(f"{html_escape(para)}
") parts.append(f'') if tag_objs: ht_links = [] for t in tag_objs: clean = t.slug.replace("-", "") ht_links.append( f'#{clean}' ) parts.append(f'{" ".join(ht_links)}
') obj: dict = { "name": post.title or "", "content": "\n".join(parts), "url": post_url, } attachments: list[dict] = [] seen: set[str] = set() if post.feature_image: att: dict = {"type": "Image", "url": post.feature_image} if post.feature_image_alt: att["name"] = post.feature_image_alt attachments.append(att) seen.add(post.feature_image) if post.html: for src in re.findall(r'