Files
rose-ash/sx/content/highlight.py
giles 2b0a45b337 Fix code block rendering: escape newlines/tabs in syntax highlighter output
highlight_sx/python/bash produced SX string literals with literal newline
and tab characters, breaking the wire format parser. Add centralized
_escape() helper that properly escapes \n, \t, \r (plus existing \\ and
" escaping). Code blocks now render with correct indentation and syntax
highlighting in both server and client renders.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:35:33 +00:00

260 lines
9.3 KiB
Python

"""Syntax highlighting using Tailwind classes.
Produces sx source with coloured spans — no external CSS dependencies.
Showcases the on-demand CSS system.
"""
from __future__ import annotations
import re
def _escape(text: str) -> str:
"""Escape a token for embedding in an SX string literal."""
return (text
.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\n", "\\n")
.replace("\t", "\\t")
.replace("\r", "\\r"))
def highlight_sx(code: str) -> str:
    """Highlight s-expression source code as sx with Tailwind spans."""
    # Token kind -> Tailwind class string; kinds missing from the table
    # (symbols, whitespace, other) render as an unstyled span.
    classes = {
        "comment": "text-stone-400 italic",
        "string": "text-emerald-700",
        "keyword": "text-violet-600",
        "component": "text-rose-600 font-semibold",
        "special": "text-sky-700 font-semibold",
        "paren": "text-stone-400",
        "number": "text-amber-700",
        "boolean": "text-orange-600",
    }
    spans = []
    for kind, raw in _tokenize_sx(code):
        text = _escape(raw)
        cls = classes.get(kind)
        if cls is None:
            spans.append(f'(span "{text}")')
        else:
            spans.append(f'(span :class "{cls}" "{text}")')
    return "(<> " + " ".join(spans) + ")"
_SX_SPECIALS = {
"defcomp", "defrelation", "define",
"if", "when", "cond", "case", "and", "or", "not",
"let", "let*", "lambda", "fn",
"do", "begin", "quote",
"->", "map", "filter", "reduce", "some", "every?",
"map-indexed", "for-each",
"&key", "&rest",
}
_SX_TOKEN_RE = re.compile(
r'(;[^\n]*)' # comment
r'|("(?:[^"\\]|\\.)*")' # string
r'|(:[a-zA-Z][\w?!-]*)' # keyword
r'|(~[\w-]+)' # component
r'|([()[\]{}])' # parens/brackets
r'|(\d+\.?\d*)' # number
r'|(true|false|nil)\b' # boolean/nil
r'|([\w?!+\-*/<>=&.]+)' # symbol
r'|(\s+)' # whitespace
r'|(.)' # other
)
def _tokenize_sx(code: str) -> list[tuple[str, str]]:
tokens = []
for m in _SX_TOKEN_RE.finditer(code):
if m.group(1):
tokens.append(("comment", m.group(1)))
elif m.group(2):
tokens.append(("string", m.group(2)))
elif m.group(3):
tokens.append(("keyword", m.group(3)))
elif m.group(4):
tokens.append(("component", m.group(4)))
elif m.group(5):
tokens.append(("paren", m.group(5)))
elif m.group(6):
tokens.append(("number", m.group(6)))
elif m.group(7):
tokens.append(("boolean", m.group(7)))
elif m.group(8):
text = m.group(8)
if text in _SX_SPECIALS:
tokens.append(("special", text))
else:
tokens.append(("symbol", text))
elif m.group(9):
tokens.append(("ws", m.group(9)))
else:
tokens.append(("other", m.group(10)))
return tokens
def highlight_python(code: str) -> str:
    """Highlight Python source code as sx with Tailwind spans."""
    # Token kind -> Tailwind class string; anything not in the table
    # (identifiers, whitespace, punctuation) is emitted unstyled.
    classes = {
        "comment": "text-stone-400 italic",
        "string": "text-emerald-700",
        "keyword": "text-violet-600 font-semibold",
        "builtin": "text-sky-700",
        "decorator": "text-amber-600",
        "number": "text-amber-700",
    }
    spans = []
    for kind, raw in _tokenize_python(code):
        text = _escape(raw)
        cls = classes.get(kind)
        if cls is None:
            spans.append(f'(span "{text}")')
        else:
            spans.append(f'(span :class "{cls}" "{text}")')
    return "(<> " + " ".join(spans) + ")"
_PY_KEYWORDS = {
"False", "None", "True", "and", "as", "assert", "async", "await",
"break", "class", "continue", "def", "del", "elif", "else", "except",
"finally", "for", "from", "global", "if", "import", "in", "is",
"lambda", "nonlocal", "not", "or", "pass", "raise", "return",
"try", "while", "with", "yield",
}
_PY_BUILTINS = {
"print", "len", "range", "str", "int", "float", "list", "dict",
"set", "tuple", "type", "isinstance", "getattr", "setattr", "hasattr",
"super", "property", "staticmethod", "classmethod", "enumerate", "zip",
"map", "filter", "sorted", "reversed", "any", "all", "min", "max",
"abs", "sum", "open", "input", "format", "repr", "id", "hash",
}
_PY_TOKEN_RE = re.compile(
r'(#[^\n]*)' # comment
r'|("""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')' # triple-quoted string
r'|("(?:[^"\\]|\\.)*")' # double-quoted string
r"|('(?:[^'\\]|\\.)*')" # single-quoted string
r'|(@\w+)' # decorator
r'|(\d+\.?\d*(?:e[+-]?\d+)?)' # number
r'|([a-zA-Z_]\w*)' # identifier
r'|(\s+)' # whitespace
r'|(.)' # other
)
def _tokenize_python(code: str) -> list[tuple[str, str]]:
tokens = []
for m in _PY_TOKEN_RE.finditer(code):
if m.group(1):
tokens.append(("comment", m.group(1)))
elif m.group(2):
tokens.append(("string", m.group(2)))
elif m.group(3):
tokens.append(("string", m.group(3)))
elif m.group(4):
tokens.append(("string", m.group(4)))
elif m.group(5):
tokens.append(("decorator", m.group(5)))
elif m.group(6):
tokens.append(("number", m.group(6)))
elif m.group(7):
text = m.group(7)
if text in _PY_KEYWORDS:
tokens.append(("keyword", text))
elif text in _PY_BUILTINS:
tokens.append(("builtin", text))
else:
tokens.append(("ident", text))
elif m.group(8):
tokens.append(("ws", m.group(8)))
else:
tokens.append(("other", m.group(9)))
return tokens
def highlight_bash(code: str) -> str:
    """Highlight bash source code as sx with Tailwind spans."""
    # Token kind -> Tailwind class string; words, whitespace and
    # punctuation fall through to an unstyled span.
    classes = {
        "comment": "text-stone-400 italic",
        "string": "text-emerald-700",
        "variable": "text-violet-600",
        "keyword": "text-sky-700 font-semibold",
        "flag": "text-amber-700",
    }
    spans = []
    for kind, raw in _tokenize_bash(code):
        text = _escape(raw)
        cls = classes.get(kind)
        if cls is None:
            spans.append(f'(span "{text}")')
        else:
            spans.append(f'(span :class "{cls}" "{text}")')
    return "(<> " + " ".join(spans) + ")"
_BASH_KEYWORDS = {
"if", "then", "else", "elif", "fi", "for", "while", "do", "done",
"case", "esac", "in", "function", "return", "exit", "export",
"source", "local", "readonly", "declare", "set", "unset",
}
_BASH_TOKEN_RE = re.compile(
r'(#[^\n]*)' # comment
r'|("(?:[^"\\]|\\.)*")' # double-quoted string
r"|('(?:[^'\\]|\\.)*')" # single-quoted string
r'|(\$\{?\w+\}?|\$\()' # variable
r'|(--?[\w-]+)' # flag
r'|([a-zA-Z_][\w-]*)' # word
r'|(\s+)' # whitespace
r'|(.)' # other
)
def _tokenize_bash(code: str) -> list[tuple[str, str]]:
tokens = []
for m in _BASH_TOKEN_RE.finditer(code):
if m.group(1):
tokens.append(("comment", m.group(1)))
elif m.group(2):
tokens.append(("string", m.group(2)))
elif m.group(3):
tokens.append(("string", m.group(3)))
elif m.group(4):
tokens.append(("variable", m.group(4)))
elif m.group(5):
tokens.append(("flag", m.group(5)))
elif m.group(6):
text = m.group(6)
if text in _BASH_KEYWORDS:
tokens.append(("keyword", text))
else:
tokens.append(("word", text))
elif m.group(7):
tokens.append(("ws", m.group(7)))
else:
tokens.append(("other", m.group(8)))
return tokens
def highlight(code: str, language: str = "lisp"):
    """Highlight code in the given language. Returns SxExpr for wire format."""
    from shared.sx.parser import SxExpr, serialize

    # Language aliases mapped to their highlighter.
    highlighters = {
        "lisp": highlight_sx, "sx": highlight_sx, "sexp": highlight_sx,
        "python": highlight_python, "py": highlight_python,
        "bash": highlight_bash, "sh": highlight_bash, "shell": highlight_bash,
    }
    highlighter = highlighters.get(language)
    if highlighter is not None:
        return SxExpr(highlighter(code))
    # Fallback: no highlighting, just escaped text
    return SxExpr("(span " + serialize(code) + ")")