import re
from bs4 import BeautifulSoup
import json
import os
from typing import Iterable, Union, List
from quart import request
def soup_of(html: str) -> BeautifulSoup:
    """Parse ``html`` into a BeautifulSoup tree using the ``lxml`` parser."""
    parser_name = "lxml"
    return BeautifulSoup(html, parser_name)
def normalize_text(s: str) -> str:
    """Trim ``s`` and collapse every run of whitespace into a single space.

    A falsy ``s`` (``None`` or the empty string) yields the empty string.
    """
    text = s if s else ""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)
def log(msg: str) -> None:
    """Print ``msg`` to standard output and flush immediately."""
    print(msg, flush=True)
def ensure_dir(path: str) -> None:
    """Create directory ``path`` (and any missing parents) if it does not exist.

    An empty ``path`` is treated as "nothing to create" and is a no-op:
    ``os.makedirs("")`` raises ``FileNotFoundError``, and an empty string is
    exactly what ``os.path.dirname`` returns for a bare file name.
    """
    if not path:
        return
    os.makedirs(path, exist_ok=True)
def dump_json(path: str, data) -> None:
    """Serialize ``data`` as indented UTF-8 JSON into the file at ``path``.

    The directory portion of ``path`` is passed to ``ensure_dir`` before the
    file is opened for writing. Non-ASCII characters are written as-is
    (``ensure_ascii=False``) with a two-space indent.
    """
    parent_dir = os.path.dirname(path)
    ensure_dir(parent_dir)
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
# Matches an absolute URL: group 1 is the scheme including "://", group 2 the rest.
_ABSOLUTE_URL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*://)(.*)$")


def _join_url_parts(parts: List[str]) -> str:
    """Join URL fragments into one URL with single ``/`` separators.

    Rules, applied left to right:

    * ``None`` and empty-string parts are dropped; an empty result is ``""``.
    * Every remaining part is coerced with ``str()`` (including the first one,
      so a non-string leading part no longer raises ``TypeError``).
    * A part that is itself an absolute URL (``scheme://...``) restarts the
      join: everything collected before it is discarded.
    * A non-leading part starting with ``?`` or ``#`` is glued onto the
      previous segment without a separating slash.
    * Any other part has its leading/trailing slashes stripped and becomes
      its own path segment.
    * A trailing slash is kept only if the caller's last part ended with one
      and that part is not a query/fragment.

    Args:
        parts: The fragments to join, in order.

    Returns:
        The joined URL string.
    """
    segments = [str(p) for p in parts if p is not None and p != ""]
    if not segments:
        return ""

    scheme = ""
    cleaned: List[str] = []
    for index, seg in enumerate(segments):
        absolute = _ABSOLUTE_URL_RE.match(seg)
        if absolute:
            # An absolute URL becomes the new base; earlier segments are dropped.
            scheme = absolute.group(1)
            cleaned = [absolute.group(2).strip("/")]
        elif index > 0 and seg.startswith(("?", "#")):
            # Query/fragment attaches to the previous segment, no slash.
            cleaned[-1] += seg
        else:
            cleaned.append(seg.strip("/"))

    url = scheme + "/".join(s for s in cleaned if s)

    # Preserve trailing slash if caller's last segment had one (and isn't ? or #)
    last = segments[-1]
    if last.endswith("/") and not last.startswith(("?", "#")) and not url.endswith("/"):
        url += "/"
    return url
def hx_fragment_request() -> bool:
    """Return True when the current request carries a truthy fragment header.

    The ``SX-Request`` header is checked first, then ``HX-Request``; either
    one equal to ``"true"`` (case-insensitive) makes the result True.
    """
    for header_name in ("SX-Request", "HX-Request"):
        if request.headers.get(header_name, "").lower() == "true":
            return True
    return False
def route_prefix():
    """Build ``scheme://host/<prefix>`` for the current request.

    The prefix is read from the ``x-forwarded-prefix`` header (default ``/``).
    Leading slashes are stripped from it before it is appended, because the
    format string already supplies the ``/`` after the host; without this a
    header value such as ``/app`` would produce ``scheme://host//app``.
    """
    prefix = request.headers.get('x-forwarded-prefix', '/').lstrip('/')
    return f"{request.scheme}://{request.host}/{prefix}"
def join_url(value: Union[str, Iterable[str]]):
    """Join one URL string or an iterable of URL fragments into a single URL.

    A lone string is treated as a one-element list; any other iterable is
    materialized into a list. The actual joining is delegated to
    ``_join_url_parts``.
    """
    fragments = [value] if isinstance(value, str) else list(value)
    return _join_url_parts(fragments)
def host_url(value: str='', no_slash=False):
    """
    Join route_prefix() with value and ensure the resulting URL has a trailing
    slash on the path, but never after query/fragment.

    Args:
        value: Path (optionally with ?query / #fragment) to append to the
            request's route prefix.
        no_slash: When true, return the joined URL as-is without forcing a
            trailing slash on the path.

    Returns:
        The joined URL string.

    Examples (trailing-slash rule applied to the joined URL):
    http://jjj -> http://jjj/
    http://jjj?hello -> http://jjj/?hello
    /foo -> /foo/
    /foo?x=1#frag -> /foo/?x=1#frag
    """
    url = join_url([route_prefix(), value])
    if no_slash:
        return url
    # Ensure trailing slash on the PATH (before ? or #)
    # Split into: base (no ?/#), optional ?query, optional #fragment.
    # The named groups must match the m.group(...) lookups below.
    m = re.match(r'^(?P<base>[^?#]*)(?P<qs>\?[^#]*)?(?P<frag>#.*)?$', url)
    if not m:
        return url  # fallback: return as-is
    base = m.group('base') or ""
    qs = m.group('qs') or ""
    frag = m.group('frag') or ""
    if base and not base.endswith('/'):
        base += '/'
    return f"{base}{qs}{frag}"