import json
import os
import re
from typing import Iterable, Union, List

from bs4 import BeautifulSoup
from quart import request

# Matches a leading URL scheme such as "https://" and captures (scheme, rest).
_SCHEME_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+.-]*://)(.*)$")


def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* with the lxml parser and return the resulting soup."""
    return BeautifulSoup(html, "lxml")


def normalize_text(s: str) -> str:
    """Strip *s* and collapse every whitespace run into a single space.

    A falsy value (``None`` or ``""``) yields ``""``.
    """
    return re.sub(r"\s+", " ", (s or "").strip())


def log(msg: str) -> None:
    """Print *msg* to stdout, flushing immediately."""
    print(msg, flush=True)


def ensure_dir(path: str) -> None:
    """Create directory *path* (and any parents) if it does not exist."""
    os.makedirs(path, exist_ok=True)


def dump_json(path: str, data) -> None:
    """Write *data* to *path* as indented UTF-8 JSON.

    The parent directory is created when *path* has one. A bare filename
    (no directory component) is written into the current directory.
    """
    directory = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if directory:
        ensure_dir(directory)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def _join_url_parts(parts: List[str]) -> str:
    """Join URL segments with single slashes.

    Rules applied, in order:
      * ``None`` and empty-string segments are dropped.
      * A scheme prefix (e.g. ``https://``) on the first segment is kept.
      * A later segment that is itself an absolute URL replaces everything
        accumulated so far.
      * A segment starting with ``?`` or ``#`` is appended to the previous
        segment without a separating slash.
      * A trailing slash on the last segment is preserved, unless that
        segment is a query or fragment.

    Returns ``""`` when no usable segment remains.
    """
    # Coerce once up front so the first and last segments are handled the
    # same way as the middle ones (non-str values would break re.match).
    parts = [str(p) for p in parts if p is not None and p != ""]
    if not parts:
        return ""

    # Preserve scheme like "https://"
    m = _SCHEME_RE.match(parts[0])
    if m:
        scheme, first = m.group(1), m.group(2)
    else:
        scheme, first = "", parts[0]

    cleaned = [first.strip("/")]
    for seg in parts[1:]:
        # If a later segment is already an absolute URL, use it as the base
        m2 = _SCHEME_RE.match(seg)
        if m2:
            scheme, first = m2.group(1), m2.group(2)
            cleaned = [first.strip("/")]
        elif seg.startswith(("?", "#")):
            cleaned[-1] = cleaned[-1] + seg  # attach query/fragment
        else:
            cleaned.append(seg.strip("/"))

    url = scheme + "/".join(s for s in cleaned if s != "")

    # Preserve trailing slash if caller's last segment had one
    # (and isn't ? or #)
    last = parts[-1]
    if (
        last.endswith("/")
        and not last.startswith(("?", "#"))
        and not url.endswith("/")
    ):
        url += "/"
    return url


def hx_fragment_request() -> bool:
    """Return True when the current request carries ``SX-Request: true``
    or ``HX-Request: true`` (value compared case-insensitively)."""
    headers = request.headers
    return any(
        headers.get(name, "").lower() == "true"
        for name in ("SX-Request", "HX-Request")
    )


def route_prefix() -> str:
    """Return ``<scheme>://<host>/<prefix>`` for the current request.

    ``<prefix>`` is the ``x-forwarded-prefix`` request header (default
    ``/``) with leading slashes removed, so the result never contains a
    doubled slash right after the host.
    """
    # The header usually starts with "/"; without stripping it the result
    # would be "http://host//app" (or "http://host//" for the default).
    prefix = request.headers.get('x-forwarded-prefix', '/').lstrip('/')
    return f"{request.scheme}://{request.host}/{prefix}"


def join_url(value: Union[str, Iterable[str]]) -> str:
    """Join a single string or an iterable of URL segments into one URL.

    See ``_join_url_parts`` for the joining rules.
    """
    if isinstance(value, str):
        parts = [value]
    else:
        parts = list(value)
    return _join_url_parts(parts)


def host_url(value: str = '', no_slash: bool = False) -> str:
    """
    Join route_prefix() with value and ensure the resulting URL has a
    trailing slash on the path, but never after query/fragment.

    With ``no_slash=True`` the joined URL is returned as-is, without the
    trailing-slash normalisation.

    Examples of the normalisation applied to the joined URL:
      http://jjj         -> http://jjj/
      http://jjj?hello   -> http://jjj/?hello
      /foo               -> /foo/
      /foo?x=1#frag      -> /foo/?x=1#frag
    """
    url = join_url([route_prefix(), value])
    if no_slash:
        return url

    # Ensure trailing slash on the PATH (before ? or #)
    # Split into: base (no ?/#), optional ?query, optional #fragment
    m = re.match(r'^(?P<base>[^?#]*)(?P<qs>\?[^#]*)?(?P<frag>#.*)?$', url)
    if not m:
        return url  # fallback: return as-is

    base = m.group('base') or ""
    qs = m.group('qs') or ""
    frag = m.group('frag') or ""

    if base and not base.endswith('/'):
        base += '/'

    return f"{base}{qs}{frag}"