# suma_browser/html_utils.py
"""Helpers that parse HTML with BeautifulSoup and return wrapper-free fragments."""
from __future__ import annotations

from typing import Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from shared.config import config

# Attributes whose root-relative values get rewritten against the base URL.
_URL_ATTRS = ("href", "src")


def _serialize_fragment(soup: BeautifulSoup) -> str:
    """Unwrap the <html>/<body> containers in *soup* and return its markup.

    The soup is modified in place (the wrappers are removed from the tree).
    """
    # NOTE(review): only <html> and <body> are unwrapped; if the parser ever
    # produces a <head> element for the input it is left in place -- confirm
    # whether callers can hit that case.
    for wrapper in soup.find_all(["html", "body"]):
        wrapper.unwrap()
    # decode_contents() runs every node through bs4's output formatter.
    # Joining str(child) instead would emit top-level comments as bare text
    # (without "<!-- -->") and top-level text without entity escaping.
    return soup.decode_contents().strip()


def to_fragment(html: Optional[str]) -> str:
    """Return just the fragment contents (no <html>/<body> wrappers).

    Args:
        html: Markup to normalise; ``None`` or an empty string is accepted.

    Returns:
        The markup re-serialised by BeautifulSoup with the document-level
        wrappers removed and surrounding whitespace stripped, or ``""`` when
        *html* is falsy.
    """
    if not html:
        return ""
    return _serialize_fragment(BeautifulSoup(html, "lxml"))


def absolutize_fragment(html: Optional[str]) -> str:
    """Absolutize href/src against the configured base URL.

    Only values that start with ``"/"`` are joined onto
    ``config()["base_url"]``; every other value is written back unchanged
    (as a string).

    Args:
        html: Markup to process; ``None`` or an empty string is accepted.

    Returns:
        The rewritten markup as a fragment (no <html>/<body> wrappers), or
        ``""`` when *html* is falsy.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "lxml")
    # Resolved on first use so config() is not consulted at all when nothing
    # needs rewriting, and only once when something does.
    base_url = None
    for tag in soup.find_all(True):
        for attr in _URL_ATTRS:
            if not tag.has_attr(attr):
                continue
            raw = str(tag[attr])
            if raw.startswith("/"):
                if base_url is None:
                    base_url = config()["base_url"]
                tag[attr] = urljoin(base_url, raw)
            else:
                tag[attr] = raw

    return _serialize_fragment(soup)