Phase 1-3 of decoupling: - path_setup.py adds project root to sys.path - Market-owned models in market/models/ (market, market_place) - All imports updated: shared.infrastructure, shared.db, shared.browser, etc. - MarketPlace uses container_type/container_id instead of post_id FK Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
# suma_browser/html_utils.py
|
|
from __future__ import annotations
|
|
from typing import Optional
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
from shared.config import config
|
|
|
|
|
|
|
|
def to_fragment(html: Optional[str]) -> str:
    """Reduce *html* to a bare fragment without <html>/<body> wrappers.

    The markup is parsed with BeautifulSoup's "lxml" parser, every
    ``<html>`` and ``<body>`` element in the parsed tree is unwrapped
    (its children stay in place), and the remaining top-level nodes are
    serialised back to a single string.

    Args:
        html: Markup to process. ``None`` and the empty string are accepted.

    Returns:
        The serialised fragment with leading/trailing whitespace stripped,
        or ``""`` when *html* is falsy.
    """
    if html:
        document = BeautifulSoup(html, "lxml")

        # Drop the document-level containers but keep what is inside them.
        for wrapper in document.find_all(["html", "body"]):
            wrapper.unwrap()

        pieces = [str(node) for node in document.contents]
        return "".join(pieces).strip()

    return ""
|
|
|
|
def absolutize_fragment(html: Optional[str]) -> str:
    """Make root-relative href/src values absolute and return a bare fragment.

    The markup is parsed with BeautifulSoup's "lxml" parser. For every
    element carrying an ``href`` and/or ``src`` attribute, a value that
    starts with ``"/"`` is joined onto ``config()["base_url"]`` with
    ``urljoin``; any other value is written back as its string form
    without modification. Finally all ``<html>``/``<body>`` elements are
    unwrapped so only the inner markup is returned.

    Args:
        html: Markup to process. ``None`` and the empty string are accepted.

    Returns:
        The serialised fragment with leading/trailing whitespace stripped,
        or ``""`` when *html* is falsy.
    """
    if not html:
        return ""

    tree = BeautifulSoup(html, "lxml")

    # NOTE: the `rewrite_suma_href_to_local` hook that appears commented out
    # in the source for href values is not active, so href and src receive
    # exactly the same treatment here.
    for element in tree.find_all(True):
        for attr in ("href", "src"):
            if not element.has_attr(attr):
                continue
            value = str(element[attr])
            # Only values beginning with "/" are resolved against the base
            # URL; the config lookup happens only when it is needed.
            if value.startswith("/"):
                value = urljoin(config()["base_url"], value)
            element[attr] = value

    # Remove the document-level containers and serialise what is left.
    for wrapper in tree.find_all(["html", "body"]):
        wrapper.unwrap()

    pieces = [str(node) for node in tree.contents]
    return "".join(pieces).strip()
|