feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled

Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
This commit is contained in:
giles
2026-02-09 23:16:34 +00:00
commit 6271a715a1
142 changed files with 8517 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Sentinel token embedded in rewritten internal links (the link rewriter
# prefixes app-relative hrefs with it); presumably replaced with the real
# app root when pages are rendered/served — TODO(review): confirm where
# the substitution happens.
APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]"

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,6 @@
def _anchor_text(a) -> str:
    """Return an anchor's visible text, whitespace-collapsed, capped at 200 chars.

    Best-effort: any failure (e.g. *a* lacks ``get_text``) yields "".
    """
    try:
        raw = a.get_text(" ") or ""
        collapsed = " ".join(raw.split())
        return collapsed[:200]
    except Exception:
        return ""

View File

@@ -0,0 +1,16 @@
from bs4 import BeautifulSoup
from typing import List, Optional
def _collect_html_img_srcs(html: Optional[str]) -> List[str]:
    """Extract every non-empty ``<img src>`` URL from *html*, in document order.

    Best-effort: None/empty or unparseable input yields an empty list.
    """
    if not html:
        return []
    try:
        doc = BeautifulSoup(html, "lxml")
        return [tag.get("src") for tag in doc.find_all("img") if tag.get("src")]
    except Exception:
        return []

View File

@@ -0,0 +1,14 @@
from typing import Iterable, List, Set
def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]:
    """Drop falsy and non-str entries plus duplicates, keeping first-seen order."""
    seen: Set[str] = set()
    result: List[str] = []
    for url in urls:
        if url and isinstance(url, str) and url not in seen:
            seen.add(url)
            result.append(url)
    return result

View File

@@ -0,0 +1,32 @@
from typing import Dict,Optional, Tuple
# Markers that identify a Cloudflare challenge/interstitial page.
_CF_TOKENS = (
    "One moment, please...",
    "Please wait while your request is being verified",
    "/cdn-cgi/challenge-platform/",
    "rocket-loader.min.js",
)


def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]:
    """Return ``(True, token)`` if *html* contains a Cloudflare marker, else ``(False, None)``."""
    if not html:
        return False, None
    hit = next((tok for tok in _CF_TOKENS if tok in html), None)
    return (hit is not None), hit


def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]:
    """Heuristically decide whether a scraped product dict is a CF interstitial.

    Checks, in order: the title, the description HTML, each section's HTML,
    and finally an all-empty heuristic (no images, description, or sections).
    Returns ``(True, reason_token)`` or ``(False, None)``.
    """
    if (d.get("title") or "").strip().lower() == "one moment, please...":
        return True, "One moment, please..."
    found, token = _looks_like_cf_html(d.get("description_html"))
    if found:
        return True, token
    for section in d.get("sections") or []:
        if isinstance(section, dict) and section.get("html"):
            found, token = _looks_like_cf_html(section["html"])
            if found:
                return True, token
    # A page with no content at all is most likely a challenge shell.
    if not d.get("images") and not d.get("description_html") and not d.get("sections"):
        return True, "all_empty_heuristic"
    return False, None

View File

@@ -0,0 +1,34 @@
from typing import Dict, Set
from urllib.parse import urlparse, urljoin
import httpx
async def _resolve_sub_redirects(
    base_url: str,
    candidates: Set[str],
    allowed_tops: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
) -> Dict[str, str]:
    """Probe each candidate "/top/sub" path and record where it redirects.

    Returns ``{old_path: canonical_path}`` for paths whose final URL differs.
    Side effect: every discovered canonical sub is registered under its top
    in *valid_subs_by_top* (mutated in place). Failed probes are skipped.
    """
    redirects: Dict[str, str] = {}
    if not candidates:
        return redirects
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client:
        # Sorted for a deterministic request order.
        for old_path in sorted(candidates):
            try:
                resp = await client.get(urljoin(base_url, old_path))
                final_path = urlparse(str(resp.url)).path or ""
                segments = [seg for seg in final_path.split("/") if seg]
                if len(segments) < 2:
                    continue
                top = segments[0].lower()
                sub = segments[1].lower().removesuffix(".html").removesuffix(".htm")
                if top not in allowed_tops:
                    continue
                canonical = f"/{top}/{sub}"
                if canonical != old_path:
                    redirects[old_path] = canonical
                valid_subs_by_top.setdefault(top, set()).add(sub)
            except Exception:
                # Best-effort: a failed probe just leaves the path unmapped.
                continue
    return redirects

View File

@@ -0,0 +1,100 @@
from typing import Dict, List, Optional, Set
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from ._anchor_text import _anchor_text
from suma_browser.app.bp.browse.services.slugs import product_slug_from_href
from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER
def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite every <a href> in an HTML fragment to an app-relative path.

    Each href is resolved against *base_url* and classified:
      * mailto:/tel:/javascript:/data:/#... hrefs are left untouched;
      * off-host links are left untouched but recorded in *link_externals*;
      * ``*.html``/``*.htm`` paths are treated as product pages: known
        slugs become ``{APP_ROOT_PLACEHOLDER}/product/<slug>``, unknown
        targets are recorded in *link_errors* (type "suma_product_unknown");
      * paths whose first segment is an allowed category are rewritten to
        ``{APP_ROOT_PLACEHOLDER}/<top>`` or ``.../<top>/<sub>``; a sub not
        yet known for that top is still rewritten optimistically, but the
        path goes into *unknown_sub_paths* and an error of type
        "suma_category_invalid_sub_pending" is recorded;
      * any other on-host path is recorded as type "suma_other".

    *link_errors*, *link_externals*, and *unknown_sub_paths* are report
    accumulators mutated in place.

    Returns:
        The rewritten fragment with any <html>/<body> wrappers removed,
        or "" for empty input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc
    for a in soup.find_all("a", href=True):
        raw = (a.get("href") or "").strip()
        if not raw:
            continue
        low = raw.lower()
        # Non-navigational schemes and in-page anchors stay as-is.
        if low.startswith(("mailto:", "tel:", "javascript:", "data:")) or low.startswith("#"):
            continue
        abs_href = urljoin(base_url, raw)
        p = urlparse(abs_href)
        if not p.scheme or not p.netloc:
            continue
        # External host: report it and keep the original href.
        if p.netloc != base_host:
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "host": p.netloc,
            })
            continue
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        last = parts[-1].lower()
        # A *.html/*.htm leaf is assumed to be a product page.
        if last.endswith((".html", ".htm")):
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(a),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue
        top = parts[0].lower()
        if top in category_allow_values:
            if len(parts) == 1:
                # Top-level category landing page.
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            else:
                sub = parts[1]
                if sub.lower().endswith((".html", ".htm")):
                    sub = sub.rsplit(".", 1)[0]
                if sub in (valid_subs_by_top.get(top) or set()):
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
                else:
                    # Unknown sub: rewrite anyway, but flag the path so it
                    # can be resolved (e.g. via redirect probing) later.
                    unknown_path = f"/{top}/{sub}"
                    unknown_sub_paths.add(unknown_path)
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
                    link_errors.append({
                        "product": current_product_slug,
                        "href": abs_href,
                        "text": _anchor_text(a),
                        "top": top,
                        "sub": sub,
                        "target_slug": None,
                        "type": "suma_category_invalid_sub_pending",
                    })
        else:
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })
    # Unwrap the <html>/<body> elements the parser adds so that a clean
    # fragment is returned.
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()

View File

@@ -0,0 +1,14 @@
from typing import Dict, Set
def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect candidate "/top/sub" paths from a scraped nav structure.

    Args:
        nav: Parsed navigation, expected shape
            ``{"cats": {label: {"slug": str, "subs": [{"slug": str}, ...]}}}``.

    Returns:
        Set of ``/{top_slug}/{sub_slug}`` paths for every category with a
        top slug and at least one non-empty (stripped) sub slug.
    """
    paths: Set[str] = set()
    # Category labels are irrelevant here; only the slug data matters.
    for data in (nav.get("cats") or {}).values():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        for sub in data.get("subs") or []:
            sub_slug = (sub.get("slug") or "").strip()
            if sub_slug:
                paths.add(f"/{top_slug}/{sub_slug}")
    return paths

View File

@@ -0,0 +1,18 @@
from urllib.parse import urljoin
from config import config
from utils import log
from ...listings import scrape_products
async def capture_category(
    slug: str,
):
    """Scrape every listing page of a top-level category.

    Returns ``(list_url, items, total_pages)`` where *items* aggregates the
    results of all pages (page 1 reports the page count).
    """
    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)
    # Walk any remaining pages, appending onto the page-1 results.
    last_page = int(total_pages or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}] page {page_no}")
        extra, _tp = await scrape_products(list_url, page=page_no)
        items.extend(extra)
    return (list_url, items, total_pages)

View File

@@ -0,0 +1,25 @@
from typing import Dict, Set
from .capture_category import capture_category
from .capture_sub import capture_sub
from config import config
async def capture_product_slugs(
    nav: Dict[str, Dict],
    capture_listing,
):
    """Scrape every allowed category and its sub-categories.

    Each captured listing is forwarded to *capture_listing* as
    ``(url, items, total_pages)``, and all product slugs found are
    accumulated.

    Args:
        nav: Parsed navigation; ``nav["cats"][label]["subs"]`` lists the
            sub-categories for each allowed category label.
        capture_listing: Async callback awaited with the listing tuple.

    Returns:
        Set of all product slugs discovered across category and sub listings.
    """
    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        url, items, total_pages = await capture_category(slug)
        await capture_listing(url, items, total_pages)
        product_slugs.update(items)
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            sub_result = await capture_sub(sub, slug)
            # Bug fix: capture_sub returns None when the sub entry has no
            # slug; the old code then crashed unpacking None with
            # ``capture_listing(*lpars)``. Skip such subs instead.
            if sub_result is None:
                continue
            sub_url, sub_items, sub_total = sub_result
            await capture_listing(sub_url, sub_items, sub_total)
            product_slugs.update(sub_items)
    return product_slugs

View File

@@ -0,0 +1,22 @@
from urllib.parse import urljoin
from urllib.parse import urljoin
from config import config
from utils import log
from ...listings import scrape_products
async def capture_sub(
    sub,
    slug,
):
    """Scrape every listing page of one sub-category.

    Returns ``(sub_url, items, total_pages)``, or None when *sub* carries
    no slug.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items, total_pages = await scrape_products(sub_url, page=1)
    # Page 1 reports the page count; fetch and append the remaining pages.
    last_page = int(total_pages or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}/{sub_slug}] page {page_no}")
        extra, _ = await scrape_products(sub_url, page=page_no)
        items.extend(extra)
    return (sub_url, items, total_pages)

View File

@@ -0,0 +1,106 @@
import asyncio
from typing import List
import httpx
from ...html_utils import to_fragment
from suma_browser.app.bp.browse.services.slugs import suma_href_from_html_slug
from config import config
from utils import log
# DB: persistence helpers
from ...product.product_detail import scrape_product_detail
from ._product_dict_is_cf import _product_dict_is_cf
from ._rewrite_links_fragment import _rewrite_links_fragment
from ._dedupe_preserve_order import _dedupe_preserve_order
from ._collect_html_img_srcs import _collect_html_img_srcs
async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths
) -> bool:
    """Scrape one product page, rewrite its content, and persist it.

    Args:
        upsert_product: Async callback ``(slug, href, product_dict)`` that
            writes the scraped product to storage.
        log_product_result: Async callback ``(ok=..., payload=...)`` used
            here only to record failures.
        sem: Semaphore bounding concurrent network fetches.
        slug: App-side product slug to fetch.
        product_slugs: Known product slugs, passed to the link rewriter.
        category_values: Allowed top-level category slugs.
        valid_subs_by_top: Known sub slugs per top category.
        link_errors: Shared accumulator for link problems (mutated).
        link_externals: Shared accumulator for off-host links (mutated).
        unknown_sub_paths: Shared accumulator for pending sub paths (mutated).

    Returns:
        True on a successful upsert; False when a Cloudflare interstitial
        is detected or any exception occurs (failures are reported via
        *log_product_result*).
    """
    href = suma_href_from_html_slug(slug)
    try:
        # Only the network fetch is throttled by the semaphore.
        async with sem:
            d = await scrape_product_detail(href)
        # Detect a Cloudflare challenge page before treating the payload
        # as real product data.
        is_cf, cf_token = _product_dict_is_cf(d)
        if is_cf:
            payload = {
                "slug": slug,
                "href_tried": href,
                "error_type": "CloudflareChallengeDetected",
                "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                "cf_token": cf_token,
            }
            await log_product_result(ok=False, payload=payload)
            log(f" ! CF challenge detected: {slug} ({cf_token})")
            return False
        # Rewrite embedded links; collect reports
        if d.get("description_html"):
            d["description_html"] = _rewrite_links_fragment(
                d["description_html"], config()["base_url"], product_slugs, category_values,
                valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
            )
            d["description_html"] = to_fragment(d["description_html"])
        if d.get("sections"):
            for sec in d["sections"]:
                if isinstance(sec, dict) and sec.get("html"):
                    sec["html"] = _rewrite_links_fragment(
                        sec["html"], config()["base_url"], product_slugs, category_values,
                        valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                    )
                    sec["html"] = to_fragment(sec["html"])
        # Images
        # Gallery images come from the scraper; embedded ones are pulled
        # out of the (already rewritten) HTML. All lists are de-duplicated
        # preserving first-seen order.
        gallery = _dedupe_preserve_order(d.get("images") or [])
        embedded: List[str] = []
        if d.get("description_html"):
            embedded += _collect_html_img_srcs(d["description_html"])
        for sec in d.get("sections", []) or []:
            if isinstance(sec, dict) and sec.get("html"):
                embedded += _collect_html_img_srcs(sec["html"])
        embedded = _dedupe_preserve_order(embedded)
        all_imgs = _dedupe_preserve_order(list(gallery) + list(embedded))
        d["images"] = gallery
        d["embedded_image_urls"] = embedded
        d["all_image_urls"] = all_imgs
        await upsert_product(slug, href, d)
        # DB: upsert product + success log
        return True
    except Exception as e:
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        # Best-effort enrichment with HTTP details; never let the
        # enrichment itself raise.
        try:
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False

View File

@@ -0,0 +1,49 @@
import asyncio
from typing import Dict, List, Optional, Set

from config import config
from utils import log

from .fetch_and_upsert_product import fetch_and_upsert_product
async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports=None,
    concurrency: int = 8,
    product_slugs: Optional[Set[str]] = None,
    valid_subs_by_top: Optional[Dict[str, Set[str]]] = None,
    unknown_sub_paths: Optional[Set[str]] = None,
):
    """Fetch all product details concurrently and upsert each into storage.

    Args:
        upsert_product: Async callback forwarded to the per-product worker.
        log_product_result: Async failure-logging callback for the worker.
        save_link_reports: Optional async callback awaited at the end with
            ``(link_errors, link_externals)``.
        concurrency: Max simultaneous product fetches (floored at 1).
        product_slugs: Slugs to fetch; defaults to an empty set.
        valid_subs_by_top: Known sub slugs per top category; defaults to {}.
        unknown_sub_paths: Accumulator for pending sub paths; defaults to
            an empty set.

    Bug fix: the collection parameters previously used mutable defaults
    (``set()`` / ``{}``), which are created once at definition time, shared
    across calls, and mutated downstream (e.g. *unknown_sub_paths* is
    filled by the link rewriter). They are now None sentinels.
    """
    if product_slugs is None:
        product_slugs = set()
    if valid_subs_by_top is None:
        valid_subs_by_top = {}
    if unknown_sub_paths is None:
        unknown_sub_paths = set()
    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []
    category_values: Set[str] = set(config()["categories"]["allow"].values())
    # sorted() accepts any iterable directly; the list() wrapper was redundant.
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = 0
    ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        # Progress heartbeat every 50 completions and at the end.
        if done % 50 == 0 or done == len(tasks):
            log(f"{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)

View File

@@ -0,0 +1,24 @@
from typing import Dict
from urllib.parse import urljoin
from config import config
def rewrite_nav(nav: Dict[str, Dict], nav_redirects: Dict[str, str]):
    """Rewrite nav sub-category slugs/hrefs to their canonical redirect targets.

    Mutates *nav* in place: for every sub whose ``/top/sub`` path appears
    in *nav_redirects*, the sub's ``slug`` and absolute ``href`` are
    replaced with the canonical path's segments. Subs with empty slugs are
    dropped from the rebuilt list.

    Args:
        nav: Parsed navigation, ``{"cats": {label: {"slug", "subs"}}}``.
        nav_redirects: ``{old_path: canonical_path}`` mapping; both sides
            are of the form ``/top/sub``.
    """
    # Guard clause instead of wrapping the whole body in an `if`.
    if not nav_redirects:
        return
    # Labels are irrelevant here; iterate category data directly.
    for data in (nav.get("cats") or {}).values():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        new_subs = []
        for s in data.get("subs") or []:
            old_sub = (s.get("slug") or "").strip()
            if not old_sub:
                continue
            old_path = f"/{top_slug}/{old_sub}"
            canonical_path = nav_redirects.get(old_path, old_path)
            parts = [x for x in canonical_path.split("/") if x]
            top2, sub2 = parts[0], parts[1]
            s["slug"] = sub2
            s["href"] = urljoin(config()["base_url"], f"/{top2}/{sub2}")
            new_subs.append(s)
        data["subs"] = new_subs

View File

@@ -0,0 +1,16 @@
from typing import Dict, Set
# Build the set of valid sub-category slugs for each top-level slug in nav.
def valid_subs(nav: Dict[str, Dict]) -> Dict[str, Set[str]]:
    """Build ``{top_slug: {sub_slug, ...}}`` from the scraped nav structure.

    Args:
        nav: Parsed navigation, expected shape
            ``{"cats": {label: {"slug": str, "subs": [{"slug": str}, ...]}}}``.

    Returns:
        Mapping of each category's top slug to its set of non-empty
        (stripped) sub slugs. Categories without a top slug are skipped.
    """
    valid_subs_by_top: Dict[str, Set[str]] = {}
    # Labels are irrelevant; only each category's slug data matters.
    for data in (nav.get("cats") or {}).values():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        # Strip before filtering: the original checked truthiness of the
        # raw slug, so a whitespace-only slug registered an empty-string
        # sub in the set.
        stripped = ((s.get("slug") or "").strip() for s in (data.get("subs") or []))
        valid_subs_by_top[top_slug] = {slug for slug in stripped if slug}
    return valid_subs_by_top