Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
105 lines · 3.7 KiB · Python
from __future__ import annotations

import re
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

from config import config
from .http_client import fetch  # only fetch; define soup_of locally

# from .. import cache_backend as cb
# from ..blacklist.category import is_category_blocked
# Reverse map: slug -> label


# ------------------ Caches ------------------


def soup_of(html: str) -> BeautifulSoup:
    """Parse an HTML string into a BeautifulSoup tree; empty or None input yields an empty tree."""
    return BeautifulSoup(html or "", "lxml")


def normalize_text(s: str) -> str:
    """Collapse runs of whitespace into single spaces and strip leading/trailing space."""
    return re.sub(r"\s+", " ", (s or "").strip())


async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the configured start page and return all (link text, absolute URL) anchor pairs on it."""
    html = await fetch(config()["base_url"])
    soup = soup_of(html)
    results: List[Tuple[str, str]] = []
    for a in soup.find_all("a", href=True):
        text = normalize_text(a.get_text())
        if not text:
            continue
        href = a["href"].strip()
        # Resolve relative links against the configured base URL.
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        # Keep only links that stay on the configured site.
        if not href.startswith(config()["base_url"]):
            continue
        results.append((text, href))
    return results


def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the second path segment of href if its first segment matches top_slug, else None."""
    p = urlparse(href)
    parts = [x for x in (p.path or "").split("/") if x]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        # Strip a trailing .html/.htm extension from the sub-category slug.
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.(html?|HTML?)$", "", sub)
        return sub
    return None


async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the nav structure {"cats": {label: {"href", "slug", "subs"}}} from the allow-listed categories."""
    nav = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                # Eager per-sub-category product scraping, currently disabled:
                # list_url = _join(config()["base_url"], f"/{slug}/{sub_slug}")
                # log(f"naving [{slug}/{sub_slug}] page 1…")
                # items, total_pages = await scrape_products(list_url, page=1)
                # for p in range(2, total_pages + 1):
                #     log(f"naving [{slug}/{sub_slug}] page {p}…")
                #     moreitems, _tp = await scrape_products(list_url, page=p)
                #     items.extend(moreitems)
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        # Eager top-level product scraping, currently disabled:
        # list_url = _join(config()["base_url"], f"/{slug}")
        # log(f"naving [{slug}] page 1…")
        # items, total_pages = await scrape_products(list_url, page=1)
        # for p in range(2, total_pages + 1):
        #     log(f"naving [{slug}] page {p}…")
        #     moreitems, _tp = await scrape_products(list_url, page=p)
        #     items.extend(moreitems)
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav


async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape the start-page anchors, group them by top-level slug, and build the filtered nav."""
    anchors = await scrape_nav_raw()
    slug_to_links: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in anchors:
        p = urlparse(href)
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        top = parts[0].lower()
        # Drop links whose top-level slug is on the configured skip list.
        if top in config()["slugs"]["skip"]:
            continue
        slug_to_links.setdefault(top, []).append((text, href))
    return await group_by_category(slug_to_links)


async def nav_scrape() -> Dict[str, Dict]:
    """Return the navigation structure.

    Note: the snapshot-when-offline fallback is not wired up here; the
    cache_backend import is commented out above, so this always scrapes live.
    """
    nav = await scrape_nav_filtered()
    return nav
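

# Minimal usage sketch, assuming config()["base_url"] points at a reachable site
# and the imported `fetch` helper performs the HTTP request: running the module
# directly prints the scraped navigation tree.
if __name__ == "__main__":
    import asyncio

    nav = asyncio.run(nav_scrape())
    for label, cat in nav["cats"].items():
        print(label, cat["href"], [s["slug"] for s in cat["subs"]])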