Files
rose-ash/market/scrape/nav.py
giles f42042ccb7
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 1m5s
Monorepo: consolidate 7 repos into one
Combines shared, blog, market, cart, events, federation, and account
into a single repository. Eliminates submodule sync, sibling model
copying at build time, and per-app CI orchestration.

Changes:
- Remove per-app .git, .gitmodules, .gitea, submodule shared/ dirs
- Remove stale sibling model copies from each app
- Update all 6 Dockerfiles for monorepo build context (root = .)
- Add build directives to docker-compose.yml
- Add single .gitea/workflows/ci.yml with change detection
- Add .dockerignore for monorepo build context
- Create __init__.py for federation and account (cross-app imports)
2026-02-24 19:44:17 +00:00

105 lines
3.7 KiB
Python

from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from shared.config import config
from .http_client import fetch # only fetch; define soup_of locally
#from .. import cache_backend as cb
#from ..blacklist.category import is_category_blocked
# ------------------ Helpers ------------------
def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* into a BeautifulSoup tree with the lxml parser.

    A falsy value (None or "") is treated as an empty document.
    """
    document = html if html else ""
    return BeautifulSoup(document, "lxml")
def normalize_text(s: str) -> str:
    """Trim *s* and collapse every internal whitespace run to one space.

    None and "" both normalize to "".
    """
    stripped = (s or "").strip()
    return re.sub(r"\s+", " ", stripped)
async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the landing page and return its on-site anchors.

    Returns:
        List of (display_text, absolute_href) tuples for every anchor
        whose resolved URL starts with the configured base_url. Anchors
        with empty text and off-site links are skipped.
    """
    # Hoisted: base_url is invariant for the whole scrape, but was being
    # re-read from config() up to twice per anchor inside the loop.
    base_url = config()["base_url"]
    html = await fetch(base_url)
    soup = soup_of(html)
    results: List[Tuple[str, str]] = []
    for a in soup.find_all("a", href=True):
        text = normalize_text(a.get_text())
        if not text:
            continue
        href = a["href"].strip()
        if href.startswith("/"):
            # Resolve site-relative paths against the configured base.
            href = urljoin(base_url, href)
        if not href.startswith(base_url):
            continue  # external (or schemeless) link — not part of our nav
        results.append((text, href))
    return results
def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the sub-category slug of *href* under *top_slug*, else None.

    Expects paths shaped like /<top_slug>/<sub>[.html]; comparison of the
    top segment against *top_slug* is case-insensitive. Any ``.html`` /
    ``.htm`` extension is stripped from the returned sub-slug.

    Bug fix: the guard accepted any case (``sub.lower().endswith``) but the
    old strip pattern ``\.(html?|HTML?)$`` only matched all-lower or
    all-upper spellings, so mixed-case extensions like ``.Html`` survived.
    The extension is now stripped case-insensitively.
    """
    parts = [seg for seg in (urlparse(href).path or "").split("/") if seg]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.html?$", "", sub, flags=re.IGNORECASE)
        return sub
    return None
async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the navigation tree from anchors pre-bucketed by top slug.

    Args:
        slug_to_links: mapping of top-level slug -> [(text, href), ...].

    Returns:
        {"cats": {label: {"href": ..., "slug": ..., "subs": [...]}}} for
        every allowed category in config; "subs" entries are sorted by
        display name (case-insensitive).

    Cleanup: removed the large commented-out product-scraping loops that
    used to sit here — they referenced helpers (``_join``, ``log``,
    ``scrape_products``) not imported by this module. Also hoisted the
    loop-invariant config() lookups.
    """
    cfg = config()
    base_url = cfg["base_url"]
    nav: Dict[str, Dict] = {"cats": {}}
    for label, slug in cfg["categories"]["allow"].items():
        top_href = urljoin(base_url, f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape the landing page and return the grouped navigation tree.

    Buckets raw anchors by their first path segment (lowercased), drops
    segments listed in config()["slugs"]["skip"], then delegates grouping
    to group_by_category().
    """
    anchors = await scrape_nav_raw()
    # Hoisted: the skip-list is invariant, but was re-read from config()
    # on every anchor.
    skip = config()["slugs"]["skip"]
    slug_to_links: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in anchors:
        parts = [seg for seg in (urlparse(href).path or "").split("/") if seg]
        if not parts:
            continue  # bare base_url link — no category to bucket under
        top = parts[0].lower()
        if top in skip:
            continue
        slug_to_links.setdefault(top, []).append((text, href))
    return await group_by_category(slug_to_links)
async def nav_scrape() -> Dict[str, Dict]:
    """Return the scraped navigation structure.

    Thin public entry point that delegates to scrape_nav_filtered().

    NOTE(review): the previous docstring claimed "use snapshot when
    offline", but no snapshot/fallback logic exists here — a failed fetch
    propagates to the caller. Docstring corrected to match behavior.
    """
    return await scrape_nav_filtered()