from __future__ import annotations

import re
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

from shared.config import config
from .http_client import fetch  # only fetch; define soup_of locally
#from .. import cache_backend as cb
#from ..blacklist.category import is_category_blocked


# ------------------ Helpers ------------------

def soup_of(html: str) -> BeautifulSoup:
    """Parse HTML into a BeautifulSoup tree (lxml parser), tolerating None/empty input."""
    return BeautifulSoup(html or "", "lxml")


def normalize_text(s: str) -> str:
    """Collapse runs of whitespace to single spaces and trim the ends."""
    return re.sub(r"\s+", " ", (s or "").strip())


async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the site root and return all same-site anchors as (text, absolute href) pairs."""
    html = await fetch(config()["base_url"])
    soup = soup_of(html)
    results: List[Tuple[str, str]] = []
    for a in soup.find_all("a", href=True):
        text = normalize_text(a.get_text())
        if not text:
            continue
        # Resolve relative hrefs against the base URL; absolute URLs pass through unchanged.
        href = urljoin(config()["base_url"], a["href"].strip())
        if not href.startswith(config()["base_url"]):
            continue  # skip off-site links
        results.append((text, href))
    return results


def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the second path segment of href when the first segment matches top_slug."""
    p = urlparse(href)
    parts = [x for x in (p.path or "").split("/") if x]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.html?$", "", sub, flags=re.IGNORECASE)
        return sub
    return None


async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the nav structure for each allowed category from the grouped anchors."""
    nav = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        seen: set[str] = set()
        # slug_to_links is keyed by lower-cased top segments (see scrape_nav_filtered).
        for text, href in slug_to_links.get(slug.lower(), []):
            sub_slug = extract_sub_slug(href, slug)
            # Nav links often appear in both the header and footer; keep the first occurrence.
            if sub_slug and sub_slug not in seen:
                # (An earlier revision also scraped product listings for each
                #  sub-category here via scrape_products(); that code is disabled.)
                seen.add(sub_slug)
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        # (Top-level product scraping via scrape_products() was likewise disabled here.)
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav


async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape the raw nav anchors, group them by top-level slug, and build the category tree."""
    anchors = await scrape_nav_raw()
    slug_to_links: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in anchors:
        p = urlparse(href)
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        top = parts[0].lower()
        if top in config()["slugs"]["skip"]:
            continue
        slug_to_links.setdefault(top, []).append((text, href))
    return await group_by_category(slug_to_links)


async def nav_scrape() -> Dict[str, Dict]:
    """Return the navigation structure scraped from the live site."""
    # TODO: fall back to a cached snapshot when offline (see the commented-out
    # cache_backend import at the top of this module).
    return await scrape_nav_filtered()
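

# --- Usage sketch (illustrative only, not part of the scraper API) ---
# A minimal way to exercise nav_scrape() from the command line, assuming the
# shared config is already initialised and the target site is reachable.
# The JSON pretty-printing below is purely for inspection.
if __name__ == "__main__":
    import asyncio
    import json

    async def _demo() -> None:
        nav = await nav_scrape()
        print(json.dumps(nav, indent=2, ensure_ascii=False))

    asyncio.run(_demo())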