This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/nav.py
giles 478636f799 feat: decouple market from shared_lib, add app-owned models
Phase 1-3 of decoupling:
- path_setup.py adds project root to sys.path
- Market-owned models in market/models/ (market, market_place)
- All imports updated: shared.infrastructure, shared.db, shared.browser, etc.
- MarketPlace uses container_type/container_id instead of post_id FK

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 12:46:32 +00:00

105 lines
3.7 KiB
Python

from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from shared.config import config
from .http_client import fetch # only fetch; define soup_of locally
#from .. import cache_backend as cb
#from ..blacklist.category import is_category_blocked # Reverse map: slug -> label
# ------------------ Helpers ------------------
def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* into a BeautifulSoup tree using the lxml parser.

    A ``None`` or empty input yields an empty (but valid) document.
    """
    markup = html if html else ""
    return BeautifulSoup(markup, "lxml")
def normalize_text(s: str) -> str:
    """Collapse all internal whitespace runs to single spaces and trim the ends.

    ``None`` is treated as the empty string.
    """
    stripped = (s or "").strip()
    return re.sub(r"\s+", " ", stripped)
async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the site homepage and collect every same-site anchor.

    Returns:
        A list of ``(link_text, absolute_href)`` tuples for anchors whose
        resolved URL starts with the configured ``base_url``. Anchors with
        empty (whitespace-only) text are skipped.
    """
    # Hoist the config lookup: the original re-read config() for every
    # anchor, up to three times per loop iteration.
    base_url = config()["base_url"]
    html = await fetch(base_url)
    soup = soup_of(html)
    results: List[Tuple[str, str]] = []
    for a in soup.find_all("a", href=True):
        text = normalize_text(a.get_text())
        if not text:
            continue
        href = a["href"].strip()
        # Resolve root-relative links against the base URL.
        if href.startswith("/"):
            href = urljoin(base_url, href)
        # Drop off-site (and scheme-relative / protocol-less) links.
        if not href.startswith(base_url):
            continue
        results.append((text, href))
    return results
def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Extract the second path segment of *href* when the first matches *top_slug*.

    A trailing ``.html`` / ``.htm`` extension (any case) is stripped from the
    returned slug. Returns ``None`` when the path has fewer than two segments
    or its first segment does not match *top_slug* (case-insensitively).
    """
    p = urlparse(href)
    parts = [x for x in (p.path or "").split("/") if x]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        # Bug fix: the old pattern (html?|HTML?) only matched all-lower or
        # all-upper extensions, so mixed-case suffixes like ".Html" passed the
        # endswith() guard below but were never stripped. IGNORECASE covers
        # every casing the guard accepts.
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.html?$", "", sub, flags=re.IGNORECASE)
        return sub
    return None
async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the navigation tree for the configured allow-listed categories.

    Args:
        slug_to_links: Mapping of top-level slug to the ``(text, href)``
            anchors found under it.

    Returns:
        ``{"cats": {label: {"href", "slug", "subs"}}}`` where ``subs`` is a
        name-sorted list of ``{"name", "href", "slug"}`` sub-category dicts.

    NOTE: declared ``async`` for interface compatibility with callers even
    though it currently awaits nothing (product scraping here was removed).
    """
    # Removed ~18 lines of commented-out product-scraping code that obscured
    # the grouping logic; hoisted the invariant base_url out of the loop.
    base_url = config()["base_url"]
    nav: Dict[str, Dict] = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(base_url, f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape homepage anchors, drop skip-listed top slugs, and group by category."""
    grouped: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in await scrape_nav_raw():
        path = urlparse(href).path or ""
        segments = [seg for seg in path.split("/") if seg]
        if not segments:
            continue
        top = segments[0].lower()
        # Skip-listed top-level slugs are excluded entirely.
        if top in config()["slugs"]["skip"]:
            continue
        grouped.setdefault(top, []).append((text, href))
    return await group_by_category(grouped)
async def nav_scrape() -> Dict[str, Dict]:
    """Top-level entry point: scrape and return the filtered navigation tree.

    NOTE(review): a previous docstring claimed "use snapshot when offline";
    no snapshot fallback exists in this function — it simply delegates.
    """
    return await scrape_nav_filtered()