from __future__ import annotations

from typing import Dict, List, Union
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from utils import normalize_text
from ..registry import extractor


def _last_path_segment(href):
    """Return the final non-empty path segment of *href*, or ``None``.

    ``None`` is returned when *href* is falsy, when its path is empty or
    consists only of slashes, or when parsing fails for any reason.
    """
    if not href:
        return None
    try:
        trimmed = (urlparse(href).path or "").strip("/")
        return trimmed.rsplit("/", 1)[-1] if trimmed else None
    except Exception:
        # Best effort: an unparsable link simply yields no slug.
        return None


def _is_home_crumb(crumb):
    """Tell whether *crumb* is the "home" entry (by title or by slug)."""
    lowered_title = (crumb.get("title") or "").lower()
    lowered_slug = (crumb.get("slug") or "").lower()
    return lowered_title == "home" or lowered_slug in ("", "home")


@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse breadcrumbs to identify top and sub categories.

    The breadcrumb list is looked up with ``.breadcrumbs ul.items``,
    then ``nav.breadcrumbs ul.items``, then ``ul.items``; the first match
    wins. If nothing matches, an empty dict is returned.

    Every ``li.item`` inside that list contributes a crumb only when it
    holds a link whose URL path has a non-empty last segment (the slug).
    The crumb title is the link's ``title`` attribute or, failing that,
    its text, passed through ``normalize_text``.

    The first two crumbs that are not "home" become the top and sub
    category respectively.

    Args:
        soup: Parsed document to search.
        url: Not used by this extractor.

    Returns:
        A dict that always has ``category_breadcrumbs`` (list of
        ``{"title", "href", "slug"}`` dicts) and, when found,
        ``category_top_*`` / ``category_sub_*`` keys plus
        ``category_path`` (``"<top slug>/<sub slug>"``).
    """
    breadcrumb_list = None
    for selector in (
        ".breadcrumbs ul.items",
        "nav.breadcrumbs ul.items",
        "ul.items",
    ):
        breadcrumb_list = soup.select_one(selector)
        if breadcrumb_list:
            break
    if not breadcrumb_list:
        return {}

    crumbs = []
    for item in breadcrumb_list.select("li.item"):
        anchor = item.find("a")
        if anchor:
            raw_title = anchor.get("title") or anchor.get_text()
            href = anchor.get("href")
        else:
            raw_title = item.get_text()
            href = None
        title = normalize_text(raw_title)

        slug = _last_path_segment(href)
        if not slug:
            # Items without a usable link path are not recorded.
            continue
        crumbs.append(
            {"title": title or None, "href": href or None, "slug": slug}
        )

    # Lazily walk the linked, non-home crumbs; the first two are the
    # top-level category and its sub-category.
    candidates = (
        crumb
        for crumb in crumbs
        if crumb.get("href") and not _is_home_crumb(crumb)
    )
    top = next(candidates, None)
    sub = next(candidates, None)

    out: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        out.update(
            {
                "category_top_title": top.get("title"),
                "category_top_href": top.get("href"),
                "category_top_slug": top.get("slug"),
            }
        )
    if sub:
        out.update(
            {
                "category_sub_title": sub.get("title"),
                "category_sub_href": sub.get("href"),
                "category_sub_slug": sub.get("slug"),
            }
        )
    if top and sub:
        top_slug = (top.get("slug") or "").strip()
        sub_slug = (sub.get("slug") or "").strip()
        out["category_path"] = "/".join((top_slug, sub_slug))
    return out