# market/scrape/product/extractors/breadcrumbs.py


from __future__ import annotations
from typing import Dict, List, Union
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from utils import normalize_text
from ..registry import extractor

def _slug_from_href(href: Union[str, None]) -> Union[str, None]:
    """Return the last path segment of *href*, or ``None`` if there is none."""
    if not isinstance(href, str):
        return None
    # Attribute values may be padded with whitespace; without stripping, an
    # href such as " /women/ " would produce the junk slug " ".
    cleaned = href.strip()
    if not cleaned:
        return None
    try:
        path = urlparse(cleaned).path
    except ValueError:
        # urlparse rejects malformed authorities (e.g. a broken IPv6 literal);
        # such a link simply has no usable slug.
        return None
    # After trimming the slashes, an empty path (e.g. a bare site root)
    # leaves an empty segment, which is reported as None.
    return (path or "").strip("/").rsplit("/", 1)[-1] or None


@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse breadcrumbs to identify top and sub categories.

    The breadcrumb list is the first ``ul.items`` found inside an element
    with class ``breadcrumbs``, falling back to the first ``ul.items``
    anywhere in the document. Only ``li.item`` entries that contain a link
    whose URL path has a non-empty last segment are recorded; entries
    without a link, or linking to a bare root path, are dropped.

    Args:
        soup: Parsed product page.
        url: Not used by this extractor (presumably part of the common
            extractor signature -- confirm against the registry).

    Returns:
        ``{}`` when no breadcrumb list is found. Otherwise a dict with:

        * ``category_breadcrumbs``: list of ``{"title", "href", "slug"}``
          dicts in document order (``title`` is ``None`` when empty).
        * ``category_top_title`` / ``_href`` / ``_slug``: first recorded
          crumb whose title and slug are not ``"home"`` (case-insensitive),
          if any.
        * ``category_sub_title`` / ``_href`` / ``_slug``: second such
          crumb, if any.
        * ``category_path``: ``"<top slug>/<sub slug>"`` when both exist.
    """
    # ".breadcrumbs ul.items" already covers "nav.breadcrumbs ul.items", so
    # only the generic "ul.items" fallback is needed after it.
    bc_ul = soup.select_one(".breadcrumbs ul.items") or soup.select_one("ul.items")
    if bc_ul is None:
        return {}

    crumbs: List[Dict[str, Union[str, None]]] = []
    for li in bc_ul.select("li.item"):
        a = li.find("a")
        if not a:
            # No link means no slug, and slug-less crumbs are not recorded.
            continue
        href = a.get("href")
        slug = _slug_from_href(href)
        if not slug:
            continue
        title = normalize_text(a.get("title") or a.get_text())
        crumbs.append({"title": title or None, "href": href, "slug": slug})

    # Every recorded crumb has an href and a non-empty slug, so the only
    # entries left to skip when picking categories are "home" links.
    categories = [
        c for c in crumbs
        if (c["title"] or "").lower() != "home" and c["slug"].lower() != "home"
    ]
    top = categories[0] if categories else None
    sub = categories[1] if len(categories) > 1 else None

    out: Dict[str, Union[str, None, List[Dict[str, Union[str, None]]]]] = {
        "category_breadcrumbs": crumbs
    }
    for level, crumb in (("top", top), ("sub", sub)):
        if crumb is None:
            continue
        for field in ("title", "href", "slug"):
            out[f"category_{level}_{field}"] = crumb[field]
    if top is not None and sub is not None:
        out["category_path"] = f"{top['slug'].strip()}/{sub['slug'].strip()}"
    return out