Phase 1-3 of decoupling: - path_setup.py adds project root to sys.path - Market-owned models in market/models/ (market, market_place) - All imports updated: shared.infrastructure, shared.db, shared.browser, etc. - MarketPlace uses container_type/container_id instead of post_id FK Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
69 lines
2.1 KiB
Python
69 lines
2.1 KiB
Python
|
|
from __future__ import annotations
|
|
from typing import Dict, List, Union
|
|
from urllib.parse import urlparse
|
|
from bs4 import BeautifulSoup
|
|
from shared.utils import normalize_text
|
|
from ..registry import extractor
|
|
|
|
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract the breadcrumb trail and derive the top and sub category from it.

    The breadcrumb list is the first match among ``.breadcrumbs ul.items``,
    ``nav.breadcrumbs ul.items`` and ``ul.items``. Every ``li.item`` in it
    that contains a link whose URL path ends in a non-empty segment becomes a
    crumb with ``title``, ``href`` and ``slug`` keys; all other items are
    left out of the trail.

    The first two crumbs that are not a "home" entry (title or slug equal to
    ``home``, compared case-insensitively) are reported as the top and the
    sub category respectively.

    Args:
        soup: Parsed page to search for the breadcrumb list.
        url: Not read by this extractor.

    Returns:
        An empty dict when no breadcrumb list is found. Otherwise a dict
        holding ``category_breadcrumbs`` (the list of crumbs) and, when the
        corresponding crumb exists, ``category_top_title`` /
        ``category_top_href`` / ``category_top_slug``, the matching
        ``category_sub_*`` keys, and ``category_path``
        (``"<top slug>/<sub slug>"``, only when both categories exist).
    """
    # Try the selectors from most to least specific; keep the first hit.
    trail = None
    for selector in (
        ".breadcrumbs ul.items",
        "nav.breadcrumbs ul.items",
        "ul.items",
    ):
        trail = soup.select_one(selector)
        if trail:
            break
    if not trail:
        return {}

    crumbs = []
    for item in trail.select("li.item"):
        anchor = item.find("a")
        if anchor:
            # Prefer the link's title attribute over its visible text.
            label = normalize_text(anchor.get("title") or anchor.get_text())
            link = anchor.get("href")
        else:
            label = normalize_text(item.get_text())
            link = None

        # Without a link there is no slug, and slug-less items are not kept.
        if not link:
            continue

        try:
            trimmed = (urlparse(link).path or "").strip("/")
            last_segment = trimmed.rsplit("/", 1)[-1] if trimmed else None
        except Exception:
            # Best effort: an href that cannot be parsed yields no slug.
            last_segment = None
        if not last_segment:
            continue

        crumbs.append(
            {"title": label or None, "href": link or None, "slug": last_segment}
        )

    # Collect the first two linked crumbs that are not the "home" entry.
    chosen = []
    for crumb in crumbs:
        if not crumb.get("href"):
            continue
        title_lc = (crumb.get("title") or "").lower()
        slug_lc = (crumb.get("slug") or "").lower()
        if title_lc == "home" or slug_lc in ("", "home"):
            continue
        chosen.append(crumb)
        if len(chosen) == 2:
            break
    top = chosen[0] if chosen else None
    sub = chosen[1] if len(chosen) > 1 else None

    out: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        out.update({
            "category_top_title": top.get("title"),
            "category_top_href": top.get("href"),
            "category_top_slug": top.get("slug"),
        })
    if sub:
        out.update({
            "category_sub_title": sub.get("title"),
            "category_sub_href": sub.get("href"),
            "category_sub_slug": sub.get("slug"),
        })
    if top and sub:
        top_slug = (top.get("slug") or "").strip()
        sub_slug = (sub.get("slug") or "").strip()
        out["category_path"] = f"{top_slug}/{sub_slug}"
    return out
|