This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/product/extractors/breadcrumbs.py
giles 6271a715a1
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
feat: initialize market app with browsing, product, and scraping code
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00

69 lines
2.1 KiB
Python

from __future__ import annotations
from typing import Dict, List, Union
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from utils import normalize_text
from ..registry import extractor
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract the breadcrumb trail and derive the top and sub category from it.

    The breadcrumb list is the first hit among the selectors
    ``.breadcrumbs ul.items``, ``nav.breadcrumbs ul.items`` and ``ul.items``.
    Each ``li.item`` inside it is recorded only when it contains a link whose
    URL path has a non-empty final segment; that segment becomes the slug.

    Args:
        soup: Parsed page to search for breadcrumbs.
        url: Page URL. Not used by this extractor.

    Returns:
        An empty dict when no breadcrumb list is found. Otherwise a dict with:

        * ``category_breadcrumbs``: list of ``{"title", "href", "slug"}``
          dicts, in document order.
        * ``category_top_title`` / ``category_top_href`` /
          ``category_top_slug``: the first crumb that is not a "home" entry,
          when there is one.
        * ``category_sub_title`` / ``category_sub_href`` /
          ``category_sub_slug``: the second such crumb, when there is one.
        * ``category_path``: ``"<top slug>/<sub slug>"``, only when both the
          top and sub crumbs exist.
    """
    # Try the most specific selector first and stop at the first match.
    container = None
    for selector in (
        ".breadcrumbs ul.items",
        "nav.breadcrumbs ul.items",
        "ul.items",
    ):
        container = soup.select_one(selector)
        if container:
            break
    if not container:
        return {}

    trail = []
    for item in container.select("li.item"):
        anchor = item.find("a")
        if not anchor:
            # Plain-text crumb: it has no link, so it can never yield a slug.
            label = normalize_text(item.get_text())
            link = None
        else:
            label = normalize_text(anchor.get("title") or anchor.get_text())
            link = anchor.get("href")

        slug = None
        if link:
            try:
                trimmed = (urlparse(link).path or "").strip("/")
                # Last path segment, e.g. "a/b/c" -> "c".
                slug = trimmed.rpartition("/")[2] if trimmed else None
            except Exception:
                # Best effort: an unparsable href simply yields no slug.
                slug = None

        if not slug:
            continue
        trail.append({"title": label or None, "href": link or None, "slug": slug})

    def _is_home(crumb):
        # A crumb counts as "home" by its title or by its slug.
        title_lc = (crumb.get("title") or "").lower()
        slug_lc = (crumb.get("slug") or "").lower()
        return title_lc == "home" or slug_lc in ("", "home")

    # The first two linked, non-home crumbs are the top and sub category.
    picks = [c for c in trail if c.get("href") and not _is_home(c)][:2]
    top = picks[0] if picks else None
    sub = picks[1] if len(picks) > 1 else None

    result: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": trail
    }
    if top:
        result.update(
            {
                "category_top_title": top.get("title"),
                "category_top_href": top.get("href"),
                "category_top_slug": top.get("slug"),
            }
        )
    if sub:
        result.update(
            {
                "category_sub_title": sub.get("title"),
                "category_sub_href": sub.get("href"),
                "category_sub_slug": sub.get("slug"),
            }
        )
    if top and sub:
        result["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
    return result