feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
This commit is contained in:
68
scrape/product/extractors/breadcrumbs.py
Normal file
68
scrape/product/extractors/breadcrumbs.py
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Union
|
||||
from urllib.parse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
|
||||
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
|
||||
"""
|
||||
Parse breadcrumbs to identify top and sub categories.
|
||||
"""
|
||||
bc_ul = (soup.select_one(".breadcrumbs ul.items")
|
||||
or soup.select_one("nav.breadcrumbs ul.items")
|
||||
or soup.select_one("ul.items"))
|
||||
if not bc_ul:
|
||||
return {}
|
||||
|
||||
crumbs = []
|
||||
for li in bc_ul.select("li.item"):
|
||||
a = li.find("a")
|
||||
if a:
|
||||
title = normalize_text(a.get("title") or a.get_text())
|
||||
href = a.get("href")
|
||||
else:
|
||||
title = normalize_text(li.get_text())
|
||||
href = None
|
||||
slug = None
|
||||
if href:
|
||||
try:
|
||||
p = urlparse(href)
|
||||
path = (p.path or "").strip("/")
|
||||
slug = path.split("/")[-1] if path else None
|
||||
except Exception:
|
||||
slug = None
|
||||
if slug:
|
||||
crumbs.append({"title": title or None, "href": href or None, "slug": slug})
|
||||
|
||||
category_links = [c for c in crumbs if c.get("href")]
|
||||
top = None
|
||||
sub = None
|
||||
for c in category_links:
|
||||
t = (c.get("title") or "").lower()
|
||||
s = (c.get("slug") or "").lower()
|
||||
if t == "home" or s in ("", "home"):
|
||||
continue
|
||||
if top is None:
|
||||
top = c
|
||||
continue
|
||||
if sub is None:
|
||||
sub = c
|
||||
break
|
||||
|
||||
out: Dict[str, Union[str, List[Dict[str, str]]]] = {
|
||||
"category_breadcrumbs": crumbs
|
||||
}
|
||||
if top:
|
||||
out["category_top_title"] = top.get("title")
|
||||
out["category_top_href"] = top.get("href")
|
||||
out["category_top_slug"] = top.get("slug")
|
||||
if sub:
|
||||
out["category_sub_title"] = sub.get("title")
|
||||
out["category_sub_href"] = sub.get("href")
|
||||
out["category_sub_slug"] = sub.get("slug")
|
||||
if top and sub:
|
||||
out["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
|
||||
return out
|
||||
Reference in New Issue
Block a user