feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
This commit is contained in:
41
scrape/product/extractors/labels.py
Normal file
41
scrape/product/extractors/labels.py
Normal file
@@ -0,0 +1,41 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract the product label badges (e.g. "NEW") from a product page.

    From:
      <ul class="cdz-product-labels">
        <li class="label-item new"><div class="label-content">NEW</div></li>
      </ul>

    Every ``li.label-item`` contributes two kinds of hints:

    * its extra CSS classes (everything except ``label-item`` itself), and
    * its visible text, passed through ``normalize_text``.

    All class hints come first, followed by all visible texts, each group in
    document order.  Values are stripped and lower-cased, and duplicates are
    dropped case-insensitively, keeping the first occurrence.

    Args:
        soup: Parsed product page.
        url: Page URL; not used by this extractor.

    Returns:
        ``{"labels": [...]}`` with the lower-cased union of class hints and
        visible text, or ``{}`` when the list is missing or yields no label.
    """
    root = soup.select_one("ul.cdz-product-labels")
    if root is None:
        return {}

    def _norm(raw) -> str:
        # Tolerate None / empty values coming from attributes or the helper.
        return (raw or "").strip().lower()

    class_hints: List[str] = []
    texts: List[str] = []
    for li in root.select("li.label-item"):
        classes = li.get("class") or []
        if isinstance(classes, str):
            # Beautiful Soup returns a plain string instead of a list when
            # multi-valued attributes are disabled (XML parsers,
            # multi_valued_attributes=None); split it so the class names are
            # not iterated character by character.
            classes = classes.split()
        class_hints.extend(classes)
        texts.append(normalize_text(li.get_text()))

    # Only class hints drop the structural "label-item" marker; visible text
    # is kept verbatim (after normalisation) even if it happens to match.
    candidates = [key for key in map(_norm, class_hints) if key != "label-item"]
    candidates.extend(map(_norm, texts))

    # dict.fromkeys keeps first-seen order, giving an ordered, O(n) de-dup.
    labels = list(dict.fromkeys(key for key in candidates if key))
    if not labels:
        return {}
    return {"labels": labels}
|
||||
Reference in New Issue
Block a user