This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/product/extractors/labels.py
giles 478636f799 feat: decouple market from shared_lib, add app-owned models
Phase 1-3 of decoupling:
- path_setup.py adds project root to sys.path
- Market-owned models in market/models/ (market, market_place)
- All imports updated: shared.infrastructure, shared.db, shared.browser, etc.
- MarketPlace uses container_type/container_id instead of post_id FK

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 12:46:32 +00:00

42 lines
1.2 KiB
Python

from __future__ import annotations
from typing import Dict, List
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """
    Collect product label badges from a parsed product page.

    Expected markup:
        <ul class="cdz-product-labels">
          <li class="label-item new"><div class="label-content">NEW</div></li>
        </ul>

    For every ``li.label-item`` under the first ``ul.cdz-product-labels``,
    two kinds of hints are gathered: the CSS classes on the ``<li>`` (other
    than ``label-item`` itself) and the item's text after it has been passed
    through ``normalize_text``.

    ``url`` is accepted but not used by this extractor.

    Returns:
        ``{"labels": [...]}`` holding the lower-cased, de-duplicated hints,
        class hints first and then texts, each in first-seen order.
        ``{}`` when the list element is missing or yields no hints at all.
    """
    container = soup.select_one("ul.cdz-product-labels")
    if not container:
        return {}

    class_hints: List[str] = []
    visible_texts: List[str] = []
    for entry in container.select("li.label-item"):
        for raw_class in (entry.get("class") or []):
            hint = (raw_class or "").strip()
            # The selector class itself carries no information.
            if not hint or hint.lower() == "label-item":
                continue
            if hint not in class_hints:
                class_hints.append(hint)

        text = normalize_text(entry.get_text())
        if text and text not in visible_texts:
            visible_texts.append(text)

    if not (class_hints or visible_texts):
        return {}

    # Normalise every candidate to a stripped, lower-cased key; class hints
    # are placed ahead of texts so they win the first-seen position.
    candidates = class_hints + [text.lower() for text in visible_texts]
    keys = [(value or "").strip().lower() for value in candidates]

    # dict.fromkeys keeps insertion order, giving an ordered de-duplication.
    labels = list(dict.fromkeys(key for key in keys if key))
    return {"labels": labels}