This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/product/helpers/html.py
giles 478636f799 feat: decouple market from shared_lib, add app-owned models
Phase 1-3 of decoupling:
- path_setup.py adds project root to sys.path
- Market-owned models in market/models/ (market, market_place)
- All imports updated: shared.infrastructure, shared.db, shared.browser, etc.
- MarketPlace uses container_type/container_id instead of post_id FK

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 12:46:32 +00:00

54 lines
1.4 KiB
Python

from __future__ import annotations
from typing import List, Optional
from urllib.parse import urljoin, urlparse
from shared.config import config
def first_from_srcset(val: str) -> Optional[str]:
    """Return the URL of the first usable candidate in a ``srcset`` value.

    A ``srcset`` value is treated as a comma-separated list of candidates,
    each being a URL optionally followed by whitespace and a descriptor
    such as ``2x`` or ``640w``. Only the URL part is returned.

    Empty candidates (leading comma, doubled commas, stray whitespace) are
    skipped, so the result is either a non-empty URL or ``None`` -- never an
    empty string.

    Note: the value is split on every comma, so a URL that itself contains
    a comma (for example a ``data:`` URI) is cut at that comma.

    Args:
        val: Raw attribute value; may be empty or ``None``.

    Returns:
        The first candidate URL, or ``None`` when there is none.
    """
    if not val:
        return None
    for candidate in val.split(","):
        # The first whitespace-delimited token is the URL; anything after
        # it is the width/density descriptor.
        tokens = candidate.split()
        if tokens:
            return tokens[0]
    return None
def abs_url(u: Optional[str]) -> Optional[str]:
    """Make a slash-prefixed URL absolute using the configured base URL.

    Args:
        u: A URL or path; may be empty or ``None``.

    Returns:
        ``None`` for a falsy input; the input joined onto
        ``config()["base_url"]`` when it is a string starting with ``/``;
        otherwise the input unchanged.
    """
    if not u:
        return None
    # Only strings that begin with "/" are resolved; everything else
    # (already-absolute URLs, page-relative paths) is passed through as is.
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
def collect_img_candidates(el) -> List[str]:
    """Gather possible image URLs from an element's attributes.

    Args:
        el: Object exposing ``.get(name)`` for attribute lookup
            (presumably a parsed HTML element -- confirm against callers).
            A falsy value yields an empty list.

    Returns:
        Non-empty values of the plain URL attributes, in the fixed order
        listed below, followed by the first candidate URL of ``srcset``
        and then ``data-srcset`` when those yield one.
    """
    if not el:
        return []

    plain_attrs = (
        "src",
        "data-src",
        "data-original",
        "data-zoom-image",
        "data-thumb",
        "content",
        "href",
    )
    looked_up = [el.get(name) for name in plain_attrs]
    found: List[str] = [value for value in looked_up if value]

    # srcset-style attributes hold several candidates; keep only the first.
    for name in ("srcset", "data-srcset"):
        raw = el.get(name)
        if not raw:
            continue
        head = first_from_srcset(raw)
        if head:
            found.append(head)
    return found
def _filename_key(u: str) -> str:
    """Build a lowercase ``"<netloc>:<last path segment>"`` key for a URL.

    The query string and fragment are not part of the key, and a single
    trailing slash on the path is dropped before the last segment is taken.
    """
    parsed = urlparse(u)
    path = parsed.path or ""
    trimmed = path[:-1] if path.endswith("/") else path
    basename = trimmed.rsplit("/", 1)[-1]
    return (parsed.netloc + ":" + basename).lower()
def dedup_by_filename(urls: List[str]) -> List[str]:
    """Drop URLs whose host and file name repeat an earlier entry.

    Two URLs count as duplicates when ``_filename_key`` gives them the same
    key. The first URL seen for each key is kept, and the relative order of
    the kept URLs is preserved.

    Args:
        urls: URLs to filter.

    Returns:
        A new list containing the first URL for each distinct key.
    """
    # Dicts keep insertion order, and setdefault never overwrites, so each
    # key maps to the first URL that produced it.
    first_by_key = {}
    for url in urls:
        first_by_key.setdefault(_filename_key(url), url)
    return list(first_by_key.values())