Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
54 lines
1.4 KiB
Python
54 lines
1.4 KiB
Python
|
|
from __future__ import annotations
|
|
from typing import List, Optional
|
|
from urllib.parse import urljoin, urlparse
|
|
from config import config
|
|
|
|
def first_from_srcset(val: str) -> Optional[str]:
    """Return the URL of the first image candidate in a ``srcset`` value.

    Follows the HTML ``srcset`` parsing rule for locating a candidate URL:
    leading whitespace and commas are skipped, the URL is the following run
    of non-whitespace characters, and any commas trailing that run are
    separators rather than part of the URL.  Commas *inside* a URL (common
    in CDN transform paths such as ``/w_100,h_100/a.jpg``) are therefore
    preserved instead of truncating the URL.

    Args:
        val: Raw ``srcset``-style attribute value; may be empty or ``None``.

    Returns:
        The first candidate URL, or ``None`` when ``val`` is empty or holds
        nothing but whitespace and commas.
    """
    if not val:
        return None
    # Skip leading separators, then split off the first whitespace-delimited
    # token; descriptors such as "1x" / "480w" land in the discarded rest.
    tokens = val.lstrip(", \t\n\r\f").split(None, 1)
    if not tokens:
        return None
    # A candidate without a descriptor ends directly in its separator
    # ("a.jpg, b.jpg 2x"), so drop trailing commas from the token.
    url = tokens[0].rstrip(",")
    return url or None
|
|
|
|
def abs_url(u: Optional[str]) -> Optional[str]:
    """Make a URL that starts with ``/`` absolute using the configured base.

    Args:
        u: Candidate URL, possibly empty or ``None``.

    Returns:
        ``None`` for a falsy ``u``; ``urljoin(config()["base_url"], u)`` when
        ``u`` is a string beginning with ``/``; otherwise ``u`` unchanged.
    """
    if not u:
        return None
    # Only strings beginning with "/" are resolved; everything else
    # (already-absolute URLs, other relative forms, non-strings) passes through.
    if not isinstance(u, str) or not u.startswith("/"):
        return u
    return urljoin(config()["base_url"], u)
|
|
|
|
def collect_img_candidates(el) -> List[str]:
    """Gather candidate image URLs from the attributes of ``el``.

    Args:
        el: Object exposing ``.get(name)`` for attribute lookup (presumably a
            parsed HTML element -- confirm against callers).  A falsy ``el``
            yields an empty list.

    Returns:
        Non-empty values of the plain URL attributes, in the fixed order
        listed below, followed by the first URL of ``srcset`` and then
        ``data-srcset`` when present.  Values are returned as found; no
        normalisation or de-duplication happens here.
    """
    if not el:
        return []

    plain_attrs = (
        "src",
        "data-src",
        "data-original",
        "data-zoom-image",
        "data-thumb",
        "content",
        "href",
    )
    looked_up = (el.get(name) for name in plain_attrs)
    candidates: List[str] = [value for value in looked_up if value]

    # srcset-style attributes carry several candidates; only the first is kept.
    for name in ("srcset", "data-srcset"):
        raw = el.get(name)
        head = first_from_srcset(raw) if raw else None
        if head:
            candidates.append(head)

    return candidates
|
|
|
|
def _filename_key(u: str) -> str:
    """Build a case-insensitive ``"<netloc>:<last path segment>"`` key for ``u``.

    A single trailing ``/`` on the path is dropped before the last segment is
    taken.  Query string, fragment and leading directories do not take part
    in the key.
    """
    parsed = urlparse(u)
    trimmed = parsed.path or ""
    if trimmed.endswith("/"):
        # Drop exactly one trailing slash so "/a/b/" keys on "b".
        trimmed = trimmed[:-1]
    basename = trimmed.rsplit("/", 1)[-1]
    return "{}:{}".format(parsed.netloc, basename).lower()
|
|
|
|
def dedup_by_filename(urls: List[str]) -> List[str]:
    """Drop URLs whose ``_filename_key`` has already been seen.

    Args:
        urls: URLs in priority order.

    Returns:
        A new list with the first URL for each distinct key, in the order
        those first occurrences appear in ``urls``.
    """
    first_by_key = {}
    for url in urls:
        # setdefault keeps the earliest URL per key; dict order is insertion
        # order, so the original ordering of first occurrences is kept.
        first_by_key.setdefault(_filename_key(url), url)
    return list(first_by_key.values())
|