"""Helpers for gathering, resolving and de-duplicating image URL candidates."""

from __future__ import annotations

from typing import List, Optional
from urllib.parse import urljoin, urlparse

from config import config


def first_from_srcset(val: str) -> Optional[str]:
    """Return the URL of the first non-empty candidate in a ``srcset`` value.

    The value is split on commas into candidates; within a candidate the first
    whitespace-separated token is taken as the URL and any descriptor after it
    (``2x``, ``640w``, ...) is dropped.

    Args:
        val: Raw ``srcset``-style attribute value.

    Returns:
        The first candidate URL, or ``None`` when *val* is empty or holds no
        non-blank candidate.
    """
    if not val:
        return None
    # NOTE: splitting on "," also cuts URLs that themselves contain a comma
    # (e.g. ``data:`` URIs); only the part before the comma is returned then.
    for candidate in val.split(","):
        tokens = candidate.split()
        if tokens:
            # Blank candidates (leading/doubled commas) are skipped so the
            # function never hands back an empty string.
            return tokens[0]
    return None


def abs_url(u: Optional[str]) -> Optional[str]:
    """Resolve a root-relative URL against the configured base URL.

    Args:
        u: URL to resolve; may be ``None`` or empty.

    Returns:
        ``None`` for a falsy *u*; *u* joined onto ``config()["base_url"]`` when
        it is a string starting with ``"/"``; otherwise *u* unchanged (values
        that do not start with ``"/"`` are passed through as-is).
    """
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u


def collect_img_candidates(el) -> List[str]:
    """Collect every image URL candidate found in an element's attributes.

    Args:
        el: Object exposing ``get(attribute_name)``, or ``None``.
            # presumably a parsed HTML element/tag; verify against callers

    Returns:
        Non-empty values of the plain URL attributes in a fixed order, followed
        by the first candidate of ``srcset`` and ``data-srcset``. Empty list
        when *el* is ``None``.
    """
    # Compare against None explicitly: ElementTree/lxml elements without
    # children evaluate as falsy, so a truthiness test would drop them.
    if el is None:
        return []

    urls: List[str] = []

    plain_attrs = (
        "src",
        "data-src",
        "data-original",
        "data-zoom-image",
        "data-thumb",
        "content",
        "href",
    )
    for attr in plain_attrs:
        value = el.get(attr)
        if value:
            urls.append(value)

    # srcset-style attributes carry several candidates; keep only the first.
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if not value:
            continue
        first = first_from_srcset(value)
        if first:
            urls.append(first)

    return urls


def _filename_key(u: str) -> str:
    """Build the de-duplication key ``"<netloc>:<last path segment>"`` for *u*.

    A single trailing ``"/"`` is removed from the path before the last segment
    is taken. Only the parsed path is used, so query string and fragment do not
    affect the key. The result is lower-cased.
    """
    parsed = urlparse(u)
    path = parsed.path or ""
    if path.endswith("/"):
        path = path[:-1]
    last_segment = path.rsplit("/", 1)[-1]
    return f"{parsed.netloc}:{last_segment}".lower()


def dedup_by_filename(urls: List[str]) -> List[str]:
    """Drop URLs whose host and file name repeat an earlier entry.

    Args:
        urls: URLs in priority order.

    Returns:
        A new list keeping, in the original order, the first URL seen for each
        key produced by ``_filename_key``.
    """
    seen = set()
    unique: List[str] = []
    for url in urls:
        key = _filename_key(url)
        if key in seen:
            continue
        seen.add(key)
        unique.append(url)
    return unique