from __future__ import annotations import json, re from typing import Dict, List from bs4 import BeautifulSoup from ..registry import extractor from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename @extractor def ex_images(soup: BeautifulSoup, url: str) -> Dict: images: List[str] = [] debug = False # set True while debugging # 1) Magento init script (gallery) scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"}) if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}") for script in scripts: # Use raw string as-is; no stripping/collapsing text = script.string or script.get_text() or "" if "mage/gallery/gallery" not in text: continue # Correct (not over-escaped) patterns: m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text) if not m: if debug: print("[ex_images] 'data' array not found in gallery block") continue arr_txt = m.group(1) added = False try: data = json.loads(arr_txt) for entry in data: u = abs_url(entry.get("full")) or abs_url(entry.get("img")) if u: images.append(u); added = True except Exception as e: if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback") # Fallback to simple key extraction fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt) imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else [] for u in (fulls or imgs): u = abs_url(u) if u: images.append(u); added = True if added: break # got what we need from the gallery block # 2) JSON-LD fallback if not images: for script in soup.find_all("script", attrs={"type": "application/ld+json"}): raw = script.string or script.get_text() or "" try: data = json.loads(raw) except Exception: continue def add_from(val): if isinstance(val, str): u = abs_url(val); u and images.append(u) elif isinstance(val, list): for v in val: if isinstance(v, str): u = abs_url(v); u and images.append(u) elif isinstance(v, dict) and "url" in v: u = abs_url(v["url"]); u and images.append(u) elif isinstance(val, dict) and "url" in val: u = abs_url(val["url"]); u and images.append(u) if isinstance(data, dict) and "image" in data: add_from(data["image"]) if isinstance(data, list): for item in data: if isinstance(item, dict) and "image" in item: add_from(item["image"]) # 3) Generic DOM scan fallback if not images: # consider broadening selectors if needed, e.g. '.fotorama__img' for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"): for cand in collect_img_candidates(el): u = abs_url(cand) if u: images.append(u) images = dedup_by_filename(images) if debug: print(f"[ex_images] found images: {images}") return {"images": images, "image": images[0] if images else None}