# market/scrape/product/extractors/images.py

from __future__ import annotations

import json
import logging
import re
from typing import Dict, List

from bs4 import BeautifulSoup

from ..registry import extractor
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename

logger = logging.getLogger(__name__)

# Finds the gallery widget's `"data": [...]` value. Group 1 starts at the
# opening bracket; its lazy end (first `]`) is only trusted by the regex
# fallback, the JSON decoder finds the real end of the array itself.
_GALLERY_DATA_RE = re.compile(r'"data"\s*:\s*(\[[\s\S]*?\])')
_FULL_KEY_RE = re.compile(r'"full"\s*:\s*"([^"]+)"')
_IMG_KEY_RE = re.compile(r'"img"\s*:\s*"([^"]+)"')

# Selectors used by the last-resort DOM scan.
# Consider broadening if needed, e.g. '.fotorama__img'.
_DOM_IMG_SELECTOR = ".product.media img, .gallery-placeholder img, .fotorama__stage img"

# Everything the json module can raise on bad page content: JSONDecodeError
# is a ValueError, and pathologically nested documents raise RecursionError.
_JSON_ERRORS = (ValueError, RecursionError)


def _unescape_json_string(raw: str) -> str:
    """Decode JSON string escapes in a value captured by regex from raw JSON text."""
    # Magento writes URLs with escaped slashes (https:\/\/host\/...); a value
    # lifted out with a regex still carries those backslashes.
    try:
        return json.loads(f'"{raw}"')
    except _JSON_ERRORS:
        # Not a well-formed JSON string body; at least undo the slash escaping.
        return raw.replace("\\/", "/")


def _gallery_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs from the first usable Magento gallery init script."""
    decoder = json.JSONDecoder()
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    logger.debug("[ex_images] x-magento-init scripts: %d", len(scripts))

    for script in scripts:
        # Use the raw script text as-is; no stripping/collapsing.
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        match = _GALLERY_DATA_RE.search(text)
        if not match:
            logger.debug("[ex_images] 'data' array not found in gallery block")
            continue

        found: List[str] = []
        try:
            # raw_decode parses exactly one JSON value and ignores what
            # follows, so brackets inside captions or nested values no longer
            # truncate the array the way a lazy regex match does.
            entries, _ = decoder.raw_decode(text[match.start(1):])
        except _JSON_ERRORS as exc:
            logger.debug(
                "[ex_images] gallery JSON decode failed: %r; trying regex fallback", exc
            )
            span = match.group(1)
            # Prefer "full" URLs; use "img" only when no "full" key exists.
            for raw in _FULL_KEY_RE.findall(span) or _IMG_KEY_RE.findall(span):
                resolved = abs_url(_unescape_json_string(raw))
                if resolved:
                    found.append(resolved)
        else:
            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                # First key that resolves to a usable URL wins for this entry.
                for key in ("full", "img"):
                    candidate = entry.get(key)
                    if not isinstance(candidate, str):
                        continue
                    resolved = abs_url(candidate)
                    if resolved:
                        found.append(resolved)
                        break

        if found:
            return found  # got what we need from this gallery block

    return []


def _jsonld_image_urls(value: object) -> List[str]:
    """Resolve a JSON-LD ``image`` value: a string, a dict with ``url``, or a list of either."""
    urls: List[str] = []
    for item in value if isinstance(value, list) else [value]:
        if isinstance(item, dict):
            item = item.get("url")
        if isinstance(item, str):
            resolved = abs_url(item)
            if resolved:
                urls.append(resolved)
    return urls


def _jsonld_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs from the ``image`` key of every top-level JSON-LD node."""
    found: List[str] = []
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw = script.string or script.get_text() or ""
        try:
            data = json.loads(raw)
        except _JSON_ERRORS:
            continue  # best effort: skip scripts that are not valid JSON

        for node in data if isinstance(data, list) else [data]:
            if isinstance(node, dict) and "image" in node:
                found.extend(_jsonld_image_urls(node["image"]))
    return found


def _dom_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs collected from ``<img>`` elements in the product media area."""
    found: List[str] = []
    for el in soup.select(_DOM_IMG_SELECTOR):
        for cand in collect_img_candidates(el):
            resolved = abs_url(cand)
            if resolved:
                found.append(resolved)
    return found


@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product image URLs from a parsed product page.

    Three strategies are tried in order, and the first one that yields any
    URL is used:

    1. the ``mage/gallery/gallery`` entry of a ``text/x-magento-init`` script;
    2. the ``image`` property of ``application/ld+json`` scripts;
    3. ``<img>`` elements inside the product media / gallery containers.

    Args:
        soup: Parsed HTML of the product page.
        url: Page URL. Not used by this extractor; kept because it is part of
            the existing signature.

    Returns:
        ``{"images": [...], "image": first_or_None}``, where ``images`` is the
        collected list after ``dedup_by_filename``.
    """
    images: List[str] = (
        _gallery_images(soup) or _jsonld_images(soup) or _dom_images(soup)
    )
    images = dedup_by_filename(images)
    logger.debug("[ex_images] found images: %s", images)
    return {"images": images, "image": images[0] if images else None}