Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
90 lines
3.5 KiB
Python
90 lines
3.5 KiB
Python
from __future__ import annotations

import json
import logging
import re
from typing import Dict, List

from bs4 import BeautifulSoup

from ..registry import extractor
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
|
|
|
|
_log = logging.getLogger(__name__)

# Finds the opening bracket of the gallery's "data" array; the array itself is
# decoded with a real JSON parser so that a "]" inside a caption or a nested
# array cannot cut it short.
_GALLERY_DATA_START = re.compile(r'"data"\s*:\s*\[')
_GALLERY_FULL_KEY = re.compile(r'"full"\s*:\s*"([^"]+)"')
_GALLERY_IMG_KEY = re.compile(r'"img"\s*:\s*"([^"]+)"')

# consider broadening selectors if needed, e.g. '.fotorama__img'
_DOM_IMG_SELECTOR = ".product.media img, .gallery-placeholder img, .fotorama__stage img"


def _gallery_entry_url(entry) -> str:
    """Return the absolute URL for one gallery entry, preferring "full" over "img".

    Returns an empty string when the entry is not a dict or neither key
    yields a usable URL.
    """
    if not isinstance(entry, dict):
        return ""
    for key in ("full", "img"):
        value = entry.get(key)
        if isinstance(value, str):
            resolved = abs_url(value)
            if resolved:
                return resolved
    return ""


def _gallery_urls_by_regex(array_txt: str) -> List[str]:
    """Pull gallery URLs out of undecodable JSON text by key matching.

    Uses every "full" value when any exist, otherwise every "img" value.
    """
    raw_urls = _GALLERY_FULL_KEY.findall(array_txt) or _GALLERY_IMG_KEY.findall(array_txt)
    found: List[str] = []
    for raw in raw_urls:
        # The text is still JSON-encoded here, so "/" may appear escaped as "\/".
        resolved = abs_url(raw.replace("\\/", "/"))
        if resolved:
            found.append(resolved)
    return found


def _gallery_images(soup: BeautifulSoup) -> List[str]:
    """Collect image URLs from the first Magento gallery init script that has any."""
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    _log.debug("[ex_images] x-magento-init scripts: %d", len(scripts))

    decoder = json.JSONDecoder()
    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        start = _GALLERY_DATA_START.search(text)
        if not start:
            _log.debug("[ex_images] 'data' array not found in gallery block")
            continue

        # Slice so the text begins exactly at "[", then let the decoder find
        # the matching "]" and ignore whatever follows the array.
        array_txt = text[start.end() - 1:]
        try:
            data, _ = decoder.raw_decode(array_txt)
        except (ValueError, RecursionError) as exc:
            _log.debug("[ex_images] json decode failed: %r; trying regex fallback", exc)
            found = _gallery_urls_by_regex(array_txt)
        else:
            candidates = (_gallery_entry_url(entry) for entry in data)
            found = [u for u in candidates if u]

        if found:
            return found  # got what we need from the gallery block
    return []


def _jsonld_image_urls(value) -> List[str]:
    """Flatten a JSON-LD "image" value into absolute URLs.

    Accepts a string, a dict carrying a "url" key, or a list of either;
    anything else is ignored.
    """
    items = value if isinstance(value, list) else [value]
    found: List[str] = []
    for item in items:
        if isinstance(item, dict):
            item = item.get("url")
        if isinstance(item, str):
            resolved = abs_url(item)
            if resolved:
                found.append(resolved)
    return found


def _jsonld_images(soup: BeautifulSoup) -> List[str]:
    """Collect image URLs from the "image" field of every JSON-LD script."""
    found: List[str] = []
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw = script.string or script.get_text() or ""
        try:
            data = json.loads(raw)
        except (ValueError, RecursionError):
            continue  # best effort: skip blocks that are not valid JSON

        # The document root may be a single object or a list of objects.
        nodes = data if isinstance(data, list) else [data]
        for node in nodes:
            if isinstance(node, dict) and "image" in node:
                found.extend(_jsonld_image_urls(node["image"]))
    return found


def _dom_images(soup: BeautifulSoup) -> List[str]:
    """Collect image URLs from <img> elements inside the product media containers."""
    found: List[str] = []
    for el in soup.select(_DOM_IMG_SELECTOR):
        for cand in collect_img_candidates(el):
            resolved = abs_url(cand)
            if resolved:
                found.append(resolved)
    return found


@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product image URLs from a parsed product page.

    Three sources are tried in order, and the first one that yields any URL
    is used:

    1. the Magento ``text/x-magento-init`` gallery script,
    2. the ``image`` field of JSON-LD ``application/ld+json`` scripts,
    3. ``<img>`` elements inside the product media / gallery containers.

    Args:
        soup: Parsed HTML of the page.
        url: Not read by this extractor; presumably kept for the calling
            convention shared by registered extractors (confirm against the
            registry).

    Returns:
        A dict with ``"images"`` (the URLs after ``dedup_by_filename``) and
        ``"image"`` (the first of those, or ``None`` when nothing was found).
    """
    images = _gallery_images(soup) or _jsonld_images(soup) or _dom_images(soup)
    images = dedup_by_filename(images)
    _log.debug("[ex_images] found images: %s", images)
    return {"images": images, "image": images[0] if images else None}
|