This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/product/extractors/images.py
giles 6271a715a1
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
feat: initialize market app with browsing, product, and scraping code
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00

90 lines
3.5 KiB
Python

from __future__ import annotations
import json, re
from typing import Dict, List
from bs4 import BeautifulSoup
from ..registry import extractor
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
_DEBUG = False  # set True while debugging

# Locates the gallery "data" array inside a Magento init script. The captured
# group is only a best-effort slice (it stops at the first ']'); the real
# parse decodes the array starting from the group's start offset.
_GALLERY_DATA_RE = re.compile(r'"data"\s*:\s*(\[[\s\S]*?\])')

# consider broadening selectors if needed, e.g. '.fotorama__img'
_DOM_IMG_SELECTOR = ".product.media img, .gallery-placeholder img, .fotorama__stage img"


def _gallery_urls_from_entries(entries) -> List[str]:
    """Collect one URL per gallery entry, preferring "full" over "img".

    Entries that are not dicts, and values that are not strings, are skipped
    instead of being allowed to raise.
    """
    urls: List[str] = []
    if not isinstance(entries, list):
        return urls
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        for key in ("full", "img"):
            raw = entry.get(key)
            u = abs_url(raw) if isinstance(raw, str) else None
            if u:
                urls.append(u)
                break  # "img" is only a fallback when "full" gave nothing
    return urls


def _gallery_urls_from_text(arr_txt: str) -> List[str]:
    """Regex fallback for gallery data that is not valid JSON.

    Pulls every "full" value, or every "img" value when there are no "full"
    keys at all.
    """
    fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
    imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
    urls: List[str] = []
    for raw in (fulls or imgs):
        # The captured text is a raw JSON string body, where '/' may be
        # written as the escape sequence '\/'; undo that before resolving.
        u = abs_url(raw.replace("\\/", "/"))
        if u:
            urls.append(u)
    return urls


def _gallery_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs from the first Magento gallery script that yields any."""
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if _DEBUG:
        print(f"[ex_images] x-magento-init scripts: {len(scripts)}")
    decoder = json.JSONDecoder()
    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue
        m = _GALLERY_DATA_RE.search(text)
        if not m:
            if _DEBUG:
                print("[ex_images] 'data' array not found in gallery block")
            continue
        try:
            # Decode the array in place so a ']' inside a caption or a nested
            # array cannot truncate it; trailing script text is ignored.
            data, _ = decoder.raw_decode(text[m.start(1):])
        except (ValueError, RecursionError) as e:
            if _DEBUG:
                print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            found = _gallery_urls_from_text(m.group(1))
        else:
            found = _gallery_urls_from_entries(data)
        if found:
            return found  # got what we need from the gallery block
    return []


def _jsonld_image_urls(val) -> List[str]:
    """Flatten a JSON-LD "image" value into a list of URLs.

    Accepts a string, a dict carrying a "url" key, or a list of either;
    anything else contributes nothing.
    """
    urls: List[str] = []
    for item in (val if isinstance(val, list) else [val]):
        if isinstance(item, dict):
            item = item.get("url")
        if isinstance(item, str):
            u = abs_url(item)
            if u:
                urls.append(u)
    return urls


def _jsonld_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs from the "image" key of every JSON-LD script.

    Handles both a single top-level object and a top-level list of objects.
    Scripts whose content is not valid JSON are skipped.
    """
    images: List[str] = []
    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw = script.string or script.get_text() or ""
        try:
            data = json.loads(raw)
        except (ValueError, RecursionError):
            continue
        for node in (data if isinstance(data, list) else [data]):
            if isinstance(node, dict) and "image" in node:
                images.extend(_jsonld_image_urls(node["image"]))
    return images


def _dom_images(soup: BeautifulSoup) -> List[str]:
    """Return image URLs found by scanning gallery-related <img> elements."""
    images: List[str] = []
    for el in soup.select(_DOM_IMG_SELECTOR):
        for cand in collect_img_candidates(el):
            u = abs_url(cand)
            if u:
                images.append(u)
    return images


@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product image URLs from a parsed product page.

    Three strategies are tried in order, and the first one that produces any
    URL wins:

    1. the Magento ``text/x-magento-init`` gallery script,
    2. ``application/ld+json`` scripts ("image" key),
    3. a DOM scan of gallery-related ``<img>`` elements.

    Every candidate goes through ``abs_url`` and falsy results are dropped;
    the final list is passed through ``dedup_by_filename``.

    Args:
        soup: Parsed page.
        url: Page URL. Not read by this function (presumably part of the
            common extractor signature -- confirm against the registry).

    Returns:
        ``{"images": [...], "image": <first image or None>}``.
    """
    images = _gallery_images(soup) or _jsonld_images(soup) or _dom_images(soup)
    images = dedup_by_filename(images)
    if _DEBUG:
        print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}