feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
This commit is contained in:
89
scrape/product/extractors/images.py
Normal file
89
scrape/product/extractors/images.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from __future__ import annotations
|
||||
import json, re
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
|
||||
|
||||
def _unescape_json_string(raw: str) -> str:
    """Decode JSON escapes in text captured from between two double quotes.

    Magento emits URLs with escaped forward slashes; a plain regex capture
    keeps those backslashes, which would end up inside the final URL.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        # Not a valid JSON string body; undo at least the common slash escape.
        return raw.replace("\\/", "/")


def _gallery_image_urls(text: str, debug: bool = False) -> List[str]:
    """Return absolute image URLs found in a Magento gallery init script.

    The ``"data"`` array is decoded with a real JSON parser starting at its
    opening bracket, so a ``]`` inside a caption or nested array does not
    truncate it. Only when that fails is a key-based regex scan used.
    """
    m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
    if not m:
        if debug:
            print("[ex_images] 'data' array not found in gallery block")
        return []

    urls: List[str] = []
    try:
        # raw_decode stops at the end of the first complete JSON value and
        # ignores whatever follows the array in the script.
        data, _ = json.JSONDecoder().raw_decode(text[m.start(1):])
    except (ValueError, RecursionError) as e:
        if debug:
            print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
        # Fallback to simple key extraction; prefer "full" and use "img"
        # only when no "full" key is present at all.
        arr_txt = m.group(1)
        found = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
        if not found:
            found = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt)
        for raw in found:
            u = abs_url(_unescape_json_string(raw))
            if u:
                urls.append(u)
        return urls

    for entry in data:
        if not isinstance(entry, dict):
            continue  # ignore malformed entries instead of failing the block
        for key in ("full", "img"):
            value = entry.get(key)
            u = abs_url(value) if isinstance(value, str) else None
            if u:
                urls.append(u)
                break
    return urls


def _jsonld_image_values(image) -> List[str]:
    """Flatten a JSON-LD ``image`` value into a list of raw URL strings.

    Accepts a string, a dict carrying a ``"url"`` key, or a list of either.
    Anything else is ignored.
    """
    items = image if isinstance(image, list) else [image]
    values: List[str] = []
    for item in items:
        if isinstance(item, dict):
            item = item.get("url")
        if isinstance(item, str):
            values.append(item)
    return values


@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product image URLs from a parsed product page.

    Sources are tried in order, and a later one is consulted only if the
    earlier ones produced nothing:

    1. the Magento ``mage/gallery/gallery`` init script,
    2. ``image`` entries of JSON-LD scripts,
    3. ``<img>`` elements inside the product media / gallery containers.

    Args:
        soup: Parsed document to search.
        url: Page URL; part of the extractor signature, not used here.

    Returns:
        ``{"images": [...], "image": first image or None}``, with the list
        passed through ``dedup_by_filename``.
    """
    images: List[str] = []
    debug = False  # set True while debugging

    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug:
        print(f"[ex_images] x-magento-init scripts: {len(scripts)}")

    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue
        images.extend(_gallery_image_urls(text, debug))
        if images:
            break  # got what we need from the gallery block

    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except (ValueError, RecursionError):
                continue  # best effort: skip scripts that are not valid JSON

            # A script may hold a single object or a list of objects.
            nodes = data if isinstance(data, list) else [data]
            for node in nodes:
                if isinstance(node, dict) and "image" in node:
                    for cand in _jsonld_image_values(node["image"]):
                        u = abs_url(cand)
                        if u:
                            images.append(u)

    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)

    images = dedup_by_filename(images)
    if debug:
        print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}
|
||||
Reference in New Issue
Block a user