feat: initialize market app with browsing, product, and scraping code

Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00
commit 6271a715a1
142 changed files with 8517 additions and 0 deletions
--- a/scrape/product/extractors/images.py
+++ b/scrape/product/extractors/images.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+import json, re
+from typing import Dict, List
+from bs4 import BeautifulSoup
+from ..registry import extractor
+from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
+
+@extractor
+def ex_images(soup: BeautifulSoup, url: str) -> Dict:
+    images: List[str] = []
+    debug = False  # set True while debugging
+
+    # 1) Magento init script (gallery)
+    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
+    if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}")
+
+    for script in scripts:
+        # Use raw string as-is; no stripping/collapsing
+        text = script.string or script.get_text() or ""
+        if "mage/gallery/gallery" not in text:
+            continue
+
+        # Correct (not over-escaped) patterns:
+        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
+        if not m:
+            if debug: print("[ex_images] 'data' array not found in gallery block")
+            continue
+
+        arr_txt = m.group(1)
+        added = False
+        try:
+            data = json.loads(arr_txt)
+            for entry in data:
+                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
+                if u:
+                    images.append(u); added = True
+        except Exception as e:
+            if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
+            # Fallback to simple key extraction
+            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
+            imgs  = re.findall(r'"img"\s*:\s*"([^"]+)"',  arr_txt) if not fulls else []
+            for u in (fulls or imgs):
+                u = abs_url(u)
+                if u:
+                    images.append(u); added = True
+
+        if added:
+            break  # got what we need from the gallery block
+
+    # 2) JSON-LD fallback
+    if not images:
+        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
+            raw = script.string or script.get_text() or ""
+            try:
+                data = json.loads(raw)
+            except Exception:
+                continue
+
+            def add_from(val):
+                if isinstance(val, str):
+                    u = abs_url(val);  u and images.append(u)
+                elif isinstance(val, list):
+                    for v in val:
+                        if isinstance(v, str):
+                            u = abs_url(v); u and images.append(u)
+                        elif isinstance(v, dict) and "url" in v:
+                            u = abs_url(v["url"]); u and images.append(u)
+                elif isinstance(val, dict) and "url" in val:
+                    u = abs_url(val["url"]); u and images.append(u)
+
+            if isinstance(data, dict) and "image" in data:
+                add_from(data["image"])
+            if isinstance(data, list):
+                for item in data:
+                    if isinstance(item, dict) and "image" in item:
+                        add_from(item["image"])
+
+    # 3) Generic DOM scan fallback
+    if not images:
+        # consider broadening selectors if needed, e.g. '.fotorama__img'
+        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
+            for cand in collect_img_candidates(el):
+                u = abs_url(cand)
+                if u:
+                    images.append(u)
+
+    images = dedup_by_filename(images)
+    if debug: print(f"[ex_images] found images: {images}")
+    return {"images": images, "image": images[0] if images else None}