Monorepo: consolidate 7 repos into one
Combines shared, blog, market, cart, events, federation, and account into a single repository. Eliminates submodule sync, sibling model copying at build time, and per-app CI orchestration. Changes: - Remove per-app .git, .gitmodules, .gitea, submodule shared/ dirs - Remove stale sibling model copies from each app - Update all 6 Dockerfiles for monorepo build context (root = .) - Add build directives to docker-compose.yml - Add single .gitea/workflows/ci.yml with change detection - Add .dockerignore for monorepo build context - Create __init__.py for federation and account (cross-app imports)
This commit is contained in:
0
market/scrape/__init__.py
Normal file
0
market/scrape/__init__.py
Normal file
1
market/scrape/build_snapshot/__init__.py
Normal file
1
market/scrape/build_snapshot/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .build_snapshot import build_snapshot
|
||||
104
market/scrape/build_snapshot/build_snapshot.py
Normal file
104
market/scrape/build_snapshot/build_snapshot.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Dict, Set
|
||||
|
||||
from ..http_client import configure_cookies
|
||||
from ..get_auth import login
|
||||
|
||||
from shared.config import config
|
||||
|
||||
from shared.utils import log
|
||||
|
||||
# DB: persistence helpers
|
||||
|
||||
from .tools import (
|
||||
_resolve_sub_redirects,
|
||||
valid_subs,
|
||||
candidate_subs,
|
||||
rewrite_nav,
|
||||
capture_product_slugs,
|
||||
fetch_and_upsert_products,
|
||||
)
|
||||
|
||||
from ..nav import nav_scrape
|
||||
|
||||
# ------------------------ core ------------------------
|
||||
async def build_snapshot(
    concurrency: int,
    user: str,
    password: str,
    save_nav,
    capture_listing,
    upsert_product,
    log_product_result,
    save_subcategory_redirects,
    save_link_reports = None,
) -> None:
    """Run one full snapshot build: login, scrape nav, resolve subcategory
    redirects, capture product listings, fetch product details, and persist
    everything through the injected async callbacks (DB writers).

    Parameters
    ----------
    concurrency: max concurrent product-detail fetches.
    user, password: credentials forwarded to ``login``.
    save_nav, capture_listing, upsert_product, log_product_result,
    save_subcategory_redirects, save_link_reports:
        async persistence callbacks supplied by the caller (presumably the
        Postgres layer, per the final log line — confirm against caller).
    """
    # NOTE(review): stale comment in the original mentioned ensure_dir, but no
    # ensure_dir call exists here; JSON files are no longer written.

    # NOTE(review): mutating sys.path at call time is a code smell — the
    # monorepo build should make the project importable instead. Kept as-is.
    import sys
    sys.path.insert(0, os.path.abspath("."))

    cookies = await login(username=user, password=password)
    await configure_cookies(cookies)
    # NOTE(review): this prints cookie *values* (session tokens) to stdout —
    # consider removing or redacting before production use.
    for k, v in dict(cookies).items():
        print("logged in with", k, v)

    # 1) NAV
    log("Fetching nav…")
    nav = await nav_scrape()

    # Build valid subs per top from nav
    valid_subs_by_top: Dict[str, Set[str]] = valid_subs(nav)

    # Resolve redirects for all subs in nav first (network round-trips; also
    # mutates valid_subs_by_top with canonical sub slugs).
    nav_sub_candidates = candidate_subs(nav)
    nav_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=nav_sub_candidates,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    rewrite_nav(nav, nav_redirects)

    # DB: save nav
    await save_nav(nav)

    # 2) LISTINGS — collect every product slug from category/sub pages.
    product_slugs: Set[str] = await capture_product_slugs(
        nav,
        capture_listing
    )
    # Populated as a side effect of link rewriting during product fetches.
    unknown_sub_paths: Set[str] = set()

    # 3) PRODUCTS (fetch details)
    await fetch_and_upsert_products(
        upsert_product,
        log_product_result,
        save_link_reports,
        concurrency,
        product_slugs,
        valid_subs_by_top,
        unknown_sub_paths
    )

    # Subcategory redirects from HTML discovered while rewriting product links.
    log("Resolving subcategory redirects…")
    html_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=unknown_sub_paths,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    # HTML-derived redirects take precedence over nav-derived ones.
    sub_redirects: Dict[str, str] = dict(nav_redirects)
    sub_redirects.update(html_redirects)

    # DB: persist redirects
    await save_subcategory_redirects(sub_redirects)

    log("Snapshot build complete (to Postgres).")
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# Sentinel token embedded into rewritten hrefs (see _rewrite_links_fragment);
# presumably substituted with the app's real mount path downstream — confirm.
APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]"
|
||||
1
market/scrape/build_snapshot/tools/__init__.py
Normal file
1
market/scrape/build_snapshot/tools/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
6
market/scrape/build_snapshot/tools/_anchor_text.py
Normal file
6
market/scrape/build_snapshot/tools/_anchor_text.py
Normal file
@@ -0,0 +1,6 @@
|
||||
def _anchor_text(a) -> str:
|
||||
try:
|
||||
txt = " ".join((a.get_text(" ") or "").split())
|
||||
return txt[:200]
|
||||
except Exception:
|
||||
return ""
|
||||
16
market/scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
16
market/scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Optional
|
||||
|
||||
def _collect_html_img_srcs(html: Optional[str]) -> List[str]:
|
||||
urls: List[str] = []
|
||||
if not html:
|
||||
return urls
|
||||
try:
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
for img in soup.find_all("img"):
|
||||
src = img.get("src")
|
||||
if src:
|
||||
urls.append(src)
|
||||
except Exception:
|
||||
pass
|
||||
return urls
|
||||
14
market/scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
14
market/scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
from typing import Iterable, List, Set
|
||||
|
||||
def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]:
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for u in urls:
|
||||
if not u or not isinstance(u, str):
|
||||
continue
|
||||
if u in seen:
|
||||
continue
|
||||
seen.add(u)
|
||||
out.append(u)
|
||||
return out
|
||||
32
market/scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
32
market/scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from typing import Dict,Optional, Tuple
|
||||
|
||||
_CF_TOKENS = (
|
||||
"One moment, please...",
|
||||
"Please wait while your request is being verified",
|
||||
"/cdn-cgi/challenge-platform/",
|
||||
"rocket-loader.min.js",
|
||||
)
|
||||
|
||||
def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]:
|
||||
if not html:
|
||||
return False, None
|
||||
for tok in _CF_TOKENS:
|
||||
if tok in html:
|
||||
return True, tok
|
||||
return False, None
|
||||
|
||||
def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]:
|
||||
title = (d.get("title") or "").strip()
|
||||
if title.lower() == "one moment, please...":
|
||||
return True, "One moment, please..."
|
||||
ok, tok = _looks_like_cf_html(d.get("description_html"))
|
||||
if ok:
|
||||
return True, tok
|
||||
for sec in d.get("sections") or []:
|
||||
if isinstance(sec, dict) and sec.get("html"):
|
||||
ok2, tok2 = _looks_like_cf_html(sec["html"])
|
||||
if ok2:
|
||||
return True, tok2
|
||||
if not d.get("images") and not d.get("description_html") and not d.get("sections"):
|
||||
return True, "all_empty_heuristic"
|
||||
return False, None
|
||||
34
market/scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
34
market/scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Dict, Set
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import httpx
|
||||
|
||||
|
||||
async def _resolve_sub_redirects(
    base_url: str,
    candidates: Set[str],
    allowed_tops: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
) -> Dict[str, str]:
    """Follow each candidate "/top/sub" path and record where it redirects.

    Issues a GET per candidate (redirects followed) and, when the final URL
    lands under an allowed top-level category, maps old path -> canonical
    "/top/sub" path. Also mutates *valid_subs_by_top* in place, registering
    the canonical sub slug under its top.

    Any per-path failure (network error, parse error) is silently skipped —
    best-effort resolution.

    Returns the old-path -> new-path mapping (only entries that changed).
    """
    mapping: Dict[str, str] = {}
    if not candidates:
        return mapping
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client:
        # sorted() makes the request order (and hence logs) deterministic.
        for path in sorted(candidates):
            try:
                url = urljoin(base_url, path)
                r = await client.get(url)
                final = str(r.url)
                p = urlparse(final)
                parts = [x for x in (p.path or "").split("/") if x]
                if len(parts) >= 2:
                    top_new = parts[0].lower()
                    # removesuffix requires Python 3.9+.
                    sub_new = parts[1].lower().removesuffix(".html").removesuffix(".htm")
                    if top_new in allowed_tops:
                        new_path = f"/{top_new}/{sub_new}"
                        if new_path != path:
                            mapping[path] = new_path
                        # NOTE(review): indentation reconstructed — assumes the
                        # canonical sub is registered whenever the top is
                        # allowed, even if the path did not change. Confirm.
                        valid_subs_by_top.setdefault(top_new, set()).add(sub_new)
            except Exception:
                continue
    return mapping
|
||||
100
market/scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
100
market/scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from typing import Dict, List, Optional, Set
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from ._anchor_text import _anchor_text
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER
|
||||
|
||||
def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite all <a href> links in an HTML fragment to app-local paths.

    Same-host product links become ``{APP_ROOT_PLACEHOLDER}/product/<slug>``;
    category/subcategory links become ``{APP_ROOT_PLACEHOLDER}/<top>[/<sub>]``.
    Mutates three accumulators in place:
      - *link_externals*: links to other hosts (left untouched in the HTML),
      - *link_errors*: unresolvable or suspicious links (typed records),
      - *unknown_sub_paths*: sub paths not in *valid_subs_by_top*, to be
        redirect-resolved later.

    Returns the rewritten HTML with <html>/<body> wrappers stripped; ""
    for falsy input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc

    for a in soup.find_all("a", href=True):
        raw = (a.get("href") or "").strip()
        if not raw:
            continue
        low = raw.lower()
        # Non-navigational schemes and in-page anchors are left alone.
        if low.startswith(("mailto:", "tel:", "javascript:", "data:")) or low.startswith("#"):
            continue
        abs_href = urljoin(base_url, raw)
        p = urlparse(abs_href)
        if not p.scheme or not p.netloc:
            continue
        # Off-site link: report it but do not rewrite.
        if p.netloc != base_host:
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "host": p.netloc,
            })
            continue
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        last = parts[-1].lower()
        # *.html/*.htm leaf → treat as a product page.
        if last.endswith((".html", ".htm")):
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(a),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue
        top = parts[0].lower()
        if top in category_allow_values:
            if len(parts) == 1:
                # Top-level category link.
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            else:
                sub = parts[1]
                if sub.lower().endswith((".html", ".htm")):
                    sub = sub.rsplit(".", 1)[0]
                if sub in (valid_subs_by_top.get(top) or set()):
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
                else:
                    # Unknown sub: rewrite optimistically, queue the path for
                    # later redirect resolution, and record a pending error.
                    unknown_path = f"/{top}/{sub}"
                    unknown_sub_paths.add(unknown_path)
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
                    link_errors.append({
                        "product": current_product_slug,
                        "href": abs_href,
                        "text": _anchor_text(a),
                        "top": top,
                        "sub": sub,
                        "target_slug": None,
                        "type": "suma_category_invalid_sub_pending",
                    })
        else:
            # Same host, but not a product page and not an allowed category.
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })

    # Strip document-level wrappers added by the parser; return inner HTML.
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()
|
||||
|
||||
14
market/scrape/build_snapshot/tools/candidate_subs.py
Normal file
14
market/scrape/build_snapshot/tools/candidate_subs.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect every "/top/sub" path present in the scraped nav structure."""
    paths: Set[str] = set()
    for data in (nav.get("cats") or {}).values():
        top = (data or {}).get("slug")
        if not top:
            continue
        for entry in data.get("subs") or []:
            sub = (entry.get("slug") or "").strip()
            if sub:
                paths.add(f"/{top}/{sub}")
    return paths
|
||||
|
||||
18
market/scrape/build_snapshot/tools/capture_category.py
Normal file
18
market/scrape/build_snapshot/tools/capture_category.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_category(
    slug: str,
):
    """Scrape every listing page of one top-level category.

    Returns ``(list_url, items, total_pages)`` where *items* aggregates the
    products from all pages.
    """
    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)

    last_page = int(total_pages or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}] page {page_no}…")
        more_items, _ = await scrape_products(list_url, page=page_no)
        items.extend(more_items)
    return (list_url, items, total_pages)
|
||||
25
market/scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
25
market/scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from typing import Dict, Set
|
||||
from .capture_category import capture_category
|
||||
from .capture_sub import capture_sub
|
||||
from shared.config import config
|
||||
|
||||
|
||||
async def capture_product_slugs(
    nav: Dict[str, Dict],
    capture_listing,
):
    """Capture all product slugs for every allowed category and its subs.

    For each allowed top-level category (and each nav subcategory under it),
    scrapes the listing pages, persists them via *capture_listing*, and
    collects the product slugs into one set.

    Fix: ``capture_sub`` returns None for sub entries without a slug; the
    original unpacked that None straight into ``capture_listing(*lpars)``,
    raising TypeError. Such entries are now skipped.
    """
    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        lpars = await capture_category(slug)
        await capture_listing(*lpars)
        _, items, _ = lpars
        product_slugs.update(items)
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            lpars = await capture_sub(sub, slug)
            if lpars is None:
                # Sub entry had no slug — nothing was scraped.
                continue
            await capture_listing(*lpars)
            _, items, _ = lpars
            product_slugs.update(items)
    return product_slugs
|
||||
|
||||
22
market/scrape/build_snapshot/tools/capture_sub.py
Normal file
22
market/scrape/build_snapshot/tools/capture_sub.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_sub(
    sub,
    slug,
):
    """Scrape every listing page of one subcategory.

    Returns ``(sub_url, items, total_pages)``, or None when the sub entry
    carries no slug.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return None
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items_s, total_pages_s = await scrape_products(sub_url, page=1)
    last_page = int(total_pages_s or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}/{sub_slug}] page {page_no}…")
        extra, _ = await scrape_products(sub_url, page=page_no)
        items_s.extend(extra)
    return (sub_url, items_s, total_pages_s)
|
||||
106
market/scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
106
market/scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
@@ -0,0 +1,106 @@
|
||||
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
from ...html_utils import to_fragment
|
||||
from bp.browse.services.slugs import suma_href_from_html_slug
|
||||
|
||||
|
||||
from shared.config import config
|
||||
|
||||
from shared.utils import log
|
||||
|
||||
# DB: persistence helpers
|
||||
from ...product.product_detail import scrape_product_detail
|
||||
from ._product_dict_is_cf import _product_dict_is_cf
|
||||
from ._rewrite_links_fragment import _rewrite_links_fragment
|
||||
from ._dedupe_preserve_order import _dedupe_preserve_order
|
||||
from ._collect_html_img_srcs import _collect_html_img_srcs
|
||||
|
||||
|
||||
async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths
) -> bool:
    """Fetch one product's detail page, rewrite its links, and upsert it.

    Returns True on a successful upsert; False when a Cloudflare challenge is
    detected or any exception occurs (failures are logged via
    *log_product_result*, never raised to the caller).

    Mutates *link_errors*, *link_externals*, and *unknown_sub_paths* through
    ``_rewrite_links_fragment``.
    """
    href = suma_href_from_html_slug(slug)
    try:
        # NOTE(review): reconstructed nesting — assumes the semaphore guards
        # the whole fetch+process+upsert; if it should bound only the network
        # fetch, dedent everything after scrape_product_detail. Confirm.
        async with sem:
            d = await scrape_product_detail(href)

            # Bail out early if the "product" is a Cloudflare interstitial.
            is_cf, cf_token = _product_dict_is_cf(d)
            if is_cf:
                payload = {
                    "slug": slug,
                    "href_tried": href,
                    "error_type": "CloudflareChallengeDetected",
                    "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                    "cf_token": cf_token,
                }
                await log_product_result(ok=False, payload=payload)
                log(f" ! CF challenge detected: {slug} ({cf_token})")
                return False

            # Rewrite embedded links; collect reports
            if d.get("description_html"):
                d["description_html"] = _rewrite_links_fragment(
                    d["description_html"], config()["base_url"], product_slugs, category_values,
                    valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                )
                d["description_html"] = to_fragment(d["description_html"])
            if d.get("sections"):
                for sec in d["sections"]:
                    if isinstance(sec, dict) and sec.get("html"):
                        sec["html"] = _rewrite_links_fragment(
                            sec["html"], config()["base_url"], product_slugs, category_values,
                            valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                        )
                        sec["html"] = to_fragment(sec["html"])

            # Images: gallery images + images embedded in the HTML, deduped
            # while preserving order.
            gallery = _dedupe_preserve_order(d.get("images") or [])
            embedded: List[str] = []
            if d.get("description_html"):
                embedded += _collect_html_img_srcs(d["description_html"])
            for sec in d.get("sections", []) or []:
                if isinstance(sec, dict) and sec.get("html"):
                    embedded += _collect_html_img_srcs(sec["html"])
            embedded = _dedupe_preserve_order(embedded)
            all_imgs = _dedupe_preserve_order(list(gallery) + list(embedded))

            d["images"] = gallery
            d["embedded_image_urls"] = embedded
            d["all_image_urls"] = all_imgs
            await upsert_product(slug, href, d)
            # DB: upsert product + success log
            return True
    except Exception as e:
        # Any failure becomes a structured log record; never propagates.
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        try:
            # Enrich the payload with HTTP specifics when available.
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False
|
||||
@@ -0,0 +1,49 @@
|
||||
import asyncio
|
||||
from typing import Dict, List, Set
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from .fetch_and_upsert_product import fetch_and_upsert_product
|
||||
|
||||
|
||||
async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports=None,
    concurrency: int = 8,
    product_slugs: "Set[str]" = None,
    valid_subs_by_top: "Dict[str, Set[str]]" = None,
    unknown_sub_paths: "Set[str]" = None,
):
    """Fetch details for every slug in *product_slugs* concurrently and upsert.

    Fix: the original used mutable default arguments (``set()`` / ``{}``).
    Those objects are created once at definition time and shared across calls,
    so state from one run leaked into the next. Defaults are now None and
    replaced with fresh containers per call (backward-compatible: omitting
    the argument behaves as before for a single call).

    Progress is logged every 50 completions; link error/external reports are
    persisted via *save_link_reports* when provided.
    """
    if product_slugs is None:
        product_slugs = set()
    if valid_subs_by_top is None:
        valid_subs_by_top = {}
    if unknown_sub_paths is None:
        unknown_sub_paths = set()

    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []

    # Slugs of the allowed top-level categories.
    category_values: Set[str] = set(config()["categories"]["allow"].values())
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = 0
    ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        if done % 50 == 0 or done == len(tasks):
            log(f" …{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)
||||
|
||||
24
market/scrape/build_snapshot/tools/rewrite_nav.py
Normal file
24
market/scrape/build_snapshot/tools/rewrite_nav.py
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
from typing import Dict
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
|
||||
def rewrite_nav(nav: Dict[str, Dict], nav_redirects: Dict[str, str]):
    """Rewrite nav subcategory slugs/hrefs to their canonical redirected paths.

    Mutates *nav* in place. Subs without a slug are dropped (as before).
    No-op when *nav_redirects* is empty.

    Fix: the original unpacked ``parts[0], parts[1]`` unguarded, raising
    IndexError if a redirect mapped to a path with fewer than two segments;
    such subs now keep their original slug/href.
    """
    if not nav_redirects:
        return
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        new_subs = []
        for s in (data.get("subs") or []):
            old_sub = (s.get("slug") or "").strip()
            if not old_sub:
                continue
            old_path = f"/{top_slug}/{old_sub}"
            canonical_path = nav_redirects.get(old_path, old_path)
            parts = [x for x in canonical_path.split("/") if x]
            if len(parts) >= 2:
                top2, sub2 = parts[0], parts[1]
                s["slug"] = sub2
                s["href"] = urljoin(config()["base_url"], f"/{top2}/{sub2}")
            new_subs.append(s)
        data["subs"] = new_subs
|
||||
16
market/scrape/build_snapshot/tools/valid_subs.py
Normal file
16
market/scrape/build_snapshot/tools/valid_subs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
# Build the set of valid subcategory slugs for each top-level slug in nav.
def valid_subs(nav: Dict[str, Dict]) -> Dict[str, Set[str]]:
    """Map each top-level category slug to the set of its subcategory slugs.

    Fix: the original filtered on the raw slug but stored the stripped one,
    so a whitespace-only slug leaked an empty string into the set. Filtering
    now happens on the stripped value.
    """
    valid_subs_by_top: Dict[str, Set[str]] = {}
    for data in (nav.get("cats") or {}).values():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        stripped = ((s.get("slug") or "").strip() for s in (data.get("subs") or []))
        valid_subs_by_top[top_slug] = {slug for slug in stripped if slug}
    return valid_subs_by_top
|
||||
244
market/scrape/get_auth.py
Normal file
244
market/scrape/get_auth.py
Normal file
@@ -0,0 +1,244 @@
|
||||
from typing import Optional, Dict, Any, List
|
||||
from urllib.parse import urljoin
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.config import config
|
||||
|
||||
class LoginFailed(Exception):
    """Raised when the login flow cannot confirm an authenticated session.

    The *debug* mapping carries diagnostic context (status codes, final URLs,
    section-API payloads) for the caller to log or inspect.
    """
    def __init__(self, message: str, *, debug: Dict[str, Any]):
        super().__init__(message)
        self.debug = debug
|
||||
|
||||
def _ff_headers(referer: Optional[str] = None, origin: Optional[str] = None) -> Dict[str, str]:
|
||||
h = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-GB,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"DNT": "1",
|
||||
"Sec-GPC": "1",
|
||||
"Cache-Control": "no-cache",
|
||||
"Pragma": "no-cache",
|
||||
}
|
||||
if referer:
|
||||
h["Referer"] = referer
|
||||
if origin:
|
||||
h["Origin"] = origin
|
||||
return h
|
||||
|
||||
def _cookie_header_from_jar(jar: httpx.Cookies, domain: str, path: str = "/") -> str:
    """Serialize cookies matching *domain*/*path* into a Cookie header value.

    Iterates the underlying http.cookiejar and keeps cookies whose domain
    loosely matches (exact, or either side is a dot-suffix of the other) and
    whose path is a prefix of *path*. Returns "name=value; ..." pairs.
    """
    pairs: List[str] = []
    for c in jar.jar:
        if not c.name or c.value is None:
            continue
        dom = (c.domain or "").lstrip(".")
        if not dom:
            continue
        # Symmetric suffix match: deliberately looser than RFC 6265
        # domain-matching, to tolerate leading-dot variants in the jar.
        if not (domain == dom or domain.endswith("." + dom) or dom.endswith("." + domain)):
            continue
        # Cookie path must be a prefix of the request path.
        if not (path.startswith(c.path or "/")):
            continue
        pairs.append(f"{c.name}={c.value}")
    return "; ".join(pairs)
|
||||
|
||||
def _extract_magento_errors(html_text: str) -> list[str]:
|
||||
msgs: list[str] = []
|
||||
try:
|
||||
soup = BeautifulSoup(html_text or "", "lxml")
|
||||
for sel in [
|
||||
".message-error",
|
||||
".messages .message-error",
|
||||
".page.messages .message-error",
|
||||
"[data-ui-id='message-error']",
|
||||
".message.warning",
|
||||
".message.notice",
|
||||
]:
|
||||
for box in soup.select(sel):
|
||||
t = " ".join((box.get_text(" ") or "").split())
|
||||
if t and t not in msgs:
|
||||
msgs.append(t)
|
||||
except Exception:
|
||||
pass
|
||||
return msgs
|
||||
|
||||
def _looks_like_login_page(html_text: str) -> bool:
|
||||
try:
|
||||
s = BeautifulSoup(html_text or "", "lxml")
|
||||
if s.select_one("form#login-form.form-login"):
|
||||
return True
|
||||
title = (s.title.get_text() if s.title else "").strip().lower()
|
||||
if "customer login" in title:
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
def _chrome_headers(referer=None, origin=None):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
}
|
||||
if referer:
|
||||
headers["Referer"] = referer
|
||||
if origin:
|
||||
headers["Origin"] = origin
|
||||
return headers
|
||||
|
||||
async def login(
    username: str,
    password: str,
    *,
    # NOTE(review): mutable default ({}) — benign here since it is only read,
    # but a None default would be safer.
    extra_cookies = {},  # ok to pass cf_clearance etc., but NOT form_key
    timeout: float = 30.0,
) -> httpx.Cookies:
    """
    Attempt login and return an authenticated cookie jar.

    Success criteria (strict):
      1) /customer/section/load?sections=customer reports is_logged_in == True
         OR
      2) GET /customer/account/ resolves to an account page (not the login page).

    Otherwise raises LoginFailed with debug info.
    """
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=6)
    cookies = httpx.Cookies()
    # Seed consent cookies plus any caller-provided extras; form_key is always
    # excluded because a stale one would break the Magento CSRF check.
    for k, v in {
        **extra_cookies,
        "pr-cookie-consent": '["all"]',
        "user_allowed_save_cookie": '{"1":1}',
    }.items():
        if k.lower() == "form_key":
            continue
        cookies.set(k, v, domain="wholesale.suma.coop", path="/")

    base_login = config()["base_login"]
    base_url = config()["base_url"]

    async with httpx.AsyncClient(
        follow_redirects=True,
        timeout=httpx.Timeout(timeout, connect=15.0),
        http2=True,
        limits=limits,
        cookies=cookies,
        headers=_chrome_headers(),
        trust_env=True,
    ) as client:
        # 1) GET login page for fresh form_key
        import time
        # NOTE(review): dead code — this cache-busting URL is built and then
        # immediately overwritten on the next line; delete one of the two.
        login_bust = base_login + ("&" if "?" in base_login else "?") + f"_={int(time.time()*1000)}"
        login_bust = base_login
        r_get = await client.get(login_bust, headers=_chrome_headers())
        # NOTE(review): misleading — these prints run unconditionally, even on
        # success; the "failed" label and the body dump look like debug
        # leftovers and should be removed or gated on an error status.
        print("Login GET failed. Status:", r_get.status_code)
        print("Login GET URL:", r_get.url)
        print("Response text:", r_get.text[:1000])  # trim if long
        r_get.raise_for_status()
        soup = BeautifulSoup(r_get.text, "lxml")

        # Locate the login form (strict selector first, then loose fallback).
        form = soup.select_one("form.form.form-login#login-form") or soup.select_one("#login-form")
        if not form:
            raise LoginFailed(
                "Login form not found (possible bot challenge or theme change).",
                debug={"get_status": r_get.status_code, "final_url": str(r_get.url)},
            )
        action = urljoin(base_login, form.get("action") or base_login)
        fk_el = form.find("input", attrs={"name": "form_key"})
        hidden_form_key = (fk_el.get("value") if fk_el else "") or ""

        # mirror Magento behavior: form_key also appears as a cookie
        client.cookies.set("form_key", hidden_form_key, domain="wholesale.suma.coop", path="/")

        payload = {
            "form_key": hidden_form_key,
            "login[username]": username,
            "login[password]": password,
            "send": "Login",
        }

        post_headers = _chrome_headers(referer=base_login, origin=base_url)
        post_headers["Content-Type"] = "application/x-www-form-urlencoded"
        # Hand-build the Cookie header scoped to /customer/ (see
        # _cookie_header_from_jar) rather than relying on the client jar alone.
        post_headers["Cookie"] = _cookie_header_from_jar(
            client.cookies, domain="wholesale.suma.coop", path="/customer/"
        )

        r_post = await client.post(action, data=payload, headers=post_headers)

        # 2) Primary check: sections API must say logged in
        is_logged_in = False
        sections_url = "https://wholesale.suma.coop/customer/section/load/?sections=customer&force_new_section_timestamp=1"
        section_json: Dict[str, Any] = {}
        try:
            r_sec = await client.get(sections_url, headers=_chrome_headers(referer=base_login))
            if r_sec.status_code == 200:
                section_json = r_sec.json()
                cust = section_json.get("customer") or {}
                is_logged_in = bool(cust.get("is_logged_in"))
        except Exception:
            pass

        # 3) Secondary check: account page should NOT be the login page
        looks_like_login = False
        final_account_url = ""
        try:
            r_acc = await client.get("https://wholesale.suma.coop/customer/account/", headers=_chrome_headers(referer=base_login))
            final_account_url = str(r_acc.url)
            looks_like_login = (
                "/customer/account/login" in final_account_url
                or _looks_like_login_page(r_acc.text)
            )
        except Exception:
            # ignore; we'll rely on section status
            pass

        # Decide success/failure strictly
        if not (is_logged_in or (final_account_url and not looks_like_login)):
            errors = _extract_magento_errors(r_post.text)
            # Clean up transient form_key cookie
            try:
                client.cookies.jar.clear("wholesale.suma.coop", "/", "form_key")
            except Exception:
                pass
            raise LoginFailed(
                errors[0] if errors else "Invalid username or password.",
                debug={
                    "get_status": r_get.status_code,
                    "post_status": r_post.status_code,
                    "post_final_url": str(r_post.url),
                    "sections_customer": section_json.get("customer"),
                    "account_final_url": final_account_url,
                    "looks_like_login_page": looks_like_login,
                },
            )

        # Remove the transient form_key cookie from every (domain, path)
        # variant before handing the jar back to the caller.
        def clear_cookie_everywhere(cookies: httpx.Cookies, name: str) -> None:
            to_delete = []
            for c in list(cookies.jar):  # http.cookiejar.Cookie objects
                if c.name == name:
                    # Note: CookieJar.clear requires exact (domain, path, name)
                    to_delete.append((c.domain, c.path, c.name))

            for domain, path, nm in to_delete:
                try:
                    cookies.jar.clear(domain, path, nm)
                except KeyError:
                    # Mismatch can happen if domain has a leading dot vs not, etc.
                    # Try again with a normalized domain variant.
                    if domain and domain.startswith("."):
                        cookies.jar.clear(domain.lstrip("."), path, nm)
                    else:
                        # or try with leading dot
                        cookies.jar.clear("." + domain, path, nm)
            if name in cookies:
                del cookies[name]

        clear_cookie_everywhere(client.cookies, "form_key")
        #client.cookies.jar.clear(config()["base_host"] or "wholesale.suma.coop", "/", "form_key")
        # NOTE(review): prints the full authenticated cookie jar — remove or
        # redact before production use.
        print('cookies', client.cookies)
        return client.cookies
|
||||
44
market/scrape/html_utils.py
Normal file
44
market/scrape/html_utils.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# suma_browser/html_utils.py
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
|
||||
|
||||
|
||||
def to_fragment(html: Optional[str]) -> str:
    """Strip the document-level <html>/<body> wrappers and return the inner markup."""
    if not html:
        return ""
    parsed = BeautifulSoup(html, "lxml")
    # lxml always wraps fragments in a full document; unwrap those containers.
    for wrapper in parsed.find_all(["html", "body"]):
        wrapper.unwrap()
    pieces = [str(node) for node in parsed.contents]
    return "".join(pieces).strip()
|
||||
|
||||
def absolutize_fragment(html: Optional[str]) -> str:
    """Absolutize root-relative href/src attributes against the configured
    base URL and return a fragment (no <html>/<body> wrappers)."""
    if not html:
        return ""
    frag = BeautifulSoup(html, "lxml")

    for tag in frag.find_all(True):
        for attr in ("href", "src"):
            if not tag.has_attr(attr):
                continue
            raw = str(tag[attr])
            # Only root-relative URLs are rewritten; absolute and
            # document-relative ones pass through untouched.
            tag[attr] = urljoin(config()["base_url"], raw) if raw.startswith("/") else raw

    # Unwrap document-level containers and return only the inner HTML.
    for wrapper in frag.find_all(["html", "body"]):
        wrapper.unwrap()
    return "".join(str(node) for node in frag.contents).strip()
|
||||
220
market/scrape/http_client.py
Normal file
220
market/scrape/http_client.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# suma_browser/http_client.py
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import secrets
|
||||
from typing import Optional, Dict
|
||||
|
||||
import httpx
|
||||
from shared.config import config
|
||||
|
||||
_CLIENT: httpx.AsyncClient | None = None
|
||||
|
||||
# ----- optional decoders -> Accept-Encoding
# Probe for optional compression libraries at import time. The flags feed
# _accept_encoding() so the client only advertises codings it can decode.
BROTLI_OK = False
ZSTD_OK = False
try:
    import brotli  # noqa: F401
    BROTLI_OK = True
except Exception:
    # brotli not installed (or broken): advertise gzip/deflate only.
    pass
try:
    import zstandard as zstd  # noqa: F401
    ZSTD_OK = True
except Exception:
    # zstandard not installed (or broken): skip zstd.
    pass
|
||||
|
||||
def _accept_encoding() -> str:
    """Build the Accept-Encoding header value from the codings we can decode."""
    optional = (("br", BROTLI_OK), ("zstd", ZSTD_OK))
    codings = ["gzip", "deflate"] + [name for name, available in optional if available]
    return ", ".join(codings)
|
||||
|
||||
FIREFOX_UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0"
|
||||
|
||||
def _ff_headers(referer: Optional[str] = None) -> Dict[str, str]:
    """Firefox-like navigation headers; adds Referer / same-origin hints when given."""
    headers: Dict[str, str] = {
        "User-Agent": FIREFOX_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": _accept_encoding(),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        # A referred navigation looks same-origin; a cold one has no site context.
        "Sec-Fetch-Site": "same-origin" if referer else "none",
        "Sec-Fetch-User": "?1",
        "DNT": "1",
        "Sec-GPC": "1",
        "Priority": "u=0, i",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    if referer:
        headers["Referer"] = referer
    return headers
|
||||
def _chrome_headers(referer=None, origin=None):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
}
|
||||
if referer:
|
||||
headers["Referer"] = referer
|
||||
if origin:
|
||||
headers["Origin"] = origin
|
||||
return headers
|
||||
|
||||
def _parse_cookie_header(cookie_header: str) -> Dict[str, str]:
|
||||
jar: Dict[str, str] = {}
|
||||
for part in cookie_header.split(";"):
|
||||
part = part.strip()
|
||||
if not part or "=" not in part:
|
||||
continue
|
||||
k, v = part.split("=", 1)
|
||||
jar[k.strip()] = v.strip()
|
||||
return jar
|
||||
|
||||
def _looks_like_cloudflare(html: bytes) -> bool:
|
||||
if not html:
|
||||
return False
|
||||
s = html[:40000].lower()
|
||||
return (
|
||||
b"please wait while your request is being verified" in s
|
||||
or b"/cdn-cgi/challenge-platform/scripts/jsd/main.js" in s
|
||||
or b"rocket-loader.min.js" in s
|
||||
or b"cf-ray" in s
|
||||
or b"challenge-platform" in s
|
||||
or b"cf-chl-" in s
|
||||
)
|
||||
|
||||
# -------- runtime cookie configuration (preferred over env) --------------------
|
||||
_INITIAL_COOKIES: Dict[str, str] = {}
|
||||
_INITIAL_COOKIE_HEADER: Optional[str] = None
|
||||
|
||||
async def configure_cookies(cookies: Dict[str, str]) -> None:
    """
    Configure initial cookies programmatically (preferred over env).

    Call BEFORE the first request (i.e., before get_client()/fetch()).
    If a client already exists, its jar is updated immediately.

    Args:
        cookies: name -> value mapping to seed the session with.
    """
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    # A dict-based configuration supersedes any previously set raw header.
    _INITIAL_COOKIE_HEADER = None
    _INITIAL_COOKIES = dict(cookies or {})
    # If the client is already built, push the cookies into its jar now.
    # (Fix: removed a stray debug print that polluted stdout on every call.)
    if _CLIENT is not None:
        host = config()["base_host"] or "wholesale.suma.coop"
        for k, v in _INITIAL_COOKIES.items():
            _CLIENT.cookies.set(k, v, domain=host, path="/")
|
||||
|
||||
def configure_cookies_from_header(cookie_header: str) -> None:
    """
    Configure initial cookies from a raw 'Cookie:' header string.
    Preferred over env; call BEFORE the first request.
    """
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    _INITIAL_COOKIE_HEADER = cookie_header or ""
    _INITIAL_COOKIES = _parse_cookie_header(_INITIAL_COOKIE_HEADER)
    if _CLIENT is None:
        return
    # Client already exists: seed its jar immediately.
    host = config()["base_host"] or "wholesale.suma.coop"
    for cookie_name, cookie_value in _INITIAL_COOKIES.items():
        _CLIENT.cookies.set(cookie_name, cookie_value, domain=host, path="/")
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
async def get_client() -> httpx.AsyncClient:
    """Public accessor (same as _get_client).

    Returns the process-wide shared AsyncClient, creating it on first use.
    """
    return await _get_client()
|
||||
|
||||
async def _get_client() -> httpx.AsyncClient:
    """Create (once) and return the shared httpx.AsyncClient singleton.

    Lazily builds a client with Chrome-like headers, then seeds its cookie
    jar: runtime-configured cookies (configure_cookies*) take priority over
    the SUMA_COOKIES environment variable.
    """
    global _CLIENT
    if _CLIENT is None:
        # Generous timeouts: some upstream pages are very slow to respond.
        timeout = httpx.Timeout(300.0, connect=150.0)
        limits = httpx.Limits(max_keepalive_connections=8, max_connections=16)
        _CLIENT = httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            http2=True,
            limits=limits,
            headers=_chrome_headers(),
            trust_env=True,  # honor proxy settings from the environment
        )

        # ---- Seed cookies (priority: runtime config > env var) ---------------
        host = config()["base_host"] or "wholesale.suma.coop"

        if _INITIAL_COOKIES or _INITIAL_COOKIE_HEADER:
            # From runtime config
            if _INITIAL_COOKIE_HEADER:
                _CLIENT.cookies.update(_parse_cookie_header(_INITIAL_COOKIE_HEADER))
            for k, v in _INITIAL_COOKIES.items():
                _CLIENT.cookies.set(k, v, domain=host, path="/")
        else:
            # Fallback to environment
            cookie_str = os.environ.get("SUMA_COOKIES", "").strip()
            if cookie_str:
                _CLIENT.cookies.update(_parse_cookie_header(cookie_str))

        # Ensure private_content_version is present; a random value is
        # acceptable for a fresh session.
        if "private_content_version" not in _CLIENT.cookies:
            pcv = secrets.token_hex(16)
            _CLIENT.cookies.set("private_content_version", pcv, domain=host, path="/")
        # ---------------------------------------------------------------------

    return _CLIENT
|
||||
|
||||
async def aclose_client() -> None:
    """Close and discard the shared client; the next use recreates it."""
    global _CLIENT
    if _CLIENT is None:
        return
    await _CLIENT.aclose()
    _CLIENT = None
|
||||
|
||||
async def fetch(url: str, *, referer: Optional[str] = None, retries: int = 3) -> str:
    """GET *url* and return the response body text, retrying on failure.

    Retries up to *retries* attempts on exceptions and on Cloudflare
    challenge pages (detected heuristically). When the cookie jar is empty,
    visits the site root first so the session looks like a real browser.

    Raises:
        httpx.HTTPStatusError: on a non-2xx final response.
    """
    client = await _get_client()

    # Warm-up visit to look like a real session (only for a fresh jar).
    if len(client.cookies.jar) == 0:
        try:
            await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
            await asyncio.sleep(0.25)
        except Exception:
            # Best-effort: a failed warm-up must not block the real fetch.
            pass

    last_exc: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            h = _chrome_headers(referer=referer or (config()["base_url"].rstrip("/") + "/"))
            r = await client.get(url, headers=h)
            if _looks_like_cloudflare(r.content):
                # On the last attempt we fall through and let raise_for_status
                # decide; otherwise back off, re-touch the homepage, retry.
                if attempt < retries:
                    await asyncio.sleep(0.9 if attempt == 1 else 1.3)
                    try:
                        await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
                        await asyncio.sleep(0.4)
                    except Exception:
                        pass
                    continue
            try:
                r.raise_for_status()
            except httpx.HTTPStatusError as e:
                # Log a trimmed body for debugging, then re-raise so the
                # outer handler can decide whether to retry.
                print(f"Fetch failed for {url}")
                print("Status:", r.status_code)
                print("Body:", r.text[:1000])  # Trimmed
                raise
            return r.text
        except Exception as e:
            last_exc = e
            if attempt >= retries:
                raise
            # Linear backoff between attempts.
            await asyncio.sleep(0.45 * attempt + 0.25)

    # Defensive: the loop either returns or raises, but keep a safety net.
    if last_exc:
        raise last_exc
    raise RuntimeError("fetch failed unexpectedly")
|
||||
289
market/scrape/listings.py
Normal file
289
market/scrape/listings.py
Normal file
@@ -0,0 +1,289 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Tuple
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
|
||||
from .http_client import fetch
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from bp.browse.services.state import (
|
||||
KNOWN_PRODUCT_SLUGS,
|
||||
_listing_page_cache,
|
||||
_listing_page_ttl,
|
||||
_listing_variant_cache,
|
||||
_listing_variant_ttl,
|
||||
now,
|
||||
)
|
||||
from shared.utils import normalize_text, soup_of
|
||||
from shared.config import config
|
||||
|
||||
|
||||
def parse_total_pages_from_text(text: str) -> Optional[int]:
    """Derive the page count from a "Showing N of M" pager caption.

    The site serves 36 items per page regardless of the advertised page-size
    options, so a shown count of 12/24/36 is normalized to 36.

    Returns:
        Total page count (>= 1), or None when the caption is absent or the
        shown count is zero (fix: previously "Showing 0 of N" raised
        ZeroDivisionError).
    """
    m = re.search(r"Showing\s+(\d+)\s+of\s+(\d+)", text, re.I)
    if not m:
        return None
    shown = int(m.group(1))
    total = int(m.group(2))
    per_page = 36 if shown in (12, 24, 36) else shown
    if per_page <= 0:
        # Nothing shown -> nothing to paginate; avoid dividing by zero.
        return None
    return max(1, math.ceil(total / per_page))
|
||||
|
||||
|
||||
def _first_from_srcset(val: str) -> Optional[str]:
|
||||
if not val:
|
||||
return None
|
||||
first = val.split(",")[0].strip()
|
||||
parts = first.split()
|
||||
return parts[0] if parts else first
|
||||
|
||||
|
||||
def _abs_url(u: Optional[str]) -> Optional[str]:
    """Resolve a root-relative URL against the configured base URL; pass
    everything else through unchanged (None stays None)."""
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
|
||||
|
||||
|
||||
def _collect_img_candidates(el) -> List[str]:
    """Gather every plausible image URL carried by a tag's attributes."""
    if not el:
        return []
    found: List[str] = []
    # Direct URL-bearing attributes, in priority order.
    for attr in ("src", "data-src", "data-original", "data-zoom-image",
                 "data-thumb", "content", "href"):
        value = el.get(attr)
        if value:
            found.append(value)
    # srcset-style attributes: take the first candidate only.
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if not value:
            continue
        first = _first_from_srcset(value)
        if first:
            found.append(first)
    return found
|
||||
|
||||
|
||||
def _dedupe_preserve_order_by(seq: List[str], key: Callable[[str], str]) -> List[str]:
|
||||
seen = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if not s:
|
||||
continue
|
||||
k = key(s)
|
||||
if k in seen:
|
||||
continue
|
||||
seen.add(k)
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
|
||||
def _filename_key(u: str) -> str:
|
||||
p = urlparse(u)
|
||||
path = p.path or ""
|
||||
if path.endswith("/"):
|
||||
path = path[:-1]
|
||||
last = path.split("/")[-1]
|
||||
return f"{p.netloc}:{last}".lower()
|
||||
|
||||
|
||||
def _parse_cards_from_soup(soup) -> List[str]:
    """Extract product slugs from a listing page soup.

    Tries three strategies in order: Magento 2 product-tile markup, any
    product-looking anchors inside the products container, then JSON-LD
    (ItemList/Product) metadata. De-duplicates by slug to avoid doubles
    from overlapping selectors.

    Fixes vs previous version: anchors without an href no longer crash
    (a.get("href") can be None), malformed JSON-LD is skipped instead of
    aborting the whole parse, and falsy slugs are never recorded.
    """
    items: List[str] = []
    seen_slugs: set[str] = set()

    def _add_slug_from_href(href) -> None:
        # Normalize an anchor href, register its slug globally and, if new,
        # collect it for this page.
        if not href:
            return
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        slug = product_slug_from_href(href)
        if not slug:
            return
        KNOWN_PRODUCT_SLUGS.add(slug)
        if slug not in seen_slugs:
            seen_slugs.add(slug)
            items.append(slug)

    # Primary selectors (Magento 2 default)
    card_wrappers = soup.select(
        "li.product-item, .product-item, ol.products.list.items li, .products.list.items li, .product-item-info"
    )
    for card in card_wrappers:
        a = (
            card.select_one("a.product-item-link")
            or card.select_one(".product-item-name a")
            or card.select_one("a[href$='.html'], a[href$='.htm']")
        )
        if not a:
            continue
        _add_slug_from_href(a.get("href"))

    # Secondary: any product-looking anchors inside the products container
    if not items:
        products_container = soup.select_one(".products") or soup
        for a in products_container.select("a[href$='.html'], a[href$='.htm']"):
            _add_slug_from_href(a.get("href"))

    # Tertiary: JSON-LD fallback (ItemList/Product)
    if not items:
        import json

        def add_product(name: Optional[str], url: Optional[str], image: Optional[str]):
            # name/image kept for signature stability; only the URL matters here.
            _add_slug_from_href(url)

        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            try:
                data = json.loads(script.get_text())
            except Exception:
                # Malformed JSON-LD must not abort the whole listing parse.
                continue
            # Normalize a single object to a one-element list.
            if isinstance(data, dict):
                data = [data]
            if not isinstance(data, list):
                continue
            for ent in data:
                if not isinstance(ent, dict):
                    continue
                if ent.get("@type") == "Product":
                    add_product(
                        ent.get("name"),
                        ent.get("url"),
                        (ent.get("image") if isinstance(ent.get("image"), str) else None),
                    )
                if ent.get("@type") == "ItemList":
                    item_list = ent.get("itemListElement", [])
                    if not isinstance(item_list, list):
                        continue
                    for it in item_list:
                        if isinstance(it, dict):
                            obj = it.get("item") or it
                            if isinstance(obj, dict):
                                add_product(
                                    obj.get("name"),
                                    obj.get("url"),
                                    (obj.get("image") if isinstance(obj.get("image"), str) else None),
                                )

    return items
|
||||
|
||||
|
||||
def _with_query(url: str, add: Dict[str, str]) -> str:
|
||||
p = urlparse(url)
|
||||
q = dict(parse_qsl(p.query, keep_blank_values=True))
|
||||
q.update(add)
|
||||
new_q = urlencode(q)
|
||||
return urlunparse((p.scheme, p.netloc, p.path, p.params, new_q, p.fragment))
|
||||
|
||||
|
||||
def _with_page(url: str, page: int) -> str:
    """Append ?p=<page> for pages beyond the first; page 1 keeps the bare URL."""
    return _with_query(url, {"p": str(page)}) if page and page > 1 else url
|
||||
|
||||
|
||||
def _listing_base_key(url: str) -> str:
|
||||
p = urlparse(url)
|
||||
path = p.path.rstrip("/")
|
||||
return f"{p.scheme}://{p.netloc}{path}".lower()
|
||||
|
||||
|
||||
def _variant_cache_get(base_key: str) -> Optional[str]:
    """Return the memoized working URL for *base_key*, or None if absent/stale."""
    entry = _listing_variant_cache.get(base_key)
    if not entry:
        return None
    cached_url, stamp = entry
    if (now() - stamp) > _listing_variant_ttl:
        # Expired: evict so the variant is re-resolved on the next lookup.
        _listing_variant_cache.pop(base_key, None)
        return None
    return cached_url
|
||||
|
||||
|
||||
def _variant_cache_set(base_key: str, working_url: str) -> None:
    """Memoize the working listing-URL variant for *base_key*, timestamped
    for TTL-based expiry in _variant_cache_get."""
    _listing_variant_cache[base_key] = (working_url, now())
|
||||
|
||||
|
||||
def _page_cache_get(working_url: str, page: int) -> Optional[Tuple[List[Dict], int]]:
    """Return cached (items, total_pages) for one listing page, or None if
    missing or past its TTL."""
    cache_key = f"{working_url}|p={page}"
    entry = _listing_page_cache.get(cache_key)
    if not entry:
        return None
    payload, stamp = entry
    if (now() - stamp) > _listing_page_ttl:
        # Stale: evict and force a refetch.
        _listing_page_cache.pop(cache_key, None)
        return None
    page_items, total_pages = payload
    return page_items, total_pages
|
||||
|
||||
|
||||
def _page_cache_set(working_url: str, page: int, items: List[Dict], total_pages: int) -> None:
    """Cache (items, total_pages) for one listing page, keyed by URL + page
    and timestamped for TTL expiry."""
    key = f"{working_url}|p={page}"
    _listing_page_cache[key] = ((items, total_pages), now())
|
||||
|
||||
|
||||
async def _fetch_parse(url: str, page: int):
    """Fetch one listing page and return (parsed product slugs, soup)."""
    page_html = await fetch(_with_page(url, page))
    page_soup = soup_of(page_html)
    return _parse_cards_from_soup(page_soup), page_soup
|
||||
|
||||
|
||||
|
||||
|
||||
async def scrape_products(list_url: str, page: int = 1):
    """Fetch one listing page and return (product slugs, total page count).

    NOTE(review): despite the old docstring, the variant/page caches are not
    consulted here — every call performs a live fetch. A stray
    _listing_base_key(list_url) call whose result was discarded (leftover
    from the removed caching path) has been dropped.
    """
    items, soup = await _fetch_parse(list_url, page)
    total_pages = _derive_total_pages(soup)
    return items, total_pages
|
||||
|
||||
def _derive_total_pages(soup) -> int:
    """Best-effort page count: pager caption first, then ?p=N links, else 1."""
    caption_pages = parse_total_pages_from_text(normalize_text(soup.get_text(" ")))
    if caption_pages:
        return caption_pages
    # Fall back to scanning pagination links for the highest ?p=N value.
    page_numbers = {1}
    for anchor in soup.find_all("a", href=True):
        match = re.search(r"[?&]p=(\d+)", anchor["href"])
        if match:
            page_numbers.add(int(match.group(1)))
    return max(page_numbers)
|
||||
|
||||
|
||||
def _slugs_from_list_url(list_url: str) -> Tuple[str, Optional[str]]:
|
||||
p = urlparse(list_url)
|
||||
parts = [x for x in (p.path or "").split("/") if x]
|
||||
top = parts[0].lower() if parts else ""
|
||||
sub = None
|
||||
if len(parts) >= 2:
|
||||
sub = parts[1]
|
||||
if sub.lower().endswith((".html", ".htm")):
|
||||
sub = re.sub(r"\.(html?|HTML?)$", "", sub)
|
||||
return top, sub
|
||||
104
market/scrape/nav.py
Normal file
104
market/scrape/nav.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.config import config
|
||||
from .http_client import fetch # only fetch; define soup_of locally
|
||||
#from .. import cache_backend as cb
|
||||
#from ..blacklist.category import is_category_blocked # Reverse map: slug -> label
|
||||
|
||||
|
||||
# ------------------ Caches ------------------
|
||||
|
||||
|
||||
|
||||
def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* (None-safe: falsy input becomes "") with the lxml backend."""
    return BeautifulSoup(html or "", "lxml")
|
||||
|
||||
|
||||
def normalize_text(s: str) -> str:
    """Collapse every run of whitespace to one space and trim the ends."""
    trimmed = (s or "").strip()
    return re.sub(r"\s+", " ", trimmed)
|
||||
|
||||
|
||||
async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Collect (text, absolute href) pairs for every on-site homepage anchor."""
    homepage = soup_of(await fetch(config()["base_url"]))
    anchors: List[Tuple[str, str]] = []
    for a in homepage.find_all("a", href=True):
        label = normalize_text(a.get_text())
        if not label:
            continue
        href = a["href"].strip()
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        # Keep only links that stay on the configured site.
        if href.startswith(config()["base_url"]):
            anchors.append((label, href))
    return anchors
|
||||
|
||||
|
||||
def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the second path segment of *href* (extension stripped) when its
    first segment matches *top_slug* case-insensitively; otherwise None."""
    segments = [seg for seg in (urlparse(href).path or "").split("/") if seg]
    if len(segments) < 2 or segments[0].lower() != top_slug.lower():
        return None
    sub = segments[1]
    if sub.lower().endswith((".html", ".htm")):
        sub = re.sub(r"\.(html?|HTML?)$", "", sub)
    return sub
|
||||
|
||||
|
||||
async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the nav structure for the allow-listed categories.

    For each allowed (label -> slug) category, attach its top-level href and
    the alphabetically sorted subcategory entries discovered on the homepage.
    """
    nav: Dict[str, Dict] = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda entry: entry["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
|
||||
|
||||
|
||||
async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape homepage anchors, bucket them by top path segment, and group
    the buckets into the final nav structure."""
    slug_to_links: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in await scrape_nav_raw():
        path_parts = [seg for seg in (urlparse(href).path or "").split("/") if seg]
        if not path_parts:
            continue
        top = path_parts[0].lower()
        # Skip configured non-category paths (cart, account, etc.).
        if top not in config()["slugs"]["skip"]:
            slug_to_links.setdefault(top, []).append((text, href))
    return await group_by_category(slug_to_links)
|
||||
|
||||
async def nav_scrape() -> Dict[str, Dict]:
    """Return the scraped navigation structure.

    NOTE(review): the old docstring claimed an offline snapshot fallback,
    but this path always scrapes live — confirm whether a snapshot path is
    still intended.
    """
    return await scrape_nav_filtered()
|
||||
6
market/scrape/persist_api/__init__.py
Normal file
6
market/scrape/persist_api/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from .upsert_product import upsert_product
|
||||
from .log_product_result import log_product_result
|
||||
from .save_nav import save_nav
|
||||
from .save_subcategory_redirects import save_subcategory_redirects
|
||||
from .capture_listing import capture_listing
|
||||
|
||||
27
market/scrape/persist_api/capture_listing.py
Normal file
27
market/scrape/persist_api/capture_listing.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# persist_api.capture_listing: POST a scraped listing page to the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import List
|
||||
|
||||
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
):
    """POST a scraped listing page (URL, product slugs, page count) to the
    market persistence API and return the decoded JSON response ({} when
    the body is empty)."""
    endpoint = os.getenv("CAPTURE_LISTING_URL", "http://localhost:8001/market/suma-market/api/products/listing/")
    payload = {
        "url": url,
        "items": items,
        "total_pages": total_pages
    }
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=payload)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
24
market/scrape/persist_api/log_product_result.py
Normal file
24
market/scrape/persist_api/log_product_result.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# persist_api.log_product_result: POST a product scrape outcome to the logging API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
|
||||
async def log_product_result(
    ok: bool,
    payload
):
    """POST a product scrape outcome (success flag + payload) to the product
    log API and return the decoded JSON response ({} when empty)."""
    endpoint = os.getenv("PRODUCT_LOG_URL", "http://localhost:8000/market/api/products/log/")
    body = {
        "ok": ok,
        "payload": payload
    }
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=body)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
19
market/scrape/persist_api/save_nav.py
Normal file
19
market/scrape/persist_api/save_nav.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# persist_api.save_nav: POST the scraped navigation structure to the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_nav(
    nav: Dict,
):
    """POST the scraped navigation structure to the market persistence API
    and return the decoded JSON response ({} when the body is empty)."""
    endpoint = os.getenv("SAVE_NAV_URL", "http://localhost:8001/market/suma-market/api/products/nav/")
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=nav)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
15
market/scrape/persist_api/save_subcategory_redirects.py
Normal file
15
market/scrape/persist_api/save_subcategory_redirects.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_subcategory_redirects(mapping: Dict[str, str]) -> Dict:
    """POST the subcategory redirect mapping to the persistence API.

    Args:
        mapping: old-slug -> new-slug redirects discovered during scraping.

    Returns:
        The decoded JSON response ({} for an empty body). Fix: the function
        was annotated ``-> None`` even though it returns the response data.
    """
    sync_url = os.getenv("SAVE_REDIRECTS", "http://localhost:8000/market/api/products/redirects/")

    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        resp = await client.post(sync_url, json=mapping)
        # Raise for non-2xx
        resp.raise_for_status()
        data = resp.json() if resp.content else {}
        return data
|
||||
|
||||
256
market/scrape/persist_api/upsert_product.py
Normal file
256
market/scrape/persist_api/upsert_product.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# persist_api.upsert_product: HTTP-based upsert that POSTs product data to the sync API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict, List, Any
|
||||
|
||||
async def upsert_product(
    slug,
    href,
    d,
):
    """
    Posts the given product dict `d` to the /api/products/sync endpoint.
    Keeps the same signature as before and preserves logging/commit behavior.
    Retries forever (5 seconds apart) until the POST succeeds.

    Note: `href` is unused here but kept for signature compatibility.
    """
    # Ensure the payload carries the slug we were called with.
    if not d.get("slug"):
        d["slug"] = slug

    # Where to post; override via env if needed
    sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8001/market/suma-market/api/products/sync/")

    payload = _massage_payload(d)

    async def _do_call() -> Dict[str, Any]:
        # One POST attempt; tolerate empty and non-JSON response bodies.
        async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
            resp = await client.post(sync_url, json=payload)
            resp.raise_for_status()
            if not resp.content:
                return {}
            try:
                return resp.json()
            except ValueError:
                return {"raw": resp.text}

    async def _log_error(exc: BaseException) -> None:
        # Optional: add your own logging here
        print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}")

    return await retry_until_success(_do_call, delay=5.0, on_error=_log_error)
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Awaitable, Callable, Dict, Optional
|
||||
|
||||
async def retry_until_success(
    fn: Callable[[], Awaitable[Any]],
    *,
    delay: float = 5.0,
    on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None,
) -> Any:
    """
    Repeatedly call the async no-arg function `fn` until it succeeds (returns
    without raising). Waits `delay` seconds between attempts. Never gives up.
    If provided, `on_error(exc)` is awaited after each failure.
    """
    attempt = 0
    while True:
        try:
            return await fn()
        except asyncio.CancelledError:
            # Cancellation must always propagate immediately.
            raise
        except BaseException as exc:
            attempt += 1
            if on_error is None:
                # Fallback log when no error handler was supplied.
                print(f"[retry] attempt {attempt} failed: {type(exc).__name__}: {exc}")
            else:
                try:
                    await on_error(exc)
                except Exception:
                    # A broken error handler must not stop the retry loop.
                    pass
            await asyncio.sleep(delay)
|
||||
|
||||
|
||||
|
||||
def _get(d, key, default=None):
|
||||
v = d.get(key)
|
||||
return default if v in (None, "", [], {}) else v
|
||||
|
||||
|
||||
def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Mirror the DB-upsert massaging so the API sees the same structure/values."""
|
||||
slug = d.get("slug")
|
||||
if not slug:
|
||||
raise ValueError("product missing slug")
|
||||
|
||||
# --- Top-level fields (use _get where DB upsert uses it) ---
|
||||
out: Dict[str, Any] = {
|
||||
"slug": slug,
|
||||
"title": _get(d, "title"),
|
||||
"image": _get(d, "image"),
|
||||
"description_short": _get(d, "description_short"),
|
||||
"description_html": _get(d, "description_html"),
|
||||
"suma_href": _get(d, "suma_href"),
|
||||
"brand": _get(d, "brand"),
|
||||
"rrp": _get(d, "rrp"),
|
||||
"rrp_currency": _get(d, "rrp_currency"),
|
||||
"rrp_raw": _get(d, "rrp_raw"),
|
||||
"price_per_unit": _get(d, "price_per_unit"),
|
||||
"price_per_unit_currency": _get(d, "price_per_unit_currency"),
|
||||
"price_per_unit_raw": _get(d, "price_per_unit_raw"),
|
||||
"special_price": _get(d, "special_price"),
|
||||
"special_price_currency": _get(d, "special_price_currency"),
|
||||
"special_price_raw": _get(d, "special_price_raw"),
|
||||
"regular_price": _get(d, "regular_price"),
|
||||
"regular_price_currency": _get(d, "regular_price_currency"),
|
||||
"regular_price_raw": _get(d, "regular_price_raw"),
|
||||
"case_size_count": _get(d, "case_size_count"),
|
||||
"case_size_item_qty": _get(d, "case_size_item_qty"),
|
||||
"case_size_item_unit": _get(d, "case_size_item_unit"),
|
||||
"case_size_raw": _get(d, "case_size_raw"),
|
||||
"ean": d.get("ean") or d.get("barcode") or None,
|
||||
"sku": d.get("sku"),
|
||||
"unit_size": d.get("unit_size"),
|
||||
"pack_size": d.get("pack_size"),
|
||||
}
|
||||
|
||||
# --- Sections: only dicts with title+html (like DB sync) ---
|
||||
sections_in = d.get("sections") or []
|
||||
sections_out: List[Dict[str, Any]] = []
|
||||
for sec in sections_in:
|
||||
if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
|
||||
sections_out.append({"title": sec["title"], "html": sec["html"]})
|
||||
out["sections"] = sections_out
|
||||
|
||||
# --- Images: same 3 buckets used in DB sync ---
|
||||
def _coerce_str_list(x):
|
||||
if not x:
|
||||
return []
|
||||
# accept list of strings or list of dicts with {"url": ...}
|
||||
out_urls = []
|
||||
for item in x:
|
||||
if isinstance(item, str):
|
||||
if item:
|
||||
out_urls.append(item)
|
||||
elif isinstance(item, dict):
|
||||
u = item.get("url")
|
||||
if u:
|
||||
out_urls.append(u)
|
||||
return out_urls
|
||||
|
||||
out["images"] = _coerce_str_list(d.get("images"))
|
||||
out["embedded_image_urls"] = _coerce_str_list(d.get("embedded_image_urls"))
|
||||
out["all_image_urls"] = _coerce_str_list(d.get("all_image_urls"))
|
||||
|
||||
# --- Labels: strip (DB code trims) ---
|
||||
labels_in = d.get("labels") or []
|
||||
out["labels"] = [str(x).strip() for x in labels_in if x]
|
||||
|
||||
# --- Stickers: strip + lower (DB code lower-cases) ---
|
||||
stickers_in = d.get("stickers") or []
|
||||
out["stickers"] = [str(x).strip().lower() for x in stickers_in if x]
|
||||
|
||||
# --- Attributes: pass through the same dict sources the DB code reads ---
|
||||
out["info_table"] = d.get("info_table") or {}
|
||||
#out["oe_list_price"] = d.get("oe_list_price") or {}
|
||||
|
||||
# --- Nutrition: allow dict or list of dicts, mirroring DB code ---
|
||||
nutrition = d.get("nutrition") or []
|
||||
if isinstance(nutrition, dict):
|
||||
out["nutrition"] = {str(k).strip(): (None if v is None else str(v)) for k, v in nutrition.items()}
|
||||
elif isinstance(nutrition, list):
|
||||
rows = []
|
||||
for row in nutrition:
|
||||
if not isinstance(row, dict):
|
||||
continue
|
||||
key = str(row.get("key") or "").strip()
|
||||
if not key:
|
||||
continue
|
||||
rows.append({
|
||||
"key": key,
|
||||
"value": None if row.get("value") is None else str(row.get("value")),
|
||||
"unit": None if row.get("unit") is None else str(row.get("unit")),
|
||||
})
|
||||
out["nutrition"] = rows
|
||||
else:
|
||||
out["nutrition"] = []
|
||||
|
||||
# --- Allergens: accept str (→ contains=True) or dict ---
|
||||
alls_in = d.get("allergens") or []
|
||||
alls_out = []
|
||||
for a in alls_in:
|
||||
if isinstance(a, str):
|
||||
nm, contains = a.strip(), True
|
||||
elif isinstance(a, dict):
|
||||
nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
|
||||
else:
|
||||
continue
|
||||
if nm:
|
||||
alls_out.append({"name": nm, "contains": contains})
|
||||
out["allergens"] = alls_out
|
||||
|
||||
out["images"]=[
|
||||
{"url": s.strip(), "kind": "gallery", "position": i}
|
||||
for i, s in enumerate(out.get("images") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
] + [
|
||||
{"url": s.strip(), "kind": "embedded", "position": i}
|
||||
for i, s in enumerate(out.get("embedded_image_urls") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
] + [
|
||||
{"url": s.strip(), "kind": "all", "position": i}
|
||||
for i, s in enumerate(out.get("all_image_urls") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
]
|
||||
out["labels"]= [{"name": s.strip()} for s in out["labels"] if isinstance(s, str) and s.strip()]
|
||||
out["stickers"]= [{"name": s.strip()} for s in out["stickers"] if isinstance(s, str) and s.strip()]
|
||||
out["attributes"] = build_attributes_list(d)
|
||||
|
||||
|
||||
return out
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten the ``info_table`` / ``oe_list_price`` dicts into attribute rows.

    Each source key becomes a namespaced ``"<source>/<key>"`` key (key text
    stripped), values are stringified with ``None`` preserved, and duplicate
    ``(key, value)`` pairs are dropped while keeping first-seen order.

    Args:
        d: scraped product payload.

    Returns:
        List of ``{"key": ..., "value": ...}`` dicts, ``info_table`` entries first.
    """
    sources = (
        ("info_table", d.get("info_table") or {}),
        ("oe_list_price", d.get("oe_list_price") or {}),
    )
    emitted: set = set()
    result: List[Dict[str, Any]] = []
    for prefix, table in sources:
        for raw_key, raw_val in table.items():
            pair = (
                f"{prefix}/{str(raw_key).strip()}",
                None if raw_val is None else str(raw_val),
            )
            # Dedupe by (key, value), preserving the first occurrence.
            if pair in emitted:
                continue
            emitted.add(pair)
            result.append({"key": pair[0], "value": pair[1]})
    return result
|
||||
7
market/scrape/persist_snapshot/__init__.py
Normal file
7
market/scrape/persist_snapshot/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .log_product_result import log_product_result
|
||||
from .upsert_product import upsert_product
|
||||
from .save_nav import save_nav
|
||||
from .capture_listing import capture_listing
|
||||
from .save_link_reports import save_link_reports
|
||||
from .save_subcategory_redirects import save_subcategory_redirects
|
||||
|
||||
3
market/scrape/persist_snapshot/_get.py
Normal file
3
market/scrape/persist_snapshot/_get.py
Normal file
@@ -0,0 +1,3 @@
|
||||
def _get(d, key, default=None):
|
||||
v = d.get(key)
|
||||
return default if v in (None, "", [], {}) else v
|
||||
137
market/scrape/persist_snapshot/capture_listing.py
Normal file
137
market/scrape/persist_snapshot/capture_listing.py
Normal file
@@ -0,0 +1,137 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from typing import Optional, List
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
select, update
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
|
||||
from models.market import (
|
||||
NavTop,
|
||||
NavSub,
|
||||
Listing,
|
||||
ListingItem,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Persist one listing snapshot for *url* in its own session and commit.

    Args:
        url: the category listing URL the slugs were scraped from.
        items: product slugs seen on that listing.
        total_pages: pagination depth reported by the listing page.
    """
    async with get_session() as session:
        await _capture_listing(session, url, items, total_pages)
        await session.commit()
|
||||
|
||||
|
||||
async def _capture_listing(
    session,
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Resolve nav ids for *url*, then sync the listing rows under them."""
    nav_ids = await _nav_ids_from_list_url(session, url)
    await _save_listing(session, nav_ids[0], nav_ids[1], items, total_pages)
|
||||
|
||||
async def _save_listing(session: AsyncSession, top_id: int, sub_id: Optional[int],
                        items: List[str], total_pages: Optional[int]) -> None:
    """Sync the listing for (top_id, sub_id) to exactly the given slugs.

    Upserts the Listing row, then diffs the incoming slugs against the live
    ListingItem rows: slugs no longer present are soft-deleted, new slugs are
    inserted. Caller is responsible for committing the session.
    """
    # Find (or create) the live Listing row for this nav pair.
    res = await session.execute(
        select(Listing).where(Listing.top_id == top_id, Listing.sub_id == sub_id, Listing.deleted_at.is_(None))
    )
    listing = res.scalar_one_or_none()
    if not listing:
        listing = Listing(top_id=top_id, sub_id=sub_id, total_pages=total_pages)
        session.add(listing)
        # Flush so listing.id is assigned before child rows reference it.
        await session.flush()
    else:
        listing.total_pages = total_pages

    # Normalize and deduplicate incoming slugs (order-preserving; non-string
    # and falsy entries are dropped).
    seen: set[str] = set()
    deduped: list[str] = []
    for s in items or []:
        if s and isinstance(s, str) and s not in seen:
            seen.add(s)
            deduped.append(s)

    # NOTE(review): with no valid slugs we return without soft-deleting the
    # existing items — an empty listing leaves stale rows live; confirm intended.
    if not deduped:
        return

    # Fetch existing (live) slugs from the database for the diff.
    res = await session.execute(
        select(ListingItem.slug)
        .where(ListingItem.listing_id == listing.id, ListingItem.deleted_at.is_(None))
    )
    existing_slugs = set(res.scalars().all())

    now = datetime.utcnow()

    # Slugs to delete (present in DB but not in the new data) — soft delete.
    to_delete = existing_slugs - seen
    if to_delete:
        await session.execute(
            update(ListingItem)
            .where(
                ListingItem.listing_id == listing.id,
                ListingItem.slug.in_(to_delete),
                ListingItem.deleted_at.is_(None)
            )
            .values(deleted_at=now)
        )

    # Slugs to insert (new ones not in DB) — bulk insert via the PostgreSQL
    # dialect's INSERT (on_conflict handling currently disabled).
    to_insert = seen - existing_slugs
    if to_insert:
        stmt = pg_insert(ListingItem).values(
            [{"listing_id": listing.id, "slug": s} for s in to_insert]
        )
        #.on_conflict_do_nothing(
        #    constraint="uq_listing_items_listing_slug"
        #)
        await session.execute(stmt)
|
||||
|
||||
async def _nav_ids_from_list_url(session: AsyncSession, list_url: str) -> Tuple[int, Optional[int]]:
    """Map a category listing URL to its (top_id, sub_id) nav ids.

    The first path segment is the top-category slug (lower-cased); the
    optional second segment is the sub-category slug with a trailing
    ".html"/".htm" extension stripped case-insensitively.

    Raises:
        ValueError: propagated from _get_nav_ids when either slug is unknown.
    """
    parts = [x for x in (urlparse(list_url).path or "").split("/") if x]
    top_slug = parts[0].lower() if parts else ""
    sub_slug = None
    if len(parts) >= 2:
        sub_slug = parts[1]
        if sub_slug.lower().endswith((".html", ".htm")):
            # BUG FIX: previous pattern r"\\.(html?|HTML?)$" double-escaped the
            # dot inside a raw string, requiring a literal backslash — it never
            # matched, so the extension the endswith() guard had just confirmed
            # was never stripped. IGNORECASE also covers mixed-case extensions.
            sub_slug = re.sub(r"\.html?$", "", sub_slug, flags=re.IGNORECASE)
    return await _get_nav_ids(session, top_slug, sub_slug)
|
||||
|
||||
|
||||
|
||||
async def _get_nav_ids(session: AsyncSession, top_slug: str, sub_slug: Optional[str]) -> Tuple[int, Optional[int]]:
    """Resolve live NavTop/NavSub ids for the given slugs.

    Raises ValueError when the top slug, or a supplied sub slug, has no
    matching non-deleted row.
    """
    top_res = await session.execute(select(NavTop.id).where(NavTop.slug == top_slug, NavTop.deleted_at.is_(None)))
    top_id = top_res.scalar_one_or_none()
    if not top_id:
        raise ValueError(f"NavTop not found for slug: {top_slug}")

    # No sub-category requested: done.
    if not sub_slug:
        return top_id, None

    sub_res = await session.execute(
        select(NavSub.id).where(NavSub.slug == sub_slug, NavSub.top_id == top_id, NavSub.deleted_at.is_(None))
    )
    sub_id = sub_res.scalar_one_or_none()
    if sub_id is None:
        raise ValueError(f"NavSub not found for slug: {sub_slug} under top_id={top_id}")
    return top_id, sub_id
|
||||
35
market/scrape/persist_snapshot/log_product_result.py
Normal file
35
market/scrape/persist_snapshot/log_product_result.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import Dict
|
||||
from models.market import (
|
||||
ProductLog,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
|
||||
async def log_product_result(ok: bool, payload: Dict) -> None:
    """Record one product scrape outcome in its own session and commit."""
    async with get_session() as session:
        await _log_product_result(session, ok, payload)
        await session.commit()
|
||||
|
||||
|
||||
async def _log_product_result(session: AsyncSession, ok: bool, payload: Dict) -> None:
    """Stage a ProductLog row on *session* (caller commits).

    Every field is copied verbatim from *payload* by key; absent keys become
    None, exactly as the column-by-column .get() calls did.
    """
    fields = (
        "slug", "href_tried", "error_type", "error_message", "http_status",
        "final_url", "transport_error", "title", "has_description_html",
        "has_description_short", "sections_count", "images_count",
        "embedded_images_count", "all_images_count",
    )
    session.add(ProductLog(ok=ok, **{name: payload.get(name) for name in fields}))
|
||||
|
||||
29
market/scrape/persist_snapshot/save_link_reports.py
Normal file
29
market/scrape/persist_snapshot/save_link_reports.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from typing import List
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
from models.market import (
|
||||
LinkError,
|
||||
LinkExternal,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
async def save_link_reports(link_errors: List[Dict], link_externals: List[Dict]) -> None:
    """Persist broken-link and external-link reports in a single transaction.

    Args:
        link_errors: dicts with product/href/text/top/sub/target_slug/type keys.
        link_externals: dicts with product/href/text/host keys.
    """
    async with get_session() as session:
        session.add_all([
            LinkError(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                top=rec.get("top"),
                sub=rec.get("sub"),
                target_slug=rec.get("target_slug"),
                type=rec.get("type"),
            )
            for rec in link_errors
        ])
        session.add_all([
            LinkExternal(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                host=rec.get("host"),
            )
            for rec in link_externals
        ])
        await session.commit()
|
||||
110
market/scrape/persist_snapshot/save_nav.py
Normal file
110
market/scrape/persist_snapshot/save_nav.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
select, tuple_
|
||||
)
|
||||
from typing import Dict
|
||||
|
||||
from models.market import (
|
||||
NavTop,
|
||||
NavSub,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
|
||||
|
||||
|
||||
async def save_nav(nav: Dict) -> None:
    """Sync the scraped navigation tree to the DB in its own session and commit."""
    async with get_session() as session:
        await _save_nav(session, nav)
        await session.commit()
|
||||
|
||||
async def _save_nav(session, nav: Dict, market_id=None) -> None:
    """Sync NavTop/NavSub rows to the scraped nav tree.

    Expects nav["cats"] to map top-category label -> {"slug": ..., "subs":
    [{"slug", "label", "href"}, ...]}. Rows absent from the incoming tree are
    soft-deleted; present ones are upserted (and revived if previously
    soft-deleted). Caller commits the session.
    """
    # TODO(review): debug prints left in — consider replacing with logging.
    print('===================SAVE NAV========================')
    print(nav)
    now = datetime.utcnow()

    incoming_top_slugs = set()
    incoming_sub_keys = set()  # (top_slug, sub_slug)

    # First pass: collect slugs
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        incoming_top_slugs.add(top_slug)

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if sub_slug:
                incoming_sub_keys.add((top_slug, sub_slug))

    # Soft-delete stale NavSub entries
    # This requires joining NavTop to access top_slug
    # NOTE(review): if incoming_sub_keys is empty, ~in_(()) is true for every
    # row and ALL live subs get soft-deleted — confirm that an empty nav
    # snapshot should wipe the tree.
    subs_to_delete = await session.execute(
        select(NavSub)
        .join(NavTop, NavSub.top_id == NavTop.id)
        .where(
            NavSub.deleted_at.is_(None),
            ~tuple_(NavTop.slug, NavSub.slug).in_(incoming_sub_keys)
        )
    )
    for sub in subs_to_delete.scalars():
        sub.deleted_at = now

    # Soft-delete stale NavTop entries
    tops_to_delete = await session.execute(
        select(NavTop)
        .where(
            NavTop.deleted_at.is_(None),
            ~NavTop.slug.in_(incoming_top_slugs)
        )
    )
    for top in tops_to_delete.scalars():
        top.deleted_at = now

    # Push the soft-deletes before re-querying in the upsert pass below.
    await session.flush()

    # Upsert NavTop and NavSub
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue

        # Look up by slug regardless of deleted_at so a previously
        # soft-deleted top is revived rather than duplicated.
        res = await session.execute(
            select(NavTop).where(NavTop.slug == top_slug)
        )
        top = res.scalar_one_or_none()

        if top:
            top.label = label
            top.deleted_at = None
            # Only backfill market_id; never overwrite an existing one.
            if market_id is not None and top.market_id is None:
                top.market_id = market_id
        else:
            top = NavTop(label=label, slug=top_slug, market_id=market_id)
            session.add(top)

        # Flush so top.id is populated before creating child NavSub rows.
        await session.flush()

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if not sub_slug:
                continue
            sub_label = s.get("label")
            sub_href = s.get("href")

            res_sub = await session.execute(
                select(NavSub).where(
                    NavSub.slug == sub_slug,
                    NavSub.top_id == top.id
                )
            )
            sub = res_sub.scalar_one_or_none()
            if sub:
                # Existing (possibly soft-deleted) sub: update and revive.
                sub.label = sub_label
                sub.href = sub_href
                sub.deleted_at = None
            else:
                session.add(NavSub(top_id=top.id, label=sub_label, slug=sub_slug, href=sub_href))
|
||||
|
||||
32
market/scrape/persist_snapshot/save_subcategory_redirects.py
Normal file
32
market/scrape/persist_snapshot/save_subcategory_redirects.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# at top of persist_snapshot.py:
|
||||
|
||||
from typing import Dict
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
update
|
||||
)
|
||||
from models.market import (
|
||||
SubcategoryRedirect,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
async def save_subcategory_redirects(mapping: Dict[str, str]) -> None:
    """Replace the live subcategory-redirect table with *mapping* and commit."""
    async with get_session() as session:
        await _save_subcategory_redirects(session, mapping)
        await session.commit()
|
||||
|
||||
|
||||
async def _save_subcategory_redirects(session, mapping: Dict[str, str]) -> None:
    """Soft-delete every live redirect row, then stage the fresh mapping.

    Caller commits; mapping is old_path -> new_path.
    """
    stale = (
        update(SubcategoryRedirect)
        .where(SubcategoryRedirect.deleted_at.is_(None))
        .values(deleted_at=datetime.utcnow())
    )
    await session.execute(stale)
    session.add_all([
        SubcategoryRedirect(old_path=old, new_path=new)
        for old, new in mapping.items()
    ])
|
||||
|
||||
|
||||
|
||||
#for slug in items:
|
||||
# product_slugs.add(slug)
|
||||
237
market/scrape/persist_snapshot/upsert_product.py
Normal file
237
market/scrape/persist_snapshot/upsert_product.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import Dict
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
func, select, update
|
||||
)
|
||||
|
||||
from models.market import (
|
||||
Product,
|
||||
ProductImage,
|
||||
ProductSection,
|
||||
ProductLabel,
|
||||
ProductSticker,
|
||||
ProductAttribute,
|
||||
ProductNutrition,
|
||||
ProductAllergen
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
from ._get import _get
|
||||
from .log_product_result import _log_product_result
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
async def _upsert_product(session: AsyncSession, d: Dict) -> Product:
    """Upsert one product and diff-sync all of its child tables.

    Scalar fields are overwritten from *d* (empty values coerced to None via
    _get). Each child table (sections, images, labels, stickers, attributes,
    nutrition, allergens) is synced by set difference: new rows are added,
    rows absent from *d* are soft-deleted. Caller commits the session.

    Raises:
        ValueError: when *d* has no "slug".
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    res = await session.execute(select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)))
    p = res.scalar_one_or_none()
    if not p:
        p = Product(slug=slug)
        session.add(p)
        # BUG FIX: flush so p.id is assigned before the child-table syncs
        # below build rows with product_id=p.id — previously a brand-new
        # product left p.id as None, so every child row was created with a
        # NULL product_id. Mirrors the add-then-flush pattern in _save_listing.
        await session.flush()

    # --- Scalar fields (empty string/list/dict/None normalized by _get) ---
    p.title = _get(d, "title")
    p.image = _get(d, "image")
    p.description_short = _get(d, "description_short")
    p.description_html = _get(d, "description_html")
    p.suma_href = _get(d, "suma_href")
    p.brand = _get(d, "brand")
    p.rrp = _get(d, "rrp")
    p.rrp_currency = _get(d, "rrp_currency")
    p.rrp_raw = _get(d, "rrp_raw")
    p.price_per_unit = _get(d, "price_per_unit")
    p.price_per_unit_currency = _get(d, "price_per_unit_currency")
    p.price_per_unit_raw = _get(d, "price_per_unit_raw")
    p.special_price = _get(d, "special_price")
    p.special_price_currency = _get(d, "special_price_currency")
    p.special_price_raw = _get(d, "special_price_raw")
    p.regular_price = _get(d, "regular_price")
    p.regular_price_currency = _get(d, "regular_price_currency")
    p.regular_price_raw = _get(d, "regular_price_raw")
    p.case_size_count = _get(d, "case_size_count")
    p.case_size_item_qty = _get(d, "case_size_item_qty")
    p.case_size_item_unit = _get(d, "case_size_item_unit")
    p.case_size_raw = _get(d, "case_size_raw")
    p.ean = d.get("ean") or d.get("barcode") or None
    p.sku = d.get("sku")
    p.unit_size = d.get("unit_size")
    p.pack_size = d.get("pack_size")
    p.updated_at = func.now()

    now = datetime.utcnow()

    # ProductSection sync: keyed by (title, html)
    existing_sections = await session.execute(select(ProductSection).where(ProductSection.product_id == p.id, ProductSection.deleted_at.is_(None)))
    existing_sections_set = {(s.title, s.html) for s in existing_sections.scalars()}

    new_sections_set = set()
    for sec in d.get("sections") or []:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            new_sections_set.add((sec["title"], sec["html"]))
            if (sec["title"], sec["html"]) not in existing_sections_set:
                session.add(ProductSection(product_id=p.id, title=sec["title"], html=sec["html"]))

    for s in existing_sections_set - new_sections_set:
        await session.execute(update(ProductSection).where(ProductSection.product_id == p.id, ProductSection.title == s[0], ProductSection.html == s[1], ProductSection.deleted_at.is_(None)).values(deleted_at=now))

    # ProductImage sync: keyed by (url, kind), position recorded per bucket
    existing_images = await session.execute(select(ProductImage).where(ProductImage.product_id == p.id, ProductImage.deleted_at.is_(None)))
    existing_images_set = {(img.url, img.kind) for img in existing_images.scalars()}

    new_images_set = set()
    for kind, urls in [
        ("gallery", d.get("images") or []),
        ("embedded", d.get("embedded_image_urls") or []),
        ("all", d.get("all_image_urls") or []),
    ]:
        for idx, url in enumerate(urls):
            if url:
                new_images_set.add((url, kind))
                if (url, kind) not in existing_images_set:
                    session.add(ProductImage(product_id=p.id, url=url, position=idx, kind=kind))

    for img in existing_images_set - new_images_set:
        await session.execute(update(ProductImage).where(ProductImage.product_id == p.id, ProductImage.url == img[0], ProductImage.kind == img[1], ProductImage.deleted_at.is_(None)).values(deleted_at=now))

    # ProductLabel sync: names stripped
    existing_labels = await session.execute(select(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.deleted_at.is_(None)))
    existing_labels_set = {label.name.strip() for label in existing_labels.scalars()}

    new_labels = {str(name).strip() for name in (d.get("labels") or []) if name}

    for name in new_labels - existing_labels_set:
        session.add(ProductLabel(product_id=p.id, name=name))

    for name in existing_labels_set - new_labels:
        await session.execute(update(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.name == name, ProductLabel.deleted_at.is_(None)).values(deleted_at=now))

    # ProductSticker sync: names stripped + lower-cased
    existing_stickers = await session.execute(select(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.deleted_at.is_(None)))
    existing_stickers_set = {sticker.name.strip() for sticker in existing_stickers.scalars()}

    new_stickers = {str(name).strip().lower() for name in (d.get("stickers") or []) if name}

    for name in new_stickers - existing_stickers_set:
        session.add(ProductSticker(product_id=p.id, name=name))

    for name in existing_stickers_set - new_stickers:
        await session.execute(update(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.name == name, ProductSticker.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAttribute sync: info_table/oe_list_price flattened to prefixed keys
    existing_attrs = await session.execute(select(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.deleted_at.is_(None)))
    existing_attrs_set = {(a.key, a.value) for a in existing_attrs.scalars()}

    new_attrs_set = set()
    for src, prefix in [(d.get("info_table") or {}, "info_table"), (d.get("oe_list_price") or {}, "oe_list_price")]:
        for k, v in src.items():
            key = f"{prefix}/{str(k).strip()}"
            val = None if v is None else str(v)
            new_attrs_set.add((key, val))
            if (key, val) not in existing_attrs_set:
                session.add(ProductAttribute(product_id=p.id, key=key, value=val))

    for key, val in existing_attrs_set - new_attrs_set:
        await session.execute(update(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.key == key, ProductAttribute.value == val, ProductAttribute.deleted_at.is_(None)).values(deleted_at=now))

    # ProductNutrition sync: accepts a flat dict (no unit) or a list of rows
    existing_nuts = await session.execute(select(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.deleted_at.is_(None)))
    existing_nuts_set = {(n.key, n.value, n.unit) for n in existing_nuts.scalars()}

    new_nuts_set = set()
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        for k, v in nutrition.items():
            key, val = str(k).strip(), str(v) if v is not None else None
            new_nuts_set.add((key, val, None))
            if (key, val, None) not in existing_nuts_set:
                session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=None))
    elif isinstance(nutrition, list):
        for row in nutrition:
            try:
                key = str(row.get("key") or "").strip()
                val = None if row.get("value") is None else str(row.get("value"))
                unit = None if row.get("unit") is None else str(row.get("unit"))
                if key:
                    new_nuts_set.add((key, val, unit))
                    if (key, val, unit) not in existing_nuts_set:
                        session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=unit))
            except Exception:
                # Malformed rows (e.g. non-dicts) are skipped best-effort.
                continue

    for key, val, unit in existing_nuts_set - new_nuts_set:
        await session.execute(update(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.key == key, ProductNutrition.value == val, ProductNutrition.unit == unit, ProductNutrition.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAllergen sync: bare strings mean contains=True
    existing_allergens = await session.execute(select(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.deleted_at.is_(None)))
    existing_allergens_set = {(a.name, a.contains) for a in existing_allergens.scalars()}

    new_allergens_set = set()
    for a in d.get("allergens") or []:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            new_allergens_set.add((nm, contains))
            if (nm, contains) not in existing_allergens_set:
                session.add(ProductAllergen(product_id=p.id, name=nm, contains=contains))

    for name, contains in existing_allergens_set - new_allergens_set:
        await session.execute(update(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.name == name, ProductAllergen.contains == contains, ProductAllergen.deleted_at.is_(None)).values(deleted_at=now))

    await session.flush()
    return p
|
||||
|
||||
async def upsert_product(
    slug,
    href,
    d,
):
    """Upsert one scraped product in its own session and log the outcome.

    On success, the product rows and an ok=True ProductLog entry are committed
    together. On failure, diagnostics are printed, an ok=False entry is staged,
    and the original exception is re-raised before commit.
    NOTE(review): the failure-path log entry is never committed here — confirm
    get_session persists it on exit, otherwise failures are not recorded.

    Args:
        slug: product slug, recorded on the success log entry.
        href: the URL that was fetched, recorded as href_tried.
        d: the scraped product payload.
    """
    async with get_session() as session:
        try:
            await _upsert_product(session, d)
            await _log_product_result(session, ok=True, payload={
                "slug": slug,
                "href_tried": href,
                "title": d.get("title"),
                "has_description_html": bool(d.get("description_html")),
                "has_description_short": bool(d.get("description_short")),
                "sections_count": len(d.get("sections") or []),
                # BUG FIX: the three counts below previously called len()
                # directly on d.get(...), raising TypeError when the key was
                # missing/None — now guarded like sections_count above.
                "images_count": len(d.get("images") or []),
                "embedded_images_count": len(d.get("embedded_image_urls") or []),
                "all_images_count": len(d.get("all_image_urls") or []),
            })

        except Exception as e:
            print(f"[ERROR] Failed to upsert product '{d.get('slug')}'")
            # BUG FIX: was f" Title: {d}.get('title')" — the .get() call sat
            # outside the braces, printing the entire dict plus literal text.
            print(f" Title: {d.get('title')}")
            print(f" URL: {d.get('suma_href')}")
            print(f" Error type: {type(e).__name__}")
            print(f" Error message: {str(e)}")
            import traceback
            traceback.print_exc()
            await _log_product_result(session, ok=False, payload={
                "slug": d.get("slug"),
                "href_tried": d.get("suma_href"),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "title": d.get("title"),
            })
            raise
        await session.commit()
|
||||
1
market/scrape/product/__init__.py
Normal file
1
market/scrape/product/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
13
market/scrape/product/extractors/__init__.py
Normal file
13
market/scrape/product/extractors/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Auto-import all extractor modules so they register themselves.
|
||||
from .title import ex_title # noqa: F401
|
||||
from .images import ex_images # noqa: F401
|
||||
from .short_description import ex_short_description # noqa: F401
|
||||
from .description_sections import ex_description_sections # noqa: F401
|
||||
from .nutrition_ex import ex_nutrition # noqa: F401
|
||||
from .stickers import ex_stickers # noqa: F401
|
||||
from .labels import ex_labels # noqa: F401
|
||||
from .info_table import ex_info_table # noqa: F401
|
||||
from .oe_list_price import ex_oe_list_price # noqa: F401
|
||||
from .regular_price_fallback import ex_regular_price_fallback # noqa: F401
|
||||
from .breadcrumbs import ex_breadcrumbs # noqa: F401
|
||||
68
market/scrape/product/extractors/breadcrumbs.py
Normal file
68
market/scrape/product/extractors/breadcrumbs.py
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Union
|
||||
from urllib.parse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse breadcrumbs to identify top and sub categories.

    Returns a dict with "category_breadcrumbs" (list of {title, href, slug})
    plus category_top_* / category_sub_* fields and "category_path" when the
    first two non-Home linked crumbs can be identified. Returns {} when no
    breadcrumb list is found.
    """
    # Try progressively looser selectors for the breadcrumb <ul>.
    bc_ul = (soup.select_one(".breadcrumbs ul.items")
             or soup.select_one("nav.breadcrumbs ul.items")
             or soup.select_one("ul.items"))
    if not bc_ul:
        return {}

    crumbs = []
    for li in bc_ul.select("li.item"):
        a = li.find("a")
        if a:
            title = normalize_text(a.get("title") or a.get_text())
            href = a.get("href")
        else:
            title = normalize_text(li.get_text())
            href = None
        # Slug = last path segment of the crumb's href, if parseable.
        slug = None
        if href:
            try:
                p = urlparse(href)
                path = (p.path or "").strip("/")
                slug = path.split("/")[-1] if path else None
            except Exception:
                slug = None
        # NOTE(review): crumbs without a slug (typically the link-less final
        # crumb for the current page) are dropped entirely — confirm intended.
        if slug:
            crumbs.append({"title": title or None, "href": href or None, "slug": slug})

    # Heuristic: first non-Home linked crumb is the top category, the next
    # one the sub category.
    category_links = [c for c in crumbs if c.get("href")]
    top = None
    sub = None
    for c in category_links:
        t = (c.get("title") or "").lower()
        s = (c.get("slug") or "").lower()
        if t == "home" or s in ("", "home"):
            continue
        if top is None:
            top = c
            continue
        if sub is None:
            sub = c
            break

    out: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        out["category_top_title"] = top.get("title")
        out["category_top_href"] = top.get("href")
        out["category_top_slug"] = top.get("slug")
    if sub:
        out["category_sub_title"] = sub.get("title")
        out["category_sub_href"] = sub.get("href")
        out["category_sub_slug"] = sub.get("slug")
    if top and sub:
        out["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
    return out
|
||||
43
market/scrape/product/extractors/description_sections.py
Normal file
43
market/scrape/product/extractors/description_sections.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
additional_attributes_table,
|
||||
)
|
||||
from ..helpers.text import clean_title, is_blacklisted_heading
|
||||
|
||||
@extractor
def ex_description_sections(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the product description HTML and titled content sections.

    Returns {"sections": [{"title", "html"}, ...]} and, when found,
    "description_html". Sources, in priority order: the description
    container, Magento tab panes (falling back to heading scan), and the
    additional-attributes table.
    """
    description_html = None
    sections: List[Dict] = []
    # Primary source: the dedicated description container, split into an
    # opening fragment plus any titled sub-sections it contains.
    desc_el = find_description_container(soup)
    if desc_el:
        open_html, sections_from_desc = split_description_container(desc_el)
        description_html = open_html or None
        sections.extend(sections_from_desc)

    # Track titles case-insensitively to avoid duplicate sections.
    existing = {s["title"].lower() for s in sections}
    # Secondary source: Magento tab panes; heading scan only if no tabs found.
    for t, html_fragment in (pair_title_content_from_magento_tabs(soup) or scan_headings_for_sections(soup)):
        low = t.lower()
        # Description-like tabs feed description_html instead of sections.
        if "product description" in low or low == "description" or "details" in low:
            if not description_html and html_fragment:
                description_html = absolutize_fragment(html_fragment)
            continue
        # Keep only new, non-empty, non-blacklisted sections.
        if t.lower() not in existing and normalize_text(BeautifulSoup(html_fragment, "lxml").get_text()):
            if not is_blacklisted_heading(t):
                sections.append({"title": clean_title(t), "html": absolutize_fragment(html_fragment)})
                existing.add(t.lower())
    # Tertiary source: the additional-attributes table, as its own section.
    addl = additional_attributes_table(soup)
    if addl and "additional information" not in existing and not is_blacklisted_heading("additional information"):
        sections.append({"title": "Additional Information", "html": addl})
    out = {"sections": sections}
    if description_html:
        out["description_html"] = description_html
    return out
|
||||
|
||||
89
market/scrape/product/extractors/images.py
Normal file
89
market/scrape/product/extractors/images.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from __future__ import annotations
|
||||
import json, re
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
|
||||
|
||||
@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """
    Collect product image URLs with three fallbacks, stopping at the first
    source that yields anything:
      1. the Magento gallery init script (x-magento-init JSON),
      2. JSON-LD "image" entries,
      3. a generic DOM scan of gallery <img> elements.

    Returns {"images": [...], "image": first-or-None}, deduplicated by
    host + filename.
    """
    images: List[str] = []
    debug = False  # set True while debugging

    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}")

    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        # Correct (not over-escaped) patterns:
        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
        if not m:
            if debug: print("[ex_images] 'data' array not found in gallery block")
            continue

        arr_txt = m.group(1)
        added = False
        try:
            data = json.loads(arr_txt)
            for entry in data:
                # Prefer the full-size URL; fall back to the display image.
                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
                if u:
                    images.append(u); added = True
        except Exception as e:
            if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            # Fallback to simple key extraction
            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
            imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
            for u in (fulls or imgs):
                u = abs_url(u)
                if u:
                    images.append(u); added = True

        if added:
            break  # got what we need from the gallery block

    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except Exception:
                continue

            def add_from(val):
                # JSON-LD "image" may be a string, a list (of strings or
                # ImageObject dicts), or a single ImageObject dict.
                if isinstance(val, str):
                    u = abs_url(val); u and images.append(u)
                elif isinstance(val, list):
                    for v in val:
                        if isinstance(v, str):
                            u = abs_url(v); u and images.append(u)
                        elif isinstance(v, dict) and "url" in v:
                            u = abs_url(v["url"]); u and images.append(u)
                elif isinstance(val, dict) and "url" in val:
                    u = abs_url(val["url"]); u and images.append(u)

            if isinstance(data, dict) and "image" in data:
                add_from(data["image"])
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and "image" in item:
                        add_from(item["image"])

    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)

    images = dedup_by_filename(images)
    if debug: print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}
|
||||
76
market/scrape/product/extractors/info_table.py
Normal file
76
market/scrape/product/extractors/info_table.py
Normal file
@@ -0,0 +1,76 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price, parse_case_size
|
||||
|
||||
@extractor
def ex_info_table(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse <div class="product-page-info-table"> label/content rows.

    Produces:
        info_table (raw label->value map), brand,
        rrp[_raw|_currency], price_per_unit[_raw|_currency],
        case_size_* fields.
    Returns {} when the table or its rows are absent.
    """
    table = soup.select_one(".product-page-info-table")
    if table is None:
        return {}
    body = table.select_one(".product-page-info-table-rows") or table
    row_els = body.select(".product-page-info-table-row")
    if not row_els:
        return {}

    # Raw label -> value map, both normalized.
    info: Dict[str, str] = {}
    for row in row_els:
        label_el = row.select_one(".product-page-info-table__label")
        content_el = row.select_one(".product-page-info-table__content")
        if label_el is None or content_el is None:
            continue
        key = normalize_text(label_el.get_text())
        if key:
            info[key] = normalize_text(content_el.get_text())

    result: Dict[str, Union[str, float, int, Dict]] = {"info_table": info}

    # Brand appears under two known label spellings.
    brand = info.get("Brand") or info.get("Brand Name")
    if brand:
        result["brand"] = brand

    # RRP and Price Per Unit share one shape: raw text + value + currency.
    for primary, fallback, prefix in (
        ("RRP", None, "rrp"),
        ("Price Per Unit", "Unit Price", "price_per_unit"),
    ):
        text = info.get(primary, "") or (info.get(fallback, "") if fallback else "")
        value, currency, raw = parse_price(text)
        if raw and (value is not None or currency is not None):
            result[f"{prefix}_raw"] = raw
            if value is not None:
                result[prefix] = value
            if currency:
                result[f"{prefix}_currency"] = currency

    # Case Size / Pack Size, e.g. "6 x 500g".
    case_text = info.get("Case Size", "") or info.get("Pack Size", "")
    count, item_qty, item_unit, case_raw = parse_case_size(case_text)
    if case_raw:
        result["case_size_raw"] = case_raw
    if count is not None:
        result["case_size_count"] = count
    if item_qty is not None:
        result["case_size_item_qty"] = item_qty
    if item_unit:
        result["case_size_item_unit"] = item_unit

    return result
|
||||
41
market/scrape/product/extractors/labels.py
Normal file
41
market/scrape/product/extractors/labels.py
Normal file
@@ -0,0 +1,41 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse badge items from:
        <ul class="cdz-product-labels">
          <li class="label-item new"><div class="label-content">NEW</div></li>
        </ul>

    Returns {"labels": [...]}: the lower-cased, order-preserving union of each
    item's extra CSS classes and its visible text. {} when nothing is found.
    """
    container = soup.select_one("ul.cdz-product-labels")
    if container is None:
        return {}

    class_hints: List[str] = []
    visible_texts: List[str] = []
    for item in container.select("li.label-item"):
        # Every class except the structural 'label-item' hints at a label name.
        for cls in item.get("class") or []:
            cls = (cls or "").strip()
            if cls and cls.lower() != "label-item" and cls not in class_hints:
                class_hints.append(cls)
        text = normalize_text(item.get_text())
        if text and text not in visible_texts:
            visible_texts.append(text)

    if not (class_hints or visible_texts):
        return {}

    seen = set()
    labels: List[str] = []
    for candidate in class_hints + [t.lower() for t in visible_texts]:
        norm = (candidate or "").strip().lower()
        if norm and norm not in seen:
            seen.add(norm)
            labels.append(norm)
    return {"labels": labels}
|
||||
129
market/scrape/product/extractors/nutrition_ex.py
Normal file
129
market/scrape/product/extractors/nutrition_ex.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
)
|
||||
|
||||
# ----- value/unit parser ------------------------------------------------------
|
||||
|
||||
# Matches "<number> <optional unit>" strings such as "12.5 g", "1,200 kJ",
# "45 %". The number may carry thousands separators; the unit is optional.
_NUM_UNIT_RE = re.compile(
    r"""
    ^\s*
    (?P<num>[-+]?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?|\d+(?:[.,]\d+)?)
    \s*
    (?P<unit>[a-zA-Z%µ/]+)?
    \s*$
    """,
    re.X,
)


def _parse_value_unit(s: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Split a nutrition cell like "250 kcal" into ("250", "kcal").

    Commas are stripped from the number (kept as a string); common kcal/kJ
    spellings are normalized. Returns (None, None) when the input is empty
    or does not look like a number with an optional unit.
    """
    if not s:
        return None, None
    match = _NUM_UNIT_RE.match(re.sub(r"\s+", " ", s.strip()))
    if match is None:
        return None, None
    number = (match.group("num") or "").replace(",", "")
    unit = match.group("unit") or None
    if unit is not None:
        folded = unit.lower()
        if folded in {"kcal", "kcal.", "kcalories", "kcalorie"}:
            unit = "kcal"
        elif folded in {"kj", "kj.", "kilojoule", "kilojoules"}:
            unit = "kJ"
    return (number or None, unit)
|
||||
|
||||
# ----- section finder ---------------------------------------------------------
|
||||
|
||||
def _find_nutrition_section_html(soup: BeautifulSoup) -> Optional[str]:
    """
    Return the HTML for the section whose title matches 'Nutritional Information'.
    We look in the same places your description extractor does.

    Search order: Magento tab panels, then the split description container,
    then a generic heading scan. Returns None when no match is found; the
    description-container branch may return "" if the section has no HTML.
    """
    # 1) Magento tabs
    for t, html in (pair_title_content_from_magento_tabs(soup) or []):
        if not t or not html:
            continue
        # Titles are compared case-insensitively with a trailing ':' dropped.
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    # 2) Description container split into sections
    desc_el = find_description_container(soup)
    if desc_el:
        _open_html, sections = split_description_container(desc_el)
        for sec in sections or []:
            title = normalize_text((sec.get("title") or "")).rstrip(":").lower()
            if "nutritional information" in title:
                return sec.get("html") or ""

    # 3) Fallback: generic heading scan
    for t, html in (scan_headings_for_sections(soup) or []):
        if not t or not html:
            continue
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    return None
|
||||
|
||||
# ----- table parser -----------------------------------------------------------
|
||||
|
||||
def _extract_rows_from_table(root: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Parse the first <table> under `root` into nutrition rows.

    Each <tr> yields {"key", "value", "unit"}: th+td rows use th as the key,
    otherwise the first two td cells are key/value. Values are split into
    number + unit where possible; unparseable values are kept raw with a
    None unit. Duplicate (key, value, unit) triples are dropped, order kept.
    """
    out: List[Dict[str, str]] = []
    table = root.select_one("table")
    if not table:
        return out

    for tr in table.select("tr"):
        th = tr.find("th")
        tds = tr.find_all("td")
        if th and tds:
            key = normalize_text(th.get_text(" ").strip())
            val_raw = normalize_text(tds[0].get_text(" ").strip())
        elif len(tds) >= 2:
            key = normalize_text(tds[0].get_text(" ").strip())
            val_raw = normalize_text(tds[1].get_text(" ").strip())
        else:
            # Row has neither a th+td pair nor two td cells; skip it.
            continue

        if not key or not val_raw:
            continue

        value, unit = _parse_value_unit(val_raw)
        if value is None:  # keep raw if not parseable
            value, unit = val_raw, None

        out.append({"key": key, "value": value, "unit": unit})

    # Deduplicate while preserving order
    seen = set()
    dedup: List[Dict[str, str]] = []
    for r in out:
        t = (r["key"], r.get("value"), r.get("unit"))
        if t in seen:
            continue
        seen.add(t)
        dedup.append(r)
    return dedup
|
||||
|
||||
# ----- extractor --------------------------------------------------------------
|
||||
|
||||
@extractor
def ex_nutrition(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract nutrition rows exclusively from the section titled
    'Nutritional Information'.

    Returns {"nutrition": [{"key": ..., "value": ..., "unit": ...}, ...]};
    the list is empty when no such section exists on the page.
    """
    html = _find_nutrition_section_html(soup)
    if not html:
        return {"nutrition": []}
    return {"nutrition": _extract_rows_from_table(BeautifulSoup(html, "lxml"))}
|
||||
56
market/scrape/product/extractors/oe_list_price.py
Normal file
56
market/scrape/product/extractors/oe_list_price.py
Normal file
@@ -0,0 +1,56 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_oe_list_price(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract Magento "oe-list-price" block:
        <div class="oe-list-price">
            <div class="rrp-price"><label>Regular Price: </label><span class="price">£30.50</span></div>
            <div class="oe-final-price"><label>Special Price: </label><span>£23.63</span></div>
        </div>
    Produces:
        oe_list_price: { rrp_raw, rrp, rrp_currency, special_raw, special, special_currency }
    Also promotes special_* to top-level (special_price_*) if available.
    """
    box = soup.select_one(".oe-list-price")
    if not box:
        return {}
    out: Dict[str, Union[str, float, dict]] = {}
    oe: Dict[str, Union[str, float]] = {}

    # RRP inside oe-list-price (if present)
    rrp = box.select_one(".rrp-price")
    if rrp:
        # Prefer the dedicated price span, then any span, then the row text.
        txt = (rrp.select_one("span.price") or rrp.select_one("span") or rrp).get_text(strip=True)
        val, cur, raw = parse_price(txt)
        if raw:
            oe["rrp_raw"] = raw
        if val is not None:
            oe["rrp"] = val
        if cur:
            oe["rrp_currency"] = cur

    # Special Price inside oe-list-price
    sp = box.select_one(".oe-final-price, .special-price, .final-price")
    if sp:
        txt = (sp.select_one("span.price") or sp.select_one("span") or sp).get_text(strip=True)
        val, cur, raw = parse_price(txt)
        if raw:
            oe["special_raw"] = raw
        if val is not None:
            oe["special"] = val
            # Promote to top-level so downstream consumers see special_price_*.
            out["special_price"] = val
        if cur:
            oe["special_currency"] = cur
            out["special_price_currency"] = cur
        if raw:
            out["special_price_raw"] = raw

    if oe:
        out["oe_list_price"] = oe
    return out
|
||||
33
market/scrape/product/extractors/regular_price_fallback.py
Normal file
33
market/scrape/product/extractors/regular_price_fallback.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_regular_price_fallback(soup: BeautifulSoup, url: str) -> Dict:
    """
    Fallback extractor for legacy 'Regular Price' blocks outside oe-list-price:
        <div class="rrp-price"><label>Regular Price: </label><span class="price">£16.55</span></div>

    Emits regular_price[_raw|_currency] and mirrors the values into the rrp_*
    fields without overwriting anything another extractor already supplied.
    """
    block = soup.select_one("div.rrp-price")
    if block is None:
        return {}
    price_el = block.select_one("span.price")
    text = price_el.get_text(strip=True) if price_el else block.get_text(" ", strip=True)
    value, currency, raw = parse_price(text or "")

    out: Dict[str, Union[str, float]] = {}
    if raw:
        out["regular_price_raw"] = raw
    if value is not None:
        out["regular_price"] = value
    if currency:
        out["regular_price_currency"] = currency
    # Mirror into rrp_* non-destructively.
    if value is not None:
        out.setdefault("rrp", value)
    if currency:
        out.setdefault("rrp_currency", currency)
    if raw:
        out.setdefault("rrp_raw", raw)
    return out
|
||||
19
market/scrape/product/extractors/short_description.py
Normal file
19
market/scrape/product/extractors/short_description.py
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_short_description(soup: BeautifulSoup, url: str) -> Dict:
    """
    First non-empty short description, tried in priority order: the Magento
    description/overview value blocks, then the meta description and
    og:description tags. Text is whitespace-normalized.
    """
    found = None
    selectors = (
        ".product.attribute.description .value",
        ".product.attribute.overview .value",
        "meta[name='description']",
        "meta[property='og:description']",
    )
    for selector in selectors:
        node = soup.select_one(selector)
        if node is None:
            continue
        found = normalize_text(node.get("content") if node.name == "meta" else node.get_text())
        if found:
            break
    return {"description_short": found}
|
||||
30
market/scrape/product/extractors/stickers.py
Normal file
30
market/scrape/product/extractors/stickers.py
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_stickers(soup: BeautifulSoup, url: str) -> Dict:
    """
    Collect sticker names from:
        <div class="stickers">
          <span class="sticker xxx"></span>
          ...
        </div>

    A name comes from each span's extra CSS classes (everything but
    'sticker') plus its data-sticker attribute; duplicates are dropped
    while keeping first-seen order.
    """
    container = soup.select_one("div.stickers")
    if container is None:
        return {"stickers": []}

    seen = set()
    names: List[str] = []
    for badge in container.select("span.sticker"):
        candidates = [
            c.strip() for c in (badge.get("class") or [])
            if c and c.lower() != "sticker"
        ]
        data_attr = (badge.get("data-sticker") or "").strip()
        if data_attr:
            candidates.append(data_attr)
        for name in candidates:
            if name and name not in seen:
                seen.add(name)
                names.append(name)
    return {"stickers": names}
|
||||
17
market/scrape/product/extractors/title.py
Normal file
17
market/scrape/product/extractors/title.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_title(soup: BeautifulSoup, url: str) -> Dict:
    """
    Product title, tried in priority order: Magento page-title heading,
    theme h1.product-name, then the og:title meta tag. Falls back to the
    literal "Product" when no selector yields text.

    Fix: meta content is now passed through normalize_text like heading
    text — previously og:title was returned raw, inconsistent with
    ex_short_description's handling of meta content.
    """
    title = None
    for sel in ["h1.page-title span", "h1.page-title", "h1.product-name", "meta[property='og:title']"]:
        el = soup.select_one(sel)
        if el:
            # Normalize both branches so meta-sourced titles get the same
            # whitespace cleanup as heading-sourced ones.
            title = normalize_text(el.get_text() if el.name != "meta" else el.get("content"))
            if title:
                break
    return {"title": title or "Product"}
|
||||
165
market/scrape/product/helpers/desc.py
Normal file
165
market/scrape/product/helpers/desc.py
Normal file
@@ -0,0 +1,165 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from shared.utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from .text import clean_title, is_blacklisted_heading
|
||||
from shared.config import config
|
||||
|
||||
|
||||
def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Extract sections from accordion blocks within the description container.

    Looks for headings with class 'accordion-title' and pairs each with its
    next element-sibling having class 'accordion-details'. Returns:
    - open_html: the remaining description HTML with those accordion blocks removed
    - sections: [{"title": ..., "html": ...}, ...]
    """
    # Work on an isolated copy to avoid mutating the original DOM
    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")

    # Collect candidate (heading, details) pairs without mutating during iteration
    pairs: List[Tuple[Tag, Tag]] = []
    for h in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"):
        if not isinstance(h, Tag):
            continue
        title = clean_title((h.get_text() or "").strip())
        if not title:
            continue

        # Walk forward siblings until we hit an element; accept the first with 'accordion-details'
        sib = h.next_sibling
        details: Optional[Tag] = None
        while sib is not None:
            if isinstance(sib, Tag):
                classes = sib.get("class") or []
                if "accordion-details" in classes:
                    details = sib
                # Stop at the first element sibling whether or not it matched.
                break
            sib = sib.next_sibling

        if details is not None:
            pairs.append((h, details))

    sections: List[Dict] = []

    # Extract sections, then remove nodes from frag
    for h, details in pairs:
        # Pull details HTML
        html = details.decode_contents()
        # Only keep non-empty (textual) content
        if normalize_text(BeautifulSoup(html, "lxml").get_text()):
            sections.append({
                "title": clean_title(h.get_text() or ""),
                "html": absolutize_fragment(html),
            })
        # Remove the matched nodes from the fragment copy
        details.decompose()
        h.decompose()

    # Whatever remains is the open description html
    open_html = absolutize_fragment(str(frag)) if frag else ""

    return open_html, sections
|
||||
|
||||
def pair_title_content_from_magento_tabs(soup: BeautifulSoup):
    """
    Pair Magento tab titles with their content panels.

    Panels are resolved via the title's aria-controls/data-target id, falling
    back to the next '.data.item.content' sibling. Returns a list of
    (title, absolutized_html) tuples, skipping blacklisted headings.
    """
    out = []
    container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items")
    if not container:
        return out
    titles = container.select(".data.item.title")
    for t in titles:
        title = normalize_text(t.get_text())
        if not title:
            continue
        content_id = t.get("aria-controls") or t.get("data-target")
        content = soup.select_one(f"#{content_id}") if content_id else None
        if content is None:
            # Fallback: the adjacent content panel sibling.
            sib = t.find_next_sibling(
                lambda x: isinstance(x, Tag) and "data" in x.get("class", []) and "item" in x.get("class", []) and "content" in x.get("class", [])
            )
            content = sib
        if content:
            html = content.decode_contents()
            if not is_blacklisted_heading(title):
                out.append((title, absolutize_fragment(html)))
    return out
|
||||
|
||||
def scan_headings_for_sections(soup: BeautifulSoup):
    """
    Generic fallback section finder: scan h2–h6 headings whose text matches a
    configured (or built-in description-like) title, and collect everything
    up to the next heading as that section's HTML.

    Returns a list of (title, absolutized_html) tuples, skipping blacklisted
    headings and empty bodies.
    """
    out = []
    # Narrowest plausible container first; fall back to the whole document.
    container = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )
    heads = container.select("h2, h3, h4, h5, h6")
    section_titles = (config().get("section-titles") or [])
    for h in heads:
        title = clean_title(h.get_text() or "")
        if not title:
            continue
        low = title.lower()
        if not any(k in low for k in section_titles + ["product description", "description", "details"]):
            continue
        parts: List[str] = []
        for sib in h.next_siblings:
            if isinstance(sib, NavigableString):
                parts.append(str(sib))
                continue
            # The next heading marks the end of this section.
            if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"):
                break
            if isinstance(sib, Tag):
                parts.append(str(sib))
        html = absolutize_fragment("".join(parts).strip())
        if html and not is_blacklisted_heading(title):
            out.append((title, html))
    return out
|
||||
|
||||
def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """
    Render Magento's 'additional attributes' table as a Tailwind-styled HTML
    fragment of key/value rows.

    Each <tr>'s key comes from its th (or first td) and its value from the
    last td. Returns None when the table is absent, yields no rows, or
    rendering raises.
    """
    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            th = tr.find("th") or tr.find("td")
            tds = tr.find_all("td")
            key = normalize_text(th.get_text()) if th else None
            # Last td: when th is actually the first td, this still picks the value cell.
            val = normalize_text(tds[-1].get_text()) if tds else None
            if key and val:
                rows.append((key, val))
        if not rows:
            return None
        items = "\n".join(
            [
                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
  <div class='col-span-1 font-medium'>{key}</div>
  <div class='col-span-2 text-stone-700'>{val}</div>
</div>"""
                for key, val in rows
            ]
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        # Best-effort: a malformed table silently yields no section.
        return None
|
||||
|
||||
def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """
    Locate the element holding the product description.

    Tries known description selectors first; otherwise finds a
    'Product description' heading and wraps its following siblings (up to the
    next heading) in a synthetic <div>.

    NOTE(review): wrapper.append(sib) relocates live nodes out of `soup`, so
    the heading-fallback path mutates the parsed tree — callers that re-use
    the same soup afterwards should re-parse; confirm this is intended.
    """
    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
                ".product.attribute.overview .value", ".product.info.detailed .value"]:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el
    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if txt.startswith("product description") or txt == "description":
            wrapper = soup.new_tag("div")
            for sib in h.next_siblings:
                if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"):
                    break
                wrapper.append(sib if isinstance(sib, Tag) else NavigableString(str(sib)))
            if normalize_text(wrapper.get_text()):
                return wrapper
    return None
|
||||
53
market/scrape/product/helpers/html.py
Normal file
53
market/scrape/product/helpers/html.py
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from shared.config import config
|
||||
|
||||
def first_from_srcset(val: str) -> Optional[str]:
    """
    Return the URL of the first candidate in a srcset attribute value.

    "a.jpg 1x, b.jpg 2x" -> "a.jpg". Returns None for empty input; a blank
    first candidate (e.g. a leading comma) yields the stripped empty string.
    """
    if not val:
        return None
    candidate = val.split(",", 1)[0].strip()
    tokens = candidate.split()
    if tokens:
        return tokens[0]
    return candidate
|
||||
|
||||
def abs_url(u: Optional[str]) -> Optional[str]:
    """
    Resolve a site-relative path ('/x') against the configured base_url.

    Absolute URLs and non-string values pass through untouched; falsy input
    yields None.
    """
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
|
||||
|
||||
def collect_img_candidates(el) -> List[str]:
    """
    Gather every plausible image URL carried by an element.

    Plain URL attributes come first (src, lazy-load variants, content, href),
    followed by the first candidate of each srcset-style attribute. Order is
    preserved; falsy elements yield [].
    """
    candidates: List[str] = []
    if not el:
        return candidates
    for attr in ("src", "data-src", "data-original", "data-zoom-image",
                 "data-thumb", "content", "href"):
        value = el.get(attr)
        if value:
            candidates.append(value)
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if not value:
            continue
        head = first_from_srcset(value)
        if head:
            candidates.append(head)
    return candidates
|
||||
|
||||
def _filename_key(u: str) -> str:
    """
    Dedup key for an image URL: "<host>:<final path segment>", lower-cased.
    A single trailing slash is ignored so '/a/b/' and '/a/b' share a key.
    """
    parsed = urlparse(u)
    path = parsed.path or ""
    if path.endswith("/"):
        path = path[:-1]
    filename = path.rsplit("/", 1)[-1]
    return f"{parsed.netloc}:{filename}".lower()


def dedup_by_filename(urls: List[str]) -> List[str]:
    """
    Drop URLs whose host+filename key repeats, keeping the first occurrence
    of each key and the original ordering.
    """
    kept: List[str] = []
    seen_keys = set()
    for url in urls:
        key = _filename_key(url)
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(url)
    return kept
|
||||
42
market/scrape/product/helpers/price.py
Normal file
42
market/scrape/product/helpers/price.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
|
||||
def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]:
    """
    Parse a price-like string into (value, currency, raw).

    The symbol (£, €, $) maps to an ISO code; thousands commas are stripped.
    `raw` is always the stripped input, even when no number was found (in
    which case value and currency are None).
    """
    raw = (text or "").strip()
    match = re.search(r'([£€$])?\s*([0-9][0-9.,]*)', raw)
    if match is None:
        return None, None, raw
    symbol = match.group(1) or ""
    digits = match.group(2).replace(",", "")
    try:
        amount = float(digits)
    except ValueError:
        # e.g. "1.2.3" passes the regex but is not a valid float
        return None, None, raw
    return amount, {"£": "GBP", "€": "EUR", "$": "USD"}.get(symbol), raw
|
||||
|
||||
def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]:
    """
    Parse case-size strings such as "6 x 500g", "12x1L", "24 × 330 ml".

    Returns (count, item_qty, item_unit, raw). The first three are None when
    the text lacks a recognizable "<count> x <qty><unit>" pattern; item_qty
    alone is None if its digits cannot be converted to a float.
    """
    raw = (text or "").strip()
    if not raw:
        return None, None, None, raw
    # Normalize every multiplication mark (×, X, x) to a spaced lowercase 'x'.
    normalized = re.sub(r"[×Xx]\s*", " x ", raw)
    found = re.search(r"(\d+)\s*x\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)", normalized)
    if found is None:
        return None, None, None, raw
    count = int(found.group(1))
    try:
        item_qty: Optional[float] = float(found.group(2))
    except ValueError:
        item_qty = None
    return count, item_qty, found.group(3), raw
|
||||
16
market/scrape/product/helpers/text.py
Normal file
16
market/scrape/product/helpers/text.py
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from shared.utils import normalize_text
|
||||
from shared.config import config
|
||||
|
||||
def clean_title(t: str) -> str:
    """Normalize heading text and drop any trailing colon."""
    return re.sub(r":\s*$", "", normalize_text(t))
|
||||
|
||||
def is_blacklisted_heading(title: str) -> bool:
    """
    Return True when the heading matches — case- and whitespace-
    insensitively — any entry of the config blacklist under
    blacklist.product-details.
    """
    blacklist = (config().get("blacklist") or {}).get("product-details") or []
    needle = (title or "").strip().lower()
    return needle in {(entry or "").strip().lower() for entry in blacklist}
|
||||
48
market/scrape/product/product_core.py
Normal file
48
market/scrape/product/product_core.py
Normal file
@@ -0,0 +1,48 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Tuple, Union
|
||||
from shared.utils import soup_of
|
||||
from ..http_client import fetch
|
||||
from ..html_utils import absolutize_fragment
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from .registry import REGISTRY, merge_missing
|
||||
from . import extractors as _auto_register # noqa: F401 (import-time side effects)
|
||||
|
||||
async def scrape_product_detail(product_url: str, include_html: bool = False) -> Union[dict, Tuple[dict, str]]:
    """
    Scrape a product detail page into a flat dict.

    Returned fields (subset): title, images, image, description_short,
    description_html, sections, slug, suma_href, stickers, labels,
    info_table fields, oe_list_price, prices, breadcrumbs-derived
    category_* fields.

    If include_html=True, returns (data, html) instead of just data.
    """
    html = await fetch(product_url)

    result: Dict[str, Union[str, float, int, list, dict, None]] = {
        "suma_href": product_url,
        "slug": product_slug_from_href(product_url),
    }

    # Run every registered extractor; each contributes only fields that are
    # still missing. The soup is re-parsed per extractor so one extractor
    # mutating the tree cannot affect the next.
    for extract in REGISTRY:
        try:
            fragment = extract(soup_of(html), product_url) or {}
        except Exception:
            # Tolerate site drift: a broken extractor must not sink the rest.
            continue
        merge_missing(result, fragment)

    # If we found a short description but no HTML one, echo the short text.
    if not result.get("description_html") and result.get("description_short"):
        result["description_html"] = absolutize_fragment(f"<p>{result['description_short']}</p>")

    # Mirror the first gallery image into "image" when it was never set.
    if not result.get("image"):
        gallery = result.get("images") or []
        if isinstance(gallery, list) and gallery:
            result["image"] = gallery[0]

    if include_html:
        return result, html
    return result
|
||||
4
market/scrape/product/product_detail.py
Normal file
4
market/scrape/product/product_detail.py
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
from __future__ import annotations
|
||||
# Thin wrapper to keep import path stable
|
||||
from .product_core import scrape_product_detail # re-export
|
||||
20
market/scrape/product/registry.py
Normal file
20
market/scrape/product/registry.py
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Callable, Dict, List, Union
|
||||
|
||||
# An extractor receives (soup, product_url) and returns a partial data dict.
Extractor = Callable[[object, str], Dict[str, Union[str, float, int, list, dict, None]]]

# Module-wide list populated at import time via the @extractor decorator.
REGISTRY: List[Extractor] = []


def extractor(fn: Extractor) -> Extractor:
    """Decorator: record *fn* in the global REGISTRY and hand it back unchanged."""
    REGISTRY.append(fn)
    return fn
|
||||
|
||||
def merge_missing(dst: dict, src: dict) -> None:
    """
    Merge src into dst in place, writing only keys that dst is missing
    or holds an "empty" value for. Empty means None, "", [], {}.
    """
    empties = (None, "", [], {})
    for key, value in (src or {}).items():
        # Skip keys dst already answers with a non-empty value.
        if key in dst and dst[key] not in empties:
            continue
        dst[key] = value
|
||||
Reference in New Issue
Block a user