This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/persist_api/upsert_product.py
giles 6271a715a1
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
feat: initialize market app with browsing, product, and scraping code
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00

257 lines
8.9 KiB
Python

# replace your existing upsert_product with this version
import os
import httpx
from typing import Dict, List, Any
async def upsert_product(
    slug,
    href,
    d,
):
    """
    POST the product dict `d` to the product sync endpoint, retrying until it succeeds.

    The endpoint URL is read from the ``PRODUCT_SYNC_URL`` environment variable
    and falls back to a localhost default. The payload sent is
    ``_massage_payload(d)``, computed once before the first attempt.

    Args:
        slug: Product slug. Written into ``d["slug"]`` (mutating the caller's
            dict) when `d` has no truthy slug of its own; also used in the
            failure log line.
        href: Accepted for signature compatibility; not used in this function.
        d: Scraped product dict.

    Returns:
        The decoded JSON response; ``{}`` when the response body is empty; or
        ``{"raw": <text>}`` when the body is not valid JSON.

    Raises:
        ValueError: from ``_massage_payload`` when no slug is available.
    """
    # Fill in the slug from the argument only when the dict lacks a usable one.
    existing_slug = d.get("slug")
    if not existing_slug:
        d["slug"] = slug

    # Where to post; override via env if needed.
    sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8000/market/api/products/sync/")
    payload = _massage_payload(d)

    async def _post_once() -> Dict[str, Any]:
        # A fresh client per attempt: 20s overall timeout, 10s to connect.
        limits = httpx.Timeout(20.0, connect=10.0)
        async with httpx.AsyncClient(timeout=limits) as client:
            resp = await client.post(sync_url, json=payload)
            # Non-2xx raises here, which sends us round the retry loop again.
            resp.raise_for_status()
            if resp.content:
                try:
                    return resp.json()
                except ValueError:
                    # Body was not JSON; hand back the raw text instead.
                    return {"raw": resp.text}
            # Tolerate an empty body.
            return {}

    async def _report_failure(exc: BaseException) -> None:
        # Optional: add your own logging here
        print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}")

    return await retry_until_success(_post_once, delay=5.0, on_error=_report_failure)
#async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
# _d=_massage_payload(d)
# resp = await client.post(sync_url, json=_d)
# Raise for non-2xx
#resp.raise_for_status()
#data = resp.json() if resp.content else {}
#return data
import asyncio
from typing import Any, Awaitable, Callable, Dict, Optional
async def retry_until_success(
    fn: Callable[[], Awaitable[Any]],
    *,
    delay: float = 5.0,
    on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None,
) -> Any:
    """
    Repeatedly await the no-arg async callable `fn` until it returns without raising.

    Waits `delay` seconds between attempts and never gives up on ordinary
    errors (any ``Exception`` subclass).

    Cancellation and interpreter-exit signals are NOT retried:
    ``asyncio.CancelledError``, ``KeyboardInterrupt``, ``SystemExit`` and any
    other non-``Exception`` ``BaseException`` propagate immediately, so the
    loop can always be stopped from outside.

    Args:
        fn: Async callable taking no arguments; its result is returned.
        delay: Seconds to sleep after each failed attempt.
        on_error: Optional async hook awaited with the exception after each
            failure. If the hook itself raises an ``Exception``, that failure
            is printed and retrying continues.

    Returns:
        Whatever `fn` returns on its first successful call.
    """
    attempt = 0
    while True:
        try:
            return await fn()
        except asyncio.CancelledError:
            # Bubble up cancellations immediately. Explicit because on
            # Python < 3.8 CancelledError is still an Exception subclass.
            raise
        except Exception as exc:
            # Deliberately not BaseException: KeyboardInterrupt / SystemExit
            # must be able to terminate an otherwise endless retry loop.
            attempt += 1
            if on_error is None:
                # Fallback log (stdout) when no handler is supplied.
                print(f"[retry] attempt {attempt} failed: {type(exc).__name__}: {exc}")
            else:
                try:
                    await on_error(exc)
                except Exception as handler_exc:
                    # Best effort: a broken handler must not prevent retrying,
                    # but do not hide the fact that it failed.
                    print(
                        f"[retry] on_error handler failed: "
                        f"{type(handler_exc).__name__}: {handler_exc}"
                    )
            await asyncio.sleep(delay)
def _get(d, key, default=None):
    """
    Look up `key` in `d`, treating "blank" values as missing.

    Returns `default` when the stored value is None, an empty string, an empty
    list or an empty dict (or the key is absent); otherwise returns the value
    unchanged. Falsy-but-meaningful values such as 0 and False are kept.
    """
    value = d.get(key)
    if value in (None, "", [], {}):
        return default
    return value
def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build the JSON payload for the product sync API from a scraped product dict.

    Per the original author's note this is meant to mirror the massaging done
    by the DB upsert code (not visible here), so the API sees the same
    structure and values.

    Shape of the result, in key order:
      * ``slug`` plus scalar product fields (blank values become None via ``_get``),
        then ``ean`` (falls back to ``barcode``), ``sku``, ``unit_size``, ``pack_size``;
      * ``sections``: ``{"title", "html"}`` dicts, only for entries having both;
      * ``images``: ``{"url", "kind", "position"}`` rows with kind ``gallery``,
        ``embedded`` or ``all``, concatenated in that order;
      * ``embedded_image_urls`` / ``all_image_urls``: the plain URL lists;
      * ``labels`` / ``stickers``: ``{"name": ...}`` rows (stickers lower-cased);
      * ``info_table``: passed through (``{}`` when missing);
      * ``nutrition``: a ``{key: str|None}`` dict or a list of
        ``{"key", "value", "unit"}`` rows, depending on the input type;
      * ``allergens``: ``{"name", "contains"}`` rows;
      * ``attributes``: output of ``build_attributes_list(d)``.

    Raises:
        ValueError: if `d` has no truthy ``slug``.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")

    def _text_or_none(value):
        # Stringify everything except None, which stays None.
        return None if value is None else str(value)

    def _url_strings(raw):
        # Accept a list of strings or a list of dicts shaped like {"url": ...};
        # anything else (and any falsy URL) is dropped.
        found = []
        for entry in raw or ():
            if isinstance(entry, str):
                candidate = entry
            elif isinstance(entry, dict):
                candidate = entry.get("url")
            else:
                continue
            if candidate:
                found.append(candidate)
        return found

    def _image_rows(urls, kind):
        # `position` is the index within the coerced URL list, taken before
        # whitespace-only / non-string entries are filtered out.
        return [
            {"url": url.strip(), "kind": kind, "position": pos}
            for pos, url in enumerate(urls)
            if isinstance(url, str) and url.strip()
        ]

    def _name_rows(raw, normalise):
        # Stringify + normalise each truthy entry, then drop blanks.
        cleaned = (normalise(str(item)) for item in raw if item)
        return [{"name": text.strip()} for text in cleaned if text.strip()]

    # --- Top-level fields: blank values are normalised to None by _get ---
    blank_to_none_fields = (
        "title",
        "image",
        "description_short",
        "description_html",
        "suma_href",
        "brand",
        "rrp",
        "rrp_currency",
        "rrp_raw",
        "price_per_unit",
        "price_per_unit_currency",
        "price_per_unit_raw",
        "special_price",
        "special_price_currency",
        "special_price_raw",
        "regular_price",
        "regular_price_currency",
        "regular_price_raw",
        "case_size_count",
        "case_size_item_qty",
        "case_size_item_unit",
        "case_size_raw",
    )
    payload: Dict[str, Any] = {"slug": slug}
    for field in blank_to_none_fields:
        payload[field] = _get(d, field)
    payload["ean"] = d.get("ean") or d.get("barcode") or None
    payload["sku"] = d.get("sku")
    payload["unit_size"] = d.get("unit_size")
    payload["pack_size"] = d.get("pack_size")

    # --- Sections: only dicts carrying both a title and html ---
    payload["sections"] = [
        {"title": sec["title"], "html": sec["html"]}
        for sec in (d.get("sections") or [])
        if isinstance(sec, dict) and sec.get("title") and sec.get("html")
    ]

    # --- Images: three buckets, merged into one tagged list under "images" ---
    gallery_urls = _url_strings(d.get("images"))
    embedded_urls = _url_strings(d.get("embedded_image_urls"))
    every_url = _url_strings(d.get("all_image_urls"))
    payload["images"] = (
        _image_rows(gallery_urls, "gallery")
        + _image_rows(embedded_urls, "embedded")
        + _image_rows(every_url, "all")
    )
    payload["embedded_image_urls"] = embedded_urls
    payload["all_image_urls"] = every_url

    # --- Labels are trimmed; stickers are trimmed and lower-cased ---
    payload["labels"] = _name_rows(d.get("labels") or [], lambda text: text.strip())
    payload["stickers"] = _name_rows(
        d.get("stickers") or [], lambda text: text.strip().lower()
    )

    # --- Attribute source dict is passed through as-is ---
    payload["info_table"] = d.get("info_table") or {}

    # --- Nutrition: either a flat dict or a list of key/value/unit rows ---
    raw_nutrition = d.get("nutrition") or []
    if isinstance(raw_nutrition, dict):
        nutrition_out: Any = {
            str(name).strip(): _text_or_none(amount)
            for name, amount in raw_nutrition.items()
        }
    elif isinstance(raw_nutrition, list):
        nutrition_out = []
        for row in raw_nutrition:
            if not isinstance(row, dict):
                continue
            row_key = str(row.get("key") or "").strip()
            if not row_key:
                continue
            nutrition_out.append({
                "key": row_key,
                "value": _text_or_none(row.get("value")),
                "unit": _text_or_none(row.get("unit")),
            })
    else:
        nutrition_out = []
    payload["nutrition"] = nutrition_out

    # --- Allergens: a bare string means "contains"; dicts may say otherwise ---
    allergen_rows = []
    for entry in d.get("allergens") or []:
        if isinstance(entry, str):
            name = entry.strip()
            contains = True
        elif isinstance(entry, dict):
            name = (entry.get("name") or "").strip()
            contains = bool(entry.get("contains", True))
        else:
            continue
        if name:
            allergen_rows.append({"name": name, "contains": contains})
    payload["allergens"] = allergen_rows

    payload["attributes"] = build_attributes_list(d)
    return payload
def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Flatten the ``info_table`` and ``oe_list_price`` dicts of `d` into attribute rows.

    Each entry becomes ``{"key": "<source>/<stripped name>", "value": <str or None>}``,
    with ``info_table`` entries first. Exact duplicate (key, value) pairs are
    dropped, keeping the first occurrence and the original order. A missing or
    falsy source contributes nothing.
    """
    pairs = []
    for prefix in ("info_table", "oe_list_price"):
        source = d.get(prefix) or {}
        for name, raw in source.items():
            text = None if raw is None else str(raw)
            pairs.append((f"{prefix}/{str(name).strip()}", text))

    # dict.fromkeys keeps first-seen order, which dedupes by (key, value).
    return [{"key": key, "value": value} for key, value in dict.fromkeys(pairs)]