feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
This commit is contained in:
6
scrape/persist_api/__init__.py
Normal file
6
scrape/persist_api/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from .upsert_product import upsert_product
|
||||
from .log_product_result import log_product_result
|
||||
from .save_nav import save_nav
|
||||
from .save_subcategory_redirects import save_subcategory_redirects
|
||||
from .capture_listing import capture_listing
|
||||
|
||||
27
scrape/persist_api/capture_listing.py
Normal file
27
scrape/persist_api/capture_listing.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Client for the listing-capture endpoint of the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import List
|
||||
|
||||
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
):
    """Send one listing capture to the listing-capture endpoint.

    The endpoint defaults to the local market API and can be overridden with
    the ``CAPTURE_LISTING_URL`` environment variable.

    Args:
        url: Sent as the ``url`` field of the JSON body.
        items: Sent as the ``items`` field of the JSON body.
        total_pages: Sent as the ``total_pages`` field of the JSON body.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.

    Raises:
        httpx.HTTPStatusError: When ``raise_for_status()`` rejects the
            response status.
    """
    endpoint = os.getenv("CAPTURE_LISTING_URL", "http://localhost:8000/market/api/products/listing/")
    body = {"url": url, "items": items, "total_pages": total_pages}
    timeout = httpx.Timeout(20.0, connect=10.0)

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(endpoint, json=body)
        # Error statuses abort here instead of being decoded as a result.
        response.raise_for_status()
        if not response.content:
            return {}
        return response.json()
|
||||
|
||||
24
scrape/persist_api/log_product_result.py
Normal file
24
scrape/persist_api/log_product_result.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# Client for the product-log endpoint of the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
|
||||
async def log_product_result(
    ok: bool,
    payload
):
    """Send a product result record to the product-log endpoint.

    The endpoint defaults to the local market API and can be overridden with
    the ``PRODUCT_LOG_URL`` environment variable.

    Args:
        ok: Sent as the ``ok`` field of the JSON body.
        payload: Sent as the ``payload`` field; must be JSON-serialisable.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.

    Raises:
        httpx.HTTPStatusError: When ``raise_for_status()`` rejects the
            response status.
    """
    endpoint = os.getenv("PRODUCT_LOG_URL", "http://localhost:8000/market/api/products/log/")
    body = {"ok": ok, "payload": payload}
    timeout = httpx.Timeout(20.0, connect=10.0)

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(endpoint, json=body)
        # Error statuses abort here instead of being decoded as a result.
        response.raise_for_status()
        if not response.content:
            return {}
        return response.json()
|
||||
|
||||
19
scrape/persist_api/save_nav.py
Normal file
19
scrape/persist_api/save_nav.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Client for the nav endpoint of the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_nav(
    nav: Dict,
):
    """Send the navigation dict to the nav endpoint.

    The endpoint defaults to the local market API and can be overridden with
    the ``SAVE_NAV_URL`` environment variable.

    Args:
        nav: Sent unchanged as the JSON body.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.

    Raises:
        httpx.HTTPStatusError: When ``raise_for_status()`` rejects the
            response status.
    """
    endpoint = os.getenv("SAVE_NAV_URL", "http://localhost:8000/market/api/products/nav/")
    timeout = httpx.Timeout(20.0, connect=10.0)

    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(endpoint, json=nav)
        # Error statuses abort here instead of being decoded as a result.
        response.raise_for_status()
        if not response.content:
            return {}
        return response.json()
|
||||
|
||||
15
scrape/persist_api/save_subcategory_redirects.py
Normal file
15
scrape/persist_api/save_subcategory_redirects.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_subcategory_redirects(mapping: Dict[str, str]) -> Dict:
    """Send the subcategory redirect mapping to the redirects endpoint.

    The endpoint defaults to the local market API and can be overridden with
    the ``SAVE_REDIRECTS`` environment variable.

    Args:
        mapping: Sent unchanged as the JSON body.

    Returns:
        The decoded JSON response, or ``{}`` when the response body is empty.
        (The previous ``-> None`` annotation did not match this behaviour.)

    Raises:
        httpx.HTTPStatusError: When ``raise_for_status()`` rejects the
            response status.
    """
    sync_url = os.getenv("SAVE_REDIRECTS", "http://localhost:8000/market/api/products/redirects/")

    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        resp = await client.post(sync_url, json=mapping)
        # Error statuses abort here instead of being decoded as a result.
        resp.raise_for_status()
        data = resp.json() if resp.content else {}
        return data
|
||||
|
||||
256
scrape/persist_api/upsert_product.py
Normal file
256
scrape/persist_api/upsert_product.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# replace your existing upsert_product with this version
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict, List, Any
|
||||
|
||||
async def upsert_product(
    slug,
    href,
    d,
):
    """
    Post the product dict `d` to the product sync endpoint, retrying until it succeeds.

    The endpoint defaults to the local market API and can be overridden with
    the ``PRODUCT_SYNC_URL`` environment variable.

    Args:
        slug: Written into ``d["slug"]`` when `d` has no truthy slug of its
            own (this mutates the caller's dict); also shown in failure logs.
        href: Not used by this function.
        d: Product dict; normalised by ``_massage_payload`` before posting.

    Returns:
        The decoded JSON response, ``{}`` for an empty body, or
        ``{"raw": <text>}`` when the body is not valid JSON.

    Raises:
        ValueError: From ``_massage_payload`` when no slug is available.
            This happens before the first request, so it is not retried.
    """
    # Only fill the slug in; an existing truthy slug in `d` is left alone.
    if not d.get("slug"):
        d["slug"] = slug

    # Where to post; override via env if needed.
    sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8000/market/api/products/sync/")
    body = _massage_payload(d)

    async def _post_once() -> Dict[str, Any]:
        timeout = httpx.Timeout(20.0, connect=10.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(sync_url, json=body)
            resp.raise_for_status()
            if resp.content:
                # Prefer JSON; fall back to the raw text when decoding fails.
                try:
                    return resp.json()
                except ValueError:
                    return {"raw": resp.text}
            # Tolerate an empty body.
            return {}

    async def _report_failure(exc: BaseException) -> None:
        print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}")

    return await retry_until_success(_post_once, delay=5.0, on_error=_report_failure)
|
||||
|
||||
|
||||
|
||||
#async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
|
||||
# _d=_massage_payload(d)
|
||||
# resp = await client.post(sync_url, json=_d)
|
||||
# Raise for non-2xx
|
||||
#resp.raise_for_status()
|
||||
#data = resp.json() if resp.content else {}
|
||||
#return data
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Awaitable, Callable, Dict, Optional
|
||||
|
||||
async def retry_until_success(
    fn: Callable[[], Awaitable[Any]],
    *,
    delay: float = 5.0,
    on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None,
) -> Any:
    """
    Repeatedly call the async no-arg function `fn` until it succeeds (returns without raising).

    Waits `delay` seconds between attempts and never gives up on ordinary
    errors. Only `Exception` subclasses are retried: cancellation and other
    `BaseException`-only signals (`KeyboardInterrupt`, `SystemExit`, ...)
    propagate immediately so the process can still be stopped.

    Args:
        fn: Zero-argument callable returning an awaitable; called once per attempt.
        delay: Seconds to sleep after each failed attempt.
        on_error: Optional async callback awaited with the exception after each
            failure. When omitted, a line is printed instead. A failure inside
            the callback is printed and does not stop the retry loop.

    Returns:
        Whatever the first successful `await fn()` returns.
    """
    attempt = 0
    while True:
        try:
            return await fn()
        except asyncio.CancelledError:
            # Bubble up cancellations immediately. Kept explicit for
            # interpreters where CancelledError still derives from Exception.
            raise
        except Exception as exc:
            attempt += 1
            if on_error is None:
                # Fallback log (stdout) when no handler was supplied.
                print(f"[retry] attempt {attempt} failed: {type(exc).__name__}: {exc}")
            else:
                try:
                    await on_error(exc)
                except Exception as handler_exc:
                    # Don't let error-handler failures prevent retrying,
                    # but don't hide them either.
                    print(
                        f"[retry] on_error handler failed: "
                        f"{type(handler_exc).__name__}: {handler_exc}"
                    )
        await asyncio.sleep(delay)
|
||||
|
||||
|
||||
|
||||
def _get(d, key, default=None):
|
||||
v = d.get(key)
|
||||
return default if v in (None, "", [], {}) else v
|
||||
|
||||
|
||||
def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]:
    """Build the JSON body for the product sync API from a product dict.

    Mirrors the DB-upsert massaging so the API sees the same structure/values.

    Output keys, in order: the scalar product fields, ``sections``,
    ``images`` (dicts with ``url``/``kind``/``position``),
    ``embedded_image_urls``, ``all_image_urls``, ``labels``, ``stickers``
    (both lists of ``{"name": ...}``), ``info_table``, ``nutrition``,
    ``allergens`` and ``attributes``.

    Raises:
        ValueError: If `d` has no truthy ``slug``.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")

    def _text_or_none(value):
        # Stringify everything except None, which stays None.
        return None if value is None else str(value)

    def _url_strings(raw):
        # Accept a list of strings and/or dicts carrying a "url" key;
        # keep only truthy URL values.
        urls = []
        for entry in raw or []:
            if isinstance(entry, str):
                candidate = entry
            elif isinstance(entry, dict):
                candidate = entry.get("url")
            else:
                continue
            if candidate:
                urls.append(candidate)
        return urls

    def _image_rows(urls, kind):
        # `position` is the index within the coerced URL list, so entries
        # dropped by the filter leave a gap in the numbering.
        return [
            {"url": url.strip(), "kind": kind, "position": position}
            for position, url in enumerate(urls)
            if isinstance(url, str) and url.strip()
        ]

    def _name_rows(names):
        # Wrap each non-blank name as {"name": ...}.
        rows = []
        for name in names:
            trimmed = name.strip()
            if trimmed:
                rows.append({"name": trimmed})
        return rows

    # --- Top-level fields: empty placeholders are normalised via _get ---
    out: Dict[str, Any] = {"slug": slug}
    for field in (
        "title",
        "image",
        "description_short",
        "description_html",
        "suma_href",
        "brand",
        "rrp",
        "rrp_currency",
        "rrp_raw",
        "price_per_unit",
        "price_per_unit_currency",
        "price_per_unit_raw",
        "special_price",
        "special_price_currency",
        "special_price_raw",
        "regular_price",
        "regular_price_currency",
        "regular_price_raw",
        "case_size_count",
        "case_size_item_qty",
        "case_size_item_unit",
        "case_size_raw",
    ):
        out[field] = _get(d, field)
    out["ean"] = d.get("ean") or d.get("barcode") or None
    # These three are passed through without normalisation.
    for field in ("sku", "unit_size", "pack_size"):
        out[field] = d.get(field)

    # --- Sections: only dicts that have both a title and html ---
    out["sections"] = [
        {"title": section["title"], "html": section["html"]}
        for section in (d.get("sections") or [])
        if isinstance(section, dict) and section.get("title") and section.get("html")
    ]

    # --- Images: three buckets, also merged into one typed list ---
    gallery_urls = _url_strings(d.get("images"))
    embedded_urls = _url_strings(d.get("embedded_image_urls"))
    every_url = _url_strings(d.get("all_image_urls"))
    out["images"] = (
        _image_rows(gallery_urls, "gallery")
        + _image_rows(embedded_urls, "embedded")
        + _image_rows(every_url, "all")
    )
    out["embedded_image_urls"] = embedded_urls
    out["all_image_urls"] = every_url

    # --- Labels: stripped; stickers: stripped and lower-cased ---
    raw_labels = d.get("labels") or []
    out["labels"] = _name_rows([str(item).strip() for item in raw_labels if item])
    raw_stickers = d.get("stickers") or []
    out["stickers"] = _name_rows([str(item).strip().lower() for item in raw_stickers if item])

    # --- Attributes source: passed through as-is ---
    out["info_table"] = d.get("info_table") or {}

    # --- Nutrition: a dict, or a list of {"key", "value", "unit"} dicts ---
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        out["nutrition"] = {
            str(name).strip(): _text_or_none(amount)
            for name, amount in nutrition.items()
        }
    elif isinstance(nutrition, list):
        facts = []
        for row in nutrition:
            if not isinstance(row, dict):
                continue
            key = str(row.get("key") or "").strip()
            if key:
                facts.append({
                    "key": key,
                    "value": _text_or_none(row.get("value")),
                    "unit": _text_or_none(row.get("unit")),
                })
        out["nutrition"] = facts
    else:
        out["nutrition"] = []

    # --- Allergens: a bare string means contains=True; dicts may override ---
    allergens = []
    for entry in d.get("allergens") or []:
        if isinstance(entry, str):
            name = entry.strip()
            contains = True
        elif isinstance(entry, dict):
            name = (entry.get("name") or "").strip()
            contains = bool(entry.get("contains", True))
        else:
            continue
        if name:
            allergens.append({"name": name, "contains": contains})
    out["allergens"] = allergens

    out["attributes"] = build_attributes_list(d)

    return out
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten ``info_table`` and ``oe_list_price`` into a list of key/value attributes.

    Each key becomes ``"<source>/<stripped key>"`` and each value is
    stringified (``None`` stays ``None``). Exact duplicate (key, value) pairs
    are dropped, keeping the first occurrence; ``info_table`` entries come
    before ``oe_list_price`` entries.
    """
    sources = (
        ("info_table", d.get("info_table")),
        ("oe_list_price", d.get("oe_list_price")),
    )
    seen = set()
    unique: List[Dict[str, Any]] = []
    for prefix, table in sources:
        for raw_key, raw_value in (table or {}).items():
            pair = (
                f"{prefix}/{str(raw_key).strip()}",
                None if raw_value is None else str(raw_value),
            )
            if pair in seen:
                continue
            seen.add(pair)
            unique.append({"key": pair[0], "value": pair[1]})
    return unique
|
||||
Reference in New Issue
Block a user