feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled

Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
This commit is contained in:
giles
2026-02-09 23:16:34 +00:00
commit 6271a715a1
142 changed files with 8517 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
from .log_product_result import log_product_result
from .upsert_product import upsert_product
from .save_nav import save_nav
from .capture_listing import capture_listing
from .save_link_reports import save_link_reports
from .save_subcategory_redirects import save_subcategory_redirects

View File

@@ -0,0 +1,3 @@
def _get(d, key, default=None):
v = d.get(key)
return default if v in (None, "", [], {}) else v

View File

@@ -0,0 +1,137 @@
# at top of persist_snapshot.py:
from typing import Optional, List
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional, Tuple
from sqlalchemy.dialects.postgresql import insert as pg_insert
from datetime import datetime
from sqlalchemy import (
select, update
)
from urllib.parse import urlparse
import re
from models.market import (
NavTop,
NavSub,
Listing,
ListingItem,
)
from db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int,
) -> None:
    """Persist one scraped listing page snapshot and commit.

    Opens its own session, delegates to _capture_listing, and commits so
    the scraper can call this fire-and-forget per listing URL.
    """
    async with get_session() as session:
        await _capture_listing(session, url, items, total_pages)
        await session.commit()
async def _capture_listing(
    session,
    url: str,
    items: List[str],
    total_pages: int,
) -> None:
    """Resolve the nav ids for *url* and persist the listing in *session*.

    Does not commit; the caller owns the transaction.
    """
    top_id, sub_id = await _nav_ids_from_list_url(session, url)
    await _save_listing(session, top_id, sub_id, items, total_pages)
async def _save_listing(session: AsyncSession, top_id: int, sub_id: Optional[int],
                        items: List[str], total_pages: Optional[int]) -> None:
    """Sync the ListingItem rows of the (top_id, sub_id) listing to *items*.

    Creates the active Listing row if absent, updates its total_pages, then
    diffs the incoming slugs against the active DB rows: vanished slugs are
    soft-deleted (deleted_at), new slugs are inserted. Does not commit; the
    caller owns the transaction.
    """
    res = await session.execute(
        select(Listing).where(Listing.top_id == top_id, Listing.sub_id == sub_id, Listing.deleted_at.is_(None))
    )
    listing = res.scalar_one_or_none()
    if not listing:
        listing = Listing(top_id=top_id, sub_id=sub_id, total_pages=total_pages)
        session.add(listing)
        # Flush so listing.id is assigned before the child-row queries below.
        await session.flush()
    else:
        listing.total_pages = total_pages
    # Normalize and deduplicate incoming slugs
    seen: set[str] = set()
    deduped: list[str] = []
    for s in items or []:
        if s and isinstance(s, str) and s not in seen:
            seen.add(s)
            deduped.append(s)
    if not deduped:
        # No usable slugs came in: keep the existing items untouched rather
        # than soft-deleting everything on a (possibly failed) empty scrape.
        return
    # Fetch existing slugs from the database
    res = await session.execute(
        select(ListingItem.slug)
        .where(ListingItem.listing_id == listing.id, ListingItem.deleted_at.is_(None))
    )
    existing_slugs = set(res.scalars().all())
    now = datetime.utcnow()
    # Slugs to delete (present in DB but not in the new data)
    to_delete = existing_slugs - seen
    if to_delete:
        await session.execute(
            update(ListingItem)
            .where(
                ListingItem.listing_id == listing.id,
                ListingItem.slug.in_(to_delete),
                ListingItem.deleted_at.is_(None)
            )
            .values(deleted_at=now)
        )
    # Slugs to insert (new ones not in DB)
    to_insert = seen - existing_slugs
    if to_insert:
        # NOTE(review): the on_conflict_do_nothing clause is disabled, so a
        # concurrent insert of the same (listing_id, slug) could raise —
        # confirm whether the unique constraint exists and re-enable if so.
        stmt = pg_insert(ListingItem).values(
            [{"listing_id": listing.id, "slug": s} for s in to_insert]
        )
        #.on_conflict_do_nothing(
        #    constraint="uq_listing_items_listing_slug"
        #)
        await session.execute(stmt)
async def _nav_ids_from_list_url(session: AsyncSession, list_url: str) -> Tuple[int, Optional[int]]:
    """Map a listing URL path like /<top>/<sub>.html to (top_id, sub_id).

    The top slug is the first path segment (lowercased); the optional sub
    slug is the second segment with any .html/.htm extension stripped.
    Raises ValueError (via _get_nav_ids) when either slug is unknown.
    """
    parts = [x for x in (urlparse(list_url).path or "").split("/") if x]
    top_slug = parts[0].lower() if parts else ""
    sub_slug = None
    if len(parts) >= 2:
        sub_slug = parts[1]
        if sub_slug.lower().endswith((".html", ".htm")):
            # BUG FIX: the pattern was r"\\.(html?|HTML?)$", which requires a
            # literal backslash and therefore never stripped the extension.
            sub_slug = re.sub(r"\.html?$", "", sub_slug, flags=re.IGNORECASE)
    return await _get_nav_ids(session, top_slug, sub_slug)
async def _get_nav_ids(session: AsyncSession, top_slug: str, sub_slug: Optional[str]) -> Tuple[int, Optional[int]]:
    """Look up active NavTop/NavSub ids for the given slugs.

    Returns (top_id, sub_id); sub_id is None when *sub_slug* is falsy.
    Raises ValueError when either slug cannot be resolved.
    """
    top_res = await session.execute(
        select(NavTop.id).where(NavTop.slug == top_slug, NavTop.deleted_at.is_(None))
    )
    top_id = top_res.scalar_one_or_none()
    if not top_id:
        raise ValueError(f"NavTop not found for slug: {top_slug}")
    if not sub_slug:
        return top_id, None
    sub_res = await session.execute(
        select(NavSub.id).where(NavSub.slug == sub_slug, NavSub.top_id == top_id, NavSub.deleted_at.is_(None))
    )
    sub_id = sub_res.scalar_one_or_none()
    if sub_id is None:
        raise ValueError(f"NavSub not found for slug: {sub_slug} under top_id={top_id}")
    return top_id, sub_id

View File

@@ -0,0 +1,35 @@
# at top of persist_snapshot.py:
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Dict
from models.market import (
ProductLog,
)
from db.session import get_session
async def log_product_result(ok: bool, payload: Dict) -> None:
    """Record one product-scrape outcome in its own session and commit."""
    async with get_session() as session:
        await _log_product_result(session, ok, payload)
        await session.commit()
async def _log_product_result(session: AsyncSession, ok: bool, payload: Dict) -> None:
    """Add a ProductLog row built from *payload* to *session* (no commit).

    Missing payload keys become NULL columns via dict.get().
    """
    field_names = (
        "slug", "href_tried", "error_type", "error_message", "http_status",
        "final_url", "transport_error", "title", "has_description_html",
        "has_description_short", "sections_count", "images_count",
        "embedded_images_count", "all_images_count",
    )
    columns = {name: payload.get(name) for name in field_names}
    session.add(ProductLog(ok=ok, **columns))

View File

@@ -0,0 +1,29 @@
# at top of persist_snapshot.py:
from typing import List
from typing import Dict, List
from models.market import (
LinkError,
LinkExternal,
)
from db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def save_link_reports(link_errors: List[Dict], link_externals: List[Dict]) -> None:
    """Persist broken-link and external-link reports from the scraper.

    Each dict in *link_errors* / *link_externals* is mapped onto a LinkError
    / LinkExternal row; missing keys become NULL columns. Commits at the end.
    """
    async with get_session() as session:
        for err in link_errors:
            session.add(LinkError(
                product_slug=err.get("product"),
                href=err.get("href"),
                text=err.get("text"),
                top=err.get("top"),
                sub=err.get("sub"),
                target_slug=err.get("target_slug"),
                type=err.get("type"),
            ))
        for ext in link_externals:
            session.add(LinkExternal(
                product_slug=ext.get("product"),
                href=ext.get("href"),
                text=ext.get("text"),
                host=ext.get("host"),
            ))
        await session.commit()

View File

@@ -0,0 +1,108 @@
# at top of persist_snapshot.py:
from datetime import datetime
from sqlalchemy import (
select, tuple_
)
from typing import Dict
from models.market import (
NavTop,
NavSub,
)
from db.session import get_session
async def save_nav(nav: Dict) -> None:
    """Sync the navigation tree from a scraped *nav* dict and commit."""
    async with get_session() as session:
        await _save_nav(session, nav)
        await session.commit()
async def _save_nav(session, nav: Dict) -> None:
    """Sync NavTop/NavSub rows to the scraped *nav* structure (no commit).

    nav["cats"] maps a top-level label to {"slug": ..., "subs": [{"slug",
    "label", "href"}, ...]}. Rows missing from the incoming nav are
    soft-deleted; matching rows are updated in place and revived
    (deleted_at=None) if previously soft-deleted.
    """
    # FIX: removed leftover debug prints ('===SAVE NAV===' banner and the
    # raw nav dump) that polluted scraper output on every run.
    now = datetime.utcnow()
    incoming_top_slugs = set()
    incoming_sub_keys = set()  # (top_slug, sub_slug)
    # First pass: collect every incoming slug so stale rows can be detected.
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        incoming_top_slugs.add(top_slug)
        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if sub_slug:
                incoming_sub_keys.add((top_slug, sub_slug))
    # Soft-delete stale NavSub entries; joins NavTop to compare by top slug.
    subs_to_delete = await session.execute(
        select(NavSub)
        .join(NavTop, NavSub.top_id == NavTop.id)
        .where(
            NavSub.deleted_at.is_(None),
            ~tuple_(NavTop.slug, NavSub.slug).in_(incoming_sub_keys)
        )
    )
    for sub in subs_to_delete.scalars():
        sub.deleted_at = now
    # Soft-delete stale NavTop entries
    tops_to_delete = await session.execute(
        select(NavTop)
        .where(
            NavTop.deleted_at.is_(None),
            ~NavTop.slug.in_(incoming_top_slugs)
        )
    )
    for top in tops_to_delete.scalars():
        top.deleted_at = now
    await session.flush()
    # Second pass: upsert NavTop and NavSub rows from the incoming nav.
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        res = await session.execute(
            select(NavTop).where(NavTop.slug == top_slug)
        )
        top = res.scalar_one_or_none()
        if top:
            top.label = label
            top.deleted_at = None  # revive if previously soft-deleted
        else:
            top = NavTop(label=label, slug=top_slug)
            session.add(top)
            # Flush so top.id exists for the NavSub foreign keys below.
            await session.flush()
        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if not sub_slug:
                continue
            sub_label = s.get("label")
            sub_href = s.get("href")
            res_sub = await session.execute(
                select(NavSub).where(
                    NavSub.slug == sub_slug,
                    NavSub.top_id == top.id
                )
            )
            sub = res_sub.scalar_one_or_none()
            if sub:
                sub.label = sub_label
                sub.href = sub_href
                sub.deleted_at = None  # revive if previously soft-deleted
            else:
                session.add(NavSub(top_id=top.id, label=sub_label, slug=sub_slug, href=sub_href))

View File

@@ -0,0 +1,32 @@
# at top of persist_snapshot.py:
from typing import Dict
from datetime import datetime
from sqlalchemy import (
update
)
from models.market import (
SubcategoryRedirect,
)
from db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def save_subcategory_redirects(mapping: Dict[str, str]) -> None:
    """Replace the stored subcategory redirects with *mapping* and commit."""
    async with get_session() as session:
        await _save_subcategory_redirects(session, mapping)
        await session.commit()
async def _save_subcategory_redirects(session, mapping: Dict[str, str]) -> None:
    """Replace all active redirects with *mapping* (no commit).

    Every live SubcategoryRedirect row is soft-deleted first, then one
    fresh row is added per old_path -> new_path pair.
    """
    stamp = datetime.utcnow()
    await session.execute(
        update(SubcategoryRedirect)
        .where(SubcategoryRedirect.deleted_at.is_(None))
        .values(deleted_at=stamp)
    )
    for old_path, new_path in mapping.items():
        session.add(SubcategoryRedirect(old_path=old_path, new_path=new_path))

View File

@@ -0,0 +1,237 @@
# at top of persist_snapshot.py:
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Dict
from datetime import datetime
from sqlalchemy import (
func, select, update
)
from models.market import (
Product,
ProductImage,
ProductSection,
ProductLabel,
ProductSticker,
ProductAttribute,
ProductNutrition,
ProductAllergen
)
from db.session import get_session
from ._get import _get
from .log_product_result import _log_product_result
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def _upsert_product(session: AsyncSession, d: Dict) -> Product:
    """Create or update the Product for d["slug"] and sync its child rows.

    Scalar fields are overwritten from *d* (via _get, which normalizes
    None/""/[]/{} to None). Each child collection — sections, images,
    labels, stickers, attributes, nutrition, allergens — is diffed against
    the active DB rows: new entries are inserted, vanished ones are
    soft-deleted (deleted_at). Does not commit; the caller owns the
    transaction. Raises ValueError when *d* has no slug.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    res = await session.execute(select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)))
    p = res.scalar_one_or_none()
    if not p:
        p = Product(slug=slug)
        session.add(p)
        # BUG FIX: flush immediately so a brand-new product gets its id
        # assigned; previously p.id was still None here, so every child row
        # below was created with product_id=None for new products.
        await session.flush()
    p.title = _get(d, "title")
    p.image = _get(d, "image")
    p.description_short = _get(d, "description_short")
    p.description_html = _get(d, "description_html")
    p.suma_href = _get(d, "suma_href")
    p.brand = _get(d, "brand")
    p.rrp = _get(d, "rrp")
    p.rrp_currency = _get(d, "rrp_currency")
    p.rrp_raw = _get(d, "rrp_raw")
    p.price_per_unit = _get(d, "price_per_unit")
    p.price_per_unit_currency = _get(d, "price_per_unit_currency")
    p.price_per_unit_raw = _get(d, "price_per_unit_raw")
    p.special_price = _get(d, "special_price")
    p.special_price_currency = _get(d, "special_price_currency")
    p.special_price_raw = _get(d, "special_price_raw")
    p.regular_price = _get(d, "regular_price")
    p.regular_price_currency = _get(d, "regular_price_currency")
    p.regular_price_raw = _get(d, "regular_price_raw")
    p.case_size_count = _get(d, "case_size_count")
    p.case_size_item_qty = _get(d, "case_size_item_qty")
    p.case_size_item_unit = _get(d, "case_size_item_unit")
    p.case_size_raw = _get(d, "case_size_raw")
    p.ean = d.get("ean") or d.get("barcode") or None  # either key may carry the barcode
    p.sku = d.get("sku")
    p.unit_size = d.get("unit_size")
    p.pack_size = d.get("pack_size")
    p.updated_at = func.now()  # DB-side timestamp
    now = datetime.utcnow()  # one timestamp shared by all soft-deletes below
    # ProductSection sync — keyed on (title, html)
    existing_sections = await session.execute(select(ProductSection).where(ProductSection.product_id == p.id, ProductSection.deleted_at.is_(None)))
    existing_sections_set = {(s.title, s.html) for s in existing_sections.scalars()}
    new_sections_set = set()
    for sec in d.get("sections") or []:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            new_sections_set.add((sec["title"], sec["html"]))
            if (sec["title"], sec["html"]) not in existing_sections_set:
                session.add(ProductSection(product_id=p.id, title=sec["title"], html=sec["html"]))
    for s in existing_sections_set - new_sections_set:
        await session.execute(update(ProductSection).where(ProductSection.product_id == p.id, ProductSection.title == s[0], ProductSection.html == s[1], ProductSection.deleted_at.is_(None)).values(deleted_at=now))
    # ProductImage sync — keyed on (url, kind); position reflects scrape order
    existing_images = await session.execute(select(ProductImage).where(ProductImage.product_id == p.id, ProductImage.deleted_at.is_(None)))
    existing_images_set = {(img.url, img.kind) for img in existing_images.scalars()}
    new_images_set = set()
    for kind, urls in [
        ("gallery", d.get("images") or []),
        ("embedded", d.get("embedded_image_urls") or []),
        ("all", d.get("all_image_urls") or []),
    ]:
        for idx, url in enumerate(urls):
            if url:
                new_images_set.add((url, kind))
                if (url, kind) not in existing_images_set:
                    session.add(ProductImage(product_id=p.id, url=url, position=idx, kind=kind))
    for img in existing_images_set - new_images_set:
        await session.execute(update(ProductImage).where(ProductImage.product_id == p.id, ProductImage.url == img[0], ProductImage.kind == img[1], ProductImage.deleted_at.is_(None)).values(deleted_at=now))
    # ProductLabel sync — keyed on stripped name
    existing_labels = await session.execute(select(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.deleted_at.is_(None)))
    existing_labels_set = {label.name.strip() for label in existing_labels.scalars()}
    new_labels = {str(name).strip() for name in (d.get("labels") or []) if name}
    for name in new_labels - existing_labels_set:
        session.add(ProductLabel(product_id=p.id, name=name))
    for name in existing_labels_set - new_labels:
        await session.execute(update(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.name == name, ProductLabel.deleted_at.is_(None)).values(deleted_at=now))
    # ProductSticker sync — keyed on name
    existing_stickers = await session.execute(select(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.deleted_at.is_(None)))
    existing_stickers_set = {sticker.name.strip() for sticker in existing_stickers.scalars()}
    # NOTE(review): incoming names are lowercased but existing ones are not,
    # so a mixed-case sticker stored in the DB would be re-inserted lowercase
    # and the original soft-deleted on every run — confirm DB values are
    # already lowercase before relying on this diff.
    new_stickers = {str(name).strip().lower() for name in (d.get("stickers") or []) if name}
    for name in new_stickers - existing_stickers_set:
        session.add(ProductSticker(product_id=p.id, name=name))
    for name in existing_stickers_set - new_stickers:
        await session.execute(update(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.name == name, ProductSticker.deleted_at.is_(None)).values(deleted_at=now))
    # ProductAttribute sync — keys are namespaced as "<source>/<key>"
    existing_attrs = await session.execute(select(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.deleted_at.is_(None)))
    existing_attrs_set = {(a.key, a.value) for a in existing_attrs.scalars()}
    new_attrs_set = set()
    for src, prefix in [(d.get("info_table") or {}, "info_table"), (d.get("oe_list_price") or {}, "oe_list_price")]:
        for k, v in src.items():
            key = f"{prefix}/{str(k).strip()}"
            val = None if v is None else str(v)
            new_attrs_set.add((key, val))
            if (key, val) not in existing_attrs_set:
                session.add(ProductAttribute(product_id=p.id, key=key, value=val))
    for key, val in existing_attrs_set - new_attrs_set:
        await session.execute(update(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.key == key, ProductAttribute.value == val, ProductAttribute.deleted_at.is_(None)).values(deleted_at=now))
    # ProductNutrition sync — accepts either a {key: value} dict (no units)
    # or a list of {"key", "value", "unit"} rows from the scraper.
    existing_nuts = await session.execute(select(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.deleted_at.is_(None)))
    existing_nuts_set = {(n.key, n.value, n.unit) for n in existing_nuts.scalars()}
    new_nuts_set = set()
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        for k, v in nutrition.items():
            key, val = str(k).strip(), str(v) if v is not None else None
            new_nuts_set.add((key, val, None))
            if (key, val, None) not in existing_nuts_set:
                session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=None))
    elif isinstance(nutrition, list):
        for row in nutrition:
            try:
                key = str(row.get("key") or "").strip()
                val = None if row.get("value") is None else str(row.get("value"))
                unit = None if row.get("unit") is None else str(row.get("unit"))
                if key:
                    new_nuts_set.add((key, val, unit))
                    if (key, val, unit) not in existing_nuts_set:
                        session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=unit))
            except Exception:
                # Best-effort: skip malformed rows rather than abort the upsert.
                continue
    for key, val, unit in existing_nuts_set - new_nuts_set:
        await session.execute(update(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.key == key, ProductNutrition.value == val, ProductNutrition.unit == unit, ProductNutrition.deleted_at.is_(None)).values(deleted_at=now))
    # ProductAllergen sync — accepts plain strings (contains=True) or
    # {"name", "contains"} dicts.
    existing_allergens = await session.execute(select(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.deleted_at.is_(None)))
    existing_allergens_set = {(a.name, a.contains) for a in existing_allergens.scalars()}
    new_allergens_set = set()
    for a in d.get("allergens") or []:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            new_allergens_set.add((nm, contains))
            if (nm, contains) not in existing_allergens_set:
                session.add(ProductAllergen(product_id=p.id, name=nm, contains=contains))
    for name, contains in existing_allergens_set - new_allergens_set:
        await session.execute(update(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.name == name, ProductAllergen.contains == contains, ProductAllergen.deleted_at.is_(None)).values(deleted_at=now))
    await session.flush()
    return p
async def upsert_product(
    slug,
    href,
    d,
):
    """Upsert one scraped product dict *d* and log the outcome.

    On success, commits the product data together with an ok=True
    ProductLog row. On failure, prints diagnostics, rolls back the partial
    product changes, commits an ok=False ProductLog row, and re-raises the
    original exception for the caller.
    """
    async with get_session() as session:
        try:
            await _upsert_product(session, d)
            await _log_product_result(session, ok=True, payload={
                "slug": slug,
                "href_tried": href,
                "title": d.get("title"),
                "has_description_html": bool(d.get("description_html")),
                "has_description_short": bool(d.get("description_short")),
                "sections_count": len(d.get("sections") or []),
                # BUG FIX: these three counts used len(d.get(...)) without the
                # "or []" guard and raised TypeError when the key was absent.
                "images_count": len(d.get("images") or []),
                "embedded_images_count": len(d.get("embedded_image_urls") or []),
                "all_images_count": len(d.get("all_image_urls") or []),
            })
        except Exception as e:
            print(f"[ERROR] Failed to upsert product '{d.get('slug')}'")
            # BUG FIX: was f"  Title: {d}.get('title')", which printed the
            # entire dict followed by the literal text ".get('title')".
            print(f"  Title: {d.get('title')}")
            print(f"  URL: {d.get('suma_href')}")
            print(f"  Error type: {type(e).__name__}")
            print(f"  Error message: {str(e)}")
            import traceback
            traceback.print_exc()
            # BUG FIX: discard the dirty/partial product changes so the log
            # row can be added to a clean session, and commit it before
            # re-raising; previously the raise skipped the commit and the
            # failure log was silently lost.
            await session.rollback()
            await _log_product_result(session, ok=False, payload={
                "slug": d.get("slug"),
                "href_tried": d.get("suma_href"),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "title": d.get("title"),
            })
            await session.commit()
            raise
        await session.commit()