From 6271a715a1cc5db71541a14252a22d5dae2788f7 Mon Sep 17 00:00:00 2001 From: giles Date: Mon, 9 Feb 2026 23:16:34 +0000 Subject: [PATCH] feat: initialize market app with browsing, product, and scraping code Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment --- .gitea/workflows/ci.yml | 63 ++ .gitignore | 12 + Dockerfile | 33 + README.md | 67 ++ app.py | 54 ++ bp/__init__.py | 0 bp/api/__init__.py | 0 bp/api/routes.py | 414 +++++++++++ bp/browse/__init__.py | 7 + bp/browse/routes.py | 162 +++++ bp/browse/services/__init__.py | 13 + bp/browse/services/blacklist/category.py | 12 + bp/browse/services/blacklist/product.py | 15 + .../services/blacklist/product_details.py | 11 + bp/browse/services/cache_backend.py | 367 ++++++++++ bp/browse/services/db_backend.py | 657 ++++++++++++++++++ bp/browse/services/nav.py | 163 +++++ bp/browse/services/products.py | 118 ++++ bp/browse/services/services.py | 179 +++++ bp/browse/services/slugs.py | 24 + bp/browse/services/state.py | 21 + bp/market/__init__.py | 7 + bp/market/admin/__init__.py | 0 bp/market/admin/routes.py | 28 + bp/market/filters/__init__.py | 0 bp/market/filters/qs.py | 101 +++ bp/market/routes.py | 44 ++ bp/product/routes.py | 248 +++++++ bp/product/services/__init__.py | 3 + bp/product/services/product_operations.py | 95 +++ entrypoint.sh | 31 + scrape-test.sh | 6 + scrape.sh | 5 + scrape/__init__.py | 0 scrape/build_snapshot/__init__.py | 1 + scrape/build_snapshot/build_snapshot.py | 104 +++ .../tools/APP_ROOT_PLACEHOLDER.py | 1 + scrape/build_snapshot/tools/__init__.py | 1 + scrape/build_snapshot/tools/_anchor_text.py | 6 + .../tools/_collect_html_img_srcs.py | 16 + .../tools/_dedupe_preserve_order.py | 14 + .../tools/_product_dict_is_cf.py | 32 + .../tools/_resolve_sub_redirects.py | 34 + .../tools/_rewrite_links_fragment.py | 100 +++ 
scrape/build_snapshot/tools/candidate_subs.py | 14 + .../build_snapshot/tools/capture_category.py | 18 + .../tools/capture_product_slugs.py | 25 + scrape/build_snapshot/tools/capture_sub.py | 22 + .../tools/fetch_and_upsert_product.py | 106 +++ .../tools/fetch_and_upsert_products.py | 49 ++ scrape/build_snapshot/tools/rewrite_nav.py | 24 + scrape/build_snapshot/tools/valid_subs.py | 16 + scrape/get_auth.py | 244 +++++++ scrape/html_utils.py | 44 ++ scrape/http_client.py | 220 ++++++ scrape/listings.py | 289 ++++++++ scrape/nav.py | 104 +++ scrape/persist_api/__init__.py | 6 + scrape/persist_api/capture_listing.py | 27 + scrape/persist_api/log_product_result.py | 24 + scrape/persist_api/save_nav.py | 19 + .../persist_api/save_subcategory_redirects.py | 15 + scrape/persist_api/upsert_product.py | 256 +++++++ scrape/persist_snapshot/__init__.py | 7 + scrape/persist_snapshot/_get.py | 3 + scrape/persist_snapshot/capture_listing.py | 137 ++++ scrape/persist_snapshot/log_product_result.py | 35 + scrape/persist_snapshot/save_link_reports.py | 29 + scrape/persist_snapshot/save_nav.py | 108 +++ .../save_subcategory_redirects.py | 32 + scrape/persist_snapshot/upsert_product.py | 237 +++++++ scrape/product/__init__.py | 1 + scrape/product/extractors/__init__.py | 13 + scrape/product/extractors/breadcrumbs.py | 68 ++ .../extractors/description_sections.py | 43 ++ scrape/product/extractors/images.py | 89 +++ scrape/product/extractors/info_table.py | 76 ++ scrape/product/extractors/labels.py | 41 ++ scrape/product/extractors/nutrition_ex.py | 129 ++++ scrape/product/extractors/oe_list_price.py | 56 ++ .../extractors/regular_price_fallback.py | 33 + .../product/extractors/short_description.py | 19 + scrape/product/extractors/stickers.py | 30 + scrape/product/extractors/title.py | 17 + scrape/product/helpers/desc.py | 165 +++++ scrape/product/helpers/html.py | 53 ++ scrape/product/helpers/price.py | 42 ++ scrape/product/helpers/text.py | 16 + scrape/product/product_core.py | 48 ++ 
scrape/product/product_detail.py | 4 + scrape/product/registry.py | 20 + templates/_types/browse/_admin.html | 7 + templates/_types/browse/_main_panel.html | 5 + templates/_types/browse/_oob_elements.html | 37 + templates/_types/browse/_product_card.html | 104 +++ templates/_types/browse/_product_cards.html | 107 +++ .../browse/desktop/_category_selector.html | 40 ++ .../_types/browse/desktop/_filter/brand.html | 40 ++ .../_types/browse/desktop/_filter/labels.html | 44 ++ .../_types/browse/desktop/_filter/like.html | 38 + .../_types/browse/desktop/_filter/search.html | 44 ++ .../_types/browse/desktop/_filter/sort.html | 34 + .../browse/desktop/_filter/stickers.html | 46 ++ templates/_types/browse/desktop/menu.html | 37 + templates/_types/browse/index.html | 13 + templates/_types/browse/like/button.html | 20 + .../browse/mobile/_filter/brand_ul.html | 40 ++ .../_types/browse/mobile/_filter/index.html | 30 + .../_types/browse/mobile/_filter/labels.html | 47 ++ .../_types/browse/mobile/_filter/like.html | 40 ++ .../_types/browse/mobile/_filter/search.html | 40 ++ .../_types/browse/mobile/_filter/sort_ul.html | 33 + .../browse/mobile/_filter/stickers.html | 50 ++ .../_types/browse/mobile/_filter/summary.html | 120 ++++ templates/_types/market/_admin.html | 7 + templates/_types/market/_main_panel.html | 23 + templates/_types/market/_oob_elements.html | 30 + templates/_types/market/_title.html | 17 + .../_types/market/admin/_main_panel.html | 1 + templates/_types/market/admin/_nav.html | 2 + .../_types/market/admin/_oob_elements.html | 29 + .../_types/market/admin/header/_header.html | 11 + templates/_types/market/admin/index.html | 19 + templates/_types/market/desktop/_nav.html | 38 + templates/_types/market/header/_header.html | 11 + templates/_types/market/index.html | 25 + .../_types/market/mobile/_nav_panel.html | 110 +++ templates/_types/market/mobile/menu.html | 6 + templates/_types/product/_added.html | 25 + templates/_types/product/_cart.html | 250 +++++++ 
templates/_types/product/_main_panel.html | 131 ++++ templates/_types/product/_meta.html | 106 +++ templates/_types/product/_oob_elements.html | 49 ++ templates/_types/product/_prices.html | 33 + templates/_types/product/_title.html | 2 + templates/_types/product/admin/_nav.html | 2 + .../_types/product/admin/_oob_elements.html | 40 ++ .../_types/product/admin/header/_header.html | 11 + templates/_types/product/admin/index.html | 39 ++ templates/_types/product/header/_header.html | 15 + templates/_types/product/index.html | 61 ++ templates/_types/product/prices.html | 66 ++ 142 files changed, 8517 insertions(+) create mode 100644 .gitea/workflows/ci.yml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 app.py create mode 100644 bp/__init__.py create mode 100644 bp/api/__init__.py create mode 100644 bp/api/routes.py create mode 100644 bp/browse/__init__.py create mode 100644 bp/browse/routes.py create mode 100644 bp/browse/services/__init__.py create mode 100644 bp/browse/services/blacklist/category.py create mode 100644 bp/browse/services/blacklist/product.py create mode 100644 bp/browse/services/blacklist/product_details.py create mode 100644 bp/browse/services/cache_backend.py create mode 100644 bp/browse/services/db_backend.py create mode 100644 bp/browse/services/nav.py create mode 100644 bp/browse/services/products.py create mode 100644 bp/browse/services/services.py create mode 100644 bp/browse/services/slugs.py create mode 100644 bp/browse/services/state.py create mode 100644 bp/market/__init__.py create mode 100644 bp/market/admin/__init__.py create mode 100644 bp/market/admin/routes.py create mode 100644 bp/market/filters/__init__.py create mode 100644 bp/market/filters/qs.py create mode 100644 bp/market/routes.py create mode 100644 bp/product/routes.py create mode 100644 bp/product/services/__init__.py create mode 100644 bp/product/services/product_operations.py create mode 100644 entrypoint.sh 
create mode 100644 scrape-test.sh create mode 100644 scrape.sh create mode 100644 scrape/__init__.py create mode 100644 scrape/build_snapshot/__init__.py create mode 100644 scrape/build_snapshot/build_snapshot.py create mode 100644 scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py create mode 100644 scrape/build_snapshot/tools/__init__.py create mode 100644 scrape/build_snapshot/tools/_anchor_text.py create mode 100644 scrape/build_snapshot/tools/_collect_html_img_srcs.py create mode 100644 scrape/build_snapshot/tools/_dedupe_preserve_order.py create mode 100644 scrape/build_snapshot/tools/_product_dict_is_cf.py create mode 100644 scrape/build_snapshot/tools/_resolve_sub_redirects.py create mode 100644 scrape/build_snapshot/tools/_rewrite_links_fragment.py create mode 100644 scrape/build_snapshot/tools/candidate_subs.py create mode 100644 scrape/build_snapshot/tools/capture_category.py create mode 100644 scrape/build_snapshot/tools/capture_product_slugs.py create mode 100644 scrape/build_snapshot/tools/capture_sub.py create mode 100644 scrape/build_snapshot/tools/fetch_and_upsert_product.py create mode 100644 scrape/build_snapshot/tools/fetch_and_upsert_products.py create mode 100644 scrape/build_snapshot/tools/rewrite_nav.py create mode 100644 scrape/build_snapshot/tools/valid_subs.py create mode 100644 scrape/get_auth.py create mode 100644 scrape/html_utils.py create mode 100644 scrape/http_client.py create mode 100644 scrape/listings.py create mode 100644 scrape/nav.py create mode 100644 scrape/persist_api/__init__.py create mode 100644 scrape/persist_api/capture_listing.py create mode 100644 scrape/persist_api/log_product_result.py create mode 100644 scrape/persist_api/save_nav.py create mode 100644 scrape/persist_api/save_subcategory_redirects.py create mode 100644 scrape/persist_api/upsert_product.py create mode 100644 scrape/persist_snapshot/__init__.py create mode 100644 scrape/persist_snapshot/_get.py create mode 100644 
scrape/persist_snapshot/capture_listing.py create mode 100644 scrape/persist_snapshot/log_product_result.py create mode 100644 scrape/persist_snapshot/save_link_reports.py create mode 100644 scrape/persist_snapshot/save_nav.py create mode 100644 scrape/persist_snapshot/save_subcategory_redirects.py create mode 100644 scrape/persist_snapshot/upsert_product.py create mode 100644 scrape/product/__init__.py create mode 100644 scrape/product/extractors/__init__.py create mode 100644 scrape/product/extractors/breadcrumbs.py create mode 100644 scrape/product/extractors/description_sections.py create mode 100644 scrape/product/extractors/images.py create mode 100644 scrape/product/extractors/info_table.py create mode 100644 scrape/product/extractors/labels.py create mode 100644 scrape/product/extractors/nutrition_ex.py create mode 100644 scrape/product/extractors/oe_list_price.py create mode 100644 scrape/product/extractors/regular_price_fallback.py create mode 100644 scrape/product/extractors/short_description.py create mode 100644 scrape/product/extractors/stickers.py create mode 100644 scrape/product/extractors/title.py create mode 100644 scrape/product/helpers/desc.py create mode 100644 scrape/product/helpers/html.py create mode 100644 scrape/product/helpers/price.py create mode 100644 scrape/product/helpers/text.py create mode 100644 scrape/product/product_core.py create mode 100644 scrape/product/product_detail.py create mode 100644 scrape/product/registry.py create mode 100644 templates/_types/browse/_admin.html create mode 100644 templates/_types/browse/_main_panel.html create mode 100644 templates/_types/browse/_oob_elements.html create mode 100644 templates/_types/browse/_product_card.html create mode 100644 templates/_types/browse/_product_cards.html create mode 100644 templates/_types/browse/desktop/_category_selector.html create mode 100644 templates/_types/browse/desktop/_filter/brand.html create mode 100644 templates/_types/browse/desktop/_filter/labels.html 
create mode 100644 templates/_types/browse/desktop/_filter/like.html create mode 100644 templates/_types/browse/desktop/_filter/search.html create mode 100644 templates/_types/browse/desktop/_filter/sort.html create mode 100644 templates/_types/browse/desktop/_filter/stickers.html create mode 100644 templates/_types/browse/desktop/menu.html create mode 100644 templates/_types/browse/index.html create mode 100644 templates/_types/browse/like/button.html create mode 100644 templates/_types/browse/mobile/_filter/brand_ul.html create mode 100644 templates/_types/browse/mobile/_filter/index.html create mode 100644 templates/_types/browse/mobile/_filter/labels.html create mode 100644 templates/_types/browse/mobile/_filter/like.html create mode 100644 templates/_types/browse/mobile/_filter/search.html create mode 100644 templates/_types/browse/mobile/_filter/sort_ul.html create mode 100644 templates/_types/browse/mobile/_filter/stickers.html create mode 100644 templates/_types/browse/mobile/_filter/summary.html create mode 100644 templates/_types/market/_admin.html create mode 100644 templates/_types/market/_main_panel.html create mode 100644 templates/_types/market/_oob_elements.html create mode 100644 templates/_types/market/_title.html create mode 100644 templates/_types/market/admin/_main_panel.html create mode 100644 templates/_types/market/admin/_nav.html create mode 100644 templates/_types/market/admin/_oob_elements.html create mode 100644 templates/_types/market/admin/header/_header.html create mode 100644 templates/_types/market/admin/index.html create mode 100644 templates/_types/market/desktop/_nav.html create mode 100644 templates/_types/market/header/_header.html create mode 100644 templates/_types/market/index.html create mode 100644 templates/_types/market/mobile/_nav_panel.html create mode 100644 templates/_types/market/mobile/menu.html create mode 100644 templates/_types/product/_added.html create mode 100644 templates/_types/product/_cart.html create 
mode 100644 templates/_types/product/_main_panel.html create mode 100644 templates/_types/product/_meta.html create mode 100644 templates/_types/product/_oob_elements.html create mode 100644 templates/_types/product/_prices.html create mode 100644 templates/_types/product/_title.html create mode 100644 templates/_types/product/admin/_nav.html create mode 100644 templates/_types/product/admin/_oob_elements.html create mode 100644 templates/_types/product/admin/header/_header.html create mode 100644 templates/_types/product/admin/index.html create mode 100644 templates/_types/product/header/_header.html create mode 100644 templates/_types/product/index.html create mode 100644 templates/_types/product/prices.html diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml new file mode 100644 index 0000000..f7f8869 --- /dev/null +++ b/.gitea/workflows/ci.yml @@ -0,0 +1,63 @@ +name: Build and Deploy + +on: + push: + branches: [main] + +env: + REGISTRY: registry.rose-ash.com:5000 + IMAGE: market + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install tools + run: | + apt-get update && apt-get install -y --no-install-recommends openssh-client + + - name: Set up SSH + env: + SSH_KEY: ${{ secrets.DEPLOY_SSH_KEY }} + DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }} + run: | + mkdir -p ~/.ssh + echo "$SSH_KEY" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null || true + + - name: Pull latest code on server + env: + DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }} + run: | + ssh "root@$DEPLOY_HOST" " + cd /root/market + git fetch origin main + git reset --hard origin/main + " + + - name: Build and push image + env: + DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }} + run: | + ssh "root@$DEPLOY_HOST" " + cd /root/market + docker build --build-arg CACHEBUST=\$(date +%s) -t ${{ env.REGISTRY }}/${{ env.IMAGE }}:latest -t ${{ env.REGISTRY }}/${{ env.IMAGE }}:${{ github.sha }} . 
+ docker push ${{ env.REGISTRY }}/${{ env.IMAGE }}:latest + docker push ${{ env.REGISTRY }}/${{ env.IMAGE }}:${{ github.sha }} + " + + - name: Deploy stack + env: + DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }} + run: | + ssh "root@$DEPLOY_HOST" " + cd /root/market + source .env + docker stack deploy -c docker-compose.yml market + echo 'Waiting for services to update...' + sleep 10 + docker stack services market + " diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1e06fbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +__pycache__/ +*.pyc +*.pyo +.env +node_modules/ +_snapshot/ +_debug/ +*.egg-info/ +dist/ +build/ +.venv/ +venv/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..706bdc7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +# syntax=docker/dockerfile:1 + +# ---------- Python application ---------- +FROM python:3.11-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + APP_PORT=8000 \ + APP_MODULE=app:app + +WORKDIR /app + +# Install system deps + psql client +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +COPY . . + +# ---------- Runtime setup ---------- +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +RUN useradd -m -u 10001 appuser && chown -R appuser:appuser /app +USER appuser + +EXPOSE ${APP_PORT} +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..7887579 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# Market App + +Product browsing and marketplace application for the Rose Ash cooperative. 
+ +## Overview + +The Market app is one of three microservices split from the original coop monolith: + +- **coop** (:8000) - Blog, calendar, auth, settings +- **market** (:8001) - Product browsing, categories, product detail +- **cart** (:8002) - Shopping cart, orders, payments + +## Architecture + +- **Framework:** Quart (async Flask) +- **Database:** PostgreSQL 16 with SQLAlchemy 2.0 (async) +- **Cache:** Redis (tag-based page cache) +- **Frontend:** HTMX + Jinja2 + Tailwind CSS +- **Data:** Products scraped from Suma Wholesale + +## Blueprints + +- `bp/market/` - Market root (navigation, category listing) +- `bp/browse/` - Product browsing with filters and infinite scroll +- `bp/product/` - Product detail pages +- `bp/api/` - Product sync API (used by scraper) + +## Development + + # Install dependencies + pip install -r requirements.txt + + # Set environment variables + export $(grep -v '^#' .env | xargs) + + # Run migrations + alembic upgrade head + + # Scrape products + bash scrape.sh + + # Run the dev server + hypercorn app:app --reload --bind 0.0.0.0:8001 + +## Scraping + + # Full scrape (max 50 pages, 200k products, 8 concurrent) + bash scrape.sh + + # Test scraping + bash scrape-test.sh + +## Docker + + docker build -t market . 
+ docker run -p 8001:8000 --env-file .env market + +## Environment Variables + + DATABASE_URL_ASYNC=postgresql+asyncpg://user:pass@localhost/coop + REDIS_URL=redis://localhost:6379/0 + SECRET_KEY=your-secret-key + SUMA_USER=your-suma-username + SUMA_PASSWORD=your-suma-password + APP_URL_COOP=http://localhost:8000 + APP_URL_MARKET=http://localhost:8001 + APP_URL_CART=http://localhost:8002 diff --git a/app.py b/app.py new file mode 100644 index 0000000..a5e8e78 --- /dev/null +++ b/app.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from quart import g + +from shared.factory import create_base_app +from config import config + +from suma_browser.app.bp import register_market_bp + + +async def market_context() -> dict: + """ + Market app context processor. + + - menu_items: fetched from coop internal API + - cart_count/cart_total: fetched from cart internal API + """ + from shared.context import base_context + from shared.internal_api import get as api_get, dictobj + + ctx = await base_context() + + # Menu items from coop API (wrapped for attribute access in templates) + menu_data = await api_get("coop", "/internal/menu-items") + ctx["menu_items"] = dictobj(menu_data) if menu_data else [] + + # Cart data from cart API + cart_data = await api_get("cart", "/internal/cart/summary", forward_session=True) + if cart_data: + ctx["cart_count"] = cart_data.get("count", 0) + ctx["cart_total"] = cart_data.get("total", 0) + else: + ctx["cart_count"] = 0 + ctx["cart_total"] = 0 + + return ctx + + +def create_app() -> "Quart": + app = create_base_app("market", context_fn=market_context) + + # Market blueprint at root (was /market in monolith) + app.register_blueprint( + register_market_bp( + url_prefix="/", + title=config()["coop_title"], + ), + url_prefix="/", + ) + + return app + + +app = create_app() diff --git a/bp/__init__.py b/bp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bp/api/__init__.py b/bp/api/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/bp/api/routes.py b/bp/api/routes.py new file mode 100644 index 0000000..5ab7b10 --- /dev/null +++ b/bp/api/routes.py @@ -0,0 +1,414 @@ +# products_api_async.py +from __future__ import annotations + +from datetime import datetime, timezone +from decimal import Decimal +from typing import Any, Dict, List, Tuple, Iterable, Optional + +from quart import Blueprint, request, jsonify, g +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from scrape.persist_snapshot.log_product_result import _log_product_result +from scrape.persist_snapshot.save_nav import _save_nav +from scrape.persist_snapshot.capture_listing import _capture_listing +from scrape.persist_snapshot.save_subcategory_redirects import _save_subcategory_redirects + +# ⬇️ Import your models (names match your current file) +from models.market import ( + Product, + ProductImage, + ProductSection, + ProductLabel, + ProductSticker, + ProductAttribute, + ProductNutrition, + ProductAllergen, +) + +from suma_browser.app.redis_cacher import clear_cache +from suma_browser.app.csrf import csrf_exempt + + +products_api = Blueprint("products_api", __name__, url_prefix="/api/products") + +# ---- Comparison config (matches your schema) -------------------------------- + +PRODUCT_FIELDS: List[str] = [ + "slug", + "title", + "image", + "description_short", + "description_html", + "suma_href", + "brand", + "rrp", "rrp_currency", "rrp_raw", + "price_per_unit", "price_per_unit_currency", "price_per_unit_raw", + "special_price", "special_price_currency", "special_price_raw", + "regular_price", "regular_price_currency", "regular_price_raw", + "oe_list_price", + "case_size_count", "case_size_item_qty", "case_size_item_unit", "case_size_raw", + "ean", "sku", "unit_size", "pack_size", +] + +# rel_name -> (Model, fields_to_compare, key_for_orderless_compare) +CHILD_SPECS: Dict[str, Tuple[Any, List[str], str]] = { + "images": 
(ProductImage, ["url", "position", "kind"], "url"), + "sections": (ProductSection, ["title", "html"], "title"), + "labels": (ProductLabel, ["name"], "name"), + "stickers": (ProductSticker, ["name"], "name"), + "attributes": (ProductAttribute, ["key", "value"], "key"), + "nutrition": (ProductNutrition, ["key", "value", "unit"], "key"), + "allergens": (ProductAllergen, ["name", "contains"], "name"), +} + +def _now_utc(): + return datetime.now(timezone.utc) + +def _norm_scalar(v: Any) -> Any: + if isinstance(v, Decimal): + s = format(v.normalize(), "f") + return "0" if s in ("-0", "-0.0") else s + if isinstance(v, bool): + return bool(v) + if isinstance(v, (int, float, str)) or v is None: + return v + return str(v) + +def _normalize_row(obj: Dict[str, Any], keep: List[str]) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for f in keep: + val = obj.get(f) + if isinstance(val, str): + val = val.strip() + out[f] = _norm_scalar(val) + return out + +def _list_to_index(items: Iterable[Dict[str, Any]], uniq: str) -> Dict[Any, Dict[str, Any]]: + ix: Dict[Any, Dict[str, Any]] = {} + for it in items or []: + key = it.get(uniq) + if key is None: + continue + ix[key] = it + return ix + +def _serialize_product_for_compare(p: Product) -> Dict[str, Any]: + root: Dict[str, Any] = {f: _norm_scalar(getattr(p, f)) for f in PRODUCT_FIELDS} + for rel_name, (_Model, fields, uniq) in CHILD_SPECS.items(): + rows: List[Dict[str, Any]] = [] + for child in getattr(p, rel_name) or []: + rows.append({f: _norm_scalar(getattr(child, f)) for f in fields}) + root[rel_name] = _list_to_index(rows, uniq) + return root + +def _serialize_payload_for_compare(payload: Dict[str, Any]) -> Dict[str, Any]: + root = _normalize_row(payload, PRODUCT_FIELDS) + for rel_name, (_Model, fields, uniq) in CHILD_SPECS.items(): + rows = payload.get(rel_name) or [] + rows = [r for r in rows if isinstance(r, dict)] + root[rel_name] = _list_to_index([_normalize_row(r, fields) for r in rows], uniq) + return root + +from 
decimal import Decimal, InvalidOperation + +def _is_numeric_like(x) -> bool: + if isinstance(x, bool): + return False + if isinstance(x, (int, float, Decimal)): + return True + if isinstance(x, str): + s = x.strip() + if not s: + return False + try: + Decimal(s) + return True + except InvalidOperation: + return False + return False + +def _to_decimal(x) -> Decimal: + if isinstance(x, Decimal): + return x + if isinstance(x, bool) or x is None: + raise InvalidOperation + if isinstance(x, (int, str)): + return Decimal(str(x).strip()) + if isinstance(x, float): + return Decimal(str(x)) # avoid float fp artifacts + # last resort: string-coerce + return Decimal(str(x).strip()) + +def values_different(av, bv) -> bool: + # match original None semantics first + if bv is None: + return av is not None + if av is None: + return True + + if _is_numeric_like(bv): + try: + return _to_decimal(av) != _to_decimal(bv) + except InvalidOperation: + # av isn't numeric-parsable → different + return True + else: + # non-numeric: compare as strings (like original) + return f"{av}" != f"{bv}" + +import re + +_cf_a_re = re.compile(r']+/cdn-cgi/l/email-protection#[^"]+"[^>]*>(.*?)', re.I | re.S) +_cf_span_re = re.compile(r']*class="__cf_email__"[^>]*>(.*?)', re.I | re.S) +_cf_data_attr_re = re.compile(r'\sdata-cfemail="[^"]+"', re.I) +_ws_re = re.compile(r'\s+') + +def normalize_cf_email(html: str) -> str: + if not isinstance(html, str): + return html + s = html + # Replace CF spans with their inner text + s = _cf_span_re.sub(r'\1', s) + # Replace CF protection anchors with their inner text + s = _cf_a_re.sub(r'\1', s) + # Drop the data-cfemail attribute if any remains + s = _cf_data_attr_re.sub('', s) + # Optional: collapse whitespace + s = _ws_re.sub(' ', s).strip() + return s + + +def _deep_equal(a: Dict[str, Any], b: Dict[str, Any]) -> bool: + # keys must match at this level + if a.keys() != b.keys(): + return False + + for k in a.keys(): + av, bv = a[k], b[k] + + # Dicts: recurse, but 
don't return early unless it's False + if isinstance(av, dict) and isinstance(bv, dict): + if not _deep_equal(av, bv): + # log_diff(k, av, bv) # optional + return False + continue + + # Lists/Tuples: compare length then elements (order-sensitive here) + if isinstance(av, (list, tuple)) and isinstance(bv, (list, tuple)): + if len(av) != len(bv): + # log_diff(k, av, bv) + return False + for i, (ai, bi) in enumerate(zip(av, bv)): + # nested dicts within lists + if isinstance(ai, dict) and isinstance(bi, dict): + if not _deep_equal(ai, bi): + return False + else: + if values_different(normalize_cf_email(ai), normalize_cf_email(bi)): + return False + continue + + # Scalars / everything else + if values_different(normalize_cf_email(av), normalize_cf_email(bv)): + # print('!!deep', k, av, bv) + return False + + return True + +# ---- Mutation helpers ------------------------------------------------------- + +def _apply_product_fields(p: Product, payload: Dict[str, Any]) -> None: + for f in PRODUCT_FIELDS: + setattr(p, f, payload.get(f)) + p.updated_at = _now_utc() + +def _replace_children(p: Product, payload: Dict[str, Any]) -> None: + # replace each relation wholesale (delete-orphan takes care of removal) + #p.images.clear() + for row in payload.get("images") or []: + p.images.append(ProductImage( + url=row.get("url"), + position=row.get("position") or 0, + kind=row.get("kind") or "gallery", + created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.sections.clear() + for row in payload.get("sections") or []: + p.sections.append(ProductSection( + title=row.get("title") or "", + html=row.get("html") or "", + created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.labels.clear() + for row in payload.get("labels") or []: + p.labels.append(ProductLabel( + name=row.get("name") or "", + created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.stickers.clear() + for row in payload.get("stickers") or []: + p.stickers.append(ProductSticker( + name=row.get("name") or "", + 
created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.attributes.clear() + for row in payload.get("attributes") or []: + p.attributes.append(ProductAttribute( + key=row.get("key") or "", + value=row.get("value"), + created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.nutrition.clear() + for row in payload.get("nutrition") or []: + p.nutrition.append(ProductNutrition( + key=row.get("key") or "", + value=row.get("value"), + unit=row.get("unit"), + created_at=_now_utc(), updated_at=_now_utc(), + )) + + #p.allergens.clear() + for row in payload.get("allergens") or []: + p.allergens.append(ProductAllergen( + name=row.get("name") or "", + contains=bool(row.get("contains", False)), + created_at=_now_utc(), updated_at=_now_utc(), + )) + +async def _create_product_from_payload(session: AsyncSession, payload: Dict[str, Any]) -> Product: + p = Product() + _apply_product_fields(p, payload) + p.created_at = _now_utc() + p.deleted_at = None + session.add(p) + #await session.flush() # get p.id + _replace_children(p, payload) + await session.flush() + return p + +# ---- API -------------------------------------------------------------------- + + +@csrf_exempt +@products_api.post("/listing/") +@clear_cache(tag='browse') +async def capture_lsting(): + data: Dict[str, Any] = await request.get_json(force=True, silent=False) + url = data['url'] + items = data['items'] + total_pages = data['total_pages'] + await _capture_listing(g.s, url,items, total_pages) + return {"ok": True} + + + +@csrf_exempt +@products_api.post("/log/") +@clear_cache(tag='browse') +async def log_product(): + data: Dict[str, Any] = await request.get_json(force=True, silent=False) + ok = bool(data["ok"]) + + payload = data.get("payload") or {} + try: + await _log_product_result(g.s, ok, payload) + return {"ok": True} + except Exception as e: + return {"ok": False} + + +@csrf_exempt +@products_api.post("/redirects/") +@clear_cache(tag='browse') +async def rediects(): + data: Dict[str, str] = await 
request.get_json(force=True, silent=False) + await _save_subcategory_redirects(g.s, data) + return {"ok": True} + + +@csrf_exempt +@products_api.post("/nav/") +@clear_cache(tag='browse') +async def save_nav(): + data: Dict[str, Any] = await request.get_json(force=True, silent=False) + await _save_nav(g.s, data) + return {"ok": True} + + +@csrf_exempt +@products_api.post("/sync/") +@clear_cache(tag='browse') +async def sync_product(): + """ + POST /api/products/sync + Body includes top-level fields and child arrays like: + { + "slug": "my-product", + "title": "...", + "images": [{"url":"https://..","position":0,"kind":"gallery"}], + "sections": [{"title":"Details","html":"

..

"}], + "labels": [{"name":"Vegan"}], + "stickers": [{"name":"Sale"}], + "attributes": [{"key":"Country","value":"UK"}], + "nutrition": [{"key":"Energy","value":"100","unit":"kcal"}], + "allergens": [{"name":"Nuts","contains":true}] + } + """ + payload = await request.get_json(force=True, silent=False) + if not isinstance(payload, dict): + return jsonify({"error": "Invalid JSON"}), 400 + + slug = payload.get("slug") + if not isinstance(slug, str) or not slug: + return jsonify({"error": "Missing 'slug'"}), 400 + + + # find undeleted row by slug + #stmt = select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)) + + stmt = ( + select(Product) + .where(Product.slug == slug, Product.deleted_at.is_(None)) + .options( + selectinload(Product.images), + selectinload(Product.sections), + selectinload(Product.labels), + selectinload(Product.stickers), + selectinload(Product.attributes), + selectinload(Product.nutrition), + selectinload(Product.allergens), + ) + ) + existing: Optional[Product] = (await g.s.execute(stmt)).scalars().first() + + incoming_norm = _serialize_payload_for_compare(payload) + + if existing: + db_norm = _serialize_product_for_compare(existing) + + if _deep_equal(db_norm, incoming_norm): + # Exactly equal → just touch updated_at + existing.updated_at = _now_utc() + await g.s.flush() + return jsonify({"id": existing.id, "action": "touched"}), 200 + + # Different → soft delete old + create a new row + existing.deleted_at = _now_utc() + await g.s.flush() # ensure the soft-delete is persisted before inserting the new row + + new_p = await _create_product_from_payload(g.s, payload) + await g.s.flush() + return jsonify({"id": new_p.id, "action": "replaced"}), 201 + + # Not found → create + new_p = await _create_product_from_payload(g.s, payload) + await g.s.flush() + return jsonify({"id": new_p.id, "action": "created"}), 201 + diff --git a/bp/browse/__init__.py b/bp/browse/__init__.py new file mode 100644 index 0000000..85fd1a5 --- /dev/null +++ 
def register():
    """Build and return the browse blueprint.

    Routes:
      /                       -> market landing page (Ghost CMS post 'market')
      /all/                   -> browse every product
      /<top_slug>/            -> browse a top-level category
      /<top_slug>/<sub_slug>/ -> browse a subcategory

    NOTE(review): the original patch text showed the two category routes as
    "//" and "///" — the `<top_slug>`/`<sub_slug>` converters were stripped
    by the paste; the handler signatures require them, restored here.
    """
    browse_bp = Blueprint("browse", __name__)

    # Mount the product blueprint under browse so both register from one place.
    from .. import register_product
    browse_bp.register_blueprint(register_product())

    def _require_allowed_category(top_slug: str, sub_slug: str | None = None) -> None:
        """404 unless top_slug is a configured category slug and the
        (top, sub) pair is not blacklisted."""
        allowed_slugs = set(config()["categories"]["allow"].values())
        if top_slug not in allowed_slugs or is_category_blocked(top_slug, sub_slug):
            abort(404)

    async def _render_listing(ctx: dict, *slugs):
        """Shared tail of the three listing routes: fetch the product page,
        pick the template for the request type, attach HTMX headers.

        *slugs is () for /all/, (top,) for a category, (top, sub) for a sub.
        """
        product_info = await _productInfo(*slugs)
        full_context = {**product_info, **ctx}

        if not is_htmx_request():
            # Normal browser request: full page with layout.
            html = await render_template("_types/browse/index.html", **full_context)
        elif product_info["page"] > 1:
            # HTMX infinite-scroll pagination: just product cards + sentinel.
            html = await render_template("_types/browse/_product_cards.html", **product_info)
        else:
            # HTMX navigation to page 1: main panel + OOB elements.
            html = await render_template("_types/browse/_oob_elements.html", **full_context)

        resp = await make_response(html)
        resp.headers["Hx-Push-Url"] = _current_url_without_page()
        return _vary(resp)

    @browse_bp.get("/")
    @cache_page(tag="browse")
    async def home():
        """Market landing page: renders the Ghost CMS post with slug='market'."""
        from shared.internal_api import get as api_get

        # Fetch the market post from the coop internal API.
        p_data = await api_get("coop", "/internal/post/market")
        if not p_data:
            abort(404)

        if not is_htmx_request():
            html = await render_template("_types/market/index.html", **p_data)
        else:
            html = await render_template("_types/market/_oob_elements.html", **p_data)
        return await make_response(html)

    @browse_bp.get("/all/")
    @cache_page(tag="browse")
    async def browse_all():
        """Browse all products across all categories."""
        # Original fetched nav here without using it — kept for any side
        # effects inside get_nav (cache warm); TODO confirm it is needed.
        await get_nav(g.s)
        ctx = {
            "category_label": "All Products",
            "top_slug": "all",
            "sub_slug": None,
        }
        return await _render_listing(ctx)

    @browse_bp.get("/<top_slug>/")
    @cache_page(tag="browse")
    async def browse_top(top_slug: str):
        """Browse a top-level category (e.g. /fruit/). 404 if not allowed/blocked."""
        _require_allowed_category(top_slug)
        nav = await get_nav(g.s)
        ctx = category_context(top_slug, None, nav)
        return await _render_listing(ctx, top_slug)

    @browse_bp.get("/<top_slug>/<sub_slug>/")
    @cache_page(tag="browse")
    async def browse_sub(top_slug: str, sub_slug: str):
        """Browse a subcategory (e.g. /fruit/citrus/). 404 if not allowed/blocked."""
        _require_allowed_category(top_slug, sub_slug)
        nav = await get_nav(g.s)
        ctx = category_context(top_slug, sub_slug, nav)
        return await _render_listing(ctx, top_slug, sub_slug)

    return browse_bp
Optional[str] = None) -> bool: + if sub_slug: + return is_category_blocked(top_slug) or _norm(f"{top_slug}/{sub_slug}") in config()["blacklist"]["category"] + return _norm(top_slug) in config()["blacklist"]["category"] diff --git a/bp/browse/services/blacklist/product.py b/bp/browse/services/blacklist/product.py new file mode 100644 index 0000000..8f877aa --- /dev/null +++ b/bp/browse/services/blacklist/product.py @@ -0,0 +1,15 @@ +from typing import Set, Optional +from ..slugs import canonical_html_slug +from config import config + +_blocked: Set[str] = set() +_mtime: Optional[float] = None + +def _norm(slug: str) -> str: + slug = (slug or "").strip().strip("/").lower() + if slug.startswith("product/"): + slug = slug.split("/", 1)[1] + return canonical_html_slug(slug) + +def is_product_blocked(slug: str) -> bool: + return _norm(slug) in config()["blacklist"]["product"] diff --git a/bp/browse/services/blacklist/product_details.py b/bp/browse/services/blacklist/product_details.py new file mode 100644 index 0000000..7207e48 --- /dev/null +++ b/bp/browse/services/blacklist/product_details.py @@ -0,0 +1,11 @@ +import re +from config import config + +def _norm_title_key(t: str) -> str: + t = (t or "").strip().lower() + t = re.sub(r":\s*$", "", t) + t = re.sub(r"\s+", " ", t) + return t + +def is_blacklisted_heading(title: str) -> bool: + return _norm_title_key(title) in [s.lower() for s in config()["blacklist"]["product-details"]] diff --git a/bp/browse/services/cache_backend.py b/bp/browse/services/cache_backend.py new file mode 100644 index 0000000..1b940f6 --- /dev/null +++ b/bp/browse/services/cache_backend.py @@ -0,0 +1,367 @@ +from __future__ import annotations +import os, json +from typing import List, Optional +from config import config +from .blacklist.product import is_product_blocked + + +def _json(path: str): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def fs_nav(): + path = os.path.join(config()["cache"]["fs_root"], "nav.json") 
def fs_nav():
    """Load the cached navigation tree from <fs_root>/nav.json."""
    return _json(os.path.join(config()["cache"]["fs_root"], "nav.json"))


def _brand_of(item: dict) -> str:
    """Brand of a product dict: explicit 'brand' field, falling back to
    info_table['Brand']; empty string when neither is present."""
    brand = (item.get("brand") or "").strip()
    if brand:
        return brand
    try:
        return (item.get("info_table", {}).get("Brand") or "").strip()
    except Exception:
        return ""


def _stickers_of(item: dict) -> List[str]:
    """Lower-cased, stripped sticker names; empty entries dropped."""
    return [s for s in (str(v).strip().lower() for v in (item.get("stickers") or [])) if s]


def fs_product_by_slug(slug: str):
    """Load <fs_root>/products/<slug>.json.

    The slug can originate from a URL, so it is reduced to a bare file
    name first — prevents '../' path traversal out of the products dir.
    """
    name = os.path.basename((slug or "").strip())
    if not name.endswith(".json"):
        name = f"{name}.json"
    return _json(os.path.join(config()["cache"]["fs_root"], "products", name))


def fs_count_products_in_sub(top_slug: str, sub_slug: Optional[str]) -> int:
    """Number of non-blocked products in the (top_slug, sub_slug) listing.

    sub_slug=None means the top-level category listing. Returns 0 when the
    listing file is missing or unreadable.
    """
    parts = ["listings", top_slug] + ([sub_slug] if sub_slug else []) + ["items.json"]
    path = os.path.join(config()["cache"]["fs_root"], *parts)
    if not os.path.exists(path):
        return 0
    try:
        all_slugs = _json(path)
    except Exception:
        return 0
    return sum(1 for s in all_slugs if not is_product_blocked(s))


def fs_products(
    top_slug: str | None,
    sub_slug: str | None,
    selected_brands: Optional[List[str]] = None,
    selected_stickers: Optional[List[str]] = None,
    selected_labels: Optional[List[str]] = None,
    page: int = 1,
    search: Optional[str] = None,
    sort: Optional[str] = None,
    page_size: int = 20,
    liked_slugs: Optional[List[str]] = None,
    liked: Optional[bool] = None,
):
    """Filter, sort and paginate products from the filesystem cache.

    Returns:
        {
          "total_pages": int,                 # after filtering
          "items":       [product dict ...],  # the requested page
          "brands" / "stickers" / "labels":   # facet counts over the WHOLE
                [{"name", "count"}, ...],     #   scope (not the filtered set);
                                              #   selected values pre-seeded at 0
          "liked_count":  int,                # in-scope products in liked_slugs
          "search_count": int,                # in-scope products matching search
        }

    Filters applied to `items`: brand, stickers (AND), labels (AND),
    substring search on description_short, and — when `liked` is truthy —
    membership in `liked_slugs`.
    """
    fs_root = config()["cache"]["fs_root"]

    # ---------- Collect candidate slugs for the scope ----------
    slugs: List[str] = []
    if top_slug:
        parts = ["listings", top_slug] + ([sub_slug] if sub_slug else []) + ["items.json"]
        path = os.path.join(fs_root, *parts)
        if os.path.exists(path):
            try:
                slugs = [s for s in _json(path) if not is_product_blocked(s)]
            except Exception:
                slugs = []
    else:
        # No scope: every product file under products/.
        try:
            for fname in os.listdir(os.path.join(fs_root, "products")):
                if fname.endswith(".json") and not is_product_blocked(fname[:-5]):
                    slugs.append(fname[:-5])
        except FileNotFoundError:
            slugs = []

    # ---------- Load product dicts, skipping unreadable files ----------
    all_items: List[dict] = []
    for slug in slugs:
        try:
            item = fs_product_by_slug(slug)
        except Exception:
            continue
        if isinstance(item, dict):
            all_items.append(item)

    def _title_key(it: dict) -> tuple:
        title = (it.get("title") or it.get("name") or it.get("slug") or "").strip().lower()
        return (title, it.get("slug") or "")

    all_items.sort(key=_title_key)  # deterministic base ordering (name ASC)

    # ---------- Normalised filter inputs ----------
    sel_brands = [(s or "").strip().lower() for s in (selected_brands or []) if (s or "").strip()]
    sel_stickers = [(s or "").strip().lower() for s in (selected_stickers or []) if (s or "").strip()]
    sel_labels = [(s or "").strip().lower() for s in (selected_labels or []) if (s or "").strip()]
    search_q = (search or "").strip().lower() or None
    liked_set = {(s or "").strip().lower() for s in (liked_slugs or []) if (s or "").strip()}

    def _brand_key(it: dict) -> str:
        b = it.get("brand") or (it.get("info_table") or {}).get("Brand")
        return (b or "").strip()

    def _tag_list(it: dict, field: str) -> List[str]:
        # Only string tags count; lower-cased, empties dropped.
        return [s.strip().lower() for s in (it.get(field) or [])
                if isinstance(s, str) and s.strip()]

    def matches_brand(it: dict) -> bool:
        return not sel_brands or _brand_key(it).lower() in sel_brands

    def matches_stickers(it: dict) -> bool:
        return not sel_stickers or set(_tag_list(it, "stickers")).issuperset(sel_stickers)

    def matches_labels(it: dict) -> bool:
        return not sel_labels or set(_tag_list(it, "labels")).issuperset(sel_labels)

    def matches_search(it: dict) -> bool:
        return not search_q or search_q in (it.get("description_short") or "").strip().lower()

    def is_liked(it: dict) -> bool:
        return ((it.get("slug") or "").strip().lower()) in liked_set

    # ---------- Facet counts over the whole scope ----------
    # Selected values are pre-seeded at 0 so they remain visible in the UI
    # even when no in-scope product carries them.
    brand_counts: Dict[str, int] = {b: 0 for b in (selected_brands or [])}
    for it in all_items:
        b = _brand_key(it)
        if b:
            brand_counts[b] = brand_counts.get(b, 0) + 1

    sticker_counts: Dict[str, int] = {s: 0 for s in (selected_stickers or [])}
    for it in all_items:
        for s in _tag_list(it, "stickers"):
            sticker_counts[s] = sticker_counts.get(s, 0) + 1

    label_counts: Dict[str, int] = {s: 0 for s in (selected_labels or [])}
    for it in all_items:
        for s in _tag_list(it, "labels"):
            label_counts[s] = label_counts.get(s, 0) + 1

    liked_count = sum(1 for it in all_items if is_liked(it))
    search_count = sum(1 for it in all_items if matches_search(it))

    # ---------- Apply filters ----------
    filtered = [
        it for it in all_items
        if matches_brand(it)
        and matches_stickers(it)
        and matches_labels(it)
        and matches_search(it)
        and (not liked or is_liked(it))
    ]

    # ---------- Sorting ----------
    sort_mode = (sort or "az").strip().lower()

    def _price_key(it: dict, sign: int = 1):
        # .get(): a product file without regular_price must not crash sorting.
        p = it.get("regular_price")
        title, slug = _title_key(it)
        return (0 if p is not None else 1, sign * (p if p is not None else 0), title, slug)

    _ASC = ("price-asc", "price_asc", "price-low", "price-low-high", "low-high", "lo-hi")
    _DESC = ("price-desc", "price_desc", "price-high", "price-high-low", "high-low", "hi-lo")
    if sort_mode == "za":
        filtered.sort(key=_title_key, reverse=True)
    elif sort_mode in _ASC:
        filtered.sort(key=_price_key)
    elif sort_mode in _DESC:
        filtered.sort(key=lambda it: _price_key(it, sign=-1))
    else:  # "az" and any unrecognised mode
        filtered.sort(key=_title_key)

    # ---------- Pagination ----------
    total_pages = max(1, (len(filtered) + page_size - 1) // page_size)
    page = max(1, page)
    start = (page - 1) * page_size
    page_items = filtered[start:start + page_size]

    # ---------- Format facet lists (count DESC, then name) ----------
    brands_list = sorted(
        ({"name": k, "count": v} for k, v in brand_counts.items()),
        key=lambda x: (-x["count"], x["name"].lower()),
    )
    stickers_list = sorted(
        ({"name": k, "count": v} for k, v in sticker_counts.items()),
        key=lambda x: (-x["count"], x["name"]),
    )
    labels_list = sorted(
        ({"name": k, "count": v} for k, v in label_counts.items()),
        key=lambda x: (-x["count"], x["name"]),
    )
    return {
        "total_pages": total_pages,
        "items": page_items,
        "brands": brands_list,
        "stickers": stickers_list,
        "labels": labels_list,
        "liked_count": liked_count,
        "search_count": search_count,
    }


# ---------- async wrappers ----------
async def read_nav():
    return fs_nav()


async def read_listing(top_slug: str, sub_slug: str | None, page: int):
    # BUG FIX: previously called fs_products(top_slug, sub_slug, None, None, page),
    # which bound `page` to the `selected_labels` parameter — pagination was
    # silently stuck on page 1. Pass it by keyword.
    return fs_products(top_slug, sub_slug, page=page)


async def read_product(slug_or_path: str):
    slug = (slug_or_path or "").strip()
    if "/" in slug:
        slug = slug.rsplit("/", 1)[-1]
    return fs_product_by_slug(slug.split("?", 1)[0])
# ---------- helpers ----------
def _regular_price_of(p: Product) -> Optional[float]:
    """Effective price: regular_price, falling back to special_price;
    None when neither is set or a value fails float conversion."""
    try:
        if p.regular_price is not None:
            return float(p.regular_price)
        if p.special_price is not None:
            return float(p.special_price)
    except Exception:
        return None
    return None


# ---------- NAV ----------
async def db_nav(session) -> Dict:
    """Navigation tree from NavTop/NavSub.

    Returns {"cats": {top_name: {"label", "name", "slug", "subs": [...]}}},
    subs sorted alphabetically. "name" duplicates "label" for callers that
    still expect the old key.
    """
    tops = (await session.execute(select(NavTop))).scalars().all()
    subs = (await session.execute(select(NavSub))).scalars().all()

    subs_by_top: Dict[int, List[Dict]] = {}
    for sub in subs:
        subs_by_top.setdefault(sub.top_id, []).append({
            "label": sub.label,
            "name": (sub.label or sub.slug or "").strip(),  # back-compat alias
            "slug": sub.slug,
            "href": sub.href,
        })

    cats: Dict[str, Dict] = {}
    for top in tops:
        top_name = (top.label or top.slug or "").strip()
        cats[top_name] = {
            "label": top.label,
            "name": top_name,  # back-compat alias
            "slug": top.slug,
            "subs": sorted(subs_by_top.get(top.id, []),
                           key=lambda d: (d["name"] or "").lower()),
        }
    return {"cats": cats}


def _liked_label(user_id):
    """Selectable boolean column: does user_id have an undeleted like on the row."""
    liked_slugs = select(ProductLike.product_slug).where(
        and_(ProductLike.user_id == user_id, ProductLike.deleted_at.is_(None))
    )
    return case((Product.slug.in_(liked_slugs), True), else_=False).label("is_liked")


def _with_undeleted_relations(q):
    """Eager-load every child collection, excluding soft-deleted rows."""
    return q.options(
        selectinload(Product.images.and_(ProductImage.deleted_at.is_(None))),
        selectinload(Product.sections.and_(ProductSection.deleted_at.is_(None))),
        selectinload(Product.labels.and_(ProductLabel.deleted_at.is_(None))),
        selectinload(Product.stickers.and_(ProductSticker.deleted_at.is_(None))),
        selectinload(Product.attributes.and_(ProductAttribute.deleted_at.is_(None))),
        selectinload(Product.nutrition.and_(ProductNutrition.deleted_at.is_(None))),
        selectinload(Product.allergens.and_(ProductAllergen.deleted_at.is_(None))),
    )


def _serialize_product(p: Product, is_liked) -> dict:
    """Full JSON-ready dict for one product row.

    Shared by db_product_full and db_product_full_id, which previously
    duplicated this serialization verbatim.
    """
    # Gallery keeps the original composite sort; embedded/all sort by position.
    gallery = [
        img.url
        for img in sorted(p.images, key=lambda i: (i.kind or "gallery", i.position or 0))
        if (img.kind or "gallery") == "gallery"
    ]
    by_pos = sorted(p.images, key=lambda i: i.position or 0)
    embedded = [img.url for img in by_pos if (img.kind or "") == "embedded"]
    all_imgs = [img.url for img in by_pos if (img.kind or "") == "all"]
    return {
        "id": p.id,
        "slug": p.slug,
        "title": p.title,
        "brand": p.brand,
        "image": p.image,
        "description_short": p.description_short,
        "description_html": p.description_html,
        "suma_href": p.suma_href,
        "rrp": float(p.rrp) if p.rrp is not None else None,
        "special_price": float(p.special_price) if p.special_price is not None else None,
        "special_price_raw": p.special_price_raw,
        "special_price_currency": p.special_price_currency,
        "regular_price": _regular_price_of(p),
        "regular_price_raw": p.regular_price_raw,
        "regular_price_currency": p.regular_price_currency,
        "rrp_raw": p.rrp_raw,
        "rrp_currency": p.rrp_currency,
        "price_per_unit_raw": p.price_per_unit_raw,
        "price_per_unit": p.price_per_unit,
        "price_per_unit_currency": p.price_per_unit_currency,
        "oe_list_price": p.oe_list_price,
        "images": gallery,
        "embedded_image_urls": embedded,
        "all_image_urls": all_imgs,
        "sections": [{"title": s.title, "html": s.html} for s in p.sections],
        "stickers": [v.name.strip().lower() for v in p.stickers if v.name],
        "labels": [v.name for v in p.labels if v.name],
        "ean": p.ean,
        "sku": p.sku,
        "unit_size": p.unit_size,
        "pack_size": p.pack_size,
        "case_size_raw": p.case_size_raw,
        "case_size_count": p.case_size_count,
        "case_size_item_qty": p.case_size_item_qty,
        "case_size_item_unit": p.case_size_item_unit,
        "info_table": {a.key: a.value for a in p.attributes if a.key},
        "nutrition": [{"key": n.key, "value": n.value, "unit": n.unit} for n in p.nutrition if n.key],
        "allergens": [{"name": a.name, "contains": a.contains} for a in p.allergens if a.name],
        "is_liked": is_liked,
        "deleted_at": p.deleted_at,
    }


async def db_product_full(session, slug: str, user_id=0) -> Optional[dict]:
    """Load one undeleted product by slug, with like-state for user_id.

    Note: the previous liked-CASE also re-checked Product.deleted_at, which
    the WHERE clause already guarantees — dropped as redundant (no behavior
    change) and to match db_product_full_id.
    """
    q = _with_undeleted_relations(
        select(Product, _liked_label(user_id))
        .where(Product.slug == slug, Product.deleted_at.is_(None))
    )
    row = (await session.execute(q)).first()
    return _serialize_product(*row) if row else None


async def db_product_full_id(session, id: int, user_id=0) -> Optional[dict]:
    """Load one product by primary key (soft-deleted rows included, matching
    the original query), with like-state for user_id."""
    q = _with_undeleted_relations(
        select(Product, _liked_label(user_id)).where(Product.id == id)
    )
    row = (await session.execute(q)).first()
    return _serialize_product(*row) if row else None
async def db_products_nocounts(
    session,
    top_slug: str | None,
    sub_slug: str | None,
    selected_brands: Optional[List[str]] = None,
    selected_stickers: Optional[List[str]] = None,
    selected_labels: Optional[List[str]] = None,
    page: int = 1,
    search: Optional[str] = None,
    sort: Optional[str] = None,
    page_size: int = 20,
    liked: Optional[bool] = None,
    user_id: int = 0,
) -> Dict:
    """One page of products for a listing scope, filtered and sorted.

    Facet counts are deliberately NOT computed here — db_products_counts
    does that, and db_products merges the two results.

    Returns {"total_pages": int, "items": [dict, ...]}.
    """
    blocked = set(config().get("blacklist", {}).get("product", []) or [])
    base_conditions = []
    if blocked:
        base_conditions.append(~Product.slug.in_(blocked))

    if top_slug:
        # Resolve the Listing for (top, sub); sub_slug=None means the
        # top-level listing (sub_id IS NULL — hence the outer join, whose
        # NULL row also satisfies NavSub.deleted_at IS NULL vacuously).
        q_list = (
            select(Listing.id)
            .join(NavTop, Listing.top)
            .outerjoin(NavSub, Listing.sub)
            .where(
                Listing.deleted_at.is_(None),
                NavTop.deleted_at.is_(None),
                NavTop.slug == top_slug,
                NavSub.deleted_at.is_(None),
                NavSub.slug == sub_slug if sub_slug else Listing.sub_id.is_(None),
            )
        )
        listing_id = (await session.execute(q_list)).scalars().first()
        if not listing_id:
            return {"total_pages": 1, "items": []}
        base_conditions.append(Product.slug.in_(
            select(ListingItem.slug).where(
                ListingItem.listing_id == listing_id,
                ListingItem.deleted_at.is_(None),
            )
        ))

    base_ids = (await session.execute(
        select(Product.id).where(*base_conditions, Product.deleted_at.is_(None))
    )).scalars().all()
    if not base_ids:
        return {"total_pages": 1, "items": []}

    sel_brands = [(b or "").strip().lower() for b in (selected_brands or []) if (b or "").strip()]
    sel_stickers = [(s or "").strip().lower() for s in (selected_stickers or []) if (s or "").strip()]
    sel_labels = [(l or "").strip().lower() for l in (selected_labels or []) if (l or "").strip()]
    search_q = (search or "").strip().lower()

    # Built once and reused for both the liked filter and the is_liked
    # column (the original built the same subquery twice, with a stray
    # `liked_subq = liked_subq = ...` double assignment).
    liked_slugs_subq = select(ProductLike.product_slug).where(
        and_(ProductLike.user_id == user_id, ProductLike.deleted_at.is_(None))
    )

    filter_conditions = []
    if sel_brands:
        filter_conditions.append(func.lower(Product.brand).in_(sel_brands))
    # AND semantics: a product must carry EVERY selected sticker/label.
    for sticker_name in sel_stickers:
        filter_conditions.append(Product.stickers.any(and_(
            func.lower(ProductSticker.name) == sticker_name,
            ProductSticker.deleted_at.is_(None),
        )))
    for label_name in sel_labels:
        filter_conditions.append(Product.labels.any(and_(
            func.lower(ProductLabel.name) == label_name,
            ProductLabel.deleted_at.is_(None),
        )))
    if search_q:
        filter_conditions.append(func.lower(Product.description_short).contains(search_q))
    if liked:
        filter_conditions.append(Product.slug.in_(liked_slugs_subq))

    total_filtered = (await session.execute(
        select(func.count(Product.id)).where(Product.id.in_(base_ids), *filter_conditions)
    )).scalars().one()
    total_pages = max(1, (total_filtered + page_size - 1) // page_size)
    page = max(1, page)

    is_liked_col = case(
        (Product.slug.in_(liked_slugs_subq), True), else_=False
    ).label("is_liked")

    q = (
        select(Product, is_liked_col)
        .where(Product.id.in_(base_ids), *filter_conditions)
        .options(
            selectinload(Product.images),
            selectinload(Product.sections),
            selectinload(Product.labels),
            selectinload(Product.stickers),
            selectinload(Product.attributes),
            selectinload(Product.nutrition),
            selectinload(Product.allergens),
        )
    )

    # ---------- ordering ----------
    sort_mode = (sort or "az").strip().lower()
    nulls_last = case((Product.regular_price.is_(None), 1), else_=0)  # NULL prices sort last
    if sort_mode == "za":
        q = q.order_by(func.lower(Product.title).desc(), Product.slug.desc())
    elif sort_mode in ("price-asc", "price_asc", "price-low", "price-low-high", "low-high", "lo-hi"):
        q = q.order_by(nulls_last, Product.regular_price.asc(), func.lower(Product.title), Product.slug)
    elif sort_mode in ("price-desc", "price_desc", "price-high", "price-high-low", "high-low", "hi-lo"):
        q = q.order_by(nulls_last, Product.regular_price.desc(), func.lower(Product.title), Product.slug)
    else:  # "az" and any unrecognised mode
        q = q.order_by(func.lower(Product.title), Product.slug)

    q = q.offset((page - 1) * page_size).limit(page_size)
    rows = (await session.execute(q)).all()

    items: List[Dict] = []
    for p, is_liked in rows:
        gallery = [
            img.url
            for img in sorted(p.images, key=lambda i: (i.kind or "gallery", i.position or 0))
            if (img.kind or "gallery") == "gallery"
        ]
        by_pos = sorted(p.images, key=lambda i: i.position or 0)
        embedded = [img.url for img in by_pos if (img.kind or "") == "embedded"]
        all_imgs = [img.url for img in by_pos if (img.kind or "") == "all"]

        items.append({
            "slug": p.slug,
            "title": p.title,
            "brand": p.brand,
            "description_short": p.description_short,
            "description_html": p.description_html,
            "image": p.image,
            "rrp": float(p.rrp) if p.rrp is not None else None,
            "special_price": float(p.special_price) if p.special_price is not None else None,
            "special_price_raw": p.special_price_raw,
            "special_price_currency": p.special_price_currency,
            "regular_price": _regular_price_of(p),
            "regular_price_raw": p.regular_price_raw,
            "regular_price_currency": p.regular_price_currency,
            "rrp_raw": p.rrp_raw,
            "rrp_currency": p.rrp_currency,
            "price_per_unit_raw": p.price_per_unit_raw,
            "price_per_unit": p.price_per_unit,
            "price_per_unit_currency": p.price_per_unit_currency,
            "images": gallery,
            "embedded_image_urls": embedded,
            "all_image_urls": all_imgs,
            "sections": [{"title": s.title, "html": s.html} for s in p.sections],
            "labels": [l.name for l in p.labels if l.name],
            "stickers": [s.name.strip().lower() for s in p.stickers if s.name],
            "info_table": {a.key: a.value for a in p.attributes if a.key},
            "nutrition": [{"key": n.key, "value": n.value, "unit": n.unit} for n in p.nutrition if n.key],
            "allergens": [{"name": a.name, "contains": a.contains} for a in p.allergens if a.name],
            "ean": p.ean,
            "sku": p.sku,
            "unit_size": p.unit_size,
            "pack_size": p.pack_size,
            "is_liked": is_liked,
        })

    return {"total_pages": total_pages, "items": items}
Product.deleted_at.is_(None)) + )).scalars().all() + else: + return { + "brands": [], + "stickers": [], + "labels": [], + "liked_count": 0, + "search_count": 0, + } + + brands_list: List[Dict] = [] + stickers_list: List[Dict] = [] + labels_list: List[Dict] = [] + liked_count = 0 + search_count = 0 + liked_product_slugs_subq = ( + select(ProductLike.product_slug) + .where(ProductLike.user_id == user_id, ProductLike.deleted_at.is_(None)) + ) + liked_count = await session.scalar( + select(func.count(Product.id)) + .where( + Product.id.in_(base_ids), + Product.slug.in_(liked_product_slugs_subq), + Product.deleted_at.is_(None) + ) + ) + + liked_count = (await session.execute( + select(func.count()) + .select_from(ProductLike) + .where( + ProductLike.user_id == user_id, + ProductLike.product_slug.in_( + select(Product.slug).where(Product.id.in_(base_ids)) + ), + ProductLike.deleted_at.is_(None) + ) + )).scalar_one() if user_id else 0 + + # Brand counts + brand_count_rows = await session.execute( + select(Product.brand, func.count(Product.id)) + .where(Product.id.in_(base_ids), + Product.brand.is_not(None), + func.trim(Product.brand) != "", + Product.deleted_at.is_(None) + ) + .group_by(Product.brand) + ) + for brand_name, count in brand_count_rows: + brands_list.append({"name": brand_name, "count": count}) + brands_list.sort(key=lambda x: (-x["count"], x["name"].lower())) + + # Sticker counts + sticker_count_rows = await session.execute( + select(ProductSticker.name, func.count(ProductSticker.product_id)) + .where( + ProductSticker.product_id.in_(base_ids), + ProductSticker.deleted_at.is_(None) + ) + .group_by(ProductSticker.name) + ) + for sticker_name, count in sticker_count_rows: + if sticker_name: + stickers_list.append({"name": sticker_name.strip().lower(), "count": count}) + stickers_list.sort(key=lambda x: (-x["count"], x["name"])) + + # Label counts + label_count_rows = await session.execute( + select(ProductLabel.name, func.count(ProductLabel.product_id)) + 
.where( + ProductLabel.product_id.in_(base_ids), + ProductLabel.deleted_at.is_(None) + ) + .group_by(ProductLabel.name) + ) + for label_name, count in label_count_rows: + if label_name: + labels_list.append({"name": label_name, "count": count}) + labels_list.sort(key=lambda x: (-x["count"], x["name"])) + + + # Search count + search_q = (search or "").strip().lower() + if search_q: + search_count = (await session.execute( + select(func.count(Product.id)) + .where( + Product.id.in_(base_ids), + func.lower(Product.description_short).contains(search_q), + Product.deleted_at.is_(None) + ) + )).scalars().one() + else: + search_count = len(base_ids) + + return { + "brands": brands_list, + "stickers": stickers_list, + "labels": labels_list, + "liked_count": liked_count, + "search_count": search_count, + } + +async def db_products( + session, + top_slug: str | None, + sub_slug: str | None, + selected_brands: Optional[List[str]] = None, + selected_stickers: Optional[List[str]] = None, + selected_labels: Optional[List[str]] = None, + page: int = 1, + search: Optional[str] = None, + sort: Optional[str] = None, + page_size: int = 20, + liked: bool = None, + user_id: int=0 +) -> Dict: + return { + **(await db_products_nocounts( + session, + top_slug=top_slug, + sub_slug=sub_slug, + selected_brands=selected_brands, + selected_stickers=selected_stickers, + selected_labels=selected_labels, + page=page, + search=search, + sort=sort, + page_size=page_size, + liked=liked, + user_id=user_id + )), + **(await db_products_counts( + session, + top_slug=top_slug, + sub_slug=sub_slug, + search=search, + user_id=user_id + )), + } + + diff --git a/bp/browse/services/nav.py b/bp/browse/services/nav.py new file mode 100644 index 0000000..6a3a901 --- /dev/null +++ b/bp/browse/services/nav.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +import time +import re +from typing import Dict, List, Tuple, Optional +from urllib.parse import urlparse, urljoin + +from config import config +from . 
# ------------------ Caches ------------------
# Module-level nav cache; written by get_nav.  NOTE(review): nothing in this
# module reads the cache back before rebuilding — confirm whether the TTL
# machinery is still wanted or can be dropped.
_nav_cache: Dict = {}
_nav_cache_ts: float = 0.0
_nav_ttl_seconds = 60 * 60 * 6  # 6 hours


def _now() -> float:
    """Wall-clock seconds; prefers a shared ``now()`` helper when one is in
    scope, otherwise falls back to time.time()."""
    try:
        return now()  # type: ignore[name-defined]
    except Exception:
        return time.time()


def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """
    For a link like ``/<top>/<sub>.html`` return the subcategory slug with the
    extension stripped, when the first path segment matches ``top_slug``
    case-insensitively; otherwise None.
    """
    p = urlparse(href)
    parts = [x for x in (p.path or "").split("/") if x]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        # Bug fix: the endswith() guard is case-insensitive, but the old
        # pattern r"\.(html?|HTML?)$" missed mixed-case suffixes such as
        # ".Html".  Strip the extension case-insensitively instead.
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.html?$", "", sub, flags=re.I)
        return sub
    return None


def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """
    Build the nav structure from a slug -> [(anchor text, href), ...] map,
    keyed by the allow-listed category labels from config, with the category
    blacklist applied.
    """
    # Deferred import: keeps this module importable without the blacklist
    # package (and lets the pure URL helpers be exercised in isolation).
    from .blacklist.category import is_category_blocked  # noqa: F401

    nav = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                # No product counts in this path yet.
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return _apply_category_blacklist(nav)


async def get_nav(session) -> Dict[str, Dict]:
    """
    Return the navigation structure from the snapshot DB with the category
    blacklist applied; the unfiltered structure is stored in the module cache.
    """
    global _nav_cache, _nav_cache_ts

    # Deferred import: avoids pulling the DB backend in at module import time.
    from . import db_backend as cb

    nav = await cb.db_nav(session)

    # NOTE(review): per-subcategory product counts used to be injected here;
    # that code was commented out and has now been removed, so subs carry no
    # "count" key from this path.
    _nav_cache = nav
    _nav_cache_ts = _now()

    return _apply_category_blacklist(nav)


def category_context(top_slug: Optional[str], sub_slug: Optional[str], nav: Dict[str, Dict]):
    """Build template context for a category/subcategory page."""

    def _order_subs_selected_first(subs, sub_slug: str | None):
        """Return subs with the selected subcategory (by slug) first.

        NOTE(review): the comparison lowers sub_slug but not s['slug'] —
        assumes stored slugs are already lower-case; confirm.
        """
        if not subs or not sub_slug:
            return subs
        head = [s for s in subs if sub_slug.lower() == s['slug']]
        tail = [s for s in subs if sub_slug.lower() != s['slug']]
        return head + tail

    REVERSE_CATEGORY = {v: k for k, v in config()["categories"]["allow"].items()}
    label = REVERSE_CATEGORY.get(top_slug)
    cat = nav["cats"].get(label) or {}

    top_suma_href = cat.get("href") or urljoin(config()["base_url"], f"/{top_slug}")
    top_local_href = f"{top_slug}"

    # Total products in this top-level category (all subs / top-level listing).
    top_count = cat.get("count", 0)

    subs = [
        {
            "name": s["name"],
            "slug": s.get("slug"),
            "local_href": f"{top_slug}/{s.get('slug')}",
            "suma_href": s["href"],
            "count": s.get("count", 0),  # per-subcategory product count
        }
        for s in cat.get("subs", [])
    ]

    return {
        "category_label": label,
        "top_slug": top_slug,
        "sub_slug": sub_slug,
        "top_suma_href": top_suma_href,
        "top_local_href": top_local_href,
        # Expose total count for the parent category.
        "top_count": top_count,
        # List of subcategories, selected one first, each with its own count.
        "subs_local": _order_subs_selected_first(subs, sub_slug),
    }


def _apply_category_blacklist(nav: Dict[str, Dict]) -> Dict[str, Dict]:
    """Drop blacklisted top categories and filter blacklisted subs, keeping
    everything else (including counts) intact."""
    from .blacklist.category import is_category_blocked  # deferred import

    out = {"cats": {}}
    for label, data in nav.get("cats", {}).items():
        top = (data or {}).get("slug")
        if not top or is_category_blocked(top):
            continue
        subs = [
            s for s in (data.get("subs") or [])
            if s.get("slug") and not is_category_blocked(top, s.get("slug"))
        ]
        out["cats"][label] = {**data, "subs": subs}
    return out
from . import db_backend as cb

from quart import g


async def products(
    list_url: str,
    selected_brands: Optional[List[str]] = None,
    selected_stickers: Optional[List[str]] = None,
    selected_labels: Optional[List[str]] = None,
    page: int = 1,
    search: Optional[str] = None,
    sort: Optional[str] = None,
    liked: Optional[bool] = None,
    user_id: Optional[int] = None,
):
    """
    Query a category/subcategory product page plus facet data.

    ``list_url`` is parsed as ``/<top>/<sub>``.  Returns a 7-tuple:
    (items, brands, stickers, labels, total_pages, liked_count, search_count).

    NOTE(review): the ``user_id`` parameter is currently shadowed by
    ``g.user`` below — confirm and unify with callers.
    """
    p = urlparse(list_url)
    parts = [x for x in (p.path or "").split("/") if x]
    top = parts[0] if parts else None
    sub = parts[1] if len(parts) >= 2 else None

    if is_category_blocked(top, sub):
        # Bug fix: the success path returns SEVEN values, but this early
        # return used to yield only five, so callers unpacking the result
        # raised ValueError for blacklisted categories.
        return [], [], [], [], 1, 0, 0

    data = await cb.db_products(
        g.s,
        top,
        sub,
        selected_brands,
        selected_stickers,
        selected_labels,
        page,
        search,
        sort,
        liked=liked,
        user_id=g.user.id if g.user else 0,
    )
    items = data.get("items", []) or []
    brands = data.get("brands", []) or []
    stickers = data.get("stickers", []) or []
    labels = data.get("labels", []) or []
    total_pages = int(data.get("total_pages", 1) or 1)

    # Track known product slugs (best-effort; never let bookkeeping fail
    # the request).
    for it in items:
        try:
            slug = it.get("slug")
            if slug:
                KNOWN_PRODUCT_SLUGS.add(slug)
        except Exception:
            pass

    return items, brands, stickers, labels, total_pages, data.get("liked_count"), data.get("search_count")


async def products_nocounts(
    session,
    list_url: str,
    selected_brands: Optional[List[str]] = None,
    selected_stickers: Optional[List[str]] = None,
    selected_labels: Optional[List[str]] = None,
    page: int = 1,
    search: Optional[str] = None,
    sort: Optional[str] = None,
    liked: Optional[bool] = None,
    user_id: Optional[int] = None,
):
    """
    Cheaper variant of :func:`products` that skips facet counts.
    Returns (items, total_pages).

    NOTE(review): as in products(), ``user_id`` is shadowed by ``g.user``.
    """
    p = urlparse(list_url)
    parts = [x for x in (p.path or "").split("/") if x]
    top = parts[0] if parts else None
    sub = parts[1] if len(parts) >= 2 else None

    if is_category_blocked(top, sub):
        # Bug fix: this function returns a 2-tuple on success, but the early
        # return used to yield five values — ValueError for blocked categories.
        return [], 1

    data = await cb.db_products_nocounts(
        session,
        top,
        sub,
        selected_brands,
        selected_stickers,
        selected_labels,
        page,
        search,
        sort,
        liked=liked,
        user_id=g.user.id if g.user else 0,
    )
    items = data.get("items", []) or []
    total_pages = int(data.get("total_pages", 1) or 1)

    # Track known product slugs (best-effort).
    for it in items:
        try:
            slug = it.get("slug")
            if slug:
                KNOWN_PRODUCT_SLUGS.add(slug)
        except Exception:
            pass

    return items, total_pages


# --- bp/browse/services/services.py ---
# NOTE: `from __future__ import annotations` is the first statement of
# services.py and must stay file-leading.
from urllib.parse import urljoin

from quart import (
    g,
    request,
)
from config import config
from .products import products, products_nocounts
from .blacklist.product_details import is_blacklisted_heading

from utils import host_url

from sqlalchemy import select
from models import ProductLike
from ...market.filters.qs import decode


def _hx_fragment_request() -> bool:
    """True when the request was issued by HTMX (fragment navigation)."""
    return request.headers.get("HX-Request", "").lower() == "true"
+ """ + + q = decode() + page, search, sort = q.page, q.search, q.sort + selected_brands, selected_stickers, selected_labels = q.selected_brands, q.selected_stickers, q.selected_labels + liked = q.liked + + if top_slug is not None and sub_slug is not None: + list_url = urljoin(config()["base_url"], f"/{top_slug}/{sub_slug}") + else: + if top_slug is not None: + list_url = top_slug + else: + list_url = "" + if not _hx_fragment_request() or page==1: + items, brands, stickers, labels, total_pages, liked_count, search_count = await products( + list_url, + selected_brands=selected_brands, + selected_stickers=selected_stickers, + selected_labels=selected_labels, + page=page, + search=search, + sort=sort, + user_id=g.user.id if g.user else None, + liked = liked, + ) + + brands_ordered = _order_brands_selected_first(brands, selected_brands) + + return { + "products": items, + "page": page, + "search": search, + "sort": sort, + "total_pages": int(total_pages or 1), + "brands": brands_ordered, + "selected_brands": selected_brands, + "stickers": stickers, + "selected_stickers": selected_stickers, + "labels": labels, + "selected_labels": selected_labels, + "liked": liked, + "liked_count": liked_count, + "search_count": search_count + } + else: + items, total_pages = await products_nocounts( + g.s, + list_url, + selected_brands=selected_brands, + selected_stickers=selected_stickers, + selected_labels=selected_labels, + page=page, + search=search, + sort=sort, + user_id=g.user.id if g.user else None, + liked = liked, + ) + return { + "products": items, + "page": page, + "search": search, + "sort": sort, + "total_pages": int(total_pages or 1), + } + + +def _order_brands_selected_first(brands, selected): + """Return brands with the selected brand(s) first.""" + if not brands or not selected: + return brands + sel = [(s or "").strip() for s in selected] + head = [s for s in brands if (s.get("name") or "").strip() in sel] + tail = [s for s in brands if (s.get("name") or "").strip() 
not in sel] + return head + tail + + +def _order_stickers_selected_first( + stickers: list[dict], selected_stickers: list[str] | None +): + if not stickers or not selected_stickers: + return stickers + sel = [(s or "").strip().lower() for s in selected_stickers] + head = [s for s in stickers if (s.get("name") or "").strip().lower() in sel] + tail = [ + s + for s in stickers + if (s.get("name") or "").strip().lower() not in sel + ] + return head + tail + + +def _order_labels_selected_first( + labels: list[dict], selected_labels: list[str] | None +): + if not labels or not selected_labels: + return labels + sel = [(s or "").strip().lower() for s in selected_labels] + head = [s for s in labels if (s.get("name") or "").strip().lower() in sel] + tail = [ + s + for s in labels + if (s.get("name") or "").strip().lower() not in sel + ] + return head + tail + +def _massage_product(d): + """ + Normalise the product dict for templates: + - inject APP_ROOT into HTML + - drop blacklisted sections + """ + massaged = { + **d, + "description_html": d["description_html"].replace( + "[**__APP_ROOT__**]", g.root + ), + "sections": [ + { + **section, + "html": section["html"].replace( + "[**__APP_ROOT__**]", g.root + ), + } + for section in d["sections"] + if not is_blacklisted_heading(section["title"]) + ], + } + return massaged + + +# Re-export from canonical shared location +from shared.http_utils import vary as _vary, current_url_without_page as _current_url_without_page + +async def _is_liked(user_id: int | None, slug: str) -> bool: + """ + Check if this user has liked this product. 
+ """ + if not user_id: + return False + # because ProductLike has composite PK (user_id, product_slug), + # we can fetch it by primary key dict: + row = await g.s.execute( + select(ProductLike).where( + ProductLike.user_id == user_id, + ProductLike.product_slug == slug, + ) + ) + row.scalar_one_or_none() + return row is not None + + diff --git a/bp/browse/services/slugs.py b/bp/browse/services/slugs.py new file mode 100644 index 0000000..f23c0c7 --- /dev/null +++ b/bp/browse/services/slugs.py @@ -0,0 +1,24 @@ +import re +from urllib.parse import urljoin, urlparse +from config import config + +def product_slug_from_href(href: str) -> str: + p = urlparse(href) + parts = [x for x in p.path.split("/") if x] + if not parts: + return "" + last = parts[-1] + if last.endswith(".html"): + last = last[:-5] + elif last.endswith(".htm"): + last = last[:-4] + last = re.sub(r"-(html|htm)+$", "", last, flags=re.I) + return f"{last}-html" + +def canonical_html_slug(slug: str) -> str: + base = re.sub(r"-(html|htm)+$", "", slug, flags=re.I) + return f"{base}-html" + +def suma_href_from_html_slug(slug: str) -> str: + canon = canonical_html_slug(slug) + return urljoin(config()["base_url"], f"/{canon}.html") diff --git a/bp/browse/services/state.py b/bp/browse/services/state.py new file mode 100644 index 0000000..2ad0495 --- /dev/null +++ b/bp/browse/services/state.py @@ -0,0 +1,21 @@ +from typing import Dict, Tuple, List +import time + +_nav_cache: dict = {} +_nav_cache_ts: float = 0.0 +_nav_ttl_seconds = 60 * 60 * 6 + +_detail_cache: Dict[str, Dict] = {} +_detail_cache_ts: Dict[str, float] = {} +_detail_ttl_seconds = 60 * 60 * 6 + +KNOWN_PRODUCT_SLUGS: set[str] = set() + +_listing_variant_cache: Dict[str, Tuple[str, float]] = {} +_listing_variant_ttl = 60 * 60 * 6 + +_listing_page_cache: Dict[str, Tuple[Tuple[List[Dict], int], float]] = {} +_listing_page_ttl = 60 * 30 + +def now() -> float: + return time.time() diff --git a/bp/market/__init__.py b/bp/market/__init__.py new file mode 
100644 index 0000000..85fd1a5 --- /dev/null +++ b/bp/market/__init__.py @@ -0,0 +1,7 @@ +from __future__ import annotations + +# create the blueprint at package import time +from .routes import register # = Blueprint("browse_bp", __name__) + +# import routes AFTER browse_bp is defined so routes can attach to it +from . import routes # noqa: F401 diff --git a/bp/market/admin/__init__.py b/bp/market/admin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bp/market/admin/routes.py b/bp/market/admin/routes.py new file mode 100644 index 0000000..ccea191 --- /dev/null +++ b/bp/market/admin/routes.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from quart import ( + render_template, make_response, Blueprint +) + + +from suma_browser.app.authz import require_admin + + +def register(): + bp = Blueprint("admin", __name__, url_prefix='/admin') + + # ---------- Pages ---------- + @bp.get("/") + @require_admin + async def admin(): + from suma_browser.app.utils.htmx import is_htmx_request + + # Determine which template to use based on request type + if not is_htmx_request(): + # Normal browser request: full page with layout + html = await render_template("_types/market/admin/index.html") + else: + html = await render_template("_types/market/admin/_oob_elements.html") + + return await make_response(html) + return bp diff --git a/bp/market/filters/__init__.py b/bp/market/filters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bp/market/filters/qs.py b/bp/market/filters/qs.py new file mode 100644 index 0000000..c8b0949 --- /dev/null +++ b/bp/market/filters/qs.py @@ -0,0 +1,101 @@ +from quart import request + +from typing import Iterable, Optional, Union + +from suma_browser.app.filters.qs_base import ( + KEEP, _norm, make_filter_set, build_qs, +) +from suma_browser.app.filters.query_types import MarketQuery + + +def decode() -> MarketQuery: + page = int(request.args.get("page", 1)) + search = request.args.get("search") + sort = 
request.args.get("sort") + liked = request.args.get("liked") + + selected_brands = tuple(s.strip() for s in request.args.getlist("brand") if s.strip()) + selected_stickers = tuple(s.strip().lower() for s in request.args.getlist("sticker") if s.strip()) + selected_labels = tuple(s.strip().lower() for s in request.args.getlist("label") if s.strip()) + + return MarketQuery(page, search, sort, selected_brands, selected_stickers, selected_labels, liked) + + +def makeqs_factory(): + """ + Build a makeqs(...) that starts from the current filters + page. + Auto-resets page to 1 when filters change unless you pass page explicitly. + """ + q = decode() + base_stickers = [s for s in q.selected_stickers if (s or "").strip()] + base_labels = [s for s in q.selected_labels if (s or "").strip()] + base_brands = [s for s in q.selected_brands if (s or "").strip()] + base_search = q.search or None + base_liked = q.liked or None + base_sort = q.sort or None + base_page = int(q.page or 1) + + def makeqs( + *, + clear_filters: bool = False, + add_sticker: Union[str, Iterable[str], None] = None, + remove_sticker: Union[str, Iterable[str], None] = None, + add_label: Union[str, Iterable[str], None] = None, + remove_label: Union[str, Iterable[str], None] = None, + add_brand: Union[str, Iterable[str], None] = None, + remove_brand: Union[str, Iterable[str], None] = None, + search: Union[str, None, object] = KEEP, + sort: Union[str, None, object] = KEEP, + page: Union[int, None, object] = None, + extra: Optional[Iterable[tuple]] = None, + leading_q: bool = True, + liked: Union[bool, None, object] = KEEP, + ) -> str: + stickers = make_filter_set(base_stickers, add_sticker, remove_sticker, clear_filters) + labels = make_filter_set(base_labels, add_label, remove_label, clear_filters) + brands = make_filter_set(base_brands, add_brand, remove_brand, clear_filters) + + final_search = None if clear_filters else base_search if search is KEEP else ((search or "").strip() or None) + final_sort = 
base_sort if sort is KEEP else (sort or None) + final_liked = None if clear_filters else base_liked if liked is KEEP else liked + + # Did filters change? + filters_changed = ( + set(map(_norm, stickers)) != set(map(_norm, base_stickers)) + or set(map(_norm, labels)) != set(map(_norm, base_labels)) + or set(map(_norm, brands)) != set(map(_norm, base_brands)) + or final_search != base_search + or final_sort != base_sort + or final_liked != base_liked + ) + + # Page logic + if page is KEEP: + final_page = 1 if filters_changed else base_page + else: + final_page = page + + # Build params + params = [] + for s in stickers: + params.append(("sticker", s)) + for s in labels: + params.append(("label", s)) + for s in brands: + params.append(("brand", s)) + if final_search: + params.append(("search", final_search)) + if final_liked is not None: + params.append(("liked", final_liked)) + if final_sort: + params.append(("sort", final_sort)) + if final_page is not None: + params.append(("page", str(final_page))) + if extra: + for k, v in extra: + if v is not None: + params.append((k, str(v))) + + return build_qs(params, leading_q=leading_q) + + return makeqs diff --git a/bp/market/routes.py b/bp/market/routes.py new file mode 100644 index 0000000..48de972 --- /dev/null +++ b/bp/market/routes.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from quart import Blueprint, g, render_template, make_response, url_for + + +from ..browse.routes import register as register_browse_bp + +from .filters.qs import makeqs_factory +from ..browse.services.nav import get_nav +from ..api.routes import products_api +from .admin.routes import register as register_admin + + + +def register(url_prefix, title): + bp = Blueprint("market", __name__, url_prefix) + + @bp.before_request + def route(): + g.makeqs_factory = makeqs_factory + + + @bp.context_processor + async def inject_root(): + return { + "coop_title": title, + "categories": (await get_nav(g.s))["cats"], + "qs": makeqs_factory()(), + 
# --- bp/product/routes.py ---
from __future__ import annotations

from quart import (
    g,
    Blueprint,
    abort,
    redirect,
    render_template,
    make_response,
)
from sqlalchemy import select, func, update

from models.market import Product, ProductLike
from ..browse.services.slugs import canonical_html_slug
from ..browse.services.blacklist.product import is_product_blocked
from ..browse.services import db_backend as cb
from ..browse.services import _massage_product
from utils import host_url
from suma_browser.app.redis_cacher import cache_page, clear_cache
from ..cart.services import total
from .services.product_operations import toggle_product_like, massage_full_product


def register():
    """Build the product blueprint: detail page, like toggle, admin, cart add."""
    bp = Blueprint("product", __name__, url_prefix="/product/")

    # Hoisted from mid-function: resolve_product uses these names, and they
    # previously only entered register()'s scope after the route definitions.
    from quart import request, url_for
    from suma_browser.app.bp.cart.services.identity import current_cart_identity
    from models.market import CartItem

    @bp.url_value_preprocessor
    def pull_blog(endpoint, values):
        # Stash the raw slug URL value for resolve_product below.
        g.product_slug = values.get("slug")

    # ─────────────────────────────────────────────────────────────
    # BEFORE REQUEST: Slug or numeric ID resolver
    # ─────────────────────────────────────────────────────────────
    @bp.before_request
    async def resolve_product():
        raw_slug = getattr(g, "product_slug", None)
        if raw_slug is None:
            return

        # 1. If slug is an integer string → load product by ID.
        if raw_slug.isdigit():
            product_id = int(raw_slug)

            product = await cb.db_product_full_id(
                g.s, product_id, user_id=g.user.id if g.user else 0
            )

            if not product:
                abort(404)

            # Deleted products are shown as-is (no canonical redirect).
            if product["deleted_at"]:
                g.item_data = {"d": product, "slug": product["slug"], "liked": False}
                return

            # Not deleted → redirect to canonical slug.
            # NOTE(review): this endpoint name differs from the slug branch
            # below ("product.product_detail") — confirm which is correct for
            # the blueprint nesting in use.
            canon = canonical_html_slug(product["slug"])
            return redirect(
                host_url(url_for("market.browse.product.product_detail", slug=canon))
            )

        # 2. Normal slug-based behaviour.
        if is_product_blocked(raw_slug):
            abort(404)

        canon = canonical_html_slug(raw_slug)
        if canon != raw_slug:
            return redirect(
                host_url(url_for("product.product_detail", slug=canon))
            )

        # Hydrate the full product for templates.
        d = await cb.db_product_full(
            g.s, canon, user_id=g.user.id if g.user else 0
        )
        if not d:
            abort(404)
        g.item_data = {"d": d, "slug": canon, "liked": d["is_liked"]}

    @bp.context_processor
    def context():
        # Expose the resolved product (if any) to every template.
        item_data = getattr(g, "item_data", None)
        return {**item_data} if item_data else {}

    # ─────────────────────────────────────────────────────────────
    # RENDER PRODUCT
    # ─────────────────────────────────────────────────────────────
    @bp.get("/")
    @cache_page(tag="browse")
    async def product_detail(slug: str):
        from suma_browser.app.utils.htmx import is_htmx_request

        # Full page with layout for normal navigation, OOB fragment for HTMX.
        if not is_htmx_request():
            html = await render_template("_types/product/index.html")
        else:
            html = await render_template("_types/product/_oob_elements.html")

        return html

    @bp.post("/like/toggle/")
    @clear_cache(tag="browse", tag_scope="user")
    async def like_toggle(slug):
        """Toggle the current user's like on this product; returns the button
        fragment (403 + unliked button for anonymous users)."""
        product_slug = slug

        if not g.user:
            html = await render_template(
                "_types/browse/like/button.html",
                slug=product_slug,
                liked=False,
            )
            return make_response(html, 403)

        liked, error = await toggle_product_like(g.s, g.user.id, product_slug)

        if error:
            return make_response(error, 404)

        return await render_template(
            "_types/browse/like/button.html",
            slug=product_slug,
            liked=liked,
        )

    @bp.get("/admin/")
    async def admin(slug: str):
        from suma_browser.app.utils.htmx import is_htmx_request

        if not is_htmx_request():
            html = await render_template("_types/product/admin/index.html")
        else:
            html = await render_template("_types/product/admin/_oob_elements.html")

        return await make_response(html)

    @bp.post("/cart/")
    @clear_cache(tag="browse", tag_scope="user")
    async def cart(slug: str):
        """Set this product's cart quantity (count <= 0 removes the item)."""
        # One query instead of the previous id-then-row pair: load the live
        # product row directly by slug.
        product = await g.s.scalar(
            select(Product).where(
                Product.slug == slug,
                Product.deleted_at.is_(None),
            )
        )
        if not product:
            return await make_response("Product not found", 404)
        product_id = product.id

        # Read `count` from the body (JSON or form); default to 1, and fall
        # back to 1 on unparseable input.
        count = 1
        try:
            if request.is_json:
                data = await request.get_json()
                if data is not None and "count" in data:
                    count = int(data["count"])
            else:
                form = await request.form
                if "count" in form:
                    count = int(form["count"])
        except (ValueError, TypeError):
            count = 1

        ident = current_cart_identity()

        # Dead code removed: a `filters` list (deleted_at / identity
        # conditions on CartItem) was built here but never used — the lookup
        # below goes through g.cart, which is already identity-scoped.
        ci = next(
            (item for item in g.cart if item.product_id == product_id),
            None,
        )

        if ci:
            if count > 0:
                ci.quantity = count
            else:
                # count <= 0 → remove from cart entirely.
                ci.quantity = 0
                g.cart.remove(ci)
                await g.s.delete(ci)
        elif count > 0:
            ci = CartItem(
                user_id=ident["user_id"],
                session_id=ident["session_id"],
                product_id=product.id,
                product=product,
                quantity=count,
            )
            g.cart.append(ci)
            g.s.add(ci)
        # count <= 0 with no existing item: nothing to do.

        # No explicit commit; the session middleware handles it.

        # HTMX: return the mini-cart fragment instead of redirecting.
        if request.headers.get("HX-Request") == "true":
            return await render_template(
                "_types/product/_added.html",
                cart=g.cart,
                item=ci,
                total=total,
            )

        # Normal POST: go to cart page.
        return redirect(url_for("cart.view_cart"))

    return bp


# --- bp/product/services/__init__.py ---
from .product_operations import toggle_product_like, massage_full_product

__all__ = ["toggle_product_like", "massage_full_product"]


# --- bp/product/services/product_operations.py ---
# NOTE: `from __future__ import annotations` opens this module and must stay
# file-leading.
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from models.market import Product, ProductLike
def massage_full_product(product: Product) -> dict:
    """
    Serialise a Product ORM row into the dict shape used by the product
    detail templates, then run it through the browse-side _massage_product
    (APP_ROOT injection, blacklisted-section removal).
    """
    from suma_browser.app.bp.browse.services import _massage_product

    gallery = []
    if product.image:
        gallery.append(product.image)

    d = {
        "id": product.id,
        "slug": product.slug,
        "title": product.title,
        "brand": product.brand,
        "image": product.image,
        "description_short": product.description_short,
        "description_html": product.description_html or "",
        "suma_href": product.suma_href,
        # Fix: compare against None so a legitimate 0.00 price is preserved
        # (the old truthiness checks turned zero prices into None; db_backend
        # uses the `is not None` convention for the same fields).
        "rrp": float(product.rrp) if product.rrp is not None else None,
        "special_price": float(product.special_price) if product.special_price is not None else None,
        "regular_price": float(product.regular_price) if product.regular_price is not None else None,
        "images": gallery or [img.url for img in product.images],
        "all_image_urls": gallery or [img.url for img in product.images],
        "sections": [{"title": s.title, "html": s.html} for s in product.sections],
        "stickers": [s.name.lower() for s in product.stickers],
        "labels": [l.name for l in product.labels],
        "nutrition": [{"key": n.key, "value": n.value, "unit": n.unit} for n in product.nutrition],
        "allergens": [{"name": a.name, "contains": a.contains} for a in product.allergens],
        "is_liked": False,
    }

    return _massage_product(d)


async def toggle_product_like(
    session: AsyncSession,
    user_id: int,
    product_slug: str,
) -> tuple[bool, Optional[str]]:
    """
    Toggle a product like for a user, using soft deletes.

    Returns (liked_state, error_message):
    - error_message is non-None when the product does not exist;
    - liked_state is True when the product is now liked, False when unliked.
    """
    from sqlalchemy import func, update

    # The product must exist and be live.
    product_id = await session.scalar(
        select(Product.id).where(Product.slug == product_slug, Product.deleted_at.is_(None))
    )
    if not product_id:
        return False, "Product not found"

    # Is there an active (non-soft-deleted) like already?
    existing = await session.scalar(
        select(ProductLike).where(
            ProductLike.user_id == user_id,
            ProductLike.product_slug == product_slug,
            ProductLike.deleted_at.is_(None),
        )
    )

    if existing:
        # Unlike: soft-delete the active like row(s).
        await session.execute(
            update(ProductLike)
            .where(
                ProductLike.user_id == user_id,
                ProductLike.product_slug == product_slug,
                ProductLike.deleted_at.is_(None),
            )
            .values(deleted_at=func.now())
        )
        return False, None

    # Like: add a fresh row (commit handled by session middleware).
    session.add(ProductLike(user_id=user_id, product_slug=product_slug))
    return True, None
apps.market.app:app) +echo "Starting Hypercorn (${APP_MODULE:-suma_browser.app.app:app})..." +PYTHONUNBUFFERED=1 exec hypercorn "${APP_MODULE:-suma_browser.app.app:app}" --bind 0.0.0.0:${PORT:-8000} diff --git a/scrape-test.sh b/scrape-test.sh new file mode 100644 index 0000000..c6e299f --- /dev/null +++ b/scrape-test.sh @@ -0,0 +1,6 @@ +. .env +source venv/bin/activate +rm -rf _debug/* +python test_scrape_detail.py --out ./_debug --slug sum-saag-suma-aloo-saag-12-x-400g-vf270-2-html +#git -C _debug status +#git -C _debug diff diff --git a/scrape.sh b/scrape.sh new file mode 100644 index 0000000..639cba8 --- /dev/null +++ b/scrape.sh @@ -0,0 +1,5 @@ +. .env +echo sumauser: $SUMA_USER +source .venv/bin/activate # was venv/bin/a +python scrape_to_snapshot.py --out ./_snapshot --max-pages 50 --max-products 200000 --concurrency 8 + diff --git a/scrape/__init__.py b/scrape/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrape/build_snapshot/__init__.py b/scrape/build_snapshot/__init__.py new file mode 100644 index 0000000..1eec55e --- /dev/null +++ b/scrape/build_snapshot/__init__.py @@ -0,0 +1 @@ +from .build_snapshot import build_snapshot diff --git a/scrape/build_snapshot/build_snapshot.py b/scrape/build_snapshot/build_snapshot.py new file mode 100644 index 0000000..b8a8ee6 --- /dev/null +++ b/scrape/build_snapshot/build_snapshot.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +from typing import Dict, Set + +from ..http_client import configure_cookies +from ..get_auth import login + +from config import config + +from utils import log + +# DB: persistence helpers + +from .tools import ( + _resolve_sub_redirects, + valid_subs, + candidate_subs, + rewrite_nav, + capture_product_slugs, + fetch_and_upsert_products, +) + +from ..nav import nav_scrape + +# ------------------------ core ------------------------ +async def build_snapshot( + concurrency: int, + user: str, + password: str, + save_nav, + 
capture_listing, + upsert_product, + log_product_result, + save_subcategory_redirects, + save_link_reports = None, +) -> None: + # NOTE: we keep ensure_dir for listings iteration but no longer write JSON files. + + # Make project importable + import sys + sys.path.insert(0, os.path.abspath(".")) + + + cookies = await login(username=user, password=password) + await configure_cookies(cookies) + for k, v in dict(cookies).items(): + print("logged in with", k, v) + + # 1) NAV + log("Fetching nav…") + nav = await nav_scrape() + + # Build valid subs per top from nav + valid_subs_by_top: Dict[str, Set[str]] = valid_subs(nav) + + # Resolve redirects for all subs in nav first + nav_sub_candidates = candidate_subs(nav) + nav_redirects = await _resolve_sub_redirects( + base_url=config()["base_url"], + candidates=nav_sub_candidates, + allowed_tops=set(config()["categories"]["allow"].values()), + valid_subs_by_top=valid_subs_by_top, + ) + rewrite_nav(nav, nav_redirects) + + # DB: save nav + await save_nav(nav) + + product_slugs: Set[str] = await capture_product_slugs( + nav, + capture_listing + ) + unknown_sub_paths: Set[str] = set() + + # 3) PRODUCTS (fetch details) + await fetch_and_upsert_products( + upsert_product, + log_product_result, + save_link_reports, + concurrency, + product_slugs, + valid_subs_by_top, + unknown_sub_paths + ) + + # Subcategory redirects from HTML + log("Resolving subcategory redirects…") + html_redirects = await _resolve_sub_redirects( + base_url=config()["base_url"], + candidates=unknown_sub_paths, + allowed_tops=set(config()["categories"]["allow"].values()), + valid_subs_by_top=valid_subs_by_top, + ) + sub_redirects: Dict[str, str] = dict(nav_redirects) + sub_redirects.update(html_redirects) + + # DB: persist redirects + await save_subcategory_redirects(sub_redirects) + + log("Snapshot build complete (to Postgres).") + + diff --git a/scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py b/scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py new file 
mode 100644 index 0000000..3291777 --- /dev/null +++ b/scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py @@ -0,0 +1 @@ +APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]" diff --git a/scrape/build_snapshot/tools/__init__.py b/scrape/build_snapshot/tools/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/scrape/build_snapshot/tools/__init__.py @@ -0,0 +1 @@ + diff --git a/scrape/build_snapshot/tools/_anchor_text.py b/scrape/build_snapshot/tools/_anchor_text.py new file mode 100644 index 0000000..fd3ce6d --- /dev/null +++ b/scrape/build_snapshot/tools/_anchor_text.py @@ -0,0 +1,6 @@ +def _anchor_text(a) -> str: + try: + txt = " ".join((a.get_text(" ") or "").split()) + return txt[:200] + except Exception: + return "" diff --git a/scrape/build_snapshot/tools/_collect_html_img_srcs.py b/scrape/build_snapshot/tools/_collect_html_img_srcs.py new file mode 100644 index 0000000..c5feaef --- /dev/null +++ b/scrape/build_snapshot/tools/_collect_html_img_srcs.py @@ -0,0 +1,16 @@ +from bs4 import BeautifulSoup +from typing import List, Optional + +def _collect_html_img_srcs(html: Optional[str]) -> List[str]: + urls: List[str] = [] + if not html: + return urls + try: + soup = BeautifulSoup(html, "lxml") + for img in soup.find_all("img"): + src = img.get("src") + if src: + urls.append(src) + except Exception: + pass + return urls diff --git a/scrape/build_snapshot/tools/_dedupe_preserve_order.py b/scrape/build_snapshot/tools/_dedupe_preserve_order.py new file mode 100644 index 0000000..492cb5a --- /dev/null +++ b/scrape/build_snapshot/tools/_dedupe_preserve_order.py @@ -0,0 +1,14 @@ + +from typing import Iterable, List, Set + +def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]: + seen: Set[str] = set() + out: List[str] = [] + for u in urls: + if not u or not isinstance(u, str): + continue + if u in seen: + continue + seen.add(u) + out.append(u) + return out diff --git a/scrape/build_snapshot/tools/_product_dict_is_cf.py 
b/scrape/build_snapshot/tools/_product_dict_is_cf.py new file mode 100644 index 0000000..5802af7 --- /dev/null +++ b/scrape/build_snapshot/tools/_product_dict_is_cf.py @@ -0,0 +1,32 @@ +from typing import Dict,Optional, Tuple + +_CF_TOKENS = ( + "One moment, please...", + "Please wait while your request is being verified", + "/cdn-cgi/challenge-platform/", + "rocket-loader.min.js", +) + +def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]: + if not html: + return False, None + for tok in _CF_TOKENS: + if tok in html: + return True, tok + return False, None + +def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]: + title = (d.get("title") or "").strip() + if title.lower() == "one moment, please...": + return True, "One moment, please..." + ok, tok = _looks_like_cf_html(d.get("description_html")) + if ok: + return True, tok + for sec in d.get("sections") or []: + if isinstance(sec, dict) and sec.get("html"): + ok2, tok2 = _looks_like_cf_html(sec["html"]) + if ok2: + return True, tok2 + if not d.get("images") and not d.get("description_html") and not d.get("sections"): + return True, "all_empty_heuristic" + return False, None diff --git a/scrape/build_snapshot/tools/_resolve_sub_redirects.py b/scrape/build_snapshot/tools/_resolve_sub_redirects.py new file mode 100644 index 0000000..c3e4f43 --- /dev/null +++ b/scrape/build_snapshot/tools/_resolve_sub_redirects.py @@ -0,0 +1,34 @@ +from typing import Dict, Set +from urllib.parse import urlparse, urljoin +import httpx + + +async def _resolve_sub_redirects( + base_url: str, + candidates: Set[str], + allowed_tops: Set[str], + valid_subs_by_top: Dict[str, Set[str]], +) -> Dict[str, str]: + mapping: Dict[str, str] = {} + if not candidates: + return mapping + timeout = httpx.Timeout(20.0, connect=10.0) + async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client: + for path in sorted(candidates): + try: + url = urljoin(base_url, path) + r = await client.get(url) 
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse, urljoin


def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite on-site links in a product HTML fragment to app-local URLs.

    Product links become ``{APP_ROOT}/product/<slug>`` and category links
    become ``{APP_ROOT}/<top>[/<sub>]``. Unresolvable links are recorded in
    *link_errors*, off-site links in *link_externals*, and unknown
    subcategory paths in *unknown_sub_paths* (all mutated in place).
    Returns the rewritten fragment with <html>/<body> wrappers stripped.
    """
    # Local imports keep this module importable without bs4/project packages.
    from bs4 import BeautifulSoup
    from suma_browser.app.bp.browse.services.slugs import product_slug_from_href
    from ._anchor_text import _anchor_text
    from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER

    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc

    for anchor in soup.find_all("a", href=True):
        raw = (anchor.get("href") or "").strip()
        if not raw:
            continue
        lowered = raw.lower()
        if lowered.startswith(("mailto:", "tel:", "javascript:", "data:", "#")):
            continue
        abs_href = urljoin(base_url, raw)
        parsed = urlparse(abs_href)
        if not parsed.scheme or not parsed.netloc:
            continue

        # Off-site link: report it and leave it untouched.
        if parsed.netloc != base_host:
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(anchor),
                "host": parsed.netloc,
            })
            continue

        parts = [seg for seg in (parsed.path or "").split("/") if seg]
        if not parts:
            continue

        # A trailing *.html/*.htm segment denotes a product page.
        if parts[-1].lower().endswith((".html", ".htm")):
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                anchor["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(anchor),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue

        top = parts[0].lower()
        if top not in category_allow_values:
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(anchor),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })
            continue

        if len(parts) == 1:
            anchor["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            continue

        sub = parts[1]
        if sub.lower().endswith((".html", ".htm")):
            sub = sub.rsplit(".", 1)[0]
        if sub in (valid_subs_by_top.get(top) or set()):
            anchor["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
        else:
            # Unknown sub: rewrite optimistically and queue it for redirect
            # resolution later in the pipeline.
            unknown_path = f"/{top}/{sub}"
            unknown_sub_paths.add(unknown_path)
            anchor["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(anchor),
                "top": top,
                "sub": sub,
                "target_slug": None,
                "type": "suma_category_invalid_sub_pending",
            })

    for wrapper in soup.find_all(["html", "body"]):
        wrapper.unwrap()
    return "".join(str(child) for child in soup.contents).strip()


def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect every '/<top>/<sub>' path present in the scraped nav."""
    paths: Set[str] = set()
    for _label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        for sub in (data.get("subs") or []):
            sub_slug = (sub.get("slug") or "").strip()
            if sub_slug:
                paths.add(f"/{top_slug}/{sub_slug}")
    return paths


async def capture_category(slug: str):
    """Scrape every listing page of a top-level category.

    Returns ``(list_url, items, total_pages)`` — the shape capture_listing expects.
    """
    from config import config
    from utils import log
    from ...listings import scrape_products

    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)
    for page in range(2, int(total_pages or 1) + 1):
        log(f"[{slug}] page {page}…")
        more, _ = await scrape_products(list_url, page=page)
        items.extend(more)
    return (list_url, items, total_pages)


async def capture_product_slugs(nav: Dict[str, Dict], capture_listing):
    """Walk every allowed category (and its subs), persist each listing via
    *capture_listing*, and return the union of all product slugs seen."""
    from config import config
    from .capture_sub import capture_sub

    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        listing = await capture_category(slug)
        await capture_listing(*listing)
        product_slugs.update(listing[1])
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            # NOTE(review): capture_sub returns None for subs without a slug,
            # which would crash here — confirm nav subs always carry slugs.
            listing = await capture_sub(sub, slug)
            await capture_listing(*listing)
            product_slugs.update(listing[1])
    return product_slugs
import asyncio
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin

import httpx

from config import config
from utils import log

from ...listings import scrape_products
from ...html_utils import to_fragment
from ...product.product_detail import scrape_product_detail
from suma_browser.app.bp.browse.services.slugs import suma_href_from_html_slug
from ._product_dict_is_cf import _product_dict_is_cf
from ._rewrite_links_fragment import _rewrite_links_fragment
from ._dedupe_preserve_order import _dedupe_preserve_order
from ._collect_html_img_srcs import _collect_html_img_srcs


async def capture_sub(sub, slug):
    """Scrape every listing page of one subcategory under *slug*.

    Returns ``(sub_url, items, total_pages)``, or None when the sub has no
    slug — callers must handle the None case before unpacking.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return None
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items, total_pages = await scrape_products(sub_url, page=1)
    for page in range(2, int(total_pages or 1) + 1):
        log(f"[{slug}/{sub_slug}] page {page}…")
        more, _ = await scrape_products(sub_url, page=page)
        items.extend(more)
    return (sub_url, items, total_pages)


async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths,
) -> bool:
    """Fetch one product detail page, rewrite its embedded links/images and
    persist it via *upsert_product*.

    Returns True on success; failures (including detected Cloudflare
    challenge pages) are reported through *log_product_result* and yield False.
    """
    href = suma_href_from_html_slug(slug)
    try:
        async with sem:
            d = await scrape_product_detail(href)

        # Bail out early when we scraped a Cloudflare interstitial, not a product.
        is_cf, cf_token = _product_dict_is_cf(d)
        if is_cf:
            payload = {
                "slug": slug,
                "href_tried": href,
                "error_type": "CloudflareChallengeDetected",
                "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                "cf_token": cf_token,
            }
            await log_product_result(ok=False, payload=payload)
            log(f" ! CF challenge detected: {slug} ({cf_token})")
            return False

        # Rewrite embedded links; collect link reports into the shared lists.
        if d.get("description_html"):
            d["description_html"] = _rewrite_links_fragment(
                d["description_html"], config()["base_url"], product_slugs,
                category_values, valid_subs_by_top, slug,
                link_errors, link_externals, unknown_sub_paths,
            )
            d["description_html"] = to_fragment(d["description_html"])
        if d.get("sections"):
            for sec in d["sections"]:
                if isinstance(sec, dict) and sec.get("html"):
                    sec["html"] = _rewrite_links_fragment(
                        sec["html"], config()["base_url"], product_slugs,
                        category_values, valid_subs_by_top, slug,
                        link_errors, link_externals, unknown_sub_paths,
                    )
                    sec["html"] = to_fragment(sec["html"])

        # Images: gallery first, then anything embedded in the HTML fragments.
        gallery = _dedupe_preserve_order(d.get("images") or [])
        embedded: List[str] = []
        if d.get("description_html"):
            embedded += _collect_html_img_srcs(d["description_html"])
        for sec in d.get("sections", []) or []:
            if isinstance(sec, dict) and sec.get("html"):
                embedded += _collect_html_img_srcs(sec["html"])
        embedded = _dedupe_preserve_order(embedded)

        d["images"] = gallery
        d["embedded_image_urls"] = embedded
        d["all_image_urls"] = _dedupe_preserve_order(list(gallery) + list(embedded))

        # DB: upsert product + success log
        await upsert_product(slug, href, d)
        return True
    except Exception as e:
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        # Best-effort enrichment with HTTP details; never let this raise.
        try:
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False


async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports=None,
    concurrency: int = 8,
    product_slugs: Optional[Set[str]] = None,
    valid_subs_by_top: Optional[Dict[str, Set[str]]] = None,
    unknown_sub_paths: Optional[Set[str]] = None,
):
    """Fetch and persist every product in *product_slugs* with bounded concurrency.

    Link problems found while rewriting embedded HTML are accumulated and
    handed to *save_link_reports* (if given). *valid_subs_by_top* and
    *unknown_sub_paths* are mutated in place by the per-product workers.
    """
    # Bug fix: these defaults used to be mutable literals (set()/{}) created
    # once at definition time and silently shared across calls.
    product_slugs = set() if product_slugs is None else product_slugs
    valid_subs_by_top = {} if valid_subs_by_top is None else valid_subs_by_top
    unknown_sub_paths = set() if unknown_sub_paths is None else unknown_sub_paths

    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []

    category_values: Set[str] = set(config()["categories"]["allow"].values())
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        if done % 50 == 0 or done == len(tasks):
            log(f" …{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)
b/scrape/build_snapshot/tools/rewrite_nav.py @@ -0,0 +1,24 @@ + +from typing import Dict +from urllib.parse import urljoin +from config import config + +def rewrite_nav(nav: Dict[str, Dict], nav_redirects:Dict[str, str]): + if nav_redirects: + for label, data in (nav.get("cats") or {}).items(): + top_slug = (data or {}).get("slug") + if not top_slug: + continue + new_subs = [] + for s in (data.get("subs") or []): + old_sub = (s.get("slug") or "").strip() + if not old_sub: + continue + old_path = f"/{top_slug}/{old_sub}" + canonical_path = nav_redirects.get(old_path, old_path) + parts = [x for x in canonical_path.split("/") if x] + top2, sub2 = parts[0], parts[1] + s["slug"] = sub2 + s["href"] = urljoin(config()["base_url"], f"/{top2}/{sub2}") + new_subs.append(s) + data["subs"] = new_subs diff --git a/scrape/build_snapshot/tools/valid_subs.py b/scrape/build_snapshot/tools/valid_subs.py new file mode 100644 index 0000000..8939a10 --- /dev/null +++ b/scrape/build_snapshot/tools/valid_subs.py @@ -0,0 +1,16 @@ +from typing import Dict, Set + +# make valid subs for ewch top in nav +def valid_subs(nav: Dict[str, Dict])->Dict[str, Set[str]] : + valid_subs_by_top: Dict[str, Set[str]] = {} + for label, data in (nav.get("cats") or {}).items(): + top_slug = (data or {}).get("slug") + if not top_slug: + continue + subs_set = { + (s.get("slug") or "").strip() + for s in (data.get("subs") or []) + if s.get("slug") + } + valid_subs_by_top[top_slug] = subs_set + return valid_subs_by_top diff --git a/scrape/get_auth.py b/scrape/get_auth.py new file mode 100644 index 0000000..b6242e1 --- /dev/null +++ b/scrape/get_auth.py @@ -0,0 +1,244 @@ +from typing import Optional, Dict, Any, List +from urllib.parse import urljoin +import httpx +from bs4 import BeautifulSoup +from config import config + +class LoginFailed(Exception): + def __init__(self, message: str, *, debug: Dict[str, Any]): + super().__init__(message) + self.debug = debug + +def _ff_headers(referer: Optional[str] = None, 
origin: Optional[str] = None) -> Dict[str, str]: + h = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "DNT": "1", + "Sec-GPC": "1", + "Cache-Control": "no-cache", + "Pragma": "no-cache", + } + if referer: + h["Referer"] = referer + if origin: + h["Origin"] = origin + return h + +def _cookie_header_from_jar(jar: httpx.Cookies, domain: str, path: str = "/") -> str: + pairs: List[str] = [] + for c in jar.jar: + if not c.name or c.value is None: + continue + dom = (c.domain or "").lstrip(".") + if not dom: + continue + if not (domain == dom or domain.endswith("." + dom) or dom.endswith("." + domain)): + continue + if not (path.startswith(c.path or "/")): + continue + pairs.append(f"{c.name}={c.value}") + return "; ".join(pairs) + +def _extract_magento_errors(html_text: str) -> list[str]: + msgs: list[str] = [] + try: + soup = BeautifulSoup(html_text or "", "lxml") + for sel in [ + ".message-error", + ".messages .message-error", + ".page.messages .message-error", + "[data-ui-id='message-error']", + ".message.warning", + ".message.notice", + ]: + for box in soup.select(sel): + t = " ".join((box.get_text(" ") or "").split()) + if t and t not in msgs: + msgs.append(t) + except Exception: + pass + return msgs + +def _looks_like_login_page(html_text: str) -> bool: + try: + s = BeautifulSoup(html_text or "", "lxml") + if s.select_one("form#login-form.form-login"): + return True + title = (s.title.get_text() if s.title else "").strip().lower() + if "customer login" in title: + return True + except Exception: + pass + return False + +def _chrome_headers(referer=None, origin=None): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + 
from typing import Any, Dict, Optional
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

from config import config


async def login(
    username: str,
    password: str,
    *,
    extra_cookies: Optional[Dict[str, str]] = None,  # ok to pass cf_clearance etc., but NOT form_key
    timeout: float = 30.0,
) -> httpx.Cookies:
    """
    Attempt login and return an authenticated cookie jar.

    Success criteria (strict):
      1) /customer/section/load?sections=customer reports is_logged_in == True
         OR
      2) GET /customer/account/ resolves to an account page (not the login page).

    Otherwise raises LoginFailed with debug info.
    """
    # Bug fix: `extra_cookies` previously defaulted to a shared mutable {}.
    extra_cookies = {} if extra_cookies is None else extra_cookies

    limits = httpx.Limits(max_connections=10, max_keepalive_connections=6)
    cookies = httpx.Cookies()
    for k, v in {
        **extra_cookies,
        "pr-cookie-consent": '["all"]',
        "user_allowed_save_cookie": '{"1":1}',
    }.items():
        # Never seed a stale form_key: Magento issues a fresh one per session.
        if k.lower() == "form_key":
            continue
        cookies.set(k, v, domain="wholesale.suma.coop", path="/")

    base_login = config()["base_login"]
    base_url = config()["base_url"]

    async with httpx.AsyncClient(
        follow_redirects=True,
        timeout=httpx.Timeout(timeout, connect=15.0),
        http2=True,
        limits=limits,
        cookies=cookies,
        headers=_chrome_headers(),
        trust_env=True,
    ) as client:
        # 1) GET login page for a fresh form_key.
        # (Dead code removed: a cache-busting "?_=<ms>" variant of the URL was
        # built here and then immediately overwritten with base_login.)
        r_get = await client.get(base_login, headers=_chrome_headers())
        if r_get.status_code >= 400:
            # Bug fix: these diagnostics used to print unconditionally, so every
            # successful attempt logged a misleading "Login GET failed." line.
            print("Login GET failed. Status:", r_get.status_code)
            print("Login GET URL:", r_get.url)
            print("Response text:", r_get.text[:1000])  # trim if long
        r_get.raise_for_status()
        soup = BeautifulSoup(r_get.text, "lxml")

        form = soup.select_one("form.form.form-login#login-form") or soup.select_one("#login-form")
        if not form:
            raise LoginFailed(
                "Login form not found (possible bot challenge or theme change).",
                debug={"get_status": r_get.status_code, "final_url": str(r_get.url)},
            )
        action = urljoin(base_login, form.get("action") or base_login)
        fk_el = form.find("input", attrs={"name": "form_key"})
        hidden_form_key = (fk_el.get("value") if fk_el else "") or ""

        # mirror Magento behavior: form_key also appears as a cookie
        client.cookies.set("form_key", hidden_form_key, domain="wholesale.suma.coop", path="/")

        payload = {
            "form_key": hidden_form_key,
            "login[username]": username,
            "login[password]": password,
            "send": "Login",
        }

        post_headers = _chrome_headers(referer=base_login, origin=base_url)
        post_headers["Content-Type"] = "application/x-www-form-urlencoded"
        post_headers["Cookie"] = _cookie_header_from_jar(
            client.cookies, domain="wholesale.suma.coop", path="/customer/"
        )

        r_post = await client.post(action, data=payload, headers=post_headers)

        # 2) Primary check: sections API must say logged in.
        is_logged_in = False
        sections_url = "https://wholesale.suma.coop/customer/section/load/?sections=customer&force_new_section_timestamp=1"
        section_json: Dict[str, Any] = {}
        try:
            r_sec = await client.get(sections_url, headers=_chrome_headers(referer=base_login))
            if r_sec.status_code == 200:
                section_json = r_sec.json()
                cust = section_json.get("customer") or {}
                is_logged_in = bool(cust.get("is_logged_in"))
        except Exception:
            pass

        # 3) Secondary check: the account page should NOT be the login page.
        looks_like_login = False
        final_account_url = ""
        try:
            r_acc = await client.get(
                "https://wholesale.suma.coop/customer/account/",
                headers=_chrome_headers(referer=base_login),
            )
            final_account_url = str(r_acc.url)
            looks_like_login = (
                "/customer/account/login" in final_account_url
                or _looks_like_login_page(r_acc.text)
            )
        except Exception:
            # ignore; we'll rely on section status
            pass

        # Decide success/failure strictly.
        if not (is_logged_in or (final_account_url and not looks_like_login)):
            errors = _extract_magento_errors(r_post.text)
            # Clean up transient form_key cookie.
            try:
                client.cookies.jar.clear("wholesale.suma.coop", "/", "form_key")
            except Exception:
                pass
            raise LoginFailed(
                errors[0] if errors else "Invalid username or password.",
                debug={
                    "get_status": r_get.status_code,
                    "post_status": r_post.status_code,
                    "post_final_url": str(r_post.url),
                    "sections_customer": section_json.get("customer"),
                    "account_final_url": final_account_url,
                    "looks_like_login_page": looks_like_login,
                },
            )

        def clear_cookie_everywhere(cookies: httpx.Cookies, name: str) -> None:
            # CookieJar.clear requires the exact (domain, path, name) triple;
            # retry with/without a leading dot on the domain on mismatch.
            to_delete = [(c.domain, c.path, c.name) for c in list(cookies.jar) if c.name == name]
            for domain, path, nm in to_delete:
                try:
                    cookies.jar.clear(domain, path, nm)
                except KeyError:
                    if domain and domain.startswith("."):
                        cookies.jar.clear(domain.lstrip("."), path, nm)
                    else:
                        cookies.jar.clear("." + domain, path, nm)
            if name in cookies:
                del cookies[name]

        clear_cookie_everywhere(client.cookies, "form_key")
        # Security fix: log only cookie NAMES — values are live session credentials.
        print("cookies:", sorted(dict(client.cookies).keys()))
        return client.cookies


def to_fragment(html: Optional[str]) -> str:
    """Return just the fragment contents (no <html>/<body> wrappers)."""
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    # unwrap document-level containers
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()


def absolutize_fragment(html: Optional[str]) -> str:
    """Absolutize href/src against the configured base URL and return a fragment."""
    if not html:
        return ""
    frag = BeautifulSoup(html, "lxml")
    for tag in frag.find_all(True):
        if tag.has_attr("href"):
            raw = str(tag["href"])
            tag["href"] = urljoin(config()["base_url"], raw) if raw.startswith("/") else raw
        if tag.has_attr("src"):
            raw = str(tag["src"])
            tag["src"] = urljoin(config()["base_url"], raw) if raw.startswith("/") else raw
    # unwrap wrappers and return only the inner HTML
    for t in frag.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in frag.contents).strip()
annotations + +import asyncio +import os +import secrets +from typing import Optional, Dict + +import httpx +from config import config + +_CLIENT: httpx.AsyncClient | None = None + +# ----- optional decoders -> Accept-Encoding +BROTLI_OK = False +ZSTD_OK = False +try: + import brotli # noqa: F401 + BROTLI_OK = True +except Exception: + pass +try: + import zstandard as zstd # noqa: F401 + ZSTD_OK = True +except Exception: + pass + +def _accept_encoding() -> str: + enc = ["gzip", "deflate"] + if BROTLI_OK: + enc.append("br") + if ZSTD_OK: + enc.append("zstd") + return ", ".join(enc) + +FIREFOX_UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0" + +def _ff_headers(referer: Optional[str] = None) -> Dict[str, str]: + h = { + "User-Agent": FIREFOX_UA, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-GB,en;q=0.5", + "Accept-Encoding": _accept_encoding(), + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none" if not referer else "same-origin", + "Sec-Fetch-User": "?1", + "DNT": "1", + "Sec-GPC": "1", + "Priority": "u=0, i", + "Cache-Control": "no-cache", + "Pragma": "no-cache", + } + if referer: + h["Referer"] = referer + return h +def _chrome_headers(referer=None, origin=None): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + if referer: + headers["Referer"] = referer + if origin: + headers["Origin"] = origin + return headers + +def _parse_cookie_header(cookie_header: str) -> Dict[str, str]: + jar: Dict[str, str] = {} + for part in 
cookie_header.split(";"): + part = part.strip() + if not part or "=" not in part: + continue + k, v = part.split("=", 1) + jar[k.strip()] = v.strip() + return jar + +def _looks_like_cloudflare(html: bytes) -> bool: + if not html: + return False + s = html[:40000].lower() + return ( + b"please wait while your request is being verified" in s + or b"/cdn-cgi/challenge-platform/scripts/jsd/main.js" in s + or b"rocket-loader.min.js" in s + or b"cf-ray" in s + or b"challenge-platform" in s + or b"cf-chl-" in s + ) + +# -------- runtime cookie configuration (preferred over env) -------------------- +_INITIAL_COOKIES: Dict[str, str] = {} +_INITIAL_COOKIE_HEADER: Optional[str] = None + +async def configure_cookies(cookies: Dict[str, str]) -> None: + """ + Configure initial cookies programmatically (preferred over env). + Call BEFORE the first request (i.e., before get_client()/fetch()). + If a client already exists, its jar is updated immediately. + """ + global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER + _INITIAL_COOKIE_HEADER = None + _INITIAL_COOKIES = dict(cookies or {}) + # If client already built, update it now + if _CLIENT is not None: + print('configuring cookies') + host = config()["base_host"] or "wholesale.suma.coop" + for k, v in _INITIAL_COOKIES.items(): + _CLIENT.cookies.set(k, v, domain=host, path="/") + +def configure_cookies_from_header(cookie_header: str) -> None: + """ + Configure initial cookies from a raw 'Cookie:' header string. + Preferred over env; call BEFORE the first request. 
+ """ + global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER + _INITIAL_COOKIE_HEADER = cookie_header or "" + _INITIAL_COOKIES = _parse_cookie_header(_INITIAL_COOKIE_HEADER) + if _CLIENT is not None: + host = config()["base_host"] or "wholesale.suma.coop" + for k, v in _INITIAL_COOKIES.items(): + _CLIENT.cookies.set(k, v, domain=host, path="/") + +# ------------------------------------------------------------------------------ +async def get_client() -> httpx.AsyncClient: + """Public accessor (same as _get_client).""" + return await _get_client() + +async def _get_client() -> httpx.AsyncClient: + global _CLIENT + if _CLIENT is None: + timeout = httpx.Timeout(300.0, connect=150.0) + limits = httpx.Limits(max_keepalive_connections=8, max_connections=16) + _CLIENT = httpx.AsyncClient( + follow_redirects=True, + timeout=timeout, + http2=True, + limits=limits, + headers=_chrome_headers(), + trust_env=True, + ) + + # ---- Seed cookies (priority: runtime config > env var) --------------- + host = config()["base_host"] or "wholesale.suma.coop" + + if _INITIAL_COOKIES or _INITIAL_COOKIE_HEADER: + # From runtime config + if _INITIAL_COOKIE_HEADER: + _CLIENT.cookies.update(_parse_cookie_header(_INITIAL_COOKIE_HEADER)) + for k, v in _INITIAL_COOKIES.items(): + _CLIENT.cookies.set(k, v, domain=host, path="/") + else: + # Fallback to environment + cookie_str = os.environ.get("SUMA_COOKIES", "").strip() + if cookie_str: + _CLIENT.cookies.update(_parse_cookie_header(cookie_str)) + + # Ensure private_content_version is present + if "private_content_version" not in _CLIENT.cookies: + pcv = secrets.token_hex(16) + _CLIENT.cookies.set("private_content_version", pcv, domain=host, path="/") + # --------------------------------------------------------------------- + + return _CLIENT + +async def aclose_client() -> None: + global _CLIENT + if _CLIENT is not None: + await _CLIENT.aclose() + _CLIENT = None + +async def fetch(url: str, *, referer: Optional[str] = None, retries: int = 3) -> str: + 
client = await _get_client() + + # Warm-up visit to look like a real session + if len(client.cookies.jar) == 0: + try: + await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers()) + await asyncio.sleep(0.25) + except Exception: + pass + + last_exc: Optional[Exception] = None + for attempt in range(1, retries + 1): + try: + h = _chrome_headers(referer=referer or (config()["base_url"].rstrip("/") + "/")) + r = await client.get(url, headers=h) + if _looks_like_cloudflare(r.content): + if attempt < retries: + await asyncio.sleep(0.9 if attempt == 1 else 1.3) + try: + await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers()) + await asyncio.sleep(0.4) + except Exception: + pass + continue + try: + r.raise_for_status() + except httpx.HTTPStatusError as e: + print(f"Fetch failed for {url}") + print("Status:", r.status_code) + print("Body:", r.text[:1000]) # Trimmed + raise + return r.text + except Exception as e: + last_exc = e + if attempt >= retries: + raise + await asyncio.sleep(0.45 * attempt + 0.25) + + if last_exc: + raise last_exc + raise RuntimeError("fetch failed unexpectedly") diff --git a/scrape/listings.py b/scrape/listings.py new file mode 100644 index 0000000..4f86c54 --- /dev/null +++ b/scrape/listings.py @@ -0,0 +1,289 @@ +from __future__ import annotations + +import math +import re +from typing import Callable, Dict, List, Optional, Tuple +from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse + + +from .http_client import fetch +from suma_browser.app.bp.browse.services.slugs import product_slug_from_href +from suma_browser.app.bp.browse.services.state import ( + KNOWN_PRODUCT_SLUGS, + _listing_page_cache, + _listing_page_ttl, + _listing_variant_cache, + _listing_variant_ttl, + now, +) +from utils import normalize_text, soup_of +from config import config + + +def parse_total_pages_from_text(text: str) -> Optional[int]: + m = re.search(r"Showing\s+(\d+)\s+of\s+(\d+)", text, re.I) + if 
not m: + return None + shown = int(m.group(1)) + total = int(m.group(2)) + per_page = 36 if shown in (12, 24, 36) else shown + return max(1, math.ceil(total / per_page)) + + +def _first_from_srcset(val: str) -> Optional[str]: + if not val: + return None + first = val.split(",")[0].strip() + parts = first.split() + return parts[0] if parts else first + + +def _abs_url(u: Optional[str]) -> Optional[str]: + if not u: + return None + return urljoin(config()["base_url"], u) if isinstance(u, str) and u.startswith("/") else u + + +def _collect_img_candidates(el) -> List[str]: + urls: List[str] = [] + if not el: + return urls + attrs = ["src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href"] + for a in attrs: + v = el.get(a) + if v: + urls.append(v) + for a in ["srcset", "data-srcset"]: + v = el.get(a) + if v: + first = _first_from_srcset(v) + if first: + urls.append(first) + return urls + + +def _dedupe_preserve_order_by(seq: List[str], key: Callable[[str], str]) -> List[str]: + seen = set() + out: List[str] = [] + for s in seq: + if not s: + continue + k = key(s) + if k in seen: + continue + seen.add(k) + out.append(s) + return out + + +def _filename_key(u: str) -> str: + p = urlparse(u) + path = p.path or "" + if path.endswith("/"): + path = path[:-1] + last = path.split("/")[-1] + return f"{p.netloc}:{last}".lower() + + +def _parse_cards_from_soup(soup) -> List[Dict]: + """Extract product tiles (name, href, image, desc) from listing soup. 
+ De-duplicate by slug to avoid doubles from overlapping selectors.""" + items: List[str] = [] + seen_slugs: set[str] = set() + + # Primary selectors (Magento 2 default) + card_wrappers = soup.select( + "li.product-item, .product-item, ol.products.list.items li, .products.list.items li, .product-item-info" + ) + for card in card_wrappers: + a = ( + card.select_one("a.product-item-link") + or card.select_one(".product-item-name a") + or card.select_one("a[href$='.html'], a[href$='.htm']") + ) + if not a: + continue + #name = normalize_text(a.get_text()) or normalize_text(a.get("title") or "") + href = a.get("href") + #if not name or not href: + # continue + if href.startswith("/"): + href = urljoin(config()["base_url"], href) + + + slug = product_slug_from_href(href) + KNOWN_PRODUCT_SLUGS.add(slug) + + if slug and slug not in seen_slugs: + seen_slugs.add(slug) + items.append(slug) + # Secondary: any product-looking anchors inside products container + if not items: + products_container = soup.select_one(".products") or soup + for a in products_container.select("a[href$='.html'], a[href$='.htm']"): + href = a.get("href") + if href.startswith("/"): + href = urljoin(config()["base_url"], href) + slug = product_slug_from_href(href) + KNOWN_PRODUCT_SLUGS.add(slug) + if slug not in seen_slugs: + seen_slugs.add(slug) + items.append(slug) + + # Tertiary: JSON-LD fallback (ItemList/Product) + if not items: + import json + + def add_product(name: Optional[str], url: Optional[str], image: Optional[str]): + if not url: + return + absu = urljoin(config()["base_url"], url) if url.startswith("/") else url + slug = product_slug_from_href(absu) + if not slug: + return + KNOWN_PRODUCT_SLUGS.add(slug) + if slug not in seen_slugs: + seen_slugs.add(slug) + items.append(slug) + + for script in soup.find_all("script", attrs={"type": "application/ld+json"}): + #try: + data = json.loads(script.get_text()) + #except Exception: + # continue + if isinstance(data, dict): + if data.get("@type") 
== "ItemList" and isinstance(data.get("itemListElement"), list): + for it in data["itemListElement"]: + if isinstance(it, dict): + ent = it.get("item") or it + if isinstance(ent, dict): + add_product( + ent.get("name"), + ent.get("url"), + (ent.get("image") if isinstance(ent.get("image"), str) else None), + ) + if data.get("@type") == "Product": + add_product( + data.get("name"), + data.get("url"), + (data.get("image") if isinstance(data.get("image"), str) else None), + ) + elif isinstance(data, list): + for ent in data: + if not isinstance(ent, dict): + continue + if ent.get("@type") == "Product": + add_product( + ent.get("name"), + ent.get("url"), + (ent.get("image") if isinstance(ent.get("image"), str) else None), + ) + if ent.get("@type") == "ItemList": + for it in ent.get("itemListElement", []): + if isinstance(it, dict): + obj = it.get("item") or it + if isinstance(obj, dict): + add_product( + obj.get("name"), + obj.get("url"), + (obj.get("image") if isinstance(obj.get("image"), str) else None), + ) + + return items + + +def _with_query(url: str, add: Dict[str, str]) -> str: + p = urlparse(url) + q = dict(parse_qsl(p.query, keep_blank_values=True)) + q.update(add) + new_q = urlencode(q) + return urlunparse((p.scheme, p.netloc, p.path, p.params, new_q, p.fragment)) + + +def _with_page(url: str, page: int) -> str: + if page and page > 1: + return _with_query(url, {"p": str(page)}) + return url + + +def _listing_base_key(url: str) -> str: + p = urlparse(url) + path = p.path.rstrip("/") + return f"{p.scheme}://{p.netloc}{path}".lower() + + +def _variant_cache_get(base_key: str) -> Optional[str]: + info = _listing_variant_cache.get(base_key) + if not info: + return None + url, ts = info + if (now() - ts) > _listing_variant_ttl: + _listing_variant_cache.pop(base_key, None) + return None + return url + + +def _variant_cache_set(base_key: str, working_url: str) -> None: + _listing_variant_cache[base_key] = (working_url, now()) + + +def _page_cache_get(working_url: 
str, page: int) -> Optional[Tuple[List[Dict], int]]: + key = f"{working_url}|p={page}" + info = _listing_page_cache.get(key) + if not info: + return None + (items, total_pages), ts = info + if (now() - ts) > _listing_page_ttl: + _listing_page_cache.pop(key, None) + return None + return items, total_pages + + +def _page_cache_set(working_url: str, page: int, items: List[Dict], total_pages: int) -> None: + key = f"{working_url}|p={page}" + _listing_page_cache[key] = ((items, total_pages), now()) + + +async def _fetch_parse(url: str, page: int): + html = await fetch(_with_page(url, page)) + soup = soup_of(html) + items = _parse_cards_from_soup(soup) + return items, soup + + + + +async def scrape_products(list_url: str, page: int = 1): + """Fast listing fetch with variant memoization + page cache.""" + _listing_base_key(list_url) + items, soup = await _fetch_parse(list_url, page) + + total_pages = _derive_total_pages(soup) + return items, total_pages + +def _derive_total_pages(soup) -> int: + total_pages = 1 + textdump = normalize_text(soup.get_text(" ")) + pages_from_text = parse_total_pages_from_text(textdump) + if pages_from_text: + total_pages = pages_from_text + else: + pages = {1} + for a in soup.find_all("a", href=True): + m = re.search(r"[?&]p=(\d+)", a["href"]) + if m: + pages.add(int(m.group(1))) + total_pages = max(pages) if pages else 1 + return total_pages + + +def _slugs_from_list_url(list_url: str) -> Tuple[str, Optional[str]]: + p = urlparse(list_url) + parts = [x for x in (p.path or "").split("/") if x] + top = parts[0].lower() if parts else "" + sub = None + if len(parts) >= 2: + sub = parts[1] + if sub.lower().endswith((".html", ".htm")): + sub = re.sub(r"\.(html?|HTML?)$", "", sub) + return top, sub diff --git a/scrape/nav.py b/scrape/nav.py new file mode 100644 index 0000000..1278465 --- /dev/null +++ b/scrape/nav.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import re +from typing import Dict, List, Tuple, Optional +from urllib.parse 
def soup_of(html: str) -> BeautifulSoup:
    """Parse HTML (treating None/empty as "") with the lxml parser."""
    # NOTE(review): relies on BeautifulSoup/fetch/config from this module's
    # imports in the original file.
    return BeautifulSoup(html or "", "lxml")


def normalize_text(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    return re.sub(r"\s+", " ", (s or "").strip())


async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the home page and return (text, absolute href) pairs for every
    same-site anchor that has visible text."""
    html = await fetch(config()["base_url"])
    soup = soup_of(html)
    results: List[Tuple[str, str]] = []
    for a in soup.find_all("a", href=True):
        text = normalize_text(a.get_text())
        if not text:
            continue
        href = a["href"].strip()
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        if not href.startswith(config()["base_url"]):
            continue  # drop off-site links
        results.append((text, href))
    return results


def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the subcategory slug of *href* when its path sits directly
    under *top_slug*; strips a trailing .html/.htm extension. None otherwise."""
    p = urlparse(href)
    parts = [x for x in (p.path or "").split("/") if x]
    if len(parts) >= 2 and parts[0].lower() == top_slug.lower():
        sub = parts[1]
        if sub.lower().endswith((".html", ".htm")):
            sub = re.sub(r"\.(html?|HTML?)$", "", sub)
        return sub
    return None


async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the nav tree for every allow-listed category.

    For each (label -> slug) in the config allow list, collect the
    subcategory links found under that slug, sorted by name.

    Fix vs. original: removed the large blocks of commented-out
    listing-scrape code that cluttered the loop bodies.
    """
    nav = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
page=p) + # items.extend( + # moreitems, + # ) + subs.append({"name": text, "href": href, "slug": sub_slug}) + subs.sort(key=lambda x: x["name"].lower()) + #list_url = _join(config()["base_url"], f"/{slug}") + #log(f"naving [{slug}] page 1…") + #items, total_pages = await scrape_products(list_url, page=1) + #for p in range(2, total_pages + 1): + # log(f"naving [{slug}] page {p}…") + # moreitems, _tp = await scrape_products(list_url, page=p) + # items.extend( + # moreitems, + # ) + nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs} + return nav + + +async def scrape_nav_filtered() -> Dict[str, Dict]: + anchors = await scrape_nav_raw() + slug_to_links: Dict[str, List[Tuple[str, str]]] = {} + for text, href in anchors: + p = urlparse(href) + parts = [x for x in (p.path or "").split("/") if x] + if not parts: + continue + top = parts[0].lower() + if top in config()["slugs"]["skip"]: + continue + slug_to_links.setdefault(top, []).append((text, href)) + return await group_by_category(slug_to_links) + +async def nav_scrape() -> Dict[str, Dict]: + """Return navigation structure; use snapshot when offline.""" + + nav = await scrape_nav_filtered() + return nav diff --git a/scrape/persist_api/__init__.py b/scrape/persist_api/__init__.py new file mode 100644 index 0000000..d5273af --- /dev/null +++ b/scrape/persist_api/__init__.py @@ -0,0 +1,6 @@ +from .upsert_product import upsert_product +from .log_product_result import log_product_result +from .save_nav import save_nav +from .save_subcategory_redirects import save_subcategory_redirects +from .capture_listing import capture_listing + diff --git a/scrape/persist_api/capture_listing.py b/scrape/persist_api/capture_listing.py new file mode 100644 index 0000000..280f1d0 --- /dev/null +++ b/scrape/persist_api/capture_listing.py @@ -0,0 +1,27 @@ +# replace your existing upsert_product with this version + +import os +import httpx + +from typing import List + +async def capture_listing( + url: str, + items: 
List[str], + total_pages: int +): + + sync_url = os.getenv("CAPTURE_LISTING_URL", "http://localhost:8000/market/api/products/listing/") + + async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + _d = { + "url": url, + "items": items, + "total_pages": total_pages + } + resp = await client.post(sync_url, json=_d) + # Raise for non-2xx + resp.raise_for_status() + data = resp.json() if resp.content else {} + return data + \ No newline at end of file diff --git a/scrape/persist_api/log_product_result.py b/scrape/persist_api/log_product_result.py new file mode 100644 index 0000000..bf285ed --- /dev/null +++ b/scrape/persist_api/log_product_result.py @@ -0,0 +1,24 @@ +# replace your existing upsert_product with this version + +import os +import httpx + + +async def log_product_result( + ok: bool, + payload +): + + sync_url = os.getenv("PRODUCT_LOG_URL", "http://localhost:8000/market/api/products/log/") + + async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + _d = { + "ok": ok, + "payload": payload + } + resp = await client.post(sync_url, json=_d) + # Raise for non-2xx + resp.raise_for_status() + data = resp.json() if resp.content else {} + return data + \ No newline at end of file diff --git a/scrape/persist_api/save_nav.py b/scrape/persist_api/save_nav.py new file mode 100644 index 0000000..238fac7 --- /dev/null +++ b/scrape/persist_api/save_nav.py @@ -0,0 +1,19 @@ +# replace your existing upsert_product with this version + +import os +import httpx + +from typing import Dict + +async def save_nav( + nav: Dict, +): + sync_url = os.getenv("SAVE_NAV_URL", "http://localhost:8000/market/api/products/nav/") + + async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + resp = await client.post(sync_url, json=nav) + # Raise for non-2xx + resp.raise_for_status() + data = resp.json() if resp.content else {} + return data + \ No newline at end of file diff --git 
a/scrape/persist_api/save_subcategory_redirects.py b/scrape/persist_api/save_subcategory_redirects.py new file mode 100644 index 0000000..60eba97 --- /dev/null +++ b/scrape/persist_api/save_subcategory_redirects.py @@ -0,0 +1,15 @@ +import os +import httpx + +from typing import Dict + +async def save_subcategory_redirects(mapping: Dict[str, str]) -> None: + sync_url = os.getenv("SAVE_REDIRECTS", "http://localhost:8000/market/api/products/redirects/") + + async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + resp = await client.post(sync_url, json=mapping) + # Raise for non-2xx + resp.raise_for_status() + data = resp.json() if resp.content else {} + return data + \ No newline at end of file diff --git a/scrape/persist_api/upsert_product.py b/scrape/persist_api/upsert_product.py new file mode 100644 index 0000000..7eb46d3 --- /dev/null +++ b/scrape/persist_api/upsert_product.py @@ -0,0 +1,256 @@ +# replace your existing upsert_product with this version + +import os +import httpx + +from typing import Dict, List, Any + +async def upsert_product( + slug, + href, + d, +): + """ + Posts the given product dict `d` to the /api/products/sync endpoint. + Keeps the same signature as before and preserves logging/commit behavior. 
+ """ + + + # Ensure slug in payload matches the function arg if present + if not d.get("slug"): + d["slug"] = slug + + # Where to post; override via env if needed + sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8000/market/api/products/sync/") + + + + + payload = _massage_payload(d) + + async def _do_call() -> Dict[str, Any]: + async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + resp = await client.post(sync_url, json=payload) + resp.raise_for_status() + # tolerate empty body + if not resp.content: + return {} + # prefer JSON if possible, otherwise return text + try: + return resp.json() + except ValueError: + return {"raw": resp.text} + + async def _log_error(exc: BaseException) -> None: + # Optional: add your own logging here + print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}") + + return await retry_until_success(_do_call, delay=5.0, on_error=_log_error) + + + + #async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client: + # _d=_massage_payload(d) + # resp = await client.post(sync_url, json=_d) + # Raise for non-2xx + #resp.raise_for_status() + #data = resp.json() if resp.content else {} + #return data + +import asyncio +from typing import Any, Awaitable, Callable, Dict, Optional + +async def retry_until_success( + fn: Callable[[], Awaitable[Any]], + *, + delay: float = 5.0, + on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None, +) -> Any: + """ + Repeatedly call the async no-arg function `fn` until it succeeds (returns without raising). + Waits `delay` seconds between attempts. Never gives up. + If provided, `on_error(exc)` is awaited after each failure. 
+ """ + attempt = 0 + while True: + try: + return await fn() + except asyncio.CancelledError: + # bubble up cancellations immediately + raise + except BaseException as exc: + attempt += 1 + if on_error is not None: + try: + await on_error(exc) + except Exception: + # don't let error handler failures prevent retrying + pass + # fallback stderr log if no on_error handler + if on_error is None: + print(f"[retry] attempt {attempt} failed: {type(exc).__name__}: {exc}") + await asyncio.sleep(delay) + + + +def _get(d, key, default=None): + v = d.get(key) + return default if v in (None, "", [], {}) else v + + +def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]: + """Mirror the DB-upsert massaging so the API sees the same structure/values.""" + slug = d.get("slug") + if not slug: + raise ValueError("product missing slug") + + # --- Top-level fields (use _get where DB upsert uses it) --- + out: Dict[str, Any] = { + "slug": slug, + "title": _get(d, "title"), + "image": _get(d, "image"), + "description_short": _get(d, "description_short"), + "description_html": _get(d, "description_html"), + "suma_href": _get(d, "suma_href"), + "brand": _get(d, "brand"), + "rrp": _get(d, "rrp"), + "rrp_currency": _get(d, "rrp_currency"), + "rrp_raw": _get(d, "rrp_raw"), + "price_per_unit": _get(d, "price_per_unit"), + "price_per_unit_currency": _get(d, "price_per_unit_currency"), + "price_per_unit_raw": _get(d, "price_per_unit_raw"), + "special_price": _get(d, "special_price"), + "special_price_currency": _get(d, "special_price_currency"), + "special_price_raw": _get(d, "special_price_raw"), + "regular_price": _get(d, "regular_price"), + "regular_price_currency": _get(d, "regular_price_currency"), + "regular_price_raw": _get(d, "regular_price_raw"), + "case_size_count": _get(d, "case_size_count"), + "case_size_item_qty": _get(d, "case_size_item_qty"), + "case_size_item_unit": _get(d, "case_size_item_unit"), + "case_size_raw": _get(d, "case_size_raw"), + "ean": d.get("ean") or 
d.get("barcode") or None, + "sku": d.get("sku"), + "unit_size": d.get("unit_size"), + "pack_size": d.get("pack_size"), + } + + # --- Sections: only dicts with title+html (like DB sync) --- + sections_in = d.get("sections") or [] + sections_out: List[Dict[str, Any]] = [] + for sec in sections_in: + if isinstance(sec, dict) and sec.get("title") and sec.get("html"): + sections_out.append({"title": sec["title"], "html": sec["html"]}) + out["sections"] = sections_out + + # --- Images: same 3 buckets used in DB sync --- + def _coerce_str_list(x): + if not x: + return [] + # accept list of strings or list of dicts with {"url": ...} + out_urls = [] + for item in x: + if isinstance(item, str): + if item: + out_urls.append(item) + elif isinstance(item, dict): + u = item.get("url") + if u: + out_urls.append(u) + return out_urls + + out["images"] = _coerce_str_list(d.get("images")) + out["embedded_image_urls"] = _coerce_str_list(d.get("embedded_image_urls")) + out["all_image_urls"] = _coerce_str_list(d.get("all_image_urls")) + + # --- Labels: strip (DB code trims) --- + labels_in = d.get("labels") or [] + out["labels"] = [str(x).strip() for x in labels_in if x] + + # --- Stickers: strip + lower (DB code lower-cases) --- + stickers_in = d.get("stickers") or [] + out["stickers"] = [str(x).strip().lower() for x in stickers_in if x] + + # --- Attributes: pass through the same dict sources the DB code reads --- + out["info_table"] = d.get("info_table") or {} + #out["oe_list_price"] = d.get("oe_list_price") or {} + + # --- Nutrition: allow dict or list of dicts, mirroring DB code --- + nutrition = d.get("nutrition") or [] + if isinstance(nutrition, dict): + out["nutrition"] = {str(k).strip(): (None if v is None else str(v)) for k, v in nutrition.items()} + elif isinstance(nutrition, list): + rows = [] + for row in nutrition: + if not isinstance(row, dict): + continue + key = str(row.get("key") or "").strip() + if not key: + continue + rows.append({ + "key": key, + "value": None if 
row.get("value") is None else str(row.get("value")), + "unit": None if row.get("unit") is None else str(row.get("unit")), + }) + out["nutrition"] = rows + else: + out["nutrition"] = [] + + # --- Allergens: accept str (→ contains=True) or dict --- + alls_in = d.get("allergens") or [] + alls_out = [] + for a in alls_in: + if isinstance(a, str): + nm, contains = a.strip(), True + elif isinstance(a, dict): + nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True)) + else: + continue + if nm: + alls_out.append({"name": nm, "contains": contains}) + out["allergens"] = alls_out + + out["images"]=[ + {"url": s.strip(), "kind": "gallery", "position": i} + for i, s in enumerate(out.get("images") or []) + if isinstance(s, str) and s.strip() + ] + [ + {"url": s.strip(), "kind": "embedded", "position": i} + for i, s in enumerate(out.get("embedded_image_urls") or []) + if isinstance(s, str) and s.strip() + ] + [ + {"url": s.strip(), "kind": "all", "position": i} + for i, s in enumerate(out.get("all_image_urls") or []) + if isinstance(s, str) and s.strip() + ] + out["labels"]= [{"name": s.strip()} for s in out["labels"] if isinstance(s, str) and s.strip()] + out["stickers"]= [{"name": s.strip()} for s in out["stickers"] if isinstance(s, str) and s.strip()] + out["attributes"] = build_attributes_list(d) + + + return out + + + + + +def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]: + attrs = [] + for src, prefix in [ + (d.get("info_table") or {}, "info_table"), + (d.get("oe_list_price") or {}, "oe_list_price"), + ]: + for k, v in src.items(): + key = f"{prefix}/{str(k).strip()}" + val = None if v is None else str(v) + attrs.append({"key": key, "value": val}) + # optional: dedupe by (key, value) + seen = set() + dedup = [] + for item in attrs: + t = (item["key"], item["value"]) + if t in seen: + continue + seen.add(t) + dedup.append(item) + return dedup diff --git a/scrape/persist_snapshot/__init__.py b/scrape/persist_snapshot/__init__.py new 
file mode 100644 index 0000000..43d7e24 --- /dev/null +++ b/scrape/persist_snapshot/__init__.py @@ -0,0 +1,7 @@ +from .log_product_result import log_product_result +from .upsert_product import upsert_product +from .save_nav import save_nav +from .capture_listing import capture_listing +from .save_link_reports import save_link_reports +from .save_subcategory_redirects import save_subcategory_redirects + diff --git a/scrape/persist_snapshot/_get.py b/scrape/persist_snapshot/_get.py new file mode 100644 index 0000000..dd316b6 --- /dev/null +++ b/scrape/persist_snapshot/_get.py @@ -0,0 +1,3 @@ +def _get(d, key, default=None): + v = d.get(key) + return default if v in (None, "", [], {}) else v diff --git a/scrape/persist_snapshot/capture_listing.py b/scrape/persist_snapshot/capture_listing.py new file mode 100644 index 0000000..d2b4fe4 --- /dev/null +++ b/scrape/persist_snapshot/capture_listing.py @@ -0,0 +1,137 @@ +# at top of persist_snapshot.py: +from typing import Optional, List +from sqlalchemy.ext.asyncio import AsyncSession + +from typing import List, Optional, Tuple +from sqlalchemy.dialects.postgresql import insert as pg_insert +from datetime import datetime +from sqlalchemy import ( + select, update +) +from urllib.parse import urlparse +import re + +from models.market import ( + NavTop, + NavSub, + Listing, + ListingItem, +) +from db.session import get_session + +# --- Models are unchanged, see original code --- + +# ---------------------- Helper fns called from scraper ------------------------ + + + +async def capture_listing( + #product_slugs: Set[str], + url: str, + items: List[str], + total_pages: int + ) -> None: + async with get_session() as session: + await _capture_listing( + session, + url, + items, + total_pages + ) + await session.commit() + + +async def _capture_listing( + session, + url: str, + items: List[str], + total_pages: int + ) -> None: + top_id, sub_id = await _nav_ids_from_list_url(session, url) + await _save_listing(session, top_id, 
async def _save_listing(session: AsyncSession, top_id: int, sub_id: Optional[int],
                        items: List[str], total_pages: Optional[int]) -> None:
    """Upsert the Listing row for (top_id, sub_id) and sync its ListingItem slugs.

    Slugs present in ``items`` but not in the DB are inserted; live DB slugs
    missing from ``items`` are soft-deleted (``deleted_at`` set). Unchanged
    slugs are left untouched. No-op (beyond the Listing upsert) when ``items``
    normalizes to empty.
    """
    res = await session.execute(
        select(Listing).where(Listing.top_id == top_id, Listing.sub_id == sub_id, Listing.deleted_at.is_(None))
    )
    listing = res.scalar_one_or_none()
    if not listing:
        listing = Listing(top_id=top_id, sub_id=sub_id, total_pages=total_pages)
        session.add(listing)
        # Flush so listing.id is assigned before the item queries below.
        await session.flush()
    else:
        listing.total_pages = total_pages

    # Normalize and deduplicate incoming slugs (drop non-strings and empties).
    seen: set[str] = set()
    deduped: list[str] = []
    for s in items or []:
        if isinstance(s, str) and s and s not in seen:
            seen.add(s)
            deduped.append(s)

    if not deduped:
        return

    # Fetch existing (live) slugs from the database.
    res = await session.execute(
        select(ListingItem.slug)
        .where(ListingItem.listing_id == listing.id, ListingItem.deleted_at.is_(None))
    )
    existing_slugs = set(res.scalars().all())

    now = datetime.utcnow()

    # Slugs to soft-delete (present in DB but not in the new data).
    to_delete = existing_slugs - seen
    if to_delete:
        await session.execute(
            update(ListingItem)
            .where(
                ListingItem.listing_id == listing.id,
                ListingItem.slug.in_(to_delete),
                ListingItem.deleted_at.is_(None)
            )
            .values(deleted_at=now)
        )

    # Slugs to insert (new ones not in DB).
    to_insert = seen - existing_slugs
    if to_insert:
        stmt = pg_insert(ListingItem).values(
            [{"listing_id": listing.id, "slug": s} for s in to_insert]
        )
        # NOTE(review): on_conflict_do_nothing was disabled here; if a unique
        # constraint on (listing_id, slug) also covers soft-deleted rows this
        # insert can raise IntegrityError — confirm against the schema.
        # .on_conflict_do_nothing(constraint="uq_listing_items_listing_slug")
        await session.execute(stmt)


async def _nav_ids_from_list_url(session: AsyncSession, list_url: str) -> Tuple[int, Optional[int]]:
    """Resolve a listing URL path (/<top>/<sub>[.html]) to (top_id, sub_id).

    The sub segment may carry a ``.html``/``.htm`` suffix which is stripped
    before lookup. Raises ValueError (via _get_nav_ids) when either slug is
    unknown.
    """
    parts = [x for x in (urlparse(list_url).path or "").split("/") if x]
    top_slug = parts[0].lower() if parts else ""
    sub_slug = None
    if len(parts) >= 2:
        sub_slug = parts[1]
        if sub_slug.lower().endswith((".html", ".htm")):
            # BUGFIX: the original pattern was r"\\.(html?|HTML?)$" — the
            # doubled backslash in a raw string matches a literal backslash
            # followed by any character, so the suffix was never stripped and
            # the NavSub lookup below failed. Match the (case-insensitive)
            # extension properly.
            sub_slug = re.sub(r"\.html?$", "", sub_slug, flags=re.IGNORECASE)
    return await _get_nav_ids(session, top_slug, sub_slug)


async def _get_nav_ids(session: AsyncSession, top_slug: str, sub_slug: Optional[str]) -> Tuple[int, Optional[int]]:
    """Look up live NavTop/NavSub ids by slug; raise ValueError when absent."""
    res_top = await session.execute(select(NavTop.id).where(NavTop.slug == top_slug, NavTop.deleted_at.is_(None)))
    top_id = res_top.scalar_one_or_none()
    if not top_id:
        raise ValueError(f"NavTop not found for slug: {top_slug}")

    sub_id = None
    if sub_slug:
        res_sub = await session.execute(
            select(NavSub.id).where(NavSub.slug == sub_slug, NavSub.top_id == top_id, NavSub.deleted_at.is_(None))
        )
        sub_id = res_sub.scalar_one_or_none()
        if sub_id is None:
            raise ValueError(f"NavSub not found for slug: {sub_slug} under top_id={top_id}")

    return top_id, sub_id


async def log_product_result(ok: bool, payload: Dict) -> None:
    """Persist one ProductLog row in its own session/transaction."""
    async with get_session() as session:
        await _log_product_result(session, ok, payload)
        await session.commit()


async def _log_product_result(session: AsyncSession, ok: bool, payload: Dict) -> None:
    """Add a ProductLog row to ``session`` from the scraper's result payload.

    Missing payload keys are stored as NULL; the caller commits.
    """
    session.add(ProductLog(
        ok=ok,
        slug=payload.get("slug"),
        href_tried=payload.get("href_tried"),
        error_type=payload.get("error_type"),
        error_message=payload.get("error_message"),
        http_status=payload.get("http_status"),
        final_url=payload.get("final_url"),
        transport_error=payload.get("transport_error"),
        title=payload.get("title"),
        has_description_html=payload.get("has_description_html"),
        has_description_short=payload.get("has_description_short"),
        sections_count=payload.get("sections_count"),
        images_count=payload.get("images_count"),
        embedded_images_count=payload.get("embedded_images_count"),
        all_images_count=payload.get("all_images_count"),
    ))
async def save_link_reports(link_errors: List[Dict], link_externals: List[Dict]) -> None:
    """Persist broken-link and external-link reports from a scrape run.

    Each dict is stored verbatim into LinkError / LinkExternal; missing keys
    become NULL columns. One session, one commit for the whole batch.
    """
    async with get_session() as session:
        for e in link_errors:
            session.add(LinkError(
                product_slug=e.get("product"), href=e.get("href"), text=e.get("text"),
                top=e.get("top"), sub=e.get("sub"), target_slug=e.get("target_slug"), type=e.get("type"),
            ))
        for e in link_externals:
            session.add(LinkExternal(
                product_slug=e.get("product"), href=e.get("href"), text=e.get("text"), host=e.get("host"),
            ))
        await session.commit()


async def save_nav(nav: Dict) -> None:
    """Sync the scraped navigation tree into NavTop/NavSub and commit."""
    async with get_session() as session:
        await _save_nav(session, nav)
        await session.commit()

async def _save_nav(session, nav: Dict) -> None:
    """Soft-delete nav rows absent from ``nav`` and upsert the rest.

    ``nav`` shape (as consumed here): {"cats": {label: {"slug": ..., "subs":
    [{"slug": ..., "label": ..., "href": ...}, ...]}}}.

    NOTE(review): debug prints left in deliberately? Consider switching to
    logging before production use.
    NOTE(review): if ``nav`` is empty, every live NavTop/NavSub row is
    soft-deleted — confirm that is the intended behavior for a failed scrape.
    """
    print('===================SAVE NAV========================')
    print(nav)
    now = datetime.utcnow()

    incoming_top_slugs = set()
    incoming_sub_keys = set()  # (top_slug, sub_slug)

    # First pass: collect slugs
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        incoming_top_slugs.add(top_slug)

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if sub_slug:
                incoming_sub_keys.add((top_slug, sub_slug))

    # Soft-delete stale NavSub entries
    # This requires joining NavTop to access top_slug
    subs_to_delete = await session.execute(
        select(NavSub)
        .join(NavTop, NavSub.top_id == NavTop.id)
        .where(
            NavSub.deleted_at.is_(None),
            ~tuple_(NavTop.slug, NavSub.slug).in_(incoming_sub_keys)
        )
    )
    for sub in subs_to_delete.scalars():
        sub.deleted_at = now

    # Soft-delete stale NavTop entries
    tops_to_delete = await session.execute(
        select(NavTop)
        .where(
            NavTop.deleted_at.is_(None),
            ~NavTop.slug.in_(incoming_top_slugs)
        )
    )
    for top in tops_to_delete.scalars():
        top.deleted_at = now

    await session.flush()

    # Upsert NavTop and NavSub. Matching ignores deleted_at so previously
    # soft-deleted rows are revived (deleted_at = None) rather than duplicated.
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue

        res = await session.execute(
            select(NavTop).where(NavTop.slug == top_slug)
        )
        top = res.scalar_one_or_none()

        if top:
            top.label = label
            top.deleted_at = None
        else:
            top = NavTop(label=label, slug=top_slug)
            session.add(top)

        # Flush so a freshly created top has an id for the NavSub rows below.
        await session.flush()

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if not sub_slug:
                continue
            sub_label = s.get("label")
            sub_href = s.get("href")

            res_sub = await session.execute(
                select(NavSub).where(
                    NavSub.slug == sub_slug,
                    NavSub.top_id == top.id
                )
            )
            sub = res_sub.scalar_one_or_none()
            if sub:
                sub.label = sub_label
                sub.href = sub_href
                sub.deleted_at = None
            else:
                session.add(NavSub(top_id=top.id, label=sub_label, slug=sub_slug, href=sub_href))
async def save_subcategory_redirects(mapping: Dict[str, str]) -> None:
    """Replace the live set of subcategory redirects with ``mapping`` and commit."""
    async with get_session() as session:
        await _save_subcategory_redirects(session, mapping)
        await session.commit()


async def _save_subcategory_redirects(session, mapping: Dict[str, str]) -> None:
    """Soft-delete all live SubcategoryRedirect rows, then insert ``mapping``.

    ``mapping`` maps old URL path -> new URL path; the caller commits.
    """
    stamp = datetime.utcnow()
    await session.execute(
        update(SubcategoryRedirect)
        .where(SubcategoryRedirect.deleted_at.is_(None))
        .values(deleted_at=stamp)
    )
    for old_path, new_path in mapping.items():
        session.add(SubcategoryRedirect(old_path=old_path, new_path=new_path))
async def _upsert_product(session: AsyncSession, d: Dict) -> Product:
    """Create or update a Product (matched by slug) plus all of its child rows.

    Every child table (sections, images, labels, stickers, attributes,
    nutrition, allergens) is diffed against the scraped dict ``d``: rows
    missing from ``d`` are soft-deleted (``deleted_at`` set), new rows are
    inserted, unchanged rows are left alone.

    Raises ValueError when ``d`` has no slug. Returns the persistent Product.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    res = await session.execute(select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)))
    p = res.scalar_one_or_none()
    if not p:
        p = Product(slug=slug)
        session.add(p)
        # BUGFIX: flush immediately so the new row is assigned a primary key.
        # Every child-table sync below filters and inserts by p.id, which
        # would otherwise still be None for a freshly created product.
        await session.flush()

    # Scalar fields copied straight from the scraped dict.
    p.title = _get(d, "title")
    p.image = _get(d, "image")
    p.description_short = _get(d, "description_short")
    p.description_html = _get(d, "description_html")
    p.suma_href = _get(d, "suma_href")
    p.brand = _get(d, "brand")
    p.rrp = _get(d, "rrp")
    p.rrp_currency = _get(d, "rrp_currency")
    p.rrp_raw = _get(d, "rrp_raw")
    p.price_per_unit = _get(d, "price_per_unit")
    p.price_per_unit_currency = _get(d, "price_per_unit_currency")
    p.price_per_unit_raw = _get(d, "price_per_unit_raw")
    p.special_price = _get(d, "special_price")
    p.special_price_currency = _get(d, "special_price_currency")
    p.special_price_raw = _get(d, "special_price_raw")
    p.regular_price = _get(d, "regular_price")
    p.regular_price_currency = _get(d, "regular_price_currency")
    p.regular_price_raw = _get(d, "regular_price_raw")
    p.case_size_count = _get(d, "case_size_count")
    p.case_size_item_qty = _get(d, "case_size_item_qty")
    p.case_size_item_unit = _get(d, "case_size_item_unit")
    p.case_size_raw = _get(d, "case_size_raw")
    p.ean = d.get("ean") or d.get("barcode") or None
    p.sku = d.get("sku")
    p.unit_size = d.get("unit_size")
    p.pack_size = d.get("pack_size")
    p.updated_at = func.now()

    # Single timestamp for all soft-deletes in this sync.
    now = datetime.utcnow()

    # ProductSection sync — identity is the (title, html) pair.
    existing_sections = await session.execute(select(ProductSection).where(ProductSection.product_id == p.id, ProductSection.deleted_at.is_(None)))
    existing_sections_set = {(s.title, s.html) for s in existing_sections.scalars()}

    new_sections_set = set()
    for sec in d.get("sections") or []:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            new_sections_set.add((sec["title"], sec["html"]))
            if (sec["title"], sec["html"]) not in existing_sections_set:
                session.add(ProductSection(product_id=p.id, title=sec["title"], html=sec["html"]))

    for s in existing_sections_set - new_sections_set:
        await session.execute(update(ProductSection).where(ProductSection.product_id == p.id, ProductSection.title == s[0], ProductSection.html == s[1], ProductSection.deleted_at.is_(None)).values(deleted_at=now))

    # ProductImage sync — identity is (url, kind).
    existing_images = await session.execute(select(ProductImage).where(ProductImage.product_id == p.id, ProductImage.deleted_at.is_(None)))
    existing_images_set = {(img.url, img.kind) for img in existing_images.scalars()}

    new_images_set = set()
    for kind, urls in [
        ("gallery", d.get("images") or []),
        ("embedded", d.get("embedded_image_urls") or []),
        ("all", d.get("all_image_urls") or []),
    ]:
        for idx, url in enumerate(urls):
            if url:
                new_images_set.add((url, kind))
                if (url, kind) not in existing_images_set:
                    session.add(ProductImage(product_id=p.id, url=url, position=idx, kind=kind))

    for img in existing_images_set - new_images_set:
        await session.execute(update(ProductImage).where(ProductImage.product_id == p.id, ProductImage.url == img[0], ProductImage.kind == img[1], ProductImage.deleted_at.is_(None)).values(deleted_at=now))

    # ProductLabel sync — identity is the stripped name.
    existing_labels = await session.execute(select(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.deleted_at.is_(None)))
    existing_labels_set = {label.name.strip() for label in existing_labels.scalars()}

    new_labels = {str(name).strip() for name in (d.get("labels") or []) if name}

    for name in new_labels - existing_labels_set:
        session.add(ProductLabel(product_id=p.id, name=name))

    for name in existing_labels_set - new_labels:
        await session.execute(update(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.name == name, ProductLabel.deleted_at.is_(None)).values(deleted_at=now))

    # ProductSticker sync.
    # NOTE(review): incoming names are lowered but existing DB names are not,
    # so mixed-case legacy rows are soft-deleted and re-inserted lowercase on
    # the first run after this change — self-healing, but confirm intended.
    existing_stickers = await session.execute(select(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.deleted_at.is_(None)))
    existing_stickers_set = {sticker.name.strip() for sticker in existing_stickers.scalars()}

    new_stickers = {str(name).strip().lower() for name in (d.get("stickers") or []) if name}

    for name in new_stickers - existing_stickers_set:
        session.add(ProductSticker(product_id=p.id, name=name))

    for name in existing_stickers_set - new_stickers:
        await session.execute(update(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.name == name, ProductSticker.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAttribute sync — keys are namespaced "info_table/..." or
    # "oe_list_price/..." so the two source tables cannot collide.
    existing_attrs = await session.execute(select(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.deleted_at.is_(None)))
    existing_attrs_set = {(a.key, a.value) for a in existing_attrs.scalars()}

    new_attrs_set = set()
    for src, prefix in [(d.get("info_table") or {}, "info_table"), (d.get("oe_list_price") or {}, "oe_list_price")]:
        for k, v in src.items():
            key = f"{prefix}/{str(k).strip()}"
            val = None if v is None else str(v)
            new_attrs_set.add((key, val))
            if (key, val) not in existing_attrs_set:
                session.add(ProductAttribute(product_id=p.id, key=key, value=val))

    for key, val in existing_attrs_set - new_attrs_set:
        await session.execute(update(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.key == key, ProductAttribute.value == val, ProductAttribute.deleted_at.is_(None)).values(deleted_at=now))

    # ProductNutrition sync — accepts either a flat {key: value} dict or a
    # list of {"key", "value", "unit"} rows.
    existing_nuts = await session.execute(select(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.deleted_at.is_(None)))
    existing_nuts_set = {(n.key, n.value, n.unit) for n in existing_nuts.scalars()}

    new_nuts_set = set()
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        for k, v in nutrition.items():
            key, val = str(k).strip(), str(v) if v is not None else None
            new_nuts_set.add((key, val, None))
            if (key, val, None) not in existing_nuts_set:
                session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=None))
    elif isinstance(nutrition, list):
        for row in nutrition:
            try:
                key = str(row.get("key") or "").strip()
                val = None if row.get("value") is None else str(row.get("value"))
                unit = None if row.get("unit") is None else str(row.get("unit"))
                if key:
                    new_nuts_set.add((key, val, unit))
                    if (key, val, unit) not in existing_nuts_set:
                        session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=unit))
            except Exception:
                # Skip malformed rows (e.g. non-dict entries) rather than
                # failing the whole product upsert.
                continue

    for key, val, unit in existing_nuts_set - new_nuts_set:
        await session.execute(update(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.key == key, ProductNutrition.value == val, ProductNutrition.unit == unit, ProductNutrition.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAllergen sync — accepts plain names (contains=True) or dicts.
    existing_allergens = await session.execute(select(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.deleted_at.is_(None)))
    existing_allergens_set = {(a.name, a.contains) for a in existing_allergens.scalars()}

    new_allergens_set = set()
    for a in d.get("allergens") or []:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            new_allergens_set.add((nm, contains))
            if (nm, contains) not in existing_allergens_set:
                session.add(ProductAllergen(product_id=p.id, name=nm, contains=contains))

    for name, contains in existing_allergens_set - new_allergens_set:
        await session.execute(update(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.name == name, ProductAllergen.contains == contains, ProductAllergen.deleted_at.is_(None)).values(deleted_at=now))

    await session.flush()
    return p
async def upsert_product(
    slug,
    href,
    d,
):
    """Upsert one scraped product dict ``d`` and record a ProductLog row.

    On success the product and a success log are committed together. On
    failure the failed transaction is rolled back, a failure log is committed
    in its place, and the original exception is re-raised.
    """
    async with get_session() as session:
        try:
            await _upsert_product(session, d)
            await _log_product_result(session, ok=True, payload={
                "slug": slug,
                "href_tried": href,
                "title": d.get("title"),
                "has_description_html": bool(d.get("description_html")),
                "has_description_short": bool(d.get("description_short")),
                # BUGFIX: guard with `or []` — these keys may be absent and
                # len(None) raised TypeError, masking the real result.
                "sections_count": len(d.get("sections") or []),
                "images_count": len(d.get("images") or []),
                "embedded_images_count": len(d.get("embedded_image_urls") or []),
                "all_images_count": len(d.get("all_image_urls") or []),
            })
            await session.commit()
        except Exception as e:
            print(f"[ERROR] Failed to upsert product '{d.get('slug')}'")
            # BUGFIX: was f"  Title: {d}.get('title')" which printed the whole
            # dict followed by the literal text ".get('title')".
            print(f"  Title: {d.get('title')}")
            print(f"  URL: {d.get('suma_href')}")
            print(f"  Error type: {type(e).__name__}")
            print(f"  Error message: {str(e)}")
            import traceback
            traceback.print_exc()
            # BUGFIX: roll back the failed transaction first — otherwise the
            # session is unusable — then commit the failure log so it is
            # actually persisted (the old code raised before ever committing).
            await session.rollback()
            await _log_product_result(session, ok=False, payload={
                "slug": d.get("slug"),
                "href_tried": d.get("suma_href"),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "title": d.get("title"),
            })
            await session.commit()
            raise
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """Derive top/sub category info from the page's breadcrumb trail.

    Returns {} when no breadcrumb list is found; otherwise always includes
    "category_breadcrumbs" plus category_top_* / category_sub_* /
    "category_path" when those crumbs could be identified.
    """
    trail = (soup.select_one(".breadcrumbs ul.items")
             or soup.select_one("nav.breadcrumbs ul.items")
             or soup.select_one("ul.items"))
    if not trail:
        return {}

    crumbs: List[Dict[str, str]] = []
    for entry in trail.select("li.item"):
        link = entry.find("a")
        if link:
            label = normalize_text(link.get("title") or link.get_text())
            href = link.get("href")
        else:
            label = normalize_text(entry.get_text())
            href = None

        slug = None
        if href:
            try:
                path = (urlparse(href).path or "").strip("/")
                slug = path.rsplit("/", 1)[-1] if path else None
            except Exception:
                slug = None

        # Only crumbs with a derivable slug are kept.
        if slug:
            crumbs.append({"title": label or None, "href": href or None, "slug": slug})

    # First two non-"home" linked crumbs become top and sub categories.
    top = None
    sub = None
    for crumb in [c for c in crumbs if c.get("href")]:
        title_lc = (crumb.get("title") or "").lower()
        slug_lc = (crumb.get("slug") or "").lower()
        if title_lc == "home" or slug_lc in ("", "home"):
            continue
        if top is None:
            top = crumb
        elif sub is None:
            sub = crumb
            break

    result: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        result["category_top_title"] = top.get("title")
        result["category_top_href"] = top.get("href")
        result["category_top_slug"] = top.get("slug")
    if sub:
        result["category_sub_title"] = sub.get("title")
        result["category_sub_href"] = sub.get("href")
        result["category_sub_slug"] = sub.get("slug")
    if top and sub:
        result["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
    return result
@extractor
def ex_description_sections(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the main description HTML and titled content sections.

    Sources, in priority order: the description container itself, then
    Magento tab panes (falling back to a generic heading scan), then the
    "additional attributes" table. Returns {"sections": [...]} and, when
    found, "description_html". Section titles are deduplicated
    case-insensitively and filtered through the heading blacklist.
    """
    description_html = None
    sections: List[Dict] = []
    # Primary source: the dedicated description container, split into an
    # opening fragment plus titled sub-sections.
    desc_el = find_description_container(soup)
    if desc_el:
        open_html, sections_from_desc = split_description_container(desc_el)
        description_html = open_html or None
        sections.extend(sections_from_desc)

    existing = {s["title"].lower() for s in sections}
    for t, html_fragment in (pair_title_content_from_magento_tabs(soup) or scan_headings_for_sections(soup)):
        low = t.lower()
        # Description-like tabs feed description_html rather than sections.
        if "product description" in low or low == "description" or "details" in low:
            if not description_html and html_fragment:
                description_html = absolutize_fragment(html_fragment)
            continue
        # Keep only new, non-empty, non-blacklisted sections.
        if t.lower() not in existing and normalize_text(BeautifulSoup(html_fragment, "lxml").get_text()):
            if not is_blacklisted_heading(t):
                sections.append({"title": clean_title(t), "html": absolutize_fragment(html_fragment)})
                existing.add(t.lower())
    addl = additional_attributes_table(soup)
    if addl and "additional information" not in existing and not is_blacklisted_heading("additional information"):
        sections.append({"title": "Additional Information", "html": addl})
    out = {"sections": sections}
    if description_html:
        out["description_html"] = description_html
    return out


@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Collect product image URLs, trying three sources in priority order:
    the Magento gallery init JSON, JSON-LD metadata, then a DOM scan.

    Returns {"images": [...], "image": first_or_None}; URLs are absolutized
    and deduplicated by filename.
    """
    images: List[str] = []
    debug = False  # set True while debugging

    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}")

    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        # Correct (not over-escaped) patterns:
        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
        if not m:
            if debug: print("[ex_images] 'data' array not found in gallery block")
            continue

        arr_txt = m.group(1)
        added = False
        try:
            data = json.loads(arr_txt)
            for entry in data:
                # Prefer the full-size URL, fall back to the thumbnail.
                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
                if u:
                    images.append(u); added = True
        except Exception as e:
            if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            # Fallback to simple key extraction
            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
            imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
            for u in (fulls or imgs):
                u = abs_url(u)
                if u:
                    images.append(u); added = True

        if added:
            break  # got what we need from the gallery block

    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except Exception:
                continue

            # "image" may be a string, a list of strings/objects, or an
            # object with a "url" key — handle all three.
            def add_from(val):
                if isinstance(val, str):
                    u = abs_url(val); u and images.append(u)
                elif isinstance(val, list):
                    for v in val:
                        if isinstance(v, str):
                            u = abs_url(v); u and images.append(u)
                        elif isinstance(v, dict) and "url" in v:
                            u = abs_url(v["url"]); u and images.append(u)
                elif isinstance(val, dict) and "url" in val:
                    u = abs_url(val["url"]); u and images.append(u)

            if isinstance(data, dict) and "image" in data:
                add_from(data["image"])
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and "image" in item:
                        add_from(item["image"])

    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)

    images = dedup_by_filename(images)
    if debug: print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}
@extractor
def ex_info_table(soup: BeautifulSoup, url: str) -> Dict:
    """Parse the `.product-page-info-table` label/content rows.

    Produces the raw label->value map as "info_table" plus derived fields:
    brand, rrp[_raw|_currency], price_per_unit[_raw|_currency] and the
    case_size_* breakdown. Returns {} when the table is absent or empty.
    """
    container = soup.select_one(".product-page-info-table") or None
    if not container:
        return {}
    rows_parent = container.select_one(".product-page-info-table-rows") or container
    rows = rows_parent.select(".product-page-info-table-row") or []
    if not rows:
        return {}

    table: Dict[str, str] = {}
    for row in rows:
        label_node = row.select_one(".product-page-info-table__label")
        value_node = row.select_one(".product-page-info-table__content")
        if not label_node or not value_node:
            continue
        key = normalize_text(label_node.get_text())
        if key:
            table[key] = normalize_text(value_node.get_text())

    result: Dict[str, Union[str, float, int, Dict]] = {"info_table": table}

    # Brand
    brand = table.get("Brand") or table.get("Brand Name") or None
    if brand:
        result["brand"] = brand

    def _apply_price(text: str, prefix: str) -> None:
        # Shared shape for RRP and per-unit price fields.
        value, currency, raw = parse_price(text)
        if raw and (value is not None or currency is not None):
            result[f"{prefix}_raw"] = raw
        if value is not None:
            result[prefix] = value
        if currency:
            result[f"{prefix}_currency"] = currency

    _apply_price(table.get("RRP", ""), "rrp")
    _apply_price(
        table.get("Price Per Unit", "") or table.get("Unit Price", ""),
        "price_per_unit",
    )

    # Case / pack size breakdown
    cs_count, cs_item_qty, cs_item_unit, cs_raw = parse_case_size(
        table.get("Case Size", "") or table.get("Pack Size", "")
    )
    if cs_raw:
        result["case_size_raw"] = cs_raw
    if cs_count is not None:
        result["case_size_count"] = cs_count
    if cs_item_qty is not None:
        result["case_size_item_qty"] = cs_item_qty
    if cs_item_unit:
        result["case_size_item_unit"] = cs_item_unit

    return result
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product labels from the `ul.cdz-product-labels` widget.

    Labels come from two places: extra CSS classes on each `li.label-item`
    (excluding "label-item" itself) and the item's visible text. The result
    is their order-preserving, lower-cased union under "labels"; {} when the
    widget is missing or yields nothing.
    """
    container = soup.select_one("ul.cdz-product-labels")
    if not container:
        return {}

    class_hints: List[str] = []
    visible_texts: List[str] = []
    for entry in container.select("li.label-item"):
        for cls in (entry.get("class") or []):
            cls = (cls or "").strip()
            if cls and cls.lower() != "label-item" and cls not in class_hints:
                class_hints.append(cls)
        text = normalize_text(entry.get_text())
        if text and text not in visible_texts:
            visible_texts.append(text)

    if not class_hints and not visible_texts:
        return {}

    merged: List[str] = []
    seen_keys = set()
    for candidate in class_hints + [t.lower() for t in visible_texts]:
        key = (candidate or "").strip().lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            merged.append(key)
    return {"labels": merged}
# Matches a number optionally followed by a unit, e.g. "45 kcal", "12.5g".
# NOTE(review): commas are later stripped as thousands separators, so a
# European decimal like "1,5" would parse as "15" — confirm source data is
# UK-formatted before relying on this.
_NUM_UNIT_RE = re.compile(
    r"""
    ^\s*
    (?P<num>[-+]?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?|\d+(?:[.,]\d+)?)
    \s*
    (?P<unit>[a-zA-Z%µ/]+)?
    \s*$
    """,
    re.X,
)

def _parse_value_unit(s: str) -> Tuple[Optional[str], Optional[str]]:
    """Split "45 kcal" into ("45", "kcal"); (None, None) when unparseable.

    Normalizes energy-unit spellings to "kcal"/"kJ" and strips thousands
    commas from the numeric part.
    """
    if not s:
        return None, None
    s = re.sub(r"\s+", " ", s.strip())
    m = _NUM_UNIT_RE.match(s)
    if not m:
        return None, None
    num = (m.group("num") or "").replace(",", "")
    unit = m.group("unit") or None
    if unit:
        u = unit.lower()
        if u in {"kcal", "kcal.", "kcalories", "kcalorie"}:
            unit = "kcal"
        elif u in {"kj", "kj.", "kilojoule", "kilojoules"}:
            unit = "kJ"
    return (num or None, unit)

# ----- section finder ---------------------------------------------------------

def _find_nutrition_section_html(soup: BeautifulSoup) -> Optional[str]:
    """
    Return the HTML for the section whose title matches 'Nutritional Information'.
    We look in the same places your description extractor does.
    """
    # 1) Magento tabs
    for t, html in (pair_title_content_from_magento_tabs(soup) or []):
        if not t or not html:
            continue
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    # 2) Description container split into sections
    desc_el = find_description_container(soup)
    if desc_el:
        _open_html, sections = split_description_container(desc_el)
        for sec in sections or []:
            title = normalize_text((sec.get("title") or "")).rstrip(":").lower()
            if "nutritional information" in title:
                return sec.get("html") or ""

    # 3) Fallback: generic heading scan
    for t, html in (scan_headings_for_sections(soup) or []):
        if not t or not html:
            continue
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    return None

# ----- table parser -----------------------------------------------------------

def _extract_rows_from_table(root: BeautifulSoup) -> List[Dict[str, str]]:
    """Parse the first <table> under ``root`` into nutrition rows.

    Accepts either <th>key</th><td>value</td> rows or two-<td> rows. Values
    are split into number + unit where possible, otherwise stored raw with a
    None unit. Duplicate (key, value, unit) rows are dropped, order kept.
    """
    out: List[Dict[str, str]] = []
    table = root.select_one("table")
    if not table:
        return out

    for tr in table.select("tr"):
        th = tr.find("th")
        tds = tr.find_all("td")
        if th and tds:
            key = normalize_text(th.get_text(" ").strip())
            val_raw = normalize_text(tds[0].get_text(" ").strip())
        elif len(tds) >= 2:
            key = normalize_text(tds[0].get_text(" ").strip())
            val_raw = normalize_text(tds[1].get_text(" ").strip())
        else:
            continue

        if not key or not val_raw:
            continue

        value, unit = _parse_value_unit(val_raw)
        if value is None:  # keep raw if not parseable
            value, unit = val_raw, None

        out.append({"key": key, "value": value, "unit": unit})

    # Deduplicate while preserving order
    seen = set()
    dedup: List[Dict[str, str]] = []
    for r in out:
        t = (r["key"], r.get("value"), r.get("unit"))
        if t in seen:
            continue
        seen.add(t)
        dedup.append(r)
    return dedup

# ----- extractor --------------------------------------------------------------

@extractor
def ex_nutrition(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract nutrition ONLY from the section titled 'Nutritional Information'.
    Returns: {"nutrition": [{"key": "...", "value": "...", "unit": "..."}]}
    """
    section_html = _find_nutrition_section_html(soup)
    if not section_html:
        return {"nutrition": []}
    section_soup = BeautifulSoup(section_html, "lxml")
    rows = _extract_rows_from_table(section_soup)
    return {"nutrition": rows}
+
£30.50
+
£23.63
+
+ Produces: + oe_list_price: { rrp_raw, rrp, rrp_currency, special_raw, special, special_currency } + Also promotes special_* to top-level (special_price_*) if available. + """ + box = soup.select_one(".oe-list-price") + if not box: + return {} + out: Dict[str, Union[str, float, dict]] = {} + oe: Dict[str, Union[str, float]] = {} + + # RRP inside oe-list-price (if present) + rrp = box.select_one(".rrp-price") + if rrp: + txt = (rrp.select_one("span.price") or rrp.select_one("span") or rrp).get_text(strip=True) + val, cur, raw = parse_price(txt) + if raw: + oe["rrp_raw"] = raw + if val is not None: + oe["rrp"] = val + if cur: + oe["rrp_currency"] = cur + + # Special Price inside oe-list-price + sp = box.select_one(".oe-final-price, .special-price, .final-price") + if sp: + txt = (sp.select_one("span.price") or sp.select_one("span") or sp).get_text(strip=True) + val, cur, raw = parse_price(txt) + if raw: + oe["special_raw"] = raw + if val is not None: + oe["special"] = val + out["special_price"] = val + if cur: + oe["special_currency"] = cur + out["special_price_currency"] = cur + if raw: + out["special_price_raw"] = raw + + if oe: + out["oe_list_price"] = oe + return out diff --git a/scrape/product/extractors/regular_price_fallback.py b/scrape/product/extractors/regular_price_fallback.py new file mode 100644 index 0000000..2693a90 --- /dev/null +++ b/scrape/product/extractors/regular_price_fallback.py @@ -0,0 +1,33 @@ + +from __future__ import annotations +from typing import Dict, Union +from bs4 import BeautifulSoup +from ..registry import extractor +from ..helpers.price import parse_price + +@extractor +def ex_regular_price_fallback(soup: BeautifulSoup, url: str) -> Dict: + """ + Fallback extractor for legacy 'Regular Price' blocks outside oe-list-price: +
£16.55
+ """ + rrp = soup.select_one("div.rrp-price") + if not rrp: + return {} + span = rrp.select_one("span.price") + price_text = span.get_text(strip=True) if span else rrp.get_text(" ", strip=True) + value, currency, raw = parse_price(price_text or "") + out: Dict[str, Union[str, float]] = {} + if raw: + out["regular_price_raw"] = raw + if value is not None: + out["regular_price"] = value + if currency: + out["regular_price_currency"] = currency + if value is not None: + out.setdefault("rrp", value) + if currency: + out.setdefault("rrp_currency", currency) + if raw: + out.setdefault("rrp_raw", raw) + return out diff --git a/scrape/product/extractors/short_description.py b/scrape/product/extractors/short_description.py new file mode 100644 index 0000000..c3d577f --- /dev/null +++ b/scrape/product/extractors/short_description.py @@ -0,0 +1,19 @@ + +from __future__ import annotations +from typing import Dict +from bs4 import BeautifulSoup +from utils import normalize_text +from ..registry import extractor + +@extractor +def ex_short_description(soup: BeautifulSoup, url: str) -> Dict: + desc_short = None + for sel in [".product.attribute.description .value", ".product.attribute.overview .value", + "meta[name='description']", "meta[property='og:description']"]: + el = soup.select_one(sel) + if not el: + continue + desc_short = normalize_text(el.get_text() if el.name != "meta" else el.get("content")) + if desc_short: + break + return {"description_short": desc_short} diff --git a/scrape/product/extractors/stickers.py b/scrape/product/extractors/stickers.py new file mode 100644 index 0000000..6bd7444 --- /dev/null +++ b/scrape/product/extractors/stickers.py @@ -0,0 +1,30 @@ + +from __future__ import annotations +from typing import Dict, List +from bs4 import BeautifulSoup +from ..registry import extractor + +@extractor +def ex_stickers(soup: BeautifulSoup, url: str) -> Dict: + """ +
+ + ... +
+ """ + root = soup.select_one("div.stickers") + if not root: + return {"stickers": []} + stickers: List[str] = [] + seen = set() + for sp in root.select("span.sticker"): + classes = sp.get("class") or [] + extras = [c.strip() for c in classes if c and c.lower() != "sticker"] + data_name = (sp.get("data-sticker") or "").strip() + if data_name: + extras.append(data_name) + for x in extras: + if x and x not in seen: + seen.add(x) + stickers.append(x) + return {"stickers": stickers} diff --git a/scrape/product/extractors/title.py b/scrape/product/extractors/title.py new file mode 100644 index 0000000..2df3ad9 --- /dev/null +++ b/scrape/product/extractors/title.py @@ -0,0 +1,17 @@ + +from __future__ import annotations +from typing import Dict +from bs4 import BeautifulSoup +from utils import normalize_text +from ..registry import extractor + +@extractor +def ex_title(soup: BeautifulSoup, url: str) -> Dict: + title = None + for sel in ["h1.page-title span", "h1.page-title", "h1.product-name", "meta[property='og:title']"]: + el = soup.select_one(sel) + if el: + title = normalize_text(el.get_text()) if el.name != "meta" else el.get("content") + if title: + break + return {"title": title or "Product"} diff --git a/scrape/product/helpers/desc.py b/scrape/product/helpers/desc.py new file mode 100644 index 0000000..17bface --- /dev/null +++ b/scrape/product/helpers/desc.py @@ -0,0 +1,165 @@ + +from __future__ import annotations +from typing import Dict, List, Optional, Tuple +from bs4 import BeautifulSoup, NavigableString, Tag +from utils import normalize_text +from ...html_utils import absolutize_fragment +from .text import clean_title, is_blacklisted_heading +from config import config + + +def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]: + """ + Extract sections from accordion blocks within the description container. + + Looks for headings with class 'accordion-title' and pairs each with its + next element-sibling having class 'accordion-details'. 
Returns: + - open_html: the remaining description HTML with those accordion blocks removed + - sections: [{"title": ..., "html": ...}, ...] + """ + # Work on an isolated copy to avoid mutating the original DOM + frag = BeautifulSoup(desc_el.decode_contents(), "lxml") + + # Collect candidate (heading, details) pairs without mutating during iteration + pairs: List[Tuple[Tag, Tag]] = [] + for h in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"): + if not isinstance(h, Tag): + continue + title = clean_title((h.get_text() or "").strip()) + if not title: + continue + + # Walk forward siblings until we hit an element; accept the first with 'accordion-details' + sib = h.next_sibling + details: Optional[Tag] = None + while sib is not None: + if isinstance(sib, Tag): + classes = sib.get("class") or [] + if "accordion-details" in classes: + details = sib + break + sib = sib.next_sibling + + if details is not None: + pairs.append((h, details)) + + sections: List[Dict] = [] + + # Extract sections, then remove nodes from frag + for h, details in pairs: + # Pull details HTML + html = details.decode_contents() + # Only keep non-empty (textual) content + if normalize_text(BeautifulSoup(html, "lxml").get_text()): + sections.append({ + "title": clean_title(h.get_text() or ""), + "html": absolutize_fragment(html), + }) + # Remove the matched nodes from the fragment copy + details.decompose() + h.decompose() + + # Whatever remains is the open description html + open_html = absolutize_fragment(str(frag)) if frag else "" + + return open_html, sections + +def pair_title_content_from_magento_tabs(soup: BeautifulSoup): + out = [] + container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items") + if not container: + return out + titles = container.select(".data.item.title") + for t in titles: + title = normalize_text(t.get_text()) + if not title: + continue + content_id = 
t.get("aria-controls") or t.get("data-target") + content = soup.select_one(f"#{content_id}") if content_id else None + if content is None: + sib = t.find_next_sibling( + lambda x: isinstance(x, Tag) and "data" in x.get("class", []) and "item" in x.get("class", []) and "content" in x.get("class", []) + ) + content = sib + if content: + html = content.decode_contents() + if not is_blacklisted_heading(title): + out.append((title, absolutize_fragment(html))) + return out + +def scan_headings_for_sections(soup: BeautifulSoup): + out = [] + container = ( + soup.select_one(".product.info.detailed") + or soup.select_one(".product-info-main") + or soup.select_one(".page-main") + or soup + ) + heads = container.select("h2, h3, h4, h5, h6") + section_titles = (config().get("section-titles") or []) + for h in heads: + title = clean_title(h.get_text() or "") + if not title: + continue + low = title.lower() + if not any(k in low for k in section_titles + ["product description", "description", "details"]): + continue + parts: List[str] = [] + for sib in h.next_siblings: + if isinstance(sib, NavigableString): + parts.append(str(sib)) + continue + if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"): + break + if isinstance(sib, Tag): + parts.append(str(sib)) + html = absolutize_fragment("".join(parts).strip()) + if html and not is_blacklisted_heading(title): + out.append((title, html)) + return out + +def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]: + table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table") + if not table: + return None + try: + rows = [] + for tr in table.select("tr"): + th = tr.find("th") or tr.find("td") + tds = tr.find_all("td") + key = normalize_text(th.get_text()) if th else None + val = normalize_text(tds[-1].get_text()) if tds else None + if key and val: + rows.append((key, val)) + if not rows: + return None + items = "\n".join( + [ + f"""
+
{key}
+
{val}
+
""" + for key, val in rows + ] + ) + return f"
{items}
" + except Exception: + return None + +def find_description_container(soup: BeautifulSoup) -> Optional[Tag]: + for sel in ["#description", "#tab-description", ".product.attribute.description .value", + ".product.attribute.overview .value", ".product.info.detailed .value"]: + el = soup.select_one(sel) + if el and normalize_text(el.get_text()): + return el + for h in soup.select("h2, h3, h4, h5, h6"): + txt = normalize_text(h.get_text()).lower() + if txt.startswith("product description") or txt == "description": + wrapper = soup.new_tag("div") + for sib in h.next_siblings: + if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"): + break + wrapper.append(sib if isinstance(sib, Tag) else NavigableString(str(sib))) + if normalize_text(wrapper.get_text()): + return wrapper + return None diff --git a/scrape/product/helpers/html.py b/scrape/product/helpers/html.py new file mode 100644 index 0000000..d334983 --- /dev/null +++ b/scrape/product/helpers/html.py @@ -0,0 +1,53 @@ + +from __future__ import annotations +from typing import List, Optional +from urllib.parse import urljoin, urlparse +from config import config + +def first_from_srcset(val: str) -> Optional[str]: + if not val: + return None + first = val.split(",")[0].strip() + parts = first.split() + return parts[0] if parts else first + +def abs_url(u: Optional[str]) -> Optional[str]: + if not u: + return None + return urljoin(config()["base_url"], u) if isinstance(u, str) and u.startswith("/") else u + +def collect_img_candidates(el) -> List[str]: + urls: List[str] = [] + if not el: + return urls + attrs = ["src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href"] + for a in attrs: + v = el.get(a) + if v: + urls.append(v) + for a in ["srcset", "data-srcset"]: + v = el.get(a) + if v: + first = first_from_srcset(v) + if first: + urls.append(first) + return urls + +def _filename_key(u: str) -> str: + p = urlparse(u) + path = p.path or "" + if path.endswith("/"): + path = 
path[:-1] + last = path.split("/")[-1] + return f"{p.netloc}:{last}".lower() + +def dedup_by_filename(urls: List[str]) -> List[str]: + seen = set() + out: List[str] = [] + for u in urls: + k = _filename_key(u) + if k in seen: + continue + seen.add(k) + out.append(u) + return out diff --git a/scrape/product/helpers/price.py b/scrape/product/helpers/price.py new file mode 100644 index 0000000..68aad1b --- /dev/null +++ b/scrape/product/helpers/price.py @@ -0,0 +1,42 @@ + +from __future__ import annotations +import re +from typing import Optional, Tuple + +def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]: + """ + Return (value, currency, raw) from a price-like string. + Supports symbols £, €, $; strips thousands commas. + """ + raw = (text or "").strip() + m = re.search(r'([£€$])?\s*([0-9][0-9.,]*)', raw) + if not m: + return None, None, raw + sym = m.group(1) or "" + num = m.group(2).replace(",", "") + try: + value = float(num) + except ValueError: + return None, None, raw + currency = {"£": "GBP", "€": "EUR", "$": "USD"}.get(sym, None) + return value, currency, raw + +def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]: + """ + Parse strings like "6 x 500g", "12x1L", "24 × 330 ml" + Returns (count, item_qty, item_unit, raw) + """ + raw = (text or "").strip() + if not raw: + return None, None, None, raw + t = re.sub(r"[×Xx]\s*", " x ", raw) + m = re.search(r"(\d+)\s*x\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)", t) + if not m: + return None, None, None, raw + count = int(m.group(1)) + try: + item_qty = float(m.group(2)) + except ValueError: + item_qty = None + unit = m.group(3) + return count, item_qty, unit, raw diff --git a/scrape/product/helpers/text.py b/scrape/product/helpers/text.py new file mode 100644 index 0000000..1339331 --- /dev/null +++ b/scrape/product/helpers/text.py @@ -0,0 +1,16 @@ + +from __future__ import annotations +import re +from utils import normalize_text +from config import config + +def 
clean_title(t: str) -> str: + t = normalize_text(t) + t = re.sub(r":\s*$", "", t) + return t + +def is_blacklisted_heading(title: str) -> bool: + """Return True if heading should be skipped based on config blacklist.""" + bl = (config().get("blacklist") or {}).get("product-details") or [] + low = (title or "").strip().lower() + return any(low == (s or "").strip().lower() for s in bl) diff --git a/scrape/product/product_core.py b/scrape/product/product_core.py new file mode 100644 index 0000000..39c03e5 --- /dev/null +++ b/scrape/product/product_core.py @@ -0,0 +1,48 @@ + +from __future__ import annotations +from typing import Dict, Tuple, Union +from utils import soup_of +from ..http_client import fetch +from ..html_utils import absolutize_fragment +from suma_browser.app.bp.browse.services.slugs import product_slug_from_href +from .registry import REGISTRY, merge_missing +from . import extractors as _auto_register # noqa: F401 (import-time side effects) + +async def scrape_product_detail(product_url: str, include_html: bool = False) -> Union[dict, Tuple[dict, str]]: + """ + Returns a dict with fields (subset): + title, images, image, description_short, description_html, sections, + slug, suma_href, stickers, labels, info_table fields, oe_list_price, prices, + breadcrumbs-derived category_* fields. + If include_html=True, returns (data, html). + """ + html = await fetch(product_url) + + + data: Dict[str, Union[str, float, int, list, dict, None]] = { + "suma_href": product_url, + "slug": product_slug_from_href(product_url), + } + + # Run all extractors + for fn in REGISTRY: + try: + soup = soup_of(html) + piece = fn(soup, product_url) or {} + except Exception: + # Tolerate site drift + continue + merge_missing(data, piece) + # If we found short description but not description_html, echo it + if not data.get("description_html") and data.get("description_short"): + data["description_html"] = absolutize_fragment(f"

{data['description_short']}

") + + # Ensure "image" mirrors first of images if not set + if not data.get("image"): + imgs = data.get("images") or [] + if isinstance(imgs, list) and imgs: + data["image"] = imgs[0] + + if include_html: + return data, html + return data diff --git a/scrape/product/product_detail.py b/scrape/product/product_detail.py new file mode 100644 index 0000000..705d35b --- /dev/null +++ b/scrape/product/product_detail.py @@ -0,0 +1,4 @@ + +from __future__ import annotations +# Thin wrapper to keep import path stable +from .product_core import scrape_product_detail # re-export diff --git a/scrape/product/registry.py b/scrape/product/registry.py new file mode 100644 index 0000000..53cabc4 --- /dev/null +++ b/scrape/product/registry.py @@ -0,0 +1,20 @@ + +from __future__ import annotations +from typing import Callable, Dict, List, Union + +Extractor = Callable[[object, str], Dict[str, Union[str, float, int, list, dict, None]]] +REGISTRY: List[Extractor] = [] + +def extractor(fn: Extractor) -> Extractor: + """Decorator to register an extractor.""" + REGISTRY.append(fn) + return fn + +def merge_missing(dst: dict, src: dict) -> None: + """ + Merge src into dst. Only write keys that are missing or empty in dst. + "Empty" means None, "", [], {}. 
+ """ + for k, v in (src or {}).items(): + if k not in dst or dst[k] in (None, "", [], {}): + dst[k] = v diff --git a/templates/_types/browse/_admin.html b/templates/_types/browse/_admin.html new file mode 100644 index 0000000..053691d --- /dev/null +++ b/templates/_types/browse/_admin.html @@ -0,0 +1,7 @@ +{% import "macros/links.html" as links %} +{% if g.rights.admin %} + {% from 'macros/admin_nav.html' import admin_nav_item %} + {{admin_nav_item( + url_for('market.browse.product.admin', slug=slug) + )}} +{% endif %} \ No newline at end of file diff --git a/templates/_types/browse/_main_panel.html b/templates/_types/browse/_main_panel.html new file mode 100644 index 0000000..8640ce8 --- /dev/null +++ b/templates/_types/browse/_main_panel.html @@ -0,0 +1,5 @@ + +
+ {% include "_types/browse/_product_cards.html" %} +
+
diff --git a/templates/_types/browse/_oob_elements.html b/templates/_types/browse/_oob_elements.html new file mode 100644 index 0000000..d32fd78 --- /dev/null +++ b/templates/_types/browse/_oob_elements.html @@ -0,0 +1,37 @@ +{% extends 'oob_elements.html' %} + +{# OOB elements for HTMX navigation - all elements that need updating #} + +{# Import shared OOB macros #} +{% from '_types/root/header/_oob.html' import root_header_start, root_header_end with context %} +{% from '_types/root/_oob_menu.html' import mobile_menu with context %} + +{# Header with app title - includes cart-mini, navigation, and market-specific header #} + +{% block oobs %} + + {% from '_types/root/_n/macros.html' import oob_header with context %} + {{oob_header('root-header-child', 'market-header-child', '_types/market/header/_header.html')}} + + {% from '_types/root/header/_header.html' import header_row with context %} + {{ header_row(oob=True) }} +{% endblock %} + + +{% block mobile_menu %} + {% include '_types/market/mobile/_nav_panel.html' %} +{% endblock %} + +{# Filter container with child summary - from browse/index.html child_summary block #} +{% block filter %} + {% include "_types/browse/mobile/_filter/summary.html" %} +{% endblock %} + +{% block aside %} + {% include "_types/browse/desktop/menu.html" %} +{% endblock %} + + +{% block content %} + {% include "_types/browse/_main_panel.html" %} +{% endblock %} diff --git a/templates/_types/browse/_product_card.html b/templates/_types/browse/_product_card.html new file mode 100644 index 0000000..f9bc980 --- /dev/null +++ b/templates/_types/browse/_product_card.html @@ -0,0 +1,104 @@ +{% import 'macros/stickers.html' as stick %} +{% import '_types/product/prices.html' as prices %} +{% set prices_ns = namespace() %} +{{ prices.set_prices(p, prices_ns) }} +{% set item_href = url_for('market.browse.product.product_detail', slug=p.slug)|host %} + \ No newline at end of file diff --git a/templates/_types/browse/_product_cards.html 
b/templates/_types/browse/_product_cards.html new file mode 100644 index 0000000..cc8edb3 --- /dev/null +++ b/templates/_types/browse/_product_cards.html @@ -0,0 +1,107 @@ +{% for p in products %} + {% include "_types/browse/_product_card.html" %} +{% endfor %} +{% if page < total_pages|int %} + + + + + +{% else %} +
End of results
+{% endif %} + diff --git a/templates/_types/browse/desktop/_category_selector.html b/templates/_types/browse/desktop/_category_selector.html new file mode 100644 index 0000000..ba642b7 --- /dev/null +++ b/templates/_types/browse/desktop/_category_selector.html @@ -0,0 +1,40 @@ +{# Categories #} + diff --git a/templates/_types/browse/desktop/_filter/brand.html b/templates/_types/browse/desktop/_filter/brand.html new file mode 100644 index 0000000..616e36e --- /dev/null +++ b/templates/_types/browse/desktop/_filter/brand.html @@ -0,0 +1,40 @@ +{# Brand filter (desktop, single-select) #} + +{# Brands #} + diff --git a/templates/_types/browse/desktop/_filter/labels.html b/templates/_types/browse/desktop/_filter/labels.html new file mode 100644 index 0000000..7a4a41e --- /dev/null +++ b/templates/_types/browse/desktop/_filter/labels.html @@ -0,0 +1,44 @@ + + + +{% import 'macros/stickers.html' as stick %} + + diff --git a/templates/_types/browse/desktop/_filter/like.html b/templates/_types/browse/desktop/_filter/like.html new file mode 100644 index 0000000..c830f98 --- /dev/null +++ b/templates/_types/browse/desktop/_filter/like.html @@ -0,0 +1,38 @@ +{% import 'macros/stickers.html' as stick %} + {% set qs = {"liked": None if liked else True, "page": None}|qs %} + {% set href = (current_local_href ~ qs)|host %} + + {% if liked %} + + {% else %} + + {% endif %} + + {{ liked_count }} + + diff --git a/templates/_types/browse/desktop/_filter/search.html b/templates/_types/browse/desktop/_filter/search.html new file mode 100644 index 0000000..2e0ea8e --- /dev/null +++ b/templates/_types/browse/desktop/_filter/search.html @@ -0,0 +1,44 @@ + +{% macro search(current_local_href,search, search_count, hx_select) -%} + + +
+ + +
+ {% if search %} + {{search_count}} + {% endif %} + {{zap_filter}} +
+
+{% endmacro %} \ No newline at end of file diff --git a/templates/_types/browse/desktop/_filter/sort.html b/templates/_types/browse/desktop/_filter/sort.html new file mode 100644 index 0000000..a4b5404 --- /dev/null +++ b/templates/_types/browse/desktop/_filter/sort.html @@ -0,0 +1,34 @@ + + + + +{% import 'macros/stickers.html' as stick %} +{% set sort_val = sort|default('az', true) %} + +
    + {% for key,label,icon in sort_options %} + {% set is_on = (sort_val == key) %} + {% set qs = {"sort": None, "page": None}|qs if is_on + else {"sort": key, "page": None}|qs %} + {% set href = (current_local_href ~ qs)|host %} + +
  • + + {{ stick.sticker(asset_url(icon), label, is_on) }} + +
  • + {% endfor %} +
diff --git a/templates/_types/browse/desktop/_filter/stickers.html b/templates/_types/browse/desktop/_filter/stickers.html new file mode 100644 index 0000000..46fd22b --- /dev/null +++ b/templates/_types/browse/desktop/_filter/stickers.html @@ -0,0 +1,46 @@ + + + + +{% import 'macros/stickers.html' as stick %} + + diff --git a/templates/_types/browse/desktop/menu.html b/templates/_types/browse/desktop/menu.html new file mode 100644 index 0000000..893cf2d --- /dev/null +++ b/templates/_types/browse/desktop/menu.html @@ -0,0 +1,37 @@ + {% import '_types/browse/desktop/_filter/search.html' as s %} + {{ s.search(current_local_href, search, search_count, hx_select) }} + +
+
+
{{ category_label }}
+
+ {% include "_types/browse/desktop/_filter/sort.html" %} + + + {% if stickers %} + {% include "_types/browse/desktop/_filter/stickers.html" %} + {% endif %} + + + {% if subs_local and top_local_href %} + {% include "_types/browse/desktop/_category_selector.html" %} + {% endif %} + +
+ +
+ + {% include "_types/browse/desktop/_filter/brand.html" %} + +
diff --git a/templates/_types/browse/index.html b/templates/_types/browse/index.html new file mode 100644 index 0000000..015e6b3 --- /dev/null +++ b/templates/_types/browse/index.html @@ -0,0 +1,13 @@ +{% extends '_types/market/index.html' %} + +{% block filter %} + {% include "_types/browse/mobile/_filter/summary.html" %} +{% endblock %} + +{% block aside %} + {% include "_types/browse/desktop/menu.html" %} +{% endblock %} + +{% block content %} + {% include "_types/browse/_main_panel.html" %} +{% endblock %} diff --git a/templates/_types/browse/like/button.html b/templates/_types/browse/like/button.html new file mode 100644 index 0000000..de147a4 --- /dev/null +++ b/templates/_types/browse/like/button.html @@ -0,0 +1,20 @@ + diff --git a/templates/_types/browse/mobile/_filter/brand_ul.html b/templates/_types/browse/mobile/_filter/brand_ul.html new file mode 100644 index 0000000..ac15400 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/brand_ul.html @@ -0,0 +1,40 @@ + \ No newline at end of file diff --git a/templates/_types/browse/mobile/_filter/index.html b/templates/_types/browse/mobile/_filter/index.html new file mode 100644 index 0000000..7c2a615 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/index.html @@ -0,0 +1,30 @@ + + {% include "_types/browse/mobile/_filter/sort_ul.html" %} + {% if search or selected_labels|length or selected_stickers|length or selected_brands|length %} + {% set href = (current_local_href ~ {"clear_filters": True}|qs)|host %} + + {% endif %} +
+ {% include "_types/browse/mobile/_filter/like.html" %} + {% include "_types/browse/mobile/_filter/labels.html" %} +
+ {% include "_types/browse/mobile/_filter/stickers.html" %} + {% include "_types/browse/mobile/_filter/brand_ul.html" %} + diff --git a/templates/_types/browse/mobile/_filter/labels.html b/templates/_types/browse/mobile/_filter/labels.html new file mode 100644 index 0000000..3868d42 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/labels.html @@ -0,0 +1,47 @@ +{% import 'macros/stickers.html' as stick %} + + +{# Optional: hide horizontal scrollbar on mobile while keeping scrollable #} + diff --git a/templates/_types/browse/mobile/_filter/like.html b/templates/_types/browse/mobile/_filter/like.html new file mode 100644 index 0000000..509ea92 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/like.html @@ -0,0 +1,40 @@ +{% import 'macros/stickers.html' as stick %} + \ No newline at end of file diff --git a/templates/_types/browse/mobile/_filter/search.html b/templates/_types/browse/mobile/_filter/search.html new file mode 100644 index 0000000..0f39178 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/search.html @@ -0,0 +1,40 @@ +{% macro search(current_local_href, search, search_count, hx_select) -%} + +
+ + +
+ {% if search %} + {{search_count}} + {% endif %} +
+
+{% endmacro %} \ No newline at end of file diff --git a/templates/_types/browse/mobile/_filter/sort_ul.html b/templates/_types/browse/mobile/_filter/sort_ul.html new file mode 100644 index 0000000..c02de19 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/sort_ul.html @@ -0,0 +1,33 @@ + + + +{% import 'macros/stickers.html' as stick %} + + + \ No newline at end of file diff --git a/templates/_types/browse/mobile/_filter/stickers.html b/templates/_types/browse/mobile/_filter/stickers.html new file mode 100644 index 0000000..fed0927 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/stickers.html @@ -0,0 +1,50 @@ +{% import 'macros/stickers.html' as stick %} + + + +{# Optional: hide horizontal scrollbar on mobile while keeping scrollable #} + diff --git a/templates/_types/browse/mobile/_filter/summary.html b/templates/_types/browse/mobile/_filter/summary.html new file mode 100644 index 0000000..07a86a1 --- /dev/null +++ b/templates/_types/browse/mobile/_filter/summary.html @@ -0,0 +1,120 @@ +{% import 'macros/stickers.html' as stick %} +{% import 'macros/layout.html' as layout %} + + + + +{% call layout.details('/filter', 'md:hidden') %} + {% call layout.filter_summary("filter-summary-mobile", current_local_href, search, search_count, hx_select) %} +
+ + +
+ {% if sort %} +
    + + {% for k,l,i in sort_options %} + {% if k == sort %} + {% set key = k %} + {% set label = l %} + {% set icon = i %} +
  • + {{ stick.sticker(asset_url(icon), label, True)}} +
  • + {% endif %} + {% endfor %} +
+ {% endif %} + {% if liked %} +
+ + {% if liked_count is not none %} +
+ {{ liked_count }} +
+ {% endif %} +
+ {% endif %} + {% if selected_labels and selected_labels|length %} +
    + {% for st in selected_labels %} + {% for s in labels %} + {% if st == s.name %} +
  • + {{ stick.sticker(asset_url('nav-labels/' + s.name + '.svg'), s.name, True)}} + {% if s.count is not none %} +
    + {{ s.count }} +
    + {% endif %} +
  • + {% endif %} + {% endfor %} + {% endfor %} +
+ {% endif %} + {% if selected_stickers and selected_stickers|length %} +
    + {% for st in selected_stickers %} + {% for s in stickers %} + {% if st == s.name %} +
  • + + {{ stick.sticker(asset_url('stickers/' + s.name + '.svg'), s.name, True)}} + {% if s.count is not none %} + + {{ s.count }} + + {% endif %} +
  • + {% endif %} + {% endfor %} + {% endfor %} +
+ {% endif %} +
+ + {% if selected_brands and selected_brands|length %} +
    + {% for b in selected_brands %} +
  • + {% set ns = namespace(count=0) %} + {% for brand in brands %} + {% if brand.name == b %} + {% set ns.count = brand.count %} + {% endif %} + {% endfor %} + {% if ns.count %} +
    {{ b }}
    +
    {{ ns.count }}
    + {% else %} +
    {{ b }}
    +
    0
    + {% endif %} +
  • + {% endfor %} + + +
+ {% endif %} +
+ {% endcall %} +
+ {% include "_types/browse/mobile/_filter/index.html" %} +
+{% endcall %} diff --git a/templates/_types/market/_admin.html b/templates/_types/market/_admin.html new file mode 100644 index 0000000..0b09927 --- /dev/null +++ b/templates/_types/market/_admin.html @@ -0,0 +1,7 @@ +{% import "macros/links.html" as links %} +{% if g.rights.admin %} + {% from 'macros/admin_nav.html' import admin_nav_item %} + {{admin_nav_item( + url_for('market.admin.admin') + )}} +{% endif %} \ No newline at end of file diff --git a/templates/_types/market/_main_panel.html b/templates/_types/market/_main_panel.html new file mode 100644 index 0000000..87bb965 --- /dev/null +++ b/templates/_types/market/_main_panel.html @@ -0,0 +1,23 @@ +{# Main panel fragment for HTMX navigation - market landing page #} +
+ {% if post.custom_excerpt %} +
+ {{post.custom_excerpt|safe}} +
+ {% endif %} + {% if post.feature_image %} +
+ +
+ {% endif %} +
+ {% if post.html %} + {{post.html|safe}} + {% endif %} +
+
+
diff --git a/templates/_types/market/_oob_elements.html b/templates/_types/market/_oob_elements.html new file mode 100644 index 0000000..b37eea0 --- /dev/null +++ b/templates/_types/market/_oob_elements.html @@ -0,0 +1,30 @@ +{% extends 'oob_elements.html' %} + +{# OOB elements for HTMX navigation - all elements that need updating #} + +{# Import shared OOB macros #} +{% from '_types/root/header/_oob.html' import root_header_start, root_header_end with context %} +{% from '_types/root/_oob_menu.html' import mobile_menu with context %} + +{# Header with app title - includes cart-mini, navigation, and market-specific header #} + +{% block oobs %} + + {% from '_types/root/_n/macros.html' import oob_header with context %} + {{oob_header('root-header-child', 'market-header-child', '_types/market/header/_header.html')}} + + {% from '_types/root/header/_header.html' import header_row with context %} + {{ header_row(oob=True) }} +{% endblock %} + + +{% block mobile_menu %} + {% include '_types/market/mobile/_nav_panel.html' %} +{% endblock %} + + +{% block content %} + {% include "_types/market/_main_panel.html" %} +{% endblock %} + + diff --git a/templates/_types/market/_title.html b/templates/_types/market/_title.html new file mode 100644 index 0000000..33e6c67 --- /dev/null +++ b/templates/_types/market/_title.html @@ -0,0 +1,17 @@ +
+
+ + {{ coop_title }} +
+
+
+ {{top_slug or ''}} +
+ {% if sub_slug %} +
+ {{sub_slug}} +
+ {% endif %} +
+
\ No newline at end of file diff --git a/templates/_types/market/admin/_main_panel.html b/templates/_types/market/admin/_main_panel.html new file mode 100644 index 0000000..a354325 --- /dev/null +++ b/templates/_types/market/admin/_main_panel.html @@ -0,0 +1 @@ +market admin \ No newline at end of file diff --git a/templates/_types/market/admin/_nav.html b/templates/_types/market/admin/_nav.html new file mode 100644 index 0000000..f5c504d --- /dev/null +++ b/templates/_types/market/admin/_nav.html @@ -0,0 +1,2 @@ +{% from 'macros/admin_nav.html' import placeholder_nav %} +{{ placeholder_nav() }} diff --git a/templates/_types/market/admin/_oob_elements.html b/templates/_types/market/admin/_oob_elements.html new file mode 100644 index 0000000..9b306fd --- /dev/null +++ b/templates/_types/market/admin/_oob_elements.html @@ -0,0 +1,29 @@ +{% extends 'oob_elements.html' %} + +{# OOB elements for HTMX navigation - all elements that need updating #} + +{# Import shared OOB macros #} +{% from '_types/root/_oob_menu.html' import mobile_menu with context %} + +{# Header with app title - includes cart-mini, navigation, and market-specific header #} + +{% block oobs %} + + {% from '_types/root/_n/macros.html' import oob_header with context %} + {{oob_header('market-header-child', 'market-admin-header-child', '_types/market/admin/header/_header.html')}} + + {% from '_types/market/header/_header.html' import header_row with context %} + {{ header_row(oob=True) }} +{% endblock %} + + +{% block mobile_menu %} + {% include '_types/market/admin/_nav.html' %} +{% endblock %} + + +{% block content %} + {% include "_types/market/admin/_main_panel.html" %} +{% endblock %} + + diff --git a/templates/_types/market/admin/header/_header.html b/templates/_types/market/admin/header/_header.html new file mode 100644 index 0000000..950eefc --- /dev/null +++ b/templates/_types/market/admin/header/_header.html @@ -0,0 +1,11 @@ +{% import 'macros/links.html' as links %} +{% macro 
header_row(oob=False) %} + {% call links.menu_row(id='market-admin-row', oob=oob) %} + {% call links.link(url_for('market.admin.admin'), hx_select_search) %} + {{ links.admin() }} + {% endcall %} + {% call links.desktop_nav() %} + {% include '_types/market/admin/_nav.html' %} + {% endcall %} + {% endcall %} +{% endmacro %} \ No newline at end of file diff --git a/templates/_types/market/admin/index.html b/templates/_types/market/admin/index.html new file mode 100644 index 0000000..4798c46 --- /dev/null +++ b/templates/_types/market/admin/index.html @@ -0,0 +1,19 @@ +{% extends '_types/market/index.html' %} + + +{% block market_header_child %} + {% from '_types/root/_n/macros.html' import index_row with context %} + {% call index_row('market-admin-header-child', '_types/market/admin/header/_header.html') %} + {% block market_admin_header_child %} + {% endblock %} + {% endcall %} +{% endblock %} + +{% block _main_mobile_menu %} + {% include '_types/market/admin/_nav.html' %} +{% endblock %} + + +{% block content %} + {% include '_types/market/admin/_main_panel.html' %} +{% endblock %} diff --git a/templates/_types/market/desktop/_nav.html b/templates/_types/market/desktop/_nav.html new file mode 100644 index 0000000..d4de6e6 --- /dev/null +++ b/templates/_types/market/desktop/_nav.html @@ -0,0 +1,38 @@ + + diff --git a/templates/_types/market/header/_header.html b/templates/_types/market/header/_header.html new file mode 100644 index 0000000..2d92286 --- /dev/null +++ b/templates/_types/market/header/_header.html @@ -0,0 +1,11 @@ +{% import 'macros/links.html' as links %} +{% macro header_row(oob=False) %} + {% call links.menu_row(id='market-row', oob=oob) %} + {% call links.link(url_for('market.browse.home'), hx_select_search ) %} + {% include '_types/market/_title.html' %} + {% endcall %} + {% call links.desktop_nav() %} + {% include '_types/market/desktop/_nav.html' %} + {% endcall %} + {% endcall %} +{% endmacro %} \ No newline at end of file diff --git 
a/templates/_types/market/index.html b/templates/_types/market/index.html new file mode 100644 index 0000000..df1ec4c --- /dev/null +++ b/templates/_types/market/index.html @@ -0,0 +1,25 @@ +{% extends '_types/root/_index.html' %} + + +{% block root_header_child %} + {% from '_types/root/_n/macros.html' import index_row with context %} + {% call index_row('market-header-child', '_types/market/header/_header.html') %} + {% block market_header_child %} + {% endblock %} + {% endcall %} +{% endblock %} + + +{% block _main_mobile_menu %} + {% include '_types/market/mobile/_nav_panel.html' %} +{% endblock %} + + + +{% block aside %} +{# No aside on landing page #} +{% endblock %} + +{% block content %} + {% include "_types/market/_main_panel.html" %} +{% endblock %} diff --git a/templates/_types/market/mobile/_nav_panel.html b/templates/_types/market/mobile/_nav_panel.html new file mode 100644 index 0000000..65a9685 --- /dev/null +++ b/templates/_types/market/mobile/_nav_panel.html @@ -0,0 +1,110 @@ +{% from 'macros/glyphs.html' import opener %} +
+
+ {% set all_href = (url_for('market.browse.browse_all') ~ qs)|host %} + {% set all_active = (category_label == 'All Products') %} + +
+ All +
+
+ {% for cat, data in categories.items() %} +
+ + + {% set href = (url_for('market.browse.browse_top', top_slug=data.slug) ~ qs)|host %} + + +
{{ cat }}
+
{{ data.count }}
+
+ {{ opener('cat')}} + +
+ +
+ {% if data.subs %} + +
+ +
+ {% for sub in data.subs %} + {% set href = (url_for('market.browse.browse_sub', top_slug=data.slug, sub_slug=sub.slug) ~qs)|host%} + {% if top_slug==(data.slug | lower) and sub_slug == sub.slug %} + +
{{ sub.html_label or sub.name }}
+
{{ sub.count }}
+
+ {% endif %} + {% endfor %} + {% for sub in data.subs %} + {% if not (top_slug==(data.slug | lower) and sub_slug == sub.slug) %} + {% set href = (url_for('market.browse.browse_sub', top_slug=data.slug, sub_slug=sub.slug) ~ qs)|host%} + +
{{ sub.name }}
+
{{ sub.count }}
+
+ {% endif %} + {% endfor %} +
+
+ {% else %} + {% set href = (url_for('market.browse.browse_top', top_slug=data.slug) ~ qs)|host%} + View all + {% endif %} +
+
+ {% endfor %} + {% include '_types/market/_admin.html' %} +
+
diff --git a/templates/_types/market/mobile/menu.html b/templates/_types/market/mobile/menu.html new file mode 100644 index 0000000..145b551 --- /dev/null +++ b/templates/_types/market/mobile/menu.html @@ -0,0 +1,6 @@ +{% extends 'mobile/menu.html' %} +{% block menu %} + {% block mobile_menu %} + {% endblock %} + {% include '_types/market/mobile/_nav_panel.html' %} +{% endblock %} diff --git a/templates/_types/product/_added.html b/templates/_types/product/_added.html new file mode 100644 index 0000000..9ee4fed --- /dev/null +++ b/templates/_types/product/_added.html @@ -0,0 +1,25 @@ +{% set oob='true' %} +{% import '_types/product/_cart.html' as _cart %} +{% from '_types/cart/_mini.html' import mini with context %} +{{mini()}} + +{{ _cart.add(d.slug, cart, oob='true')}} + +{% from '_types/product/_cart.html' import cart_item with context %} + +{% if cart | sum(attribute="quantity") > 0 %} + {% if item.quantity > 0 %} + {{ cart_item(oob='true')}} + {% else %} + {{ cart_item(oob='delete')}} + {% endif %} + {% from '_types/cart/_cart.html' import summary %} + + {{ summary(cart, total,calendar_total, calendar_cart_entries, oob='true')}} + +{% else %} + {% set cart=[] %} + {% from '_types/cart/_cart.html' import show_cart with context %} + {{ show_cart( oob='true') }} + +{% endif %} \ No newline at end of file diff --git a/templates/_types/product/_cart.html b/templates/_types/product/_cart.html new file mode 100644 index 0000000..dfdb6c1 --- /dev/null +++ b/templates/_types/product/_cart.html @@ -0,0 +1,250 @@ +{% macro add(slug, cart, oob='false') %} +{% set quantity = cart + | selectattr('product.slug', 'equalto', slug) + | sum(attribute='quantity') %} + +
+ + {% if not quantity %} +
+ + + + +
+ + {% else %} +
+ +
+ + + +
+ + + + + + + + + {{ quantity }} + + + + + + +
+ + + +
+
+ {% endif %} +
+{% endmacro %} + + + +{% macro cart_item(oob=False) %} + +{% set p = item.product %} +{% set unit_price = p.special_price or p.regular_price %} +
+
+ {% if p.image %} + {{ p.title }} + {% else %} +
+ No image +
{# stray leftover url_for args — disabled so they don't render: 'market', 'product', p.slug #} + {% endif %} +
+ + {# Details #} +
+
+
+

+ {% set href=market_url('/product/' + p.slug + '/') %} + + {{ p.title }} + +

+ + {% if p.brand %} +

+ {{ p.brand }} +

+ {% endif %} + + {% if item.is_deleted %} +

+ + This item is no longer available or price has changed +

+ {% endif %} +
+ + {# Unit price #} +
+ {% if unit_price %} + {% set symbol = "£" if p.regular_price_currency == "GBP" else p.regular_price_currency %} +

+ {{ symbol }}{{ "%.2f"|format(unit_price) }} +

+ {% if p.special_price and p.special_price != p.regular_price %} +

+ {{ symbol }}{{ "%.2f"|format(p.regular_price) }} +

+ {% endif %} + {% else %} +

No price

+ {% endif %} +
+
+ +
+
+ Quantity +
+ + + +
+ + {{ item.quantity }} + +
+ + + +
+
+ +
+ {% if unit_price %} + {% set line_total = unit_price * item.quantity %} + {% set symbol = "£" if p.regular_price_currency == "GBP" else p.regular_price_currency %} +

+ Line total: + {{ symbol }}{{ "%.2f"|format(line_total) }} +

+ {% endif %} +
+
+
+
+ +{% endmacro %} diff --git a/templates/_types/product/_main_panel.html b/templates/_types/product/_main_panel.html new file mode 100644 index 0000000..cf8df31 --- /dev/null +++ b/templates/_types/product/_main_panel.html @@ -0,0 +1,131 @@ +{# Main panel fragment for HTMX navigation - product detail content #} +{% import 'macros/stickers.html' as stick %} +{% import '_types/product/prices.html' as prices %} +{% set prices_ns = namespace() %} +{{ prices.set_prices(d, prices_ns)}} + + {# Product detail grid from content block #} +
+
+ {% if d.images and d.images|length > 0 %} +
+ {# --- like button overlay in top-right --- #} + {% if g.user %} +
+ {% set slug = d.slug %} + {% set liked = liked_by_current_user %} + {% include "_types/browse/like/button.html" %} +
+ {% endif %} + +
+
+ {{ d.title }} + + {% for l in d.labels %} + + {% endfor %} +
+
+ {{ d.brand }} +
+
+ + {% if d.images|length > 1 %} + + + {% endif %} +
+ +
+
+ {% for u in d.images %} + + + {% endfor %} +
+
+ {% else %} +
+ {# Even if no image, still render the like button in the corner for consistency #} + {% if g.user %} +
+ {% set slug = d.slug %} + {% set liked = liked_by_current_user %} + {% include "_types/browse/like/button.html" %} +
+ {% endif %} + + No image +
+ {% endif %} + +
+ {% for s in d.stickers %} + {{ stick.sticker(asset_url('stickers/' + s + '.svg'), s, True, size=40) }} + {% endfor %} +
+
+ +
+ {# Optional extras shown quietly #} +
+ {% if d.price_per_unit or d.price_per_unit_raw %} +
Unit price: {{ prices.price_str(d.price_per_unit, d.price_per_unit_raw, d.price_per_unit_currency) }}
+ {% endif %} + {% if d.case_size_raw %} +
Case size: {{ d.case_size_raw }}
+ {% endif %} + +
+ + {% if d.description_short or d.description_html %} +
+ {% if d.description_short %} +

{{ d.description_short }}

+ {% endif %} + {% if d.description_html %} +
+ {{ d.description_html | safe }} +
+ {% endif %} +
+ {% endif %} + + {% if d.sections and d.sections|length %} +
+ {% for sec in d.sections %} +
+ + {{ sec.title }} + + +
+ {{ sec.html | safe }} +
+
+ {% endfor %} +
+ {% endif %} +
+ +
+
diff --git a/templates/_types/product/_meta.html b/templates/_types/product/_meta.html new file mode 100644 index 0000000..aebb684 --- /dev/null +++ b/templates/_types/product/_meta.html @@ -0,0 +1,106 @@ +{# --- social/meta_product.html --- #} +{# Context expected: + site, d (Product), request +#} + +{# Visibility → robots: index unless soft-deleted #} +{% set robots_here = 'noindex,nofollow' if d.deleted_at else 'index,follow' %} + +{# Compute canonical #} +{% set _site_url = site().url.rstrip('/') if site and site().url else '' %} +{% set _product_path = request.path if request else ('/products/' ~ (d.slug or '')) %} +{% set canonical = _site_url ~ _product_path if _site_url else (request.url if request else None) %} + +{# Include common base (charset, viewport, robots default, RSS, Org/WebSite JSON-LD) #} +{% set robots_override = robots_here %} +{% include 'social/meta_base.html' %} + +{# ---- Titles / descriptions ---- #} +{% set base_product_title = d.title or base_title %} +{% set og_title = base_product_title %} +{% set tw_title = base_product_title %} + +{# Description: prefer short, then HTML stripped #} +{% set desc_source = d.description_short + or (d.description_html|striptags if d.description_html else '') %} +{% set description = (desc_source|trim|replace('\n',' ')|replace('\r',' ')|striptags)|truncate(160, True, '…') %} + +{# ---- Image priority: product image, then first gallery image, then site default ---- #} +{% set image_url = d.image + or ((d.images|first).url if d.images and (d.images|first).url else None) + or (site().default_image if site and site().default_image else None) %} + +{# ---- Price / offer helpers ---- #} +{% set price = d.special_price or d.regular_price or d.rrp %} +{% set price_currency = d.special_price_currency or d.regular_price_currency or d.rrp_currency %} + +{# ---- Basic meta ---- #} +{{ base_product_title }} + +{% if canonical %}{% endif %} + +{# ---- Open Graph ---- #} + + + + +{% if canonical %}{% endif %} +{% if 
image_url %}{% endif %} + +{# Optional product OG price tags #} +{% if price and price_currency %} + + +{% endif %} +{% if d.brand %} + +{% endif %} +{% if d.sku %} + +{% endif %} + +{# ---- Twitter ---- #} + +{% if site and site().twitter_site %}{% endif %} + + +{% if image_url %}{% endif %} + +{# ---- JSON-LD Product ---- #} +{% set jsonld = { + "@context": "https://schema.org", + "@type": "Product", + "name": d.title, + "image": image_url, + "description": description, + "sku": d.sku, + "brand": d.brand, + "url": canonical +} %} + +{# Brand as proper object if present #} +{% if d.brand %} + {% set jsonld = jsonld | combine({ + "brand": { + "@type": "Brand", + "name": d.brand + } + }) %} +{% endif %} + +{# Offers if price available #} +{% if price and price_currency %} + {% set jsonld = jsonld | combine({ + "offers": { + "@type": "Offer", + "price": price, + "priceCurrency": price_currency, + "url": canonical, + "availability": "https://schema.org/InStock" + } + }) %} +{% endif %} + + diff --git a/templates/_types/product/_oob_elements.html b/templates/_types/product/_oob_elements.html new file mode 100644 index 0000000..a651387 --- /dev/null +++ b/templates/_types/product/_oob_elements.html @@ -0,0 +1,49 @@ +{% extends 'oob_elements.html' %} +{# OOB elements for HTMX navigation - product extends browse so use similar structure #} +{% import 'macros/layout.html' as layout %} +{% import 'macros/stickers.html' as stick %} +{% import '_types/product/prices.html' as prices %} +{% set prices_ns = namespace() %} +{{ prices.set_prices(d, prices_ns)}} + +{# Import shared OOB macros #} +{% from '_types/root/header/_oob.html' import root_header_start, root_header_end with context %} +{% from '_types/root/_oob_menu.html' import mobile_menu with context %} + + + +{% block oobs %} + {% from '_types/market/header/_header.html' import header_row with context %} + {{ header_row(oob=True) }} + + {% from '_types/root/_n/macros.html' import oob_header with context %} + 
{{oob_header('market-header-child', 'product-header-child', '_types/product/header/_header.html')}} + +{% endblock %} + + + +{% block mobile_menu %} + {% include '_types/market/mobile/_nav_panel.html' %} + {% include '_types/browse/_admin.html' %} +{% endblock %} + +{% block filter %} + {% call layout.details() %} + {% call layout.summary('coop-child-header') %} + {% endcall %} + {% call layout.menu('blog-child-menu') %} + {% endcall %} + {% endcall %} + + {% call layout.details() %} + {% call layout.summary('product-child-header') %} + {% endcall %} + {% call layout.menu('item-child-menu') %} + {% endcall %} + {% endcall %} +{% endblock %} + +{% block content %} + {% include '_types/product/_main_panel.html' %} +{% endblock %} diff --git a/templates/_types/product/_prices.html b/templates/_types/product/_prices.html new file mode 100644 index 0000000..e56339f --- /dev/null +++ b/templates/_types/product/_prices.html @@ -0,0 +1,33 @@ +{% import '_types/product/_cart.html' as _cart %} + {# ---- Price block ---- #} + {% import '_types/product/prices.html' as prices %} + {% set prices_ns = namespace() %} + {{ prices.set_prices(d, prices_ns)}} + +
+ {{ _cart.add(d.slug, cart)}} + + {% if prices_ns.sp_val %} +
+ Special price +
+
+ {{ prices.price_str(prices_ns.sp_val, prices_ns.sp_raw, prices_ns.sp_cur) }} +
+ {% if prices_ns.sp_val and prices_ns.rp_val %} +
+ {{ prices.price_str(prices_ns.rp_val, prices_ns.rp_raw, prices_ns.rp_cur) }} +
+ {% endif %} + {% elif prices_ns.rp_val %} + +
+ {{ prices.price_str(prices_ns.rp_val, prices_ns.rp_raw, prices_ns.rp_cur) }} +
+ {% endif %} + {{ prices.rrp(prices_ns) }} + +
+ diff --git a/templates/_types/product/_title.html b/templates/_types/product/_title.html new file mode 100644 index 0000000..0b3be43 --- /dev/null +++ b/templates/_types/product/_title.html @@ -0,0 +1,2 @@ + +
{{ d.title }}
diff --git a/templates/_types/product/admin/_nav.html b/templates/_types/product/admin/_nav.html new file mode 100644 index 0000000..f5c504d --- /dev/null +++ b/templates/_types/product/admin/_nav.html @@ -0,0 +1,2 @@ +{% from 'macros/admin_nav.html' import placeholder_nav %} +{{ placeholder_nav() }} diff --git a/templates/_types/product/admin/_oob_elements.html b/templates/_types/product/admin/_oob_elements.html new file mode 100644 index 0000000..84acac6 --- /dev/null +++ b/templates/_types/product/admin/_oob_elements.html @@ -0,0 +1,40 @@ +{% extends 'oob_elements.html' %} + + +{# OOB elements for HTMX navigation - all elements that need updating #} +{# Import shared OOB macros #} +{% from '_types/root/header/_oob.html' import root_header_start, root_header_end with context %} +{% from '_types/root/_oob_menu.html' import mobile_menu with context %} + + + +{% block oobs %} + + {% from '_types/root/_n/macros.html' import oob_header with context %} + {{oob_header('product-header-child', 'product-admin-header-child', '_types/product/admin/header/_header.html')}} + + {% from '_types/product/header/_header.html' import header_row with context %} + {{ header_row(oob=True) }} +{% endblock %} + + +{% from '_types/root/_n/macros.html' import header with context %} +{% call header(id='product-header-child', oob=True) %} + {% call header() %} + {% from '_types/product/admin/header/_header.html' import header_row with context %} + {{header_row()}} +
+ +
+ {% endcall %} +{% endcall %} + + +{% block mobile_menu %} + {% include '_types/product/admin/_nav.html' %} +{% endblock %} + + +{% block content %} + {% include '_types/product/_main_panel.html' %} +{% endblock %} diff --git a/templates/_types/product/admin/header/_header.html b/templates/_types/product/admin/header/_header.html new file mode 100644 index 0000000..2a6993a --- /dev/null +++ b/templates/_types/product/admin/header/_header.html @@ -0,0 +1,11 @@ +{% import 'macros/links.html' as links %} +{% macro header_row(oob=False) %} + {% call links.menu_row(id='product-admin-row', oob=oob) %} + {% call links.link(url_for('market.browse.product.admin', slug=d.slug), hx_select_search ) %} + admin!! + {% endcall %} + {% call links.desktop_nav() %} + {% include '_types/product/admin/_nav.html' %} + {% endcall %} + {% endcall %} +{% endmacro %} \ No newline at end of file diff --git a/templates/_types/product/admin/index.html b/templates/_types/product/admin/index.html new file mode 100644 index 0000000..3afe352 --- /dev/null +++ b/templates/_types/product/admin/index.html @@ -0,0 +1,39 @@ +{% extends '_types/product/index.html' %} + +{% import 'macros/layout.html' as layout %} + +{% block product_header_child %} + {% from '_types/root/_n/macros.html' import index_row with context %} + {% call index_row('market-header-child', '_types/product/admin/header/_header.html') %} + {% block product_admin_header_child %} + {% endblock %} + {% endcall %} +{% endblock %} + + + +{% block ___app_title %} + {% import 'macros/links.html' as links %} + {% call links.menu_row() %} + {% call links.link(url_for('market.browse.product.admin', slug=slug), hx_select_search) %} + {{ links.admin() }} + {% endcall %} + {% call links.desktop_nav() %} + {% include '_types/product/admin/_nav.html' %} + {% endcall %} + {% endcall %} +{% endblock %} + + + +{% block _main_mobile_menu %} + {% include '_types/product/admin/_nav.html' %} +{% endblock %} + +{% block aside %} +{% endblock %} + + +{% 
block content %} +{% include '_types/product/_main_panel.html' %} +{% endblock %} diff --git a/templates/_types/product/header/_header.html b/templates/_types/product/header/_header.html new file mode 100644 index 0000000..3a8daa6 --- /dev/null +++ b/templates/_types/product/header/_header.html @@ -0,0 +1,15 @@ +{% import 'macros/links.html' as links %} +{% macro header_row(oob=False) %} + {% call links.menu_row(id='product-row', oob=oob) %} + {% call links.link(url_for('market.browse.product.product_detail', slug=d.slug), hx_select_search ) %} + {% include '_types/product/_title.html' %} + {% endcall %} + {% include '_types/product/_prices.html' %} + {% call links.desktop_nav() %} + {% include '_types/browse/_admin.html' %} + {% endcall %} + {% endcall %} +{% endmacro %} + + + diff --git a/templates/_types/product/index.html b/templates/_types/product/index.html new file mode 100644 index 0000000..bdbe8cc --- /dev/null +++ b/templates/_types/product/index.html @@ -0,0 +1,61 @@ +{% extends '_types/browse/index.html' %} + +{% block meta %} + {% include '_types/product/_meta.html' %} +{% endblock %} + + +{% import 'macros/stickers.html' as stick %} +{% import '_types/product/prices.html' as prices %} +{% set prices_ns = namespace() %} +{{ prices.set_prices(d, prices_ns)}} + + + +{% block market_header_child %} + {% from '_types/root/_n/macros.html' import index_row with context %} + {% call index_row('market-header-child', '_types/product/header/_header.html') %} + {% block product_header_child %} + {% endblock %} + {% endcall %} +{% endblock %} + + +{% block _main_mobile_menu %} + {% include '_types/browse/_admin.html' %} +{% endblock %} + + + +{% block filter %} + +{% call layout.details() %} + {% call layout.summary('coop-child-header') %} + {% block coop_child_summary %} + {% endblock %} + {% endcall %} + {% call layout.menu('blog-child-menu') %} + {% block post_child_menu %} + {% endblock %} + {% endcall %} + {% endcall %} + + {% call layout.details() %} + {% 
call layout.summary('product-child-header') %} + {% block item_child_summary %} + {% endblock %} + {% endcall %} + {% call layout.menu('item-child-menu') %} + {% block item_child_menu %} + {% endblock %} + {% endcall %} + {% endcall %} + +{% endblock %} + +{% block aside %} +{% endblock %} + +{% block content %} + {% include '_types/product/_main_panel.html' %} +{% endblock %} diff --git a/templates/_types/product/prices.html b/templates/_types/product/prices.html new file mode 100644 index 0000000..be9cc4c --- /dev/null +++ b/templates/_types/product/prices.html @@ -0,0 +1,66 @@ +{# ---- Price formatting helpers ---- #} +{% set _sym = {'GBP':'£','EUR':'€','USD':'$'} %} +{% macro price_str(val, raw, cur) -%} + {%- if raw -%} + {{ raw }} + {%- elif val is number -%} + {{ (_sym.get(cur) or '') ~ ('%.2f'|format(val)) }} + {%- else -%} + {{ val or '' }} + {%- endif -%} +{%- endmacro %} + + +{% macro set_prices(item, ns) -%} + +{% set ns.sp_val = item.special_price or (item.oe_list_price and item.oe_list_price.special) %} +{% set ns.sp_raw = item.special_price_raw or (item.oe_list_price and item.oe_list_price.special_raw) %} +{% set ns.sp_cur = item.special_price_currency or (item.oe_list_price and item.oe_list_price.special_currency) %} + +{% set ns.rp_val = item.regular_price or item.rrp or (item.oe_list_price and item.oe_list_price.rrp) %} +{% set ns.rp_raw = item.regular_price_raw or item.rrp_raw or (item.oe_list_price and item.oe_list_price.rrp_raw) %} +{% set ns.rp_cur = item.regular_price_currency or item.rrp_currency or (item.oe_list_price and item.oe_list_price.rrp_currency) %} + +{% set ns.case_size_count = (item.case_size_count or 1) %} +{% set ns.rrp = item.rrp_raw[0] ~ "%.2f"|format(item.rrp * (ns.case_size_count)) %} +{% set ns.rrp_raw = item.rrp_raw %} + +{%- endmacro %} + + +{% macro rrp(ns) -%} + {% if ns.rrp %} +
+ rrp: + + {{ ns.rrp }} + +
+ {% endif %} +{%- endmacro %} + + +{% macro card_price(item) %} + + +{# price block unchanged #} + {% set _sym = {'GBP':'£','EUR':'€','USD':'$'} %} + {% set sp_val = item.special_price or (item.oe_list_price and item.oe_list_price.special) %} + {% set sp_raw = item.special_price_raw or (item.oe_list_price and item.oe_list_price.special_raw) %} + {% set sp_cur = item.special_price_currency or (item.oe_list_price and item.oe_list_price.special_currency) %} + {% set rp_val = item.regular_price or item.rrp or (item.oe_list_price and item.oe_list_price.rrp) %} + {% set rp_raw = item.regular_price_raw or item.rrp_raw or (item.oe_list_price and item.oe_list_price.rrp_raw) %} + {% set rp_cur = item.regular_price_currency or item.rrp_currency or (item.oe_list_price and item.oe_list_price.rrp_currency) %} + {% set sp_str = sp_raw if sp_raw else ( (_sym.get(sp_cur, '') ~ ('%.2f'|format(sp_val))) if sp_val is number else (sp_val or '')) %} + {% set rp_str = rp_raw if rp_raw else ( (_sym.get(rp_cur, '') ~ ('%.2f'|format(rp_val))) if rp_val is number else (rp_val or '')) %} +
+ {% if sp_val %} +
{{ sp_str }}
+ {% if rp_val %} +
{{ rp_str }}
+ {% endif %} + {% elif rp_val %} +
{{ rp_str }}
+ {% endif %} +
+{% endmacro %}