This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/build_snapshot/build_snapshot.py
giles 6271a715a1
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
feat: initialize market app with browsing, product, and scraping code
Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00

105 lines
2.6 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import os
from typing import Dict, Set
from ..http_client import configure_cookies
from ..get_auth import login
from config import config
from utils import log
# DB: persistence helpers
from .tools import (
_resolve_sub_redirects,
valid_subs,
candidate_subs,
rewrite_nav,
capture_product_slugs,
fetch_and_upsert_products,
)
from ..nav import nav_scrape
# ------------------------ core ------------------------
async def build_snapshot(
    concurrency: int,
    user: str,
    password: str,
    save_nav,
    capture_listing,
    upsert_product,
    log_product_result,
    save_subcategory_redirects,
    save_link_reports=None,
) -> None:
    """Build a full market snapshot and persist it via the given callbacks.

    Pipeline: login -> scrape nav -> resolve/rewrite subcategory redirects ->
    save nav -> capture product slugs -> fetch & upsert product details ->
    resolve redirects discovered in product HTML -> persist the merged
    redirect map.

    Args:
        concurrency: maximum number of concurrent product-detail fetches.
        user: login username for the remote shop.
        password: login password.
        save_nav: async callback that persists the (rewritten) nav tree.
        capture_listing: async callback invoked per listing during slug capture.
        upsert_product: async callback that upserts one product.
        log_product_result: async callback recording per-product outcomes.
        save_subcategory_redirects: async callback persisting the merged
            old-path -> canonical-path redirect mapping.
        save_link_reports: optional async callback for link reports;
            passed through to ``fetch_and_upsert_products`` (may be None).
    """
    # Make the project root importable for any lazy imports the callbacks
    # perform; the module-level imports above do not depend on this.
    import sys
    sys.path.insert(0, os.path.abspath("."))

    cookies = await login(username=user, password=password)
    await configure_cookies(cookies)
    # SECURITY: announce cookie *names* only — cookie values are session
    # secrets and must never be written to stdout/logs.
    for cookie_name in dict(cookies):
        print("logged in with cookie", cookie_name)

    # 1) NAV
    log("Fetching nav…")
    nav = await nav_scrape()

    # Valid subcategory slugs per top-level category, derived from nav.
    valid_subs_by_top: Dict[str, Set[str]] = valid_subs(nav)

    # Resolve redirects for all subs present in nav, then rewrite nav in
    # place so it references canonical paths before being saved.
    nav_sub_candidates = candidate_subs(nav)
    nav_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=nav_sub_candidates,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    rewrite_nav(nav, nav_redirects)

    # DB: save nav
    await save_nav(nav)

    # 2) LISTINGS: collect product slugs, invoking capture_listing as we go.
    product_slugs: Set[str] = await capture_product_slugs(
        nav,
        capture_listing,
    )

    # Populated in place by fetch_and_upsert_products with sub paths seen
    # in product HTML that are not already known from nav.
    unknown_sub_paths: Set[str] = set()

    # 3) PRODUCTS (fetch details)
    await fetch_and_upsert_products(
        upsert_product,
        log_product_result,
        save_link_reports,
        concurrency,
        product_slugs,
        valid_subs_by_top,
        unknown_sub_paths,
    )

    # Subcategory redirects from HTML: resolve the unknown paths and merge
    # them over the nav-derived redirects (HTML results win on conflict).
    log("Resolving subcategory redirects…")
    html_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=unknown_sub_paths,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    sub_redirects: Dict[str, str] = dict(nav_redirects)
    sub_redirects.update(html_redirects)

    # DB: persist redirects
    await save_subcategory_redirects(sub_redirects)
    log("Snapshot build complete (to Postgres).")