Monorepo: consolidate 7 repos into one

Combines shared, blog, market, cart, events, federation, and account
into a single repository. Eliminates submodule sync, sibling model
copying at build time, and per-app CI orchestration.

Changes:
- Remove per-app .git, .gitmodules, .gitea, submodule shared/ dirs
- Remove stale sibling model copies from each app
- Update all 6 Dockerfiles for monorepo build context (root = .)
- Add build directives to docker-compose.yml
- Add single .gitea/workflows/ci.yml with change detection
- Add .dockerignore for monorepo build context
- Create __init__.py for federation and account (cross-app imports)
This commit is contained in:
giles
2026-02-24 19:44:17 +00:00
commit f42042ccb7
895 changed files with 61147 additions and 0 deletions

View File

View File

@@ -0,0 +1 @@
from .build_snapshot import build_snapshot

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
from __future__ import annotations
import os
from typing import Dict, Set
from ..http_client import configure_cookies
from ..get_auth import login
from shared.config import config
from shared.utils import log
# DB: persistence helpers
from .tools import (
_resolve_sub_redirects,
valid_subs,
candidate_subs,
rewrite_nav,
capture_product_slugs,
fetch_and_upsert_products,
)
from ..nav import nav_scrape
# ------------------------ core ------------------------
async def build_snapshot(
    concurrency: int,
    user: str,
    password: str,
    save_nav,
    capture_listing,
    upsert_product,
    log_product_result,
    save_subcategory_redirects,
    save_link_reports = None,
) -> None:
    """End-to-end snapshot build: login → nav scrape → listings → product details → redirects.

    All persistence is delegated to the injected async callables
    (save_nav, capture_listing, upsert_product, log_product_result,
    save_subcategory_redirects, and optional save_link_reports).
    """
    # NOTE: we keep ensure_dir for listings iteration but no longer write JSON files.
    # Make project importable
    import sys
    sys.path.insert(0, os.path.abspath("."))
    cookies = await login(username=user, password=password)
    await configure_cookies(cookies)
    # NOTE(review): printing cookie names AND values leaks session tokens to
    # stdout/logs — consider removing or redacting the values.
    for k, v in dict(cookies).items():
        print("logged in with", k, v)
    # 1) NAV
    log("Fetching nav…")
    nav = await nav_scrape()
    # Build valid subs per top from nav
    valid_subs_by_top: Dict[str, Set[str]] = valid_subs(nav)
    # Resolve redirects for all subs in nav first
    nav_sub_candidates = candidate_subs(nav)
    nav_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=nav_sub_candidates,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    rewrite_nav(nav, nav_redirects)
    # DB: save nav
    await save_nav(nav)
    # 2) LISTINGS — walk categories/subs, persist each listing, collect product slugs.
    product_slugs: Set[str] = await capture_product_slugs(
        nav,
        capture_listing
    )
    # Populated in place by the link rewriter during product processing.
    unknown_sub_paths: Set[str] = set()
    # 3) PRODUCTS (fetch details)
    await fetch_and_upsert_products(
        upsert_product,
        log_product_result,
        save_link_reports,
        concurrency,
        product_slugs,
        valid_subs_by_top,
        unknown_sub_paths
    )
    # Subcategory redirects from HTML
    log("Resolving subcategory redirects…")
    html_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=unknown_sub_paths,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    # HTML-derived redirects win over nav-derived ones on key collisions.
    sub_redirects: Dict[str, str] = dict(nav_redirects)
    sub_redirects.update(html_redirects)
    # DB: persist redirects
    await save_subcategory_redirects(sub_redirects)
    log("Snapshot build complete (to Postgres).")

View File

@@ -0,0 +1 @@
# Marker token substituted into rewritten hrefs; presumably the serving app
# replaces it with its own mount root at render time — TODO confirm against consumer.
APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]"

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,6 @@
def _anchor_text(a) -> str:
try:
txt = " ".join((a.get_text(" ") or "").split())
return txt[:200]
except Exception:
return ""

View File

@@ -0,0 +1,16 @@
from bs4 import BeautifulSoup
from typing import List, Optional
def _collect_html_img_srcs(html: Optional[str]) -> List[str]:
urls: List[str] = []
if not html:
return urls
try:
soup = BeautifulSoup(html, "lxml")
for img in soup.find_all("img"):
src = img.get("src")
if src:
urls.append(src)
except Exception:
pass
return urls

View File

@@ -0,0 +1,14 @@
from typing import Iterable, List, Set
def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]:
seen: Set[str] = set()
out: List[str] = []
for u in urls:
if not u or not isinstance(u, str):
continue
if u in seen:
continue
seen.add(u)
out.append(u)
return out

View File

@@ -0,0 +1,32 @@
from typing import Dict,Optional, Tuple
_CF_TOKENS = (
"One moment, please...",
"Please wait while your request is being verified",
"/cdn-cgi/challenge-platform/",
"rocket-loader.min.js",
)
def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]:
if not html:
return False, None
for tok in _CF_TOKENS:
if tok in html:
return True, tok
return False, None
def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]:
title = (d.get("title") or "").strip()
if title.lower() == "one moment, please...":
return True, "One moment, please..."
ok, tok = _looks_like_cf_html(d.get("description_html"))
if ok:
return True, tok
for sec in d.get("sections") or []:
if isinstance(sec, dict) and sec.get("html"):
ok2, tok2 = _looks_like_cf_html(sec["html"])
if ok2:
return True, tok2
if not d.get("images") and not d.get("description_html") and not d.get("sections"):
return True, "all_empty_heuristic"
return False, None

View File

@@ -0,0 +1,34 @@
from typing import Dict, Set
from urllib.parse import urlparse, urljoin
import httpx
async def _resolve_sub_redirects(
    base_url: str,
    candidates: Set[str],
    allowed_tops: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
) -> Dict[str, str]:
    """Follow each candidate "/top/sub" path and record where the site redirects it.

    Returns {old_path: canonical_path} for paths whose final URL differs and
    whose top segment is in *allowed_tops*.  Side effect: every resolved sub
    is added into *valid_subs_by_top* in place.  Failures on individual paths
    are skipped silently (best effort).
    """
    mapping: Dict[str, str] = {}
    if not candidates:
        return mapping
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client:
        # Sorted for deterministic request order across runs.
        for path in sorted(candidates):
            try:
                url = urljoin(base_url, path)
                r = await client.get(url)
                final = str(r.url)
                p = urlparse(final)
                parts = [x for x in (p.path or "").split("/") if x]
                if len(parts) >= 2:
                    top_new = parts[0].lower()
                    # Normalize "sub.html"/"sub.htm" to the bare slug.
                    sub_new = parts[1].lower().removesuffix(".html").removesuffix(".htm")
                    if top_new in allowed_tops:
                        new_path = f"/{top_new}/{sub_new}"
                        if new_path != path:
                            mapping[path] = new_path
                        # Record the sub as valid even when the path was already canonical.
                        valid_subs_by_top.setdefault(top_new, set()).add(sub_new)
            except Exception:
                # NOTE(review): broad catch also hides coding errors; consider
                # narrowing to httpx.HTTPError — confirm intended.
                continue
    return mapping

View File

@@ -0,0 +1,100 @@
from typing import Dict, List, Optional, Set
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from ._anchor_text import _anchor_text
from bp.browse.services.slugs import product_slug_from_href
from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER
def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite same-site links in *html* to APP_ROOT_PLACEHOLDER-relative paths.

    Product links (…/*.html|*.htm) with a slug in *known_slugs* become
    "{placeholder}/product/{slug}"; category/sub links whose top segment is in
    *category_allow_values* become "{placeholder}/{top}[/{sub}]".  Off-site
    links are left untouched and recorded in *link_externals*; problem links
    are recorded in *link_errors*; unresolved sub paths are queued in
    *unknown_sub_paths*.  All three report collections are mutated in place.
    Returns the rewritten fragment without <html>/<body> wrappers.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc
    for a in soup.find_all("a", href=True):
        raw = (a.get("href") or "").strip()
        if not raw:
            continue
        low = raw.lower()
        # Skip non-navigational schemes and in-page anchors.
        if low.startswith(("mailto:", "tel:", "javascript:", "data:")) or low.startswith("#"):
            continue
        abs_href = urljoin(base_url, raw)
        p = urlparse(abs_href)
        if not p.scheme or not p.netloc:
            continue
        if p.netloc != base_host:
            # Off-site link: report it, leave the href untouched.
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "host": p.netloc,
            })
            continue
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        last = parts[-1].lower()
        if last.endswith((".html", ".htm")):
            # Product-page-shaped link: rewrite only when the slug is known.
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(a),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue
        top = parts[0].lower()
        if top in category_allow_values:
            if len(parts) == 1:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            else:
                sub = parts[1]
                if sub.lower().endswith((".html", ".htm")):
                    sub = sub.rsplit(".", 1)[0]
                if sub in (valid_subs_by_top.get(top) or set()):
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
                else:
                    # Unknown sub: rewrite optimistically, queue the path for
                    # redirect resolution, and record it as pending.
                    unknown_path = f"/{top}/{sub}"
                    unknown_sub_paths.add(unknown_path)
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
                    link_errors.append({
                        "product": current_product_slug,
                        "href": abs_href,
                        "text": _anchor_text(a),
                        "top": top,
                        "sub": sub,
                        "target_slug": None,
                        "type": "suma_category_invalid_sub_pending",
                    })
        else:
            # Same-site link that is neither a product nor an allowed category.
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })
    # Strip document-level wrappers; return only the inner fragment.
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()

View File

@@ -0,0 +1,14 @@
from typing import Dict, Set
def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect every "/top/sub" path present in the scraped nav tree."""
    paths: Set[str] = set()
    for _label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        for sub in (data.get("subs") or []):
            sub_slug = (sub.get("slug") or "").strip()
            if sub_slug:
                paths.add(f"/{top_slug}/{sub_slug}")
    return paths

View File

@@ -0,0 +1,18 @@
from urllib.parse import urljoin
from shared.config import config
from shared.utils import log
from ...listings import scrape_products
async def capture_category(
    slug: str,
):
    """Scrape every listing page of top-level category *slug*.

    Returns (list_url, items, total_pages), where *items* accumulates the
    product entries from all pages.
    """
    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)
    last_page = int(total_pages or 1)
    for p in range(2, last_page + 1):
        log(f"[{slug}] page {p}")
        more, _ = await scrape_products(list_url, page=p)
        items.extend(more)
    return (list_url, items, total_pages)

View File

@@ -0,0 +1,25 @@
from typing import Dict, Set
from .capture_category import capture_category
from .capture_sub import capture_sub
from shared.config import config
async def capture_product_slugs(
    nav: Dict[str, Dict],
    capture_listing,
):
    """Walk every allowed category (and its nav subs), persist each listing
    via *capture_listing*, and return the set of all product slugs seen.

    Fix: capture_sub() returns None for nav subs without a slug; previously
    that None was star-unpacked into capture_listing and raised TypeError.
    Such subs are now skipped.
    """
    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        lpars = await capture_category(slug)
        await capture_listing(*lpars)
        (_, items, __) = lpars
        product_slugs.update(items)
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            lpars = await capture_sub(sub, slug)
            if lpars is None:
                # Sub entry without a slug — nothing was scraped; skip it.
                continue
            await capture_listing(*lpars)
            (_, items, __) = lpars
            product_slugs.update(items)
    return product_slugs

View File

@@ -0,0 +1,22 @@
from urllib.parse import urljoin
from urllib.parse import urljoin
from shared.config import config
from shared.utils import log
from ...listings import scrape_products
async def capture_sub(
    sub,
    slug,
):
    """Scrape all listing pages of subcategory *sub* under top category *slug*.

    Returns (sub_url, items, total_pages), or None when *sub* has no slug —
    callers must handle the None case.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items_s, total_pages_s = await scrape_products(sub_url, page=1)
    last_page = int(total_pages_s or 1)
    for p in range(2, last_page + 1):
        log(f"[{slug}/{sub_slug}] page {p}")
        extra, _ = await scrape_products(sub_url, page=p)
        items_s.extend(extra)
    return (sub_url, items_s, total_pages_s)

View File

@@ -0,0 +1,106 @@
import asyncio
from typing import List
import httpx
from ...html_utils import to_fragment
from bp.browse.services.slugs import suma_href_from_html_slug
from shared.config import config
from shared.utils import log
# DB: persistence helpers
from ...product.product_detail import scrape_product_detail
from ._product_dict_is_cf import _product_dict_is_cf
from ._rewrite_links_fragment import _rewrite_links_fragment
from ._dedupe_preserve_order import _dedupe_preserve_order
from ._collect_html_img_srcs import _collect_html_img_srcs
async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths
) -> bool:
    """Fetch one product detail page, rewrite its HTML, and upsert it.

    Returns True on success.  Cloudflare interstitials and exceptions are
    reported via *log_product_result* (ok=False) and yield False.  The link
    report lists and *unknown_sub_paths* are mutated by the link rewriter.
    """
    href = suma_href_from_html_slug(slug)
    try:
        # Semaphore bounds how many products are processed concurrently.
        async with sem:
            d = await scrape_product_detail(href)
            is_cf, cf_token = _product_dict_is_cf(d)
            if is_cf:
                # Cloudflare interstitial instead of real content: log and bail.
                payload = {
                    "slug": slug,
                    "href_tried": href,
                    "error_type": "CloudflareChallengeDetected",
                    "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                    "cf_token": cf_token,
                }
                await log_product_result(ok=False, payload=payload)
                log(f" ! CF challenge detected: {slug} ({cf_token})")
                return False
            # Rewrite embedded links; collect reports
            if d.get("description_html"):
                d["description_html"] = _rewrite_links_fragment(
                    d["description_html"], config()["base_url"], product_slugs, category_values,
                    valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                )
                d["description_html"] = to_fragment(d["description_html"])
            if d.get("sections"):
                for sec in d["sections"]:
                    if isinstance(sec, dict) and sec.get("html"):
                        sec["html"] = _rewrite_links_fragment(
                            sec["html"], config()["base_url"], product_slugs, category_values,
                            valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                        )
                        sec["html"] = to_fragment(sec["html"])
            # Images: gallery from the scrape plus any <img> embedded in HTML.
            gallery = _dedupe_preserve_order(d.get("images") or [])
            embedded: List[str] = []
            if d.get("description_html"):
                embedded += _collect_html_img_srcs(d["description_html"])
            for sec in d.get("sections", []) or []:
                if isinstance(sec, dict) and sec.get("html"):
                    embedded += _collect_html_img_srcs(sec["html"])
            embedded = _dedupe_preserve_order(embedded)
            all_imgs = _dedupe_preserve_order(list(gallery) + list(embedded))
            d["images"] = gallery
            d["embedded_image_urls"] = embedded
            d["all_image_urls"] = all_imgs
            await upsert_product(slug, href, d)
            # DB: upsert product + success log
            return True
    except Exception as e:
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        # Enrich the payload with HTTP specifics when available (best effort).
        try:
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False

View File

@@ -0,0 +1,49 @@
import asyncio
from typing import Dict, List, Set
from shared.config import config
from shared.utils import log
from .fetch_and_upsert_product import fetch_and_upsert_product
async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports=None,
    concurrency: int = 8,
    product_slugs: Set[str] | None = None,
    valid_subs_by_top: Dict[str, Set[str]] | None = None,
    unknown_sub_paths: Set[str] | None = None,
):
    """Concurrently fetch detail pages for *product_slugs* and upsert each one.

    Progress is logged every 50 completions; collected link error/external
    reports are passed to *save_link_reports* at the end when provided.

    Fix: the collection parameters previously used shared mutable defaults
    (set()/{}); unknown_sub_paths in particular is mutated downstream, so
    state leaked between calls.  Defaults are now fresh objects per call —
    callers that pass arguments explicitly see no change.
    """
    if product_slugs is None:
        product_slugs = set()
    if valid_subs_by_top is None:
        valid_subs_by_top = {}
    if unknown_sub_paths is None:
        unknown_sub_paths = set()
    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []
    category_values: Set[str] = set(config()["categories"]["allow"].values())
    # Sorted for deterministic fetch order across runs.
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = 0
    ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        if done % 50 == 0 or done == len(tasks):
            log(f"{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)

View File

@@ -0,0 +1,24 @@
from typing import Dict
from urllib.parse import urljoin
from shared.config import config
def rewrite_nav(nav: Dict[str, Dict], nav_redirects: Dict[str, str]):
    """Apply resolved subcategory redirects to the nav tree in place.

    Each sub's slug/href is replaced by its canonical form from
    *nav_redirects*; subs with empty slugs are dropped.  No-op when
    *nav_redirects* is empty.
    """
    if not nav_redirects:
        return
    for _label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        kept = []
        for sub in (data.get("subs") or []):
            old_sub = (sub.get("slug") or "").strip()
            if not old_sub:
                continue
            old_path = f"/{top_slug}/{old_sub}"
            canonical = nav_redirects.get(old_path, old_path)
            segments = [seg for seg in canonical.split("/") if seg]
            new_top, new_sub = segments[0], segments[1]
            sub["slug"] = new_sub
            sub["href"] = urljoin(config()["base_url"], f"/{new_top}/{new_sub}")
            kept.append(sub)
        data["subs"] = kept

View File

@@ -0,0 +1,16 @@
from typing import Dict, Set
# Map each top-level nav slug to its set of subcategory slugs.
def valid_subs(nav: Dict[str, Dict]) -> Dict[str, Set[str]]:
    """Return {top_slug: {stripped sub slugs}} for every category in the nav tree."""
    result: Dict[str, Set[str]] = {}
    for _label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        result[top_slug] = {
            (sub.get("slug") or "").strip()
            for sub in (data.get("subs") or [])
            if sub.get("slug")
        }
    return result

244
market/scrape/get_auth.py Normal file
View File

@@ -0,0 +1,244 @@
from typing import Optional, Dict, Any, List
from urllib.parse import urljoin
import httpx
from bs4 import BeautifulSoup
from shared.config import config
class LoginFailed(Exception):
    """Raised when the login flow cannot confirm an authenticated session.

    Carries a *debug* dict of request/response details for diagnosis.
    """

    def __init__(self, message: str, *, debug: Dict[str, Any]):
        super().__init__(message)
        self.debug = debug
def _ff_headers(referer: Optional[str] = None, origin: Optional[str] = None) -> Dict[str, str]:
h = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"DNT": "1",
"Sec-GPC": "1",
"Cache-Control": "no-cache",
"Pragma": "no-cache",
}
if referer:
h["Referer"] = referer
if origin:
h["Origin"] = origin
return h
def _cookie_header_from_jar(jar: httpx.Cookies, domain: str, path: str = "/") -> str:
    """Serialize cookies from *jar* that match *domain*/*path* into a Cookie header value."""

    def _domain_matches(cookie_domain: str) -> bool:
        # Accept exact matches and parent/child domain relationships.
        return (
            domain == cookie_domain
            or domain.endswith("." + cookie_domain)
            or cookie_domain.endswith("." + domain)
        )

    pairs: List[str] = []
    for c in jar.jar:
        if not c.name or c.value is None:
            continue
        dom = (c.domain or "").lstrip(".")
        if not dom or not _domain_matches(dom):
            continue
        if not path.startswith(c.path or "/"):
            continue
        pairs.append(f"{c.name}={c.value}")
    return "; ".join(pairs)
def _extract_magento_errors(html_text: str) -> list[str]:
msgs: list[str] = []
try:
soup = BeautifulSoup(html_text or "", "lxml")
for sel in [
".message-error",
".messages .message-error",
".page.messages .message-error",
"[data-ui-id='message-error']",
".message.warning",
".message.notice",
]:
for box in soup.select(sel):
t = " ".join((box.get_text(" ") or "").split())
if t and t not in msgs:
msgs.append(t)
except Exception:
pass
return msgs
def _looks_like_login_page(html_text: str) -> bool:
try:
s = BeautifulSoup(html_text or "", "lxml")
if s.select_one("form#login-form.form-login"):
return True
title = (s.title.get_text() if s.title else "").strip().lower()
if "customer login" in title:
return True
except Exception:
pass
return False
def _chrome_headers(referer=None, origin=None):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
if referer:
headers["Referer"] = referer
if origin:
headers["Origin"] = origin
return headers
async def login(
    username: str,
    password: str,
    *,
    extra_cookies = {},  # ok to pass cf_clearance etc., but NOT form_key
    timeout: float = 30.0,
) -> httpx.Cookies:
    """
    Attempt login and return an authenticated cookie jar.
    Success criteria (strict):
      1) /customer/section/load?sections=customer reports is_logged_in == True
      OR
      2) GET /customer/account/ resolves to an account page (not the login page).
    Otherwise raises LoginFailed with debug info.

    Note: the mutable default for extra_cookies is only read (never mutated),
    so the shared default object is harmless here.
    """
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=6)
    cookies = httpx.Cookies()
    # Seed consent cookies plus any caller-supplied ones; a stale form_key is
    # filtered out because a fresh one is scraped from the login page below.
    for k, v in {
        **extra_cookies,
        "pr-cookie-consent": '["all"]',
        "user_allowed_save_cookie": '{"1":1}',
    }.items():
        if k.lower() == "form_key":
            continue
        cookies.set(k, v, domain="wholesale.suma.coop", path="/")
    base_login = config()["base_login"]
    base_url = config()["base_url"]
    # NOTE(review): cookie domains are hard-coded to wholesale.suma.coop while
    # URLs come from config() — confirm they always agree.
    async with httpx.AsyncClient(
        follow_redirects=True,
        timeout=httpx.Timeout(timeout, connect=15.0),
        http2=True,
        limits=limits,
        cookies=cookies,
        headers=_chrome_headers(),
        trust_env=True,
    ) as client:
        # 1) GET login page for fresh form_key
        import time
        login_bust = base_login + ("&" if "?" in base_login else "?") + f"_={int(time.time()*1000)}"
        # NOTE(review): the cache-busted URL above is immediately discarded by
        # the next line — dead code; either use login_bust or delete it.
        login_bust = base_login
        r_get = await client.get(login_bust, headers=_chrome_headers())
        # NOTE(review): these debug prints run unconditionally (even on success)
        # and the "failed" wording is misleading; they also dump page HTML.
        print("Login GET failed. Status:", r_get.status_code)
        print("Login GET URL:", r_get.url)
        print("Response text:", r_get.text[:1000])  # trim if long
        r_get.raise_for_status()
        soup = BeautifulSoup(r_get.text, "lxml")
        form = soup.select_one("form.form.form-login#login-form") or soup.select_one("#login-form")
        if not form:
            raise LoginFailed(
                "Login form not found (possible bot challenge or theme change).",
                debug={"get_status": r_get.status_code, "final_url": str(r_get.url)},
            )
        action = urljoin(base_login, form.get("action") or base_login)
        fk_el = form.find("input", attrs={"name": "form_key"})
        hidden_form_key = (fk_el.get("value") if fk_el else "") or ""
        # mirror Magento behavior: form_key also appears as a cookie
        client.cookies.set("form_key", hidden_form_key, domain="wholesale.suma.coop", path="/")
        payload = {
            "form_key": hidden_form_key,
            "login[username]": username,
            "login[password]": password,
            "send": "Login",
        }
        post_headers = _chrome_headers(referer=base_login, origin=base_url)
        post_headers["Content-Type"] = "application/x-www-form-urlencoded"
        # Send an explicit Cookie header scoped to /customer/ like a browser would.
        post_headers["Cookie"] = _cookie_header_from_jar(
            client.cookies, domain="wholesale.suma.coop", path="/customer/"
        )
        r_post = await client.post(action, data=payload, headers=post_headers)
        # 2) Primary check: sections API must say logged in
        is_logged_in = False
        sections_url = "https://wholesale.suma.coop/customer/section/load/?sections=customer&force_new_section_timestamp=1"
        section_json: Dict[str, Any] = {}
        try:
            r_sec = await client.get(sections_url, headers=_chrome_headers(referer=base_login))
            if r_sec.status_code == 200:
                section_json = r_sec.json()
                cust = section_json.get("customer") or {}
                is_logged_in = bool(cust.get("is_logged_in"))
        except Exception:
            pass
        # 3) Secondary check: account page should NOT be the login page
        looks_like_login = False
        final_account_url = ""
        try:
            r_acc = await client.get("https://wholesale.suma.coop/customer/account/", headers=_chrome_headers(referer=base_login))
            final_account_url = str(r_acc.url)
            looks_like_login = (
                "/customer/account/login" in final_account_url
                or _looks_like_login_page(r_acc.text)
            )
        except Exception:
            # ignore; we'll rely on section status
            pass
        # Decide success/failure strictly
        if not (is_logged_in or (final_account_url and not looks_like_login)):
            errors = _extract_magento_errors(r_post.text)
            # Clean up transient form_key cookie
            try:
                client.cookies.jar.clear("wholesale.suma.coop", "/", "form_key")
            except Exception:
                pass
            raise LoginFailed(
                errors[0] if errors else "Invalid username or password.",
                debug={
                    "get_status": r_get.status_code,
                    "post_status": r_post.status_code,
                    "post_final_url": str(r_post.url),
                    "sections_customer": section_json.get("customer"),
                    "account_final_url": final_account_url,
                    "looks_like_login_page": looks_like_login,
                },
            )

        def clear_cookie_everywhere(cookies: httpx.Cookies, name: str) -> None:
            # Remove every variant of *name* across all (domain, path) pairs.
            to_delete = []
            for c in list(cookies.jar):  # http.cookiejar.Cookie objects
                if c.name == name:
                    # Note: CookieJar.clear requires exact (domain, path, name)
                    to_delete.append((c.domain, c.path, c.name))
            for domain, path, nm in to_delete:
                try:
                    cookies.jar.clear(domain, path, nm)
                except KeyError:
                    # Mismatch can happen if domain has a leading dot vs not, etc.
                    # Try again with a normalized domain variant.
                    if domain and domain.startswith("."):
                        cookies.jar.clear(domain.lstrip("."), path, nm)
                    else:
                        # or try with leading dot
                        cookies.jar.clear("." + domain, path, nm)
            if name in cookies:
                del cookies[name]

        # Drop the transient form_key before handing the jar back.
        clear_cookie_everywhere(client.cookies, "form_key")
        #client.cookies.jar.clear(config()["base_host"] or "wholesale.suma.coop", "/", "form_key")
        # NOTE(review): printing the full cookie jar leaks session tokens to logs.
        print('cookies', client.cookies)
        return client.cookies

View File

@@ -0,0 +1,44 @@
# suma_browser/html_utils.py
from __future__ import annotations
from typing import Optional
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from shared.config import config
def to_fragment(html: Optional[str]) -> str:
    """Return just the fragment contents (no <html>/<body> wrappers)."""
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    # Unwrap document-level containers so only the inner markup remains.
    for wrapper in soup.find_all(["html", "body"]):
        wrapper.unwrap()
    pieces = [str(child) for child in soup.contents]
    return "".join(pieces).strip()
def absolutize_fragment(html: Optional[str]) -> str:
    """Absolutize href/src against BASE_URL and return a fragment (no wrappers)."""
    if not html:
        return ""
    frag = BeautifulSoup(html, "lxml")
    for tag in frag.find_all(True):
        for attr in ("href", "src"):
            if tag.has_attr(attr):
                raw = str(tag[attr])
                # Only site-relative ("/...") values are resolved against base_url.
                tag[attr] = urljoin(config()["base_url"], raw) if raw.startswith("/") else raw
    # Unwrap wrappers and return only the inner HTML.
    for wrapper in frag.find_all(["html", "body"]):
        wrapper.unwrap()
    return "".join(str(child) for child in frag.contents).strip()

View File

@@ -0,0 +1,220 @@
# suma_browser/http_client.py
from __future__ import annotations
import asyncio
import os
import secrets
from typing import Optional, Dict
import httpx
from shared.config import config
_CLIENT: httpx.AsyncClient | None = None
# ----- optional decoders -> Accept-Encoding
BROTLI_OK = False
ZSTD_OK = False
try:
import brotli # noqa: F401
BROTLI_OK = True
except Exception:
pass
try:
import zstandard as zstd # noqa: F401
ZSTD_OK = True
except Exception:
pass
def _accept_encoding() -> str:
enc = ["gzip", "deflate"]
if BROTLI_OK:
enc.append("br")
if ZSTD_OK:
enc.append("zstd")
return ", ".join(enc)
FIREFOX_UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0"
def _ff_headers(referer: Optional[str] = None) -> Dict[str, str]:
h = {
"User-Agent": FIREFOX_UA,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-GB,en;q=0.5",
"Accept-Encoding": _accept_encoding(),
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none" if not referer else "same-origin",
"Sec-Fetch-User": "?1",
"DNT": "1",
"Sec-GPC": "1",
"Priority": "u=0, i",
"Cache-Control": "no-cache",
"Pragma": "no-cache",
}
if referer:
h["Referer"] = referer
return h
def _chrome_headers(referer=None, origin=None):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
if referer:
headers["Referer"] = referer
if origin:
headers["Origin"] = origin
return headers
def _parse_cookie_header(cookie_header: str) -> Dict[str, str]:
jar: Dict[str, str] = {}
for part in cookie_header.split(";"):
part = part.strip()
if not part or "=" not in part:
continue
k, v = part.split("=", 1)
jar[k.strip()] = v.strip()
return jar
def _looks_like_cloudflare(html: bytes) -> bool:
if not html:
return False
s = html[:40000].lower()
return (
b"please wait while your request is being verified" in s
or b"/cdn-cgi/challenge-platform/scripts/jsd/main.js" in s
or b"rocket-loader.min.js" in s
or b"cf-ray" in s
or b"challenge-platform" in s
or b"cf-chl-" in s
)
# -------- runtime cookie configuration (preferred over env) --------------------
# Cookies to seed into the shared client before (or immediately after) it is built.
_INITIAL_COOKIES: Dict[str, str] = {}
_INITIAL_COOKIE_HEADER: Optional[str] = None


async def configure_cookies(cookies: Dict[str, str]) -> None:
    """
    Configure initial cookies programmatically (preferred over env).
    Call BEFORE the first request (i.e., before get_client()/fetch()).
    If a client already exists, its jar is updated immediately.
    """
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    # A programmatic dict supersedes any previously configured raw header.
    _INITIAL_COOKIE_HEADER = None
    _INITIAL_COOKIES = dict(cookies or {})
    # If client already built, update it now
    if _CLIENT is not None:
        # NOTE(review): debug print — consider shared.utils.log or removal.
        print('configuring cookies')
        host = config()["base_host"] or "wholesale.suma.coop"
        for k, v in _INITIAL_COOKIES.items():
            _CLIENT.cookies.set(k, v, domain=host, path="/")
def configure_cookies_from_header(cookie_header: str) -> None:
    """
    Configure initial cookies from a raw 'Cookie:' header string.
    Preferred over env; call BEFORE the first request.
    """
    # NOTE(review): this is sync while configure_cookies() is async, though
    # neither awaits anything — confirm the asymmetry is intentional.
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    _INITIAL_COOKIE_HEADER = cookie_header or ""
    _INITIAL_COOKIES = _parse_cookie_header(_INITIAL_COOKIE_HEADER)
    # Push the parsed cookies into an already-built client immediately.
    if _CLIENT is not None:
        host = config()["base_host"] or "wholesale.suma.coop"
        for k, v in _INITIAL_COOKIES.items():
            _CLIENT.cookies.set(k, v, domain=host, path="/")
# ------------------------------------------------------------------------------
async def get_client() -> httpx.AsyncClient:
    """Public accessor for the lazily-built shared AsyncClient (same as _get_client)."""
    return await _get_client()
async def _get_client() -> httpx.AsyncClient:
    """Build (once) and return the module-wide AsyncClient, seeding its cookie jar.

    Seed priority: runtime-configured cookies (configure_cookies*) over the
    SUMA_COOKIES env var.  NOTE(review): first call is not guarded against
    concurrent construction — confirm callers serialize the initial request.
    """
    global _CLIENT
    if _CLIENT is None:
        # Generous timeouts; detail pages can be slow behind Cloudflare — TODO confirm.
        timeout = httpx.Timeout(300.0, connect=150.0)
        limits = httpx.Limits(max_keepalive_connections=8, max_connections=16)
        _CLIENT = httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            http2=True,
            limits=limits,
            headers=_chrome_headers(),
            trust_env=True,
        )
        # ---- Seed cookies (priority: runtime config > env var) ---------------
        host = config()["base_host"] or "wholesale.suma.coop"
        if _INITIAL_COOKIES or _INITIAL_COOKIE_HEADER:
            # From runtime config
            if _INITIAL_COOKIE_HEADER:
                _CLIENT.cookies.update(_parse_cookie_header(_INITIAL_COOKIE_HEADER))
            for k, v in _INITIAL_COOKIES.items():
                _CLIENT.cookies.set(k, v, domain=host, path="/")
        else:
            # Fallback to environment
            cookie_str = os.environ.get("SUMA_COOKIES", "").strip()
            if cookie_str:
                _CLIENT.cookies.update(_parse_cookie_header(cookie_str))
        # Ensure private_content_version is present
        if "private_content_version" not in _CLIENT.cookies:
            pcv = secrets.token_hex(16)
            _CLIENT.cookies.set("private_content_version", pcv, domain=host, path="/")
        # ---------------------------------------------------------------------
    return _CLIENT
async def aclose_client() -> None:
    """Close the shared client (if any) and reset it so the next call rebuilds one."""
    global _CLIENT
    if _CLIENT is None:
        return
    await _CLIENT.aclose()
    _CLIENT = None
async def fetch(url: str, *, referer: Optional[str] = None, retries: int = 3) -> str:
    """GET *url* through the shared client with Cloudflare-aware retries.

    On a suspected Cloudflare interstitial, re-visits the site root to refresh
    cookies and retries with backoff.  Raises the last HTTP/transport error
    after *retries* attempts; returns the response text on success.
    """
    client = await _get_client()
    # Warm-up visit to look like a real session
    if len(client.cookies.jar) == 0:
        try:
            await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
            await asyncio.sleep(0.25)
        except Exception:
            pass
    last_exc: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            h = _chrome_headers(referer=referer or (config()["base_url"].rstrip("/") + "/"))
            r = await client.get(url, headers=h)
            if _looks_like_cloudflare(r.content):
                if attempt < retries:
                    # Back off, touch the home page to renew clearance, retry.
                    await asyncio.sleep(0.9 if attempt == 1 else 1.3)
                    try:
                        await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
                        await asyncio.sleep(0.4)
                    except Exception:
                        pass
                    continue
                # NOTE(review): on the final attempt a CF page falls through and
                # is treated like a normal response below — confirm intended.
            try:
                r.raise_for_status()
            except httpx.HTTPStatusError as e:
                print(f"Fetch failed for {url}")
                print("Status:", r.status_code)
                print("Body:", r.text[:1000])  # Trimmed
                raise
            return r.text
        except Exception as e:
            last_exc = e
            if attempt >= retries:
                raise
            # Linear backoff before the next attempt.
            await asyncio.sleep(0.45 * attempt + 0.25)
    # Defensive: loop always returns or raises, but keep an explicit tail.
    if last_exc:
        raise last_exc
    raise RuntimeError("fetch failed unexpectedly")

289
market/scrape/listings.py Normal file
View File

@@ -0,0 +1,289 @@
from __future__ import annotations
import math
import re
from typing import Callable, Dict, List, Optional, Tuple
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
from .http_client import fetch
from bp.browse.services.slugs import product_slug_from_href
from bp.browse.services.state import (
KNOWN_PRODUCT_SLUGS,
_listing_page_cache,
_listing_page_ttl,
_listing_variant_cache,
_listing_variant_ttl,
now,
)
from shared.utils import normalize_text, soup_of
from shared.config import config
def parse_total_pages_from_text(text: str) -> Optional[int]:
    """Derive a page count from a 'Showing X of Y' banner; None if absent."""
    match = re.search(r"Showing\s+(\d+)\s+of\s+(\d+)", text, re.I)
    if match is None:
        return None
    shown, total = int(match.group(1)), int(match.group(2))
    # The pager shows 12/24/36 per page; treat any of those as the 36 default.
    per_page = 36 if shown in (12, 24, 36) else shown
    return max(1, math.ceil(total / per_page))
def _first_from_srcset(val: str) -> Optional[str]:
if not val:
return None
first = val.split(",")[0].strip()
parts = first.split()
return parts[0] if parts else first
def _abs_url(u: Optional[str]) -> Optional[str]:
if not u:
return None
return urljoin(config()["base_url"], u) if isinstance(u, str) and u.startswith("/") else u
def _collect_img_candidates(el) -> List[str]:
    """Gather every plausible image URL carried by *el*'s attributes."""
    if not el:
        return []
    found: List[str] = []
    # Direct URL-bearing attributes, in priority order.
    for attr in ("src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href"):
        val = el.get(attr)
        if val:
            found.append(val)
    # srcset-style attributes: take the first candidate's URL.
    for attr in ("srcset", "data-srcset"):
        val = el.get(attr)
        if not val:
            continue
        head = val.split(",")[0].strip()
        pieces = head.split()
        candidate = pieces[0] if pieces else head
        if candidate:
            found.append(candidate)
    return found
def _dedupe_preserve_order_by(seq: List[str], key: Callable[[str], str]) -> List[str]:
seen = set()
out: List[str] = []
for s in seq:
if not s:
continue
k = key(s)
if k in seen:
continue
seen.add(k)
out.append(s)
return out
def _filename_key(u: str) -> str:
p = urlparse(u)
path = p.path or ""
if path.endswith("/"):
path = path[:-1]
last = path.split("/")[-1]
return f"{p.netloc}:{last}".lower()
def _parse_cards_from_soup(soup) -> List[str]:
    """Extract product slugs from a listing page, de-duplicated in order.

    Tries three strategies until one yields results:
      1. Magento 2 product-card selectors,
      2. any product-looking anchors inside the products container,
      3. JSON-LD (ItemList / Product) structured data.

    Fixes over the previous version: anchors without an href no longer crash
    (`a.get("href")` can return None), empty/None slugs are never added to
    KNOWN_PRODUCT_SLUGS or the result, and malformed JSON-LD no longer aborts
    the whole parse (the try/except had been commented out).
    """
    items: List[str] = []
    seen_slugs: set[str] = set()

    def _register(href) -> None:
        # Shared guard: absolutize, slugify, record globally, de-dupe locally.
        if not href:
            return
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        slug = product_slug_from_href(href)
        if not slug:  # never record empty/None slugs
            return
        KNOWN_PRODUCT_SLUGS.add(slug)
        if slug not in seen_slugs:
            seen_slugs.add(slug)
            items.append(slug)

    # Primary selectors (Magento 2 default)
    card_wrappers = soup.select(
        "li.product-item, .product-item, ol.products.list.items li, .products.list.items li, .product-item-info"
    )
    for card in card_wrappers:
        a = (
            card.select_one("a.product-item-link")
            or card.select_one(".product-item-name a")
            or card.select_one("a[href$='.html'], a[href$='.htm']")
        )
        if a:
            _register(a.get("href"))

    # Secondary: any product-looking anchors inside the products container
    if not items:
        products_container = soup.select_one(".products") or soup
        for a in products_container.select("a[href$='.html'], a[href$='.htm']"):
            _register(a.get("href"))

    # Tertiary: JSON-LD fallback (ItemList / Product)
    if not items:
        import json
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            try:
                data = json.loads(script.get_text())
            except ValueError:
                # Malformed JSON-LD must not abort listing extraction.
                continue
            entries = data if isinstance(data, list) else [data]
            for ent in entries:
                if not isinstance(ent, dict):
                    continue
                if ent.get("@type") == "Product":
                    _register(ent.get("url"))
                elif ent.get("@type") == "ItemList":
                    elements = ent.get("itemListElement")
                    for it in elements if isinstance(elements, list) else []:
                        if isinstance(it, dict):
                            obj = it.get("item") or it
                            if isinstance(obj, dict):
                                _register(obj.get("url"))
    return items
def _with_query(url: str, add: Dict[str, str]) -> str:
p = urlparse(url)
q = dict(parse_qsl(p.query, keep_blank_values=True))
q.update(add)
new_q = urlencode(q)
return urlunparse((p.scheme, p.netloc, p.path, p.params, new_q, p.fragment))
def _with_page(url: str, page: int) -> str:
    """Append ?p=<page> for any page after the first; page 1 keeps the URL."""
    return _with_query(url, {"p": str(page)}) if page and page > 1 else url
def _listing_base_key(url: str) -> str:
p = urlparse(url)
path = p.path.rstrip("/")
return f"{p.scheme}://{p.netloc}{path}".lower()
def _variant_cache_get(base_key: str) -> Optional[str]:
    """Return the cached working URL for *base_key*, evicting expired entries."""
    entry = _listing_variant_cache.get(base_key)
    if entry is None:
        return None
    cached_url, stored_at = entry
    if (now() - stored_at) > _listing_variant_ttl:
        # Stale: drop the entry and report a miss.
        _listing_variant_cache.pop(base_key, None)
        return None
    return cached_url
def _variant_cache_set(base_key: str, working_url: str) -> None:
    """Remember the working listing-URL variant for *base_key*, timestamped."""
    _listing_variant_cache[base_key] = (working_url, now())
def _page_cache_get(working_url: str, page: int) -> Optional[Tuple[List[Dict], int]]:
    """Return cached (items, total_pages) for one page, evicting stale entries."""
    cache_key = f"{working_url}|p={page}"
    entry = _listing_page_cache.get(cache_key)
    if entry is None:
        return None
    (cached_items, cached_total), stored_at = entry
    if (now() - stored_at) > _listing_page_ttl:
        _listing_page_cache.pop(cache_key, None)
        return None
    return cached_items, cached_total
def _page_cache_set(working_url: str, page: int, items: List[Dict], total_pages: int) -> None:
    """Cache one parsed listing page under its URL+page key, timestamped."""
    cache_key = f"{working_url}|p={page}"
    _listing_page_cache[cache_key] = ((items, total_pages), now())
async def _fetch_parse(url: str, page: int):
    """Fetch page *page* of a listing URL and return (slugs, soup)."""
    soup = soup_of(await fetch(_with_page(url, page)))
    return _parse_cards_from_soup(soup), soup
async def scrape_products(list_url: str, page: int = 1):
    """Fetch one listing page and return (product_slugs, total_pages).

    NOTE(review): the previous docstring promised "variant memoization +
    page cache", but neither cache was consulted here; the stray
    `_listing_base_key(list_url)` call (result discarded) has been removed.
    Wire up `_variant_cache_*` / `_page_cache_*` if caching is desired.
    """
    items, soup = await _fetch_parse(list_url, page)
    total_pages = _derive_total_pages(soup)
    return items, total_pages
def _derive_total_pages(soup) -> int:
    """Best-effort page count: 'Showing X of Y' banner first, then ?p= links."""
    banner_pages = parse_total_pages_from_text(normalize_text(soup.get_text(" ")))
    if banner_pages:
        return banner_pages
    # Fall back to the highest page number referenced by pagination links.
    found = {1}
    for anchor in soup.find_all("a", href=True):
        hit = re.search(r"[?&]p=(\d+)", anchor["href"])
        if hit:
            found.add(int(hit.group(1)))
    return max(found) if found else 1
def _slugs_from_list_url(list_url: str) -> Tuple[str, Optional[str]]:
p = urlparse(list_url)
parts = [x for x in (p.path or "").split("/") if x]
top = parts[0].lower() if parts else ""
sub = None
if len(parts) >= 2:
sub = parts[1]
if sub.lower().endswith((".html", ".htm")):
sub = re.sub(r"\.(html?|HTML?)$", "", sub)
return top, sub

104
market/scrape/nav.py Normal file
View File

@@ -0,0 +1,104 @@
from __future__ import annotations
import re
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from shared.config import config
from .http_client import fetch # only fetch; define soup_of locally
#from .. import cache_backend as cb
#from ..blacklist.category import is_category_blocked # Reverse map: slug -> label
# ------------------ Caches ------------------
def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* with the lxml backend; None/empty input yields empty soup."""
    source = html if html else ""
    return BeautifulSoup(source, "lxml")
def normalize_text(s: str) -> str:
    """Trim the ends and collapse internal whitespace runs to single spaces."""
    stripped = (s or "").strip()
    return re.sub(r"\s+", " ", stripped)
async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Fetch the homepage and return every same-site anchor as (text, href)."""
    base = config()["base_url"]
    soup = soup_of(await fetch(base))
    anchors: List[Tuple[str, str]] = []
    for link in soup.find_all("a", href=True):
        label = normalize_text(link.get_text())
        if not label:
            continue
        href = link["href"].strip()
        if href.startswith("/"):
            href = urljoin(base, href)
        # Keep only links that stay on the configured site.
        if href.startswith(base):
            anchors.append((label, href))
    return anchors
def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the second path segment of *href* when the first matches *top_slug*.

    Matching is case-insensitive; a trailing .html/.htm extension is stripped.
    """
    segments = [seg for seg in (urlparse(href).path or "").split("/") if seg]
    if len(segments) < 2 or segments[0].lower() != top_slug.lower():
        return None
    sub = segments[1]
    if sub.lower().endswith((".html", ".htm")):
        sub = re.sub(r"\.(html?|HTML?)$", "", sub)
    return sub
async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Shape raw (text, href) anchors into the nav structure.

    Only categories in the config allow-list are emitted. Returns
    {"cats": {label: {"href", "slug", "subs": [...]}}} with subs sorted by
    display text. Each sub entry carries the display text under the key
    "name" (not "label") — downstream consumers must read "name".

    The large block of commented-out per-category listing scrapes that lived
    here has been removed (dead code); listing scraping happens elsewhere.
    """
    nav: Dict[str, Dict] = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda x: x["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Group homepage anchors by top-level path slug, skipping configured slugs."""
    skip_slugs = config()["slugs"]["skip"]
    by_top: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in await scrape_nav_raw():
        segments = [seg for seg in (urlparse(href).path or "").split("/") if seg]
        if not segments:
            continue
        top = segments[0].lower()
        if top not in skip_slugs:
            by_top.setdefault(top, []).append((text, href))
    return await group_by_category(by_top)
async def nav_scrape() -> Dict[str, Dict]:
    """Return navigation structure; use snapshot when offline."""
    return await scrape_nav_filtered()

View File

@@ -0,0 +1,6 @@
from .upsert_product import upsert_product
from .log_product_result import log_product_result
from .save_nav import save_nav
from .save_subcategory_redirects import save_subcategory_redirects
from .capture_listing import capture_listing

View File

@@ -0,0 +1,27 @@
# HTTP-bridge helper: posts a scraped listing snapshot to the market API.
import os
import httpx
from typing import List
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
):
    """POST one listing snapshot (url, slugs, page count) to the capture API.

    The endpoint defaults to a local dev URL and can be overridden via the
    CAPTURE_LISTING_URL environment variable. Raises httpx.HTTPStatusError on
    a non-2xx response; returns the parsed JSON body ({} when empty).
    """
    sync_url = os.getenv("CAPTURE_LISTING_URL", "http://localhost:8001/market/suma-market/api/products/listing/")
    body = {"url": url, "items": items, "total_pages": total_pages}
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(sync_url, json=body)
        resp.raise_for_status()  # surface non-2xx to the caller
        return resp.json() if resp.content else {}

View File

@@ -0,0 +1,24 @@
# HTTP-bridge helper: posts a product scrape outcome to the market log API.
import os
import httpx
async def log_product_result(
    ok: bool,
    payload
):
    """POST one scrape outcome (ok flag + diagnostic payload) to the log API.

    Endpoint overrideable via PRODUCT_LOG_URL. Raises httpx.HTTPStatusError
    on non-2xx; returns the parsed JSON response ({} when the body is empty).
    """
    sync_url = os.getenv("PRODUCT_LOG_URL", "http://localhost:8000/market/api/products/log/")
    body = {"ok": ok, "payload": payload}
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(sync_url, json=body)
        resp.raise_for_status()  # surface non-2xx to the caller
        return resp.json() if resp.content else {}

View File

@@ -0,0 +1,19 @@
# HTTP-bridge helper: posts the scraped nav tree to the market API.
import os
import httpx
from typing import Dict
async def save_nav(
    nav: Dict,
):
    """POST the scraped nav tree to the save-nav API endpoint.

    Endpoint overrideable via SAVE_NAV_URL. Raises httpx.HTTPStatusError on
    non-2xx; returns the parsed JSON response ({} when the body is empty).
    """
    sync_url = os.getenv("SAVE_NAV_URL", "http://localhost:8001/market/suma-market/api/products/nav/")
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(sync_url, json=nav)
        resp.raise_for_status()  # surface non-2xx to the caller
        return resp.json() if resp.content else {}

View File

@@ -0,0 +1,15 @@
import os
import httpx
from typing import Dict
async def save_subcategory_redirects(mapping: Dict[str, str]) -> Dict:
    """POST the old->new subcategory redirect mapping to the redirects API.

    Endpoint overrideable via SAVE_REDIRECTS. Raises httpx.HTTPStatusError on
    non-2xx; returns the parsed JSON response ({} when the body is empty).
    The return annotation previously said ``None`` although the function has
    always returned the response payload — fixed to match behavior.
    """
    sync_url = os.getenv("SAVE_REDIRECTS", "http://localhost:8000/market/api/products/redirects/")
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        resp = await client.post(sync_url, json=mapping)
        # Raise for non-2xx
        resp.raise_for_status()
        data = resp.json() if resp.content else {}
        return data

View File

@@ -0,0 +1,256 @@
# HTTP-bridge helper: massages a scraped product dict and posts it to the sync API.
import os
import httpx
from typing import Dict, List, Any
async def upsert_product(
    slug,
    href,
    d,
):
    """POST product dict *d* to the product-sync endpoint, retrying forever.

    Keeps the original signature. The *slug* argument is copied into the
    payload when missing; *href* is currently unused here (kept for callers).
    The endpoint defaults to a local dev URL and can be overridden via
    PRODUCT_SYNC_URL. Returns the endpoint's JSON response ({} for an empty
    body, {"raw": text} for a non-JSON body).

    The superseded inline-POST implementation that lingered here as
    commented-out code has been removed.
    """
    # Ensure slug in payload matches the function arg if present
    if not d.get("slug"):
        d["slug"] = slug
    # Where to post; override via env if needed
    sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8001/market/suma-market/api/products/sync/")
    payload = _massage_payload(d)

    async def _do_call() -> Dict[str, Any]:
        # One POST attempt; retry_until_success handles the looping.
        async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
            resp = await client.post(sync_url, json=payload)
            resp.raise_for_status()
            # tolerate empty body
            if not resp.content:
                return {}
            # prefer JSON if possible, otherwise return text
            try:
                return resp.json()
            except ValueError:
                return {"raw": resp.text}

    async def _log_error(exc: BaseException) -> None:
        # Optional: add your own logging here
        print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}")

    return await retry_until_success(_do_call, delay=5.0, on_error=_log_error)
import asyncio
from typing import Any, Awaitable, Callable, Dict, Optional
async def retry_until_success(
    fn: Callable[[], Awaitable[Any]],
    *,
    delay: float = 5.0,
    on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None,
) -> Any:
    """Call the async no-arg *fn* until it returns without raising; never give up.

    Sleeps *delay* seconds between attempts. Cancellation propagates
    immediately. After each failure the optional *on_error* coroutine is
    awaited (its own failures are swallowed); without a handler, a fallback
    line is printed instead.
    """
    failures = 0
    while True:
        try:
            return await fn()
        except asyncio.CancelledError:
            raise  # never swallow or retry a cancellation
        except BaseException as exc:
            failures += 1
            if on_error is not None:
                try:
                    await on_error(exc)
                except Exception:
                    pass  # handler errors must not stop the retry loop
            else:
                print(f"[retry] attempt {failures} failed: {type(exc).__name__}: {exc}")
            await asyncio.sleep(delay)
def _get(d, key, default=None):
v = d.get(key)
return default if v in (None, "", [], {}) else v
def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]:
    """Mirror the DB-upsert massaging so the API sees the same structure/values.

    Takes the raw scraped product dict *d* and returns a normalized payload:
    scalar fields passed through `_get` (empty-ish values become None),
    sections/images/labels/stickers/nutrition/allergens coerced into the
    shapes the sync API expects, and a flattened "attributes" list.
    Raises ValueError when *d* has no slug.

    NOTE: "images" is assigned twice on purpose — first as a flat URL list,
    then (near the end) rebuilt as a combined list of dicts covering the
    gallery / embedded / all buckets.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    # --- Top-level fields (use _get where DB upsert uses it) ---
    out: Dict[str, Any] = {
        "slug": slug,
        "title": _get(d, "title"),
        "image": _get(d, "image"),
        "description_short": _get(d, "description_short"),
        "description_html": _get(d, "description_html"),
        "suma_href": _get(d, "suma_href"),
        "brand": _get(d, "brand"),
        "rrp": _get(d, "rrp"),
        "rrp_currency": _get(d, "rrp_currency"),
        "rrp_raw": _get(d, "rrp_raw"),
        "price_per_unit": _get(d, "price_per_unit"),
        "price_per_unit_currency": _get(d, "price_per_unit_currency"),
        "price_per_unit_raw": _get(d, "price_per_unit_raw"),
        "special_price": _get(d, "special_price"),
        "special_price_currency": _get(d, "special_price_currency"),
        "special_price_raw": _get(d, "special_price_raw"),
        "regular_price": _get(d, "regular_price"),
        "regular_price_currency": _get(d, "regular_price_currency"),
        "regular_price_raw": _get(d, "regular_price_raw"),
        "case_size_count": _get(d, "case_size_count"),
        "case_size_item_qty": _get(d, "case_size_item_qty"),
        "case_size_item_unit": _get(d, "case_size_item_unit"),
        "case_size_raw": _get(d, "case_size_raw"),
        # "barcode" is accepted as an alias for "ean"
        "ean": d.get("ean") or d.get("barcode") or None,
        "sku": d.get("sku"),
        "unit_size": d.get("unit_size"),
        "pack_size": d.get("pack_size"),
    }
    # --- Sections: only dicts with title+html (like DB sync) ---
    sections_in = d.get("sections") or []
    sections_out: List[Dict[str, Any]] = []
    for sec in sections_in:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            sections_out.append({"title": sec["title"], "html": sec["html"]})
    out["sections"] = sections_out
    # --- Images: same 3 buckets used in DB sync ---
    def _coerce_str_list(x):
        # Normalize an image bucket into a flat list of non-empty URL strings.
        if not x:
            return []
        # accept list of strings or list of dicts with {"url": ...}
        out_urls = []
        for item in x:
            if isinstance(item, str):
                if item:
                    out_urls.append(item)
            elif isinstance(item, dict):
                u = item.get("url")
                if u:
                    out_urls.append(u)
        return out_urls
    out["images"] = _coerce_str_list(d.get("images"))
    out["embedded_image_urls"] = _coerce_str_list(d.get("embedded_image_urls"))
    out["all_image_urls"] = _coerce_str_list(d.get("all_image_urls"))
    # --- Labels: strip (DB code trims) ---
    labels_in = d.get("labels") or []
    out["labels"] = [str(x).strip() for x in labels_in if x]
    # --- Stickers: strip + lower (DB code lower-cases) ---
    stickers_in = d.get("stickers") or []
    out["stickers"] = [str(x).strip().lower() for x in stickers_in if x]
    # --- Attributes: pass through the same dict sources the DB code reads ---
    out["info_table"] = d.get("info_table") or {}
    #out["oe_list_price"] = d.get("oe_list_price") or {}
    # --- Nutrition: allow dict or list of dicts, mirroring DB code ---
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        out["nutrition"] = {str(k).strip(): (None if v is None else str(v)) for k, v in nutrition.items()}
    elif isinstance(nutrition, list):
        rows = []
        for row in nutrition:
            if not isinstance(row, dict):
                continue
            key = str(row.get("key") or "").strip()
            if not key:
                continue  # rows without a key carry no information
            rows.append({
                "key": key,
                "value": None if row.get("value") is None else str(row.get("value")),
                "unit": None if row.get("unit") is None else str(row.get("unit")),
            })
        out["nutrition"] = rows
    else:
        out["nutrition"] = []
    # --- Allergens: accept str (→ contains=True) or dict ---
    alls_in = d.get("allergens") or []
    alls_out = []
    for a in alls_in:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            alls_out.append({"name": nm, "contains": contains})
    out["allergens"] = alls_out
    # Rebuild "images" as positioned dicts combining the three buckets
    # (gallery / embedded / all) collected above.
    out["images"]=[
        {"url": s.strip(), "kind": "gallery", "position": i}
        for i, s in enumerate(out.get("images") or [])
        if isinstance(s, str) and s.strip()
    ] + [
        {"url": s.strip(), "kind": "embedded", "position": i}
        for i, s in enumerate(out.get("embedded_image_urls") or [])
        if isinstance(s, str) and s.strip()
    ] + [
        {"url": s.strip(), "kind": "all", "position": i}
        for i, s in enumerate(out.get("all_image_urls") or [])
        if isinstance(s, str) and s.strip()
    ]
    # Labels/stickers are re-wrapped into {"name": ...} dicts for the API.
    out["labels"]= [{"name": s.strip()} for s in out["labels"] if isinstance(s, str) and s.strip()]
    out["stickers"]= [{"name": s.strip()} for s in out["stickers"] if isinstance(s, str) and s.strip()]
    out["attributes"] = build_attributes_list(d)
    return out
def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten info_table / oe_list_price dicts into prefixed key/value attrs.

    Keys become "<source>/<stripped key>"; values are stringified with None
    preserved. Duplicate (key, value) pairs are dropped, first-seen wins.
    """
    sources = (
        ("info_table", d.get("info_table") or {}),
        ("oe_list_price", d.get("oe_list_price") or {}),
    )
    seen_pairs = set()
    result: List[Dict[str, Any]] = []
    for prefix, table in sources:
        for raw_key, raw_val in table.items():
            entry = {
                "key": f"{prefix}/{str(raw_key).strip()}",
                "value": None if raw_val is None else str(raw_val),
            }
            pair = (entry["key"], entry["value"])
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                result.append(entry)
    return result

View File

@@ -0,0 +1,7 @@
from .log_product_result import log_product_result
from .upsert_product import upsert_product
from .save_nav import save_nav
from .capture_listing import capture_listing
from .save_link_reports import save_link_reports
from .save_subcategory_redirects import save_subcategory_redirects

View File

@@ -0,0 +1,3 @@
def _get(d, key, default=None):
v = d.get(key)
return default if v in (None, "", [], {}) else v

View File

@@ -0,0 +1,137 @@
# Persistence helper: listing capture (split out of the former persist_snapshot.py).
from typing import Optional, List
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional, Tuple
from sqlalchemy.dialects.postgresql import insert as pg_insert
from datetime import datetime
from sqlalchemy import (
select, update
)
from urllib.parse import urlparse
import re
from models.market import (
NavTop,
NavSub,
Listing,
ListingItem,
)
from shared.db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Persist one listing snapshot inside its own session/transaction."""
    async with get_session() as session:
        await _capture_listing(session, url, items, total_pages)
        await session.commit()
async def _capture_listing(
    session,
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Resolve nav ids for *url*, then upsert the listing rows (no commit)."""
    top_id, sub_id = await _nav_ids_from_list_url(session, url)
    await _save_listing(session, top_id, sub_id, items, total_pages)
async def _save_listing(session: AsyncSession, top_id: int, sub_id: Optional[int],
                        items: List[str], total_pages: Optional[int]) -> None:
    """Upsert the Listing row for (top_id, sub_id) and sync its ListingItems.

    Items present in the DB but missing from *items* are soft-deleted;
    items missing from the DB are inserted. No commit — caller owns it.

    NOTE(review): when *items* dedupes to empty the function returns early,
    so existing rows are NOT soft-deleted for a now-empty listing — confirm
    this guard is intentional.
    """
    res = await session.execute(
        select(Listing).where(Listing.top_id == top_id, Listing.sub_id == sub_id, Listing.deleted_at.is_(None))
    )
    listing = res.scalar_one_or_none()
    if not listing:
        listing = Listing(top_id=top_id, sub_id=sub_id, total_pages=total_pages)
        session.add(listing)
        await session.flush()
    else:
        listing.total_pages = total_pages
    # Normalize and deduplicate incoming slugs
    seen: set[str] = set()
    deduped: list[str] = []
    for s in items or []:
        if s and isinstance(s, str) and s not in seen:
            seen.add(s)
            deduped.append(s)
    if not deduped:
        return
    # Fetch existing slugs from the database
    res = await session.execute(
        select(ListingItem.slug)
        .where(ListingItem.listing_id == listing.id, ListingItem.deleted_at.is_(None))
    )
    existing_slugs = set(res.scalars().all())
    now = datetime.utcnow()
    # Slugs to delete (present in DB but not in the new data)
    to_delete = existing_slugs - seen
    if to_delete:
        await session.execute(
            update(ListingItem)
            .where(
                ListingItem.listing_id == listing.id,
                ListingItem.slug.in_(to_delete),
                ListingItem.deleted_at.is_(None)
            )
            .values(deleted_at=now)
        )
    # Slugs to insert (new ones not in DB)
    # NOTE(review): existing_slugs only covers LIVE rows, so a slug that was
    # previously soft-deleted gets a brand-new row here; the on_conflict
    # clause below is commented out — verify against the unique constraint.
    to_insert = seen - existing_slugs
    if to_insert:
        stmt = pg_insert(ListingItem).values(
            [{"listing_id": listing.id, "slug": s} for s in to_insert]
        )
        #.on_conflict_do_nothing(
        #    constraint="uq_listing_items_listing_slug"
        #)
        await session.execute(stmt)
async def _nav_ids_from_list_url(session: AsyncSession, list_url: str) -> Tuple[int, Optional[int]]:
    """Map a listing URL's /top/sub path to (NavTop.id, NavSub.id or None).

    Bug fix: the extension-stripping pattern contained a doubled backslash,
    so it matched a literal backslash followed by a dot and never removed the
    ".html"/".htm" suffix — NavSub lookups for such slugs then failed. The
    pattern now matches the one used by listings._slugs_from_list_url.
    """
    parts = [x for x in (urlparse(list_url).path or "").split("/") if x]
    top_slug = parts[0].lower() if parts else ""
    sub_slug = None
    if len(parts) >= 2:
        sub_slug = parts[1]
        if sub_slug.lower().endswith((".html", ".htm")):
            sub_slug = re.sub(r"\.(html?|HTML?)$", "", sub_slug)
    return await _get_nav_ids(session, top_slug, sub_slug)
async def _get_nav_ids(session: AsyncSession, top_slug: str, sub_slug: Optional[str]) -> Tuple[int, Optional[int]]:
    """Look up live NavTop/NavSub ids by slug; raise ValueError when absent."""
    top_result = await session.execute(
        select(NavTop.id).where(NavTop.slug == top_slug, NavTop.deleted_at.is_(None))
    )
    top_id = top_result.scalar_one_or_none()
    if not top_id:
        raise ValueError(f"NavTop not found for slug: {top_slug}")
    sub_id = None
    if sub_slug:
        sub_result = await session.execute(
            select(NavSub.id).where(
                NavSub.slug == sub_slug,
                NavSub.top_id == top_id,
                NavSub.deleted_at.is_(None),
            )
        )
        sub_id = sub_result.scalar_one_or_none()
        if sub_id is None:
            raise ValueError(f"NavSub not found for slug: {sub_slug} under top_id={top_id}")
    return top_id, sub_id

View File

@@ -0,0 +1,35 @@
# Persistence helper: product-result logging (split out of the former persist_snapshot.py).
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Dict
from models.market import (
ProductLog,
)
from shared.db.session import get_session
async def log_product_result(ok: bool, payload: Dict) -> None:
    """Record one scrape outcome as a ProductLog row in its own transaction."""
    async with get_session() as session:
        await _log_product_result(session, ok, payload)
        await session.commit()
async def _log_product_result(session: AsyncSession, ok: bool, payload: Dict) -> None:
    """Append one ProductLog row built from *payload* fields (no commit)."""
    field_names = (
        "slug", "href_tried", "error_type", "error_message", "http_status",
        "final_url", "transport_error", "title", "has_description_html",
        "has_description_short", "sections_count", "images_count",
        "embedded_images_count", "all_images_count",
    )
    row = ProductLog(ok=ok, **{name: payload.get(name) for name in field_names})
    session.add(row)

View File

@@ -0,0 +1,29 @@
# Persistence helper: link reports (split out of the former persist_snapshot.py).
from typing import List
from typing import Dict, List
from models.market import (
LinkError,
LinkExternal,
)
from shared.db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def save_link_reports(link_errors: List[Dict], link_externals: List[Dict]) -> None:
    """Persist broken-link and external-link reports in one transaction."""
    async with get_session() as session:
        for rec in link_errors:
            session.add(LinkError(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                top=rec.get("top"),
                sub=rec.get("sub"),
                target_slug=rec.get("target_slug"),
                type=rec.get("type"),
            ))
        for rec in link_externals:
            session.add(LinkExternal(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                host=rec.get("host"),
            ))
        await session.commit()

View File

@@ -0,0 +1,110 @@
# Persistence helper: nav sync (split out of the former persist_snapshot.py).
from datetime import datetime
from sqlalchemy import (
select, tuple_
)
from typing import Dict
from models.market import (
NavTop,
NavSub,
)
from shared.db.session import get_session
async def save_nav(nav: Dict) -> None:
    """Replace the stored nav tree with *nav* inside one session/transaction."""
    async with get_session() as session:
        await _save_nav(session, nav)
        await session.commit()
async def _save_nav(session, nav: Dict, market_id=None) -> None:
    """Sync NavTop/NavSub rows to match *nav*, soft-deleting stale entries.

    *nav* has the scraper's shape: {"cats": {label: {"slug", "href",
    "subs": [{"name", "slug", "href"}, ...]}}}. No commit is performed —
    the caller owns the transaction.

    Fixes: the debug prints were removed, and the sub display text is now
    read from "name" (the key the scraper actually emits) with "label"
    accepted as a fallback — previously only "label" was read, which always
    yielded None.
    """
    now = datetime.utcnow()
    incoming_top_slugs = set()
    incoming_sub_keys = set()  # (top_slug, sub_slug)

    # First pass: collect the slugs present in the incoming nav.
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        incoming_top_slugs.add(top_slug)
        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if sub_slug:
                incoming_sub_keys.add((top_slug, sub_slug))

    # Soft-delete stale NavSub entries (join NavTop to compare (top, sub) keys).
    # NOTE(review): with an empty incoming nav this soft-deletes every sub —
    # confirm that is the intended "nav now empty" behavior.
    subs_to_delete = await session.execute(
        select(NavSub)
        .join(NavTop, NavSub.top_id == NavTop.id)
        .where(
            NavSub.deleted_at.is_(None),
            ~tuple_(NavTop.slug, NavSub.slug).in_(incoming_sub_keys)
        )
    )
    for sub in subs_to_delete.scalars():
        sub.deleted_at = now

    # Soft-delete stale NavTop entries.
    tops_to_delete = await session.execute(
        select(NavTop)
        .where(
            NavTop.deleted_at.is_(None),
            ~NavTop.slug.in_(incoming_top_slugs)
        )
    )
    for top in tops_to_delete.scalars():
        top.deleted_at = now
    await session.flush()

    # Upsert NavTop and NavSub (revives previously soft-deleted rows).
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        res = await session.execute(
            select(NavTop).where(NavTop.slug == top_slug)
        )
        top = res.scalar_one_or_none()
        if top:
            top.label = label
            top.deleted_at = None
            if market_id is not None and top.market_id is None:
                top.market_id = market_id
        else:
            top = NavTop(label=label, slug=top_slug, market_id=market_id)
            session.add(top)
            await session.flush()
        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if not sub_slug:
                continue
            # The scraper emits the display text under "name"; accept
            # "label" too for older payloads.
            sub_label = s.get("name") or s.get("label")
            sub_href = s.get("href")
            res_sub = await session.execute(
                select(NavSub).where(
                    NavSub.slug == sub_slug,
                    NavSub.top_id == top.id
                )
            )
            sub = res_sub.scalar_one_or_none()
            if sub:
                sub.label = sub_label
                sub.href = sub_href
                sub.deleted_at = None
            else:
                session.add(NavSub(top_id=top.id, label=sub_label, slug=sub_slug, href=sub_href))

View File

@@ -0,0 +1,32 @@
# Persistence helper: subcategory redirects (split out of the former persist_snapshot.py).
from typing import Dict
from datetime import datetime
from sqlalchemy import (
update
)
from models.market import (
SubcategoryRedirect,
)
from shared.db.session import get_session
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def save_subcategory_redirects(mapping: Dict[str, str]) -> None:
    """Replace all live subcategory redirects with *mapping* in one transaction."""
    async with get_session() as session:
        await _save_subcategory_redirects(session, mapping)
        await session.commit()
async def _save_subcategory_redirects(session, mapping: Dict[str, str]) -> None:
    """Soft-delete every live redirect, then insert the new mapping rows."""
    stamp = datetime.utcnow()
    await session.execute(
        update(SubcategoryRedirect)
        .where(SubcategoryRedirect.deleted_at.is_(None))
        .values(deleted_at=stamp)
    )
    for old_path, new_path in mapping.items():
        session.add(SubcategoryRedirect(old_path=old_path, new_path=new_path))
#for slug in items:
# product_slugs.add(slug)

View File

@@ -0,0 +1,237 @@
# Persistence helper: product upsert (split out of the former persist_snapshot.py).
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Dict
from datetime import datetime
from sqlalchemy import (
func, select, update
)
from models.market import (
Product,
ProductImage,
ProductSection,
ProductLabel,
ProductSticker,
ProductAttribute,
ProductNutrition,
ProductAllergen
)
from shared.db.session import get_session
from ._get import _get
from .log_product_result import _log_product_result
# --- Models are unchanged, see original code ---
# ---------------------- Helper fns called from scraper ------------------------
async def _upsert_product(session: AsyncSession, d: Dict) -> Product:
    """Create or update a Product (and all child rows) from a scraped dict.

    Child collections (sections, images, labels, stickers, attributes,
    nutrition, allergens) are synchronised by set-difference: rows missing
    from ``d`` are soft-deleted (``deleted_at`` stamped), new ones inserted.

    Raises:
        ValueError: when ``d`` has no ``slug``.
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    res = await session.execute(select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)))
    p = res.scalar_one_or_none()
    if not p:
        p = Product(slug=slug)
        session.add(p)
        # Flush immediately so a brand-new product gets its primary key now:
        # every child row below is constructed with product_id=p.id, which
        # would otherwise still be None for an unflushed Product.
        await session.flush()
    # Scalar fields: overwritten unconditionally from the scrape payload.
    p.title = _get(d, "title")
    p.image = _get(d, "image")
    p.description_short = _get(d, "description_short")
    p.description_html = _get(d, "description_html")
    p.suma_href = _get(d, "suma_href")
    p.brand = _get(d, "brand")
    p.rrp = _get(d, "rrp")
    p.rrp_currency = _get(d, "rrp_currency")
    p.rrp_raw = _get(d, "rrp_raw")
    p.price_per_unit = _get(d, "price_per_unit")
    p.price_per_unit_currency = _get(d, "price_per_unit_currency")
    p.price_per_unit_raw = _get(d, "price_per_unit_raw")
    p.special_price = _get(d, "special_price")
    p.special_price_currency = _get(d, "special_price_currency")
    p.special_price_raw = _get(d, "special_price_raw")
    p.regular_price = _get(d, "regular_price")
    p.regular_price_currency = _get(d, "regular_price_currency")
    p.regular_price_raw = _get(d, "regular_price_raw")
    p.case_size_count = _get(d, "case_size_count")
    p.case_size_item_qty = _get(d, "case_size_item_qty")
    p.case_size_item_unit = _get(d, "case_size_item_unit")
    p.case_size_raw = _get(d, "case_size_raw")
    p.ean = d.get("ean") or d.get("barcode") or None
    p.sku = d.get("sku")
    p.unit_size = d.get("unit_size")
    p.pack_size = d.get("pack_size")
    p.updated_at = func.now()
    # One timestamp shared by every soft-delete in this sync pass.
    now = datetime.utcnow()
    # ProductSection sync: keyed by (title, html)
    existing_sections = await session.execute(select(ProductSection).where(ProductSection.product_id == p.id, ProductSection.deleted_at.is_(None)))
    existing_sections_set = {(s.title, s.html) for s in existing_sections.scalars()}
    new_sections_set = set()
    for sec in d.get("sections") or []:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            new_sections_set.add((sec["title"], sec["html"]))
            if (sec["title"], sec["html"]) not in existing_sections_set:
                session.add(ProductSection(product_id=p.id, title=sec["title"], html=sec["html"]))
    for s in existing_sections_set - new_sections_set:
        await session.execute(update(ProductSection).where(ProductSection.product_id == p.id, ProductSection.title == s[0], ProductSection.html == s[1], ProductSection.deleted_at.is_(None)).values(deleted_at=now))
    # ProductImage sync: keyed by (url, kind); existing rows keep their
    # original position even if the scraped order changed.
    existing_images = await session.execute(select(ProductImage).where(ProductImage.product_id == p.id, ProductImage.deleted_at.is_(None)))
    existing_images_set = {(img.url, img.kind) for img in existing_images.scalars()}
    new_images_set = set()
    for kind, urls in [
        ("gallery", d.get("images") or []),
        ("embedded", d.get("embedded_image_urls") or []),
        ("all", d.get("all_image_urls") or []),
    ]:
        for idx, url in enumerate(urls):
            if url:
                new_images_set.add((url, kind))
                if (url, kind) not in existing_images_set:
                    session.add(ProductImage(product_id=p.id, url=url, position=idx, kind=kind))
    for img in existing_images_set - new_images_set:
        await session.execute(update(ProductImage).where(ProductImage.product_id == p.id, ProductImage.url == img[0], ProductImage.kind == img[1], ProductImage.deleted_at.is_(None)).values(deleted_at=now))
    # ProductLabel sync: keyed by stripped name (case-sensitive)
    existing_labels = await session.execute(select(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.deleted_at.is_(None)))
    existing_labels_set = {label.name.strip() for label in existing_labels.scalars()}
    new_labels = {str(name).strip() for name in (d.get("labels") or []) if name}
    for name in new_labels - existing_labels_set:
        session.add(ProductLabel(product_id=p.id, name=name))
    for name in existing_labels_set - new_labels:
        await session.execute(update(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.name == name, ProductLabel.deleted_at.is_(None)).values(deleted_at=now))
    # ProductSticker sync: new names are normalised to lower case on insert,
    # so compare the existing rows case-insensitively too — otherwise a
    # legacy mixed-case row would be deleted and re-added on every run.
    existing_stickers = await session.execute(select(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.deleted_at.is_(None)))
    existing_stickers_set = {sticker.name.strip().lower() for sticker in existing_stickers.scalars()}
    new_stickers = {str(name).strip().lower() for name in (d.get("stickers") or []) if name}
    for name in new_stickers - existing_stickers_set:
        session.add(ProductSticker(product_id=p.id, name=name))
    for name in existing_stickers_set - new_stickers:
        await session.execute(update(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.name == name, ProductSticker.deleted_at.is_(None)).values(deleted_at=now))
    # ProductAttribute sync: keys are namespaced "info_table/..." or
    # "oe_list_price/..." so both sources share one table.
    existing_attrs = await session.execute(select(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.deleted_at.is_(None)))
    existing_attrs_set = {(a.key, a.value) for a in existing_attrs.scalars()}
    new_attrs_set = set()
    for src, prefix in [(d.get("info_table") or {}, "info_table"), (d.get("oe_list_price") or {}, "oe_list_price")]:
        for k, v in src.items():
            key = f"{prefix}/{str(k).strip()}"
            val = None if v is None else str(v)
            new_attrs_set.add((key, val))
            if (key, val) not in existing_attrs_set:
                session.add(ProductAttribute(product_id=p.id, key=key, value=val))
    for key, val in existing_attrs_set - new_attrs_set:
        await session.execute(update(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.key == key, ProductAttribute.value == val, ProductAttribute.deleted_at.is_(None)).values(deleted_at=now))
    # ProductNutrition sync: payload may be a flat dict (key -> value, no
    # unit) or a list of {key, value, unit} rows.
    existing_nuts = await session.execute(select(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.deleted_at.is_(None)))
    existing_nuts_set = {(n.key, n.value, n.unit) for n in existing_nuts.scalars()}
    new_nuts_set = set()
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        for k, v in nutrition.items():
            key, val = str(k).strip(), str(v) if v is not None else None
            new_nuts_set.add((key, val, None))
            if (key, val, None) not in existing_nuts_set:
                session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=None))
    elif isinstance(nutrition, list):
        for row in nutrition:
            try:
                key = str(row.get("key") or "").strip()
                val = None if row.get("value") is None else str(row.get("value"))
                unit = None if row.get("unit") is None else str(row.get("unit"))
                if key:
                    new_nuts_set.add((key, val, unit))
                    if (key, val, unit) not in existing_nuts_set:
                        session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=unit))
            except Exception:
                # Malformed row (e.g. not a dict) — skip it, keep the rest.
                continue
    for key, val, unit in existing_nuts_set - new_nuts_set:
        await session.execute(update(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.key == key, ProductNutrition.value == val, ProductNutrition.unit == unit, ProductNutrition.deleted_at.is_(None)).values(deleted_at=now))
    # ProductAllergen sync: accepts bare strings (contains=True) or dicts.
    existing_allergens = await session.execute(select(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.deleted_at.is_(None)))
    existing_allergens_set = {(a.name, a.contains) for a in existing_allergens.scalars()}
    new_allergens_set = set()
    for a in d.get("allergens") or []:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            new_allergens_set.add((nm, contains))
            if (nm, contains) not in existing_allergens_set:
                session.add(ProductAllergen(product_id=p.id, name=nm, contains=contains))
    for name, contains in existing_allergens_set - new_allergens_set:
        await session.execute(update(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.name == name, ProductAllergen.contains == contains, ProductAllergen.deleted_at.is_(None)).values(deleted_at=now))
    await session.flush()
    return p
async def upsert_product(
    slug,
    href,
    d,
):
    """Upsert one scraped product dict ``d`` and record the outcome.

    ``slug``/``href`` identify the page that was scraped; ``d`` is the
    merged extractor payload. On failure the error is printed, logged via
    ``_log_product_result`` and re-raised, so the commit below is skipped.

    NOTE(review): on the failure path the session is never committed, so
    the ok=False log row most likely does not persist — confirm whether
    failure logs should be written in their own transaction.
    """
    async with get_session() as session:
        try:
            await _upsert_product(session, d)
            await _log_product_result(session, ok=True, payload={
                "slug": slug,
                "href_tried": href,
                "title": d.get("title"),
                "has_description_html": bool(d.get("description_html")),
                "has_description_short": bool(d.get("description_short")),
                "sections_count": len(d.get("sections") or []),
                # `or []` guards: these keys may be absent, and len(None)
                # would raise TypeError inside the success path.
                "images_count": len(d.get("images") or []),
                "embedded_images_count": len(d.get("embedded_image_urls") or []),
                "all_images_count": len(d.get("all_image_urls") or []),
            })
        except Exception as e:
            print(f"[ERROR] Failed to upsert product '{d.get('slug')}'")
            # was f"  Title: {d}.get('title')" — printed the whole dict
            # followed by the literal text ".get('title')"
            print(f"  Title: {d.get('title')}")
            print(f"  URL: {d.get('suma_href')}")
            print(f"  Error type: {type(e).__name__}")
            print(f"  Error message: {str(e)}")
            import traceback
            traceback.print_exc()
            await _log_product_result(session, ok=False, payload={
                "slug": d.get("slug"),
                "href_tried": d.get("suma_href"),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "title": d.get("title"),
            })
            raise
        await session.commit()

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,13 @@
# Auto-import all extractor modules so they register themselves.
from .title import ex_title # noqa: F401
from .images import ex_images # noqa: F401
from .short_description import ex_short_description # noqa: F401
from .description_sections import ex_description_sections # noqa: F401
from .nutrition_ex import ex_nutrition # noqa: F401
from .stickers import ex_stickers # noqa: F401
from .labels import ex_labels # noqa: F401
from .info_table import ex_info_table # noqa: F401
from .oe_list_price import ex_oe_list_price # noqa: F401
from .regular_price_fallback import ex_regular_price_fallback # noqa: F401
from .breadcrumbs import ex_breadcrumbs # noqa: F401

View File

@@ -0,0 +1,68 @@
from __future__ import annotations
from typing import Dict, List, Union
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """Derive top/sub category info from the breadcrumb trail.

    Emits ``category_breadcrumbs`` plus ``category_top_*`` /
    ``category_sub_*`` fields and, when both levels are present, a combined
    ``category_path`` of ``"<top-slug>/<sub-slug>"``.
    """
    trail = (soup.select_one(".breadcrumbs ul.items")
             or soup.select_one("nav.breadcrumbs ul.items")
             or soup.select_one("ul.items"))
    if not trail:
        return {}

    crumbs: List[Dict[str, str]] = []
    for item in trail.select("li.item"):
        link = item.find("a")
        if link:
            label = normalize_text(link.get("title") or link.get_text())
            href = link.get("href")
        else:
            label = normalize_text(item.get_text())
            href = None
        slug = None
        if href:
            try:
                path = (urlparse(href).path or "").strip("/")
                slug = path.rsplit("/", 1)[-1] if path else None
            except Exception:
                slug = None
        # Crumbs without a derivable slug (e.g. the current page) are dropped.
        if slug:
            crumbs.append({"title": label or None, "href": href or None, "slug": slug})

    # First two non-"home" linked crumbs become top and sub category.
    top = None
    sub = None
    for crumb in crumbs:
        if not crumb.get("href"):
            continue
        label_low = (crumb.get("title") or "").lower()
        slug_low = (crumb.get("slug") or "").lower()
        if label_low == "home" or slug_low in ("", "home"):
            continue
        if top is None:
            top = crumb
        elif sub is None:
            sub = crumb
            break

    out: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        out["category_top_title"] = top.get("title")
        out["category_top_href"] = top.get("href")
        out["category_top_slug"] = top.get("slug")
    if sub:
        out["category_sub_title"] = sub.get("title")
        out["category_sub_href"] = sub.get("href")
        out["category_sub_slug"] = sub.get("slug")
    if top and sub:
        out["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
    return out

View File

@@ -0,0 +1,43 @@
from __future__ import annotations
from typing import Dict, List
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ...html_utils import absolutize_fragment
from ..registry import extractor
from ..helpers.desc import (
split_description_container, find_description_container,
pair_title_content_from_magento_tabs, scan_headings_for_sections,
additional_attributes_table,
)
from ..helpers.text import clean_title, is_blacklisted_heading
@extractor
def ex_description_sections(soup: BeautifulSoup, url: str) -> Dict:
    """Assemble the long description and its titled sections.

    Sources, merged in order:
      1. the description container (accordion blocks split into sections),
      2. Magento tabs, falling back to a generic heading scan,
      3. the "additional attributes" table as a final section.
    Returns {"sections": [...]} plus "description_html" when found.
    """
    description_html = None
    sections: List[Dict] = []
    desc_el = find_description_container(soup)
    if desc_el:
        open_html, sections_from_desc = split_description_container(desc_el)
        description_html = open_html or None
        sections.extend(sections_from_desc)
    # Track titles already claimed so tab/heading passes don't duplicate them.
    existing = {s["title"].lower() for s in sections}
    for t, html_fragment in (pair_title_content_from_magento_tabs(soup) or scan_headings_for_sections(soup)):
        low = t.lower()
        # Description-like headings feed description_html, never sections.
        if "product description" in low or low == "description" or "details" in low:
            if not description_html and html_fragment:
                description_html = absolutize_fragment(html_fragment)
            continue
        # Only keep fragments with visible text; record the title as seen
        # even when blacklisted so later passes skip it too.
        if t.lower() not in existing and normalize_text(BeautifulSoup(html_fragment, "lxml").get_text()):
            if not is_blacklisted_heading(t):
                sections.append({"title": clean_title(t), "html": absolutize_fragment(html_fragment)})
            existing.add(t.lower())
    addl = additional_attributes_table(soup)
    if addl and "additional information" not in existing and not is_blacklisted_heading("additional information"):
        sections.append({"title": "Additional Information", "html": addl})
    out = {"sections": sections}
    if description_html:
        out["description_html"] = description_html
    return out

View File

@@ -0,0 +1,89 @@
from __future__ import annotations
import json, re
from typing import Dict, List
from bs4 import BeautifulSoup
from ..registry import extractor
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Collect product image URLs.

    Sources, in priority order (the first source that yields anything wins):
      1. the Magento gallery init JSON ("mage/gallery/gallery"),
      2. JSON-LD "image" entries,
      3. a generic DOM scan of the gallery containers.
    Returns {"images": [...deduped by filename...], "image": first or None}.
    """
    images: List[str] = []
    debug = False  # set True while debugging
    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}")
    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue
        # Correct (not over-escaped) patterns:
        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
        if not m:
            if debug: print("[ex_images] 'data' array not found in gallery block")
            continue
        arr_txt = m.group(1)
        added = False
        try:
            data = json.loads(arr_txt)
            for entry in data:
                # Prefer the full-size URL over the gallery thumbnail.
                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
                if u:
                    images.append(u); added = True
        except Exception as e:
            if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            # Fallback to simple key extraction
            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
            imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
            for u in (fulls or imgs):
                u = abs_url(u)
                if u:
                    images.append(u); added = True
        if added:
            break  # got what we need from the gallery block
    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except Exception:
                continue
            def add_from(val):
                # JSON-LD "image" may be a string, a list of strings/objects,
                # or a single object with a "url" key.
                if isinstance(val, str):
                    u = abs_url(val); u and images.append(u)
                elif isinstance(val, list):
                    for v in val:
                        if isinstance(v, str):
                            u = abs_url(v); u and images.append(u)
                        elif isinstance(v, dict) and "url" in v:
                            u = abs_url(v["url"]); u and images.append(u)
                elif isinstance(val, dict) and "url" in val:
                    u = abs_url(val["url"]); u and images.append(u)
            if isinstance(data, dict) and "image" in data:
                add_from(data["image"])
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and "image" in item:
                        add_from(item["image"])
    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)
    images = dedup_by_filename(images)
    if debug: print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
from typing import Dict, Union
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
from ..helpers.price import parse_price, parse_case_size
@extractor
def ex_info_table(soup: BeautifulSoup, url: str) -> Dict:
    """Read the product info table into a raw label→value map plus parsed fields.

    Parses ``<div class="product-page-info-table">`` rows and returns
    ``info_table`` (raw map) and, when parseable: ``brand``, ``rrp*``,
    ``price_per_unit*`` and ``case_size_*`` fields.
    """
    container = soup.select_one(".product-page-info-table")
    if not container:
        return {}
    body = container.select_one(".product-page-info-table-rows") or container
    rows = body.select(".product-page-info-table-row")
    if not rows:
        return {}

    raw_map: Dict[str, str] = {}
    for row in rows:
        label_el = row.select_one(".product-page-info-table__label")
        value_el = row.select_one(".product-page-info-table__content")
        if label_el is None or value_el is None:
            continue
        label = normalize_text(label_el.get_text())
        if label:
            raw_map[label] = normalize_text(value_el.get_text())

    out: Dict[str, Union[str, float, int, Dict]] = {"info_table": raw_map}

    brand = raw_map.get("Brand") or raw_map.get("Brand Name")
    if brand:
        out["brand"] = brand

    def _emit_price(prefix: str, text: str) -> None:
        # Only emit the *_raw key when something actually parsed out of it.
        value, currency, raw_txt = parse_price(text)
        if raw_txt and (value is not None or currency is not None):
            out[f"{prefix}_raw"] = raw_txt
        if value is not None:
            out[prefix] = value
        if currency:
            out[f"{prefix}_currency"] = currency

    _emit_price("rrp", raw_map.get("RRP", ""))
    _emit_price("price_per_unit",
                raw_map.get("Price Per Unit", "") or raw_map.get("Unit Price", ""))

    cs_text = raw_map.get("Case Size", "") or raw_map.get("Pack Size", "")
    count, item_qty, item_unit, cs_raw = parse_case_size(cs_text)
    if cs_raw:
        out["case_size_raw"] = cs_raw
    if count is not None:
        out["case_size_count"] = count
    if item_qty is not None:
        out["case_size_item_qty"] = item_qty
    if item_unit:
        out["case_size_item_unit"] = item_unit
    return out

View File

@@ -0,0 +1,41 @@
from __future__ import annotations
from typing import Dict, List
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """Collect product label badges from ``ul.cdz-product-labels``.

    Returns ``labels``: a de-duplicated, lower-cased union of CSS class
    hints and visible badge text, class hints first.
    """
    holder = soup.select_one("ul.cdz-product-labels")
    if not holder:
        return {}
    class_hints: List[str] = []
    visible_texts: List[str] = []
    for item in holder.select("li.label-item"):
        for cls in (item.get("class") or []):
            cls = (cls or "").strip()
            if cls and cls.lower() != "label-item" and cls not in class_hints:
                class_hints.append(cls)
        text = normalize_text(item.get_text())
        if text and text not in visible_texts:
            visible_texts.append(text)
    if not class_hints and not visible_texts:
        return {}
    merged: List[str] = []
    seen = set()
    for candidate in class_hints + [t.lower() for t in visible_texts]:
        key = (candidate or "").strip().lower()
        if key and key not in seen:
            seen.add(key)
            merged.append(key)
    return {"labels": merged}

View File

@@ -0,0 +1,129 @@
from __future__ import annotations
from typing import Dict, List, Optional, Tuple
import re
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
from ..helpers.desc import (
split_description_container, find_description_container,
pair_title_content_from_magento_tabs, scan_headings_for_sections,
)
# ----- value/unit parser ------------------------------------------------------
_NUM_UNIT_RE = re.compile(
r"""
^\s*
(?P<num>[-+]?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?|\d+(?:[.,]\d+)?)
\s*
(?P<unit>[a-zA-Z%µ/]+)?
\s*$
""",
re.X,
)
def _parse_value_unit(s: str) -> Tuple[Optional[str], Optional[str]]:
if not s:
return None, None
s = re.sub(r"\s+", " ", s.strip())
m = _NUM_UNIT_RE.match(s)
if not m:
return None, None
num = (m.group("num") or "").replace(",", "")
unit = m.group("unit") or None
if unit:
u = unit.lower()
if u in {"kcal", "kcal.", "kcalories", "kcalorie"}:
unit = "kcal"
elif u in {"kj", "kj.", "kilojoule", "kilojoules"}:
unit = "kJ"
return (num or None, unit)
# ----- section finder ---------------------------------------------------------
def _find_nutrition_section_html(soup: BeautifulSoup) -> Optional[str]:
    """Locate the HTML of the section titled 'Nutritional Information'.

    Checks the same sources as the description extractor, in order:
    Magento tabs, the split description container, then a generic heading
    scan. Returns the section's inner HTML, or None when absent.
    """
    def _is_nutrition(title) -> bool:
        if not title:
            return False
        return "nutritional information" in normalize_text(title).rstrip(":").lower()

    # 1) Magento tabs
    for heading, html in (pair_title_content_from_magento_tabs(soup) or []):
        if html and _is_nutrition(heading):
            return html
    # 2) Description container split into sections
    container = find_description_container(soup)
    if container:
        _open_html, section_list = split_description_container(container)
        for section in section_list or []:
            if _is_nutrition(section.get("title") or ""):
                return section.get("html") or ""
    # 3) Fallback: generic heading scan
    for heading, html in (scan_headings_for_sections(soup) or []):
        if html and _is_nutrition(heading):
            return html
    return None
# ----- table parser -----------------------------------------------------------
def _extract_rows_from_table(root: BeautifulSoup) -> List[Dict[str, str]]:
    """Parse the first <table> under *root* into nutrition rows.

    Each row becomes ``{"key", "value", "unit"}``; values that do not parse
    as number+unit are kept verbatim with ``unit=None``. Order is preserved
    and exact duplicate rows are dropped.
    """
    table = root.select_one("table")
    if table is None:
        return []

    rows: List[Dict[str, str]] = []
    for tr in table.select("tr"):
        header = tr.find("th")
        cells = tr.find_all("td")
        if header and cells:
            key = normalize_text(header.get_text(" ").strip())
            raw_value = normalize_text(cells[0].get_text(" ").strip())
        elif len(cells) >= 2:
            key = normalize_text(cells[0].get_text(" ").strip())
            raw_value = normalize_text(cells[1].get_text(" ").strip())
        else:
            continue
        if not key or not raw_value:
            continue
        value, unit = _parse_value_unit(raw_value)
        if value is None:
            # Not a number+unit cell — keep the raw text as the value.
            value, unit = raw_value, None
        rows.append({"key": key, "value": value, "unit": unit})

    # Deduplicate while preserving order.
    seen = set()
    unique: List[Dict[str, str]] = []
    for row in rows:
        fingerprint = (row["key"], row.get("value"), row.get("unit"))
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(row)
    return unique
# ----- extractor --------------------------------------------------------------
@extractor
def ex_nutrition(soup: BeautifulSoup, url: str) -> Dict:
    """Extract nutrition rows, sourced ONLY from the section titled
    'Nutritional Information'.

    Returns ``{"nutrition": [{"key", "value", "unit"}, ...]}`` — an empty
    list when the section is missing.
    """
    section_html = _find_nutrition_section_html(soup)
    if not section_html:
        return {"nutrition": []}
    return {"nutrition": _extract_rows_from_table(BeautifulSoup(section_html, "lxml"))}

View File

@@ -0,0 +1,56 @@
from __future__ import annotations
from typing import Dict, Union
from bs4 import BeautifulSoup
from ..registry import extractor
from ..helpers.price import parse_price
@extractor
def ex_oe_list_price(soup: BeautifulSoup, url: str) -> Dict:
    """Parse the Magento "oe-list-price" block (RRP + special price).

    Produces ``oe_list_price`` with rrp_* / special_* entries, and promotes
    the special price to top-level ``special_price_*`` keys when present.
    """
    box = soup.select_one(".oe-list-price")
    if box is None:
        return {}

    def _price_text(node) -> str:
        # Prefer the dedicated price span, fall back to any span, then the node.
        target = node.select_one("span.price") or node.select_one("span") or node
        return target.get_text(strip=True)

    out: Dict[str, Union[str, float, dict]] = {}
    nested: Dict[str, Union[str, float]] = {}

    # RRP inside oe-list-price (if present)
    rrp_node = box.select_one(".rrp-price")
    if rrp_node:
        value, currency, raw = parse_price(_price_text(rrp_node))
        if raw:
            nested["rrp_raw"] = raw
        if value is not None:
            nested["rrp"] = value
        if currency:
            nested["rrp_currency"] = currency

    # Special Price inside oe-list-price
    special_node = box.select_one(".oe-final-price, .special-price, .final-price")
    if special_node:
        value, currency, raw = parse_price(_price_text(special_node))
        if raw:
            nested["special_raw"] = raw
        if value is not None:
            nested["special"] = value
            out["special_price"] = value
        if currency:
            nested["special_currency"] = currency
            out["special_price_currency"] = currency
        if raw:
            out["special_price_raw"] = raw

    if nested:
        out["oe_list_price"] = nested
    return out

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
from typing import Dict, Union
from bs4 import BeautifulSoup
from ..registry import extractor
from ..helpers.price import parse_price
@extractor
def ex_regular_price_fallback(soup: BeautifulSoup, url: str) -> Dict:
    """Fallback for legacy 'Regular Price' markup outside .oe-list-price.

    Emits ``regular_price_*`` fields and mirrors them into ``rrp_*``
    (``setdefault`` kept for safety even though ``out`` is fresh here).
    """
    node = soup.select_one("div.rrp-price")
    if node is None:
        return {}
    price_span = node.select_one("span.price")
    text = price_span.get_text(strip=True) if price_span else node.get_text(" ", strip=True)
    value, currency, raw = parse_price(text or "")

    out: Dict[str, Union[str, float]] = {}
    if raw:
        out["regular_price_raw"] = raw
    if value is not None:
        out["regular_price"] = value
    if currency:
        out["regular_price_currency"] = currency
    if value is not None:
        out.setdefault("rrp", value)
    if currency:
        out.setdefault("rrp_currency", currency)
    if raw:
        out.setdefault("rrp_raw", raw)
    return out

View File

@@ -0,0 +1,19 @@
from __future__ import annotations
from typing import Dict
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
@extractor
def ex_short_description(soup: BeautifulSoup, url: str) -> Dict:
    """Pick the first non-empty short description from known locations.

    Tries the product attribute blocks first, then falls back to the meta
    description / og:description tags.
    """
    selectors = (
        ".product.attribute.description .value",
        ".product.attribute.overview .value",
        "meta[name='description']",
        "meta[property='og:description']",
    )
    summary = None
    for selector in selectors:
        node = soup.select_one(selector)
        if node is None:
            continue
        raw = node.get("content") if node.name == "meta" else node.get_text()
        summary = normalize_text(raw)
        if summary:
            break
    return {"description_short": summary}

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from typing import Dict, List
from bs4 import BeautifulSoup
from ..registry import extractor
@extractor
def ex_stickers(soup: BeautifulSoup, url: str) -> Dict:
    """Collect sticker names from ``<div class="stickers">`` badges.

    A sticker's name comes from its extra CSS classes plus an optional
    ``data-sticker`` attribute; order is preserved, duplicates dropped.
    """
    holder = soup.select_one("div.stickers")
    if holder is None:
        return {"stickers": []}
    names: List[str] = []
    seen = set()
    for badge in holder.select("span.sticker"):
        candidates = [c.strip() for c in (badge.get("class") or []) if c and c.lower() != "sticker"]
        data_name = (badge.get("data-sticker") or "").strip()
        if data_name:
            candidates.append(data_name)
        for name in candidates:
            if name and name not in seen:
                seen.add(name)
                names.append(name)
    return {"stickers": names}

View File

@@ -0,0 +1,17 @@
from __future__ import annotations
from typing import Dict
from bs4 import BeautifulSoup
from shared.utils import normalize_text
from ..registry import extractor
@extractor
def ex_title(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the product title, falling back through several selectors.

    Defaults to "Product" when nothing usable is found.
    """
    selectors = ("h1.page-title span", "h1.page-title", "h1.product-name", "meta[property='og:title']")
    title = None
    for selector in selectors:
        node = soup.select_one(selector)
        if node is None:
            continue
        title = node.get("content") if node.name == "meta" else normalize_text(node.get_text())
        if title:
            break
    return {"title": title or "Product"}

View File

@@ -0,0 +1,165 @@
from __future__ import annotations
from typing import Dict, List, Optional, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag
from shared.utils import normalize_text
from ...html_utils import absolutize_fragment
from .text import clean_title, is_blacklisted_heading
from shared.config import config
def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Extract sections from accordion blocks within the description container.
    Looks for headings with class 'accordion-title' and pairs each with its
    next element-sibling having class 'accordion-details'. Returns:
    - open_html: the remaining description HTML with those accordion blocks removed
    - sections: [{"title": ..., "html": ...}, ...]
    """
    # Work on an isolated copy to avoid mutating the original DOM
    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")
    # Collect candidate (heading, details) pairs without mutating during iteration
    pairs: List[Tuple[Tag, Tag]] = []
    for h in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"):
        if not isinstance(h, Tag):
            continue
        title = clean_title((h.get_text() or "").strip())
        if not title:
            continue
        # Walk forward siblings until we hit an element; accept the first with 'accordion-details'
        sib = h.next_sibling
        details: Optional[Tag] = None
        while sib is not None:
            if isinstance(sib, Tag):
                classes = sib.get("class") or []
                if "accordion-details" in classes:
                    details = sib
                    break
            sib = sib.next_sibling
        if details is not None:
            pairs.append((h, details))
    sections: List[Dict] = []
    # Extract sections, then remove nodes from frag
    for h, details in pairs:
        # Pull details HTML
        html = details.decode_contents()
        # Only keep non-empty (textual) content
        if normalize_text(BeautifulSoup(html, "lxml").get_text()):
            sections.append({
                "title": clean_title(h.get_text() or ""),
                "html": absolutize_fragment(html),
            })
        # Remove the matched nodes from the fragment copy
        # (done even for empty sections, so they don't leak into open_html)
        details.decompose()
        h.decompose()
    # Whatever remains is the open description html
    open_html = absolutize_fragment(str(frag)) if frag else ""
    return open_html, sections
def pair_title_content_from_magento_tabs(soup: BeautifulSoup):
    """Pair Magento tab titles with their content panels.

    Each panel is located via the tab's aria-controls/data-target id,
    falling back to the next sibling carrying classes 'data item content'.
    Returns a list of (title, absolutized_html) tuples; blacklisted
    headings are skipped.
    """
    pairs = []
    tabs_root = (soup.select_one(".product.info.detailed .product.data.items")
                 or soup.select_one(".product.data.items"))
    if tabs_root is None:
        return pairs
    for tab in tabs_root.select(".data.item.title"):
        heading = normalize_text(tab.get_text())
        if not heading:
            continue
        panel_id = tab.get("aria-controls") or tab.get("data-target")
        panel = soup.select_one(f"#{panel_id}") if panel_id else None
        if panel is None:
            panel = tab.find_next_sibling(
                lambda x: isinstance(x, Tag)
                and "data" in x.get("class", [])
                and "item" in x.get("class", [])
                and "content" in x.get("class", [])
            )
        if panel and not is_blacklisted_heading(heading):
            pairs.append((heading, absolutize_fragment(panel.decode_contents())))
    return pairs
def scan_headings_for_sections(soup: BeautifulSoup):
    """Generic fallback: turn h2–h6 headings into (title, html) sections.

    Only headings whose text matches the configured section titles (or the
    built-in description keywords) are kept; a section's HTML is everything
    after its heading up to the next heading.
    """
    results = []
    scope = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )
    # Hoisted out of the loop: configured titles plus built-in keywords.
    keywords = (config().get("section-titles") or []) + ["product description", "description", "details"]
    for heading in scope.select("h2, h3, h4, h5, h6"):
        title = clean_title(heading.get_text() or "")
        if not title:
            continue
        lowered = title.lower()
        if not any(keyword in lowered for keyword in keywords):
            continue
        chunks: List[str] = []
        for sibling in heading.next_siblings:
            if isinstance(sibling, NavigableString):
                chunks.append(str(sibling))
            elif isinstance(sibling, Tag):
                if sibling.name in ("h2", "h3", "h4", "h5", "h6"):
                    break
                chunks.append(str(sibling))
        html = absolutize_fragment("".join(chunks).strip())
        if html and not is_blacklisted_heading(title):
            results.append((title, html))
    return results
def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """Render the Magento 'additional attributes' table as a styled div grid.

    Returns an HTML fragment (Tailwind-style classes), or None when there is
    no table or no usable rows. Errors are swallowed deliberately — this is
    best-effort presentation markup, not data extraction.
    """
    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            # Label from <th> (or first <td>), value from the last <td>.
            th = tr.find("th") or tr.find("td")
            tds = tr.find_all("td")
            key = normalize_text(th.get_text()) if th else None
            val = normalize_text(tds[-1].get_text()) if tds else None
            if key and val:
                rows.append((key, val))
        if not rows:
            return None
        items = "\n".join(
            [
                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
                <div class='col-span-1 font-medium'>{key}</div>
                <div class='col-span-2 text-stone-700'>{val}</div>
                </div>"""
                for key, val in rows
            ]
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        return None
def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """Locate the element that holds the product description.

    First tries the known description selectors; if none yields non-empty
    text, falls back to scanning headings for "Product description" /
    "Description" and wrapping everything that follows (up to the next
    heading) in a synthetic <div>.

    Returns the matched element or synthetic wrapper, else None.
    """
    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
                ".product.attribute.overview .value", ".product.info.detailed .value"]:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el
    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if txt.startswith("product description") or txt == "description":
            wrapper = soup.new_tag("div")
            # BUG FIX: snapshot the siblings before moving any of them.
            # `wrapper.append(tag)` reparents the tag, which breaks the live
            # `h.next_siblings` generator and silently skips nodes.
            for sib in list(h.next_siblings):
                if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"):
                    break
                wrapper.append(sib if isinstance(sib, Tag) else NavigableString(str(sib)))
            if normalize_text(wrapper.get_text()):
                return wrapper
    return None

View File

@@ -0,0 +1,53 @@
from __future__ import annotations
from typing import List, Optional
from urllib.parse import urljoin, urlparse
from shared.config import config
def first_from_srcset(val: str) -> Optional[str]:
    """Return the URL of the first candidate in a srcset string, or None.

    A srcset looks like "a.jpg 1x, b.jpg 2x"; only the URL token of the
    first comma-separated candidate is returned (the descriptor is dropped).
    """
    if not val:
        return None
    head = val.split(",")[0].strip()
    tokens = head.split()
    return tokens[0] if tokens else head
def abs_url(u: Optional[str]) -> Optional[str]:
    """Resolve a root-relative URL against the configured base URL.

    Falsy input yields None; non-string or non-root-relative values pass
    through unchanged.
    """
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
def collect_img_candidates(el) -> List[str]:
    """Harvest every plausible image URL from an element's attributes.

    Checks the common src-like attributes first, then pulls the first
    candidate out of any srcset-like attribute. Order of discovery is
    preserved; duplicates are not removed here.
    """
    found: List[str] = []
    if not el:
        return found
    for attr in ("src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href"):
        value = el.get(attr)
        if value:
            found.append(value)
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if value:
            candidate = first_from_srcset(value)
            if candidate:
                found.append(candidate)
    return found
def _filename_key(u: str) -> str:
p = urlparse(u)
path = p.path or ""
if path.endswith("/"):
path = path[:-1]
last = path.split("/")[-1]
return f"{p.netloc}:{last}".lower()
def dedup_by_filename(urls: List[str]) -> List[str]:
    """Drop URLs whose host+filename key was already seen.

    First-seen order is preserved; later URLs that resolve to the same
    `_filename_key` are discarded (e.g. the same image served at different
    sizes under the same filename).
    """
    kept: List[str] = []
    seen_keys = set()
    for url in urls:
        key = _filename_key(url)
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(url)
    return kept

View File

@@ -0,0 +1,42 @@
from __future__ import annotations
import re
from typing import Optional, Tuple
def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]:
    """
    Return (value, currency, raw) from a price-like string.

    Supports the £, € and $ symbols and strips thousands commas, e.g.
    "£1,234.56" -> (1234.56, "GBP", "£1,234.56"). A missing or unknown
    symbol yields currency=None; an unparseable number yields
    (None, None, raw).
    """
    raw = (text or "").strip()
    m = re.search(r'([£€$])?\s*([0-9][0-9.,]*)', raw)
    if not m:
        return None, None, raw
    sym = m.group(1) or ""
    num = m.group(2).replace(",", "")
    try:
        value = float(num)
    except ValueError:
        return None, None, raw
    # BUG FIX: the euro entry's key was an empty string ("": "EUR"), so "€"
    # mapped to None and — because sym defaults to "" when no symbol matched —
    # a symbol-less price was mislabelled as EUR.
    currency = {"£": "GBP", "€": "EUR", "$": "USD"}.get(sym)
    return value, currency, raw
def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]:
    """
    Parse case-size strings such as "6 x 500g", "12x1L" or "24 × 330 ml".

    Returns (count, item_qty, item_unit, raw). Blank or unparseable input
    yields (None, None, None, raw); a count/unit that parse with a bad
    quantity yields item_qty=None.
    """
    raw = (text or "").strip()
    if not raw:
        return None, None, None, raw
    # Normalise every multiplication marker (×, X, x) to a spaced "x".
    normalised = re.sub(r"[×Xx]\s*", " x ", raw)
    match = re.search(r"(\d+)\s*x\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)", normalised)
    if match is None:
        return None, None, None, raw
    count = int(match.group(1))
    try:
        qty = float(match.group(2))
    except ValueError:
        qty = None
    return count, qty, match.group(3), raw

View File

@@ -0,0 +1,16 @@
from __future__ import annotations
import re
from shared.utils import normalize_text
from shared.config import config
def clean_title(t: str) -> str:
    """Normalise whitespace in a heading title and drop any trailing colon."""
    cleaned = normalize_text(t)
    return re.sub(r":\s*$", "", cleaned)
def is_blacklisted_heading(title: str) -> bool:
    """Return True if heading should be skipped based on config blacklist.

    Comparison is case-insensitive and whitespace-trimmed against the
    config's blacklist.product-details entries.
    """
    blacklist = (config().get("blacklist") or {}).get("product-details") or []
    needle = (title or "").strip().lower()
    return any(needle == (entry or "").strip().lower() for entry in blacklist)

View File

@@ -0,0 +1,48 @@
from __future__ import annotations
from typing import Dict, Tuple, Union
from shared.utils import soup_of
from ..http_client import fetch
from ..html_utils import absolutize_fragment
from bp.browse.services.slugs import product_slug_from_href
from .registry import REGISTRY, merge_missing
from . import extractors as _auto_register # noqa: F401 (import-time side effects)
async def scrape_product_detail(product_url: str, include_html: bool = False) -> Union[dict, Tuple[dict, str]]:
    """
    Fetch a product page and run every registered extractor over it,
    merging their results into a single dict (first non-empty value wins).

    Returns a dict with fields (subset):
      title, images, image, description_short, description_html, sections,
      slug, suma_href, stickers, labels, info_table fields, oe_list_price, prices,
      breadcrumbs-derived category_* fields.
    If include_html=True, returns (data, html).
    """
    html = await fetch(product_url)
    # Seed with the two fields that never come from an extractor.
    data: Dict[str, Union[str, float, int, list, dict, None]] = {
        "suma_href": product_url,
        "slug": product_slug_from_href(product_url),
    }
    # Run all extractors
    for fn in REGISTRY:
        try:
            # NOTE(review): the HTML is re-parsed for every extractor —
            # presumably so that extractors which mutate the tree cannot
            # affect later ones; confirm before hoisting out of the loop.
            soup = soup_of(html)
            piece = fn(soup, product_url) or {}
        except Exception:
            # Tolerate site drift: a failing extractor just contributes nothing.
            continue
        # merge_missing only fills keys that are absent or empty, so earlier
        # extractors take precedence over later ones.
        merge_missing(data, piece)
    # If we found short description but not description_html, echo it
    if not data.get("description_html") and data.get("description_short"):
        data["description_html"] = absolutize_fragment(f"<p>{data['description_short']}</p>")
    # Ensure "image" mirrors first of images if not set
    if not data.get("image"):
        imgs = data.get("images") or []
        if isinstance(imgs, list) and imgs:
            data["image"] = imgs[0]
    if include_html:
        return data, html
    return data

View File

@@ -0,0 +1,4 @@
from __future__ import annotations
# Thin wrapper to keep import path stable
from .product_core import scrape_product_detail # re-export

View File

@@ -0,0 +1,20 @@
from __future__ import annotations
from typing import Callable, Dict, List, Union
# An extractor takes (soup, product_url) and returns a partial product dict.
Extractor = Callable[[object, str], Dict[str, Union[str, float, int, list, dict, None]]]

# Module-level registry populated at import time via the @extractor decorator.
REGISTRY: List[Extractor] = []


def extractor(fn: Extractor) -> Extractor:
    """Decorator: register *fn* in REGISTRY and return it unchanged."""
    REGISTRY.append(fn)
    return fn
def merge_missing(dst: dict, src: dict) -> None:
    """
    Merge src into dst in place. Only keys that are absent from dst, or
    whose current value is "empty" (None, "", [], {}), are overwritten;
    everything else in dst is left alone. A falsy src is a no-op.
    """
    empties = (None, "", [], {})
    for key, value in (src or {}).items():
        if key not in dst or dst[key] in empties:
            dst[key] = value