Monorepo: consolidate 7 repos into one
Combines shared, blog, market, cart, events, federation, and account into a single repository. Eliminates submodule sync, sibling model copying at build time, and per-app CI orchestration. Changes: - Remove per-app .git, .gitmodules, .gitea, submodule shared/ dirs - Remove stale sibling model copies from each app - Update all 6 Dockerfiles for monorepo build context (root = .) - Add build directives to docker-compose.yml - Add single .gitea/workflows/ci.yml with change detection - Add .dockerignore for monorepo build context - Create __init__.py for federation and account (cross-app imports)
This commit is contained in:
0
market/scrape/__init__.py
Normal file
0
market/scrape/__init__.py
Normal file
1
market/scrape/build_snapshot/__init__.py
Normal file
1
market/scrape/build_snapshot/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .build_snapshot import build_snapshot
|
||||
104
market/scrape/build_snapshot/build_snapshot.py
Normal file
104
market/scrape/build_snapshot/build_snapshot.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Dict, Set
|
||||
|
||||
from ..http_client import configure_cookies
|
||||
from ..get_auth import login
|
||||
|
||||
from shared.config import config
|
||||
|
||||
from shared.utils import log
|
||||
|
||||
# DB: persistence helpers
|
||||
|
||||
from .tools import (
|
||||
_resolve_sub_redirects,
|
||||
valid_subs,
|
||||
candidate_subs,
|
||||
rewrite_nav,
|
||||
capture_product_slugs,
|
||||
fetch_and_upsert_products,
|
||||
)
|
||||
|
||||
from ..nav import nav_scrape
|
||||
|
||||
# ------------------------ core ------------------------
|
||||
async def build_snapshot(
    concurrency: int,
    user: str,
    password: str,
    save_nav,
    capture_listing,
    upsert_product,
    log_product_result,
    save_subcategory_redirects,
    save_link_reports = None,
) -> None:
    """Run one full snapshot build: login, scrape nav, resolve subcategory
    redirects, capture product listings, fetch product details, and persist
    everything through the injected async callbacks (DB writers).

    Parameters
    ----------
    concurrency: max concurrent product-detail fetches.
    user, password: credentials forwarded to ``login``.
    save_nav, capture_listing, upsert_product, log_product_result,
    save_subcategory_redirects, save_link_reports:
        async persistence callbacks supplied by the caller (presumably the
        Postgres layer, per the final log line — confirm against caller).
    """
    # NOTE(review): stale comment in the original mentioned ensure_dir, but no
    # ensure_dir call exists here; JSON files are no longer written.

    # NOTE(review): mutating sys.path at call time is a code smell — the
    # monorepo build should make the project importable instead. Kept as-is.
    import sys
    sys.path.insert(0, os.path.abspath("."))

    cookies = await login(username=user, password=password)
    await configure_cookies(cookies)
    # NOTE(review): this prints cookie *values* (session tokens) to stdout —
    # consider removing or redacting before production use.
    for k, v in dict(cookies).items():
        print("logged in with", k, v)

    # 1) NAV
    log("Fetching nav…")
    nav = await nav_scrape()

    # Build valid subs per top from nav
    valid_subs_by_top: Dict[str, Set[str]] = valid_subs(nav)

    # Resolve redirects for all subs in nav first (network round-trips; also
    # mutates valid_subs_by_top with canonical sub slugs).
    nav_sub_candidates = candidate_subs(nav)
    nav_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=nav_sub_candidates,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    rewrite_nav(nav, nav_redirects)

    # DB: save nav
    await save_nav(nav)

    # 2) LISTINGS — collect every product slug from category/sub pages.
    product_slugs: Set[str] = await capture_product_slugs(
        nav,
        capture_listing
    )
    # Populated as a side effect of link rewriting during product fetches.
    unknown_sub_paths: Set[str] = set()

    # 3) PRODUCTS (fetch details)
    await fetch_and_upsert_products(
        upsert_product,
        log_product_result,
        save_link_reports,
        concurrency,
        product_slugs,
        valid_subs_by_top,
        unknown_sub_paths
    )

    # Subcategory redirects from HTML discovered while rewriting product links.
    log("Resolving subcategory redirects…")
    html_redirects = await _resolve_sub_redirects(
        base_url=config()["base_url"],
        candidates=unknown_sub_paths,
        allowed_tops=set(config()["categories"]["allow"].values()),
        valid_subs_by_top=valid_subs_by_top,
    )
    # HTML-derived redirects take precedence over nav-derived ones.
    sub_redirects: Dict[str, str] = dict(nav_redirects)
    sub_redirects.update(html_redirects)

    # DB: persist redirects
    await save_subcategory_redirects(sub_redirects)

    log("Snapshot build complete (to Postgres).")
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# Sentinel token embedded into rewritten hrefs (see _rewrite_links_fragment);
# presumably substituted with the app's real mount path downstream — confirm.
APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]"
|
||||
1
market/scrape/build_snapshot/tools/__init__.py
Normal file
1
market/scrape/build_snapshot/tools/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
6
market/scrape/build_snapshot/tools/_anchor_text.py
Normal file
6
market/scrape/build_snapshot/tools/_anchor_text.py
Normal file
@@ -0,0 +1,6 @@
|
||||
def _anchor_text(a) -> str:
|
||||
try:
|
||||
txt = " ".join((a.get_text(" ") or "").split())
|
||||
return txt[:200]
|
||||
except Exception:
|
||||
return ""
|
||||
16
market/scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
16
market/scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Optional
|
||||
|
||||
def _collect_html_img_srcs(html: Optional[str]) -> List[str]:
|
||||
urls: List[str] = []
|
||||
if not html:
|
||||
return urls
|
||||
try:
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
for img in soup.find_all("img"):
|
||||
src = img.get("src")
|
||||
if src:
|
||||
urls.append(src)
|
||||
except Exception:
|
||||
pass
|
||||
return urls
|
||||
14
market/scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
14
market/scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
from typing import Iterable, List, Set
|
||||
|
||||
def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]:
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for u in urls:
|
||||
if not u or not isinstance(u, str):
|
||||
continue
|
||||
if u in seen:
|
||||
continue
|
||||
seen.add(u)
|
||||
out.append(u)
|
||||
return out
|
||||
32
market/scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
32
market/scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from typing import Dict,Optional, Tuple
|
||||
|
||||
_CF_TOKENS = (
|
||||
"One moment, please...",
|
||||
"Please wait while your request is being verified",
|
||||
"/cdn-cgi/challenge-platform/",
|
||||
"rocket-loader.min.js",
|
||||
)
|
||||
|
||||
def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]:
|
||||
if not html:
|
||||
return False, None
|
||||
for tok in _CF_TOKENS:
|
||||
if tok in html:
|
||||
return True, tok
|
||||
return False, None
|
||||
|
||||
def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]:
|
||||
title = (d.get("title") or "").strip()
|
||||
if title.lower() == "one moment, please...":
|
||||
return True, "One moment, please..."
|
||||
ok, tok = _looks_like_cf_html(d.get("description_html"))
|
||||
if ok:
|
||||
return True, tok
|
||||
for sec in d.get("sections") or []:
|
||||
if isinstance(sec, dict) and sec.get("html"):
|
||||
ok2, tok2 = _looks_like_cf_html(sec["html"])
|
||||
if ok2:
|
||||
return True, tok2
|
||||
if not d.get("images") and not d.get("description_html") and not d.get("sections"):
|
||||
return True, "all_empty_heuristic"
|
||||
return False, None
|
||||
34
market/scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
34
market/scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Dict, Set
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import httpx
|
||||
|
||||
|
||||
async def _resolve_sub_redirects(
    base_url: str,
    candidates: Set[str],
    allowed_tops: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
) -> Dict[str, str]:
    """Follow each candidate "/top/sub" path and record where it redirects.

    Issues a GET per candidate (redirects followed) and, when the final URL
    lands under an allowed top-level category, maps old path -> canonical
    "/top/sub" path. Also mutates *valid_subs_by_top* in place, registering
    the canonical sub slug under its top.

    Any per-path failure (network error, parse error) is silently skipped —
    best-effort resolution.

    Returns the old-path -> new-path mapping (only entries that changed).
    """
    mapping: Dict[str, str] = {}
    if not candidates:
        return mapping
    timeout = httpx.Timeout(20.0, connect=10.0)
    async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client:
        # sorted() makes the request order (and hence logs) deterministic.
        for path in sorted(candidates):
            try:
                url = urljoin(base_url, path)
                r = await client.get(url)
                final = str(r.url)
                p = urlparse(final)
                parts = [x for x in (p.path or "").split("/") if x]
                if len(parts) >= 2:
                    top_new = parts[0].lower()
                    # removesuffix requires Python 3.9+.
                    sub_new = parts[1].lower().removesuffix(".html").removesuffix(".htm")
                    if top_new in allowed_tops:
                        new_path = f"/{top_new}/{sub_new}"
                        if new_path != path:
                            mapping[path] = new_path
                        # NOTE(review): indentation reconstructed — assumes the
                        # canonical sub is registered whenever the top is
                        # allowed, even if the path did not change. Confirm.
                        valid_subs_by_top.setdefault(top_new, set()).add(sub_new)
            except Exception:
                continue
    return mapping
|
||||
100
market/scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
100
market/scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from typing import Dict, List, Optional, Set
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from ._anchor_text import _anchor_text
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER
|
||||
|
||||
def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite all <a href> links in an HTML fragment to app-local paths.

    Same-host product links become ``{APP_ROOT_PLACEHOLDER}/product/<slug>``;
    category/subcategory links become ``{APP_ROOT_PLACEHOLDER}/<top>[/<sub>]``.
    Mutates three accumulators in place:
      - *link_externals*: links to other hosts (left untouched in the HTML),
      - *link_errors*: unresolvable or suspicious links (typed records),
      - *unknown_sub_paths*: sub paths not in *valid_subs_by_top*, to be
        redirect-resolved later.

    Returns the rewritten HTML with <html>/<body> wrappers stripped; ""
    for falsy input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc

    for a in soup.find_all("a", href=True):
        raw = (a.get("href") or "").strip()
        if not raw:
            continue
        low = raw.lower()
        # Non-navigational schemes and in-page anchors are left alone.
        if low.startswith(("mailto:", "tel:", "javascript:", "data:")) or low.startswith("#"):
            continue
        abs_href = urljoin(base_url, raw)
        p = urlparse(abs_href)
        if not p.scheme or not p.netloc:
            continue
        # Off-site link: report it but do not rewrite.
        if p.netloc != base_host:
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "host": p.netloc,
            })
            continue
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        last = parts[-1].lower()
        # *.html/*.htm leaf → treat as a product page.
        if last.endswith((".html", ".htm")):
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(a),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue
        top = parts[0].lower()
        if top in category_allow_values:
            if len(parts) == 1:
                # Top-level category link.
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            else:
                sub = parts[1]
                if sub.lower().endswith((".html", ".htm")):
                    sub = sub.rsplit(".", 1)[0]
                if sub in (valid_subs_by_top.get(top) or set()):
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
                else:
                    # Unknown sub: rewrite optimistically, queue the path for
                    # later redirect resolution, and record a pending error.
                    unknown_path = f"/{top}/{sub}"
                    unknown_sub_paths.add(unknown_path)
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
                    link_errors.append({
                        "product": current_product_slug,
                        "href": abs_href,
                        "text": _anchor_text(a),
                        "top": top,
                        "sub": sub,
                        "target_slug": None,
                        "type": "suma_category_invalid_sub_pending",
                    })
        else:
            # Same host, but not a product page and not an allowed category.
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })

    # Strip document-level wrappers added by the parser; return inner HTML.
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()
|
||||
|
||||
14
market/scrape/build_snapshot/tools/candidate_subs.py
Normal file
14
market/scrape/build_snapshot/tools/candidate_subs.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect every "/top/sub" path present in the scraped nav structure."""
    paths: Set[str] = set()
    for data in (nav.get("cats") or {}).values():
        top = (data or {}).get("slug")
        if not top:
            continue
        for entry in data.get("subs") or []:
            sub = (entry.get("slug") or "").strip()
            if sub:
                paths.add(f"/{top}/{sub}")
    return paths
|
||||
|
||||
18
market/scrape/build_snapshot/tools/capture_category.py
Normal file
18
market/scrape/build_snapshot/tools/capture_category.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_category(
    slug: str,
):
    """Scrape every listing page of one top-level category.

    Returns ``(list_url, items, total_pages)`` where *items* aggregates the
    products from all pages.
    """
    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)

    last_page = int(total_pages or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}] page {page_no}…")
        more_items, _ = await scrape_products(list_url, page=page_no)
        items.extend(more_items)
    return (list_url, items, total_pages)
|
||||
25
market/scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
25
market/scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from typing import Dict, Set
|
||||
from .capture_category import capture_category
|
||||
from .capture_sub import capture_sub
|
||||
from shared.config import config
|
||||
|
||||
|
||||
async def capture_product_slugs(
    nav: Dict[str, Dict],
    capture_listing,
):
    """Capture all product slugs for every allowed category and its subs.

    For each allowed top-level category (and each nav subcategory under it),
    scrapes the listing pages, persists them via *capture_listing*, and
    collects the product slugs into one set.

    Fix: ``capture_sub`` returns None for sub entries without a slug; the
    original unpacked that None straight into ``capture_listing(*lpars)``,
    raising TypeError. Such entries are now skipped.
    """
    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        lpars = await capture_category(slug)
        await capture_listing(*lpars)
        _, items, _ = lpars
        product_slugs.update(items)
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            lpars = await capture_sub(sub, slug)
            if lpars is None:
                # Sub entry had no slug — nothing was scraped.
                continue
            await capture_listing(*lpars)
            _, items, _ = lpars
            product_slugs.update(items)
    return product_slugs
|
||||
|
||||
22
market/scrape/build_snapshot/tools/capture_sub.py
Normal file
22
market/scrape/build_snapshot/tools/capture_sub.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_sub(
    sub,
    slug,
):
    """Scrape every listing page of one subcategory.

    Returns ``(sub_url, items, total_pages)``, or None when the sub entry
    carries no slug.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return None
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items_s, total_pages_s = await scrape_products(sub_url, page=1)
    last_page = int(total_pages_s or 1)
    for page_no in range(2, last_page + 1):
        log(f"[{slug}/{sub_slug}] page {page_no}…")
        extra, _ = await scrape_products(sub_url, page=page_no)
        items_s.extend(extra)
    return (sub_url, items_s, total_pages_s)
|
||||
106
market/scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
106
market/scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
@@ -0,0 +1,106 @@
|
||||
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
from ...html_utils import to_fragment
|
||||
from bp.browse.services.slugs import suma_href_from_html_slug
|
||||
|
||||
|
||||
from shared.config import config
|
||||
|
||||
from shared.utils import log
|
||||
|
||||
# DB: persistence helpers
|
||||
from ...product.product_detail import scrape_product_detail
|
||||
from ._product_dict_is_cf import _product_dict_is_cf
|
||||
from ._rewrite_links_fragment import _rewrite_links_fragment
|
||||
from ._dedupe_preserve_order import _dedupe_preserve_order
|
||||
from ._collect_html_img_srcs import _collect_html_img_srcs
|
||||
|
||||
|
||||
async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths
) -> bool:
    """Fetch one product's detail page, rewrite its links, and upsert it.

    Returns True on a successful upsert; False when a Cloudflare challenge is
    detected or any exception occurs (failures are logged via
    *log_product_result*, never raised to the caller).

    Mutates *link_errors*, *link_externals*, and *unknown_sub_paths* through
    ``_rewrite_links_fragment``.
    """
    href = suma_href_from_html_slug(slug)
    try:
        # NOTE(review): reconstructed nesting — assumes the semaphore guards
        # the whole fetch+process+upsert; if it should bound only the network
        # fetch, dedent everything after scrape_product_detail. Confirm.
        async with sem:
            d = await scrape_product_detail(href)

            # Bail out early if the "product" is a Cloudflare interstitial.
            is_cf, cf_token = _product_dict_is_cf(d)
            if is_cf:
                payload = {
                    "slug": slug,
                    "href_tried": href,
                    "error_type": "CloudflareChallengeDetected",
                    "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                    "cf_token": cf_token,
                }
                await log_product_result(ok=False, payload=payload)
                log(f" ! CF challenge detected: {slug} ({cf_token})")
                return False

            # Rewrite embedded links; collect reports
            if d.get("description_html"):
                d["description_html"] = _rewrite_links_fragment(
                    d["description_html"], config()["base_url"], product_slugs, category_values,
                    valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                )
                d["description_html"] = to_fragment(d["description_html"])
            if d.get("sections"):
                for sec in d["sections"]:
                    if isinstance(sec, dict) and sec.get("html"):
                        sec["html"] = _rewrite_links_fragment(
                            sec["html"], config()["base_url"], product_slugs, category_values,
                            valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                        )
                        sec["html"] = to_fragment(sec["html"])

            # Images: gallery images + images embedded in the HTML, deduped
            # while preserving order.
            gallery = _dedupe_preserve_order(d.get("images") or [])
            embedded: List[str] = []
            if d.get("description_html"):
                embedded += _collect_html_img_srcs(d["description_html"])
            for sec in d.get("sections", []) or []:
                if isinstance(sec, dict) and sec.get("html"):
                    embedded += _collect_html_img_srcs(sec["html"])
            embedded = _dedupe_preserve_order(embedded)
            all_imgs = _dedupe_preserve_order(list(gallery) + list(embedded))

            d["images"] = gallery
            d["embedded_image_urls"] = embedded
            d["all_image_urls"] = all_imgs
            await upsert_product(slug, href, d)
            # DB: upsert product + success log
            return True
    except Exception as e:
        # Any failure becomes a structured log record; never propagates.
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        try:
            # Enrich the payload with HTTP specifics when available.
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False
|
||||
@@ -0,0 +1,49 @@
|
||||
import asyncio
|
||||
from typing import Dict, List, Set
|
||||
from shared.config import config
|
||||
from shared.utils import log
|
||||
from .fetch_and_upsert_product import fetch_and_upsert_product
|
||||
|
||||
|
||||
async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports=None,
    concurrency: int = 8,
    product_slugs: "Set[str]" = None,
    valid_subs_by_top: "Dict[str, Set[str]]" = None,
    unknown_sub_paths: "Set[str]" = None,
):
    """Fetch details for every slug in *product_slugs* concurrently and upsert.

    Fix: the original used mutable default arguments (``set()`` / ``{}``).
    Those objects are created once at definition time and shared across calls,
    so state from one run leaked into the next. Defaults are now None and
    replaced with fresh containers per call (backward-compatible: omitting
    the argument behaves as before for a single call).

    Progress is logged every 50 completions; link error/external reports are
    persisted via *save_link_reports* when provided.
    """
    if product_slugs is None:
        product_slugs = set()
    if valid_subs_by_top is None:
        valid_subs_by_top = {}
    if unknown_sub_paths is None:
        unknown_sub_paths = set()

    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []

    # Slugs of the allowed top-level categories.
    category_values: Set[str] = set(config()["categories"]["allow"].values())
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = 0
    ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        if done % 50 == 0 or done == len(tasks):
            log(f" …{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)
||||
|
||||
24
market/scrape/build_snapshot/tools/rewrite_nav.py
Normal file
24
market/scrape/build_snapshot/tools/rewrite_nav.py
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
from typing import Dict
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
|
||||
def rewrite_nav(nav: Dict[str, Dict], nav_redirects: Dict[str, str]):
    """Rewrite nav subcategory slugs/hrefs to their canonical redirected paths.

    Mutates *nav* in place. Subs without a slug are dropped (as before).
    No-op when *nav_redirects* is empty.

    Fix: the original unpacked ``parts[0], parts[1]`` unguarded, raising
    IndexError if a redirect mapped to a path with fewer than two segments;
    such subs now keep their original slug/href.
    """
    if not nav_redirects:
        return
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        new_subs = []
        for s in (data.get("subs") or []):
            old_sub = (s.get("slug") or "").strip()
            if not old_sub:
                continue
            old_path = f"/{top_slug}/{old_sub}"
            canonical_path = nav_redirects.get(old_path, old_path)
            parts = [x for x in canonical_path.split("/") if x]
            if len(parts) >= 2:
                top2, sub2 = parts[0], parts[1]
                s["slug"] = sub2
                s["href"] = urljoin(config()["base_url"], f"/{top2}/{sub2}")
            new_subs.append(s)
        data["subs"] = new_subs
|
||||
16
market/scrape/build_snapshot/tools/valid_subs.py
Normal file
16
market/scrape/build_snapshot/tools/valid_subs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
# Build the set of valid subcategory slugs for each top-level slug in nav.
def valid_subs(nav: Dict[str, Dict]) -> Dict[str, Set[str]]:
    """Map each top-level category slug to the set of its subcategory slugs.

    Fix: the original filtered on the raw slug but stored the stripped one,
    so a whitespace-only slug leaked an empty string into the set. Filtering
    now happens on the stripped value.
    """
    valid_subs_by_top: Dict[str, Set[str]] = {}
    for data in (nav.get("cats") or {}).values():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        stripped = ((s.get("slug") or "").strip() for s in (data.get("subs") or []))
        valid_subs_by_top[top_slug] = {slug for slug in stripped if slug}
    return valid_subs_by_top
|
||||
244
market/scrape/get_auth.py
Normal file
244
market/scrape/get_auth.py
Normal file
@@ -0,0 +1,244 @@
|
||||
from typing import Optional, Dict, Any, List
|
||||
from urllib.parse import urljoin
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.config import config
|
||||
|
||||
class LoginFailed(Exception):
    """Raised when the login flow cannot confirm an authenticated session.

    The *debug* mapping carries diagnostic context (status codes, final URLs,
    section-API payloads) for the caller to log or inspect.
    """
    def __init__(self, message: str, *, debug: Dict[str, Any]):
        super().__init__(message)
        self.debug = debug
|
||||
|
||||
def _ff_headers(referer: Optional[str] = None, origin: Optional[str] = None) -> Dict[str, str]:
|
||||
h = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-GB,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br, zstd",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"DNT": "1",
|
||||
"Sec-GPC": "1",
|
||||
"Cache-Control": "no-cache",
|
||||
"Pragma": "no-cache",
|
||||
}
|
||||
if referer:
|
||||
h["Referer"] = referer
|
||||
if origin:
|
||||
h["Origin"] = origin
|
||||
return h
|
||||
|
||||
def _cookie_header_from_jar(jar: httpx.Cookies, domain: str, path: str = "/") -> str:
    """Serialize cookies matching *domain*/*path* into a Cookie header value.

    Iterates the underlying http.cookiejar and keeps cookies whose domain
    loosely matches (exact, or either side is a dot-suffix of the other) and
    whose path is a prefix of *path*. Returns "name=value; ..." pairs.
    """
    pairs: List[str] = []
    for c in jar.jar:
        if not c.name or c.value is None:
            continue
        dom = (c.domain or "").lstrip(".")
        if not dom:
            continue
        # Symmetric suffix match: deliberately looser than RFC 6265
        # domain-matching, to tolerate leading-dot variants in the jar.
        if not (domain == dom or domain.endswith("." + dom) or dom.endswith("." + domain)):
            continue
        # Cookie path must be a prefix of the request path.
        if not (path.startswith(c.path or "/")):
            continue
        pairs.append(f"{c.name}={c.value}")
    return "; ".join(pairs)
|
||||
|
||||
def _extract_magento_errors(html_text: str) -> list[str]:
|
||||
msgs: list[str] = []
|
||||
try:
|
||||
soup = BeautifulSoup(html_text or "", "lxml")
|
||||
for sel in [
|
||||
".message-error",
|
||||
".messages .message-error",
|
||||
".page.messages .message-error",
|
||||
"[data-ui-id='message-error']",
|
||||
".message.warning",
|
||||
".message.notice",
|
||||
]:
|
||||
for box in soup.select(sel):
|
||||
t = " ".join((box.get_text(" ") or "").split())
|
||||
if t and t not in msgs:
|
||||
msgs.append(t)
|
||||
except Exception:
|
||||
pass
|
||||
return msgs
|
||||
|
||||
def _looks_like_login_page(html_text: str) -> bool:
|
||||
try:
|
||||
s = BeautifulSoup(html_text or "", "lxml")
|
||||
if s.select_one("form#login-form.form-login"):
|
||||
return True
|
||||
title = (s.title.get_text() if s.title else "").strip().lower()
|
||||
if "customer login" in title:
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
def _chrome_headers(referer=None, origin=None):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
}
|
||||
if referer:
|
||||
headers["Referer"] = referer
|
||||
if origin:
|
||||
headers["Origin"] = origin
|
||||
return headers
|
||||
|
||||
async def login(
    username: str,
    password: str,
    *,
    # NOTE(review): mutable default ({}) — benign here since it is only read,
    # but a None default would be safer.
    extra_cookies = {},  # ok to pass cf_clearance etc., but NOT form_key
    timeout: float = 30.0,
) -> httpx.Cookies:
    """
    Attempt login and return an authenticated cookie jar.

    Success criteria (strict):
      1) /customer/section/load?sections=customer reports is_logged_in == True
         OR
      2) GET /customer/account/ resolves to an account page (not the login page).

    Otherwise raises LoginFailed with debug info.
    """
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=6)
    cookies = httpx.Cookies()
    # Seed consent cookies plus any caller-provided extras; form_key is always
    # excluded because a stale one would break the Magento CSRF check.
    for k, v in {
        **extra_cookies,
        "pr-cookie-consent": '["all"]',
        "user_allowed_save_cookie": '{"1":1}',
    }.items():
        if k.lower() == "form_key":
            continue
        cookies.set(k, v, domain="wholesale.suma.coop", path="/")

    base_login = config()["base_login"]
    base_url = config()["base_url"]

    async with httpx.AsyncClient(
        follow_redirects=True,
        timeout=httpx.Timeout(timeout, connect=15.0),
        http2=True,
        limits=limits,
        cookies=cookies,
        headers=_chrome_headers(),
        trust_env=True,
    ) as client:
        # 1) GET login page for fresh form_key
        import time
        # NOTE(review): dead code — this cache-busting URL is built and then
        # immediately overwritten on the next line; delete one of the two.
        login_bust = base_login + ("&" if "?" in base_login else "?") + f"_={int(time.time()*1000)}"
        login_bust = base_login
        r_get = await client.get(login_bust, headers=_chrome_headers())
        # NOTE(review): misleading — these prints run unconditionally, even on
        # success; the "failed" label and the body dump look like debug
        # leftovers and should be removed or gated on an error status.
        print("Login GET failed. Status:", r_get.status_code)
        print("Login GET URL:", r_get.url)
        print("Response text:", r_get.text[:1000])  # trim if long
        r_get.raise_for_status()
        soup = BeautifulSoup(r_get.text, "lxml")

        # Locate the login form (strict selector first, then loose fallback).
        form = soup.select_one("form.form.form-login#login-form") or soup.select_one("#login-form")
        if not form:
            raise LoginFailed(
                "Login form not found (possible bot challenge or theme change).",
                debug={"get_status": r_get.status_code, "final_url": str(r_get.url)},
            )
        action = urljoin(base_login, form.get("action") or base_login)
        fk_el = form.find("input", attrs={"name": "form_key"})
        hidden_form_key = (fk_el.get("value") if fk_el else "") or ""

        # mirror Magento behavior: form_key also appears as a cookie
        client.cookies.set("form_key", hidden_form_key, domain="wholesale.suma.coop", path="/")

        payload = {
            "form_key": hidden_form_key,
            "login[username]": username,
            "login[password]": password,
            "send": "Login",
        }

        post_headers = _chrome_headers(referer=base_login, origin=base_url)
        post_headers["Content-Type"] = "application/x-www-form-urlencoded"
        # Hand-build the Cookie header scoped to /customer/ (see
        # _cookie_header_from_jar) rather than relying on the client jar alone.
        post_headers["Cookie"] = _cookie_header_from_jar(
            client.cookies, domain="wholesale.suma.coop", path="/customer/"
        )

        r_post = await client.post(action, data=payload, headers=post_headers)

        # 2) Primary check: sections API must say logged in
        is_logged_in = False
        sections_url = "https://wholesale.suma.coop/customer/section/load/?sections=customer&force_new_section_timestamp=1"
        section_json: Dict[str, Any] = {}
        try:
            r_sec = await client.get(sections_url, headers=_chrome_headers(referer=base_login))
            if r_sec.status_code == 200:
                section_json = r_sec.json()
                cust = section_json.get("customer") or {}
                is_logged_in = bool(cust.get("is_logged_in"))
        except Exception:
            pass

        # 3) Secondary check: account page should NOT be the login page
        looks_like_login = False
        final_account_url = ""
        try:
            r_acc = await client.get("https://wholesale.suma.coop/customer/account/", headers=_chrome_headers(referer=base_login))
            final_account_url = str(r_acc.url)
            looks_like_login = (
                "/customer/account/login" in final_account_url
                or _looks_like_login_page(r_acc.text)
            )
        except Exception:
            # ignore; we'll rely on section status
            pass

        # Decide success/failure strictly
        if not (is_logged_in or (final_account_url and not looks_like_login)):
            errors = _extract_magento_errors(r_post.text)
            # Clean up transient form_key cookie
            try:
                client.cookies.jar.clear("wholesale.suma.coop", "/", "form_key")
            except Exception:
                pass
            raise LoginFailed(
                errors[0] if errors else "Invalid username or password.",
                debug={
                    "get_status": r_get.status_code,
                    "post_status": r_post.status_code,
                    "post_final_url": str(r_post.url),
                    "sections_customer": section_json.get("customer"),
                    "account_final_url": final_account_url,
                    "looks_like_login_page": looks_like_login,
                },
            )

        # Remove the transient form_key cookie from every (domain, path)
        # variant before handing the jar back to the caller.
        def clear_cookie_everywhere(cookies: httpx.Cookies, name: str) -> None:
            to_delete = []
            for c in list(cookies.jar):  # http.cookiejar.Cookie objects
                if c.name == name:
                    # Note: CookieJar.clear requires exact (domain, path, name)
                    to_delete.append((c.domain, c.path, c.name))

            for domain, path, nm in to_delete:
                try:
                    cookies.jar.clear(domain, path, nm)
                except KeyError:
                    # Mismatch can happen if domain has a leading dot vs not, etc.
                    # Try again with a normalized domain variant.
                    if domain and domain.startswith("."):
                        cookies.jar.clear(domain.lstrip("."), path, nm)
                    else:
                        # or try with leading dot
                        cookies.jar.clear("." + domain, path, nm)
            if name in cookies:
                del cookies[name]

        clear_cookie_everywhere(client.cookies, "form_key")
        #client.cookies.jar.clear(config()["base_host"] or "wholesale.suma.coop", "/", "form_key")
        # NOTE(review): prints the full authenticated cookie jar — remove or
        # redact before production use.
        print('cookies', client.cookies)
        return client.cookies
|
||||
44
market/scrape/html_utils.py
Normal file
44
market/scrape/html_utils.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# suma_browser/html_utils.py
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
from shared.config import config
|
||||
|
||||
|
||||
|
||||
def to_fragment(html: Optional[str]) -> str:
    """Strip the document-level <html>/<body> wrappers and return the inner markup."""
    if not html:
        return ""
    parsed = BeautifulSoup(html, "lxml")
    # lxml always wraps fragments in a full document; unwrap those containers.
    for wrapper in parsed.find_all(["html", "body"]):
        wrapper.unwrap()
    pieces = [str(node) for node in parsed.contents]
    return "".join(pieces).strip()
|
||||
|
||||
def absolutize_fragment(html: Optional[str]) -> str:
    """Absolutize root-relative href/src attributes against the configured
    base URL and return a fragment (no <html>/<body> wrappers)."""
    if not html:
        return ""
    frag = BeautifulSoup(html, "lxml")

    for tag in frag.find_all(True):
        for attr in ("href", "src"):
            if not tag.has_attr(attr):
                continue
            raw = str(tag[attr])
            # Only root-relative URLs are rewritten; absolute and
            # document-relative ones pass through untouched.
            tag[attr] = urljoin(config()["base_url"], raw) if raw.startswith("/") else raw

    # Unwrap document-level containers and return only the inner HTML.
    for wrapper in frag.find_all(["html", "body"]):
        wrapper.unwrap()
    return "".join(str(node) for node in frag.contents).strip()
|
||||
220
market/scrape/http_client.py
Normal file
220
market/scrape/http_client.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# suma_browser/http_client.py
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import secrets
|
||||
from typing import Optional, Dict
|
||||
|
||||
import httpx
|
||||
from shared.config import config
|
||||
|
||||
_CLIENT: httpx.AsyncClient | None = None
|
||||
|
||||
# ----- optional decoders -> Accept-Encoding
# Probe for optional compression libraries at import time. The flags feed
# _accept_encoding() so the client only advertises codings it can decode.
BROTLI_OK = False
ZSTD_OK = False
try:
    import brotli  # noqa: F401
    BROTLI_OK = True
except Exception:
    # brotli not installed (or broken): advertise gzip/deflate only.
    pass
try:
    import zstandard as zstd  # noqa: F401
    ZSTD_OK = True
except Exception:
    # zstandard not installed (or broken): skip zstd.
    pass
|
||||
|
||||
def _accept_encoding() -> str:
    """Build the Accept-Encoding header value from the codings we can decode."""
    optional = (("br", BROTLI_OK), ("zstd", ZSTD_OK))
    codings = ["gzip", "deflate"] + [name for name, available in optional if available]
    return ", ".join(codings)
|
||||
|
||||
FIREFOX_UA = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0"
|
||||
|
||||
def _ff_headers(referer: Optional[str] = None) -> Dict[str, str]:
    """Firefox-like navigation headers; adds Referer / same-origin hints when given."""
    headers: Dict[str, str] = {
        "User-Agent": FIREFOX_UA,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-GB,en;q=0.5",
        "Accept-Encoding": _accept_encoding(),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        # A referred navigation looks same-origin; a cold one has no site context.
        "Sec-Fetch-Site": "same-origin" if referer else "none",
        "Sec-Fetch-User": "?1",
        "DNT": "1",
        "Sec-GPC": "1",
        "Priority": "u=0, i",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    if referer:
        headers["Referer"] = referer
    return headers
|
||||
def _chrome_headers(referer=None, origin=None):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Connection": "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
}
|
||||
if referer:
|
||||
headers["Referer"] = referer
|
||||
if origin:
|
||||
headers["Origin"] = origin
|
||||
return headers
|
||||
|
||||
def _parse_cookie_header(cookie_header: str) -> Dict[str, str]:
|
||||
jar: Dict[str, str] = {}
|
||||
for part in cookie_header.split(";"):
|
||||
part = part.strip()
|
||||
if not part or "=" not in part:
|
||||
continue
|
||||
k, v = part.split("=", 1)
|
||||
jar[k.strip()] = v.strip()
|
||||
return jar
|
||||
|
||||
def _looks_like_cloudflare(html: bytes) -> bool:
|
||||
if not html:
|
||||
return False
|
||||
s = html[:40000].lower()
|
||||
return (
|
||||
b"please wait while your request is being verified" in s
|
||||
or b"/cdn-cgi/challenge-platform/scripts/jsd/main.js" in s
|
||||
or b"rocket-loader.min.js" in s
|
||||
or b"cf-ray" in s
|
||||
or b"challenge-platform" in s
|
||||
or b"cf-chl-" in s
|
||||
)
|
||||
|
||||
# -------- runtime cookie configuration (preferred over env) --------------------
|
||||
_INITIAL_COOKIES: Dict[str, str] = {}
|
||||
_INITIAL_COOKIE_HEADER: Optional[str] = None
|
||||
|
||||
async def configure_cookies(cookies: Dict[str, str]) -> None:
    """
    Configure initial cookies programmatically (preferred over env).

    Call BEFORE the first request (i.e., before get_client()/fetch()).
    If a client already exists, its jar is updated immediately.

    Args:
        cookies: name -> value mapping to seed the session with.
    """
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    # A dict-based configuration supersedes any previously set raw header.
    _INITIAL_COOKIE_HEADER = None
    _INITIAL_COOKIES = dict(cookies or {})
    # If the client is already built, push the cookies into its jar now.
    # (Fix: removed a stray debug print that polluted stdout on every call.)
    if _CLIENT is not None:
        host = config()["base_host"] or "wholesale.suma.coop"
        for k, v in _INITIAL_COOKIES.items():
            _CLIENT.cookies.set(k, v, domain=host, path="/")
|
||||
|
||||
def configure_cookies_from_header(cookie_header: str) -> None:
    """
    Configure initial cookies from a raw 'Cookie:' header string.
    Preferred over env; call BEFORE the first request.
    """
    global _INITIAL_COOKIES, _INITIAL_COOKIE_HEADER
    _INITIAL_COOKIE_HEADER = cookie_header or ""
    _INITIAL_COOKIES = _parse_cookie_header(_INITIAL_COOKIE_HEADER)
    if _CLIENT is None:
        return
    # Client already exists: seed its jar immediately.
    host = config()["base_host"] or "wholesale.suma.coop"
    for cookie_name, cookie_value in _INITIAL_COOKIES.items():
        _CLIENT.cookies.set(cookie_name, cookie_value, domain=host, path="/")
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
async def get_client() -> httpx.AsyncClient:
    """Public accessor (same as _get_client).

    Returns the process-wide shared AsyncClient, creating it on first use.
    """
    return await _get_client()
|
||||
|
||||
async def _get_client() -> httpx.AsyncClient:
    """Create (once) and return the shared httpx.AsyncClient singleton.

    Lazily builds a client with Chrome-like headers, then seeds its cookie
    jar: runtime-configured cookies (configure_cookies*) take priority over
    the SUMA_COOKIES environment variable.
    """
    global _CLIENT
    if _CLIENT is None:
        # Generous timeouts: some upstream pages are very slow to respond.
        timeout = httpx.Timeout(300.0, connect=150.0)
        limits = httpx.Limits(max_keepalive_connections=8, max_connections=16)
        _CLIENT = httpx.AsyncClient(
            follow_redirects=True,
            timeout=timeout,
            http2=True,
            limits=limits,
            headers=_chrome_headers(),
            trust_env=True,  # honor proxy settings from the environment
        )

        # ---- Seed cookies (priority: runtime config > env var) ---------------
        host = config()["base_host"] or "wholesale.suma.coop"

        if _INITIAL_COOKIES or _INITIAL_COOKIE_HEADER:
            # From runtime config
            if _INITIAL_COOKIE_HEADER:
                _CLIENT.cookies.update(_parse_cookie_header(_INITIAL_COOKIE_HEADER))
            for k, v in _INITIAL_COOKIES.items():
                _CLIENT.cookies.set(k, v, domain=host, path="/")
        else:
            # Fallback to environment
            cookie_str = os.environ.get("SUMA_COOKIES", "").strip()
            if cookie_str:
                _CLIENT.cookies.update(_parse_cookie_header(cookie_str))

        # Ensure private_content_version is present; a random value is
        # acceptable for a fresh session.
        if "private_content_version" not in _CLIENT.cookies:
            pcv = secrets.token_hex(16)
            _CLIENT.cookies.set("private_content_version", pcv, domain=host, path="/")
        # ---------------------------------------------------------------------

    return _CLIENT
|
||||
|
||||
async def aclose_client() -> None:
    """Close and discard the shared client; the next use recreates it."""
    global _CLIENT
    if _CLIENT is None:
        return
    await _CLIENT.aclose()
    _CLIENT = None
|
||||
|
||||
async def fetch(url: str, *, referer: Optional[str] = None, retries: int = 3) -> str:
    """GET *url* and return the response body text, retrying on failure.

    Retries up to *retries* attempts on exceptions and on Cloudflare
    challenge pages (detected heuristically). When the cookie jar is empty,
    visits the site root first so the session looks like a real browser.

    Raises:
        httpx.HTTPStatusError: on a non-2xx final response.
    """
    client = await _get_client()

    # Warm-up visit to look like a real session (only for a fresh jar).
    if len(client.cookies.jar) == 0:
        try:
            await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
            await asyncio.sleep(0.25)
        except Exception:
            # Best-effort: a failed warm-up must not block the real fetch.
            pass

    last_exc: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            h = _chrome_headers(referer=referer or (config()["base_url"].rstrip("/") + "/"))
            r = await client.get(url, headers=h)
            if _looks_like_cloudflare(r.content):
                # On the last attempt we fall through and let raise_for_status
                # decide; otherwise back off, re-touch the homepage, retry.
                if attempt < retries:
                    await asyncio.sleep(0.9 if attempt == 1 else 1.3)
                    try:
                        await client.get(config()["base_url"].rstrip("/") + "/", headers=_chrome_headers())
                        await asyncio.sleep(0.4)
                    except Exception:
                        pass
                    continue
            try:
                r.raise_for_status()
            except httpx.HTTPStatusError as e:
                # Log a trimmed body for debugging, then re-raise so the
                # outer handler can decide whether to retry.
                print(f"Fetch failed for {url}")
                print("Status:", r.status_code)
                print("Body:", r.text[:1000])  # Trimmed
                raise
            return r.text
        except Exception as e:
            last_exc = e
            if attempt >= retries:
                raise
            # Linear backoff between attempts.
            await asyncio.sleep(0.45 * attempt + 0.25)

    # Defensive: the loop either returns or raises, but keep a safety net.
    if last_exc:
        raise last_exc
    raise RuntimeError("fetch failed unexpectedly")
|
||||
289
market/scrape/listings.py
Normal file
289
market/scrape/listings.py
Normal file
@@ -0,0 +1,289 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import re
|
||||
from typing import Callable, Dict, List, Optional, Tuple
|
||||
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
||||
|
||||
|
||||
from .http_client import fetch
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from bp.browse.services.state import (
|
||||
KNOWN_PRODUCT_SLUGS,
|
||||
_listing_page_cache,
|
||||
_listing_page_ttl,
|
||||
_listing_variant_cache,
|
||||
_listing_variant_ttl,
|
||||
now,
|
||||
)
|
||||
from shared.utils import normalize_text, soup_of
|
||||
from shared.config import config
|
||||
|
||||
|
||||
def parse_total_pages_from_text(text: str) -> Optional[int]:
    """Derive the page count from a "Showing N of M" pager caption.

    The site serves 36 items per page regardless of the advertised page-size
    options, so a shown count of 12/24/36 is normalized to 36.

    Returns:
        Total page count (>= 1), or None when the caption is absent or the
        shown count is zero (fix: previously "Showing 0 of N" raised
        ZeroDivisionError).
    """
    m = re.search(r"Showing\s+(\d+)\s+of\s+(\d+)", text, re.I)
    if not m:
        return None
    shown = int(m.group(1))
    total = int(m.group(2))
    per_page = 36 if shown in (12, 24, 36) else shown
    if per_page <= 0:
        # Nothing shown -> nothing to paginate; avoid dividing by zero.
        return None
    return max(1, math.ceil(total / per_page))
|
||||
|
||||
|
||||
def _first_from_srcset(val: str) -> Optional[str]:
|
||||
if not val:
|
||||
return None
|
||||
first = val.split(",")[0].strip()
|
||||
parts = first.split()
|
||||
return parts[0] if parts else first
|
||||
|
||||
|
||||
def _abs_url(u: Optional[str]) -> Optional[str]:
    """Resolve a root-relative URL against the configured base URL; pass
    everything else through unchanged (None stays None)."""
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
|
||||
|
||||
|
||||
def _collect_img_candidates(el) -> List[str]:
    """Gather every plausible image URL carried by a tag's attributes."""
    if not el:
        return []
    found: List[str] = []
    # Direct URL-bearing attributes, in priority order.
    for attr in ("src", "data-src", "data-original", "data-zoom-image",
                 "data-thumb", "content", "href"):
        value = el.get(attr)
        if value:
            found.append(value)
    # srcset-style attributes: take the first candidate only.
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if not value:
            continue
        first = _first_from_srcset(value)
        if first:
            found.append(first)
    return found
|
||||
|
||||
|
||||
def _dedupe_preserve_order_by(seq: List[str], key: Callable[[str], str]) -> List[str]:
|
||||
seen = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if not s:
|
||||
continue
|
||||
k = key(s)
|
||||
if k in seen:
|
||||
continue
|
||||
seen.add(k)
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
|
||||
def _filename_key(u: str) -> str:
|
||||
p = urlparse(u)
|
||||
path = p.path or ""
|
||||
if path.endswith("/"):
|
||||
path = path[:-1]
|
||||
last = path.split("/")[-1]
|
||||
return f"{p.netloc}:{last}".lower()
|
||||
|
||||
|
||||
def _parse_cards_from_soup(soup) -> List[str]:
    """Extract product slugs from a listing page soup.

    Tries three strategies in order: Magento 2 product-tile markup, any
    product-looking anchors inside the products container, then JSON-LD
    (ItemList/Product) metadata. De-duplicates by slug to avoid doubles
    from overlapping selectors.

    Fixes vs previous version: anchors without an href no longer crash
    (a.get("href") can be None), malformed JSON-LD is skipped instead of
    aborting the whole parse, and falsy slugs are never recorded.
    """
    items: List[str] = []
    seen_slugs: set[str] = set()

    def _add_slug_from_href(href) -> None:
        # Normalize an anchor href, register its slug globally and, if new,
        # collect it for this page.
        if not href:
            return
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        slug = product_slug_from_href(href)
        if not slug:
            return
        KNOWN_PRODUCT_SLUGS.add(slug)
        if slug not in seen_slugs:
            seen_slugs.add(slug)
            items.append(slug)

    # Primary selectors (Magento 2 default)
    card_wrappers = soup.select(
        "li.product-item, .product-item, ol.products.list.items li, .products.list.items li, .product-item-info"
    )
    for card in card_wrappers:
        a = (
            card.select_one("a.product-item-link")
            or card.select_one(".product-item-name a")
            or card.select_one("a[href$='.html'], a[href$='.htm']")
        )
        if not a:
            continue
        _add_slug_from_href(a.get("href"))

    # Secondary: any product-looking anchors inside the products container
    if not items:
        products_container = soup.select_one(".products") or soup
        for a in products_container.select("a[href$='.html'], a[href$='.htm']"):
            _add_slug_from_href(a.get("href"))

    # Tertiary: JSON-LD fallback (ItemList/Product)
    if not items:
        import json

        def add_product(name: Optional[str], url: Optional[str], image: Optional[str]):
            # name/image kept for signature stability; only the URL matters here.
            _add_slug_from_href(url)

        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            try:
                data = json.loads(script.get_text())
            except Exception:
                # Malformed JSON-LD must not abort the whole listing parse.
                continue
            # Normalize a single object to a one-element list.
            if isinstance(data, dict):
                data = [data]
            if not isinstance(data, list):
                continue
            for ent in data:
                if not isinstance(ent, dict):
                    continue
                if ent.get("@type") == "Product":
                    add_product(
                        ent.get("name"),
                        ent.get("url"),
                        (ent.get("image") if isinstance(ent.get("image"), str) else None),
                    )
                if ent.get("@type") == "ItemList":
                    item_list = ent.get("itemListElement", [])
                    if not isinstance(item_list, list):
                        continue
                    for it in item_list:
                        if isinstance(it, dict):
                            obj = it.get("item") or it
                            if isinstance(obj, dict):
                                add_product(
                                    obj.get("name"),
                                    obj.get("url"),
                                    (obj.get("image") if isinstance(obj.get("image"), str) else None),
                                )

    return items
|
||||
|
||||
|
||||
def _with_query(url: str, add: Dict[str, str]) -> str:
|
||||
p = urlparse(url)
|
||||
q = dict(parse_qsl(p.query, keep_blank_values=True))
|
||||
q.update(add)
|
||||
new_q = urlencode(q)
|
||||
return urlunparse((p.scheme, p.netloc, p.path, p.params, new_q, p.fragment))
|
||||
|
||||
|
||||
def _with_page(url: str, page: int) -> str:
    """Append ?p=<page> for pages beyond the first; page 1 keeps the bare URL."""
    return _with_query(url, {"p": str(page)}) if page and page > 1 else url
|
||||
|
||||
|
||||
def _listing_base_key(url: str) -> str:
|
||||
p = urlparse(url)
|
||||
path = p.path.rstrip("/")
|
||||
return f"{p.scheme}://{p.netloc}{path}".lower()
|
||||
|
||||
|
||||
def _variant_cache_get(base_key: str) -> Optional[str]:
    """Return the memoized working URL for *base_key*, or None if absent/stale."""
    entry = _listing_variant_cache.get(base_key)
    if not entry:
        return None
    cached_url, stamp = entry
    if (now() - stamp) > _listing_variant_ttl:
        # Expired: evict so the variant is re-resolved on the next lookup.
        _listing_variant_cache.pop(base_key, None)
        return None
    return cached_url
|
||||
|
||||
|
||||
def _variant_cache_set(base_key: str, working_url: str) -> None:
    """Memoize the working listing-URL variant for *base_key*, timestamped
    for TTL-based expiry in _variant_cache_get."""
    _listing_variant_cache[base_key] = (working_url, now())
|
||||
|
||||
|
||||
def _page_cache_get(working_url: str, page: int) -> Optional[Tuple[List[Dict], int]]:
    """Return cached (items, total_pages) for one listing page, or None if
    missing or past its TTL."""
    cache_key = f"{working_url}|p={page}"
    entry = _listing_page_cache.get(cache_key)
    if not entry:
        return None
    payload, stamp = entry
    if (now() - stamp) > _listing_page_ttl:
        # Stale: evict and force a refetch.
        _listing_page_cache.pop(cache_key, None)
        return None
    page_items, total_pages = payload
    return page_items, total_pages
|
||||
|
||||
|
||||
def _page_cache_set(working_url: str, page: int, items: List[Dict], total_pages: int) -> None:
    """Cache (items, total_pages) for one listing page, keyed by URL + page
    and timestamped for TTL expiry."""
    key = f"{working_url}|p={page}"
    _listing_page_cache[key] = ((items, total_pages), now())
|
||||
|
||||
|
||||
async def _fetch_parse(url: str, page: int):
    """Fetch one listing page and return (parsed product slugs, soup)."""
    page_html = await fetch(_with_page(url, page))
    page_soup = soup_of(page_html)
    return _parse_cards_from_soup(page_soup), page_soup
|
||||
|
||||
|
||||
|
||||
|
||||
async def scrape_products(list_url: str, page: int = 1):
    """Fetch one listing page and return (product slugs, total page count).

    NOTE(review): despite the old docstring, the variant/page caches are not
    consulted here — every call performs a live fetch. A stray
    _listing_base_key(list_url) call whose result was discarded (leftover
    from the removed caching path) has been dropped.
    """
    items, soup = await _fetch_parse(list_url, page)
    total_pages = _derive_total_pages(soup)
    return items, total_pages
|
||||
|
||||
def _derive_total_pages(soup) -> int:
    """Best-effort page count: pager caption first, then ?p=N links, else 1."""
    caption_pages = parse_total_pages_from_text(normalize_text(soup.get_text(" ")))
    if caption_pages:
        return caption_pages
    # Fall back to scanning pagination links for the highest ?p=N value.
    page_numbers = {1}
    for anchor in soup.find_all("a", href=True):
        match = re.search(r"[?&]p=(\d+)", anchor["href"])
        if match:
            page_numbers.add(int(match.group(1)))
    return max(page_numbers)
|
||||
|
||||
|
||||
def _slugs_from_list_url(list_url: str) -> Tuple[str, Optional[str]]:
|
||||
p = urlparse(list_url)
|
||||
parts = [x for x in (p.path or "").split("/") if x]
|
||||
top = parts[0].lower() if parts else ""
|
||||
sub = None
|
||||
if len(parts) >= 2:
|
||||
sub = parts[1]
|
||||
if sub.lower().endswith((".html", ".htm")):
|
||||
sub = re.sub(r"\.(html?|HTML?)$", "", sub)
|
||||
return top, sub
|
||||
104
market/scrape/nav.py
Normal file
104
market/scrape/nav.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.config import config
|
||||
from .http_client import fetch # only fetch; define soup_of locally
|
||||
#from .. import cache_backend as cb
|
||||
#from ..blacklist.category import is_category_blocked # Reverse map: slug -> label
|
||||
|
||||
|
||||
# ------------------ Caches ------------------
|
||||
|
||||
|
||||
|
||||
def soup_of(html: str) -> BeautifulSoup:
    """Parse *html* (None-safe: falsy input becomes "") with the lxml backend."""
    return BeautifulSoup(html or "", "lxml")
|
||||
|
||||
|
||||
def normalize_text(s: str) -> str:
    """Collapse every run of whitespace to one space and trim the ends."""
    trimmed = (s or "").strip()
    return re.sub(r"\s+", " ", trimmed)
|
||||
|
||||
|
||||
async def scrape_nav_raw() -> List[Tuple[str, str]]:
    """Collect (text, absolute href) pairs for every on-site homepage anchor."""
    homepage = soup_of(await fetch(config()["base_url"]))
    anchors: List[Tuple[str, str]] = []
    for a in homepage.find_all("a", href=True):
        label = normalize_text(a.get_text())
        if not label:
            continue
        href = a["href"].strip()
        if href.startswith("/"):
            href = urljoin(config()["base_url"], href)
        # Keep only links that stay on the configured site.
        if href.startswith(config()["base_url"]):
            anchors.append((label, href))
    return anchors
|
||||
|
||||
|
||||
def extract_sub_slug(href: str, top_slug: str) -> Optional[str]:
    """Return the second path segment of *href* (extension stripped) when its
    first segment matches *top_slug* case-insensitively; otherwise None."""
    segments = [seg for seg in (urlparse(href).path or "").split("/") if seg]
    if len(segments) < 2 or segments[0].lower() != top_slug.lower():
        return None
    sub = segments[1]
    if sub.lower().endswith((".html", ".htm")):
        sub = re.sub(r"\.(html?|HTML?)$", "", sub)
    return sub
|
||||
|
||||
|
||||
async def group_by_category(slug_to_links: Dict[str, List[Tuple[str, str]]]) -> Dict[str, Dict]:
    """Build the nav structure for the allow-listed categories.

    For each allowed (label -> slug) category, attach its top-level href and
    the alphabetically sorted subcategory entries discovered on the homepage.
    """
    nav: Dict[str, Dict] = {"cats": {}}
    for label, slug in config()["categories"]["allow"].items():
        top_href = urljoin(config()["base_url"], f"/{slug}")
        subs = []
        for text, href in slug_to_links.get(slug, []):
            sub_slug = extract_sub_slug(href, slug)
            if sub_slug:
                subs.append({"name": text, "href": href, "slug": sub_slug})
        subs.sort(key=lambda entry: entry["name"].lower())
        nav["cats"][label] = {"href": top_href, "slug": slug, "subs": subs}
    return nav
|
||||
|
||||
|
||||
async def scrape_nav_filtered() -> Dict[str, Dict]:
    """Scrape homepage anchors, bucket them by top path segment, and group
    the buckets into the final nav structure."""
    slug_to_links: Dict[str, List[Tuple[str, str]]] = {}
    for text, href in await scrape_nav_raw():
        path_parts = [seg for seg in (urlparse(href).path or "").split("/") if seg]
        if not path_parts:
            continue
        top = path_parts[0].lower()
        # Skip configured non-category paths (cart, account, etc.).
        if top not in config()["slugs"]["skip"]:
            slug_to_links.setdefault(top, []).append((text, href))
    return await group_by_category(slug_to_links)
|
||||
|
||||
async def nav_scrape() -> Dict[str, Dict]:
    """Return the scraped navigation structure.

    NOTE(review): the old docstring claimed an offline snapshot fallback,
    but this path always scrapes live — confirm whether a snapshot path is
    still intended.
    """
    return await scrape_nav_filtered()
|
||||
6
market/scrape/persist_api/__init__.py
Normal file
6
market/scrape/persist_api/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from .upsert_product import upsert_product
|
||||
from .log_product_result import log_product_result
|
||||
from .save_nav import save_nav
|
||||
from .save_subcategory_redirects import save_subcategory_redirects
|
||||
from .capture_listing import capture_listing
|
||||
|
||||
27
market/scrape/persist_api/capture_listing.py
Normal file
27
market/scrape/persist_api/capture_listing.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# persist_api.capture_listing: POST a scraped listing page to the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import List
|
||||
|
||||
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
):
    """POST a scraped listing page (URL, product slugs, page count) to the
    market persistence API and return the decoded JSON response ({} when
    the body is empty)."""
    endpoint = os.getenv("CAPTURE_LISTING_URL", "http://localhost:8001/market/suma-market/api/products/listing/")
    payload = {
        "url": url,
        "items": items,
        "total_pages": total_pages
    }
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=payload)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
24
market/scrape/persist_api/log_product_result.py
Normal file
24
market/scrape/persist_api/log_product_result.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# persist_api.log_product_result: POST a product scrape outcome to the logging API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
|
||||
async def log_product_result(
    ok: bool,
    payload
):
    """POST a product scrape outcome (success flag + payload) to the product
    log API and return the decoded JSON response ({} when empty)."""
    endpoint = os.getenv("PRODUCT_LOG_URL", "http://localhost:8000/market/api/products/log/")
    body = {
        "ok": ok,
        "payload": payload
    }
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=body)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
19
market/scrape/persist_api/save_nav.py
Normal file
19
market/scrape/persist_api/save_nav.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# persist_api.save_nav: POST the scraped navigation structure to the market API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_nav(
    nav: Dict,
):
    """POST the scraped navigation structure to the market persistence API
    and return the decoded JSON response ({} when the body is empty)."""
    endpoint = os.getenv("SAVE_NAV_URL", "http://localhost:8001/market/suma-market/api/products/nav/")
    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        response = await client.post(endpoint, json=nav)
        response.raise_for_status()  # surface non-2xx as an exception
        return response.json() if response.content else {}
|
||||
|
||||
15
market/scrape/persist_api/save_subcategory_redirects.py
Normal file
15
market/scrape/persist_api/save_subcategory_redirects.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
async def save_subcategory_redirects(mapping: Dict[str, str]) -> Dict:
    """POST the subcategory redirect mapping to the persistence API.

    Args:
        mapping: old-slug -> new-slug redirects discovered during scraping.

    Returns:
        The decoded JSON response ({} for an empty body). Fix: the function
        was annotated ``-> None`` even though it returns the response data.
    """
    sync_url = os.getenv("SAVE_REDIRECTS", "http://localhost:8000/market/api/products/redirects/")

    async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
        resp = await client.post(sync_url, json=mapping)
        # Raise for non-2xx
        resp.raise_for_status()
        data = resp.json() if resp.content else {}
        return data
|
||||
|
||||
256
market/scrape/persist_api/upsert_product.py
Normal file
256
market/scrape/persist_api/upsert_product.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# persist_api.upsert_product: HTTP-based upsert that POSTs product data to the sync API.
|
||||
|
||||
import os
|
||||
import httpx
|
||||
|
||||
from typing import Dict, List, Any
|
||||
|
||||
async def upsert_product(
    slug,
    href,
    d,
):
    """
    Posts the given product dict `d` to the /api/products/sync endpoint.
    Keeps the same signature as before and preserves logging/commit behavior.
    Retries forever (5 seconds apart) until the POST succeeds.

    Note: `href` is unused here but kept for signature compatibility.
    """
    # Ensure the payload carries the slug we were called with.
    if not d.get("slug"):
        d["slug"] = slug

    # Where to post; override via env if needed
    sync_url = os.getenv("PRODUCT_SYNC_URL", "http://localhost:8001/market/suma-market/api/products/sync/")

    payload = _massage_payload(d)

    async def _do_call() -> Dict[str, Any]:
        # One POST attempt; tolerate empty and non-JSON response bodies.
        async with httpx.AsyncClient(timeout=httpx.Timeout(20.0, connect=10.0)) as client:
            resp = await client.post(sync_url, json=payload)
            resp.raise_for_status()
            if not resp.content:
                return {}
            try:
                return resp.json()
            except ValueError:
                return {"raw": resp.text}

    async def _log_error(exc: BaseException) -> None:
        # Optional: add your own logging here
        print(f"[upsert_product] POST failed: {type(exc).__name__}: {exc}. Retrying in 5s... slug={slug} url={sync_url}")

    return await retry_until_success(_do_call, delay=5.0, on_error=_log_error)
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Awaitable, Callable, Dict, Optional
|
||||
|
||||
async def retry_until_success(
    fn: Callable[[], Awaitable[Any]],
    *,
    delay: float = 5.0,
    on_error: Optional[Callable[[BaseException], Awaitable[None]]] = None,
) -> Any:
    """
    Repeatedly call the async no-arg function `fn` until it succeeds (returns
    without raising). Waits `delay` seconds between attempts. Never gives up.
    If provided, `on_error(exc)` is awaited after each failure.
    """
    attempt = 0
    while True:
        try:
            return await fn()
        except asyncio.CancelledError:
            # Cancellation must always propagate immediately.
            raise
        except BaseException as exc:
            attempt += 1
            if on_error is None:
                # Fallback log when no error handler was supplied.
                print(f"[retry] attempt {attempt} failed: {type(exc).__name__}: {exc}")
            else:
                try:
                    await on_error(exc)
                except Exception:
                    # A broken error handler must not stop the retry loop.
                    pass
            await asyncio.sleep(delay)
|
||||
|
||||
|
||||
|
||||
def _get(d, key, default=None):
|
||||
v = d.get(key)
|
||||
return default if v in (None, "", [], {}) else v
|
||||
|
||||
|
||||
def _massage_payload(d: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Mirror the DB-upsert massaging so the API sees the same structure/values."""
|
||||
slug = d.get("slug")
|
||||
if not slug:
|
||||
raise ValueError("product missing slug")
|
||||
|
||||
# --- Top-level fields (use _get where DB upsert uses it) ---
|
||||
out: Dict[str, Any] = {
|
||||
"slug": slug,
|
||||
"title": _get(d, "title"),
|
||||
"image": _get(d, "image"),
|
||||
"description_short": _get(d, "description_short"),
|
||||
"description_html": _get(d, "description_html"),
|
||||
"suma_href": _get(d, "suma_href"),
|
||||
"brand": _get(d, "brand"),
|
||||
"rrp": _get(d, "rrp"),
|
||||
"rrp_currency": _get(d, "rrp_currency"),
|
||||
"rrp_raw": _get(d, "rrp_raw"),
|
||||
"price_per_unit": _get(d, "price_per_unit"),
|
||||
"price_per_unit_currency": _get(d, "price_per_unit_currency"),
|
||||
"price_per_unit_raw": _get(d, "price_per_unit_raw"),
|
||||
"special_price": _get(d, "special_price"),
|
||||
"special_price_currency": _get(d, "special_price_currency"),
|
||||
"special_price_raw": _get(d, "special_price_raw"),
|
||||
"regular_price": _get(d, "regular_price"),
|
||||
"regular_price_currency": _get(d, "regular_price_currency"),
|
||||
"regular_price_raw": _get(d, "regular_price_raw"),
|
||||
"case_size_count": _get(d, "case_size_count"),
|
||||
"case_size_item_qty": _get(d, "case_size_item_qty"),
|
||||
"case_size_item_unit": _get(d, "case_size_item_unit"),
|
||||
"case_size_raw": _get(d, "case_size_raw"),
|
||||
"ean": d.get("ean") or d.get("barcode") or None,
|
||||
"sku": d.get("sku"),
|
||||
"unit_size": d.get("unit_size"),
|
||||
"pack_size": d.get("pack_size"),
|
||||
}
|
||||
|
||||
# --- Sections: only dicts with title+html (like DB sync) ---
|
||||
sections_in = d.get("sections") or []
|
||||
sections_out: List[Dict[str, Any]] = []
|
||||
for sec in sections_in:
|
||||
if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
|
||||
sections_out.append({"title": sec["title"], "html": sec["html"]})
|
||||
out["sections"] = sections_out
|
||||
|
||||
# --- Images: same 3 buckets used in DB sync ---
|
||||
def _coerce_str_list(x):
|
||||
if not x:
|
||||
return []
|
||||
# accept list of strings or list of dicts with {"url": ...}
|
||||
out_urls = []
|
||||
for item in x:
|
||||
if isinstance(item, str):
|
||||
if item:
|
||||
out_urls.append(item)
|
||||
elif isinstance(item, dict):
|
||||
u = item.get("url")
|
||||
if u:
|
||||
out_urls.append(u)
|
||||
return out_urls
|
||||
|
||||
out["images"] = _coerce_str_list(d.get("images"))
|
||||
out["embedded_image_urls"] = _coerce_str_list(d.get("embedded_image_urls"))
|
||||
out["all_image_urls"] = _coerce_str_list(d.get("all_image_urls"))
|
||||
|
||||
# --- Labels: strip (DB code trims) ---
|
||||
labels_in = d.get("labels") or []
|
||||
out["labels"] = [str(x).strip() for x in labels_in if x]
|
||||
|
||||
# --- Stickers: strip + lower (DB code lower-cases) ---
|
||||
stickers_in = d.get("stickers") or []
|
||||
out["stickers"] = [str(x).strip().lower() for x in stickers_in if x]
|
||||
|
||||
# --- Attributes: pass through the same dict sources the DB code reads ---
|
||||
out["info_table"] = d.get("info_table") or {}
|
||||
#out["oe_list_price"] = d.get("oe_list_price") or {}
|
||||
|
||||
# --- Nutrition: allow dict or list of dicts, mirroring DB code ---
|
||||
nutrition = d.get("nutrition") or []
|
||||
if isinstance(nutrition, dict):
|
||||
out["nutrition"] = {str(k).strip(): (None if v is None else str(v)) for k, v in nutrition.items()}
|
||||
elif isinstance(nutrition, list):
|
||||
rows = []
|
||||
for row in nutrition:
|
||||
if not isinstance(row, dict):
|
||||
continue
|
||||
key = str(row.get("key") or "").strip()
|
||||
if not key:
|
||||
continue
|
||||
rows.append({
|
||||
"key": key,
|
||||
"value": None if row.get("value") is None else str(row.get("value")),
|
||||
"unit": None if row.get("unit") is None else str(row.get("unit")),
|
||||
})
|
||||
out["nutrition"] = rows
|
||||
else:
|
||||
out["nutrition"] = []
|
||||
|
||||
# --- Allergens: accept str (→ contains=True) or dict ---
|
||||
alls_in = d.get("allergens") or []
|
||||
alls_out = []
|
||||
for a in alls_in:
|
||||
if isinstance(a, str):
|
||||
nm, contains = a.strip(), True
|
||||
elif isinstance(a, dict):
|
||||
nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
|
||||
else:
|
||||
continue
|
||||
if nm:
|
||||
alls_out.append({"name": nm, "contains": contains})
|
||||
out["allergens"] = alls_out
|
||||
|
||||
out["images"]=[
|
||||
{"url": s.strip(), "kind": "gallery", "position": i}
|
||||
for i, s in enumerate(out.get("images") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
] + [
|
||||
{"url": s.strip(), "kind": "embedded", "position": i}
|
||||
for i, s in enumerate(out.get("embedded_image_urls") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
] + [
|
||||
{"url": s.strip(), "kind": "all", "position": i}
|
||||
for i, s in enumerate(out.get("all_image_urls") or [])
|
||||
if isinstance(s, str) and s.strip()
|
||||
]
|
||||
out["labels"]= [{"name": s.strip()} for s in out["labels"] if isinstance(s, str) and s.strip()]
|
||||
out["stickers"]= [{"name": s.strip()} for s in out["stickers"] if isinstance(s, str) and s.strip()]
|
||||
out["attributes"] = build_attributes_list(d)
|
||||
|
||||
|
||||
return out
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def build_attributes_list(d: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten the ``info_table`` / ``oe_list_price`` dicts into attribute rows.

    Each source key becomes a namespaced ``"<source>/<key>"`` key (key text
    stripped), values are stringified with ``None`` preserved, and duplicate
    ``(key, value)`` pairs are dropped while keeping first-seen order.

    Args:
        d: scraped product payload.

    Returns:
        List of ``{"key": ..., "value": ...}`` dicts, ``info_table`` entries first.
    """
    sources = (
        ("info_table", d.get("info_table") or {}),
        ("oe_list_price", d.get("oe_list_price") or {}),
    )
    emitted: set = set()
    result: List[Dict[str, Any]] = []
    for prefix, table in sources:
        for raw_key, raw_val in table.items():
            pair = (
                f"{prefix}/{str(raw_key).strip()}",
                None if raw_val is None else str(raw_val),
            )
            # Dedupe by (key, value), preserving the first occurrence.
            if pair in emitted:
                continue
            emitted.add(pair)
            result.append({"key": pair[0], "value": pair[1]})
    return result
|
||||
7
market/scrape/persist_snapshot/__init__.py
Normal file
7
market/scrape/persist_snapshot/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from .log_product_result import log_product_result
|
||||
from .upsert_product import upsert_product
|
||||
from .save_nav import save_nav
|
||||
from .capture_listing import capture_listing
|
||||
from .save_link_reports import save_link_reports
|
||||
from .save_subcategory_redirects import save_subcategory_redirects
|
||||
|
||||
3
market/scrape/persist_snapshot/_get.py
Normal file
3
market/scrape/persist_snapshot/_get.py
Normal file
@@ -0,0 +1,3 @@
|
||||
def _get(d, key, default=None):
|
||||
v = d.get(key)
|
||||
return default if v in (None, "", [], {}) else v
|
||||
137
market/scrape/persist_snapshot/capture_listing.py
Normal file
137
market/scrape/persist_snapshot/capture_listing.py
Normal file
@@ -0,0 +1,137 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from typing import Optional, List
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import List, Optional, Tuple
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
select, update
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
|
||||
from models.market import (
|
||||
NavTop,
|
||||
NavSub,
|
||||
Listing,
|
||||
ListingItem,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
async def capture_listing(
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Persist one listing snapshot for *url* in its own session and commit.

    Args:
        url: the category listing URL the slugs were scraped from.
        items: product slugs seen on that listing.
        total_pages: pagination depth reported by the listing page.
    """
    async with get_session() as session:
        await _capture_listing(session, url, items, total_pages)
        await session.commit()
|
||||
|
||||
|
||||
async def _capture_listing(
    session,
    url: str,
    items: List[str],
    total_pages: int
) -> None:
    """Resolve nav ids for *url*, then sync the listing rows under them."""
    nav_ids = await _nav_ids_from_list_url(session, url)
    await _save_listing(session, nav_ids[0], nav_ids[1], items, total_pages)
|
||||
|
||||
async def _save_listing(session: AsyncSession, top_id: int, sub_id: Optional[int],
                        items: List[str], total_pages: Optional[int]) -> None:
    """Sync the listing for (top_id, sub_id) to exactly the given slugs.

    Upserts the Listing row, then diffs the incoming slugs against the live
    ListingItem rows: slugs no longer present are soft-deleted, new slugs are
    inserted. Caller is responsible for committing the session.
    """
    # Find (or create) the live Listing row for this nav pair.
    res = await session.execute(
        select(Listing).where(Listing.top_id == top_id, Listing.sub_id == sub_id, Listing.deleted_at.is_(None))
    )
    listing = res.scalar_one_or_none()
    if not listing:
        listing = Listing(top_id=top_id, sub_id=sub_id, total_pages=total_pages)
        session.add(listing)
        # Flush so listing.id is assigned before child rows reference it.
        await session.flush()
    else:
        listing.total_pages = total_pages

    # Normalize and deduplicate incoming slugs (order-preserving; non-string
    # and falsy entries are dropped).
    seen: set[str] = set()
    deduped: list[str] = []
    for s in items or []:
        if s and isinstance(s, str) and s not in seen:
            seen.add(s)
            deduped.append(s)

    # NOTE(review): with no valid slugs we return without soft-deleting the
    # existing items — an empty listing leaves stale rows live; confirm intended.
    if not deduped:
        return

    # Fetch existing (live) slugs from the database for the diff.
    res = await session.execute(
        select(ListingItem.slug)
        .where(ListingItem.listing_id == listing.id, ListingItem.deleted_at.is_(None))
    )
    existing_slugs = set(res.scalars().all())

    now = datetime.utcnow()

    # Slugs to delete (present in DB but not in the new data) — soft delete.
    to_delete = existing_slugs - seen
    if to_delete:
        await session.execute(
            update(ListingItem)
            .where(
                ListingItem.listing_id == listing.id,
                ListingItem.slug.in_(to_delete),
                ListingItem.deleted_at.is_(None)
            )
            .values(deleted_at=now)
        )

    # Slugs to insert (new ones not in DB) — bulk insert via the PostgreSQL
    # dialect's INSERT (on_conflict handling currently disabled).
    to_insert = seen - existing_slugs
    if to_insert:
        stmt = pg_insert(ListingItem).values(
            [{"listing_id": listing.id, "slug": s} for s in to_insert]
        )
        #.on_conflict_do_nothing(
        #    constraint="uq_listing_items_listing_slug"
        #)
        await session.execute(stmt)
|
||||
|
||||
async def _nav_ids_from_list_url(session: AsyncSession, list_url: str) -> Tuple[int, Optional[int]]:
    """Map a category listing URL to its (top_id, sub_id) nav ids.

    The first path segment is the top-category slug (lower-cased); the
    optional second segment is the sub-category slug with a trailing
    ".html"/".htm" extension stripped case-insensitively.

    Raises:
        ValueError: propagated from _get_nav_ids when either slug is unknown.
    """
    parts = [x for x in (urlparse(list_url).path or "").split("/") if x]
    top_slug = parts[0].lower() if parts else ""
    sub_slug = None
    if len(parts) >= 2:
        sub_slug = parts[1]
        if sub_slug.lower().endswith((".html", ".htm")):
            # BUG FIX: previous pattern r"\\.(html?|HTML?)$" double-escaped the
            # dot inside a raw string, requiring a literal backslash — it never
            # matched, so the extension the endswith() guard had just confirmed
            # was never stripped. IGNORECASE also covers mixed-case extensions.
            sub_slug = re.sub(r"\.html?$", "", sub_slug, flags=re.IGNORECASE)
    return await _get_nav_ids(session, top_slug, sub_slug)
|
||||
|
||||
|
||||
|
||||
async def _get_nav_ids(session: AsyncSession, top_slug: str, sub_slug: Optional[str]) -> Tuple[int, Optional[int]]:
    """Resolve live NavTop/NavSub ids for the given slugs.

    Raises ValueError when the top slug, or a supplied sub slug, has no
    matching non-deleted row.
    """
    top_res = await session.execute(select(NavTop.id).where(NavTop.slug == top_slug, NavTop.deleted_at.is_(None)))
    top_id = top_res.scalar_one_or_none()
    if not top_id:
        raise ValueError(f"NavTop not found for slug: {top_slug}")

    # No sub-category requested: done.
    if not sub_slug:
        return top_id, None

    sub_res = await session.execute(
        select(NavSub.id).where(NavSub.slug == sub_slug, NavSub.top_id == top_id, NavSub.deleted_at.is_(None))
    )
    sub_id = sub_res.scalar_one_or_none()
    if sub_id is None:
        raise ValueError(f"NavSub not found for slug: {sub_slug} under top_id={top_id}")
    return top_id, sub_id
|
||||
35
market/scrape/persist_snapshot/log_product_result.py
Normal file
35
market/scrape/persist_snapshot/log_product_result.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import Dict
|
||||
from models.market import (
|
||||
ProductLog,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
|
||||
async def log_product_result(ok: bool, payload: Dict) -> None:
    """Record one product scrape outcome in its own session and commit."""
    async with get_session() as session:
        await _log_product_result(session, ok, payload)
        await session.commit()
|
||||
|
||||
|
||||
async def _log_product_result(session: AsyncSession, ok: bool, payload: Dict) -> None:
    """Stage a ProductLog row on *session* (caller commits).

    Every field is copied verbatim from *payload* by key; absent keys become
    None, exactly as the column-by-column .get() calls did.
    """
    fields = (
        "slug", "href_tried", "error_type", "error_message", "http_status",
        "final_url", "transport_error", "title", "has_description_html",
        "has_description_short", "sections_count", "images_count",
        "embedded_images_count", "all_images_count",
    )
    session.add(ProductLog(ok=ok, **{name: payload.get(name) for name in fields}))
|
||||
|
||||
29
market/scrape/persist_snapshot/save_link_reports.py
Normal file
29
market/scrape/persist_snapshot/save_link_reports.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from typing import List
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
from models.market import (
|
||||
LinkError,
|
||||
LinkExternal,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
async def save_link_reports(link_errors: List[Dict], link_externals: List[Dict]) -> None:
    """Persist broken-link and external-link reports in a single transaction.

    Args:
        link_errors: dicts with product/href/text/top/sub/target_slug/type keys.
        link_externals: dicts with product/href/text/host keys.
    """
    async with get_session() as session:
        session.add_all([
            LinkError(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                top=rec.get("top"),
                sub=rec.get("sub"),
                target_slug=rec.get("target_slug"),
                type=rec.get("type"),
            )
            for rec in link_errors
        ])
        session.add_all([
            LinkExternal(
                product_slug=rec.get("product"),
                href=rec.get("href"),
                text=rec.get("text"),
                host=rec.get("host"),
            )
            for rec in link_externals
        ])
        await session.commit()
|
||||
110
market/scrape/persist_snapshot/save_nav.py
Normal file
110
market/scrape/persist_snapshot/save_nav.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
select, tuple_
|
||||
)
|
||||
from typing import Dict
|
||||
|
||||
from models.market import (
|
||||
NavTop,
|
||||
NavSub,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
|
||||
|
||||
|
||||
async def save_nav(nav: Dict) -> None:
    """Sync the scraped navigation tree to the DB in its own session and commit."""
    async with get_session() as session:
        await _save_nav(session, nav)
        await session.commit()
|
||||
|
||||
async def _save_nav(session, nav: Dict, market_id=None) -> None:
    """Sync NavTop/NavSub rows to the scraped nav tree.

    Expects nav["cats"] to map top-category label -> {"slug": ..., "subs":
    [{"slug", "label", "href"}, ...]}. Rows absent from the incoming tree are
    soft-deleted; present ones are upserted (and revived if previously
    soft-deleted). Caller commits the session.
    """
    # TODO(review): debug prints left in — consider replacing with logging.
    print('===================SAVE NAV========================')
    print(nav)
    now = datetime.utcnow()

    incoming_top_slugs = set()
    incoming_sub_keys = set()  # (top_slug, sub_slug)

    # First pass: collect slugs
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue
        incoming_top_slugs.add(top_slug)

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if sub_slug:
                incoming_sub_keys.add((top_slug, sub_slug))

    # Soft-delete stale NavSub entries
    # This requires joining NavTop to access top_slug
    # NOTE(review): if incoming_sub_keys is empty, ~in_(()) is true for every
    # row and ALL live subs get soft-deleted — confirm that an empty nav
    # snapshot should wipe the tree.
    subs_to_delete = await session.execute(
        select(NavSub)
        .join(NavTop, NavSub.top_id == NavTop.id)
        .where(
            NavSub.deleted_at.is_(None),
            ~tuple_(NavTop.slug, NavSub.slug).in_(incoming_sub_keys)
        )
    )
    for sub in subs_to_delete.scalars():
        sub.deleted_at = now

    # Soft-delete stale NavTop entries
    tops_to_delete = await session.execute(
        select(NavTop)
        .where(
            NavTop.deleted_at.is_(None),
            ~NavTop.slug.in_(incoming_top_slugs)
        )
    )
    for top in tops_to_delete.scalars():
        top.deleted_at = now

    # Push the soft-deletes before re-querying in the upsert pass below.
    await session.flush()

    # Upsert NavTop and NavSub
    for label, data in (nav.get("cats") or {}).items():
        top_slug = (data or {}).get("slug")
        if not top_slug:
            continue

        # Look up by slug regardless of deleted_at so a previously
        # soft-deleted top is revived rather than duplicated.
        res = await session.execute(
            select(NavTop).where(NavTop.slug == top_slug)
        )
        top = res.scalar_one_or_none()

        if top:
            top.label = label
            top.deleted_at = None
            # Only backfill market_id; never overwrite an existing one.
            if market_id is not None and top.market_id is None:
                top.market_id = market_id
        else:
            top = NavTop(label=label, slug=top_slug, market_id=market_id)
            session.add(top)

        # Flush so top.id is populated before creating child NavSub rows.
        await session.flush()

        for s in (data.get("subs") or []):
            sub_slug = s.get("slug")
            if not sub_slug:
                continue
            sub_label = s.get("label")
            sub_href = s.get("href")

            res_sub = await session.execute(
                select(NavSub).where(
                    NavSub.slug == sub_slug,
                    NavSub.top_id == top.id
                )
            )
            sub = res_sub.scalar_one_or_none()
            if sub:
                # Existing (possibly soft-deleted) sub: update and revive.
                sub.label = sub_label
                sub.href = sub_href
                sub.deleted_at = None
            else:
                session.add(NavSub(top_id=top.id, label=sub_label, slug=sub_slug, href=sub_href))
|
||||
|
||||
32
market/scrape/persist_snapshot/save_subcategory_redirects.py
Normal file
32
market/scrape/persist_snapshot/save_subcategory_redirects.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# at top of persist_snapshot.py:
|
||||
|
||||
from typing import Dict
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
update
|
||||
)
|
||||
from models.market import (
|
||||
SubcategoryRedirect,
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
async def save_subcategory_redirects(mapping: Dict[str, str]) -> None:
    """Replace the live subcategory-redirect table with *mapping* and commit."""
    async with get_session() as session:
        await _save_subcategory_redirects(session, mapping)
        await session.commit()
|
||||
|
||||
|
||||
async def _save_subcategory_redirects(session, mapping: Dict[str, str]) -> None:
    """Soft-delete every live redirect row, then stage the fresh mapping.

    Caller commits; mapping is old_path -> new_path.
    """
    stale = (
        update(SubcategoryRedirect)
        .where(SubcategoryRedirect.deleted_at.is_(None))
        .values(deleted_at=datetime.utcnow())
    )
    await session.execute(stale)
    session.add_all([
        SubcategoryRedirect(old_path=old, new_path=new)
        for old, new in mapping.items()
    ])
|
||||
|
||||
|
||||
|
||||
#for slug in items:
|
||||
# product_slugs.add(slug)
|
||||
237
market/scrape/persist_snapshot/upsert_product.py
Normal file
237
market/scrape/persist_snapshot/upsert_product.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# at top of persist_snapshot.py:
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from typing import Dict
|
||||
from datetime import datetime
|
||||
from sqlalchemy import (
|
||||
func, select, update
|
||||
)
|
||||
|
||||
from models.market import (
|
||||
Product,
|
||||
ProductImage,
|
||||
ProductSection,
|
||||
ProductLabel,
|
||||
ProductSticker,
|
||||
ProductAttribute,
|
||||
ProductNutrition,
|
||||
ProductAllergen
|
||||
)
|
||||
from shared.db.session import get_session
|
||||
|
||||
from ._get import _get
|
||||
from .log_product_result import _log_product_result
|
||||
|
||||
# --- Models are unchanged, see original code ---
|
||||
|
||||
# ---------------------- Helper fns called from scraper ------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
async def _upsert_product(session: AsyncSession, d: Dict) -> Product:
    """Upsert one product and diff-sync all of its child tables.

    Scalar fields are overwritten from *d* (empty values coerced to None via
    _get). Each child table (sections, images, labels, stickers, attributes,
    nutrition, allergens) is synced by set difference: new rows are added,
    rows absent from *d* are soft-deleted. Caller commits the session.

    Raises:
        ValueError: when *d* has no "slug".
    """
    slug = d.get("slug")
    if not slug:
        raise ValueError("product missing slug")
    res = await session.execute(select(Product).where(Product.slug == slug, Product.deleted_at.is_(None)))
    p = res.scalar_one_or_none()
    if not p:
        p = Product(slug=slug)
        session.add(p)
        # BUG FIX: flush so p.id is assigned before the child-table syncs
        # below build rows with product_id=p.id — previously a brand-new
        # product left p.id as None, so every child row was created with a
        # NULL product_id. Mirrors the add-then-flush pattern in _save_listing.
        await session.flush()

    # --- Scalar fields (empty string/list/dict/None normalized by _get) ---
    p.title = _get(d, "title")
    p.image = _get(d, "image")
    p.description_short = _get(d, "description_short")
    p.description_html = _get(d, "description_html")
    p.suma_href = _get(d, "suma_href")
    p.brand = _get(d, "brand")
    p.rrp = _get(d, "rrp")
    p.rrp_currency = _get(d, "rrp_currency")
    p.rrp_raw = _get(d, "rrp_raw")
    p.price_per_unit = _get(d, "price_per_unit")
    p.price_per_unit_currency = _get(d, "price_per_unit_currency")
    p.price_per_unit_raw = _get(d, "price_per_unit_raw")
    p.special_price = _get(d, "special_price")
    p.special_price_currency = _get(d, "special_price_currency")
    p.special_price_raw = _get(d, "special_price_raw")
    p.regular_price = _get(d, "regular_price")
    p.regular_price_currency = _get(d, "regular_price_currency")
    p.regular_price_raw = _get(d, "regular_price_raw")
    p.case_size_count = _get(d, "case_size_count")
    p.case_size_item_qty = _get(d, "case_size_item_qty")
    p.case_size_item_unit = _get(d, "case_size_item_unit")
    p.case_size_raw = _get(d, "case_size_raw")
    p.ean = d.get("ean") or d.get("barcode") or None
    p.sku = d.get("sku")
    p.unit_size = d.get("unit_size")
    p.pack_size = d.get("pack_size")
    p.updated_at = func.now()

    now = datetime.utcnow()

    # ProductSection sync: keyed by (title, html)
    existing_sections = await session.execute(select(ProductSection).where(ProductSection.product_id == p.id, ProductSection.deleted_at.is_(None)))
    existing_sections_set = {(s.title, s.html) for s in existing_sections.scalars()}

    new_sections_set = set()
    for sec in d.get("sections") or []:
        if isinstance(sec, dict) and sec.get("title") and sec.get("html"):
            new_sections_set.add((sec["title"], sec["html"]))
            if (sec["title"], sec["html"]) not in existing_sections_set:
                session.add(ProductSection(product_id=p.id, title=sec["title"], html=sec["html"]))

    for s in existing_sections_set - new_sections_set:
        await session.execute(update(ProductSection).where(ProductSection.product_id == p.id, ProductSection.title == s[0], ProductSection.html == s[1], ProductSection.deleted_at.is_(None)).values(deleted_at=now))

    # ProductImage sync: keyed by (url, kind), position recorded per bucket
    existing_images = await session.execute(select(ProductImage).where(ProductImage.product_id == p.id, ProductImage.deleted_at.is_(None)))
    existing_images_set = {(img.url, img.kind) for img in existing_images.scalars()}

    new_images_set = set()
    for kind, urls in [
        ("gallery", d.get("images") or []),
        ("embedded", d.get("embedded_image_urls") or []),
        ("all", d.get("all_image_urls") or []),
    ]:
        for idx, url in enumerate(urls):
            if url:
                new_images_set.add((url, kind))
                if (url, kind) not in existing_images_set:
                    session.add(ProductImage(product_id=p.id, url=url, position=idx, kind=kind))

    for img in existing_images_set - new_images_set:
        await session.execute(update(ProductImage).where(ProductImage.product_id == p.id, ProductImage.url == img[0], ProductImage.kind == img[1], ProductImage.deleted_at.is_(None)).values(deleted_at=now))

    # ProductLabel sync: names stripped
    existing_labels = await session.execute(select(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.deleted_at.is_(None)))
    existing_labels_set = {label.name.strip() for label in existing_labels.scalars()}

    new_labels = {str(name).strip() for name in (d.get("labels") or []) if name}

    for name in new_labels - existing_labels_set:
        session.add(ProductLabel(product_id=p.id, name=name))

    for name in existing_labels_set - new_labels:
        await session.execute(update(ProductLabel).where(ProductLabel.product_id == p.id, ProductLabel.name == name, ProductLabel.deleted_at.is_(None)).values(deleted_at=now))

    # ProductSticker sync: names stripped + lower-cased
    existing_stickers = await session.execute(select(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.deleted_at.is_(None)))
    existing_stickers_set = {sticker.name.strip() for sticker in existing_stickers.scalars()}

    new_stickers = {str(name).strip().lower() for name in (d.get("stickers") or []) if name}

    for name in new_stickers - existing_stickers_set:
        session.add(ProductSticker(product_id=p.id, name=name))

    for name in existing_stickers_set - new_stickers:
        await session.execute(update(ProductSticker).where(ProductSticker.product_id == p.id, ProductSticker.name == name, ProductSticker.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAttribute sync: info_table/oe_list_price flattened to prefixed keys
    existing_attrs = await session.execute(select(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.deleted_at.is_(None)))
    existing_attrs_set = {(a.key, a.value) for a in existing_attrs.scalars()}

    new_attrs_set = set()
    for src, prefix in [(d.get("info_table") or {}, "info_table"), (d.get("oe_list_price") or {}, "oe_list_price")]:
        for k, v in src.items():
            key = f"{prefix}/{str(k).strip()}"
            val = None if v is None else str(v)
            new_attrs_set.add((key, val))
            if (key, val) not in existing_attrs_set:
                session.add(ProductAttribute(product_id=p.id, key=key, value=val))

    for key, val in existing_attrs_set - new_attrs_set:
        await session.execute(update(ProductAttribute).where(ProductAttribute.product_id == p.id, ProductAttribute.key == key, ProductAttribute.value == val, ProductAttribute.deleted_at.is_(None)).values(deleted_at=now))

    # ProductNutrition sync: accepts a flat dict (no unit) or a list of rows
    existing_nuts = await session.execute(select(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.deleted_at.is_(None)))
    existing_nuts_set = {(n.key, n.value, n.unit) for n in existing_nuts.scalars()}

    new_nuts_set = set()
    nutrition = d.get("nutrition") or []
    if isinstance(nutrition, dict):
        for k, v in nutrition.items():
            key, val = str(k).strip(), str(v) if v is not None else None
            new_nuts_set.add((key, val, None))
            if (key, val, None) not in existing_nuts_set:
                session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=None))
    elif isinstance(nutrition, list):
        for row in nutrition:
            try:
                key = str(row.get("key") or "").strip()
                val = None if row.get("value") is None else str(row.get("value"))
                unit = None if row.get("unit") is None else str(row.get("unit"))
                if key:
                    new_nuts_set.add((key, val, unit))
                    if (key, val, unit) not in existing_nuts_set:
                        session.add(ProductNutrition(product_id=p.id, key=key, value=val, unit=unit))
            except Exception:
                # Malformed rows (e.g. non-dicts) are skipped best-effort.
                continue

    for key, val, unit in existing_nuts_set - new_nuts_set:
        await session.execute(update(ProductNutrition).where(ProductNutrition.product_id == p.id, ProductNutrition.key == key, ProductNutrition.value == val, ProductNutrition.unit == unit, ProductNutrition.deleted_at.is_(None)).values(deleted_at=now))

    # ProductAllergen sync: bare strings mean contains=True
    existing_allergens = await session.execute(select(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.deleted_at.is_(None)))
    existing_allergens_set = {(a.name, a.contains) for a in existing_allergens.scalars()}

    new_allergens_set = set()
    for a in d.get("allergens") or []:
        if isinstance(a, str):
            nm, contains = a.strip(), True
        elif isinstance(a, dict):
            nm, contains = (a.get("name") or "").strip(), bool(a.get("contains", True))
        else:
            continue
        if nm:
            new_allergens_set.add((nm, contains))
            if (nm, contains) not in existing_allergens_set:
                session.add(ProductAllergen(product_id=p.id, name=nm, contains=contains))

    for name, contains in existing_allergens_set - new_allergens_set:
        await session.execute(update(ProductAllergen).where(ProductAllergen.product_id == p.id, ProductAllergen.name == name, ProductAllergen.contains == contains, ProductAllergen.deleted_at.is_(None)).values(deleted_at=now))

    await session.flush()
    return p
|
||||
|
||||
async def upsert_product(
    slug,
    href,
    d,
):
    """Upsert one scraped product in its own session and log the outcome.

    On success, the product rows and an ok=True ProductLog entry are committed
    together. On failure, diagnostics are printed, an ok=False entry is staged,
    and the original exception is re-raised before commit.
    NOTE(review): the failure-path log entry is never committed here — confirm
    get_session persists it on exit, otherwise failures are not recorded.

    Args:
        slug: product slug, recorded on the success log entry.
        href: the URL that was fetched, recorded as href_tried.
        d: the scraped product payload.
    """
    async with get_session() as session:
        try:
            await _upsert_product(session, d)
            await _log_product_result(session, ok=True, payload={
                "slug": slug,
                "href_tried": href,
                "title": d.get("title"),
                "has_description_html": bool(d.get("description_html")),
                "has_description_short": bool(d.get("description_short")),
                "sections_count": len(d.get("sections") or []),
                # BUG FIX: the three counts below previously called len()
                # directly on d.get(...), raising TypeError when the key was
                # missing/None — now guarded like sections_count above.
                "images_count": len(d.get("images") or []),
                "embedded_images_count": len(d.get("embedded_image_urls") or []),
                "all_images_count": len(d.get("all_image_urls") or []),
            })

        except Exception as e:
            print(f"[ERROR] Failed to upsert product '{d.get('slug')}'")
            # BUG FIX: was f" Title: {d}.get('title')" — the .get() call sat
            # outside the braces, printing the entire dict plus literal text.
            print(f" Title: {d.get('title')}")
            print(f" URL: {d.get('suma_href')}")
            print(f" Error type: {type(e).__name__}")
            print(f" Error message: {str(e)}")
            import traceback
            traceback.print_exc()
            await _log_product_result(session, ok=False, payload={
                "slug": d.get("slug"),
                "href_tried": d.get("suma_href"),
                "error_type": type(e).__name__,
                "error_message": str(e),
                "title": d.get("title"),
            })
            raise
        await session.commit()
|
||||
1
market/scrape/product/__init__.py
Normal file
1
market/scrape/product/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
13
market/scrape/product/extractors/__init__.py
Normal file
13
market/scrape/product/extractors/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Auto-import all extractor modules so they register themselves.
|
||||
from .title import ex_title # noqa: F401
|
||||
from .images import ex_images # noqa: F401
|
||||
from .short_description import ex_short_description # noqa: F401
|
||||
from .description_sections import ex_description_sections # noqa: F401
|
||||
from .nutrition_ex import ex_nutrition # noqa: F401
|
||||
from .stickers import ex_stickers # noqa: F401
|
||||
from .labels import ex_labels # noqa: F401
|
||||
from .info_table import ex_info_table # noqa: F401
|
||||
from .oe_list_price import ex_oe_list_price # noqa: F401
|
||||
from .regular_price_fallback import ex_regular_price_fallback # noqa: F401
|
||||
from .breadcrumbs import ex_breadcrumbs # noqa: F401
|
||||
68
market/scrape/product/extractors/breadcrumbs.py
Normal file
68
market/scrape/product/extractors/breadcrumbs.py
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Union
|
||||
from urllib.parse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse breadcrumbs to identify top and sub categories.

    Returns a dict with "category_breadcrumbs" (list of {title, href, slug})
    plus category_top_* / category_sub_* fields and "category_path" when the
    first two non-Home linked crumbs can be identified. Returns {} when no
    breadcrumb list is found.
    """
    # Try progressively looser selectors for the breadcrumb <ul>.
    bc_ul = (soup.select_one(".breadcrumbs ul.items")
             or soup.select_one("nav.breadcrumbs ul.items")
             or soup.select_one("ul.items"))
    if not bc_ul:
        return {}

    crumbs = []
    for li in bc_ul.select("li.item"):
        a = li.find("a")
        if a:
            title = normalize_text(a.get("title") or a.get_text())
            href = a.get("href")
        else:
            title = normalize_text(li.get_text())
            href = None
        # Slug = last path segment of the crumb's href, if parseable.
        slug = None
        if href:
            try:
                p = urlparse(href)
                path = (p.path or "").strip("/")
                slug = path.split("/")[-1] if path else None
            except Exception:
                slug = None
        # NOTE(review): crumbs without a slug (typically the link-less final
        # crumb for the current page) are dropped entirely — confirm intended.
        if slug:
            crumbs.append({"title": title or None, "href": href or None, "slug": slug})

    # Heuristic: first non-Home linked crumb is the top category, the next
    # one the sub category.
    category_links = [c for c in crumbs if c.get("href")]
    top = None
    sub = None
    for c in category_links:
        t = (c.get("title") or "").lower()
        s = (c.get("slug") or "").lower()
        if t == "home" or s in ("", "home"):
            continue
        if top is None:
            top = c
            continue
        if sub is None:
            sub = c
            break

    out: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": crumbs
    }
    if top:
        out["category_top_title"] = top.get("title")
        out["category_top_href"] = top.get("href")
        out["category_top_slug"] = top.get("slug")
    if sub:
        out["category_sub_title"] = sub.get("title")
        out["category_sub_href"] = sub.get("href")
        out["category_sub_slug"] = sub.get("slug")
    if top and sub:
        out["category_path"] = f"{(top.get('slug') or '').strip()}/{(sub.get('slug') or '').strip()}"
    return out
|
||||
43
market/scrape/product/extractors/description_sections.py
Normal file
43
market/scrape/product/extractors/description_sections.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
additional_attributes_table,
|
||||
)
|
||||
from ..helpers.text import clean_title, is_blacklisted_heading
|
||||
|
||||
@extractor
def ex_description_sections(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the product description HTML and titled content sections.

    Returns {"sections": [{"title", "html"}, ...]} and, when found,
    "description_html". Sources, in priority order: the description
    container, Magento tab panes (falling back to heading scan), and the
    additional-attributes table.
    """
    description_html = None
    sections: List[Dict] = []
    # Primary source: the dedicated description container, split into an
    # opening fragment plus any titled sub-sections it contains.
    desc_el = find_description_container(soup)
    if desc_el:
        open_html, sections_from_desc = split_description_container(desc_el)
        description_html = open_html or None
        sections.extend(sections_from_desc)

    # Track titles case-insensitively to avoid duplicate sections.
    existing = {s["title"].lower() for s in sections}
    # Secondary source: Magento tab panes; heading scan only if no tabs found.
    for t, html_fragment in (pair_title_content_from_magento_tabs(soup) or scan_headings_for_sections(soup)):
        low = t.lower()
        # Description-like tabs feed description_html instead of sections.
        if "product description" in low or low == "description" or "details" in low:
            if not description_html and html_fragment:
                description_html = absolutize_fragment(html_fragment)
            continue
        # Keep only new, non-empty, non-blacklisted sections.
        if t.lower() not in existing and normalize_text(BeautifulSoup(html_fragment, "lxml").get_text()):
            if not is_blacklisted_heading(t):
                sections.append({"title": clean_title(t), "html": absolutize_fragment(html_fragment)})
                existing.add(t.lower())
    # Tertiary source: the additional-attributes table, as its own section.
    addl = additional_attributes_table(soup)
    if addl and "additional information" not in existing and not is_blacklisted_heading("additional information"):
        sections.append({"title": "Additional Information", "html": addl})
    out = {"sections": sections}
    if description_html:
        out["description_html"] = description_html
    return out
|
||||
|
||||
89
market/scrape/product/extractors/images.py
Normal file
89
market/scrape/product/extractors/images.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from __future__ import annotations
|
||||
import json, re
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
|
||||
|
||||
@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """
    Collect product image URLs with three fallbacks, stopping at the first
    source that yields anything:
      1. the Magento gallery init script (x-magento-init JSON),
      2. JSON-LD "image" entries,
      3. a generic DOM scan of gallery <img> elements.

    Returns {"images": [...], "image": first-or-None}, deduplicated by
    host + filename.
    """
    images: List[str] = []
    debug = False  # set True while debugging

    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug: print(f"[ex_images] x-magento-init scripts: {len(scripts)}")

    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        # Correct (not over-escaped) patterns:
        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
        if not m:
            if debug: print("[ex_images] 'data' array not found in gallery block")
            continue

        arr_txt = m.group(1)
        added = False
        try:
            data = json.loads(arr_txt)
            for entry in data:
                # Prefer the full-size URL; fall back to the display image.
                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
                if u:
                    images.append(u); added = True
        except Exception as e:
            if debug: print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            # Fallback to simple key extraction
            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
            imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
            for u in (fulls or imgs):
                u = abs_url(u)
                if u:
                    images.append(u); added = True

        if added:
            break  # got what we need from the gallery block

    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except Exception:
                continue

            def add_from(val):
                # JSON-LD "image" may be a string, a list (of strings or
                # ImageObject dicts), or a single ImageObject dict.
                if isinstance(val, str):
                    u = abs_url(val); u and images.append(u)
                elif isinstance(val, list):
                    for v in val:
                        if isinstance(v, str):
                            u = abs_url(v); u and images.append(u)
                        elif isinstance(v, dict) and "url" in v:
                            u = abs_url(v["url"]); u and images.append(u)
                elif isinstance(val, dict) and "url" in val:
                    u = abs_url(val["url"]); u and images.append(u)

            if isinstance(data, dict) and "image" in data:
                add_from(data["image"])
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict) and "image" in item:
                        add_from(item["image"])

    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)

    images = dedup_by_filename(images)
    if debug: print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}
|
||||
76
market/scrape/product/extractors/info_table.py
Normal file
76
market/scrape/product/extractors/info_table.py
Normal file
@@ -0,0 +1,76 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price, parse_case_size
|
||||
|
||||
@extractor
def ex_info_table(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse <div class="product-page-info-table"> label/content rows.

    Produces:
        info_table (raw label->value map), brand,
        rrp[_raw|_currency], price_per_unit[_raw|_currency],
        case_size_* fields.
    Returns {} when the table or its rows are absent.
    """
    table = soup.select_one(".product-page-info-table")
    if table is None:
        return {}
    body = table.select_one(".product-page-info-table-rows") or table
    row_els = body.select(".product-page-info-table-row")
    if not row_els:
        return {}

    # Raw label -> value map, both normalized.
    info: Dict[str, str] = {}
    for row in row_els:
        label_el = row.select_one(".product-page-info-table__label")
        content_el = row.select_one(".product-page-info-table__content")
        if label_el is None or content_el is None:
            continue
        key = normalize_text(label_el.get_text())
        if key:
            info[key] = normalize_text(content_el.get_text())

    result: Dict[str, Union[str, float, int, Dict]] = {"info_table": info}

    # Brand appears under two known label spellings.
    brand = info.get("Brand") or info.get("Brand Name")
    if brand:
        result["brand"] = brand

    # RRP and Price Per Unit share one shape: raw text + value + currency.
    for primary, fallback, prefix in (
        ("RRP", None, "rrp"),
        ("Price Per Unit", "Unit Price", "price_per_unit"),
    ):
        text = info.get(primary, "") or (info.get(fallback, "") if fallback else "")
        value, currency, raw = parse_price(text)
        if raw and (value is not None or currency is not None):
            result[f"{prefix}_raw"] = raw
            if value is not None:
                result[prefix] = value
            if currency:
                result[f"{prefix}_currency"] = currency

    # Case Size / Pack Size, e.g. "6 x 500g".
    case_text = info.get("Case Size", "") or info.get("Pack Size", "")
    count, item_qty, item_unit, case_raw = parse_case_size(case_text)
    if case_raw:
        result["case_size_raw"] = case_raw
    if count is not None:
        result["case_size_count"] = count
    if item_qty is not None:
        result["case_size_item_qty"] = item_qty
    if item_unit:
        result["case_size_item_unit"] = item_unit

    return result
|
||||
41
market/scrape/product/extractors/labels.py
Normal file
41
market/scrape/product/extractors/labels.py
Normal file
@@ -0,0 +1,41 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """
    Parse badge items from:
        <ul class="cdz-product-labels">
          <li class="label-item new"><div class="label-content">NEW</div></li>
        </ul>

    Returns {"labels": [...]}: the lower-cased, order-preserving union of each
    item's extra CSS classes and its visible text. {} when nothing is found.
    """
    container = soup.select_one("ul.cdz-product-labels")
    if container is None:
        return {}

    class_hints: List[str] = []
    visible_texts: List[str] = []
    for item in container.select("li.label-item"):
        # Every class except the structural 'label-item' hints at a label name.
        for cls in item.get("class") or []:
            cls = (cls or "").strip()
            if cls and cls.lower() != "label-item" and cls not in class_hints:
                class_hints.append(cls)
        text = normalize_text(item.get_text())
        if text and text not in visible_texts:
            visible_texts.append(text)

    if not (class_hints or visible_texts):
        return {}

    seen = set()
    labels: List[str] = []
    for candidate in class_hints + [t.lower() for t in visible_texts]:
        norm = (candidate or "").strip().lower()
        if norm and norm not in seen:
            seen.add(norm)
            labels.append(norm)
    return {"labels": labels}
|
||||
129
market/scrape/product/extractors/nutrition_ex.py
Normal file
129
market/scrape/product/extractors/nutrition_ex.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
)
|
||||
|
||||
# ----- value/unit parser ------------------------------------------------------
|
||||
|
||||
# Matches "<number> <optional unit>" strings such as "12.5 g", "1,200 kJ",
# "45 %". The number may carry thousands separators; the unit is optional.
_NUM_UNIT_RE = re.compile(
    r"""
    ^\s*
    (?P<num>[-+]?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?|\d+(?:[.,]\d+)?)
    \s*
    (?P<unit>[a-zA-Z%µ/]+)?
    \s*$
    """,
    re.X,
)


def _parse_value_unit(s: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Split a nutrition cell like "250 kcal" into ("250", "kcal").

    Commas are stripped from the number (kept as a string); common kcal/kJ
    spellings are normalized. Returns (None, None) when the input is empty
    or does not look like a number with an optional unit.
    """
    if not s:
        return None, None
    match = _NUM_UNIT_RE.match(re.sub(r"\s+", " ", s.strip()))
    if match is None:
        return None, None
    number = (match.group("num") or "").replace(",", "")
    unit = match.group("unit") or None
    if unit is not None:
        folded = unit.lower()
        if folded in {"kcal", "kcal.", "kcalories", "kcalorie"}:
            unit = "kcal"
        elif folded in {"kj", "kj.", "kilojoule", "kilojoules"}:
            unit = "kJ"
    return (number or None, unit)
|
||||
|
||||
# ----- section finder ---------------------------------------------------------
|
||||
|
||||
def _find_nutrition_section_html(soup: BeautifulSoup) -> Optional[str]:
    """
    Return the HTML for the section whose title matches 'Nutritional Information'.
    We look in the same places your description extractor does.

    Search order: Magento tab panels, then the split description container,
    then a generic heading scan. Returns None when no match is found; the
    description-container branch may return "" if the section has no HTML.
    """
    # 1) Magento tabs
    for t, html in (pair_title_content_from_magento_tabs(soup) or []):
        if not t or not html:
            continue
        # Titles are compared case-insensitively with a trailing ':' dropped.
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    # 2) Description container split into sections
    desc_el = find_description_container(soup)
    if desc_el:
        _open_html, sections = split_description_container(desc_el)
        for sec in sections or []:
            title = normalize_text((sec.get("title") or "")).rstrip(":").lower()
            if "nutritional information" in title:
                return sec.get("html") or ""

    # 3) Fallback: generic heading scan
    for t, html in (scan_headings_for_sections(soup) or []):
        if not t or not html:
            continue
        title = normalize_text(t).rstrip(":").lower()
        if "nutritional information" in title:
            return html

    return None
|
||||
|
||||
# ----- table parser -----------------------------------------------------------
|
||||
|
||||
def _extract_rows_from_table(root: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Parse the first <table> under `root` into nutrition rows.

    Each <tr> yields {"key", "value", "unit"}: th+td rows use th as the key,
    otherwise the first two td cells are key/value. Values are split into
    number + unit where possible; unparseable values are kept raw with a
    None unit. Duplicate (key, value, unit) triples are dropped, order kept.
    """
    out: List[Dict[str, str]] = []
    table = root.select_one("table")
    if not table:
        return out

    for tr in table.select("tr"):
        th = tr.find("th")
        tds = tr.find_all("td")
        if th and tds:
            key = normalize_text(th.get_text(" ").strip())
            val_raw = normalize_text(tds[0].get_text(" ").strip())
        elif len(tds) >= 2:
            key = normalize_text(tds[0].get_text(" ").strip())
            val_raw = normalize_text(tds[1].get_text(" ").strip())
        else:
            # Row has neither a th+td pair nor two td cells; skip it.
            continue

        if not key or not val_raw:
            continue

        value, unit = _parse_value_unit(val_raw)
        if value is None:  # keep raw if not parseable
            value, unit = val_raw, None

        out.append({"key": key, "value": value, "unit": unit})

    # Deduplicate while preserving order
    seen = set()
    dedup: List[Dict[str, str]] = []
    for r in out:
        t = (r["key"], r.get("value"), r.get("unit"))
        if t in seen:
            continue
        seen.add(t)
        dedup.append(r)
    return dedup
|
||||
|
||||
# ----- extractor --------------------------------------------------------------
|
||||
|
||||
@extractor
def ex_nutrition(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract nutrition rows exclusively from the section titled
    'Nutritional Information'.

    Returns {"nutrition": [{"key": ..., "value": ..., "unit": ...}, ...]};
    the list is empty when no such section exists on the page.
    """
    html = _find_nutrition_section_html(soup)
    if not html:
        return {"nutrition": []}
    return {"nutrition": _extract_rows_from_table(BeautifulSoup(html, "lxml"))}
|
||||
56
market/scrape/product/extractors/oe_list_price.py
Normal file
56
market/scrape/product/extractors/oe_list_price.py
Normal file
@@ -0,0 +1,56 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_oe_list_price(soup: BeautifulSoup, url: str) -> Dict:
    """
    Extract Magento "oe-list-price" block:
        <div class="oe-list-price">
            <div class="rrp-price"><label>Regular Price: </label><span class="price">£30.50</span></div>
            <div class="oe-final-price"><label>Special Price: </label><span>£23.63</span></div>
        </div>
    Produces:
        oe_list_price: { rrp_raw, rrp, rrp_currency, special_raw, special, special_currency }
    Also promotes special_* to top-level (special_price_*) if available.
    """
    box = soup.select_one(".oe-list-price")
    if not box:
        return {}
    out: Dict[str, Union[str, float, dict]] = {}
    oe: Dict[str, Union[str, float]] = {}

    # RRP inside oe-list-price (if present)
    rrp = box.select_one(".rrp-price")
    if rrp:
        # Prefer the dedicated price span, then any span, then the row text.
        txt = (rrp.select_one("span.price") or rrp.select_one("span") or rrp).get_text(strip=True)
        val, cur, raw = parse_price(txt)
        if raw:
            oe["rrp_raw"] = raw
        if val is not None:
            oe["rrp"] = val
        if cur:
            oe["rrp_currency"] = cur

    # Special Price inside oe-list-price
    sp = box.select_one(".oe-final-price, .special-price, .final-price")
    if sp:
        txt = (sp.select_one("span.price") or sp.select_one("span") or sp).get_text(strip=True)
        val, cur, raw = parse_price(txt)
        if raw:
            oe["special_raw"] = raw
        if val is not None:
            oe["special"] = val
            # Promote to top-level so downstream consumers see special_price_*.
            out["special_price"] = val
        if cur:
            oe["special_currency"] = cur
            out["special_price_currency"] = cur
        if raw:
            out["special_price_raw"] = raw

    if oe:
        out["oe_list_price"] = oe
    return out
|
||||
33
market/scrape/product/extractors/regular_price_fallback.py
Normal file
33
market/scrape/product/extractors/regular_price_fallback.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_regular_price_fallback(soup: BeautifulSoup, url: str) -> Dict:
    """
    Fallback extractor for legacy 'Regular Price' blocks outside oe-list-price:
        <div class="rrp-price"><label>Regular Price: </label><span class="price">£16.55</span></div>

    Emits regular_price[_raw|_currency] and mirrors the values into the rrp_*
    fields without overwriting anything another extractor already supplied.
    """
    block = soup.select_one("div.rrp-price")
    if block is None:
        return {}
    price_el = block.select_one("span.price")
    text = price_el.get_text(strip=True) if price_el else block.get_text(" ", strip=True)
    value, currency, raw = parse_price(text or "")

    out: Dict[str, Union[str, float]] = {}
    if raw:
        out["regular_price_raw"] = raw
    if value is not None:
        out["regular_price"] = value
    if currency:
        out["regular_price_currency"] = currency
    # Mirror into rrp_* non-destructively.
    if value is not None:
        out.setdefault("rrp", value)
    if currency:
        out.setdefault("rrp_currency", currency)
    if raw:
        out.setdefault("rrp_raw", raw)
    return out
|
||||
19
market/scrape/product/extractors/short_description.py
Normal file
19
market/scrape/product/extractors/short_description.py
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_short_description(soup: BeautifulSoup, url: str) -> Dict:
    """
    First non-empty short description, tried in priority order: the Magento
    description/overview value blocks, then the meta description and
    og:description tags. Text is whitespace-normalized.
    """
    found = None
    selectors = (
        ".product.attribute.description .value",
        ".product.attribute.overview .value",
        "meta[name='description']",
        "meta[property='og:description']",
    )
    for selector in selectors:
        node = soup.select_one(selector)
        if node is None:
            continue
        found = normalize_text(node.get("content") if node.name == "meta" else node.get_text())
        if found:
            break
    return {"description_short": found}
|
||||
30
market/scrape/product/extractors/stickers.py
Normal file
30
market/scrape/product/extractors/stickers.py
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_stickers(soup: BeautifulSoup, url: str) -> Dict:
    """
    Collect sticker names from:
        <div class="stickers">
          <span class="sticker xxx"></span>
          ...
        </div>

    A name comes from each span's extra CSS classes (everything but
    'sticker') plus its data-sticker attribute; duplicates are dropped
    while keeping first-seen order.
    """
    container = soup.select_one("div.stickers")
    if container is None:
        return {"stickers": []}

    seen = set()
    names: List[str] = []
    for badge in container.select("span.sticker"):
        candidates = [
            c.strip() for c in (badge.get("class") or [])
            if c and c.lower() != "sticker"
        ]
        data_attr = (badge.get("data-sticker") or "").strip()
        if data_attr:
            candidates.append(data_attr)
        for name in candidates:
            if name and name not in seen:
                seen.add(name)
                names.append(name)
    return {"stickers": names}
|
||||
17
market/scrape/product/extractors/title.py
Normal file
17
market/scrape/product/extractors/title.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from shared.utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_title(soup: BeautifulSoup, url: str) -> Dict:
    """
    Product title, tried in priority order: Magento page-title heading,
    theme h1.product-name, then the og:title meta tag. Falls back to the
    literal "Product" when no selector yields text.

    Fix: meta content is now passed through normalize_text like heading
    text — previously og:title was returned raw, inconsistent with
    ex_short_description's handling of meta content.
    """
    title = None
    for sel in ["h1.page-title span", "h1.page-title", "h1.product-name", "meta[property='og:title']"]:
        el = soup.select_one(sel)
        if el:
            # Normalize both branches so meta-sourced titles get the same
            # whitespace cleanup as heading-sourced ones.
            title = normalize_text(el.get_text() if el.name != "meta" else el.get("content"))
            if title:
                break
    return {"title": title or "Product"}
|
||||
165
market/scrape/product/helpers/desc.py
Normal file
165
market/scrape/product/helpers/desc.py
Normal file
@@ -0,0 +1,165 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from shared.utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from .text import clean_title, is_blacklisted_heading
|
||||
from shared.config import config
|
||||
|
||||
|
||||
def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Extract sections from accordion blocks within the description container.

    Looks for headings with class 'accordion-title' and pairs each with its
    next element-sibling having class 'accordion-details'. Returns:
    - open_html: the remaining description HTML with those accordion blocks removed
    - sections: [{"title": ..., "html": ...}, ...]
    """
    # Work on an isolated copy to avoid mutating the original DOM
    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")

    # Collect candidate (heading, details) pairs without mutating during iteration
    pairs: List[Tuple[Tag, Tag]] = []
    for h in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"):
        if not isinstance(h, Tag):
            continue
        title = clean_title((h.get_text() or "").strip())
        if not title:
            continue

        # Walk forward siblings until we hit an element; accept the first with 'accordion-details'
        sib = h.next_sibling
        details: Optional[Tag] = None
        while sib is not None:
            if isinstance(sib, Tag):
                classes = sib.get("class") or []
                if "accordion-details" in classes:
                    details = sib
                # Stop at the first element sibling whether or not it matched.
                break
            sib = sib.next_sibling

        if details is not None:
            pairs.append((h, details))

    sections: List[Dict] = []

    # Extract sections, then remove nodes from frag
    for h, details in pairs:
        # Pull details HTML
        html = details.decode_contents()
        # Only keep non-empty (textual) content
        if normalize_text(BeautifulSoup(html, "lxml").get_text()):
            sections.append({
                "title": clean_title(h.get_text() or ""),
                "html": absolutize_fragment(html),
            })
        # Remove the matched nodes from the fragment copy
        details.decompose()
        h.decompose()

    # Whatever remains is the open description html
    open_html = absolutize_fragment(str(frag)) if frag else ""

    return open_html, sections
|
||||
|
||||
def pair_title_content_from_magento_tabs(soup: BeautifulSoup):
    """
    Pair Magento tab titles with their content panels.

    Panels are resolved via the title's aria-controls/data-target id, falling
    back to the next '.data.item.content' sibling. Returns a list of
    (title, absolutized_html) tuples, skipping blacklisted headings.
    """
    out = []
    container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items")
    if not container:
        return out
    titles = container.select(".data.item.title")
    for t in titles:
        title = normalize_text(t.get_text())
        if not title:
            continue
        content_id = t.get("aria-controls") or t.get("data-target")
        content = soup.select_one(f"#{content_id}") if content_id else None
        if content is None:
            # Fallback: the adjacent content panel sibling.
            sib = t.find_next_sibling(
                lambda x: isinstance(x, Tag) and "data" in x.get("class", []) and "item" in x.get("class", []) and "content" in x.get("class", [])
            )
            content = sib
        if content:
            html = content.decode_contents()
            if not is_blacklisted_heading(title):
                out.append((title, absolutize_fragment(html)))
    return out
|
||||
|
||||
def scan_headings_for_sections(soup: BeautifulSoup):
    """
    Generic fallback section finder: scan h2–h6 headings whose text matches a
    configured (or built-in description-like) title, and collect everything
    up to the next heading as that section's HTML.

    Returns a list of (title, absolutized_html) tuples, skipping blacklisted
    headings and empty bodies.
    """
    out = []
    # Narrowest plausible container first; fall back to the whole document.
    container = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )
    heads = container.select("h2, h3, h4, h5, h6")
    section_titles = (config().get("section-titles") or [])
    for h in heads:
        title = clean_title(h.get_text() or "")
        if not title:
            continue
        low = title.lower()
        if not any(k in low for k in section_titles + ["product description", "description", "details"]):
            continue
        parts: List[str] = []
        for sib in h.next_siblings:
            if isinstance(sib, NavigableString):
                parts.append(str(sib))
                continue
            # The next heading marks the end of this section.
            if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"):
                break
            if isinstance(sib, Tag):
                parts.append(str(sib))
        html = absolutize_fragment("".join(parts).strip())
        if html and not is_blacklisted_heading(title):
            out.append((title, html))
    return out
|
||||
|
||||
def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """
    Render Magento's 'additional attributes' table as a Tailwind-styled HTML
    fragment of key/value rows.

    Each <tr>'s key comes from its th (or first td) and its value from the
    last td. Returns None when the table is absent, yields no rows, or
    rendering raises.
    """
    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            th = tr.find("th") or tr.find("td")
            tds = tr.find_all("td")
            key = normalize_text(th.get_text()) if th else None
            # Last td: when th is actually the first td, this still picks the value cell.
            val = normalize_text(tds[-1].get_text()) if tds else None
            if key and val:
                rows.append((key, val))
        if not rows:
            return None
        items = "\n".join(
            [
                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
  <div class='col-span-1 font-medium'>{key}</div>
  <div class='col-span-2 text-stone-700'>{val}</div>
</div>"""
                for key, val in rows
            ]
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        # Best-effort: a malformed table silently yields no section.
        return None
|
||||
|
||||
def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """
    Locate the element holding the product description.

    Tries known description selectors first; otherwise finds a
    'Product description' heading and wraps its following siblings (up to the
    next heading) in a synthetic <div>.

    NOTE(review): wrapper.append(sib) relocates live nodes out of `soup`, so
    the heading-fallback path mutates the parsed tree — callers that re-use
    the same soup afterwards should re-parse; confirm this is intended.
    """
    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
                ".product.attribute.overview .value", ".product.info.detailed .value"]:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el
    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if txt.startswith("product description") or txt == "description":
            wrapper = soup.new_tag("div")
            for sib in h.next_siblings:
                if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"):
                    break
                wrapper.append(sib if isinstance(sib, Tag) else NavigableString(str(sib)))
            if normalize_text(wrapper.get_text()):
                return wrapper
    return None
|
||||
53
market/scrape/product/helpers/html.py
Normal file
53
market/scrape/product/helpers/html.py
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from shared.config import config
|
||||
|
||||
def first_from_srcset(val: str) -> Optional[str]:
    """
    Return the URL of the first candidate in a srcset attribute value.

    "a.jpg 1x, b.jpg 2x" -> "a.jpg". Returns None for empty input; a blank
    first candidate (e.g. a leading comma) yields the stripped empty string.
    """
    if not val:
        return None
    candidate = val.split(",", 1)[0].strip()
    tokens = candidate.split()
    if tokens:
        return tokens[0]
    return candidate
|
||||
|
||||
def abs_url(u: Optional[str]) -> Optional[str]:
    """
    Resolve a site-relative path ('/x') against the configured base_url.

    Absolute URLs and non-string values pass through untouched; falsy input
    yields None.
    """
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
|
||||
|
||||
def collect_img_candidates(el) -> List[str]:
    """
    Gather every plausible image URL carried by an element.

    Plain URL attributes come first (src, lazy-load variants, content, href),
    followed by the first candidate of each srcset-style attribute. Order is
    preserved; falsy elements yield [].
    """
    candidates: List[str] = []
    if not el:
        return candidates
    for attr in ("src", "data-src", "data-original", "data-zoom-image",
                 "data-thumb", "content", "href"):
        value = el.get(attr)
        if value:
            candidates.append(value)
    for attr in ("srcset", "data-srcset"):
        value = el.get(attr)
        if not value:
            continue
        head = first_from_srcset(value)
        if head:
            candidates.append(head)
    return candidates
|
||||
|
||||
def _filename_key(u: str) -> str:
    """
    Dedup key for an image URL: "<host>:<final path segment>", lower-cased.
    A single trailing slash is ignored so '/a/b/' and '/a/b' share a key.
    """
    parsed = urlparse(u)
    path = parsed.path or ""
    if path.endswith("/"):
        path = path[:-1]
    filename = path.rsplit("/", 1)[-1]
    return f"{parsed.netloc}:{filename}".lower()


def dedup_by_filename(urls: List[str]) -> List[str]:
    """
    Drop URLs whose host+filename key repeats, keeping the first occurrence
    of each key and the original ordering.
    """
    kept: List[str] = []
    seen_keys = set()
    for url in urls:
        key = _filename_key(url)
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(url)
    return kept
|
||||
42
market/scrape/product/helpers/price.py
Normal file
42
market/scrape/product/helpers/price.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
|
||||
def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]:
    """
    Parse a price-like string into (value, currency, raw).

    The symbol (£, €, $) maps to an ISO code; thousands commas are stripped.
    `raw` is always the stripped input, even when no number was found (in
    which case value and currency are None).
    """
    raw = (text or "").strip()
    match = re.search(r'([£€$])?\s*([0-9][0-9.,]*)', raw)
    if match is None:
        return None, None, raw
    symbol = match.group(1) or ""
    digits = match.group(2).replace(",", "")
    try:
        amount = float(digits)
    except ValueError:
        # e.g. "1.2.3" passes the regex but is not a valid float
        return None, None, raw
    return amount, {"£": "GBP", "€": "EUR", "$": "USD"}.get(symbol), raw
|
||||
|
||||
def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]:
    """
    Parse case-size strings such as "6 x 500g", "12x1L", "24 × 330 ml".

    Returns (count, item_qty, item_unit, raw). The first three are None when
    the text lacks a recognizable "<count> x <qty><unit>" pattern; item_qty
    alone is None if its digits cannot be converted to a float.
    """
    raw = (text or "").strip()
    if not raw:
        return None, None, None, raw
    # Normalize every multiplication mark (×, X, x) to a spaced lowercase 'x'.
    normalized = re.sub(r"[×Xx]\s*", " x ", raw)
    found = re.search(r"(\d+)\s*x\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)", normalized)
    if found is None:
        return None, None, None, raw
    count = int(found.group(1))
    try:
        item_qty: Optional[float] = float(found.group(2))
    except ValueError:
        item_qty = None
    return count, item_qty, found.group(3), raw
|
||||
16
market/scrape/product/helpers/text.py
Normal file
16
market/scrape/product/helpers/text.py
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from shared.utils import normalize_text
|
||||
from shared.config import config
|
||||
|
||||
def clean_title(t: str) -> str:
    """Normalize heading text and drop any trailing colon."""
    return re.sub(r":\s*$", "", normalize_text(t))
|
||||
|
||||
def is_blacklisted_heading(title: str) -> bool:
    """
    Return True when the heading matches — case- and whitespace-
    insensitively — any entry of the config blacklist under
    blacklist.product-details.
    """
    blacklist = (config().get("blacklist") or {}).get("product-details") or []
    needle = (title or "").strip().lower()
    return needle in {(entry or "").strip().lower() for entry in blacklist}
|
||||
48
market/scrape/product/product_core.py
Normal file
48
market/scrape/product/product_core.py
Normal file
@@ -0,0 +1,48 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Tuple, Union
|
||||
from shared.utils import soup_of
|
||||
from ..http_client import fetch
|
||||
from ..html_utils import absolutize_fragment
|
||||
from bp.browse.services.slugs import product_slug_from_href
|
||||
from .registry import REGISTRY, merge_missing
|
||||
from . import extractors as _auto_register # noqa: F401 (import-time side effects)
|
||||
|
||||
async def scrape_product_detail(product_url: str, include_html: bool = False) -> Union[dict, Tuple[dict, str]]:
    """
    Scrape a product detail page into a flat dict.

    Returned fields (subset): title, images, image, description_short,
    description_html, sections, slug, suma_href, stickers, labels,
    info_table fields, oe_list_price, prices, breadcrumbs-derived
    category_* fields.

    If include_html=True, returns (data, html) instead of just data.
    """
    html = await fetch(product_url)

    result: Dict[str, Union[str, float, int, list, dict, None]] = {
        "suma_href": product_url,
        "slug": product_slug_from_href(product_url),
    }

    # Run every registered extractor; each contributes only fields that are
    # still missing. The soup is re-parsed per extractor so one extractor
    # mutating the tree cannot affect the next.
    for extract in REGISTRY:
        try:
            fragment = extract(soup_of(html), product_url) or {}
        except Exception:
            # Tolerate site drift: a broken extractor must not sink the rest.
            continue
        merge_missing(result, fragment)

    # If we found a short description but no HTML one, echo the short text.
    if not result.get("description_html") and result.get("description_short"):
        result["description_html"] = absolutize_fragment(f"<p>{result['description_short']}</p>")

    # Mirror the first gallery image into "image" when it was never set.
    if not result.get("image"):
        gallery = result.get("images") or []
        if isinstance(gallery, list) and gallery:
            result["image"] = gallery[0]

    if include_html:
        return result, html
    return result
|
||||
4
market/scrape/product/product_detail.py
Normal file
4
market/scrape/product/product_detail.py
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
from __future__ import annotations
|
||||
# Thin wrapper to keep import path stable
|
||||
from .product_core import scrape_product_detail # re-export
|
||||
20
market/scrape/product/registry.py
Normal file
20
market/scrape/product/registry.py
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Callable, Dict, List, Union
|
||||
|
||||
# An extractor receives (soup, product_url) and returns a partial data dict.
Extractor = Callable[[object, str], Dict[str, Union[str, float, int, list, dict, None]]]

# Module-wide list populated at import time via the @extractor decorator.
REGISTRY: List[Extractor] = []


def extractor(fn: Extractor) -> Extractor:
    """Decorator: record *fn* in the global REGISTRY and hand it back unchanged."""
    REGISTRY.append(fn)
    return fn
|
||||
|
||||
def merge_missing(dst: dict, src: dict) -> None:
    """
    Merge src into dst in place, writing only keys that dst is missing
    or holds an "empty" value for. Empty means None, "", [], {}.
    """
    empties = (None, "", [], {})
    for key, value in (src or {}).items():
        # Skip keys dst already answers with a non-empty value.
        if key in dst and dst[key] not in empties:
            continue
        dst[key] = value
|
||||
Reference in New Issue
Block a user