feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from the coop monolith. Includes: market/browse/product blueprints; the product sync API; the Suma scraping pipeline; templates for market, browse, and product views; and a Dockerfile plus CI workflow for independent deployment.
This commit is contained in:
1
scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py
Normal file
1
scrape/build_snapshot/tools/APP_ROOT_PLACEHOLDER.py
Normal file
@@ -0,0 +1 @@
|
||||
# Sentinel token embedded into rewritten hrefs at snapshot-build time; the
# serving layer substitutes the real application root for it at render time.
APP_ROOT_PLACEHOLDER = "[**__APP_ROOT__**]"
|
||||
1
scrape/build_snapshot/tools/__init__.py
Normal file
1
scrape/build_snapshot/tools/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
6
scrape/build_snapshot/tools/_anchor_text.py
Normal file
6
scrape/build_snapshot/tools/_anchor_text.py
Normal file
@@ -0,0 +1,6 @@
|
||||
def _anchor_text(a) -> str:
|
||||
try:
|
||||
txt = " ".join((a.get_text(" ") or "").split())
|
||||
return txt[:200]
|
||||
except Exception:
|
||||
return ""
|
||||
16
scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
16
scrape/build_snapshot/tools/_collect_html_img_srcs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Optional
|
||||
|
||||
def _collect_html_img_srcs(html: Optional[str]) -> List[str]:
|
||||
urls: List[str] = []
|
||||
if not html:
|
||||
return urls
|
||||
try:
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
for img in soup.find_all("img"):
|
||||
src = img.get("src")
|
||||
if src:
|
||||
urls.append(src)
|
||||
except Exception:
|
||||
pass
|
||||
return urls
|
||||
14
scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
14
scrape/build_snapshot/tools/_dedupe_preserve_order.py
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
from typing import Iterable, List, Set
|
||||
|
||||
def _dedupe_preserve_order(urls: Iterable[str]) -> List[str]:
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for u in urls:
|
||||
if not u or not isinstance(u, str):
|
||||
continue
|
||||
if u in seen:
|
||||
continue
|
||||
seen.add(u)
|
||||
out.append(u)
|
||||
return out
|
||||
32
scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
32
scrape/build_snapshot/tools/_product_dict_is_cf.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from typing import Dict,Optional, Tuple
|
||||
|
||||
_CF_TOKENS = (
|
||||
"One moment, please...",
|
||||
"Please wait while your request is being verified",
|
||||
"/cdn-cgi/challenge-platform/",
|
||||
"rocket-loader.min.js",
|
||||
)
|
||||
|
||||
def _looks_like_cf_html(html: Optional[str]) -> Tuple[bool, Optional[str]]:
|
||||
if not html:
|
||||
return False, None
|
||||
for tok in _CF_TOKENS:
|
||||
if tok in html:
|
||||
return True, tok
|
||||
return False, None
|
||||
|
||||
def _product_dict_is_cf(d: Dict) -> Tuple[bool, Optional[str]]:
|
||||
title = (d.get("title") or "").strip()
|
||||
if title.lower() == "one moment, please...":
|
||||
return True, "One moment, please..."
|
||||
ok, tok = _looks_like_cf_html(d.get("description_html"))
|
||||
if ok:
|
||||
return True, tok
|
||||
for sec in d.get("sections") or []:
|
||||
if isinstance(sec, dict) and sec.get("html"):
|
||||
ok2, tok2 = _looks_like_cf_html(sec["html"])
|
||||
if ok2:
|
||||
return True, tok2
|
||||
if not d.get("images") and not d.get("description_html") and not d.get("sections"):
|
||||
return True, "all_empty_heuristic"
|
||||
return False, None
|
||||
34
scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
34
scrape/build_snapshot/tools/_resolve_sub_redirects.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Dict, Set
|
||||
from urllib.parse import urlparse, urljoin
|
||||
import httpx
|
||||
|
||||
|
||||
async def _resolve_sub_redirects(
|
||||
base_url: str,
|
||||
candidates: Set[str],
|
||||
allowed_tops: Set[str],
|
||||
valid_subs_by_top: Dict[str, Set[str]],
|
||||
) -> Dict[str, str]:
|
||||
mapping: Dict[str, str] = {}
|
||||
if not candidates:
|
||||
return mapping
|
||||
timeout = httpx.Timeout(20.0, connect=10.0)
|
||||
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout, http2=True) as client:
|
||||
for path in sorted(candidates):
|
||||
try:
|
||||
url = urljoin(base_url, path)
|
||||
r = await client.get(url)
|
||||
final = str(r.url)
|
||||
p = urlparse(final)
|
||||
parts = [x for x in (p.path or "").split("/") if x]
|
||||
if len(parts) >= 2:
|
||||
top_new = parts[0].lower()
|
||||
sub_new = parts[1].lower().removesuffix(".html").removesuffix(".htm")
|
||||
if top_new in allowed_tops:
|
||||
new_path = f"/{top_new}/{sub_new}"
|
||||
if new_path != path:
|
||||
mapping[path] = new_path
|
||||
valid_subs_by_top.setdefault(top_new, set()).add(sub_new)
|
||||
except Exception:
|
||||
continue
|
||||
return mapping
|
||||
100
scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
100
scrape/build_snapshot/tools/_rewrite_links_fragment.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from typing import Dict, List, Optional, Set
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
from ._anchor_text import _anchor_text
|
||||
from suma_browser.app.bp.browse.services.slugs import product_slug_from_href
|
||||
from .APP_ROOT_PLACEHOLDER import APP_ROOT_PLACEHOLDER
|
||||
|
||||
def _rewrite_links_fragment(
    html: Optional[str],
    base_url: str,
    known_slugs: Set[str],
    category_allow_values: Set[str],
    valid_subs_by_top: Dict[str, Set[str]],
    current_product_slug: str,
    link_errors: List[Dict],
    link_externals: List[Dict],
    unknown_sub_paths: Set[str],
) -> str:
    """Rewrite every ``<a href>`` in an HTML fragment to an app-relative URL.

    Same-host product links (``*.html``/``*.htm`` leaf) become
    ``{APP_ROOT_PLACEHOLDER}/product/<slug>`` when the slug is in
    *known_slugs*; same-host category links become
    ``{APP_ROOT_PLACEHOLDER}/<top>[/<sub>]``.  Off-host links are reported to
    *link_externals* and left untouched; unmappable links are reported to
    *link_errors*; category links whose sub is not (yet) known are still
    rewritten optimistically and collected in *unknown_sub_paths* so a later
    redirect-resolution pass can validate them.  All three report collections
    are mutated in place.

    Returns the rewritten fragment with lxml's ``<html>``/``<body>`` wrappers
    removed, or ``""`` for falsy input.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, "lxml")
    base_host = urlparse(base_url).netloc

    for a in soup.find_all("a", href=True):
        raw = (a.get("href") or "").strip()
        if not raw:
            continue
        low = raw.lower()
        # Skip non-navigational schemes and in-page fragment links.
        if low.startswith(("mailto:", "tel:", "javascript:", "data:")) or low.startswith("#"):
            continue
        # Resolve relative hrefs against the site base before classifying.
        abs_href = urljoin(base_url, raw)
        p = urlparse(abs_href)
        if not p.scheme or not p.netloc:
            continue
        # Off-host: report, do not rewrite.
        if p.netloc != base_host:
            link_externals.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "host": p.netloc,
            })
            continue
        parts = [x for x in (p.path or "").split("/") if x]
        if not parts:
            continue
        last = parts[-1].lower()
        # A *.html/*.htm leaf is treated as a product-detail link.
        if last.endswith((".html", ".htm")):
            target_slug = product_slug_from_href(abs_href)
            if target_slug and target_slug in known_slugs:
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/product/{target_slug}"
            else:
                # Product page we never scraped (or slug extraction failed).
                link_errors.append({
                    "product": current_product_slug,
                    "href": abs_href,
                    "text": _anchor_text(a),
                    "top": None,
                    "sub": None,
                    "target_slug": target_slug or None,
                    "type": "suma_product_unknown",
                })
            continue
        top = parts[0].lower()
        if top in category_allow_values:
            if len(parts) == 1:
                # Top-level category landing page.
                a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}"
            else:
                sub = parts[1]
                if sub.lower().endswith((".html", ".htm")):
                    sub = sub.rsplit(".", 1)[0]
                # NOTE(review): `sub` keeps its original case here while only
                # the extension check lowercases — assumes valid_subs_by_top
                # entries (and site sub slugs) are already lowercase; confirm,
                # else mixed-case subs fall into the "pending" branch below.
                if sub in (valid_subs_by_top.get(top) or set()):
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}/{top}/{sub}"
                else:
                    # Unknown sub: rewrite optimistically and remember the
                    # path so a later pass can resolve/validate it.
                    unknown_path = f"/{top}/{sub}"
                    unknown_sub_paths.add(unknown_path)
                    a["href"] = f"{APP_ROOT_PLACEHOLDER}{unknown_path}"
                    link_errors.append({
                        "product": current_product_slug,
                        "href": abs_href,
                        "text": _anchor_text(a),
                        "top": top,
                        "sub": sub,
                        "target_slug": None,
                        "type": "suma_category_invalid_sub_pending",
                    })
        else:
            # Same-host path outside the allowed category set.
            link_errors.append({
                "product": current_product_slug,
                "href": abs_href,
                "text": _anchor_text(a),
                "top": top,
                "sub": parts[1] if len(parts) > 1 else None,
                "target_slug": None,
                "type": "suma_other",
            })

    # lxml wraps fragments in <html>/<body>; unwrap so the result stays a fragment.
    for t in soup.find_all(["html", "body"]):
        t.unwrap()
    return "".join(str(c) for c in soup.contents).strip()
|
||||
|
||||
14
scrape/build_snapshot/tools/candidate_subs.py
Normal file
14
scrape/build_snapshot/tools/candidate_subs.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
def candidate_subs(nav: Dict[str, Dict]) -> Set[str]:
    """Collect every ``/<top>/<sub>`` path present in the scraped nav tree."""
    paths: Set[str] = set()
    for cat in (nav.get("cats") or {}).values():
        top = (cat or {}).get("slug")
        if not top:
            continue
        for entry in (cat.get("subs") or []):
            sub = (entry.get("slug") or "").strip()
            if sub:
                paths.add(f"/{top}/{sub}")
    return paths
|
||||
|
||||
18
scrape/build_snapshot/tools/capture_category.py
Normal file
18
scrape/build_snapshot/tools/capture_category.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from urllib.parse import urljoin
|
||||
from config import config
|
||||
from utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_category(
    slug: str,
):
    """Scrape every listing page of one top-level category.

    Returns ``(list_url, items, total_pages)`` where *items* aggregates the
    products from all pages reported by page 1.
    """
    list_url = urljoin(config()["base_url"], f"/{slug}")
    log(f"[{slug}] page 1…")
    items, total_pages = await scrape_products(list_url, page=1)

    last_page = int(total_pages or 1)
    page = 2
    while page <= last_page:
        log(f"[{slug}] page {page}…")
        more, _ = await scrape_products(list_url, page=page)
        items.extend(more)
        page += 1
    return (list_url, items, total_pages)
|
||||
25
scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
25
scrape/build_snapshot/tools/capture_product_slugs.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from typing import Dict, Set
|
||||
from .capture_category import capture_category
|
||||
from .capture_sub import capture_sub
|
||||
from config import config
|
||||
|
||||
|
||||
async def capture_product_slugs(
    nav: Dict[str, Dict],
    capture_listing,
):
    """Capture every allowed category (and its nav sub-categories).

    For each listing captured, forwards ``(url, items, total_pages)`` to
    *capture_listing*, and accumulates the union of product slugs.

    Returns the set of all product slugs seen across every listing.
    """
    product_slugs: Set[str] = set()
    for label, slug in config()["categories"]["allow"].items():
        lpars = await capture_category(slug)
        await capture_listing(*lpars)
        (_, items, __) = lpars
        product_slugs.update(items)
        for sub in (nav["cats"].get(label, {}).get("subs", []) or []):
            lpars = await capture_sub(sub, slug)
            # BUG FIX: capture_sub returns None when the sub entry has no
            # slug; skip it instead of crashing on `capture_listing(*None)`.
            if lpars is None:
                continue
            await capture_listing(*lpars)
            (_, items, __) = lpars
            product_slugs.update(items)
    return product_slugs
|
||||
|
||||
22
scrape/build_snapshot/tools/capture_sub.py
Normal file
22
scrape/build_snapshot/tools/capture_sub.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urljoin
|
||||
from config import config
|
||||
from utils import log
|
||||
from ...listings import scrape_products
|
||||
|
||||
async def capture_sub(
    sub,
    slug,
):
    """Scrape every listing page of one sub-category.

    Returns ``(sub_url, items, total_pages)``, or None when *sub* carries
    no slug.
    """
    sub_slug = sub.get("slug")
    if not sub_slug:
        return
    sub_url = urljoin(config()["base_url"], f"/{slug}/{sub_slug}")
    log(f"[{slug}/{sub_slug}] page 1…")
    items, total_pages = await scrape_products(sub_url, page=1)
    last_page = int(total_pages or 1)
    page = 2
    while page <= last_page:
        log(f"[{slug}/{sub_slug}] page {page}…")
        more, _ = await scrape_products(sub_url, page=page)
        items.extend(more)
        page += 1
    return (sub_url, items, total_pages)
|
||||
106
scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
106
scrape/build_snapshot/tools/fetch_and_upsert_product.py
Normal file
@@ -0,0 +1,106 @@
|
||||
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
from ...html_utils import to_fragment
|
||||
from suma_browser.app.bp.browse.services.slugs import suma_href_from_html_slug
|
||||
|
||||
|
||||
from config import config
|
||||
|
||||
from utils import log
|
||||
|
||||
# DB: persistence helpers
|
||||
from ...product.product_detail import scrape_product_detail
|
||||
from ._product_dict_is_cf import _product_dict_is_cf
|
||||
from ._rewrite_links_fragment import _rewrite_links_fragment
|
||||
from ._dedupe_preserve_order import _dedupe_preserve_order
|
||||
from ._collect_html_img_srcs import _collect_html_img_srcs
|
||||
|
||||
|
||||
async def fetch_and_upsert_product(
    upsert_product,
    log_product_result,
    sem: asyncio.Semaphore,
    slug: str,
    product_slugs,
    category_values,
    valid_subs_by_top,
    link_errors,
    link_externals,
    unknown_sub_paths
) -> bool:
    """Fetch one product's detail page, normalize it, and persist it.

    Pipeline: scrape the detail page under *sem*; bail out (logging a
    failure) when the page looks like a Cloudflare interstitial; rewrite
    embedded links in the description and section HTML, appending any
    problems to the shared *link_errors* / *link_externals* /
    *unknown_sub_paths* collections; dedupe gallery and embedded image URLs;
    then hand the enriched dict to *upsert_product*.

    Returns True on success.  Failures are never raised — they are reported
    via *log_product_result* and False is returned.
    """
    href = suma_href_from_html_slug(slug)
    try:
        # Limit concurrent detail fetches via the shared semaphore.
        async with sem:
            d = await scrape_product_detail(href)

        # Treat Cloudflare challenge pages as failures, not product data.
        is_cf, cf_token = _product_dict_is_cf(d)
        if is_cf:
            payload = {
                "slug": slug,
                "href_tried": href,
                "error_type": "CloudflareChallengeDetected",
                "error_message": f"Detected Cloudflare interstitial via token: {cf_token}",
                "cf_token": cf_token,
            }
            await log_product_result(ok=False, payload=payload)
            log(f" ! CF challenge detected: {slug} ({cf_token})")
            return False

        # Rewrite embedded links; collect reports
        if d.get("description_html"):
            d["description_html"] = _rewrite_links_fragment(
                d["description_html"], config()["base_url"], product_slugs, category_values,
                valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
            )
            d["description_html"] = to_fragment(d["description_html"])
        if d.get("sections"):
            for sec in d["sections"]:
                if isinstance(sec, dict) and sec.get("html"):
                    sec["html"] = _rewrite_links_fragment(
                        sec["html"], config()["base_url"], product_slugs, category_values,
                        valid_subs_by_top, slug, link_errors, link_externals, unknown_sub_paths
                    )
                    sec["html"] = to_fragment(sec["html"])

        # Images
        # Gallery images come from the scraper; embedded ones are pulled out
        # of the (already rewritten) HTML.  All lists are order-preserving
        # deduped, with gallery images taking precedence in the combined list.
        gallery = _dedupe_preserve_order(d.get("images") or [])
        embedded: List[str] = []
        if d.get("description_html"):
            embedded += _collect_html_img_srcs(d["description_html"])
        for sec in d.get("sections", []) or []:
            if isinstance(sec, dict) and sec.get("html"):
                embedded += _collect_html_img_srcs(sec["html"])
        embedded = _dedupe_preserve_order(embedded)
        all_imgs = _dedupe_preserve_order(list(gallery) + list(embedded))

        d["images"] = gallery
        d["embedded_image_urls"] = embedded
        d["all_image_urls"] = all_imgs
        await upsert_product(slug, href, d)
        # DB: upsert product + success log
        return True
    except Exception as e:
        payload = {
            "slug": slug,
            "href_tried": href,
            "error_type": e.__class__.__name__,
            "error_message": str(e),
        }
        # Best-effort enrichment of the failure payload with HTTP specifics;
        # never let diagnostics raise.
        try:
            if isinstance(e, httpx.HTTPStatusError):
                payload["http_status"] = getattr(e.response, "status_code", None)
                req = getattr(e, "request", None)
                if req is not None and getattr(req, "url", None) is not None:
                    payload["final_url"] = str(req.url)
            elif isinstance(e, httpx.TransportError):
                payload["transport_error"] = True
        except Exception:
            pass
        await log_product_result(ok=False, payload=payload)
        log(f" ! product failed: {slug} ({e})")
        return False
|
||||
49
scrape/build_snapshot/tools/fetch_and_upsert_products.py
Normal file
49
scrape/build_snapshot/tools/fetch_and_upsert_products.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import asyncio
|
||||
from typing import Dict, List, Set
|
||||
from config import config
|
||||
from utils import log
|
||||
from .fetch_and_upsert_product import fetch_and_upsert_product
|
||||
|
||||
|
||||
async def fetch_and_upsert_products(
    upsert_product,
    log_product_result,
    save_link_reports = None,
    concurrency: int = 8,
    product_slugs: "Set[str]" = None,
    valid_subs_by_top: "Dict[str, Set[str]]" = None,
    unknown_sub_paths: "Set[str]" = None
):
    """Fetch all product details concurrently and upsert each one.

    Spawns one task per slug in *product_slugs*, bounded by *concurrency*,
    logging progress every 50 completions.  When *save_link_reports* is
    provided, the accumulated link-error and external-link reports are
    persisted at the end.
    """
    # BUG FIX: the previous signature used mutable defaults (set()/{}),
    # which are shared across calls and silently accumulate state.  None is
    # now the default and fresh containers are created per call.
    if product_slugs is None:
        product_slugs = set()
    if valid_subs_by_top is None:
        valid_subs_by_top = {}
    if unknown_sub_paths is None:
        unknown_sub_paths = set()

    sem = asyncio.Semaphore(max(1, concurrency))
    link_errors: List[Dict] = []
    link_externals: List[Dict] = []

    category_values: Set[str] = set(config()["categories"]["allow"].values())
    to_fetch = sorted(product_slugs)
    log(f"Fetching {len(to_fetch)} product details (concurrency={concurrency})…")
    tasks = [
        asyncio.create_task(
            fetch_and_upsert_product(
                upsert_product,
                log_product_result,
                sem,
                s,
                product_slugs,
                category_values,
                valid_subs_by_top,
                link_errors,
                link_externals,
                unknown_sub_paths,
            )
        )
        for s in to_fetch
    ]
    done = 0
    ok_count = 0
    for coro in asyncio.as_completed(tasks):
        ok = await coro
        done += 1
        if ok:
            ok_count += 1
        if done % 50 == 0 or done == len(tasks):
            log(f" …{done}/{len(tasks)} saved (ok={ok_count})")
    if save_link_reports:
        await save_link_reports(link_errors, link_externals)
|
||||
|
||||
24
scrape/build_snapshot/tools/rewrite_nav.py
Normal file
24
scrape/build_snapshot/tools/rewrite_nav.py
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
from typing import Dict
|
||||
from urllib.parse import urljoin
|
||||
from config import config
|
||||
|
||||
def rewrite_nav(nav: Dict[str, Dict], nav_redirects: Dict[str, str]):
    """Rewrite nav sub-category slugs/hrefs to their canonical redirect targets.

    Mutates *nav* in place; sub entries without a slug are dropped.
    No-op when *nav_redirects* is empty.
    """
    if not nav_redirects:
        return
    for cat in (nav.get("cats") or {}).values():
        top_slug = (cat or {}).get("slug")
        if not top_slug:
            continue
        rewritten = []
        for entry in (cat.get("subs") or []):
            sub_slug = (entry.get("slug") or "").strip()
            if not sub_slug:
                continue
            old_path = f"/{top_slug}/{sub_slug}"
            canonical = nav_redirects.get(old_path, old_path)
            segs = [seg for seg in canonical.split("/") if seg]
            top2, sub2 = segs[0], segs[1]
            entry["slug"] = sub2
            entry["href"] = urljoin(config()["base_url"], f"/{top2}/{sub2}")
            rewritten.append(entry)
        cat["subs"] = rewritten
|
||||
16
scrape/build_snapshot/tools/valid_subs.py
Normal file
16
scrape/build_snapshot/tools/valid_subs.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
# Build the set of valid sub-slugs for each top-level category in nav.
|
||||
def valid_subs(nav: Dict[str, Dict]) -> Dict[str, Set[str]]:
    """Map each top-level nav slug to the set of its (stripped) sub-category slugs."""
    by_top: Dict[str, Set[str]] = {}
    for cat in (nav.get("cats") or {}).values():
        top = (cat or {}).get("slug")
        if top:
            by_top[top] = {
                (entry.get("slug") or "").strip()
                for entry in (cat.get("subs") or [])
                if entry.get("slug")
            }
    return by_top
|
||||
Reference in New Issue
Block a user