feat: initialize market app with browsing, product, and scraping code

Split from coop monolith. Includes:
- Market/browse/product blueprints
- Product sync API
- Suma scraping pipeline
- Templates for market, browse, and product views
- Dockerfile and CI workflow for independent deployment
2026-02-09 23:16:34 +00:00
commit 6271a715a1
142 changed files with 8517 additions and 0 deletions
--- a/scrape/product/helpers/desc.py
+++ b/scrape/product/helpers/desc.py
@@ -0,0 +1,165 @@
+
+from __future__ import annotations
+from typing import Dict, List, Optional, Tuple
+from bs4 import BeautifulSoup, NavigableString, Tag
+from utils import normalize_text
+from ...html_utils import absolutize_fragment
+from .text import clean_title, is_blacklisted_heading
+from config import config
+
+
+def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
+    """
+    Split a description element into open HTML plus its accordion sections.
+
+    Every element matched by the 'accordion-title' selector is paired with
+    the first element sibling that follows it, provided that sibling carries
+    the 'accordion-details' class. Headings whose cleaned title is empty, or
+    whose next element sibling is something else, are left untouched.
+
+    Returns:
+      - open_html: what is left of the description once every paired heading
+        and details node has been removed, run through absolutize_fragment
+      - sections:  [{"title": ..., "html": ...}, ...] for each pair whose
+        details still hold text after normalize_text
+    """
+    # Re-parse the inner HTML so the caller's tree is never modified
+    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")
+
+    # First pass: only gather (heading, details) pairs; nothing is removed yet
+    selector = "#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"
+    matched: List[Tuple[Tag, Tag]] = []
+    for heading in frag.select(selector):
+        if not isinstance(heading, Tag):
+            continue
+        if not clean_title((heading.get_text() or "").strip()):
+            continue
+
+        # Only the very next element sibling is considered (text nodes are skipped)
+        following = next(
+            (node for node in heading.next_siblings if isinstance(node, Tag)),
+            None,
+        )
+        if following is None:
+            continue
+        if "accordion-details" in (following.get("class") or []):
+            matched.append((heading, following))
+
+    # Second pass: record each section, then drop both nodes from the copy
+    sections: List[Dict] = []
+    for heading, body in matched:
+        inner = body.decode_contents()
+        has_text = normalize_text(BeautifulSoup(inner, "lxml").get_text())
+        if has_text:
+            section = {
+                "title": clean_title(heading.get_text() or ""),
+                "html": absolutize_fragment(inner),
+            }
+            sections.append(section)
+        body.decompose()
+        heading.decompose()
+
+    # The remainder of the copy is the always-visible description
+    if frag:
+        open_html = absolutize_fragment(str(frag))
+    else:
+        open_html = ""
+
+    return open_html, sections
+
+def pair_title_content_from_magento_tabs(soup: BeautifulSoup):
+    """
+    Collect (title, html) pairs from a tabbed product-details container.
+
+    The container is '.product.data.items' (preferring the one nested under
+    '.product.info.detailed'). For every '.data.item.title' element inside
+    it, the content panel is located first by the id named in the title's
+    'aria-controls' or 'data-target' attribute, then, failing that, as the
+    next sibling carrying the 'data', 'item' and 'content' classes.
+
+    Returns a list of (title, html) tuples, where html is the panel's inner
+    HTML passed through absolutize_fragment. Titles that are empty or
+    blacklisted, and titles with no content panel, are skipped.
+    """
+    out = []
+    container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items")
+    if not container:
+        return out
+
+    def is_content_panel(node) -> bool:
+        """Return True for a tag carrying the 'data', 'item' and 'content' classes."""
+        if not isinstance(node, Tag):
+            return False
+        classes = node.get("class", [])
+        return "data" in classes and "item" in classes and "content" in classes
+
+    for t in container.select(".data.item.title"):
+        title = normalize_text(t.get_text())
+        if not title or is_blacklisted_heading(title):
+            continue
+
+        content = None
+        content_id = t.get("aria-controls") or t.get("data-target")
+        if content_id:
+            # Match on the id attribute value instead of building a "#id"
+            # CSS selector: an id containing "." would be parsed as classes,
+            # and a data-target that already starts with "#" would yield an
+            # invalid "##id" selector.
+            content = soup.find(id=str(content_id).lstrip("#"))
+        if content is None:
+            content = t.find_next_sibling(is_content_panel)
+        if content is None:
+            continue
+
+        out.append((title, absolutize_fragment(content.decode_contents())))
+    return out
+
+def scan_headings_for_sections(soup: BeautifulSoup):
+    """
+    Find description-like sections by scanning h2-h6 headings.
+
+    The search is limited to the first of '.product.info.detailed',
+    '.product-info-main' or '.page-main' that exists, else the whole soup.
+    A heading qualifies when its cleaned, lower-cased title contains one of
+    the configured 'section-titles' or one of the built-in keywords
+    ("product description", "description", "details"). Keywords are compared
+    case-insensitively.
+
+    For each qualifying, non-blacklisted heading, the following siblings up
+    to the next h2-h6 sibling are concatenated and passed through
+    absolutize_fragment.
+
+    Returns a list of (title, html) tuples; sections with empty html are
+    skipped.
+    """
+    heading_names = ("h2", "h3", "h4", "h5", "h6")
+    out = []
+    container = (
+        soup.select_one(".product.info.detailed")
+        or soup.select_one(".product-info-main")
+        or soup.select_one(".page-main")
+        or soup
+    )
+
+    # Build the keyword list once. Titles are lower-cased before matching, so
+    # the keywords must be lower-cased too or mixed-case config entries would
+    # never match. Blank entries are dropped because "" is a substring of
+    # every title.
+    configured = config().get("section-titles") or []
+    if isinstance(configured, str):
+        configured = [configured]
+    defaults = ["product description", "description", "details"]
+    keywords = [str(k).lower() for k in list(configured) + defaults if k]
+
+    for h in container.select("h2, h3, h4, h5, h6"):
+        title = clean_title(h.get_text() or "")
+        if not title:
+            continue
+        low = title.lower()
+        if not any(k in low for k in keywords):
+            continue
+        if is_blacklisted_heading(title):
+            continue
+
+        # Gather everything between this heading and the next sibling heading
+        parts: List[str] = []
+        for sib in h.next_siblings:
+            if isinstance(sib, Tag) and sib.name in heading_names:
+                break
+            if isinstance(sib, (NavigableString, Tag)):
+                parts.append(str(sib))
+
+        html = absolutize_fragment("".join(parts).strip())
+        if html:
+            out.append((title, html))
+    return out
+
+def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
+    """
+    Render the product's additional-attributes table as a block of HTML.
+
+    Each table row contributes a (key, value) pair: the key comes from the
+    row's <th> (or its first <td> when there is no <th>) and the value from
+    its last <td>. Rows with a missing or empty key or value are skipped, as
+    are rows whose only cell would serve as both key and value.
+
+    Returns the rendered HTML, or None when no table is found, no usable
+    rows exist, or extraction fails.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import escape
+
+    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
+    if not table:
+        return None
+    try:
+        rows = []
+        for tr in table.select("tr"):
+            label_cell = tr.find("th") or tr.find("td")
+            value_cells = tr.find_all("td")
+            if label_cell is None or not value_cells:
+                continue
+            value_cell = value_cells[-1]
+            if value_cell is label_cell:
+                # A lone <td> would otherwise be emitted as "X: X"
+                continue
+            key = normalize_text(label_cell.get_text())
+            val = normalize_text(value_cell.get_text())
+            if key and val:
+                rows.append((key, val))
+        if not rows:
+            return None
+        # get_text() yields decoded text from a scraped page, so it must be
+        # escaped again before being placed back into markup.
+        items = "\n".join(
+            [
+                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
+<div class='col-span-1 font-medium'>{escape(key)}</div>
+<div class='col-span-2 text-stone-700'>{escape(val)}</div>
+</div>"""
+                for key, val in rows
+            ]
+        )
+        return f"<div class='rounded-lg border bg-white'>{items}</div>"
+    except Exception:
+        # Deliberately best-effort: a malformed table must not abort the
+        # scrape of the rest of the product page.
+        return None
+
+def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
+    """
+    Locate the element that holds the product description.
+
+    A fixed list of selectors is tried in order, and the first match with
+    non-empty normalized text is returned. If none match, h2-h6 headings are
+    scanned for one whose text starts with "product description" or equals
+    "description"; everything after it, up to the next sibling heading, is
+    copied into a new detached <div>, which is returned if it contains text.
+
+    The fallback copies nodes rather than moving them, so the caller's
+    document is left intact.
+
+    Returns the matching Tag, or None when no description is found.
+    """
+    # Function-scope import keeps this block self-contained.
+    import copy
+
+    heading_names = ("h2", "h3", "h4", "h5", "h6")
+
+    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
+                ".product.attribute.overview .value", ".product.info.detailed .value"]:
+        el = soup.select_one(sel)
+        if el and normalize_text(el.get_text()):
+            return el
+
+    for h in soup.select("h2, h3, h4, h5, h6"):
+        txt = normalize_text(h.get_text()).lower()
+        if not (txt.startswith("product description") or txt == "description"):
+            continue
+        wrapper = soup.new_tag("div")
+        for sib in h.next_siblings:
+            if isinstance(sib, Tag) and sib.name in heading_names:
+                break
+            # Appending the live node would move it out of the document,
+            # which both mutates the caller's tree and breaks the sibling
+            # chain being iterated; append a copy instead.
+            wrapper.append(copy.copy(sib))
+        if normalize_text(wrapper.get_text()):
+            return wrapper
+    return None
--- a/scrape/product/helpers/html.py
+++ b/scrape/product/helpers/html.py
@@ -0,0 +1,53 @@
+
+from __future__ import annotations
+from typing import List, Optional
+from urllib.parse import urljoin, urlparse
+from config import config
+
+def first_from_srcset(val: str) -> Optional[str]:
+    """
+    Return the URL of the first candidate in a srcset-style value.
+
+    The value is cut at the first comma and the leading whitespace-separated
+    token of that piece is returned (dropping any width/density descriptor).
+    Returns None for a falsy value, and an empty string when the first
+    candidate is blank.
+    """
+    if not val:
+        return None
+    head = val.partition(",")[0].strip()
+    tokens = head.split()
+    if tokens:
+        return tokens[0]
+    return head
+
+def abs_url(u: Optional[str]) -> Optional[str]:
+    """
+    Resolve a URL that starts with "/" against the configured base URL.
+
+    Returns None for a falsy value. A string starting with "/" is joined
+    onto config()["base_url"]; anything else is returned unchanged.
+    """
+    if not u:
+        return None
+    if isinstance(u, str) and u.startswith("/"):
+        return urljoin(config()["base_url"], u)
+    return u
+
+def collect_img_candidates(el) -> List[str]:
+    """
+    Gather candidate image URLs from an element's attributes.
+
+    Plain URL attributes are read first, in a fixed order, followed by the
+    first candidate of 'srcset' and 'data-srcset'. Only truthy values are
+    kept. Returns an empty list for a falsy element.
+    """
+    urls: List[str] = []
+    if not el:
+        return urls
+
+    plain_attrs = ("src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href")
+    for name in plain_attrs:
+        value = el.get(name)
+        if not value:
+            continue
+        urls.append(value)
+
+    for name in ("srcset", "data-srcset"):
+        raw = el.get(name)
+        head = first_from_srcset(raw) if raw else None
+        if head:
+            urls.append(head)
+
+    return urls
+
+def _filename_key(u: str) -> str:
+    """
+    Build a lower-cased "<netloc>:<last path segment>" key for a URL.
+
+    A single trailing "/" is dropped from the path before the last segment
+    is taken; the query string and fragment play no part in the key.
+    """
+    parsed = urlparse(u)
+    path = parsed.path or ""
+    trimmed = path[:-1] if path.endswith("/") else path
+    filename = trimmed.rsplit("/", 1)[-1]
+    key = f"{parsed.netloc}:{filename}"
+    return key.lower()
+
+def dedup_by_filename(urls: List[str]) -> List[str]:
+    """
+    Drop URLs whose _filename_key repeats an earlier one.
+
+    The first URL seen for each key is kept, and the input order of the
+    survivors is preserved.
+    """
+    # dicts keep insertion order, and setdefault ignores later duplicates
+    first_by_key = {}
+    for url in urls:
+        first_by_key.setdefault(_filename_key(url), url)
+    return list(first_by_key.values())
--- a/scrape/product/helpers/price.py
+++ b/scrape/product/helpers/price.py
@@ -0,0 +1,42 @@
+
+from __future__ import annotations
+import re
+from typing import Optional, Tuple
+
+def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]:
+    """
+    Return (value, currency, raw) from a price-like string.
+
+    Supports symbols £, €, $; strips thousands commas. The number is the
+    first run of digits/commas with at most one decimal part, so trailing
+    punctuation ("£3.50.") is not swallowed into the number. A leading-dot
+    amount (".99") is accepted as well.
+
+    raw is the stripped input (empty string for None). value and currency
+    are None when no number is found; currency is also None when no symbol
+    precedes the number.
+    """
+    raw = (text or "").strip()
+    m = re.search(r"([£€$])?\s*([0-9][0-9,]*(?:\.[0-9]+)?|\.[0-9]+)", raw)
+    if not m:
+        return None, None, raw
+    # The pattern guarantees digits with at most one decimal point once the
+    # commas are gone, so float() cannot fail here.
+    value = float(m.group(2).replace(",", ""))
+    currency = {"£": "GBP", "€": "EUR", "$": "USD"}.get(m.group(1))
+    return value, currency, raw
+
+def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]:
+    """
+    Parse strings like "6 x 500g", "12x1L", "24 × 330 ml"
+    Returns (count, item_qty, item_unit, raw)
+
+    raw is the stripped input (empty string for None). The other three
+    fields are None when the text does not contain a
+    "<count> x <quantity><unit>" pattern. The multiplication sign may be
+    "x", "X" or "×".
+    """
+    raw = (text or "").strip()
+    if not raw:
+        return None, None, None, raw
+    # Match the separator in place rather than rewriting every x/X in the
+    # string first: a blanket substitution also rewrites letters inside the
+    # unit (e.g. "box" would come back as "bo").
+    m = re.search(r"(\d+)\s*[×xX]\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)", raw)
+    if not m:
+        return None, None, None, raw
+    count = int(m.group(1))
+    # The quantity group only admits digits with an optional single decimal
+    # point, so float() cannot fail here.
+    item_qty = float(m.group(2))
+    unit = m.group(3)
+    return count, item_qty, unit, raw
--- a/scrape/product/helpers/text.py
+++ b/scrape/product/helpers/text.py
@@ -0,0 +1,16 @@
+
+from __future__ import annotations
+import re
+from utils import normalize_text
+from config import config
+
+def clean_title(t: str) -> str:
+    """Normalize a title and strip one trailing colon (with any whitespace after it)."""
+    normalized = normalize_text(t)
+    return re.sub(r":\s*$", "", normalized)
+
+def is_blacklisted_heading(title: str) -> bool:
+    """
+    Tell whether a heading appears in the configured product-details blacklist.
+
+    The comparison ignores case and surrounding whitespace on both sides.
+    A missing or empty blacklist means nothing is blacklisted.
+    """
+    blacklist_cfg = config().get("blacklist") or {}
+    entries = blacklist_cfg.get("product-details") or []
+    wanted = (title or "").strip().lower()
+    for entry in entries:
+        if (entry or "").strip().lower() == wanted:
+            return True
+    return False