feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
This commit is contained in:
165
scrape/product/helpers/desc.py
Normal file
165
scrape/product/helpers/desc.py
Normal file
@@ -0,0 +1,165 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
from utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from .text import clean_title, is_blacklisted_heading
|
||||
from config import config
|
||||
|
||||
|
||||
def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Split a description container into free-form HTML and accordion sections.

    Every '.accordion-title' heading with a non-empty cleaned title is paired
    with the first later sibling element that carries the 'accordion-details'
    class. Returns a tuple of:
    - open_html: the description HTML left once the paired blocks are removed
    - sections: [{"title": ..., "html": ...}, ...] for details containing text
    """
    # Re-parse the inner HTML so all removals happen on a private copy.
    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")

    def _details_for(heading: Tag) -> Optional[Tag]:
        # First following sibling element with the 'accordion-details' class;
        # text nodes and other elements in between are skipped.
        return next(
            (
                node
                for node in heading.next_siblings
                if isinstance(node, Tag)
                and "accordion-details" in (node.get("class") or [])
            ),
            None,
        )

    # Phase 1: gather (heading, details) pairs; nothing is removed yet so the
    # selection is not disturbed while it is being walked.
    matched: List[Tuple[Tag, Tag]] = []
    for heading in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"):
        if not isinstance(heading, Tag):
            continue
        if not clean_title((heading.get_text() or "").strip()):
            continue
        body = _details_for(heading)
        if body is not None:
            matched.append((heading, body))

    # Phase 2: record each section that has textual content, then drop both
    # nodes from the copy regardless of whether a section was recorded.
    sections: List[Dict] = []
    for heading, body in matched:
        inner = body.decode_contents()
        has_text = normalize_text(BeautifulSoup(inner, "lxml").get_text())
        if has_text:
            sections.append({
                "title": clean_title(heading.get_text() or ""),
                "html": absolutize_fragment(inner),
            })
        body.decompose()
        heading.decompose()

    # What is left of the copy is the "open" description.
    open_html = absolutize_fragment(str(frag)) if frag else ""
    return open_html, sections
|
||||
|
||||
def pair_title_content_from_magento_tabs(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """
    Pair each Magento tab title with the HTML of its content panel.

    The tab container is '.product.data.items' (preferably the one inside
    '.product.info.detailed'). For every '.data.item.title' element with
    non-empty text, the panel is located by the id named in its
    'aria-controls' / 'data-target' attribute, falling back to the next
    sibling element whose classes include 'data', 'item' and 'content'.

    Returns a list of (title, html) tuples; titles rejected by
    is_blacklisted_heading() and tabs without a panel are left out.
    """
    out: List[Tuple[str, str]] = []
    container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items")
    if not container:
        return out

    def _is_tab_content(node) -> bool:
        # Sibling fallback: an element carrying all three panel classes.
        if not isinstance(node, Tag):
            return False
        classes = node.get("class") or []
        return all(name in classes for name in ("data", "item", "content"))

    for tab in container.select(".data.item.title"):
        title = normalize_text(tab.get_text())
        if not title:
            continue

        ref = tab.get("aria-controls") or tab.get("data-target")
        content = None
        if ref:
            # Look the id up literally instead of splicing it into a CSS
            # selector: ids such as "reviews.tab" would be read as id + class,
            # and values like "#desc" or "123" are selector syntax errors.
            target_id = str(ref).strip().lstrip("#")
            if target_id:
                content = soup.find(id=target_id)
        if content is None:
            content = tab.find_next_sibling(_is_tab_content)

        if content is None or is_blacklisted_heading(title):
            continue
        out.append((title, absolutize_fragment(content.decode_contents())))
    return out
|
||||
|
||||
def scan_headings_for_sections(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """
    Collect (title, html) sections by scanning h2-h6 headings.

    The search area is the first match of '.product.info.detailed',
    '.product-info-main' or '.page-main', else the whole soup. A heading
    qualifies when its cleaned, lower-cased title contains one of the
    configured "section-titles" keywords or one of the built-in keywords
    ("product description", "description", "details"). Its section body is
    every following sibling up to, but excluding, the next h2-h6 sibling.

    Headings rejected by is_blacklisted_heading() and sections whose HTML is
    empty are skipped.
    """
    heading_tags = ("h2", "h3", "h4", "h5", "h6")
    out: List[Tuple[str, str]] = []
    container = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )

    # Build the keyword list once. Titles are compared lower-cased, so the
    # configured keywords are lower-cased too; empty entries are dropped
    # because "" is a substring of every title.
    configured = config().get("section-titles") or []
    keywords = [str(k).lower() for k in configured if k]
    keywords += ["product description", "description", "details"]

    for h in container.select("h2, h3, h4, h5, h6"):
        title = clean_title(h.get_text() or "")
        if not title:
            continue
        low = title.lower()
        if not any(k in low for k in keywords):
            continue
        if is_blacklisted_heading(title):
            continue

        parts: List[str] = []
        for sib in h.next_siblings:
            if isinstance(sib, Tag):
                if sib.name in heading_tags:
                    break
                parts.append(str(sib))
            elif isinstance(sib, NavigableString):
                # output_ready() re-applies entity escaping and keeps comment
                # delimiters; str() would emit raw "<"/"&" and bare comment
                # bodies into the HTML fragment.
                parts.append(sib.output_ready())

        html = absolutize_fragment("".join(parts).strip())
        if html:
            out.append((title, html))
    return out
|
||||
|
||||
def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """
    Render the product's "additional attributes" table as a grid of divs.

    Each table row contributes a (key, value) pair: the key is the text of
    the row's <th> (or its first <td> when there is no <th>), the value is
    the text of its last <td>. Rows missing either text, or consisting of a
    single cell, are ignored.

    Returns the HTML string, or None when no table is found, no usable rows
    exist, or anything goes wrong while reading the table (best-effort).
    """
    # Local import of the stdlib escaper; as an absolute import it is not the
    # sibling helpers/html.py module.
    from html import escape

    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            key_cell = tr.find("th") or tr.find("td")
            value_cells = tr.find_all("td")
            if key_cell is None or not value_cells:
                continue
            value_cell = value_cells[-1]
            # A lone <td> would otherwise serve as both key and value.
            if value_cell is key_cell:
                continue
            key = normalize_text(key_cell.get_text())
            val = normalize_text(value_cell.get_text())
            if key and val:
                # get_text() yields decoded text; escape it before it is
                # interpolated into markup again.
                rows.append((escape(key), escape(val)))
        if not rows:
            return None
        items = "\n".join(
            "<div class='grid grid-cols-3 gap-2 py-1 border-b'>\n"
            f"<div class='col-span-1 font-medium'>{key}</div>\n"
            f"<div class='col-span-2 text-stone-700'>{val}</div>\n"
            "</div>"
            for key, val in rows
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        # Deliberate best-effort: a malformed table must not abort the scrape.
        return None
|
||||
|
||||
def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """
    Locate the element holding the product description.

    First tries a fixed list of selectors and returns the first match that
    has non-empty text. Otherwise looks for an h2-h6 heading whose text
    starts with "product description" or equals "description" and returns a
    new <div> wrapping everything between that heading and the next h2-h6
    sibling.

    NOTE: in the heading fallback, sibling elements are moved into the
    wrapper (text nodes are copied), so the soup is modified.

    Returns None when nothing with text is found.
    """
    heading_tags = ("h2", "h3", "h4", "h5", "h6")
    selectors = (
        "#description",
        "#tab-description",
        ".product.attribute.description .value",
        ".product.attribute.overview .value",
        ".product.info.detailed .value",
    )
    for sel in selectors:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el

    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if not (txt.startswith("product description") or txt == "description"):
            continue
        wrapper = soup.new_tag("div")
        # Snapshot the siblings first: appending a Tag to the wrapper
        # re-parents it, which changes its next_sibling and can cut a live
        # sibling iteration short.
        for sib in list(h.next_siblings):
            if isinstance(sib, Tag):
                if sib.name in heading_tags:
                    break
                wrapper.append(sib)
            else:
                # Copy the string with its own node class so that comments
                # stay comments instead of turning into visible text.
                wrapper.append(type(sib)(str(sib)))
        if normalize_text(wrapper.get_text()):
            return wrapper
    return None
|
||||
53
scrape/product/helpers/html.py
Normal file
53
scrape/product/helpers/html.py
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from config import config
|
||||
|
||||
def first_from_srcset(val: str) -> Optional[str]:
    """
    Return the URL of the first candidate in a srcset attribute value.

    The URL is the first whitespace-delimited token, with separator commas
    stripped from its ends. Splitting on whitespace rather than on "," keeps
    URLs that themselves contain commas (e.g. ".../w_400,c_fill/a.jpg 400w")
    intact.

    Returns None when the value is empty or holds no URL.
    """
    if not val:
        return None
    for token in val.split():
        url = token.strip(",")
        if url:
            return url
    return None
|
||||
|
||||
def abs_url(u: Optional[str]) -> Optional[str]:
    """
    Resolve a URL that starts with "/" against the configured base URL.

    Falsy input yields None. Any other value (including relative paths that
    do not start with "/") is returned unchanged.
    """
    if not u:
        return None
    if isinstance(u, str) and u.startswith("/"):
        return urljoin(config()["base_url"], u)
    return u
|
||||
|
||||
def collect_img_candidates(el) -> List[str]:
    """
    Gather candidate image URLs from an element's attributes.

    Non-empty values of the plain URL attributes come first, in a fixed
    order, followed by the first candidate of 'srcset' and 'data-srcset'.
    A falsy element yields an empty list. `el` only needs a `.get(name)`
    method.
    """
    found: List[str] = []
    if not el:
        return found

    direct_attrs = ("src", "data-src", "data-original", "data-zoom-image", "data-thumb", "content", "href")
    found.extend(value for value in map(el.get, direct_attrs) if value)

    for name in ("srcset", "data-srcset"):
        raw = el.get(name)
        candidate = first_from_srcset(raw) if raw else None
        if candidate:
            found.append(candidate)
    return found
|
||||
|
||||
def _filename_key(u: str) -> str:
|
||||
p = urlparse(u)
|
||||
path = p.path or ""
|
||||
if path.endswith("/"):
|
||||
path = path[:-1]
|
||||
last = path.split("/")[-1]
|
||||
return f"{p.netloc}:{last}".lower()
|
||||
|
||||
def dedup_by_filename(urls: List[str]) -> List[str]:
    """
    Drop URLs whose host/filename key was already seen.

    The first URL for each key (see _filename_key) is kept and the original
    order is preserved.
    """
    first_by_key = {}
    for url in urls:
        # setdefault only stores the URL the first time its key shows up.
        first_by_key.setdefault(_filename_key(url), url)
    return list(first_by_key.values())
|
||||
42
scrape/product/helpers/price.py
Normal file
42
scrape/product/helpers/price.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from typing import Optional, Tuple
|
||||
|
||||
# Optional currency symbol, then digits with optional thousands commas and at
# most one decimal part. A trailing "." or a second "." is left out of the
# match instead of making the number unparsable.
_PRICE_RE = re.compile(r"([£€$])?\s*([0-9][0-9,]*(?:\.[0-9]+)?)")
_CURRENCY_BY_SYMBOL = {"£": "GBP", "€": "EUR", "$": "USD"}


def parse_price(text: str) -> Tuple[Optional[float], Optional[str], str]:
    """
    Return (value, currency, raw) from a price-like string.

    Supports a leading symbol £, € or $ (mapped to GBP / EUR / USD) and
    strips thousands commas. `raw` is the stripped input ("" for None).
    When no number is found the result is (None, None, raw); a number
    without a recognised leading symbol gets currency None.
    """
    raw = (text or "").strip()
    m = _PRICE_RE.search(raw)
    if not m:
        return None, None, raw
    # The pattern only admits digits, commas and one ".digits" group, so the
    # conversion cannot fail once the commas are removed.
    value = float(m.group(2).replace(",", ""))
    currency = _CURRENCY_BY_SYMBOL.get(m.group(1))
    return value, currency, raw
|
||||
|
||||
# count, a multiplication sign (x, X or ×), per-item quantity, unit letters.
_CASE_SIZE_RE = re.compile(r"(\d+)\s*[×Xx]\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z]+)")


def parse_case_size(text: str) -> Tuple[Optional[int], Optional[float], Optional[str], str]:
    """
    Parse strings like "6 x 500g", "12x1L", "24 × 330 ml".

    Returns (count, item_qty, item_unit, raw), where `raw` is the stripped
    input ("" for None). When the text is empty or has no "count x qty unit"
    pattern, the first three fields are None.
    """
    raw = (text or "").strip()
    if not raw:
        return None, None, None, raw
    # Match the separator in place rather than rewriting every "x" in the
    # text first: a global rewrite also splits units such as "box".
    m = _CASE_SIZE_RE.search(raw)
    if not m:
        return None, None, None, raw
    count_text, qty_text, unit = m.groups()
    # qty_text matched "[0-9]*\.?[0-9]+", which float() always accepts.
    return int(count_text), float(qty_text), unit, raw
|
||||
16
scrape/product/helpers/text.py
Normal file
16
scrape/product/helpers/text.py
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from utils import normalize_text
|
||||
from config import config
|
||||
|
||||
def clean_title(t: str) -> str:
    """Normalize a title's text and remove a trailing colon (with any whitespace after it)."""
    normalized = normalize_text(t)
    return re.sub(r":\s*$", "", normalized)
|
||||
|
||||
def is_blacklisted_heading(title: str) -> bool:
    """
    Tell whether a heading title is on the configured blacklist.

    The list is read from config()["blacklist"]["product-details"]; missing
    keys count as an empty list. Comparison ignores case and surrounding
    whitespace on both sides.
    """
    blacklist_cfg = config().get("blacklist") or {}
    entries = blacklist_cfg.get("product-details") or []
    wanted = (title or "").strip().lower()
    for entry in entries:
        if (entry or "").strip().lower() == wanted:
            return True
    return False
|
||||
Reference in New Issue
Block a user