feat: initialize market app with browsing, product, and scraping code
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Some checks failed
Build and Deploy / build-and-deploy (push) Has been cancelled
Split from coop monolith. Includes: - Market/browse/product blueprints - Product sync API - Suma scraping pipeline - Templates for market, browse, and product views - Dockerfile and CI workflow for independent deployment
This commit is contained in:
13
scrape/product/extractors/__init__.py
Normal file
13
scrape/product/extractors/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
|
||||
# Auto-import all extractor modules so they register themselves.
|
||||
from .title import ex_title # noqa: F401
|
||||
from .images import ex_images # noqa: F401
|
||||
from .short_description import ex_short_description # noqa: F401
|
||||
from .description_sections import ex_description_sections # noqa: F401
|
||||
from .nutrition_ex import ex_nutrition # noqa: F401
|
||||
from .stickers import ex_stickers # noqa: F401
|
||||
from .labels import ex_labels # noqa: F401
|
||||
from .info_table import ex_info_table # noqa: F401
|
||||
from .oe_list_price import ex_oe_list_price # noqa: F401
|
||||
from .regular_price_fallback import ex_regular_price_fallback # noqa: F401
|
||||
from .breadcrumbs import ex_breadcrumbs # noqa: F401
|
||||
68
scrape/product/extractors/breadcrumbs.py
Normal file
68
scrape/product/extractors/breadcrumbs.py
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Union
|
||||
from urllib.parse import urlparse
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_breadcrumbs(soup: BeautifulSoup, url: str) -> Dict:
    """Read the breadcrumb trail and derive top/sub category fields.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{}`` when no breadcrumb list is found. Otherwise a dict holding
        ``category_breadcrumbs`` (every linked crumb whose href has a
        non-empty last path segment) plus, when available,
        ``category_top_*`` / ``category_sub_*`` (title, href, slug) and
        ``category_path`` (``"<top slug>/<sub slug>"``).
    """
    container = None
    # Most specific selector first; the bare "ul.items" is the last resort.
    for selector in (".breadcrumbs ul.items", "nav.breadcrumbs ul.items", "ul.items"):
        container = soup.select_one(selector)
        if container:
            break
    if not container:
        return {}

    trail = []
    for item in container.select("li.item"):
        anchor = item.find("a")
        if anchor:
            label = normalize_text(anchor.get("title") or anchor.get_text())
            link = anchor.get("href")
        else:
            label = normalize_text(item.get_text())
            link = None

        # The slug is the last non-empty path segment of the link, if any.
        last_segment = None
        if link:
            try:
                trimmed = (urlparse(link).path or "").strip("/")
            except Exception:
                trimmed = ""
            if trimmed:
                last_segment = trimmed.rpartition("/")[2]

        # Crumbs without a usable slug (unlinked items, root links) are dropped.
        if last_segment:
            trail.append({"title": label or None, "href": link or None, "slug": last_segment})

    # The first two linked, non-"home" crumbs become top and sub category.
    categories = []
    for crumb in trail:
        if not crumb.get("href"):
            continue
        name = (crumb.get("title") or "").lower()
        segment = (crumb.get("slug") or "").lower()
        if name == "home" or segment in ("", "home"):
            continue
        categories.append(crumb)
        if len(categories) == 2:
            break
    top = categories[0] if categories else None
    sub = categories[1] if len(categories) > 1 else None

    result: Dict[str, Union[str, List[Dict[str, str]]]] = {
        "category_breadcrumbs": trail
    }
    levels = (
        (top, ("category_top_title", "category_top_href", "category_top_slug")),
        (sub, ("category_sub_title", "category_sub_href", "category_sub_slug")),
    )
    for crumb, (title_key, href_key, slug_key) in levels:
        if crumb:
            result[title_key] = crumb.get("title")
            result[href_key] = crumb.get("href")
            result[slug_key] = crumb.get("slug")
    if top and sub:
        top_slug = (top.get("slug") or "").strip()
        sub_slug = (sub.get("slug") or "").strip()
        result["category_path"] = f"{top_slug}/{sub_slug}"
    return result
|
||||
43
scrape/product/extractors/description_sections.py
Normal file
43
scrape/product/extractors/description_sections.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ...html_utils import absolutize_fragment
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
additional_attributes_table,
|
||||
)
|
||||
from ..helpers.text import clean_title, is_blacklisted_heading
|
||||
|
||||
@extractor
def ex_description_sections(soup: BeautifulSoup, url: str) -> Dict:
    """Collect the main description HTML and the titled content sections.

    Sources, in order:
      1. The description container (``find_description_container`` +
         ``split_description_container``), which yields the leading
         description HTML and any sections inside it.
      2. Magento tabs, or a generic heading scan when the tab helper
         returns nothing.
      3. The additional-attributes table, appended as an
         "Additional Information" section.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"sections": [...]}`` with ``{"title", "html"}`` entries, plus
        ``"description_html"`` when a description was found.
    """
    description_html = None
    sections: List[Dict] = []

    desc_el = find_description_container(soup)
    if desc_el:
        open_html, sections_from_desc = split_description_container(desc_el)
        description_html = open_html or None
        sections.extend(sections_from_desc)

    # Lower-cased titles already emitted; used to suppress duplicates.
    # A missing or None title is tolerated rather than raising.
    existing = {(s.get("title") or "").lower() for s in sections}

    pairs = pair_title_content_from_magento_tabs(soup) or scan_headings_for_sections(soup)
    # "or ()" guards against a helper returning None instead of an empty list.
    for t, html_fragment in pairs or ():
        title = t or ""  # treat a None title as empty instead of crashing
        low = title.lower()

        # Description-like tabs feed description_html, never the section list.
        if "product description" in low or low == "description" or "details" in low:
            if not description_html and html_fragment:
                description_html = absolutize_fragment(html_fragment)
            continue

        if low in existing or not html_fragment:
            continue
        # Skip fragments that contain markup but no visible text.
        if not normalize_text(BeautifulSoup(html_fragment, "lxml").get_text()):
            continue
        if is_blacklisted_heading(title):
            continue

        sections.append({"title": clean_title(title), "html": absolutize_fragment(html_fragment)})
        existing.add(low)

    addl = additional_attributes_table(soup)
    if addl and "additional information" not in existing and not is_blacklisted_heading("additional information"):
        sections.append({"title": "Additional Information", "html": addl})

    out = {"sections": sections}
    if description_html:
        out["description_html"] = description_html
    return out
|
||||
|
||||
89
scrape/product/extractors/images.py
Normal file
89
scrape/product/extractors/images.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from __future__ import annotations
|
||||
import json, re
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.html import abs_url, collect_img_candidates, dedup_by_filename
|
||||
|
||||
@extractor
def ex_images(soup: BeautifulSoup, url: str) -> Dict:
    """Collect product image URLs, trying progressively looser sources.

    Sources, each used only if the previous ones produced nothing:
      1. The ``mage/gallery/gallery`` block inside a
         ``text/x-magento-init`` script (``full`` preferred over ``img``).
      2. ``image`` entries of ``application/ld+json`` scripts.
      3. ``<img>`` elements inside the product media / gallery containers.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"images": [...], "image": <first image or None>}`` after
        ``dedup_by_filename`` has been applied.
    """
    images: List[str] = []
    debug = False  # set True while debugging

    def _json_unescape(fragment: str) -> str:
        # Regex-captured values are still JSON-escaped (e.g. "https:\/\/...");
        # decode them so abs_url receives a real URL.
        try:
            return json.loads(f'"{fragment}"')
        except ValueError:
            return fragment.replace("\\/", "/")

    def _add_jsonld_image(val) -> None:
        # JSON-LD "image" may be a string, an object with "url", or a list of either.
        candidates = val if isinstance(val, list) else [val]
        for cand in candidates:
            if isinstance(cand, dict):
                if "url" not in cand:
                    continue
                cand = cand["url"]
            elif not isinstance(cand, str):
                continue
            resolved = abs_url(cand)
            if resolved:
                images.append(resolved)

    # 1) Magento init script (gallery)
    scripts = soup.find_all("script", attrs={"type": "text/x-magento-init"})
    if debug:
        print(f"[ex_images] x-magento-init scripts: {len(scripts)}")

    for script in scripts:
        # Use raw string as-is; no stripping/collapsing
        text = script.string or script.get_text() or ""
        if "mage/gallery/gallery" not in text:
            continue

        # NOTE: the lazy match stops at the first "]", so a "]" inside a
        # string value truncates the array; json.loads then fails and the
        # key-based regex fallback below takes over.
        m = re.search(r'"data"\s*:\s*(\[[\s\S]*?\])', text)
        if not m:
            if debug:
                print("[ex_images] 'data' array not found in gallery block")
            continue

        arr_txt = m.group(1)
        added = False
        try:
            data = json.loads(arr_txt)
            for entry in data:
                if not isinstance(entry, dict):
                    # Skip malformed entries instead of abandoning the whole array.
                    continue
                u = abs_url(entry.get("full")) or abs_url(entry.get("img"))
                if u:
                    images.append(u)
                    added = True
        except Exception as e:
            if debug:
                print(f"[ex_images] json.loads failed: {e!r}; trying regex fallback")
            # Fallback to simple key extraction
            fulls = re.findall(r'"full"\s*:\s*"([^"]+)"', arr_txt)
            imgs = re.findall(r'"img"\s*:\s*"([^"]+)"', arr_txt) if not fulls else []
            for raw in (fulls or imgs):
                u = abs_url(_json_unescape(raw))
                if u:
                    images.append(u)
                    added = True

        if added:
            break  # got what we need from the gallery block

    # 2) JSON-LD fallback
    if not images:
        for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
            raw = script.string or script.get_text() or ""
            try:
                data = json.loads(raw)
            except (ValueError, RecursionError):
                continue

            items = data if isinstance(data, list) else [data]
            for item in items:
                if isinstance(item, dict) and "image" in item:
                    _add_jsonld_image(item["image"])

    # 3) Generic DOM scan fallback
    if not images:
        # consider broadening selectors if needed, e.g. '.fotorama__img'
        for el in soup.select(".product.media img, .gallery-placeholder img, .fotorama__stage img"):
            for cand in collect_img_candidates(el):
                u = abs_url(cand)
                if u:
                    images.append(u)

    images = dedup_by_filename(images)
    if debug:
        print(f"[ex_images] found images: {images}")
    return {"images": images, "image": images[0] if images else None}
|
||||
76
scrape/product/extractors/info_table.py
Normal file
76
scrape/product/extractors/info_table.py
Normal file
@@ -0,0 +1,76 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price, parse_case_size
|
||||
|
||||
@extractor
def ex_info_table(soup: BeautifulSoup, url: str) -> Dict:
    """Parse the ``.product-page-info-table`` label/content rows.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{}`` when the table or its rows are missing. Otherwise a dict with
        ``info_table`` (raw label -> value map) and, when present,
        ``brand``, ``rrp[_raw|_currency]``,
        ``price_per_unit[_raw|_currency]`` and the ``case_size_*`` fields.
    """
    table = soup.select_one(".product-page-info-table")
    if not table:
        return {}
    # Rows may sit in a dedicated wrapper or directly under the table.
    holder = table.select_one(".product-page-info-table-rows") or table
    row_nodes = holder.select(".product-page-info-table-row")
    if not row_nodes:
        return {}

    pairs: Dict[str, str] = {}
    for node in row_nodes:
        label_node = node.select_one(".product-page-info-table__label")
        value_node = node.select_one(".product-page-info-table__content")
        if label_node and value_node:
            name = normalize_text(label_node.get_text())
            text = normalize_text(value_node.get_text())
            if name:
                pairs[name] = text

    result: Dict[str, Union[str, float, int, Dict]] = {"info_table": pairs}

    # Brand
    brand_name = pairs.get("Brand") or pairs.get("Brand Name")
    if brand_name:
        result["brand"] = brand_name

    # RRP, then Price Per Unit: same handling, different labels and keys.
    price_sources = (
        (pairs.get("RRP", ""), ("rrp_raw", "rrp", "rrp_currency")),
        (
            pairs.get("Price Per Unit", "") or pairs.get("Unit Price", ""),
            ("price_per_unit_raw", "price_per_unit", "price_per_unit_currency"),
        ),
    )
    for text, (raw_key, value_key, currency_key) in price_sources:
        amount, currency, original = parse_price(text)
        # Emit nothing unless the raw text yielded a number or a currency.
        if not original or (amount is None and currency is None):
            continue
        result[raw_key] = original
        if amount is not None:
            result[value_key] = amount
        if currency:
            result[currency_key] = currency

    # Case Size
    case_text = pairs.get("Case Size", "") or pairs.get("Pack Size", "")
    count, item_qty, item_unit, case_raw = parse_case_size(case_text)
    if case_raw:
        result["case_size_raw"] = case_raw
    if count is not None:
        result["case_size_count"] = count
    if item_qty is not None:
        result["case_size_item_qty"] = item_qty
    if item_unit:
        result["case_size_item_unit"] = item_unit

    return result
|
||||
41
scrape/product/extractors/labels.py
Normal file
41
scrape/product/extractors/labels.py
Normal file
@@ -0,0 +1,41 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_labels(soup: BeautifulSoup, url: str) -> Dict:
    """Extract product labels from ``ul.cdz-product-labels``.

    Example markup:
        <ul class="cdz-product-labels">
          <li class="label-item new"><div class="label-content">NEW</div></li>
        </ul>

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{}`` when the list is absent or yields nothing; otherwise
        ``{"labels": [...]}`` holding the lower-cased, de-duplicated union
        of class hints (first) and visible texts (second).
    """
    label_list = soup.select_one("ul.cdz-product-labels")
    if not label_list:
        return {}

    class_hints: List[str] = []
    captions: List[str] = []
    for entry in label_list.select("li.label-item"):
        for raw_cls in entry.get("class") or []:
            cls = (raw_cls or "").strip()
            # "label-item" is the structural class, not a label name.
            if not cls or cls.lower() == "label-item" or cls in class_hints:
                continue
            class_hints.append(cls)
        caption = normalize_text(entry.get_text())
        if caption and caption not in captions:
            captions.append(caption)

    if not (class_hints or captions):
        return {}

    # An insertion-ordered dict doubles as an order-preserving set.
    merged: Dict[str, None] = {}
    for candidate in class_hints + [caption.lower() for caption in captions]:
        token = (candidate or "").strip().lower()
        if token:
            merged.setdefault(token, None)
    return {"labels": list(merged)}
|
||||
129
scrape/product/extractors/nutrition_ex.py
Normal file
129
scrape/product/extractors/nutrition_ex.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
from ..helpers.desc import (
|
||||
split_description_container, find_description_container,
|
||||
pair_title_content_from_magento_tabs, scan_headings_for_sections,
|
||||
)
|
||||
|
||||
# ----- value/unit parser ------------------------------------------------------
|
||||
|
||||
_NUM_UNIT_RE = re.compile(
|
||||
r"""
|
||||
^\s*
|
||||
(?P<num>[-+]?\d{1,3}(?:[.,]\d{3})*(?:[.,]\d+)?|\d+(?:[.,]\d+)?)
|
||||
\s*
|
||||
(?P<unit>[a-zA-Z%µ/]+)?
|
||||
\s*$
|
||||
""",
|
||||
re.X,
|
||||
)
|
||||
|
||||
def _parse_value_unit(s: str) -> Tuple[Optional[str], Optional[str]]:
|
||||
if not s:
|
||||
return None, None
|
||||
s = re.sub(r"\s+", " ", s.strip())
|
||||
m = _NUM_UNIT_RE.match(s)
|
||||
if not m:
|
||||
return None, None
|
||||
num = (m.group("num") or "").replace(",", "")
|
||||
unit = m.group("unit") or None
|
||||
if unit:
|
||||
u = unit.lower()
|
||||
if u in {"kcal", "kcal.", "kcalories", "kcalorie"}:
|
||||
unit = "kcal"
|
||||
elif u in {"kj", "kj.", "kilojoule", "kilojoules"}:
|
||||
unit = "kJ"
|
||||
return (num or None, unit)
|
||||
|
||||
# ----- section finder ---------------------------------------------------------
|
||||
|
||||
def _find_nutrition_section_html(soup: BeautifulSoup) -> Optional[str]:
    """Return the HTML of the section titled 'Nutritional Information'.

    The same sources as the description extractor are searched, in order:
    Magento tabs, sections split out of the description container, then a
    generic heading scan. A candidate with empty HTML is skipped in every
    pass so that a later source can still supply the content.

    Args:
        soup: Parsed product page.

    Returns:
        The non-empty section HTML, or ``None`` when no source provides one.
    """

    def _is_nutrition_title(raw_title) -> bool:
        # Case-insensitive substring match; a trailing ":" is ignored.
        title = normalize_text(raw_title or "").rstrip(":").lower()
        return "nutritional information" in title

    # 1) Magento tabs
    for t, html in (pair_title_content_from_magento_tabs(soup) or []):
        if t and html and _is_nutrition_title(t):
            return html

    # 2) Description container split into sections
    desc_el = find_description_container(soup)
    if desc_el:
        _open_html, sections = split_description_container(desc_el)
        for sec in sections or []:
            html = sec.get("html")
            # An empty section must not end the search: fall through to
            # the heading scan instead of returning "".
            if html and _is_nutrition_title(sec.get("title")):
                return html

    # 3) Fallback: generic heading scan
    for t, html in (scan_headings_for_sections(soup) or []):
        if t and html and _is_nutrition_title(t):
            return html

    return None
|
||||
|
||||
# ----- table parser -----------------------------------------------------------
|
||||
|
||||
def _extract_rows_from_table(root: BeautifulSoup) -> List[Dict[str, str]]:
    """Turn the first ``<table>`` under *root* into key/value/unit rows.

    A row is read either as ``<th>`` + first ``<td>`` or, without a
    ``<th>``, as the first two ``<td>`` cells. Rows with an empty key or
    value are skipped. Values that ``_parse_value_unit`` cannot parse are
    kept as raw text with ``unit`` set to ``None``.

    Args:
        root: Parsed fragment expected to contain the nutrition table.

    Returns:
        Rows as ``{"key", "value", "unit"}`` dicts in document order, with
        exact duplicates removed. Empty when there is no table.
    """
    table = root.select_one("table")
    if not table:
        return []

    def _cell_text(cell) -> str:
        return normalize_text(cell.get_text(" ").strip())

    unique_rows: List[Dict[str, str]] = []
    emitted = set()
    for tr in table.select("tr"):
        header = tr.find("th")
        cells = tr.find_all("td")
        if header and cells:
            label, raw_value = _cell_text(header), _cell_text(cells[0])
        elif len(cells) >= 2:
            label, raw_value = _cell_text(cells[0]), _cell_text(cells[1])
        else:
            continue

        if not (label and raw_value):
            continue

        amount, unit = _parse_value_unit(raw_value)
        if amount is None:  # keep raw if not parseable
            amount, unit = raw_value, None

        # De-duplicate on the full (key, value, unit) triple, keeping order.
        fingerprint = (label, amount, unit)
        if fingerprint not in emitted:
            emitted.add(fingerprint)
            unique_rows.append({"key": label, "value": amount, "unit": unit})
    return unique_rows
|
||||
|
||||
# ----- extractor --------------------------------------------------------------
|
||||
|
||||
@extractor
def ex_nutrition(soup: BeautifulSoup, url: str) -> Dict:
    """Extract nutrition rows from the 'Nutritional Information' section only.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"nutrition": [{"key": "...", "value": "...", "unit": "..."}]}``;
        the list is empty when the section is missing or empty.
    """
    fragment = _find_nutrition_section_html(soup)
    parsed_rows = _extract_rows_from_table(BeautifulSoup(fragment, "lxml")) if fragment else []
    return {"nutrition": parsed_rows}
|
||||
56
scrape/product/extractors/oe_list_price.py
Normal file
56
scrape/product/extractors/oe_list_price.py
Normal file
@@ -0,0 +1,56 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_oe_list_price(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the Magento ``oe-list-price`` block.

    Example markup:
        <div class="oe-list-price">
          <div class="rrp-price"><label>Regular Price: </label><span class="price">£30.50</span></div>
          <div class="oe-final-price"><label>Special Price: </label><span>£23.63</span></div>
        </div>

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{}`` when the block is absent. Otherwise ``oe_list_price`` (a dict
        with any of ``rrp_raw``, ``rrp``, ``rrp_currency``, ``special_raw``,
        ``special``, ``special_currency``) plus the special price promoted
        to top-level ``special_price``, ``special_price_currency`` and
        ``special_price_raw`` when available.
    """
    container = soup.select_one(".oe-list-price")
    if not container:
        return {}

    def _price_text(node) -> str:
        # Prefer the dedicated price span, then any span, then the node itself.
        target = node.select_one("span.price") or node.select_one("span") or node
        return target.get_text(strip=True)

    promoted: Dict[str, Union[str, float, dict]] = {}
    nested: Dict[str, Union[str, float]] = {}

    # RRP inside oe-list-price (if present)
    list_node = container.select_one(".rrp-price")
    if list_node:
        amount, currency, original = parse_price(_price_text(list_node))
        if original:
            nested["rrp_raw"] = original
        if amount is not None:
            nested["rrp"] = amount
        if currency:
            nested["rrp_currency"] = currency

    # Special Price inside oe-list-price
    offer_node = container.select_one(".oe-final-price, .special-price, .final-price")
    if offer_node:
        amount, currency, original = parse_price(_price_text(offer_node))
        if original:
            nested["special_raw"] = original
        if amount is not None:
            nested["special"] = amount
            promoted["special_price"] = amount
        if currency:
            nested["special_currency"] = currency
            promoted["special_price_currency"] = currency
        if original:
            promoted["special_price_raw"] = original

    if nested:
        promoted["oe_list_price"] = nested
    return promoted
|
||||
33
scrape/product/extractors/regular_price_fallback.py
Normal file
33
scrape/product/extractors/regular_price_fallback.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, Union
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
from ..helpers.price import parse_price
|
||||
|
||||
@extractor
def ex_regular_price_fallback(soup: BeautifulSoup, url: str) -> Dict:
    """Fallback extractor for a legacy 'Regular Price' block.

    Example markup:
        <div class="rrp-price"><label>Regular Price: </label><span class="price">£16.55</span></div>

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{}`` when no ``div.rrp-price`` exists. Otherwise the parsed price
        under ``regular_price[_raw|_currency]``, mirrored under
        ``rrp[_raw|_currency]``; each key appears only when its value was
        parsed.
    """
    block = soup.select_one("div.rrp-price")
    if not block:
        return {}

    price_span = block.select_one("span.price")
    if price_span:
        text = price_span.get_text(strip=True)
    else:
        # No dedicated span: fall back to the whole block's text.
        text = block.get_text(" ", strip=True)

    amount, currency, original = parse_price(text or "")
    has_amount = amount is not None

    # (key, value, include?) in the order the keys must appear in the result.
    fields = (
        ("regular_price_raw", original, bool(original)),
        ("regular_price", amount, has_amount),
        ("regular_price_currency", currency, bool(currency)),
        ("rrp", amount, has_amount),
        ("rrp_currency", currency, bool(currency)),
        ("rrp_raw", original, bool(original)),
    )
    result: Dict[str, Union[str, float]] = {
        key: value for key, value, present in fields if present
    }
    return result
|
||||
19
scrape/product/extractors/short_description.py
Normal file
19
scrape/product/extractors/short_description.py
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_short_description(soup: BeautifulSoup, url: str) -> Dict:
    """Pick the first non-empty short description from a list of sources.

    Visible description/overview blocks are tried first, then the
    ``description`` and ``og:description`` meta tags.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"description_short": <text>}``; the value is the last falsy
        result (``None`` if nothing matched) when no source has text.
    """
    selectors = (
        ".product.attribute.description .value",
        ".product.attribute.overview .value",
        "meta[name='description']",
        "meta[property='og:description']",
    )
    summary = None
    for selector in selectors:
        node = soup.select_one(selector)
        if node:
            # Meta tags carry their text in the "content" attribute.
            raw = node.get("content") if node.name == "meta" else node.get_text()
            summary = normalize_text(raw)
            if summary:
                break
    return {"description_short": summary}
|
||||
30
scrape/product/extractors/stickers.py
Normal file
30
scrape/product/extractors/stickers.py
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
from bs4 import BeautifulSoup
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_stickers(soup: BeautifulSoup, url: str) -> Dict:
    """Collect sticker names from ``div.stickers``.

    Example markup:
        <div class="stickers">
          <span class="sticker xxx"></span>
          ...
        </div>

    Each ``span.sticker`` contributes its extra CSS classes (everything
    except ``sticker``) followed by its ``data-sticker`` attribute, if set.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"stickers": [...]}`` with case-sensitive, order-preserving
        de-duplication; the list is empty when the container is missing.
    """
    holder = soup.select_one("div.stickers")
    if not holder:
        return {"stickers": []}

    # An insertion-ordered dict doubles as an order-preserving set.
    ordered: Dict[str, None] = {}
    for span in holder.select("span.sticker"):
        names = [
            cls.strip()
            for cls in (span.get("class") or [])
            if cls and cls.lower() != "sticker"
        ]
        attr_name = (span.get("data-sticker") or "").strip()
        if attr_name:
            names.append(attr_name)
        for name in names:
            if name:
                ordered.setdefault(name, None)
    return {"stickers": list(ordered)}
|
||||
17
scrape/product/extractors/title.py
Normal file
17
scrape/product/extractors/title.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from bs4 import BeautifulSoup
|
||||
from utils import normalize_text
|
||||
from ..registry import extractor
|
||||
|
||||
@extractor
def ex_title(soup: BeautifulSoup, url: str) -> Dict:
    """Extract the product title.

    Page headings are tried first, then the ``og:title`` meta tag. Every
    candidate, including the meta ``content`` value, goes through
    ``normalize_text`` so the result is cleaned the same way whatever its
    source.

    Args:
        soup: Parsed product page.
        url: Page URL (not used by this extractor).

    Returns:
        ``{"title": <text>}``, falling back to ``"Product"`` when no source
        yields a non-empty title.
    """
    title = None
    for sel in ("h1.page-title span", "h1.page-title", "h1.product-name", "meta[property='og:title']"):
        el = soup.select_one(sel)
        if not el:
            continue
        # Meta tags carry their text in "content", which may be absent.
        raw = el.get("content") if el.name == "meta" else el.get_text()
        title = normalize_text(raw or "")
        if title:
            break
    return {"title": title or "Product"}
|
||||
Reference in New Issue
Block a user