This repository has been archived on 2026-02-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
market/scrape/product/helpers/desc.py
giles 478636f799 feat: decouple market from shared_lib, add app-owned models
Phase 1-3 of decoupling:
- path_setup.py adds project root to sys.path
- Market-owned models in market/models/ (market, market_place)
- All imports updated: shared.infrastructure, shared.db, shared.browser, etc.
- MarketPlace uses container_type/container_id instead of post_id FK

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 12:46:32 +00:00

166 lines
6.4 KiB
Python

from __future__ import annotations
from typing import Dict, List, Optional, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag
from shared.utils import normalize_text
from ...html_utils import absolutize_fragment
from .text import clean_title, is_blacklisted_heading
from shared.config import config
def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Separate accordion sections from the rest of a description container.

    Every element matching the 'accordion-title' selectors is considered a
    section heading. A heading is paired with a panel only when the first
    element sibling following it carries the class 'accordion-details'.

    Returns a tuple of:
    - open_html: the description HTML once the paired headings/panels are removed
    - sections: [{"title": ..., "html": ...}, ...] for panels with textual content
    """
    # Re-parse the container's inner HTML so that every removal below happens
    # on a private tree rather than on the caller's DOM.
    frag = BeautifulSoup(desc_el.decode_contents(), "lxml")

    # First pass: collect (heading, panel) matches only; nothing is removed
    # while the selection is being walked.
    matched: List[Tuple[Tag, Tag]] = []
    for heading in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"):
        if not isinstance(heading, Tag):
            continue
        if not clean_title((heading.get_text() or "").strip()):
            continue

        # Text nodes between heading and panel are skipped; only the first
        # element sibling is inspected, and the search stops there either way.
        panel: Optional[Tag] = None
        for node in heading.next_siblings:
            if not isinstance(node, Tag):
                continue
            if "accordion-details" in (node.get("class") or []):
                panel = node
            break

        if panel is not None:
            matched.append((heading, panel))

    # Second pass: emit the sections and strip the matched nodes from the copy.
    sections: List[Dict] = []
    for heading, panel in matched:
        inner_html = panel.decode_contents()
        # Panels whose markup carries no text at all are dropped from the result.
        has_text = normalize_text(BeautifulSoup(inner_html, "lxml").get_text())
        if has_text:
            sections.append({
                "title": clean_title(heading.get_text() or ""),
                "html": absolutize_fragment(inner_html),
            })
        # Heading and panel leave the fragment whether or not a section was kept.
        panel.decompose()
        heading.decompose()

    # What is left in the fragment is the free-form ("open") description.
    if frag:
        open_html = absolutize_fragment(str(frag))
    else:
        open_html = ""
    return open_html, sections
def pair_title_content_from_magento_tabs(soup: BeautifulSoup):
    """
    Pair each tab title inside the '.product.data.items' container with the
    inner HTML of its content panel.

    The panel is located through the title's 'aria-controls' / 'data-target'
    attribute when present, otherwise through the next sibling carrying the
    classes 'data', 'item' and 'content'. Titles that normalize to an empty
    string, titles without a panel and blacklisted titles are skipped.

    Returns a list of (title, html) tuples; the list is empty when no tab
    container is found.
    """
    out = []
    container = (
        soup.select_one(".product.info.detailed .product.data.items")
        or soup.select_one(".product.data.items")
    )
    if container is None:
        return out

    def _is_content_panel(node) -> bool:
        # Sibling fallback: a tag that carries all three panel classes.
        if not isinstance(node, Tag):
            return False
        classes = node.get("class") or []
        return all(name in classes for name in ("data", "item", "content"))

    for t in container.select(".data.item.title"):
        title = normalize_text(t.get_text())
        if not title:
            continue

        # Look the panel up by its id *value* instead of building a "#<id>"
        # CSS selector: a target that already starts with '#', or an id that
        # contains '.', ':' or starts with a digit, would otherwise produce an
        # invalid or wrongly-parsed selector.
        target = (t.get("aria-controls") or t.get("data-target") or "").strip().lstrip("#")
        content = soup.find(id=target) if target else None

        if content is None:
            content = t.find_next_sibling(_is_content_panel)
        if content is None:
            continue

        if is_blacklisted_heading(title):
            continue
        out.append((title, absolutize_fragment(content.decode_contents())))
    return out
def scan_headings_for_sections(soup: BeautifulSoup):
    """
    Collect sections introduced by h2-h6 headings whose title contains a
    known keyword.

    Keywords are the configured "section-titles" plus the built-in
    "product description", "description" and "details". A section's HTML is
    everything between the heading and the next h2-h6 sibling (or the end of
    the parent). Sections with empty HTML or a blacklisted title are dropped.

    Returns a list of (title, html) tuples.
    """
    out = []
    container = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )
    heading_names = ("h2", "h3", "h4", "h5", "h6")

    # Build the keyword list once. Titles are lower-cased before matching, so
    # configured keywords are lower-cased too (a mixed-case entry could never
    # match otherwise); empty entries are dropped because "" is a substring of
    # every title.
    configured = config().get("section-titles") or []
    if isinstance(configured, str):
        configured = [configured]
    keywords = [str(k).lower() for k in configured if k]
    keywords += ["product description", "description", "details"]

    for h in container.select("h2, h3, h4, h5, h6"):
        title = clean_title(h.get_text() or "")
        if not title:
            continue
        low = title.lower()
        if not any(k in low for k in keywords):
            continue

        parts: List[str] = []
        for sib in h.next_siblings:
            if isinstance(sib, Tag):
                if sib.name in heading_names:
                    # The next heading ends this section.
                    break
                parts.append(str(sib))
            elif isinstance(sib, NavigableString):
                # output_ready() serialises the node the way bs4 does inside a
                # tag: entities are escaped and comments keep their <!-- -->
                # delimiters. Plain str() would leak comment bodies as text.
                parts.append(sib.output_ready())

        html = absolutize_fragment("".join(parts).strip())
        if html and not is_blacklisted_heading(title):
            out.append((title, html))
    return out
def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """
    Render the "additional attributes" table as a block of key/value <div> rows.

    For each <tr>, the key is the text of the first <th> (or the first <td>
    when there is no <th>) and the value is the text of the last <td>. Rows
    missing a key or a value are skipped, as are rows whose only cell would
    serve as both key and value.

    Returns the generated HTML, or None when no table is found, no usable row
    exists, or extraction fails.
    """
    # Function-scope import: the text comes from scraped pages and is
    # interpolated into markup below, so it must be HTML-escaped.
    from html import escape

    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            label_cell = tr.find("th") or tr.find("td")
            cells = tr.find_all("td")
            if label_cell is None or not cells:
                continue
            value_cell = cells[-1]
            if value_cell is label_cell:
                # Single-cell row: the same <td> would be emitted as both key
                # and value, so there is no real pair to render.
                continue
            key = normalize_text(label_cell.get_text())
            val = normalize_text(value_cell.get_text())
            if key and val:
                # quote=False: the values land in element content, not in
                # attributes, so only &, < and > need escaping.
                rows.append((escape(key, quote=False), escape(val, quote=False)))
        if not rows:
            return None
        items = "\n".join(
            [
                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
<div class='col-span-1 font-medium'>{key}</div>
<div class='col-span-2 text-stone-700'>{val}</div>
</div>"""
                for key, val in rows
            ]
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        # Deliberate best-effort: a malformed table yields "no attributes"
        # instead of aborting the scrape of the whole product.
        return None
def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """
    Locate the element holding the product description.

    Known selectors are tried first; the first match with non-empty text is
    returned as-is. Otherwise, for each h2-h6 heading whose text starts with
    "product description" or equals "description", everything following it up
    to the next h2-h6 sibling is gathered into a new <div>, which is returned
    if it contains text.

    Note: in the heading fallback the sibling tags are appended to the new
    <div> themselves (not copies), so they are re-parented out of `soup`;
    text nodes are re-created and the originals stay in place.

    Returns None when nothing suitable is found.
    """
    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
                ".product.attribute.overview .value", ".product.info.detailed .value"]:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el

    heading_names = ("h2", "h3", "h4", "h5", "h6")
    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if not (txt.startswith("product description") or txt == "description"):
            continue

        # Snapshot the siblings BEFORE re-parenting any of them. Appending a
        # tag to the wrapper detaches it from its current parent; doing that
        # while still iterating `h.next_siblings` can end the walk after the
        # first moved tag (this depends on how the installed bs4 advances the
        # generator), which would truncate the description.
        following = []
        for sib in h.next_siblings:
            if isinstance(sib, Tag) and sib.name in heading_names:
                break
            following.append(sib)

        wrapper = soup.new_tag("div")
        for sib in following:
            wrapper.append(sib if isinstance(sib, Tag) else NavigableString(str(sib)))
        if normalize_text(wrapper.get_text()):
            return wrapper
    return None