from __future__ import annotations from typing import Dict, List, Optional, Tuple from bs4 import BeautifulSoup, NavigableString, Tag from shared.utils import normalize_text from ...html_utils import absolutize_fragment from .text import clean_title, is_blacklisted_heading from shared.config import config def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]: """ Extract sections from accordion blocks within the description container. Looks for headings with class 'accordion-title' and pairs each with its next element-sibling having class 'accordion-details'. Returns: - open_html: the remaining description HTML with those accordion blocks removed - sections: [{"title": ..., "html": ...}, ...] """ # Work on an isolated copy to avoid mutating the original DOM frag = BeautifulSoup(desc_el.decode_contents(), "lxml") # Collect candidate (heading, details) pairs without mutating during iteration pairs: List[Tuple[Tag, Tag]] = [] for h in frag.select("#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"): if not isinstance(h, Tag): continue title = clean_title((h.get_text() or "").strip()) if not title: continue # Walk forward siblings until we hit an element; accept the first with 'accordion-details' sib = h.next_sibling details: Optional[Tag] = None while sib is not None: if isinstance(sib, Tag): classes = sib.get("class") or [] if "accordion-details" in classes: details = sib break sib = sib.next_sibling if details is not None: pairs.append((h, details)) sections: List[Dict] = [] # Extract sections, then remove nodes from frag for h, details in pairs: # Pull details HTML html = details.decode_contents() # Only keep non-empty (textual) content if normalize_text(BeautifulSoup(html, "lxml").get_text()): sections.append({ "title": clean_title(h.get_text() or ""), "html": absolutize_fragment(html), }) # Remove the matched nodes from the fragment copy details.decompose() h.decompose() # Whatever remains is the open description html open_html = absolutize_fragment(str(frag)) if frag else "" return open_html, sections def pair_title_content_from_magento_tabs(soup: BeautifulSoup): out = [] container = soup.select_one(".product.info.detailed .product.data.items") or soup.select_one(".product.data.items") if not container: return out titles = container.select(".data.item.title") for t in titles: title = normalize_text(t.get_text()) if not title: continue content_id = t.get("aria-controls") or t.get("data-target") content = soup.select_one(f"#{content_id}") if content_id else None if content is None: sib = t.find_next_sibling( lambda x: isinstance(x, Tag) and "data" in x.get("class", []) and "item" in x.get("class", []) and "content" in x.get("class", []) ) content = sib if content: html = content.decode_contents() if not is_blacklisted_heading(title): out.append((title, absolutize_fragment(html))) return out def scan_headings_for_sections(soup: BeautifulSoup): out = [] container = ( soup.select_one(".product.info.detailed") or soup.select_one(".product-info-main") or soup.select_one(".page-main") or soup ) heads = container.select("h2, h3, h4, h5, h6") section_titles = (config().get("section-titles") or []) for h in heads: title = clean_title(h.get_text() or "") if not title: continue low = title.lower() if not any(k in low for k in section_titles + ["product description", "description", "details"]): continue parts: List[str] = [] for sib in h.next_siblings: if isinstance(sib, NavigableString): parts.append(str(sib)) continue if isinstance(sib, Tag) and sib.name in ("h2", "h3", "h4", "h5", "h6"): break if isinstance(sib, Tag): parts.append(str(sib)) html = absolutize_fragment("".join(parts).strip()) if html and not is_blacklisted_heading(title): out.append((title, html)) return out def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]: table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table") if not table: return None try: rows = [] for tr in table.select("tr"): th = tr.find("th") or tr.find("td") tds = tr.find_all("td") key = normalize_text(th.get_text()) if th else None val = normalize_text(tds[-1].get_text()) if tds else None if key and val: rows.append((key, val)) if not rows: return None items = "\n".join( [ f"""