# market/scrape/product/helpers/desc.py


from __future__ import annotations
from typing import Dict, List, Optional, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag
from shared.utils import normalize_text
from ...html_utils import absolutize_fragment
from .text import clean_title, is_blacklisted_heading
from shared.config import config


def split_description_container(desc_el: Tag) -> Tuple[str, List[Dict]]:
    """
    Split a description element into its free-form part and its accordion sections.

    Every element matching an 'accordion-title' selector is considered a
    section heading. A heading counts only if it has a non-empty cleaned
    title and the very next *element* sibling (text nodes are skipped)
    carries the class 'accordion-details'.

    Returns a tuple ``(open_html, sections)``:
      - open_html: the description markup with all matched heading/details
                   pairs removed, passed through ``absolutize_fragment``
      - sections:  ``[{"title": ..., "html": ...}, ...]`` for each pair whose
                   details contain some text; pairs with text-less details
                   are removed from ``open_html`` but not reported
    """
    # Re-parse the inner markup so the removals below never touch desc_el itself.
    fragment = BeautifulSoup(desc_el.decode_contents(), "lxml")

    def _adjacent_details(heading: Tag) -> Optional[Tag]:
        """Return the next element sibling if it is an 'accordion-details' block."""
        node = heading.next_sibling
        # Skip over strings/whitespace until the first real element (or the end).
        while node is not None and not isinstance(node, Tag):
            node = node.next_sibling
        if node is None:
            return None
        # Only the first element sibling is inspected; anything else disqualifies.
        if "accordion-details" in (node.get("class") or []):
            return node
        return None

    # First pass: gather the pairs. Nothing is removed yet, so the selection
    # result is not disturbed while it is being walked.
    matched: List[Tuple[Tag, Tag]] = []
    for heading in frag_select_headings(fragment) if False else fragment.select(
        "#accordion .accordion-title, .accordion .accordion-title, h5.accordion-title, .accordion-title"
    ):
        if not isinstance(heading, Tag):
            continue
        if not clean_title((heading.get_text() or "").strip()):
            continue
        body = _adjacent_details(heading)
        if body is not None:
            matched.append((heading, body))

    # Second pass: record each section, then drop both nodes from the copy.
    sections: List[Dict] = []
    for heading, body in matched:
        inner_html = body.decode_contents()
        has_text = normalize_text(BeautifulSoup(inner_html, "lxml").get_text())
        if has_text:
            section = {
                "title": clean_title(heading.get_text() or ""),
                "html": absolutize_fragment(inner_html),
            }
            sections.append(section)
        body.decompose()
        heading.decompose()

    # What is left of the copy is the "open" (non-accordion) description.
    if fragment:
        open_html = absolutize_fragment(str(fragment))
    else:
        open_html = ""

    return open_html, sections

def pair_title_content_from_magento_tabs(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """
    Pair each Magento product tab title with the HTML of its content panel.

    The tab container is '.product.info.detailed .product.data.items', falling
    back to the first '.product.data.items'. For every '.data.item.title'
    inside it with a non-empty normalized title, the content panel is
    resolved in this order:
      1. the element whose id is named by the title's 'aria-controls' or
         'data-target' attribute (a leading '#' is tolerated);
      2. the next sibling carrying the classes 'data', 'item' and 'content'.

    Returns a list of ``(title, html)`` tuples, where html is the panel's
    inner markup passed through ``absolutize_fragment``. Titles rejected by
    ``is_blacklisted_heading`` and titles without a panel are omitted. An
    empty list is returned when no tab container exists.
    """
    out: List[Tuple[str, str]] = []
    container = (
        soup.select_one(".product.info.detailed .product.data.items")
        or soup.select_one(".product.data.items")
    )
    if not container:
        return out

    def _is_content_panel(node) -> bool:
        """True for a tag that has all of the classes data/item/content."""
        if not isinstance(node, Tag):
            return False
        classes = node.get("class", [])
        return all(name in classes for name in ("data", "item", "content"))

    for t in container.select(".data.item.title"):
        title = normalize_text(t.get_text())
        if not title:
            continue

        content = None
        target = t.get("aria-controls") or t.get("data-target")
        if target:
            # The attribute holds an element id (possibly written as '#id').
            # Look it up by id rather than splicing it into a CSS selector:
            # ids containing '.', ':' etc. or a leading '#' would otherwise
            # be misparsed or raise a selector syntax error.
            tokens = str(target).split()
            target_id = tokens[0].lstrip("#") if tokens else ""
            if target_id:
                content = soup.find(id=target_id)

        if content is None:
            # No usable id reference: fall back to the adjacent content panel.
            content = t.find_next_sibling(_is_content_panel)
        if content is None:
            continue

        if not is_blacklisted_heading(title):
            out.append((title, absolutize_fragment(content.decode_contents())))
    return out

def scan_headings_for_sections(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    """
    Collect (title, html) sections by scanning h2-h6 headings.

    Headings are searched inside the first of '.product.info.detailed',
    '.product-info-main', '.page-main' that exists, or the whole document.
    A heading is kept when its cleaned, lowercased title contains one of the
    keywords: the entries of the 'section-titles' config value plus the
    built-ins "product description", "description" and "details". Config
    entries are compared case-insensitively and empty entries are ignored.

    The section html is made of every sibling following the heading up to
    (not including) the next h2-h6 sibling, passed through
    ``absolutize_fragment``. Sections with empty html or a title rejected by
    ``is_blacklisted_heading`` are dropped.
    """
    heading_tags = ("h2", "h3", "h4", "h5", "h6")
    out: List[Tuple[str, str]] = []
    container = (
        soup.select_one(".product.info.detailed")
        or soup.select_one(".product-info-main")
        or soup.select_one(".page-main")
        or soup
    )

    # Build the keyword list once; it does not depend on the heading.
    configured = config().get("section-titles") or []
    if isinstance(configured, str):
        # A single string would otherwise be iterated character by character.
        configured = [configured]
    # Titles are lowercased before matching, so the keywords must be too.
    keywords = [str(k).lower() for k in configured if k]
    keywords += ["product description", "description", "details"]

    for h in container.select("h2, h3, h4, h5, h6"):
        title = clean_title(h.get_text() or "")
        if not title:
            continue
        low = title.lower()
        if not any(k in low for k in keywords):
            continue

        parts: List[str] = []
        for sib in h.next_siblings:
            if isinstance(sib, Tag):
                if sib.name in heading_tags:
                    # The next heading starts a new section.
                    break
                parts.append(str(sib))
            elif isinstance(sib, NavigableString):
                # output_ready() serializes the node as markup: plain text is
                # entity-escaped and comments keep their <!-- --> delimiters.
                # str() would return the raw payload, leaking comment bodies
                # as visible text and emitting unescaped '&' / '<'.
                parts.append(sib.output_ready())

        html = absolutize_fragment("".join(parts).strip())
        if html and not is_blacklisted_heading(title):
            out.append((title, html))
    return out

def additional_attributes_table(soup: BeautifulSoup) -> Optional[str]:
    """
    Render the product's "additional attributes" table as a block of divs.

    The table is the first match of '.additional-attributes',
    'table.additional-attributes' or '.product.attribute.additional table'.
    For each row the key is the text of its <th>; when the row has no <th>
    the first <td> is the key, provided a second <td> exists to act as the
    value. The value is the text of the last <td>. Rows lacking a key or a
    value are skipped.

    Keys and values are HTML-escaped before being placed in the markup,
    since they are plain text taken from a scraped page.

    Returns the generated HTML string, or None when there is no table, no
    usable row, or an error occurs while reading the table (best effort).
    """
    from html import escape

    table = soup.select_one(".additional-attributes, table.additional-attributes, .product.attribute.additional table")
    if not table:
        return None
    try:
        rows = []
        for tr in table.select("tr"):
            tds = tr.find_all("td")
            th = tr.find("th")
            if th is None:
                # Without a header cell the first <td> is the label. A lone
                # <td> cannot be both label and value, so skip such rows.
                if len(tds) < 2:
                    continue
                th = tds[0]
            if not tds:
                continue
            key = normalize_text(th.get_text())
            val = normalize_text(tds[-1].get_text())
            if key and val:
                # get_text() yields unescaped text; escape it so characters
                # such as '<' or '&' cannot break or inject markup.
                rows.append((escape(key, quote=False), escape(val, quote=False)))
        if not rows:
            return None
        items = "\n".join(
            [
                f"""<div class='grid grid-cols-3 gap-2 py-1 border-b'>
<div class='col-span-1 font-medium'>{key}</div>
<div class='col-span-2 text-stone-700'>{val}</div>
</div>"""
                for key, val in rows
            ]
        )
        return f"<div class='rounded-lg border bg-white'>{items}</div>"
    except Exception:
        # Deliberate best effort: a malformed table must not abort the scrape.
        return None

def find_description_container(soup: BeautifulSoup) -> Optional[Tag]:
    """
    Locate the element holding the product description.

    Two strategies are tried in order:
      1. Known selectors ('#description', '#tab-description', the Magento
         '.product.attribute.* .value' blocks, ...). The first match that
         contains some text is returned as-is (it is a node of ``soup``).
      2. An h2-h6 heading whose text is "description" or starts with
         "product description". Everything following that heading, up to the
         next h2-h6 sibling, is copied into a new detached <div>, which is
         returned if it contains some text.

    The input ``soup`` is not modified: strategy 2 appends copies of the
    sibling nodes to the wrapper rather than moving them.

    Returns None when neither strategy finds a non-empty description.
    """
    import copy

    heading_tags = ("h2", "h3", "h4", "h5", "h6")

    for sel in ["#description", "#tab-description", ".product.attribute.description .value",
                ".product.attribute.overview .value", ".product.info.detailed .value"]:
        el = soup.select_one(sel)
        if el and normalize_text(el.get_text()):
            return el

    for h in soup.select("h2, h3, h4, h5, h6"):
        txt = normalize_text(h.get_text()).lower()
        if not (txt.startswith("product description") or txt == "description"):
            continue

        # Snapshot the siblings before building the wrapper. Appending a node
        # while walking next_siblings re-parents it, which can cut the
        # iteration short after the first element.
        following = []
        for sib in h.next_siblings:
            if isinstance(sib, Tag) and sib.name in heading_tags:
                break
            following.append(sib)

        wrapper = soup.new_tag("div")
        for sib in following:
            # copy.copy keeps the node type (e.g. a comment stays a comment
            # instead of becoming visible text) and leaves the page intact.
            wrapper.append(copy.copy(sib))

        if normalize_text(wrapper.get_text()):
            return wrapper
    return None