# fn_registry/python/functions/datascience/parse_amazon_ranking_html.py

"""Pure HTML parser for Amazon ranking pages (Best Sellers and Movers & Shakers).

This module holds the *pure* DOM-parsing core shared by the HTTP scraper
(``scrape_amazon_bestsellers``) and the CDP/browser scraper
(``scrape_amazon_movers_cdp``). It takes a chunk of already-fetched HTML (from
``requests`` or from a rendered ``outerHTML`` via Chrome DevTools Protocol) and
returns a list of product dicts. No I/O, no network, deterministic for a fixed
input string — so it can be unit-tested with HTML fixtures and reused by any
fetch strategy.

Amazon serves several DOM templates at once (A/B tests) and rotates them often,
so every field is parsed defensively with multiple fallback selectors. A field
that no known template exposes is returned as ``None`` rather than raising.
"""

from __future__ import annotations

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Fallback currency per marketplace domain. Consulted only when the price
# string itself carries no recognisable currency symbol.
_CURRENCY_BY_MARKET = {
    "amazon.es": "EUR",
    "amazon.com": "USD",
    "amazon.co.uk": "GBP",
    "amazon.de": "EUR",
    "amazon.fr": "EUR",
    "amazon.it": "EUR",
    "amazon.com.mx": "MXN",
    "amazon.com.br": "BRL",
}

# Currency symbols mapped to ISO codes.
#
# Lookups iterate this dict in insertion order and test ``symbol in text``, so
# the multi-character symbols MUST come before the bare "$" they contain;
# otherwise "R$ 10,00" hits "$" first and is misreported as USD.
_SYMBOL_TO_CURRENCY = {
    "R$": "BRL",
    "US$": "USD",
    "€": "EUR",
    "£": "GBP",
    "$": "USD",
}

# 10-character ASIN following /dp/ or /gp/product/, terminated by "/", "?" or
# the end of the href.
_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
# First integer in a rank badge, with an optional leading "#".
_RANK_RE = re.compile(r"#?\s*(\d+)")
# Optionally signed number made of digits, dots and commas.
_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
# Any run of digits and grouping separators (review counts).
_REVIEWS_RE = re.compile(r"[\d.,]+")
# Rating value followed by a localised "out of" connective.
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
# Number immediately followed by a percent sign.
_PCT_RE = re.compile(r"([\d.,]+)\s*%")


def _text(node) -> str:
    """Return the text of *node* with fragments stripped and space-joined.

    A missing node (``None``) yields the empty string so callers can chain
    ``_text(card.select_one(...))`` without a separate ``None`` check.
    """
    if node is None:
        return ""
    return node.get_text(" ", strip=True)


def _parse_asin(card) -> str | None:
    """Extract the product ASIN from a card element.

    Sources are consulted in this order, and the first valid one wins:
      1. the ``data-asin`` attribute on the card root,
      2. the ``data-asin`` attribute of the first descendant carrying one,
      3. the first link whose ``href`` matches the ``/dp/<ASIN>`` pattern.

    Returns ``None`` when none of them yields a 10-character ASIN.
    """

    def _is_asin(candidate) -> bool:
        # Empty / missing attributes are rejected before touching the regex.
        return bool(candidate and re.fullmatch(r"[A-Z0-9]{10}", candidate))

    on_root = card.get("data-asin")
    if _is_asin(on_root):
        return on_root

    # Some templates put data-asin on a descendant, not the card root.
    holder = card.select_one("[data-asin]")
    if holder is not None:
        nested = holder.get("data-asin")
        if _is_asin(nested):
            return nested

    link_matches = (
        _ASIN_RE.search(link["href"]) for link in card.find_all("a", href=True)
    )
    return next((hit.group(1) for hit in link_matches if hit), None)


def _parse_url(card, marketplace: str) -> str | None:
    """Build the absolute product URL for a card.

    The first link whose ``href`` looks like a product link (``/dp/<ASIN>``) is
    preferred; failing that, the first link of any kind is used. The query
    string is dropped and the result is resolved against
    ``https://www.<marketplace>``. Returns ``None`` for a card without links.
    """
    origin = f"https://www.{marketplace}"

    def _absolute(href: str) -> str:
        # Strip everything from the first "?" onwards before resolving.
        return urljoin(origin, href.split("?")[0])

    product_href = next(
        (
            link["href"]
            for link in card.find_all("a", href=True)
            if _ASIN_RE.search(link["href"])
        ),
        None,
    )
    if product_href is not None:
        return _absolute(product_href)

    # Fall back to the first link at all.
    any_link = card.find("a", href=True)
    return None if any_link is None else _absolute(any_link["href"])


def _parse_rank(card) -> int | None:
    """Read the rank badge of a card ('#1', '1', ...).

    The primary badge selectors are tried first; only when they produce no
    text are the two alternative selectors consulted, in order. The first
    integer found in the resulting text is returned, or ``None`` when no
    selector gives text containing a number.
    """
    selector_chain = (
        ".zg-bdg-text, .zg-badge-text, [class*='badge']",
        ".a-badge-text",
        "[class*='rank']",
    )
    label = ""
    for selector in selector_chain:
        label = _text(card.select_one(selector))
        if label:
            # First selector with any text wins, even if it holds no digits.
            break

    found = _RANK_RE.search(label)
    if found is None:
        return None
    return int(found.group(1))


def _parse_title(card) -> str | None:
    """Return the product title, trying several template generations.

    For each selector only the first matching node is inspected:
      * an ``<img>`` contributes its stripped ``alt`` text,
      * any other node contributes its non-blank ``title`` attribute, or
        otherwise its visible text.

    A node that yields nothing falls through to the next selector. Returns
    ``None`` when every selector comes up empty.
    """
    title_selectors = (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    )
    for selector in title_selectors:
        hit = card.select_one(selector)
        if hit is None:
            continue

        if hit.name == "img":
            alt_text = hit.get("alt")
            if alt_text:
                return alt_text.strip()
        elif hit.has_attr("title") and hit["title"].strip():
            # The title attribute holds the untruncated name.
            return hit["title"].strip()
        else:
            visible = _text(hit)
            if visible:
                return visible
    return None


def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Price value (float) and ISO currency, best-effort across templates.

    Selectors are tried from most to least specific; the first one whose text
    contains a parseable number wins.

    Currency resolution:
      * Symbols are matched longest-first, so ``"R$"`` / ``"US$"`` are never
        shadowed by the bare ``"$"`` they contain.
      * A bare ``"$"`` is ambiguous. When the marketplace's fallback currency
        is another "$"-denominated one (e.g. ``amazon.com.mx`` -> ``MXN``),
        that currency is reported instead of ``USD``.
      * With no recognisable symbol the marketplace fallback is used, which is
        ``None`` for a marketplace missing from ``_CURRENCY_BY_MARKET``.

    Args:
        card: Card element exposing ``select_one``.
        marketplace: Amazon domain, e.g. ``"amazon.es"``.

    Returns:
        ``(value, currency)``, or ``(None, None)`` when no selector yields a
        number.
    """
    # Non-USD currencies that are printed with a plain "$".
    dollar_sign_currencies = {"MXN", "CAD", "AUD"}
    market_currency = _CURRENCY_BY_MARKET.get(marketplace)
    # sorted() is stable, so equally long symbols keep their dict order.
    symbols_longest_first = sorted(
        _SYMBOL_TO_CURRENCY.items(), key=lambda pair: len(pair[0]), reverse=True
    )

    for sel in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue

        # Some locales group thousands with a (narrow) no-break space
        # ("1 299,00 €"). Remove it between a digit and a 3-digit group so the
        # numeric regex does not stop after the first group.
        numeric_txt = re.sub(r"(?<=\d)[\u00a0\u202f](?=\d{3}(?!\d))", "", txt)
        m = _PRICE_NUM_RE.search(numeric_txt)
        if not m:
            continue
        value = _to_float(m.group(0))
        if value is None:
            continue

        symbol, currency = next(
            ((sym, iso) for sym, iso in symbols_longest_first if sym in txt),
            (None, None),
        )
        if currency is None or (
            symbol == "$" and market_currency in dollar_sign_currencies
        ):
            currency = market_currency
        return value, currency
    return None, None


def _parse_rating(card) -> float | None:
    """Star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'.

    For each selector the first matching node is inspected. Its visible text,
    its ``title`` attribute and its ``aria-label`` attribute are each tried in
    that order, so a node whose text is unrelated (e.g. only a review count)
    no longer hides a rating stored in one of the attributes.

    A source is accepted when it matches the localised "<n> out of" pattern,
    or when it contains a number together with the word "star"/"estrella".
    A match that cannot be converted to a float is skipped rather than ending
    the search.

    Returns:
        The rating as a float, or ``None`` when no source yields one.
    """
    for sel in ("[class*='review-stars']", ".a-icon-alt", "[title*='star']", "[aria-label*='star']"):
        node = card.select_one(sel)
        if node is None:
            continue

        for txt in (_text(node), node.get("title", ""), node.get("aria-label", "")):
            if not txt:
                continue

            m = _RATING_RE.search(txt)
            if m:
                value = _to_float(m.group(1))
                if value is not None:
                    return value
                continue

            # Some templates only render the number ('4,5').
            m2 = _PRICE_NUM_RE.search(txt)
            lowered = txt.lower()
            if m2 and ("star" in lowered or "estrella" in lowered):
                value = _to_float(m2.group(0))
                if value is not None:
                    return value
    return None


def _parse_reviews(card) -> int | None:
    """Number of ratings/reviews shown next to the stars.

    Every node matched by each selector is scanned, in selector order. Within
    a node, the first token that still contains digits after removing the
    grouping separators ("." and ",") is returned as an int. Tokens made only
    of separators (e.g. a leading "," or "...") are skipped instead of
    discarding the whole node.

    Note: no plausibility check is applied, so any number appearing first in a
    matched node is taken as the count.

    Returns:
        The count as an int, or ``None`` when no matched node holds a number.
    """
    for sel in (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    ):
        for node in card.select(sel):
            for m in _REVIEWS_RE.finditer(_text(node)):
                digits = m.group(0).replace(".", "").replace(",", "")
                # Empty when the token consisted solely of separators.
                if digits.isdigit():
                    return int(digits)
    return None


def _parse_pct_change(card) -> float | None:
    """Movers & Shakers percentage change ('+150%').

    Targets the sales-rank-gain badge specific to the movers grid, NOT the
    generic discount/savings percent (``apex-savings-percent``) that appears on
    bestseller/deal cards — matching those would report a bogus pct_change.

    Number handling:
      * A value shaped like ``1,234`` or ``1.234`` (groups of exactly three
        digits, no leading zero) is read as a grouped integer, i.e. 1234. A
        percentage with exactly three decimals is implausible here, whereas a
        four-digit gain is not.
      * Anything else goes through ``_to_float``.

    Sign handling: the value is negated when a hyphen-minus or a Unicode minus
    sign (U+2212) sits directly before the number, or when the whole text
    starts with "-".

    Returns:
        The signed percentage as a float, or ``None`` when no selector yields
        a parseable percentage.
    """
    for sel in (
        ".zg-percent-change",
        "[class*='sales-movement']",
        "[class*='percent-change']",
        "[class*='zg_percent']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue
        m = _PCT_RE.search(txt)
        if not m:
            continue

        raw = m.group(1)
        if re.fullmatch(r"[1-9]\d{0,2}(?:[.,]\d{3})+", raw):
            # Thousands grouping: drop the separators entirely.
            value = float(raw.replace(".", "").replace(",", ""))
        else:
            value = _to_float(raw)
        if value is None:
            continue

        # Only a minus glued to the number counts; "- 150%" with a space is
        # more likely a separator dash than a sign.
        before_number = txt[: m.start(1)]
        negative = before_number.endswith(("-", "\u2212")) or txt.strip().startswith("-")
        return -value if negative else value
    return None


def _to_float(raw: str) -> float | None:
    """Parse a numeric string with EU or US decimal/grouping conventions.

    Rules, applied after removing spaces (plain, no-break and narrow
    no-break) and any trailing "." / "," left over by a greedy regex:

      * Both "," and "." present: the rightmost one is the decimal separator,
        the other is grouping (``"1.299,00"`` and ``"1,299.00"`` -> 1299.0).
      * One separator repeated with 3-digit groups: it is grouping
        (``"1,234,567"`` and ``"1.234.567"`` -> 1234567.0).
      * A single comma: decimal separator (``"4,5"`` -> 4.5).
      * A single dot: decimal separator (``"4.5"`` -> 4.5).

    Args:
        raw: The numeric text; ``None`` is tolerated.

    Returns:
        The parsed float, or ``None`` for ``None``, empty or malformed input.
    """
    if raw is None:
        return None
    s = raw.strip().replace("\xa0", "").replace("\u202f", "").replace(" ", "")
    # "19,99," (number followed by punctuation) should still parse as 19,99.
    s = s.rstrip(".,")
    if not s:
        return None

    has_comma = "," in s
    has_dot = "." in s
    if has_comma and has_dot:
        # The rightmost separator is the decimal one.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif has_comma or has_dot:
        sep = "," if has_comma else "."
        head, *groups = s.split(sep)
        repeated_grouping = (
            len(groups) > 1
            and head.lstrip("+-").isdigit()
            and all(len(group) == 3 and group.isdigit() for group in groups)
        )
        if repeated_grouping:
            # A separator occurring more than once cannot be a decimal point.
            s = head + "".join(groups)
        elif has_comma:
            # Treat a single comma as decimal separator (EU markets).
            s = s.replace(",", ".")

    try:
        return float(s)
    except ValueError:
        return None


def _select_cards(soup: BeautifulSoup) -> list:
    """Find the product cards, whichever known Amazon template is in use.

    Root selectors are tried in priority order and the matches of the first
    one that finds anything are returned; later selectors are not evaluated.
    The grid *wrapper* (``gridItemRoot``) comes before the inner faceout
    because the rank badge (``span.zg-bdg-text``) is a sibling of the faceout
    *inside* the wrapper — taking the wrapper keeps rank and product data in
    the same card. Returns ``[]`` when no selector matches.
    """
    root_selectors = (
        'div[id="gridItemRoot"]',
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div.p13n-sc-uncoverable-faceout",
        "div[data-asin]",
    )
    # Lazy generator: each selector is only run if all earlier ones were empty.
    match_sets = (soup.select(selector) for selector in root_selectors)
    return next((found for found in match_sets if found), [])


def parse_amazon_ranking_html(
    html: str,
    marketplace: str = "amazon.es",
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Turn the HTML of an Amazon ranking page into product dicts (pure).

    No I/O is performed: the same HTML string always produces the same list.
    Shared by the HTTP scraper (``scrape_amazon_bestsellers``) and the CDP
    scraper (``scrape_amazon_movers_cdp``).

    Args:
        html: Raw HTML of a ranking page, or the rendered ``outerHTML`` of the
            grid container. Either the whole document or just the grid.
        marketplace: Amazon domain such as ``"amazon.es"`` or ``"amazon.com"``.
            Used to build absolute product URLs and to infer the fallback
            currency.
        list_type: ``"bestsellers"`` or ``"movers_shakers"``. Its only effect is
            that ``pct_change`` is parsed for movers and ``None`` otherwise.
        max_items: Upper bound on the number of products returned.

    Returns:
        One dict per product with exactly these keys: ``marketplace,
        list_type, category, rank, asin, title, price, currency, rating,
        reviews, pct_change, url``. Missing values are ``None``.
        ``price``/``rating``/``pct_change`` are floats, ``rank``/``reviews``
        are ints. ``category`` is always ``None`` — the caller, which knows
        the URL, fills it in. Cards with neither an ASIN nor a title are
        skipped. When a card has no rank badge its 1-based position among all
        located cards is used. Empty or card-less HTML gives ``[]``.
    """
    if not html:
        return []

    wants_pct = list_type == "movers_shakers"
    cards = _select_cards(BeautifulSoup(html, "lxml"))

    products: list[dict] = []
    for position, card in enumerate(cards, start=1):
        if len(products) >= max_items:
            break

        asin = _parse_asin(card)
        title = _parse_title(card)
        if asin is None and title is None:
            # Empty / non-product wrapper.
            continue

        badge_rank = _parse_rank(card)
        # Positional fallback when no badge is rendered.
        rank = position if badge_rank is None else badge_rank

        price, currency = _parse_price(card, marketplace)
        rating = _parse_rating(card)
        reviews = _parse_reviews(card)
        pct_change = _parse_pct_change(card) if wants_pct else None

        products.append(
            {
                "marketplace": marketplace,
                "list_type": list_type,
                "category": None,
                "rank": rank,
                "asin": asin,
                "title": title,
                "price": price,
                "currency": currency,
                "rating": rating,
                "reviews": reviews,
                "pct_change": pct_change,
                "url": _parse_url(card, marketplace),
            }
        )

    return products