"""Pure HTML parser for Amazon ranking pages (Best Sellers and Movers & Shakers).

This module holds the *pure* DOM-parsing core shared by the HTTP scraper
(``scrape_amazon_bestsellers``) and the CDP/browser scraper
(``scrape_amazon_movers_cdp``). It takes a chunk of already-fetched HTML (from
``requests`` or from a rendered ``outerHTML`` via Chrome DevTools Protocol) and
returns a list of product dicts. No I/O, no network, deterministic for a fixed
input string — so it can be unit-tested with HTML fixtures and reused by any
fetch strategy.

Amazon serves several DOM templates at once (A/B tests) and rotates them often,
so every field is parsed defensively with multiple fallback selectors. A field
that no known template exposes is returned as ``None`` rather than raising.
"""

from __future__ import annotations

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Currency implied by the marketplace domain. Consulted when the price string's
# own symbol does not settle the currency.
_CURRENCY_BY_MARKET = {
    "amazon.es": "EUR",
    "amazon.com": "USD",
    "amazon.co.uk": "GBP",
    "amazon.de": "EUR",
    "amazon.fr": "EUR",
    "amazon.it": "EUR",
    "amazon.com.mx": "MXN",
    "amazon.com.br": "BRL",
}

# Map common currency symbols to ISO codes.
_SYMBOL_TO_CURRENCY = {
    "€": "EUR",
    "$": "USD",
    "£": "GBP",
    "R$": "BRL",
    "US$": "USD",
}

# Symbols ordered longest-first, so "US$" and "R$" are tried before the bare
# "$" that both of them contain as a substring.
_SYMBOLS_LONGEST_FIRST = tuple(sorted(_SYMBOL_TO_CURRENCY, key=len, reverse=True))

# Marketplace currencies that are themselves written with a bare "$"; for these
# the marketplace, not the symbol, decides the ISO code.
_DOLLAR_SIGN_CURRENCIES = frozenset({"USD", "MXN"})

_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
_ASIN_VALUE_RE = re.compile(r"[A-Z0-9]{10}")
_RANK_RE = re.compile(r"#?\s*(\d+)")
_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
_REVIEWS_RE = re.compile(r"[\d.,]+")
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
_PCT_RE = re.compile(r"([\d.,]+)\s*%")
# An integer with at least two 3-digit groups split by the SAME separator
# ("1,234,567" or "1.234.567"): the separator can only be a grouping mark.
_GROUPED_INT_RE = re.compile(r"[-+]?\d{1,3}([.,])\d{3}(?:\1\d{3})+")


def _text(node) -> str:
    """Return the whitespace-normalised text of ``node``, or ``""`` for ``None``."""
    return node.get_text(" ", strip=True) if node is not None else ""


def _is_asin(value) -> bool:
    """Return True when ``value`` is a non-empty, well-formed 10-char ASIN."""
    return bool(value) and _ASIN_VALUE_RE.fullmatch(value) is not None


def _parse_asin(card) -> str | None:
    """Extract the ASIN of a card.

    Tries, in order: the card's own ``data-asin`` attribute, the first
    descendant carrying ``data-asin``, and finally any ``/dp/<ASIN>`` or
    ``/gp/product/<ASIN>`` link inside the card.

    Returns:
        The 10-character ASIN, or ``None`` when none of the sources yields one.
    """
    asin = card.get("data-asin")
    if _is_asin(asin):
        return asin
    # Some templates put data-asin on a descendant, not the card root.
    inner = card.select_one("[data-asin]")
    if inner is not None:
        inner_asin = inner.get("data-asin")
        if _is_asin(inner_asin):
            return inner_asin
    for link in card.find_all("a", href=True):
        match = _ASIN_RE.search(link["href"])
        if match:
            return match.group(1)
    return None


def _parse_url(card, marketplace: str) -> str | None:
    """Build the absolute product URL for a card.

    Prefers the first link whose href contains a product path with an ASIN;
    otherwise falls back to the first link of any kind. The query string is
    dropped and the href is resolved against ``https://www.<marketplace>``.

    Returns:
        The absolute URL, or ``None`` when the card contains no link.
    """
    base = f"https://www.{marketplace}"
    links = card.find_all("a", href=True)
    for link in links:
        if _ASIN_RE.search(link["href"]):
            return urljoin(base, link["href"].split("?")[0])
    # Fall back to the first link at all.
    if links:
        return urljoin(base, links[0]["href"].split("?")[0])
    return None


def _parse_rank(card) -> int | None:
    """Read the rank badge of a card (rendered as '#1', '1', etc.).

    Returns:
        The first integer found in the badge text, or ``None`` when no badge
        text is found or it contains no digits.
    """
    badge = card.select_one(".zg-bdg-text, .zg-badge-text, [class*='badge']")
    txt = _text(badge)
    if not txt:
        # Other templates expose the rank under different class names.
        for sel in (".a-badge-text", "[class*='rank']"):
            txt = _text(card.select_one(sel))
            if txt:
                break
    match = _RANK_RE.search(txt)
    return int(match.group(1)) if match else None


def _parse_title(card) -> str | None:
    """Extract the product title, trying several known templates in order.

    For ``<img>`` matches the ``alt`` text is used; for other nodes a
    non-blank ``title`` attribute wins over the node text.

    Returns:
        The stripped title, or ``None`` when no selector yields any text.
    """
    for sel in (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    ):
        node = card.select_one(sel)
        if node is None:
            continue
        if node.name == "img":
            alt = node.get("alt")
            if alt:
                return alt.strip()
            continue
        if node.has_attr("title") and node["title"].strip():
            return node["title"].strip()
        txt = _text(node)
        if txt:
            return txt
    return None


def _detect_currency(txt: str, marketplace: str) -> str | None:
    """Infer the ISO currency code of a price string.

    Symbols are matched longest-first so that ``"R$"`` and ``"US$"`` are not
    mistaken for the bare ``"$"`` they contain. A bare ``"$"`` is ambiguous:
    when the marketplace's own currency is one written with ``"$"`` (e.g. MXN
    on amazon.com.mx) that currency is used, otherwise USD. When no known
    symbol is present the marketplace currency is the fallback.

    Returns:
        The ISO code, or ``None`` when there is neither a known symbol nor a
        known marketplace.
    """
    for sym in _SYMBOLS_LONGEST_FIRST:
        if sym not in txt:
            continue
        if sym == "$":
            market_iso = _CURRENCY_BY_MARKET.get(marketplace)
            if market_iso in _DOLLAR_SIGN_CURRENCIES:
                return market_iso
        return _SYMBOL_TO_CURRENCY[sym]
    return _CURRENCY_BY_MARKET.get(marketplace)


def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Extract the price value and ISO currency, best-effort across templates.

    Each selector is tried in order; the first one whose text contains a
    parseable number wins.

    Returns:
        ``(value, currency)``; ``(None, None)`` when no selector yields a
        parseable price.
    """
    for sel in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue
        match = _PRICE_NUM_RE.search(txt)
        if not match:
            continue
        value = _to_float(match.group(0))
        if value is not None:
            return value, _detect_currency(txt, marketplace)
    return None, None


def _parse_rating(card) -> float | None:
    """Extract the star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'.

    The rating text is taken from the node text, else its ``title``, else its
    ``aria-label``. A selector whose text cannot be parsed is skipped so later
    selectors still get a chance.

    Returns:
        The rating as a float, or ``None`` when no selector yields one.
    """
    for sel in (
        "[class*='review-stars']",
        ".a-icon-alt",
        "[title*='star']",
        "[aria-label*='star']",
    ):
        node = card.select_one(sel)
        if node is None:
            continue
        txt = _text(node) or node.get("title", "") or node.get("aria-label", "")
        if not txt:
            continue
        match = _RATING_RE.search(txt)
        if match:
            value = _to_float(match.group(1))
            if value is not None:
                return value
            continue
        # Some templates only render the number ('4,5'); accept it only when
        # the text still mentions stars, to avoid picking up unrelated numbers.
        bare = _PRICE_NUM_RE.search(txt)
        lowered = txt.lower()
        if bare and ("star" in lowered or "estrella" in lowered):
            value = _to_float(bare.group(0))
            if value is not None:
                return value
    return None


def _parse_reviews(card) -> int | None:
    """Extract the number of ratings/reviews shown next to the stars.

    Grouping separators (``.`` and ``,``) are stripped from the first numeric
    token of each candidate node.

    Returns:
        The count as an int, or ``None`` when no candidate holds digits.
    """
    for sel in (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    ):
        for node in card.select(sel):
            match = _REVIEWS_RE.search(_text(node))
            if not match:
                continue
            digits = match.group(0).replace(".", "").replace(",", "")
            # A token made only of separators leaves nothing behind; skip it.
            if digits.isdigit():
                return int(digits)
    return None


def _parse_pct_change(card) -> float | None:
    """Extract the Movers & Shakers percentage change ('+150%').

    Targets the sales-rank-gain badge specific to the movers grid, NOT the
    generic discount/savings percent (``apex-savings-percent``) that appears on
    bestseller/deal cards — matching those would report a bogus pct_change.

    Returns:
        The percentage as a float, negative when the badge text starts with a
        minus sign (ASCII ``-`` or Unicode ``\u2212``); ``None`` when absent.
    """
    for sel in (
        ".zg-percent-change",
        "[class*='sales-movement']",
        "[class*='percent-change']",
        "[class*='zg_percent']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue
        match = _PCT_RE.search(txt)
        if not match:
            continue
        value = _to_float(match.group(1))
        if value is None:
            continue
        return -value if txt.startswith(("-", "\u2212")) else value
    return None


def _to_float(raw: str) -> float | None:
    """Parse a numeric string with EU or US decimal/grouping conventions.

    Rules, in order:
      * repeated identical separators with 3-digit groups ("1,234,567",
        "1.234.567") are grouping marks and are removed;
      * when both ``.`` and ``,`` appear, the rightmost one is the decimal mark;
      * a lone ``,`` is treated as the decimal mark (EU markets).

    Returns:
        The parsed float, or ``None`` for ``None``, blank or unparseable input.
    """
    if raw is None:
        return None
    s = raw.strip().replace("\xa0", "").replace(" ", "")
    if not s:
        return None
    if _GROUPED_INT_RE.fullmatch(s):
        s = s.replace(".", "").replace(",", "")
    elif "," in s and "." in s:
        # The rightmost separator is the decimal one.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s:
        # Treat a single comma as decimal separator (EU markets).
        s = s.replace(",", ".")
    try:
        return float(s)
    except ValueError:
        return None


def _select_cards(soup: BeautifulSoup) -> list:
    """Locate the list-item cards across known Amazon templates.

    Prefers the grid *wrapper* (``gridItemRoot``) over the inner faceout: the
    rank badge (``span.zg-bdg-text``) is a sibling of the faceout *inside* the
    wrapper, so selecting the wrapper keeps both rank and product data in the
    same card. Older / alternative templates fall back to their own roots.

    Returns:
        The matches of the first selector that matches anything, else ``[]``.
    """
    selectors = (
        'div[id="gridItemRoot"]',
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div.p13n-sc-uncoverable-faceout",
        "div[data-asin]",
    )
    for sel in selectors:
        cards = soup.select(sel)
        if cards:
            return cards
    return []


def parse_amazon_ranking_html(
    html: str,
    marketplace: str = "amazon.es",
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Parse Amazon ranking HTML into a list of product dicts (pure).

    Pure function: given a fixed HTML string it always returns the same list,
    with no I/O. Used by both the HTTP scraper (``scrape_amazon_bestsellers``)
    and the CDP scraper (``scrape_amazon_movers_cdp``).

    Args:
        html: Raw HTML of an Amazon ranking page (or the rendered ``outerHTML``
            of the grid container). May be the whole document or just the grid.
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
            Used to build absolute product URLs and to infer the currency.
        list_type: ``"bestsellers"`` or ``"movers_shakers"``. Only affects
            whether ``pct_change`` is parsed (movers) or forced to ``None``.
        max_items: Maximum number of products returned.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. ``category`` is always ``None`` here —
        the caller (which knows the URL) fills it in. Returns ``[]`` for empty
        or card-less HTML (never raises on missing fields).
    """
    if not html:
        return []

    soup = BeautifulSoup(html, "lxml")
    is_movers = list_type == "movers_shakers"
    results: list[dict] = []

    for idx, card in enumerate(_select_cards(soup)):
        if len(results) >= max_items:
            break

        asin = _parse_asin(card)
        title = _parse_title(card)
        # Skip empty / non-product wrappers.
        if asin is None and title is None:
            continue

        rank = _parse_rank(card)
        if rank is None:
            rank = idx + 1  # positional fallback when no badge is rendered

        price, currency = _parse_price(card, marketplace)
        results.append(
            {
                "marketplace": marketplace,
                "list_type": list_type,
                "category": None,
                "rank": rank,
                "asin": asin,
                "title": title,
                "price": price,
                "currency": currency,
                "rating": _parse_rating(card),
                "reviews": _parse_reviews(card),
                "pct_change": _parse_pct_change(card) if is_movers else None,
                "url": _parse_url(card, marketplace),
            }
        )
    return results