"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.""" from __future__ import annotations import re import time from urllib.parse import urljoin import requests from bs4 import BeautifulSoup # Accept-Language hint per marketplace TLD. Falls back to a generic value. _ACCEPT_LANGUAGE = { "amazon.es": "es-ES,es;q=0.9,en;q=0.6", "amazon.com": "en-US,en;q=0.9", "amazon.co.uk": "en-GB,en;q=0.9", "amazon.de": "de-DE,de;q=0.9,en;q=0.6", "amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6", "amazon.it": "it-IT,it;q=0.9,en;q=0.6", "amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6", "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6", } # Currency guessed from the marketplace TLD (used only as a fallback when the # price string has no recognisable symbol). _CURRENCY_BY_MARKET = { "amazon.es": "EUR", "amazon.com": "USD", "amazon.co.uk": "GBP", "amazon.de": "EUR", "amazon.fr": "EUR", "amazon.it": "EUR", "amazon.com.mx": "MXN", "amazon.com.br": "BRL", } # Map common currency symbols to ISO codes. _SYMBOL_TO_CURRENCY = { "€": "EUR", "$": "USD", "£": "GBP", "R$": "BRL", "US$": "USD", } _USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" ) # Signals that Amazon served an anti-bot / captcha / throttling page instead of # the ranking content. 
_BLOCK_MARKERS = (
    "api-services-support@amazon",
    "captcha",
    "to discuss automated access",
    "enter the characters you see below",
    "robot check",
)

# Currencies that are printed with a bare "$" sign. When a price string only
# carries "$", the marketplace decides which of these is actually meant
# (e.g. "$199.00" on amazon.com.mx is MXN, not USD).
_DOLLAR_SIGN_CURRENCIES = frozenset({"USD", "MXN", "CAD", "AUD"})


def _build_headers(marketplace: str) -> dict:
    """Return browser-like request headers for ``marketplace``.

    ``Accept-Language`` is looked up in ``_ACCEPT_LANGUAGE``; marketplaces
    missing from that table get a generic US-English value.
    """
    return {
        "User-Agent": _USER_AGENT,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }


def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
    """Compose the ranking URL for a marketplace / list type / category slug.

    Any ``list_type`` other than ``"movers_shakers"`` maps to the Best Sellers
    path. Leading and trailing slashes of ``category`` are dropped; a falsy
    ``category`` yields the list's front page.
    """
    base = "movers-and-shakers" if list_type == "movers_shakers" else "bestsellers"
    url = f"https://www.{marketplace}/gp/{base}"
    if category:
        url = f"{url}/{category.strip('/')}"
    return url


def _looks_blocked(status_code: int, html: str) -> bool:
    """Heuristic: did Amazon serve an anti-bot / throttling page?

    True for HTTP 429/503, or when the lower-cased body contains any of the
    ``_BLOCK_MARKERS`` substrings.
    """
    if status_code in (429, 503):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in _BLOCK_MARKERS)


def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
    """GET ``url``, retrying with a linearly growing pause between attempts.

    Args:
        url: Page to download.
        headers: Request headers to send.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        retries: Extra attempts after the first one (total = ``retries + 1``).

    Returns:
        The first response that has status 200 and does not look blocked.

    Raises:
        RuntimeError: On a persistent network error, anti-bot page or
            non-200 status once all attempts are used up.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        can_retry = attempt < retries
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:  # network / timeout
            last_exc = exc
            if can_retry:
                time.sleep(1.5 * (attempt + 1))
                continue
            raise RuntimeError(f"request to {url} failed: {exc}") from exc

        if _looks_blocked(resp.status_code, resp.text):
            if can_retry:
                # Blocks get a slightly longer pause than plain failures.
                time.sleep(2.0 * (attempt + 1))
                continue
            raise RuntimeError(
                f"Amazon anti-bot block on {url} (HTTP {resp.status_code}). "
                "HTTP scraping is being throttled/captcha'd; fall back to the "
                "browser MCP/CDP path of the ecosystem."
            )

        if resp.status_code != 200:
            last_exc = RuntimeError(
                f"unexpected HTTP {resp.status_code} for {url}"
            )
            if can_retry:
                time.sleep(1.5 * (attempt + 1))
                continue
            raise last_exc

        return resp

    # Only reachable when the loop body never ran (negative ``retries``).
    raise RuntimeError(f"could not fetch {url}: {last_exc}")


# A space (plain, no-break or narrow no-break) counts as a thousands separator
# only when exactly three digits follow it, e.g. "1 299,00".
_GROUP_SPACE = r"[ \u00a0\u202f](?=\d{3}(?!\d))"

_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
_ASIN_VALUE_RE = re.compile(r"[A-Z0-9]{10}")
_RANK_RE = re.compile(r"#?\s*(\d+)")
_PRICE_NUM_RE = re.compile(r"[-+]?\d(?:[\d.,]|" + _GROUP_SPACE + r")*")
# Must start on a digit so stray punctuation is never taken for the count.
_REVIEWS_RE = re.compile(r"\d(?:[\d.,]|" + _GROUP_SPACE + r")*")
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
_PCT_RE = re.compile(r"([\d.,]+)\s*%")


def _text(node) -> str:
    """Whitespace-normalised text of ``node``; empty string for ``None``."""
    return node.get_text(" ", strip=True) if node is not None else ""


def _parse_asin(card) -> str | None:
    """ASIN from the card's ``data-asin`` attribute or a product link in it."""
    asin = card.get("data-asin")
    if asin and _ASIN_VALUE_RE.fullmatch(asin):
        return asin
    for a in card.find_all("a", href=True):
        m = _ASIN_RE.search(a["href"])
        if m:
            return m.group(1)
    return None


def _parse_url(card, marketplace: str) -> str | None:
    """Absolute product URL (query string removed) from the card.

    Prefers the first link whose href carries an ASIN; otherwise falls back
    to the first link of any kind. Returns ``None`` when the card has no link.
    """
    base = f"https://www.{marketplace}"
    for a in card.find_all("a", href=True):
        if _ASIN_RE.search(a["href"]):
            return urljoin(base, a["href"].split("?")[0])
    # Fall back to the first link at all.
    first = card.find("a", href=True)
    if first is not None:
        return urljoin(base, first["href"].split("?")[0])
    return None


def _parse_rank(card) -> int | None:
    """Rank badge as an int. Amazon renders it as '#1', '1', etc."""
    badge = card.select_one(".zg-bdg-text, .zg-badge-text, [class*='badge']")
    txt = _text(badge)
    if not txt:
        # Alternative templates keep the rank in other badge/rank classes.
        for sel in (".a-badge-text", "[class*='rank']"):
            txt = _text(card.select_one(sel))
            if txt:
                break
    m = _RANK_RE.search(txt)
    return int(m.group(1)) if m else None


def _parse_title(card) -> str | None:
    """Product title, trying the selectors of several page templates in order.

    For ``img`` nodes the ``alt`` text is used; for other nodes a non-blank
    ``title`` attribute wins over the node text.
    """
    for sel in (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    ):
        node = card.select_one(sel)
        if node is None:
            continue
        if node.name == "img":
            alt = node.get("alt")
            if alt:
                return alt.strip()
            continue
        if node.has_attr("title") and node["title"].strip():
            return node["title"].strip()
        txt = _text(node)
        if txt:
            return txt
    return None


def _detect_currency(txt: str, marketplace: str) -> str | None:
    """ISO currency for a price string, falling back to the marketplace.

    Symbols are tested longest first so "R$" / "US$" are not shadowed by the
    bare "$" they contain. A bare "$" is ambiguous, so when the marketplace's
    own currency is one of ``_DOLLAR_SIGN_CURRENCIES`` that currency is used
    instead of assuming USD.
    """
    market_currency = _CURRENCY_BY_MARKET.get(marketplace)
    for sym in sorted(_SYMBOL_TO_CURRENCY, key=len, reverse=True):
        if sym not in txt:
            continue
        if sym == "$" and market_currency in _DOLLAR_SIGN_CURRENCIES:
            return market_currency
        return _SYMBOL_TO_CURRENCY[sym]
    return market_currency


def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Price value (float) and ISO currency, best-effort across templates.

    Returns ``(None, None)`` when no selector yields a parseable number.
    """
    for sel in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue
        m = _PRICE_NUM_RE.search(txt)
        if not m:
            continue
        value = _to_float(m.group(0))
        if value is not None:
            return value, _detect_currency(txt, marketplace)
    return None, None


def _parse_rating(card) -> float | None:
    """Star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'."""
    for sel in (
        "[class*='review-stars']",
        ".a-icon-alt",
        "[title*='star']",
        "[aria-label*='star']",
    ):
        node = card.select_one(sel)
        if node is None:
            continue
        # Visible text first, then the attributes that may carry the label.
        txt = _text(node) or node.get("title", "") or node.get("aria-label", "")
        if not txt:
            continue
        m = _RATING_RE.search(txt)
        if m:
            return _to_float(m.group(1))
        # Some templates only render the number ('4,5').
        m2 = _PRICE_NUM_RE.search(txt)
        if m2 and ("star" in txt.lower() or "estrella" in txt.lower()):
            return _to_float(m2.group(0))
    return None


def _parse_reviews(card) -> int | None:
    """Number of ratings/reviews shown next to the stars.

    The first numeric token of the first matching node wins; grouping
    separators ('.', ',' and spaces) are dropped before conversion.
    """
    for sel in (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    ):
        for node in card.select(sel):
            m = _REVIEWS_RE.search(_text(node))
            if not m:
                continue
            digits = "".join(ch for ch in m.group(0) if ch.isdigit())
            if digits:
                return int(digits)
    return None


def _parse_pct_change(card) -> float | None:
    """Movers & Shakers percentage change ('+150%').

    The value is negated when the node text starts with a minus sign
    (ASCII '-' or the typographic U+2212).
    """
    for sel in (".zg-percent-change", "[class*='percent']", "[class*='sales-movement']"):
        txt = _text(card.select_one(sel))
        if not txt:
            continue
        m = _PCT_RE.search(txt)
        if not m:
            continue
        value = _to_float(m.group(1))
        if value is None:
            continue
        return -value if txt.strip().startswith(("-", "\u2212")) else value
    return None


def _to_float(raw: str) -> float | None:
    """Parse a numeric string with EU or US decimal/grouping conventions.

    Rules, applied after removing spaces and trailing '.'/',' punctuation:

    * both ',' and '.' present: the rightmost one is the decimal separator;
    * one separator kind repeated ('1,234,567'): it is a grouping separator;
    * a single ',': decimal separator (EU markets);
    * a single '.': decimal point.

    Returns ``None`` for ``None``, empty or unparseable input.
    """
    if raw is None:
        return None
    s = raw.strip().replace("\xa0", "").replace("\u202f", "").replace(" ", "")
    # Regex captures can drag along sentence punctuation ("19,99."), which
    # would otherwise be mistaken for the decimal separator.
    s = s.rstrip(".,")
    if not s:
        return None
    commas = s.count(",")
    dots = s.count(".")
    if commas and dots:
        # The rightmost separator is the decimal one.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif commas > 1 or dots > 1:
        # A decimal separator occurs at most once, so repeats are grouping.
        s = s.replace(",", "").replace(".", "")
    elif commas:
        # Treat a single comma as decimal separator (EU markets).
        s = s.replace(",", ".")
    try:
        return float(s)
    except ValueError:
        return None


def _select_cards(soup: BeautifulSoup) -> list:
    """Locate the list-item cards across known Amazon templates.

    Selectors are tried from most to least specific; the matches of the first
    selector that finds anything are returned, else an empty list.
    """
    selectors = (
        "div.p13n-sc-uncoverable-faceout",
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div[data-asin]",
    )
    for sel in selectors:
        cards = soup.select(sel)
        if cards:
            return cards
    return []


def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | None = None,
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Scrape Amazon Best Sellers / Movers & Shakers ranking pages.

    Captures demand signals (rank, title, price, rating, reviews and — for
    Movers & Shakers — percentage change) from one or more category ranking
    pages of a given Amazon marketplace.

    Args:
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
        categories: Category slugs (e.g. ``"electronics"``, ``"videogames"``).
            If ``None`` the general front page of the chosen list is scraped.
        list_type: ``"bestsellers"`` (URL ``/gp/bestsellers/``) or
            ``"movers_shakers"`` (URL ``/gp/movers-and-shakers/``).
        max_items: Maximum number of items collected per category.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. When a card shows no rank badge, its
        1-based position among the collected products is used as the rank.

    Raises:
        ValueError: If ``list_type`` is not one of the allowed values.
        RuntimeError: On network failure or when Amazon serves an anti-bot /
            captcha / throttling page.
    """
    if list_type not in ("bestsellers", "movers_shakers"):
        raise ValueError(
            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
        )

    cats: list[str | None] = list(categories) if categories else [None]
    headers = _build_headers(marketplace)
    results: list[dict] = []

    for category in cats:
        url = _build_url(marketplace, list_type, category)
        resp = _fetch(url, headers, timeout=20, retries=2)
        soup = BeautifulSoup(resp.text, "lxml")

        collected = 0
        for card in _select_cards(soup):
            if collected >= max_items:
                break
            asin = _parse_asin(card)
            title = _parse_title(card)
            # Skip empty / non-product wrappers.
            if asin is None and title is None:
                continue
            rank = _parse_rank(card)
            if rank is None:
                # Positional fallback when no badge is rendered. Counts only
                # real products so skipped wrappers do not inflate the rank.
                rank = collected + 1
            price, currency = _parse_price(card, marketplace)
            results.append(
                {
                    "marketplace": marketplace,
                    "list_type": list_type,
                    "category": category,
                    "rank": rank,
                    "asin": asin,
                    "title": title,
                    "price": price,
                    "currency": currency,
                    "rating": _parse_rating(card),
                    "reviews": _parse_reviews(card),
                    "pct_change": (
                        _parse_pct_change(card)
                        if list_type == "movers_shakers"
                        else None
                    ),
                    "url": _parse_url(card, marketplace),
                }
            )
            collected += 1

    return results