"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.

HTTP fetch strategy: fetches each ranking page with ``requests`` (browser-ish
headers + retry/backoff) and delegates DOM parsing to the pure, reusable
``parse_amazon_ranking_html`` function of the registry — so the HTTP scraper
and the CDP scraper (``scrape_amazon_movers_cdp``) share one robust parser.
"""

from __future__ import annotations

import os
import sys
import time

import requests

# Put the parent of this file's directory at the front of the import path
# before importing ``datascience`` (presumably the package lives one level
# up — verify against the repository layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html

# Accept-Language hint keyed by marketplace domain. Marketplaces that are not
# listed here fall back to a generic en-US value in ``_build_headers``.
_ACCEPT_LANGUAGE = {
    "amazon.es": "es-ES,es;q=0.9,en;q=0.6",
    "amazon.com": "en-US,en;q=0.9",
    "amazon.co.uk": "en-GB,en;q=0.9",
    "amazon.de": "de-DE,de;q=0.9,en;q=0.6",
    "amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6",
    "amazon.it": "it-IT,it;q=0.9,en;q=0.6",
    "amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6",
    "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
}

# Single desktop-Chrome-on-Linux User-Agent string sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

# Signals that Amazon served an anti-bot / captcha / throttling page instead of
# the ranking content.
# NOTE: markers are matched against the lower-cased response body (see
# ``_looks_blocked``), so every entry must itself be lower-case.
_BLOCK_MARKERS = (
    "api-services-support@amazon",
    "captcha",
    "to discuss automated access",
    "enter the characters you see below",
    "robot check",
)


def _supported_accept_encoding() -> str:
    """Return an Accept-Encoding value that ``requests`` is able to decode.

    ``requests`` derives its default from the decoders actually installed
    (Brotli is only listed when an optional brotli package is importable).
    Advertising ``br`` unconditionally would let the server answer with a
    Brotli body that ``resp.text`` cannot decode, which silently breaks both
    block detection and parsing.
    """
    return requests.utils.default_headers().get("Accept-Encoding", "gzip, deflate")


def _build_headers(marketplace: str) -> dict:
    """Build browser-like request headers for the given marketplace.

    Args:
        marketplace: Amazon domain (e.g. ``"amazon.es"``); selects the
            ``Accept-Language`` value, with a generic en-US fallback.

    Returns:
        A new dict of header name -> value, safe for the caller to mutate.
    """
    return {
        "User-Agent": _USER_AGENT,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
        # Only advertise encodings the HTTP stack can really decode.
        "Accept-Encoding": _supported_accept_encoding(),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }


def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
    """Compose the ranking URL for a marketplace / list type / category slug.

    Args:
        marketplace: Amazon domain without scheme or ``www.`` prefix.
        list_type: ``"movers_shakers"`` selects ``/gp/movers-and-shakers``;
            any other value selects ``/gp/bestsellers``.
        category: Optional category slug; leading/trailing slashes are
            ignored. ``None``, ``""`` or a slash-only value yields the
            list's front page.

    Returns:
        The absolute ``https://www.<marketplace>/gp/...`` URL.
    """
    base = "movers-and-shakers" if list_type == "movers_shakers" else "bestsellers"
    url = f"https://www.{marketplace}/gp/{base}"
    slug = category.strip("/") if category else ""
    if slug:
        url = f"{url}/{slug}"
    return url


def _looks_blocked(status_code: int, html: str) -> bool:
    """Heuristically decide whether Amazon served an anti-bot / throttling page.

    A 429 or 503 status is always treated as a block; otherwise the body is
    searched case-insensitively for any of ``_BLOCK_MARKERS``.
    """
    if status_code in (429, 503):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in _BLOCK_MARKERS)


def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
    """GET ``url`` with a small retry + linear backoff.

    Args:
        url: Absolute URL to fetch.
        headers: Request headers to send.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        retries: Number of *additional* attempts after the first one.
            Negative values are treated as ``0`` so that at least one
            request is always made.

    Returns:
        The first response that has status 200 and does not look blocked.

    Raises:
        RuntimeError: When the last attempt fails with a network error, an
            anti-bot page, or a non-200 status.
    """
    attempts = max(retries, 0) + 1
    last_exc: Exception | None = None

    for attempt in range(attempts):
        is_last = attempt == attempts - 1

        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:  # network / timeout
            if is_last:
                raise RuntimeError(f"request to {url} failed: {exc}") from exc
            last_exc = exc
            time.sleep(1.5 * (attempt + 1))
            continue

        if _looks_blocked(resp.status_code, resp.text):
            if is_last:
                raise RuntimeError(
                    f"Amazon anti-bot block on {url} (HTTP {resp.status_code}). "
                    "HTTP scraping is being throttled/captcha'd; fall back to the "
                    "browser MCP/CDP path of the ecosystem."
                )
            # Back off a little longer than for plain errors before retrying.
            time.sleep(2.0 * (attempt + 1))
            continue

        if resp.status_code != 200:
            last_exc = RuntimeError(f"unexpected HTTP {resp.status_code} for {url}")
            if is_last:
                raise last_exc
            time.sleep(1.5 * (attempt + 1))
            continue

        return resp

    # Not reachable (the loop always runs and its last iteration returns or
    # raises); kept so the function can never fall through and return None.
    raise RuntimeError(f"could not fetch {url}: {last_exc}")


def _normalize_categories(categories: list[str] | str | None) -> list[str | None]:
    """Turn the ``categories`` argument into the list of slugs to scrape.

    ``None`` / empty means "front page only" (``[None]``). A bare string is
    one category, not an iterable of characters.
    """
    if not categories:
        return [None]
    if isinstance(categories, str):
        return [categories]
    return list(categories)


def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | str | None = None,
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Scrape Amazon Best Sellers / Movers & Shakers ranking pages.

    Captures demand signals (rank, title, price, rating, reviews and — for
    Movers & Shakers — percentage change) from one or more category ranking
    pages of a given Amazon marketplace.

    Args:
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
        categories: Category slugs (e.g. ``"electronics"``, ``"videogames"``).
            A single string is accepted as one category. If ``None`` or
            empty, the general front page of the chosen list is scraped.
        list_type: ``"bestsellers"`` (URL ``/gp/bestsellers/``) or
            ``"movers_shakers"`` (URL ``/gp/movers-and-shakers/``).
        max_items: Maximum number of items collected per category.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. ``pct_change`` only filled for
        ``movers_shakers``.

    Raises:
        ValueError: If ``list_type`` is not one of the allowed values.
        RuntimeError: On network failure or when Amazon serves an anti-bot /
            captcha / throttling page. The error propagates immediately, so
            rows already collected from earlier categories are not returned.
    """
    # Validate before doing any work so bad arguments never hit the network.
    if list_type not in ("bestsellers", "movers_shakers"):
        raise ValueError(
            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
        )

    cats = _normalize_categories(categories)
    headers = _build_headers(marketplace)
    results: list[dict] = []

    for category in cats:
        url = _build_url(marketplace, list_type, category)
        resp = _fetch(url, headers, timeout=20, retries=2)
        rows = parse_amazon_ranking_html(
            resp.text,
            marketplace=marketplace,
            list_type=list_type,
            max_items=max_items,
        )
        # The pure parser leaves category=None (it doesn't know the URL);
        # stamp the category we requested.
        for row in rows:
            row["category"] = category
        results.extend(rows)

    return results