# fn_registry/python/functions/datascience/scrape_amazon_bestsellers.py

"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.

HTTP fetch strategy: fetches each ranking page with ``requests`` (browser-ish
headers + retry/backoff) and delegates DOM parsing to the pure, reusable
``parse_amazon_ranking_html`` function of the registry — so the HTTP scraper and
the CDP scraper (``scrape_amazon_movers_cdp``) share one robust parser.
"""

from __future__ import annotations

import os
import sys
import time

import requests

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html

# Accept-Language header value keyed by marketplace domain (the same string
# callers pass as ``marketplace``). Each value lists the marketplace's own
# locale first. Domains missing from this table fall back to a generic
# English value in ``_build_headers``.
_ACCEPT_LANGUAGE = {
    "amazon.es": "es-ES,es;q=0.9,en;q=0.6",
    "amazon.com": "en-US,en;q=0.9",
    "amazon.co.uk": "en-GB,en;q=0.9",
    "amazon.de": "de-DE,de;q=0.9,en;q=0.6",
    "amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6",
    "amazon.it": "it-IT,it;q=0.9,en;q=0.6",
    "amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6",
    "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
}

# Desktop Chrome-on-Linux User-Agent string; ``_build_headers`` sends it as
# the ``User-Agent`` header of every request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

# Signals that Amazon served an anti-bot / captcha / throttling page instead of
# the ranking content.
# ``_looks_blocked`` searches for these as plain substrings of the LOWERCASED
# response body, so every marker added here must itself be lowercase.
_BLOCK_MARKERS = (
    "api-services-support@amazon",
    "captcha",
    "to discuss automated access",
    "enter the characters you see below",
    "robot check",
)


def _build_headers(marketplace: str) -> dict:
    """Build realistic browser-ish request headers for the given marketplace.

    Args:
        marketplace: Amazon domain (e.g. ``"amazon.es"``) used to pick the
            ``Accept-Language`` value from ``_ACCEPT_LANGUAGE``; unknown
            domains get ``"en-US,en;q=0.9"``.

    Returns:
        A new dict of HTTP headers, suitable for ``requests.get(headers=...)``.
    """
    # Function-scope import: only this helper needs it.
    from importlib.util import find_spec

    # requests/urllib3 transparently decode gzip and deflate, but Brotli is
    # only decoded when the optional ``brotlicffi`` or ``brotli`` package is
    # installed. Advertising "br" without a decoder lets the server answer
    # with a Brotli body that reaches the HTML parser as raw compressed
    # bytes, so only offer it when it can actually be decoded.
    encodings = ["gzip", "deflate"]
    for module_name in ("brotlicffi", "brotli"):
        try:
            found = find_spec(module_name) is not None
        except (ImportError, ValueError):
            # find_spec can raise for half-initialised modules; treat the
            # decoder as unavailable rather than failing header creation.
            found = False
        if found:
            encodings.append("br")
            break

    return {
        "User-Agent": _USER_AGENT,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
        "Accept-Encoding": ", ".join(encodings),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }


def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
    """Compose the ranking URL for a marketplace / list type / category slug."""
    base = "movers-and-shakers" if list_type == "movers_shakers" else "bestsellers"
    url = f"https://www.{marketplace}/gp/{base}"
    if category:
        url = f"{url}/{category.strip('/')}"
    return url


def _looks_blocked(status_code: int, html: str) -> bool:
    """Heuristic: did Amazon serve an anti-bot / throttling page?"""
    if status_code in (429, 503):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in _BLOCK_MARKERS)


def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
    """GET ``url`` with a few retries and linearly growing back-off.

    Up to ``retries + 1`` attempts are made. An attempt fails when the request
    raises ``requests.RequestException``, when ``_looks_blocked`` flags the
    response, or when the status is not 200. Between failed attempts the
    function sleeps ``1.5 * n`` seconds (``2.0 * n`` after a block), where
    ``n`` is the 1-based attempt number.

    Returns:
        The first response with status 200 that does not look blocked.

    Raises:
        RuntimeError: When the final attempt fails for any of the reasons
            above.
    """
    failure: Exception | None = None

    for attempt in range(retries + 1):
        retries_left = retries - attempt
        step = attempt + 1  # 1-based attempt number, scales the back-off

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:  # network / timeout
            failure = exc
            if retries_left <= 0:
                raise RuntimeError(f"request to {url} failed: {exc}") from exc
            time.sleep(1.5 * step)
            continue

        status = response.status_code

        # Block detection runs before the plain status check so that a
        # captcha page served with HTTP 200 is still treated as a failure.
        if _looks_blocked(status, response.text):
            if retries_left <= 0:
                raise RuntimeError(
                    f"Amazon anti-bot block on {url} (HTTP {status}). "
                    "HTTP scraping is being throttled/captcha'd; fall back to the "
                    "browser MCP/CDP path of the ecosystem."
                )
            time.sleep(2.0 * step)
            continue

        if status == 200:
            return response

        failure = RuntimeError(
            f"unexpected HTTP {status} for {url}"
        )
        if retries_left <= 0:
            raise failure
        time.sleep(1.5 * step)

    # Only reachable when the loop body never ran (negative ``retries``).
    raise RuntimeError(f"could not fetch {url}: {failure}")


def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | None = None,
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Scrape Amazon Best Sellers / Movers & Shakers ranking pages.

    Captures demand signals (rank, title, price, rating, reviews and — for
    Movers & Shakers — percentage change) from one or more category ranking
    pages of a given Amazon marketplace. Pages are fetched sequentially, one
    request per category, and parsed by ``parse_amazon_ranking_html``.

    Args:
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
        categories: Category slugs (e.g. ``"electronics"``, ``"videogames"``).
            If ``None`` or empty, the general front page of the chosen list is
            scraped. A single slug passed as a bare string is accepted and
            treated as a one-element list.
        list_type: ``"bestsellers"`` (URL ``/gp/bestsellers/<cat>``) or
            ``"movers_shakers"`` (URL ``/gp/movers-and-shakers/<cat>``).
        max_items: Maximum number of items collected per category.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. ``pct_change`` only filled for
        ``movers_shakers``.

    Raises:
        ValueError: If ``list_type`` is not one of the allowed values.
        RuntimeError: On network failure or when Amazon serves an anti-bot /
            captcha / throttling page. Rows already collected from earlier
            categories are not returned in that case.
    """
    if list_type not in ("bestsellers", "movers_shakers"):
        raise ValueError(
            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
        )

    # A bare string is itself iterable: list("electronics") would produce one
    # bogus "category" (and one HTTP request) per character. Treat a non-empty
    # string as a single slug; an empty string keeps meaning "front page".
    if isinstance(categories, str):
        categories = [categories] if categories else None

    cats: list[str | None] = list(categories) if categories else [None]
    headers = _build_headers(marketplace)
    results: list[dict] = []

    for category in cats:
        url = _build_url(marketplace, list_type, category)
        resp = _fetch(url, headers, timeout=20, retries=2)
        rows = parse_amazon_ranking_html(
            resp.text,
            marketplace=marketplace,
            list_type=list_type,
            max_items=max_items,
        )
        # The pure parser leaves category=None (it doesn't know the URL);
        # stamp the category we requested.
        for row in rows:
            row["category"] = category
        results.extend(rows)

    return results