Files
fn_registry/python/functions/datascience/scrape_amazon_bestsellers.py
T
egutierrez 763e06c127 feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00

181 lines
6.5 KiB
Python

"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.
HTTP fetch strategy: fetches each ranking page with ``requests`` (browser-ish
headers + retry/backoff) and delegates DOM parsing to the pure, reusable
``parse_amazon_ranking_html`` function of the registry — so the HTTP scraper and
the CDP scraper (``scrape_amazon_movers_cdp``) share one robust parser.
"""
from __future__ import annotations
import os
import sys
import time
import requests
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html
# Per-marketplace ``Accept-Language`` header values, keyed by Amazon domain.
# Domains missing from this table get a generic English value at lookup time.
_ACCEPT_LANGUAGE = {
    "amazon.es": "es-ES,es;q=0.9,en;q=0.6",
    "amazon.com": "en-US,en;q=0.9",
    "amazon.co.uk": "en-GB,en;q=0.9",
    "amazon.de": "de-DE,de;q=0.9,en;q=0.6",
    "amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6",
    "amazon.it": "it-IT,it;q=0.9,en;q=0.6",
    "amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6",
    "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
}
# Desktop Chrome-on-Linux User-Agent string sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
# Lower-case substrings whose presence in a response body indicates that
# Amazon answered with an anti-bot / captcha / throttling page rather than
# the requested ranking content. Matched against the lower-cased HTML.
_BLOCK_MARKERS = (
    "api-services-support@amazon",
    "captcha",
    "to discuss automated access",
    "enter the characters you see below",
    "robot check",
)
def _build_headers(marketplace: str) -> dict:
    """Return realistic browser-like request headers for ``marketplace``.

    Args:
        marketplace: Amazon domain such as ``"amazon.es"``. It selects the
            ``Accept-Language`` value from ``_ACCEPT_LANGUAGE``; unknown
            domains fall back to ``"en-US,en;q=0.9"``.

    Returns:
        A new dict of HTTP request headers (a fresh object on every call).
    """
    # Function-scope import keeps the module's import block untouched.
    from importlib.util import find_spec

    # Only advertise Brotli when a decoder is importable. requests/urllib3
    # decode ``br`` bodies transparently *only* if ``brotli`` or
    # ``brotlicffi`` is installed; advertising it without one yields
    # undecoded bytes in ``resp.text``, which would defeat both the
    # block-page detection and the HTML parser without raising any error.
    encodings = ["gzip", "deflate"]
    for decoder in ("brotli", "brotlicffi"):
        try:
            available = find_spec(decoder) is not None
        except (ImportError, ValueError):
            # A broken or half-initialised module counts as "not available".
            available = False
        if available:
            encodings.append("br")
            break

    return {
        "User-Agent": _USER_AGENT,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
        "Accept-Encoding": ", ".join(encodings),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
    """Build the ranking-page URL for a marketplace, list type and category.

    Args:
        marketplace: Amazon domain, inserted after ``https://www.``.
        list_type: ``"movers_shakers"`` selects the ``movers-and-shakers``
            path; any other value selects ``bestsellers``.
        category: Optional category slug. Leading/trailing slashes are
            stripped; a falsy value (``None`` or ``""``) adds no segment.

    Returns:
        The absolute URL of the ranking page.
    """
    if list_type == "movers_shakers":
        listing = "movers-and-shakers"
    else:
        listing = "bestsellers"

    segments = [f"https://www.{marketplace}", "gp", listing]
    if category:
        segments.append(category.strip("/"))
    return "/".join(segments)
def _looks_blocked(status_code: int, html: str) -> bool:
    """Guess whether a response is an anti-bot / throttling page.

    Args:
        status_code: HTTP status of the response.
        html: Response body text.

    Returns:
        ``True`` for HTTP 429 or 503, or when the lower-cased body contains
        any entry of ``_BLOCK_MARKERS``; ``False`` otherwise.
    """
    # Throttling statuses are conclusive without inspecting the body.
    if status_code == 429 or status_code == 503:
        return True

    haystack = html.lower()
    for marker in _BLOCK_MARKERS:
        if marker in haystack:
            return True
    return False
def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
    """GET ``url`` with a few retries and a linearly growing backoff.

    Args:
        url: Page to download.
        headers: Request headers passed to ``requests.get``.
        timeout: Per-request timeout passed to ``requests.get``.
        retries: Extra attempts after the first one (``retries + 1`` in total).

    Returns:
        The first response that is HTTP 200 and does not look blocked.

    Raises:
        RuntimeError: When the last attempt fails at the network level,
            looks like an anti-bot page, or returns a non-200 status.
    """
    failure: Exception | None = None

    for attempt in range(retries + 1):
        out_of_attempts = attempt >= retries
        step = attempt + 1  # backoff multiplier: 1, 2, 3, ...

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:  # network / timeout
            failure = exc
            if out_of_attempts:
                raise RuntimeError(f"request to {url} failed: {exc}") from exc
            time.sleep(1.5 * step)
            continue

        # Block pages back off slightly longer than plain failures.
        if _looks_blocked(response.status_code, response.text):
            if out_of_attempts:
                raise RuntimeError(
                    f"Amazon anti-bot block on {url} (HTTP {response.status_code}). "
                    "HTTP scraping is being throttled/captcha'd; fall back to the "
                    "browser MCP/CDP path of the ecosystem."
                )
            time.sleep(2.0 * step)
            continue

        if response.status_code == 200:
            return response

        failure = RuntimeError(
            f"unexpected HTTP {response.status_code} for {url}"
        )
        if out_of_attempts:
            raise failure
        time.sleep(1.5 * step)

    # Only reachable when the loop body never ran (negative ``retries``).
    raise RuntimeError(f"could not fetch {url}: {failure}")
def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | None = None,
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Collect product rows from Amazon ranking pages over plain HTTP.

    One page is fetched per requested category and parsed with
    ``parse_amazon_ranking_html``; each resulting row is tagged with the
    category slug it was requested under.

    Args:
        marketplace: Amazon domain, e.g. ``"amazon.es"`` or ``"amazon.com"``.
        categories: Category slugs (e.g. ``"electronics"``). When ``None``
            or empty, the list's general front page is scraped once and the
            rows carry ``category=None``.
        list_type: ``"bestsellers"`` (``/gp/bestsellers/<cat>``) or
            ``"movers_shakers"`` (``/gp/movers-and-shakers/<cat>``).
        max_items: Per-category item cap, forwarded to the parser.

    Returns:
        A flat list of product dicts in request order. Per the parser's
        contract the keys are ``marketplace, list_type, category, rank,
        asin, title, price, currency, rating, reviews, pct_change, url``,
        with ``None`` for missing values; ``pct_change`` is only filled for
        ``movers_shakers``.

    Raises:
        ValueError: If ``list_type`` is not one of the two allowed values.
        RuntimeError: On network failure or when Amazon serves an anti-bot /
            captcha / throttling page.
    """
    allowed_lists = ("bestsellers", "movers_shakers")
    if list_type not in allowed_lists:
        raise ValueError(
            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
        )

    # ``None`` stands for "no category segment": the list's front page.
    targets: list[str | None] = [None] if not categories else list(categories)
    request_headers = _build_headers(marketplace)

    collected: list[dict] = []
    for slug in targets:
        page = _fetch(
            _build_url(marketplace, list_type, slug),
            request_headers,
            timeout=20,
            retries=2,
        )
        parsed = parse_amazon_ranking_html(
            page.text,
            marketplace=marketplace,
            list_type=list_type,
            max_items=max_items,
        )
        # The parser has no knowledge of the URL, so the category is
        # stamped here from the slug that was requested.
        for product in parsed:
            product["category"] = slug
        collected.extend(parsed)

    return collected