feat(shell): auto-commit con 31 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,425 @@
|
||||
"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Accept-Language hint per marketplace TLD. Falls back to a generic value.
|
||||
_ACCEPT_LANGUAGE = {
|
||||
"amazon.es": "es-ES,es;q=0.9,en;q=0.6",
|
||||
"amazon.com": "en-US,en;q=0.9",
|
||||
"amazon.co.uk": "en-GB,en;q=0.9",
|
||||
"amazon.de": "de-DE,de;q=0.9,en;q=0.6",
|
||||
"amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6",
|
||||
"amazon.it": "it-IT,it;q=0.9,en;q=0.6",
|
||||
"amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6",
|
||||
"amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
|
||||
}
|
||||
|
||||
# Currency guessed from the marketplace TLD (used only as a fallback when the
|
||||
# price string has no recognisable symbol).
|
||||
_CURRENCY_BY_MARKET = {
|
||||
"amazon.es": "EUR",
|
||||
"amazon.com": "USD",
|
||||
"amazon.co.uk": "GBP",
|
||||
"amazon.de": "EUR",
|
||||
"amazon.fr": "EUR",
|
||||
"amazon.it": "EUR",
|
||||
"amazon.com.mx": "MXN",
|
||||
"amazon.com.br": "BRL",
|
||||
}
|
||||
|
||||
# Map common currency symbols to ISO codes.
|
||||
_SYMBOL_TO_CURRENCY = {
|
||||
"€": "EUR",
|
||||
"$": "USD",
|
||||
"£": "GBP",
|
||||
"R$": "BRL",
|
||||
"US$": "USD",
|
||||
}
|
||||
|
||||
_USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
# Signals that Amazon served an anti-bot / captcha / throttling page instead of
|
||||
# the ranking content.
|
||||
_BLOCK_MARKERS = (
|
||||
"api-services-support@amazon",
|
||||
"captcha",
|
||||
"to discuss automated access",
|
||||
"enter the characters you see below",
|
||||
"robot check",
|
||||
)
|
||||
|
||||
|
||||
def _build_headers(marketplace: str) -> dict:
    """Build realistic browser-like request headers for a marketplace.

    Args:
        marketplace: Amazon domain (e.g. ``"amazon.es"``) used to pick the
            ``Accept-Language`` value; unknown domains get ``en-US``.

    Returns:
        A new dict of HTTP header names to values.
    """
    # Function-scope import: only this helper needs it.
    from importlib.util import find_spec

    def _importable(module_name: str) -> bool:
        try:
            return find_spec(module_name) is not None
        except (ImportError, ValueError):
            return False

    # Only advertise Brotli when a decoder is installed. The HTTP stack decodes
    # "br" bodies solely when the optional brotli/brotlicffi package is
    # present; otherwise the response text would be undecoded bytes and every
    # downstream parser would silently find nothing.
    encodings = ["gzip", "deflate"]
    if _importable("brotli") or _importable("brotlicffi"):
        encodings.append("br")

    return {
        "User-Agent": _USER_AGENT,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
        "Accept-Encoding": ", ".join(encodings),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
|
||||
|
||||
|
||||
def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
|
||||
"""Compose the ranking URL for a marketplace / list type / category slug."""
|
||||
base = "movers-and-shakers" if list_type == "movers_shakers" else "bestsellers"
|
||||
url = f"https://www.{marketplace}/gp/{base}"
|
||||
if category:
|
||||
url = f"{url}/{category.strip('/')}"
|
||||
return url
|
||||
|
||||
|
||||
def _looks_blocked(status_code: int, html: str) -> bool:
|
||||
"""Heuristic: did Amazon serve an anti-bot / throttling page?"""
|
||||
if status_code in (429, 503):
|
||||
return True
|
||||
lowered = html.lower()
|
||||
return any(marker in lowered for marker in _BLOCK_MARKERS)
|
||||
|
||||
|
||||
def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
    """GET ``url`` with a few retries and linear backoff.

    Up to ``retries + 1`` attempts are made. An attempt fails on a network
    error, on a response that looks like an anti-bot page, or on any status
    other than 200. Between attempts the function sleeps 1.5 s x attempt number
    (2.0 s x attempt number after a block).

    Returns:
        The first response with status 200 that does not look blocked.

    Raises:
        RuntimeError: When the final attempt fails, for any of the reasons
            above.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        final_try = attempt >= retries
        step = attempt + 1

        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:  # network / timeout
            last_exc = exc
            if final_try:
                raise RuntimeError(f"request to {url} failed: {exc}") from exc
            time.sleep(1.5 * step)
            continue

        if _looks_blocked(response.status_code, response.text):
            if final_try:
                raise RuntimeError(
                    f"Amazon anti-bot block on {url} (HTTP {response.status_code}). "
                    "HTTP scraping is being throttled/captcha'd; fall back to the "
                    "browser MCP/CDP path of the ecosystem."
                )
            time.sleep(2.0 * step)
            continue

        if response.status_code == 200:
            return response

        last_exc = RuntimeError(
            f"unexpected HTTP {response.status_code} for {url}"
        )
        if final_try:
            raise last_exc
        time.sleep(1.5 * step)

    # Only reachable when the loop body never ran (e.g. negative ``retries``).
    raise RuntimeError(f"could not fetch {url}: {last_exc}")
|
||||
|
||||
|
||||
_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
|
||||
_RANK_RE = re.compile(r"#?\s*(\d+)")
|
||||
_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
|
||||
_REVIEWS_RE = re.compile(r"[\d.,]+")
|
||||
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
|
||||
_PCT_RE = re.compile(r"([\d.,]+)\s*%")
|
||||
|
||||
|
||||
def _text(node) -> str:
|
||||
return node.get_text(" ", strip=True) if node is not None else ""
|
||||
|
||||
|
||||
def _parse_asin(card) -> str | None:
|
||||
"""ASIN from a data-asin attribute or any /dp/<ASIN>/ link inside the card."""
|
||||
asin = card.get("data-asin")
|
||||
if asin and re.fullmatch(r"[A-Z0-9]{10}", asin):
|
||||
return asin
|
||||
for a in card.find_all("a", href=True):
|
||||
m = _ASIN_RE.search(a["href"])
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def _parse_url(card, marketplace: str) -> str | None:
    """Return an absolute URL for the card, without its query string.

    The first link whose href contains a product path is preferred; failing
    that, the first link of any kind in the card is used. Returns ``None``
    when the card has no links at all.
    """
    origin = f"https://www.{marketplace}"

    chosen = next(
        (
            link
            for link in card.find_all("a", href=True)
            if _ASIN_RE.search(link["href"])
        ),
        None,
    )
    if chosen is None:
        # No product link found: settle for whatever link comes first.
        chosen = card.find("a", href=True)
    if chosen is None:
        return None

    path_only = chosen["href"].split("?")[0]
    return urljoin(origin, path_only)
|
||||
|
||||
|
||||
def _parse_rank(card) -> int | None:
    """Read the rank badge of a card (rendered as '#1', '1', ...).

    The primary badge selectors are tried first. Only when they produce no
    text at all are the secondary selectors consulted. The first run of digits
    in the resulting text is returned as an int, or ``None`` if there is none.
    """
    label = _text(card.select_one(".zg-bdg-text, .zg-badge-text, [class*='badge']"))
    if not label:
        # Alternative places where the rank text may live.
        for selector in (".a-badge-text", "[class*='rank']"):
            label = _text(card.select_one(selector))
            if label:
                break

    found = _RANK_RE.search(label)
    if found is None:
        return None
    return int(found.group(1))
|
||||
|
||||
|
||||
def _parse_title(card) -> str | None:
    """Find the product title, trying several known templates in order.

    For each selector the first matching node is inspected: an ``<img>``
    contributes its ``alt`` text; any other element contributes a non-blank
    ``title`` attribute or, failing that, its visible text. The first
    non-empty candidate wins; ``None`` is returned when nothing matches.
    """
    selectors = (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    )
    for selector in selectors:
        node = card.select_one(selector)
        if node is None:
            continue

        if node.name == "img":
            alt_text = node.get("alt")
            if alt_text:
                return alt_text.strip()
        else:
            attr_title = node["title"].strip() if node.has_attr("title") else ""
            candidate = attr_title or _text(node)
            if candidate:
                return candidate
    return None
|
||||
|
||||
|
||||
def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Extract the price value and ISO currency of a card, best-effort.

    Args:
        card: Parsed product card element.
        marketplace: Amazon domain, used to guess the currency when the price
            text does not settle it.

    Returns:
        ``(value, currency)``. ``(None, None)`` when no selector yields a
        parseable number. ``currency`` may be ``None`` when neither the text
        nor the marketplace identifies one.
    """
    market_currency = _CURRENCY_BY_MARKET.get(marketplace)
    # Longest symbols first so "US$" / "R$" are never shadowed by the bare "$"
    # they contain, regardless of the mapping's insertion order.
    symbols = sorted(_SYMBOL_TO_CURRENCY, key=len, reverse=True)
    symbol_currencies = set(_SYMBOL_TO_CURRENCY.values())

    for sel in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(sel))
        if not txt:
            continue

        m = _PRICE_NUM_RE.search(txt)
        if not m:
            continue
        value = _to_float(m.group(0))
        if value is None:
            continue

        symbol = next((sym for sym in symbols if sym in txt), None)
        if symbol is None:
            currency = market_currency
        elif (
            symbol == "$"
            and market_currency is not None
            and market_currency not in symbol_currencies
        ):
            # A bare "$" is ambiguous: markets whose currency has no symbol of
            # its own in the table (e.g. MXN on amazon.com.mx) also print "$".
            currency = market_currency
        else:
            currency = _SYMBOL_TO_CURRENCY[symbol]
        return value, currency

    return None, None
|
||||
|
||||
|
||||
def _parse_rating(card) -> float | None:
    """Read the star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'.

    For each candidate selector the label is the node's visible text, else its
    ``title`` attribute, else its ``aria-label``. A label of the form
    "<number> out of ..." (in several languages) gives the rating directly;
    otherwise a bare number is accepted only if the label mentions stars.
    """
    selectors = (
        "[class*='review-stars']",
        ".a-icon-alt",
        "[title*='star']",
        "[aria-label*='star']",
    )
    for selector in selectors:
        node = card.select_one(selector)
        if node is None:
            continue
        label = _text(node) or node.get("title", "") or node.get("aria-label", "")
        if not label:
            continue

        worded = _RATING_RE.search(label)
        if worded is not None:
            return _to_float(worded.group(1))

        # Some templates only render the number ('4,5').
        lowered = label.lower()
        if "star" in lowered or "estrella" in lowered:
            bare = _PRICE_NUM_RE.search(label)
            if bare is not None:
                return _to_float(bare.group(0))
    return None
|
||||
|
||||
|
||||
def _parse_reviews(card) -> int | None:
    """Read the number of ratings/reviews shown next to the stars.

    Every node matched by each selector is examined in order. The first run of
    digits and separators in a node's text is taken, its '.' and ',' are
    removed, and the result is returned as an int if anything numeric remains.
    """
    selectors = (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    )
    for selector in selectors:
        for node in card.select(selector):
            token = _REVIEWS_RE.search(_text(node))
            if token is None:
                continue
            digits = "".join(ch for ch in token.group(0) if ch not in ".,")
            # A token made only of separators leaves nothing to convert.
            if digits.isdigit():
                return int(digits)
    return None
|
||||
|
||||
|
||||
def _parse_pct_change(card) -> float | None:
    """Read the Movers & Shakers percentage change (e.g. '+150%').

    The value is negated when the label text starts with '-'. Returns ``None``
    when no selector yields a parseable percentage.
    """
    for selector in (".zg-percent-change", "[class*='percent']", "[class*='sales-movement']"):
        label = _text(card.select_one(selector))
        found = _PCT_RE.search(label) if label else None
        if found is None:
            continue

        magnitude = _to_float(found.group(1))
        if magnitude is None:
            continue

        # The pattern captures digits only, so the sign is read off the label.
        if label.strip().startswith("-"):
            return -magnitude
        return magnitude
    return None
|
||||
|
||||
|
||||
def _to_float(raw: str) -> float | None:
|
||||
"""Parse a numeric string with EU or US decimal/grouping conventions."""
|
||||
if raw is None:
|
||||
return None
|
||||
s = raw.strip().replace("\xa0", "").replace(" ", "")
|
||||
if not s:
|
||||
return None
|
||||
if "," in s and "." in s:
|
||||
# The rightmost separator is the decimal one.
|
||||
if s.rfind(",") > s.rfind("."):
|
||||
s = s.replace(".", "").replace(",", ".")
|
||||
else:
|
||||
s = s.replace(",", "")
|
||||
elif "," in s:
|
||||
# Treat a single comma as decimal separator (EU markets).
|
||||
s = s.replace(",", ".")
|
||||
try:
|
||||
return float(s)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _select_cards(soup: BeautifulSoup) -> list:
    """Return the product cards of a ranking page.

    Known card selectors are tried from most to least specific; the matches of
    the first selector that finds anything are returned. An empty list means
    no selector matched.
    """
    selectors = (
        "div.p13n-sc-uncoverable-faceout",
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div[data-asin]",
    )
    # map() is lazy, so later selectors are not evaluated once one matches.
    return next((found for found in map(soup.select, selectors) if found), [])
|
||||
|
||||
|
||||
def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | None = None,
    list_type: str = "bestsellers",
    max_items: int = 50,
) -> list[dict]:
    """Scrape Amazon Best Sellers / Movers & Shakers ranking pages.

    Captures demand signals (rank, title, price, rating, reviews and — for
    Movers & Shakers — percentage change) from one or more category ranking
    pages of a given Amazon marketplace.

    Args:
        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
        categories: Category slugs (e.g. ``"electronics"``, ``"videogames"``).
            A single slug passed as a plain string is treated as a
            one-element list. If ``None`` (or empty) the general front page
            of the chosen list is scraped.
        list_type: ``"bestsellers"`` (URL ``/gp/bestsellers/<cat>``) or
            ``"movers_shakers"`` (URL ``/gp/movers-and-shakers/<cat>``).
        max_items: Maximum number of items collected per category.

    Returns:
        A list of dicts, one per product, with exactly these keys:
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
        ``rank``/``reviews`` are ints. When a card renders no rank badge,
        ``rank`` is the 1-based position of the product among the products
        collected for that category.

    Raises:
        ValueError: If ``list_type`` is not one of the allowed values.
        RuntimeError: On network failure or when Amazon serves an anti-bot /
            captcha / throttling page.
    """
    if list_type not in ("bestsellers", "movers_shakers"):
        raise ValueError(
            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
        )

    # A bare string would otherwise be split into single characters by list().
    if isinstance(categories, str):
        categories = [categories]
    cats: list[str | None] = list(categories) if categories else [None]

    headers = _build_headers(marketplace)
    wants_pct_change = list_type == "movers_shakers"
    results: list[dict] = []

    for category in cats:
        url = _build_url(marketplace, list_type, category)
        resp = _fetch(url, headers, timeout=20, retries=2)
        soup = BeautifulSoup(resp.text, "lxml")

        count = 0
        for card in _select_cards(soup):
            if count >= max_items:
                break
            asin = _parse_asin(card)
            title = _parse_title(card)
            # Skip empty / non-product wrappers.
            if asin is None and title is None:
                continue

            rank = _parse_rank(card)
            if rank is None:
                # Positional fallback when no badge is rendered. Counting
                # emitted products (not raw cards) keeps skipped wrappers from
                # leaving gaps in the ranks.
                rank = count + 1

            price, currency = _parse_price(card, marketplace)
            results.append(
                {
                    "marketplace": marketplace,
                    "list_type": list_type,
                    "category": category,
                    "rank": rank,
                    "asin": asin,
                    "title": title,
                    "price": price,
                    "currency": currency,
                    "rating": _parse_rating(card),
                    "reviews": _parse_reviews(card),
                    "pct_change": _parse_pct_change(card) if wants_pct_change else None,
                    "url": _parse_url(card, marketplace),
                }
            )
            count += 1

    return results
|
||||
Reference in New Issue
Block a user