feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -1,13 +1,22 @@
"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals."""
"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.
HTTP fetch strategy: fetches each ranking page with ``requests`` (browser-ish
headers + retry/backoff) and delegates DOM parsing to the pure, reusable
``parse_amazon_ranking_html`` function of the registry — so the HTTP scraper and
the CDP scraper (``scrape_amazon_movers_cdp``) share one robust parser.
"""
from __future__ import annotations
import re
import os
import sys
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html
# Accept-Language hint per marketplace TLD. Falls back to a generic value.
_ACCEPT_LANGUAGE = {
@@ -21,28 +30,6 @@ _ACCEPT_LANGUAGE = {
"amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
}
# ISO currency code assumed for each marketplace domain. _parse_price falls
# back to this table only when the price text contains none of the symbols in
# _SYMBOL_TO_CURRENCY; a marketplace missing from the table yields None there.
_CURRENCY_BY_MARKET = {
"amazon.es": "EUR",
"amazon.com": "USD",
"amazon.co.uk": "GBP",
"amazon.de": "EUR",
"amazon.fr": "EUR",
"amazon.it": "EUR",
"amazon.com.mx": "MXN",
"amazon.com.br": "BRL",
}
# Map common currency symbols to ISO codes.
_SYMBOL_TO_CURRENCY = {
"": "EUR",
"$": "USD",
"£": "GBP",
"R$": "BRL",
"US$": "USD",
}
_USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
@@ -133,213 +120,6 @@ def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Resp
raise RuntimeError(f"could not fetch {url}: {last_exc}")
# Product link: "/dp/" or "/gp/product/" followed by a 10-character
# upper-case alphanumeric ASIN (group 1), then "/", "?" or end of string.
_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
# Rank badge text: optional "#", optional whitespace, then the digits (group 1).
_RANK_RE = re.compile(r"#?\s*(\d+)")
# First numeric token in a text: optional sign, one digit, then any run of
# digits, dots and commas (decimal and grouping separators are not told apart here).
_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
# Any run of digits, dots and commas; _parse_reviews strips the separators afterwards.
_REVIEWS_RE = re.compile(r"[\d.,]+")
# Star-rating phrase: the number (group 1) followed by one of the listed
# "out of" wordings, e.g. "4.5 out of 5" / "4,5 de 5".
_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
# Percentage: the number (group 1), optional whitespace, then "%". The sign is
# not captured; _parse_pct_change derives it from the start of the text.
_PCT_RE = re.compile(r"([\d.,]+)\s*%")
def _text(node) -> str:
return node.get_text(" ", strip=True) if node is not None else ""
def _parse_asin(card) -> str | None:
    """Return the card's ASIN, or None when neither source provides one.

    The ``data-asin`` attribute is preferred when it is a well-formed
    10-character code; otherwise the first product link inside the card
    that matches ``_ASIN_RE`` supplies it.
    """
    attr_value = card.get("data-asin")
    if attr_value and re.fullmatch(r"[A-Z0-9]{10}", attr_value):
        return attr_value
    # Lazily scan the links so the search stops at the first product URL.
    link_matches = (
        _ASIN_RE.search(anchor["href"])
        for anchor in card.find_all("a", href=True)
    )
    return next((hit.group(1) for hit in link_matches if hit), None)
def _parse_url(card, marketplace: str) -> str | None:
    """Return an absolute, query-free URL for the card's product.

    The first link matching ``_ASIN_RE`` is used; if the card has none, the
    first link of any kind is used instead. Returns None when the card
    contains no link at all.
    """
    product_link = next(
        (
            anchor
            for anchor in card.find_all("a", href=True)
            if _ASIN_RE.search(anchor["href"])
        ),
        None,
    )
    # Only look for a generic link when no product link exists.
    link = product_link if product_link is not None else card.find("a", href=True)
    if link is None:
        return None
    path, _, _ = link["href"].partition("?")
    return urljoin(f"https://www.{marketplace}", path)
def _parse_rank(card) -> int | None:
    """Read the list position from the card's badge (rendered as '#1', '1', ...)."""
    # Tried in order; the first selector whose node has non-empty text wins.
    selectors = (
        ".zg-bdg-text, .zg-badge-text, [class*='badge']",
        ".a-badge-text",
        "[class*='rank']",
    )
    labels = (_text(card.select_one(selector)) for selector in selectors)
    label = next((found for found in labels if found), "")
    match = _RANK_RE.search(label)
    if match is None:
        return None
    return int(match.group(1))
def _parse_title(card) -> str | None:
    """Return the product title, trying each known template selector in turn."""
    selectors = (
        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
        "[class*='line-clamp']",
        ".p13n-sc-truncate",
        ".p13n-sc-truncated",
        "a.a-link-normal[title]",
        "img[alt]",
    )
    for selector in selectors:
        node = card.select_one(selector)
        if node is None:
            continue
        if node.name == "img":
            # Images carry the title only in their alt text.
            alt_text = node.get("alt")
            if not alt_text:
                continue
            return alt_text.strip()
        # A non-blank title attribute takes precedence over the visible text.
        title_attr = node["title"].strip() if node.has_attr("title") else ""
        if title_attr:
            return title_attr
        visible = _text(node)
        if visible:
            return visible
    return None
# Currencies that are themselves written with a bare "$". When the
# marketplace uses one of these, "$" means the local currency, not USD.
_DOLLAR_SIGN_CURRENCIES = frozenset({"USD", "MXN", "CAD", "AUD", "SGD"})


def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
    """Extract the price and its ISO currency from a ranking card.

    Args:
        card: Parsed card element; only ``select_one`` is called on it.
        marketplace: Marketplace domain (e.g. ``"amazon.es"``), used to look
            up a default currency in ``_CURRENCY_BY_MARKET``.

    Returns:
        ``(value, currency)`` for the first selector whose text contains a
        parseable number, or ``(None, None)`` when no selector does.
        ``currency`` may be None when the text shows no known symbol and the
        marketplace is not in ``_CURRENCY_BY_MARKET``.
    """
    # Longest symbols first, so "R$" / "US$" are not shadowed by the "$" they
    # end with. Empty keys are skipped: "" is a substring of every string and
    # would otherwise match every price.
    symbols = sorted(
        (sym for sym in _SYMBOL_TO_CURRENCY if sym), key=len, reverse=True
    )
    market_currency = _CURRENCY_BY_MARKET.get(marketplace)

    for selector in (
        "._cDEzb_p13n-sc-price_3mJ9Z",
        ".p13n-sc-price",
        "span.a-price > span.a-offscreen",
        ".a-price .a-offscreen",
        "[class*='price']",
    ):
        txt = _text(card.select_one(selector))
        if not txt:
            continue
        number = _PRICE_NUM_RE.search(txt)
        if number is None:
            continue
        value = _to_float(number.group(0))
        if value is None:
            continue

        symbol = next((sym for sym in symbols if sym in txt), None)
        if symbol is None:
            # No recognisable symbol: trust the marketplace default.
            currency = market_currency
        elif symbol == "$" and market_currency in _DOLLAR_SIGN_CURRENCIES:
            # A bare "$" is ambiguous (USD, MXN, ...); the marketplace decides.
            currency = market_currency
        else:
            currency = _SYMBOL_TO_CURRENCY[symbol]
        return value, currency
    return None, None
def _parse_rating(card) -> float | None:
    """Return the star rating, e.g. from '4,5 de 5 estrellas' / '4.5 out of 5 stars'."""
    selectors = (
        "[class*='review-stars']",
        ".a-icon-alt",
        "[title*='star']",
        "[aria-label*='star']",
    )
    for selector in selectors:
        node = card.select_one(selector)
        if node is None:
            continue
        # Visible text first, then the title and aria-label attributes.
        label = _text(node) or node.get("title", "") or node.get("aria-label", "")
        if not label:
            continue
        phrase = _RATING_RE.search(label)
        if phrase:
            return _to_float(phrase.group(1))
        # Some templates render only the number ('4,5'); accept it when the
        # text still mentions stars.
        lowered = label.lower()
        bare_number = _PRICE_NUM_RE.search(label)
        if bare_number and ("star" in lowered or "estrella" in lowered):
            return _to_float(bare_number.group(0))
    return None
def _parse_reviews(card) -> int | None:
    """Return the ratings/reviews count displayed beside the stars, if any."""
    selectors = (
        "a.a-size-small.a-link-normal",
        ".a-size-small.a-link-normal",
        "[class*='review-count']",
        "span.a-size-small",
    )
    for selector in selectors:
        for node in card.select(selector):
            token = _REVIEWS_RE.search(_text(node))
            if token is None:
                continue
            # Dots and commas are grouping separators in a count; drop them.
            digits = token.group(0).replace(".", "").replace(",", "")
            # isdigit() is False for the empty string, so a token made only
            # of separators is skipped.
            if digits.isdigit():
                return int(digits)
    return None
def _parse_pct_change(card) -> float | None:
    """Return the Movers & Shakers percentage change (e.g. '+150%'), signed."""
    selectors = (
        ".zg-percent-change",
        "[class*='percent']",
        "[class*='sales-movement']",
    )
    for selector in selectors:
        label = _text(card.select_one(selector))
        found = _PCT_RE.search(label) if label else None
        if found is None:
            continue
        magnitude = _to_float(found.group(1))
        if magnitude is None:
            continue
        # The regex captures no sign; a leading '-' in the text marks a drop.
        if label.strip().startswith("-"):
            return -magnitude
        return magnitude
    return None
def _to_float(raw: str) -> float | None:
"""Parse a numeric string with EU or US decimal/grouping conventions."""
if raw is None:
return None
s = raw.strip().replace("\xa0", "").replace(" ", "")
if not s:
return None
if "," in s and "." in s:
# The rightmost separator is the decimal one.
if s.rfind(",") > s.rfind("."):
s = s.replace(".", "").replace(",", ".")
else:
s = s.replace(",", "")
elif "," in s:
# Treat a single comma as decimal separator (EU markets).
s = s.replace(",", ".")
try:
return float(s)
except ValueError:
return None
def _select_cards(soup: BeautifulSoup) -> list:
    """Return the product cards matched by the first known template selector.

    Selectors are tried in order and the first non-empty result wins; an
    empty list is returned when none of them matches.
    """
    template_selectors = (
        "div.p13n-sc-uncoverable-faceout",
        "div[id^='gridItemRoot']",
        "div.zg-grid-general-faceout",
        "li.zg-item-immersion",
        "div.a-cardui[data-asin]",
        "div[data-asin]",
    )
    # map() is lazy, so later selectors are not evaluated after a hit.
    return next(
        (hits for hits in map(soup.select, template_selectors) if hits),
        [],
    )
def scrape_amazon_bestsellers(
marketplace: str = "amazon.es",
categories: list[str] | None = None,
@@ -365,7 +145,8 @@ def scrape_amazon_bestsellers(
``marketplace, list_type, category, rank, asin, title, price,
currency, rating, reviews, pct_change, url``. Missing values are
``None``. ``price``/``rating``/``pct_change`` are floats,
``rank``/``reviews`` are ints.
``rank``/``reviews`` are ints. ``pct_change`` only filled for
``movers_shakers``.
Raises:
ValueError: If ``list_type`` is not one of the allowed values.
@@ -384,42 +165,16 @@ def scrape_amazon_bestsellers(
for category in cats:
url = _build_url(marketplace, list_type, category)
resp = _fetch(url, headers, timeout=20, retries=2)
soup = BeautifulSoup(resp.text, "lxml")
cards = _select_cards(soup)
count = 0
for idx, card in enumerate(cards):
if count >= max_items:
break
asin = _parse_asin(card)
title = _parse_title(card)
# Skip empty / non-product wrappers.
if asin is None and title is None:
continue
rank = _parse_rank(card)
if rank is None:
rank = idx + 1 # positional fallback when no badge is rendered
price, currency = _parse_price(card, marketplace)
results.append(
{
"marketplace": marketplace,
"list_type": list_type,
"category": category,
"rank": rank,
"asin": asin,
"title": title,
"price": price,
"currency": currency,
"rating": _parse_rating(card),
"reviews": _parse_reviews(card),
"pct_change": _parse_pct_change(card)
if list_type == "movers_shakers"
else None,
"url": _parse_url(card, marketplace),
}
)
count += 1
rows = parse_amazon_ranking_html(
resp.text,
marketplace=marketplace,
list_type=list_type,
max_items=max_items,
)
# The pure parser leaves category=None (it doesn't know the URL);
# stamp the category we requested.
for row in rows:
row["category"] = category
results.extend(rows)
return results