feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -1,13 +1,22 @@
-"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals."""
+"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals.
+
+HTTP fetch strategy: fetches each ranking page with ``requests`` (browser-ish
+headers + retry/backoff) and delegates DOM parsing to the pure, reusable
+``parse_amazon_ranking_html`` function of the registry — so the HTTP scraper and
+the CDP scraper (``scrape_amazon_movers_cdp``) share one robust parser.
+"""

 from __future__ import annotations

-import re
+import os
+import sys
 import time
-from urllib.parse import urljoin

 import requests
-from bs4 import BeautifulSoup
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html

 # Accept-Language hint per marketplace TLD. Falls back to a generic value.
 _ACCEPT_LANGUAGE = {
@@ -21,28 +30,6 @@ _ACCEPT_LANGUAGE = {
    "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
 }

-# Currency guessed from the marketplace TLD (used only as a fallback when the
-# price string has no recognisable symbol).
-_CURRENCY_BY_MARKET = {
-    "amazon.es": "EUR",
-    "amazon.com": "USD",
-    "amazon.co.uk": "GBP",
-    "amazon.de": "EUR",
-    "amazon.fr": "EUR",
-    "amazon.it": "EUR",
-    "amazon.com.mx": "MXN",
-    "amazon.com.br": "BRL",
-}
-
-# Map common currency symbols to ISO codes.
-_SYMBOL_TO_CURRENCY = {
-    "€": "EUR",
-    "$": "USD",
-    "£": "GBP",
-    "R$": "BRL",
-    "US$": "USD",
-}
-
 _USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
@@ -133,213 +120,6 @@ def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Resp
    raise RuntimeError(f"could not fetch {url}: {last_exc}")


-_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
-_RANK_RE = re.compile(r"#?\s*(\d+)")
-_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
-_REVIEWS_RE = re.compile(r"[\d.,]+")
-_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
-_PCT_RE = re.compile(r"([\d.,]+)\s*%")
-
-
-def _text(node) -> str:
-    return node.get_text(" ", strip=True) if node is not None else ""
-
-
-def _parse_asin(card) -> str | None:
-    """ASIN from a data-asin attribute or any /dp/<ASIN>/ link inside the card."""
-    asin = card.get("data-asin")
-    if asin and re.fullmatch(r"[A-Z0-9]{10}", asin):
-        return asin
-    for a in card.find_all("a", href=True):
-        m = _ASIN_RE.search(a["href"])
-        if m:
-            return m.group(1)
-    return None
-
-
-def _parse_url(card, marketplace: str) -> str | None:
-    """Absolute product URL from the first /dp/ link in the card."""
-    base = f"https://www.{marketplace}"
-    for a in card.find_all("a", href=True):
-        if _ASIN_RE.search(a["href"]):
-            return urljoin(base, a["href"].split("?")[0])
-    # Fall back to the first link at all.
-    first = card.find("a", href=True)
-    if first is not None:
-        return urljoin(base, first["href"].split("?")[0])
-    return None
-
-
-def _parse_rank(card) -> int | None:
-    """Rank badge. Amazon renders it as '#1', '1', etc."""
-    badge = card.select_one(".zg-bdg-text, .zg-badge-text, [class*='badge']")
-    txt = _text(badge)
-    if not txt:
-        # Sometimes the rank is in a class like a11y .zg-bdg-text sibling.
-        for sel in (".a-badge-text", "[class*='rank']"):
-            node = card.select_one(sel)
-            txt = _text(node)
-            if txt:
-                break
-    m = _RANK_RE.search(txt)
-    return int(m.group(1)) if m else None
-
-
-def _parse_title(card) -> str | None:
-    """Product title — several templates over the years."""
-    for sel in (
-        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
-        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
-        "[class*='line-clamp']",
-        ".p13n-sc-truncate",
-        ".p13n-sc-truncated",
-        "a.a-link-normal[title]",
-        "img[alt]",
-    ):
-        node = card.select_one(sel)
-        if node is None:
-            continue
-        if node.name == "img":
-            alt = node.get("alt")
-            if alt:
-                return alt.strip()
-            continue
-        if node.has_attr("title") and node["title"].strip():
-            return node["title"].strip()
-        txt = _text(node)
-        if txt:
-            return txt
-    return None
-
-
-def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
-    """Price value (float) and ISO currency, best-effort across templates."""
-    for sel in (
-        "._cDEzb_p13n-sc-price_3mJ9Z",
-        ".p13n-sc-price",
-        "span.a-price > span.a-offscreen",
-        ".a-price .a-offscreen",
-        "[class*='price']",
-    ):
-        node = card.select_one(sel)
-        txt = _text(node)
-        if not txt:
-            continue
-
-        currency = None
-        for sym, iso in _SYMBOL_TO_CURRENCY.items():
-            if sym in txt:
-                currency = iso
-                break
-        if currency is None:
-            currency = _CURRENCY_BY_MARKET.get(marketplace)
-
-        m = _PRICE_NUM_RE.search(txt)
-        if not m:
-            continue
-        raw = m.group(0)
-        value = _to_float(raw)
-        if value is not None:
-            return value, currency
-    return None, None
-
-
-def _parse_rating(card) -> float | None:
-    """Star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'."""
-    for sel in ("[class*='review-stars']", ".a-icon-alt", "[title*='star']", "[aria-label*='star']"):
-        node = card.select_one(sel)
-        txt = _text(node) or (node.get("title", "") if node is not None else "") or (
-            node.get("aria-label", "") if node is not None else ""
-        )
-        if not txt:
-            continue
-        m = _RATING_RE.search(txt)
-        if m:
-            return _to_float(m.group(1))
-        # Some templates only render the number ('4,5').
-        m2 = _PRICE_NUM_RE.search(txt)
-        if m2 and ("star" in txt.lower() or "estrella" in txt.lower()):
-            return _to_float(m2.group(0))
-    return None
-
-
-def _parse_reviews(card) -> int | None:
-    """Number of ratings/reviews shown next to the stars."""
-    for sel in (
-        "a.a-size-small.a-link-normal",
-        ".a-size-small.a-link-normal",
-        "[class*='review-count']",
-        "span.a-size-small",
-    ):
-        for node in card.select(sel):
-            txt = _text(node)
-            if not txt:
-                continue
-            m = _REVIEWS_RE.search(txt)
-            if not m:
-                continue
-            digits = m.group(0).replace(".", "").replace(",", "")
-            if digits.isdigit() and len(digits) >= 1:
-                # Avoid catching rank/price by requiring a plausible count token.
-                return int(digits)
-    return None
-
-
-def _parse_pct_change(card) -> float | None:
-    """Movers & Shakers percentage change ('+150%')."""
-    for sel in (".zg-percent-change", "[class*='percent']", "[class*='sales-movement']"):
-        node = card.select_one(sel)
-        txt = _text(node)
-        if not txt:
-            continue
-        m = _PCT_RE.search(txt)
-        if m:
-            value = _to_float(m.group(1))
-            if value is None:
-                continue
-            return -value if txt.strip().startswith("-") else value
-    return None
-
-
-def _to_float(raw: str) -> float | None:
-    """Parse a numeric string with EU or US decimal/grouping conventions."""
-    if raw is None:
-        return None
-    s = raw.strip().replace("\xa0", "").replace(" ", "")
-    if not s:
-        return None
-    if "," in s and "." in s:
-        # The rightmost separator is the decimal one.
-        if s.rfind(",") > s.rfind("."):
-            s = s.replace(".", "").replace(",", ".")
-        else:
-            s = s.replace(",", "")
-    elif "," in s:
-        # Treat a single comma as decimal separator (EU markets).
-        s = s.replace(",", ".")
-    try:
-        return float(s)
-    except ValueError:
-        return None
-
-
-def _select_cards(soup: BeautifulSoup) -> list:
-    """Locate the list-item cards across known Amazon templates."""
-    selectors = (
-        "div.p13n-sc-uncoverable-faceout",
-        "div[id^='gridItemRoot']",
-        "div.zg-grid-general-faceout",
-        "li.zg-item-immersion",
-        "div.a-cardui[data-asin]",
-        "div[data-asin]",
-    )
-    for sel in selectors:
-        cards = soup.select(sel)
-        if cards:
-            return cards
-    return []
-
-
 def scrape_amazon_bestsellers(
    marketplace: str = "amazon.es",
    categories: list[str] | None = None,
@@ -365,7 +145,8 @@ def scrape_amazon_bestsellers(
        ``marketplace, list_type, category, rank, asin, title, price,
        currency, rating, reviews, pct_change, url``. Missing values are
        ``None``. ``price``/``rating``/``pct_change`` are floats,
-        ``rank``/``reviews`` are ints.
+        ``rank``/``reviews`` are ints. ``pct_change`` is only filled for
+        ``movers_shakers``.

    Raises:
        ValueError: If ``list_type`` is not one of the allowed values.
@@ -384,42 +165,16 @@ def scrape_amazon_bestsellers(
    for category in cats:
        url = _build_url(marketplace, list_type, category)
        resp = _fetch(url, headers, timeout=20, retries=2)
-        soup = BeautifulSoup(resp.text, "lxml")
-        cards = _select_cards(soup)
-
-        count = 0
-        for idx, card in enumerate(cards):
-            if count >= max_items:
-                break
-            asin = _parse_asin(card)
-            title = _parse_title(card)
-            # Skip empty / non-product wrappers.
-            if asin is None and title is None:
-                continue
-
-            rank = _parse_rank(card)
-            if rank is None:
-                rank = idx + 1  # positional fallback when no badge is rendered
-
-            price, currency = _parse_price(card, marketplace)
-            results.append(
-                {
-                    "marketplace": marketplace,
-                    "list_type": list_type,
-                    "category": category,
-                    "rank": rank,
-                    "asin": asin,
-                    "title": title,
-                    "price": price,
-                    "currency": currency,
-                    "rating": _parse_rating(card),
-                    "reviews": _parse_reviews(card),
-                    "pct_change": _parse_pct_change(card)
-                    if list_type == "movers_shakers"
-                    else None,
-                    "url": _parse_url(card, marketplace),
-                }
-            )
-            count += 1
+        rows = parse_amazon_ranking_html(
+            resp.text,
+            marketplace=marketplace,
+            list_type=list_type,
+            max_items=max_items,
+        )
+        # The pure parser leaves category=None (it doesn't know the URL);
+        # stamp the category we requested.
+        for row in rows:
+            row["category"] = category
+        results.extend(rows)

    return results