feat(browser): auto-commit con 178 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00
parent 7d100e7f3e
commit 763e06c127
178 changed files with 19917 additions and 317 deletions
@@ -0,0 +1,340 @@
+"""Scraper de Amazon Movers & Shakers via Chrome DevTools Protocol (CDP).
+
+Funcion IMPURA: la pagina ``/gp/movers-and-shakers/`` de Amazon monta sus cards
+por JavaScript (el GET HTTP puro devuelve 0 productos), asi que esta funcion
+renderiza la pagina en un Chrome con remote debugging, espera a que el grid de
+ranking monte async, extrae el ``outerHTML`` renderizado y se lo pasa al parser
+PURO del registry (``parse_amazon_ranking_html``) — el mismo que usa el scraper
+HTTP de bestsellers, sin reescribir el parsing.
+
+Movers & Shakers = productos cuyo ranking de ventas mas sube en las ultimas 24h
+= la mejor senal publica de demanda emergente (clave para dropshipping). Aporta
+el PRECIO DE VENTA en el marketplace (ej. amazon.es en EUR) y el % de subida en
+ranking por producto.
+
+Compone DOS funciones del registry (no reescribe transporte CDP ni parsing):
+  1. ``cdp_open_url_and_wait`` (pipeline) — crea tab nuevo en el Chrome remoto,
+     navega a la URL de listado y espera ``Page.loadEventFired``.
+  2. ``cdp_eval`` (browser) — evalua JS en la pestana cuyo URL contiene un
+     substring (polling de cards + extraccion del ``outerHTML`` del grid).
+
+Devuelve SIEMPRE un dict autosuficiente (estilo del grupo market-intel): nunca
+lanza. NUNCA inventa datos: si no hay cards tras el timeout devuelve
+``status="error"``; si Amazon sirve un captcha, ``status="captcha"``.
+"""
+
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from browser.cdp_eval import cdp_eval
+from datascience.parse_amazon_ranking_html import parse_amazon_ranking_html
+from pipelines.cdp_open_url_and_wait import cdp_open_url_and_wait
+
+# Substrings that identify an Amazon anti-bot interstitial / captcha page.
+# They are written in lowercase because they are searched for inside the
+# lowercased visible text of the rendered page.
+_CAPTCHA_MARKERS = (
+    "enter the characters you see below",
+    "to discuss automated access",
+    "api-services-support@amazon",
+    "robot check",
+    "/errors/validatecaptcha",
+)
+
+# JavaScript expression that counts the ranking cards currently mounted in the
+# DOM. One ``querySelectorAll(...).length`` term is generated per selector and
+# the terms are chained with ``||``, so the first non-zero count wins. The
+# original author notes that movers shares its page template with bestsellers.
+_CARD_COUNT_JS = "({})".format(
+    " || ".join(
+        f"document.querySelectorAll('{selector}').length"
+        for selector in (
+            'div[id="gridItemRoot"]',
+            "li.zg-item-immersion",
+            ".p13n-desktop-grid div[data-asin]",
+        )
+    )
+)
+
+
+def _build_url(marketplace: str, category: str | None) -> str:
+    """Build the Movers & Shakers listing URL for a marketplace and category.
+
+    Args:
+        marketplace: Amazon domain without scheme or ``www.`` prefix
+            (e.g. ``"amazon.es"``); it is interpolated into the URL as given.
+        category: Category slug, or None for the general landing page.
+            Surrounding whitespace and slashes are ignored, and a slug that is
+            empty after that cleanup is treated exactly like None.
+
+    Returns:
+        ``https://www.<marketplace>/gp/movers-and-shakers``, with ``/<slug>``
+        appended when a non-empty slug was given.
+    """
+    base = f"https://www.{marketplace}/gp/movers-and-shakers"
+    # Strip whitespace as well as slashes so inputs such as " pet-supplies " or
+    # "/" never produce a URL with embedded spaces or a dangling trailing slash.
+    slug = category.strip().strip("/").strip() if category else ""
+    return f"{base}/{slug}" if slug else base
+
+
+def _detect_captcha(port: int, target_substr: str) -> bool:
+    """Report whether the rendered page looks like an anti-bot/captcha page.
+
+    Reads at most the first 4000 characters of ``document.body.innerText`` from
+    the tab selected by ``target_substr`` and searches that text,
+    case-insensitively, for any entry of ``_CAPTCHA_MARKERS``.
+
+    Args:
+        port: Remote-debugging port passed through to ``cdp_eval``.
+        target_substr: URL substring passed to ``cdp_eval`` to pick the tab.
+
+    Returns:
+        True when a marker is found; False when none is found or when the
+        evaluation itself did not succeed.
+    """
+    probe = cdp_eval(
+        "document.body ? document.body.innerText.slice(0, 4000) : ''",
+        port=port,
+        target_url_substr=target_substr,
+        timeout_s=10.0,
+    )
+    if probe.get("ok"):
+        page_text = (probe.get("value") or "").lower()
+        for marker in _CAPTCHA_MARKERS:
+            if marker in page_text:
+                return True
+    # Either the evaluation failed or no marker matched.
+    return False
+
+
+def _wait_for_cards(port: int, target_substr: str, deadline: float) -> int:
+    """Poll the page until at least one ranking card is mounted or time runs out.
+
+    The original author notes that the grid is mounted asynchronously after
+    hydration, so the page load event alone does not guarantee that the cards
+    are in the DOM; hence this polling loop.
+
+    Args:
+        port: Remote-debugging port passed through to ``cdp_eval``.
+        target_substr: URL substring passed to ``cdp_eval`` to pick the tab.
+        deadline: Absolute ``time.time()`` timestamp after which polling stops.
+
+    Returns:
+        The card count reported by the first successful poll that sees more
+        than zero cards, or 0 if the deadline passes first.
+    """
+    while time.time() < deadline:
+        r = cdp_eval(
+            _CARD_COUNT_JS,
+            port=port,
+            target_url_substr=target_substr,
+            timeout_s=10.0,
+        )
+        if r.get("ok"):
+            try:
+                n = int(r.get("value") or 0)
+            except (TypeError, ValueError):
+                # A non-numeric value is treated as "no cards yet".
+                n = 0
+            if n > 0:
+                return n
+        # Never sleep past the deadline: the original unconditional 1 s pause
+        # after the final poll only delayed the caller without polling again.
+        remaining = deadline - time.time()
+        if remaining <= 0:
+            break
+        time.sleep(min(1.0, remaining))
+    return 0
+
+
+def _grab_grid_html(port: int, target_substr: str, timeout_s: float) -> str:
+    """Return the rendered ``outerHTML`` of the ranking grid, else of the body.
+
+    The JavaScript looks up ``.p13n-desktop-grid``; when that element is absent
+    it falls back to ``document.body`` and finally to an empty string.
+
+    Args:
+        port: Remote-debugging port passed through to ``cdp_eval``.
+        target_substr: URL substring passed to ``cdp_eval`` to pick the tab.
+        timeout_s: Requested evaluation timeout; raised to 15 seconds if lower.
+
+    Returns:
+        The extracted HTML, or ``""`` when the evaluation failed or produced
+        no value.
+    """
+    grid_or_body_js = (
+        "(() => { const g = document.querySelector('.p13n-desktop-grid'); "
+        "return g ? g.outerHTML : (document.body ? document.body.outerHTML : ''); })()"
+    )
+    # Enforce a floor of 15 seconds on the evaluation timeout.
+    eval_timeout = timeout_s if timeout_s > 15.0 else 15.0
+    outcome = cdp_eval(
+        grid_or_body_js,
+        port=port,
+        target_url_substr=target_substr,
+        timeout_s=eval_timeout,
+    )
+    return (outcome.get("value") or "") if outcome.get("ok") else ""
+
+
+def _scrape_one_category(
+    marketplace: str,
+    category: str | None,
+    port: int,
+    max_items: int,
+    timeout_s: float,
+    scraped_at: str,
+) -> dict:
+    """Open one movers category, wait for its cards and extract the products.
+
+    Args:
+        marketplace: Amazon domain used to build the URL; also forwarded to
+            the parser.
+        category: Category slug, or None for the general landing page.
+        port: Remote-debugging port of the Chrome instance to drive.
+        max_items: Maximum number of products requested from the parser.
+        timeout_s: Budget in seconds, applied separately to the navigation and
+            to the card-polling phase.
+        scraped_at: Timestamp string stamped onto every product.
+
+    Returns:
+        ``{"ok": bool, "products": [...], "error": str, "captcha": bool}``.
+        On success every product additionally carries ``category``,
+        ``source`` (``"amazon_movers"``) and ``scraped_at``; rows that have
+        neither ``asin`` nor ``title`` are dropped.
+    """
+    captcha_error = "Amazon sirvio un captcha / interstitial anti-bot"
+    url = _build_url(marketplace, category)
+    # Select the tab by the most specific URL fragment available
+    # ("movers-and-shakers/<slug>"). Tabs opened for earlier categories are
+    # never closed here, so the bare "movers-and-shakers" substring could match
+    # a stale tab and attribute its products to the wrong category.
+    # NOTE(review): assumes Amazon keeps the slug in the final URL after any
+    # redirect — confirm against live pages.
+    target_substr = "movers-and-shakers" + url.partition("movers-and-shakers")[2]
+
+    # 1. Navigate: open a new tab in the remote Chrome and wait for it to load.
+    try:
+        cdp_open_url_and_wait(port, url, int(timeout_s))
+    except Exception as e:  # noqa: BLE001 — RuntimeError from cdp_open_url_and_wait
+        msg = str(e)
+        connection_hints = ("no se pudo crear tab", "URLError", "Connection refused")
+        if any(hint in msg for hint in connection_hints) or "timeout" in msg.lower():
+            # Rewrite connection-style failures into an actionable message.
+            msg = (
+                f"no hay Chrome usable en el puerto {port} "
+                f"(¿remote debugging activo?): {e}"
+            )
+        return _category_failure(msg)
+
+    # 2. Detect a captcha as early as possible.
+    if _detect_captcha(port, target_substr):
+        return _category_failure(captcha_error, captcha=True)
+
+    # 3. Poll until the cards are mounted.
+    deadline = time.time() + timeout_s
+    n_cards = _wait_for_cards(port, target_substr, deadline)
+    if n_cards == 0:
+        # Check again: the captcha may have appeared while polling.
+        if _detect_captcha(port, target_substr):
+            return _category_failure(captcha_error, captcha=True)
+        return _category_failure(
+            "no hay cards de ranking (la categoria puede no tener movers ahora "
+            "—Amazon muestra 'no movers and shakers available'— o el chromium "
+            "del puerto sirvio una pagina degradada / no logueada)"
+        )
+
+    # 4. Extract the rendered HTML and hand it to the registry parser.
+    html = _grab_grid_html(port, target_substr, timeout_s)
+    if not html:
+        # Report a failed extraction as such instead of blaming the parser or
+        # the DOM template further down.
+        return _category_failure(
+            f"se montaron {n_cards} cards pero no se pudo extraer el HTML "
+            "renderizado del grid (fallo cdp_eval o pagina vacia)"
+        )
+    rows = parse_amazon_ranking_html(
+        html,
+        marketplace=marketplace,
+        list_type="movers_shakers",
+        max_items=max_items,
+    )
+
+    # 5. Enrich each row and drop rows that identify no product at all.
+    products = []
+    for row in rows:
+        if not row.get("asin") and not row.get("title"):
+            continue
+        row["category"] = category
+        row["source"] = "amazon_movers"
+        row["scraped_at"] = scraped_at
+        products.append(row)
+
+    if not products:
+        return _category_failure(
+            f"se montaron {n_cards} cards pero el parser no extrajo productos "
+            "(¿Amazon roto la plantilla del DOM?)"
+        )
+
+    return {"ok": True, "products": products, "error": "", "captcha": False}
+
+
+def _category_failure(error: str, captcha: bool = False) -> dict:
+    """Build the failure result shared by every error path of a category scrape."""
+    return {"ok": False, "products": [], "error": error, "captcha": captcha}
+
+
+def scrape_amazon_movers_cdp(
+    marketplace: str = "amazon.es",
+    categories: list[str] | None = None,
+    port: int = 9222,
+    max_items: int = 30,
+    timeout_s: float = 25.0,
+) -> dict:
+    """Scrape Amazon Movers & Shakers by rendering each listing page via CDP.
+
+    Impure: requires a Chrome instance with remote debugging listening on
+    ``port``. Each category is scraped with ``_scrape_one_category`` and the
+    products of all successful categories are concatenated. The function never
+    raises: any failure, including an unexpected exception while scraping a
+    category, is reported through the returned dict. No data is fabricated;
+    when nothing could be extracted ``products`` is empty.
+
+    Args:
+        marketplace: Target Amazon domain (``"amazon.es"``, ``"amazon.com"``,
+            ...). It determines the URL and is forwarded to the parser.
+        categories: Movers category slugs (e.g. ``"automotive"``,
+            ``"pet-supplies"``); each one navigates to
+            ``/gp/movers-and-shakers/<slug>``. None or an empty list scrapes
+            the general landing page. A single string is accepted and treated
+            as a one-element list.
+        port: Remote-debugging port of the Chrome instance. Default 9222.
+        max_items: Maximum number of products collected per category.
+        timeout_s: Per-category timeout in seconds, used both for navigation
+            and for polling until the cards appear. Default 25.0.
+
+    Returns:
+        A self-contained dict. On success (at least one product)::
+
+            {
+                "status": "ok",
+                "source": "amazon_movers",
+                "count": <number of products>,
+                "products": [ {product_dict}, ... ],
+            }
+
+        Each product dict is whatever ``parse_amazon_ranking_html`` produced,
+        plus ``category``, ``source`` (``"amazon_movers"``) and ``scraped_at``
+        (ISO 8601, UTC). According to the original documentation the parser
+        supplies marketplace, list_type, rank, asin, title, price, currency,
+        rating, reviews, pct_change and url — the parser is not visible here,
+        so verify against it.
+
+        When no product was extracted::
+
+            {"status": "error",   "error": <msg>, "source": "amazon_movers", "products": []}
+
+        or, if any category hit a captcha::
+
+            {"status": "captcha", "error": <msg>, "source": "amazon_movers", "products": []}
+    """
+    scraped_at = datetime.now(timezone.utc).isoformat()
+    # A bare string would otherwise be exploded into one "category" per
+    # character by list(); treat it as a single slug.
+    if isinstance(categories, str):
+        categories = [categories]
+    cats: list[str | None] = list(categories) if categories else [None]
+
+    all_products: list[dict] = []
+    last_error = ""
+    saw_captcha = False
+    captcha_error = ""
+
+    for category in cats:
+        try:
+            res = _scrape_one_category(
+                marketplace=marketplace,
+                category=category,
+                port=port,
+                max_items=max_items,
+                timeout_s=timeout_s,
+                scraped_at=scraped_at,
+            )
+        except Exception as e:  # noqa: BLE001 — enforce the "never raises" contract
+            # Only the navigation step is guarded inside the category scraper;
+            # anything else that escapes is recorded as this category's error.
+            last_error = (
+                f"fallo inesperado en la categoria {category!r}: "
+                f"{type(e).__name__}: {e}"
+            )
+            continue
+        if res["ok"]:
+            all_products.extend(res["products"])
+            continue
+        last_error = res["error"]
+        if res.get("captcha"):
+            saw_captcha = True
+            # Remember the captcha message so that status "captcha" is never
+            # paired with an unrelated error from a later category.
+            captcha_error = res["error"]
+
+    if all_products:
+        return {
+            "status": "ok",
+            "source": "amazon_movers",
+            "count": len(all_products),
+            "products": all_products,
+        }
+
+    # No products in any category: report captcha if one was seen, else error.
+    if saw_captcha:
+        status = "captcha"
+        error = captcha_error or last_error
+    else:
+        status = "error"
+        error = last_error
+    return {
+        "status": status,
+        "error": error or "no se extrajo ningun producto",
+        "source": "amazon_movers",
+        "products": [],
+    }
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: parse the options, run the scraper once and
+    # print its result dict to stdout as indented JSON.
+    cli = argparse.ArgumentParser(
+        description="Scraper de Amazon Movers & Shakers via CDP."
+    )
+    cli.add_argument("--marketplace", default="amazon.es")
+    cli.add_argument(
+        "--categories",
+        default="",
+        help="slugs separados por coma (ej. automotive,pet-supplies). Vacio = portada.",
+    )
+    cli.add_argument("--port", type=int, default=9222)
+    cli.add_argument("--max-items", type=int, default=30)
+    cli.add_argument("--timeout-s", type=float, default=25.0)
+    opts = cli.parse_args()
+
+    # Split the comma-separated list, dropping blank entries; an empty result
+    # means "scrape the general landing page" and is passed as None.
+    slugs = []
+    for piece in opts.categories.split(","):
+        piece = piece.strip()
+        if piece:
+            slugs.append(piece)
+
+    result = scrape_amazon_movers_cdp(
+        marketplace=opts.marketplace,
+        categories=slugs or None,
+        port=opts.port,
+        max_items=opts.max_items,
+        timeout_s=opts.timeout_s,
+    )
+    print(json.dumps(result, ensure_ascii=False, indent=2))