feat(shell): auto-commit con 31 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-14 23:55:16 +02:00
parent 1430039688
commit e1e9bb7499
31 changed files with 3917 additions and 0 deletions
@@ -0,0 +1,393 @@
+"""Collect popular AliExpress products as an e-commerce/dropshipping signal.
+
+Extracts the JSON that AliExpress embeds in the HTML of its search/listing page
+(``window.runParams`` / ``window._dida_config_`` / ``_init_data_`` script data)
+instead of parsing the JS-rendered DOM. AliExpress has strong anti-bot defences
+(captcha, 403, fingerprinting of headless/datacenter clients), so this module is
+best-effort: when the real fetch is blocked it raises ``RuntimeError`` with a
+clear message. It NEVER fabricates data.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+
+_BASE = "https://www.aliexpress.com"
+_WHOLESALE = f"{_BASE}/wholesale"
+
+# Realistic desktop-browser headers. AliExpress fingerprints aggressively, so we
+# send a coherent Chrome-like profile; "Accept-Language" is added per request
+# from the selected region.
+#
+# "Accept-Encoding" is deliberately NOT set here. Hard-coding "br" makes the
+# server free to answer with Brotli, which requests/urllib3 can only decode
+# when an optional Brotli package is installed; without it the body is
+# undecodable garbage and the scraper would silently return []. Leaving the
+# header out lets requests advertise exactly the encodings it can decode.
+_DESKTOP_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;q=0.9,"
+        "image/avif,image/webp,image/apng,*/*;q=0.8"
+    ),
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Cache-Control": "max-age=0",
+}
+
+# Values used to build the region/currency cookies.
+# Map: ship_to country code -> (region, locale, currency).
+_REGION_MAP: dict[str, tuple[str, str, str]] = {
+    "ES": ("ES", "es_ES", "EUR"),
+    "US": ("US", "en_US", "USD"),
+    "GB": ("GB", "en_GB", "GBP"),
+    "FR": ("FR", "fr_FR", "EUR"),
+    "DE": ("DE", "de_DE", "EUR"),
+    "IT": ("IT", "it_IT", "EUR"),
+    "PT": ("PT", "pt_PT", "EUR"),
+    # NOTE(review): Mexico is mapped to USD rather than MXN — confirm intended.
+    "MX": ("MX", "es_MX", "USD"),
+    "BR": ("BR", "pt_BR", "BRL"),
+}
+
+# Substrings whose presence in the response is treated as an anti-bot block.
+_BLOCK_MARKERS = (
+    "punish",          # /_____tmd_____/punish — AliExpress slider captcha
+    "nc_token",        # Alibaba NoCaptcha
+    "captcha",
+    "Access Denied",
+    "baxia-dialog",    # verification widget
+)
+
+
+def _region_cookies(ship_to: str) -> dict[str, str]:
+    """Build the region/locale/currency cookies for a ship-to country.
+
+    Args:
+        ship_to: Country code; matched case-insensitively against
+            ``_REGION_MAP``. Unknown codes fall back to the "ES" profile.
+
+    Returns:
+        Dict with the ``aep_usuc_f``, ``intl_locale`` and ``xman_us_f`` cookies.
+    """
+    fallback = _REGION_MAP["ES"]
+    profile = _REGION_MAP.get(ship_to.upper(), fallback)
+    region, locale, currency = profile
+
+    cookies: dict[str, str] = {}
+    cookies["aep_usuc_f"] = (
+        f"site=glo&c_tp={currency}&region={region}&b_locale={locale}"
+    )
+    cookies["intl_locale"] = locale
+    cookies["xman_us_f"] = (
+        f"x_l=0&no_popup_today=n&zero_order=n&x_locale={locale}"
+    )
+    return cookies
+
+
+def _build_url(query: str | None, category: str | None) -> str:
+    """Return the listing URL sorted by ``total_tranpro_desc``.
+
+    Precedence: a non-empty ``query`` always wins. Otherwise an all-digit
+    ``category`` maps to ``/category/<id>/x.html`` and any other non-empty
+    ``category`` is used as search text. With neither, a generic
+    "hot products" search is used.
+    """
+    from urllib.parse import quote_plus
+
+    search_text = query or None
+    if search_text is None and category:
+        if category.isdigit():
+            # Numeric category ids have their own listing path.
+            return f"{_BASE}/category/{category}/x.html?SortType=total_tranpro_desc"
+        # A slug-like category is simply searched for as text.
+        search_text = category
+
+    if search_text is None:
+        # Neither query nor category: generic listing.
+        return f"{_WHOLESALE}?SearchText=hot+products&SortType=total_tranpro_desc"
+
+    encoded = quote_plus(search_text)
+    return f"{_WHOLESALE}?SearchText={encoded}&SortType=total_tranpro_desc"
+
+
+def _looks_blocked(html: str, status_code: int) -> bool:
+    """Tell whether a response looks like an anti-bot block.
+
+    A response counts as blocked when its status is 403, 429 or 503, or when
+    any ``_BLOCK_MARKERS`` entry occurs (case-insensitively) within the first
+    6000 characters of ``html``.
+    """
+    blocking_statuses = (403, 429, 503)
+    if status_code in blocking_statuses:
+        return True
+
+    # Only the start of the document is inspected.
+    snippet = html[:6000].lower()
+    for marker in _BLOCK_MARKERS:
+        if marker.lower() in snippet:
+            return True
+    return False
+
+
+def _extract_embedded_json(html: str) -> dict[str, Any] | None:
+    """Return the first non-empty JSON object embedded in an AliExpress page.
+
+    Several assignment forms are tried in order (``window.runParams``,
+    ``window._dida_config_``, ``_init_data_ = { data: ... }``). For each form
+    every occurrence in the document is tried, because a page may first assign
+    an empty placeholder (``window.runParams = {};``) and only later the real
+    payload.
+
+    The payload is decoded with ``json.JSONDecoder.raw_decode`` starting at its
+    opening brace instead of being cut out with a non-greedy regex: the decoder
+    consumes exactly one JSON value, so nested objects and strings containing
+    ``};`` cannot truncate the blob.
+
+    Args:
+        html: Full HTML document text.
+
+    Returns:
+        The decoded dict, or ``None`` when no occurrence decodes to a non-empty
+        JSON object.
+    """
+    # Each anchor stops right before the opening brace of the JSON payload.
+    anchors = (
+        r"window\.runParams\s*=\s*(?={)",
+        r"window\._dida_config_\s*=\s*(?={)",
+        r"_init_data_\s*=\s*{\s*data:\s*(?={)",
+    )
+    decoder = json.JSONDecoder()
+    for anchor in anchors:
+        for match in re.finditer(anchor, html):
+            try:
+                data, _end = decoder.raw_decode(html, match.end())
+            except ValueError:  # json.JSONDecodeError is a ValueError
+                continue
+            # Empty placeholders carry no items; keep looking.
+            if isinstance(data, dict) and data:
+                return data
+    return None
+
+
+def _dig_items(data: dict[str, Any]) -> list[dict[str, Any]]:
+    """Find the product list inside the embedded JSON, whatever its nesting.
+
+    Walks ``data`` depth-first (dict values and list elements, in order) and
+    returns the product-like dicts of the first list that holds at least two
+    of them. A dict is product-like when it has at least one id key
+    (``productId``/``product_id``/``productid``/``id``) and at least one title
+    key (``title``/``subject``/``name``). Returns ``[]`` when nothing matches.
+    """
+    id_keys = {"productId", "product_id", "productid", "id"}
+    title_keys = {"title", "subject", "name"}
+
+    def _looks_like_product(candidate: Any) -> bool:
+        if not isinstance(candidate, dict):
+            return False
+        has_id = not id_keys.isdisjoint(candidate)
+        has_title = not title_keys.isdisjoint(candidate)
+        return has_id and has_title
+
+    def _search(node: Any) -> list[dict[str, Any]] | None:
+        if isinstance(node, list):
+            # A list is checked itself before its elements are descended into.
+            hits = [entry for entry in node if _looks_like_product(entry)]
+            if len(hits) >= 2:
+                return hits
+            children = node
+        elif isinstance(node, dict):
+            children = node.values()
+        else:
+            return None
+        for child in children:
+            result = _search(child)
+            if result is not None:
+                return result
+        return None
+
+    located = _search(data)
+    return [] if located is None else located
+
+
+def _to_float(value: Any) -> float | None:
+    """Parse a price/rating-like value into a float.
+
+    Numbers are converted directly (bools are rejected). For anything else the
+    first run of digits, dots and commas in ``str(value)`` is interpreted as:
+
+    * both ``.`` and ``,`` present: whichever appears last is the decimal
+      separator, the other is a thousands separator
+      (``"1.234,56"`` and ``"1,234.56"`` both give ``1234.56``);
+    * the same separator repeated (``"1.234.567"``): thousands separators;
+    * a single comma: decimal comma (``"12,99"`` gives ``12.99``);
+    * a single dot: decimal point.
+
+    Args:
+        value: Number, string or ``None``.
+
+    Returns:
+        The parsed float, or ``None`` when ``value`` is ``None``, a bool, or
+        holds no parseable number.
+    """
+    if value is None or isinstance(value, bool):
+        # bool is an int subclass; True/False are not prices or ratings.
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+
+    match = re.search(r"\d[\d.,]*", str(value))
+    if not match:
+        return None
+    # Drop sentence punctuation captured after the number ("12.99.").
+    num = match.group(0).rstrip(".,")
+
+    last_dot = num.rfind(".")
+    last_comma = num.rfind(",")
+    if last_dot != -1 and last_comma != -1:
+        # The right-most separator is the decimal one.
+        if last_comma > last_dot:
+            decimal_sep, thousands_sep = ",", "."
+        else:
+            decimal_sep, thousands_sep = ".", ","
+        num = num.replace(thousands_sep, "").replace(decimal_sep, ".")
+    elif num.count(",") > 1 or num.count(".") > 1:
+        # A repeated separator can only be grouping digits.
+        num = num.replace(",", "").replace(".", "")
+    elif last_comma != -1:
+        num = num.replace(",", ".")
+
+    try:
+        return float(num)
+    except ValueError:
+        return None
+
+
+def _to_orders(value: Any) -> int | None:
+    """Parse an order/sales count such as "1,234 sold" or "2.3k sold".
+
+    Handled formats:
+
+    * ints are returned unchanged (bools are rejected), floats are truncated;
+    * ``"1,234 sold"`` / ``"5.000+ vendidos"``: separators that group digits in
+      threes are thousands separators;
+    * ``"2.3k sold"`` / ``"2,3K+"``: a ``k`` directly after the number (optional
+      whitespace allowed, not followed by another letter) multiplies by 1000
+      and the separator is read as a decimal mark;
+    * ``"10000+ orders"``: plain digits.
+
+    A ``k`` elsewhere in the text (e.g. "in stock") does NOT scale the number.
+
+    Args:
+        value: Number, string or ``None``.
+
+    Returns:
+        The parsed count, or ``None`` when nothing parseable is found.
+    """
+    if value is None or isinstance(value, bool):
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float):
+        try:
+            return int(value)
+        except (ValueError, OverflowError):  # NaN / infinity
+            return None
+
+    text = str(value).lower()
+    match = re.search(r"\d[\d.,]*", text)
+    if not match:
+        return None
+    num = match.group(0).rstrip(".,")
+
+    # Only a "k" attached to the number is a multiplier.
+    has_k_suffix = re.match(r"\s*k(?![a-z])", text[match.end():]) is not None
+
+    if has_k_suffix:
+        # "2,3k" and "2.3k" both mean 2300.
+        num = num.replace(",", ".")
+    elif re.fullmatch(r"\d{1,3}(?:[.,]\d{3})+", num):
+        # Digits grouped in threes: "1,234" / "5.000" / "1.234.567".
+        num = num.replace(",", "").replace(".", "")
+    else:
+        num = num.replace(",", "")
+
+    try:
+        base = float(num)
+    except ValueError:
+        return None
+
+    if has_k_suffix:
+        # round() guards against float artefacts such as 1000.9999999999999.
+        return int(round(base * 1000))
+    return int(base)
+
+
+def _normalize_item(
+    raw: dict[str, Any], category: str | None
+) -> dict[str, Any] | None:
+    """Flatten one raw item dict into the module's output row.
+
+    Each field is read from a nested location first (``prices.salePrice``,
+    ``trade.tradeDesc``, ``evaluation.starRating``) and then from flat
+    fallback keys.
+
+    Args:
+        raw: Item dict as found in the embedded JSON.
+        category: Label copied verbatim into the ``category`` field.
+
+    Returns:
+        Dict with keys ``category, product_id, title, price, currency, orders,
+        rating, url``, or ``None`` when no product id can be resolved.
+    """
+
+    def _pick(*candidates: Any) -> Any:
+        # Same result as ``a or b or c``: first truthy value, else the last one.
+        chosen = None
+        for chosen in candidates:
+            if chosen:
+                break
+        return chosen
+
+    def _child(parent: Any, key: str) -> Any:
+        # Nested lookup that tolerates a missing or non-dict parent.
+        return parent.get(key) if isinstance(parent, dict) else None
+
+    identifier = _pick(
+        raw.get("productId"),
+        raw.get("product_id"),
+        raw.get("productid"),
+        raw.get("id"),
+    )
+    if identifier is None:
+        return None
+    product_id = str(identifier)
+
+    title = _pick(raw.get("title"), raw.get("subject"), raw.get("name"))
+    if isinstance(title, dict):
+        title = _pick(title.get("displayTitle"), title.get("seoTitle"))
+    title = str(title).strip() if title else None
+
+    # Price and currency: 'prices.salePrice' first, then flat variants.
+    sale_price = _child(raw.get("prices"), "salePrice")
+    price = _to_float(
+        _pick(
+            _child(sale_price, "minPrice"),
+            raw.get("salePrice"),
+            raw.get("price"),
+            raw.get("minPrice"),
+        )
+    )
+
+    currency = _pick(
+        _child(sale_price, "currencyCode"),
+        raw.get("currency"),
+        raw.get("currencyCode"),
+    )
+    currency = str(currency) if currency else None
+
+    orders = _to_orders(_child(raw.get("trade"), "tradeDesc"))
+    if orders is None:
+        orders = _to_orders(
+            _pick(raw.get("orders"), raw.get("tradeCount"), raw.get("sales"))
+        )
+
+    rating = _to_float(
+        _pick(
+            _child(raw.get("evaluation"), "starRating"),
+            raw.get("rating"),
+            raw.get("averageStar"),
+            raw.get("starRating"),
+        )
+    )
+
+    link = _pick(raw.get("productDetailUrl"), raw.get("url"), raw.get("detail_url"))
+    if not link:
+        # No URL in the payload: build the item URL from the id.
+        link = f"{_BASE}/item/{product_id}.html"
+    else:
+        link = str(link)
+        if link.startswith("//"):
+            # Protocol-relative URL: pin it to https.
+            link = "https:" + link
+
+    return {
+        "category": category,
+        "product_id": product_id,
+        "title": title,
+        "price": price,
+        "currency": currency,
+        "orders": orders,
+        "rating": rating,
+        "url": link,
+    }
+
+
+def scrape_aliexpress_trending(
+    query: str | None = None,
+    category: str | None = None,
+    limit: int = 40,
+    ship_to: str = "ES",
+) -> list[dict]:
+    """Collect popular AliExpress products (e-commerce/dropshipping signal).
+
+    Makes ONE HTTP request to the AliExpress listing page sorted by order count
+    (``total_tranpro_desc``) and extracts the JSON embedded in the HTML. This is
+    best-effort: on an anti-bot block (403/429/503 or a captcha marker) it
+    raises ``RuntimeError``; on HTML without parseable JSON it returns ``[]``.
+    It never fabricates data.
+
+    Args:
+        query: Search text (e.g. "kitchen gadgets"). Takes precedence in the URL.
+        category: Numeric AliExpress category id or a slug. Ignored when
+            ``query`` is given.
+        limit: Maximum number of products to return. Default 40. A value of 0
+            or less returns ``[]`` without making a request.
+        ship_to: ISO-2 country code used to set region/currency cookies.
+            Default "ES".
+
+    Returns:
+        List of dicts with exactly the keys
+        ``category, product_id, title, price, currency, orders, rating, url``.
+        ``price``/``rating`` are ``float | None``, ``orders`` is ``int | None``.
+        Empty list when the HTML carried no parseable JSON.
+
+    Raises:
+        RuntimeError: If AliExpress blocks the request or the network fails.
+    """
+    if limit <= 0:
+        # The cap used to be checked only after appending, so limit=0 still
+        # returned one product. Nothing was requested: skip the request too.
+        return []
+
+    # Imported lazily so importing this module does not require requests.
+    import requests
+
+    url = _build_url(query, category)
+    cookies = _region_cookies(ship_to)
+    headers = dict(_DESKTOP_HEADERS)
+    _, locale, _ = _REGION_MAP.get(ship_to.upper(), _REGION_MAP["ES"])
+    headers["Accept-Language"] = f"{locale.replace('_', '-')},en;q=0.8"
+
+    try:
+        resp = requests.get(
+            url,
+            headers=headers,
+            cookies=cookies,
+            timeout=20,
+            allow_redirects=True,
+        )
+    except requests.RequestException as exc:
+        raise RuntimeError(
+            f"scrape_aliexpress_trending: fallo de red contra {url}: {exc}"
+        ) from exc
+
+    html = resp.text or ""
+
+    if _looks_blocked(html, resp.status_code):
+        raise RuntimeError(
+            f"scrape_aliexpress_trending: AliExpress bloqueó la request "
+            f"(status={resp.status_code}, captcha/anti-bot). "
+            f"Usa el browser MCP/CDP con sesión real para esta fuente."
+        )
+
+    data = _extract_embedded_json(html)
+    if data is None:
+        # No expected JSON in the HTML (layout change or empty shell):
+        # return an honest [] instead of making anything up.
+        return []
+
+    # The label mirrors what drove the URL: the query if given, else the category.
+    cat_label = query or category
+
+    out: list[dict] = []
+    seen: set[str] = set()
+    for raw in _dig_items(data):
+        norm = _normalize_item(raw, cat_label)
+        if norm is None or norm["product_id"] in seen:
+            continue
+        seen.add(norm["product_id"])
+        out.append(norm)
+        if len(out) >= limit:
+            break
+
+    return out
+
+
+if __name__ == "__main__":
+    # Honest self-test: the import must succeed, then ONE real fetch is tried.
+    # A blocked or failed fetch (RuntimeError) is reported, never fatal.
+    print("import OK: scrape_aliexpress_trending")
+    expected_keys = {
+        "category",
+        "product_id",
+        "title",
+        "price",
+        "currency",
+        "orders",
+        "rating",
+        "url",
+    }
+    try:
+        rows = scrape_aliexpress_trending(query="phone holder", limit=5, ship_to="ES")
+    except RuntimeError as exc:
+        print(f"fetch real: BLOQUEADO/ERROR honesto -> {exc}")
+    else:
+        if not rows:
+            print(
+                "fetch real: 0 filas (HTML sin JSON embebido parseable "
+                "— layout cambió o shell vacío). NO se inventan datos."
+            )
+        else:
+            sample = rows[0]
+            keys_ok = set(sample) == expected_keys
+            print(
+                f"fetch real: {len(rows)} filas obtenidas | "
+                f"claves correctas={keys_ok}"
+            )
+            print(f"  muestra: {sample}")