feat(shell): auto-commit con 31 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-14 23:55:16 +02:00
parent 1430039688
commit e1e9bb7499
31 changed files with 3917 additions and 0 deletions
@@ -10,8 +10,18 @@ from .datascience import (
    autocorrelation,
    linspace,
 )
+from .scrape_amazon_bestsellers import scrape_amazon_bestsellers
+from .scrape_google_trends import scrape_google_trends
+from .scrape_competitor_prices import scrape_competitor_prices
+from .scrape_tiktok_creative import scrape_tiktok_creative
+from .scrape_aliexpress_trending import scrape_aliexpress_trending

 __all__ = [
+    "scrape_amazon_bestsellers",
+    "scrape_google_trends",
+    "scrape_competitor_prices",
+    "scrape_tiktok_creative",
+    "scrape_aliexpress_trending",
    "pearson",
    "standardize",
    "min_max_scale",
@@ -0,0 +1,274 @@
+"""Scrapea productos trending de AliExpress conduciendo un Chrome real por CDP.
+
+Variante que SI funciona frente al bloqueo por captcha: en vez de pedir el HTML
+por HTTP (que devuelve un challenge/captcha para la busqueda de AliExpress),
+abre una pestana en un Chrome con perfil real (puerto de remote debugging) que
+ejecuta el JavaScript de la SPA y renderiza los productos. La extraccion se hace
+con `cdp_eval` del registry, scrolleando para forzar el lazy-load de tarjetas.
+
+Devuelve dicts con claves 1:1 con la tabla Postgres `aliexpress_trends`
+(sin id/snapshot_date/scraped_at), listos para insertar.
+"""
+
+import json
+import re
+import time
+import urllib.parse
+
+import requests
+
+from browser.cdp_eval import cdp_eval
+
+
+# Expresion JS de extraccion. Se evalua una sola vez tras el scroll y devuelve
+# JSON.stringify de la lista de filas. Tolerante: campos ausentes -> null, nunca
+# aborta una tarjeta. Deduplica por product_id dentro del propio JS.
+_EXTRACT_JS = r"""
+(function () {
+  // 1. product_id desde el href: /item/<ID>.html, o promo (?productIds=<ID>:...
+  //    o x_object_id=<ID>).
+  function productIdFromHref(href) {
+    if (!href) return null;
+    var m = href.match(/\/item\/(\d+)\.html/);
+    if (m) return m[1];
+    m = href.match(/[?&]productIds=(\d+)/);
+    if (m) return m[1];
+    m = href.match(/x_object_id(?:%3A|:|=)(\d+)/);
+    if (m) return m[1];
+    return null;
+  }
+
+  // 2. href absoluto al producto. Prefiere un <a href*="/item/"> dentro del card;
+  //    si no, el href del propio anchor de la tarjeta.
+  function absUrl(href) {
+    if (!href) return null;
+    if (href.indexOf("//") === 0) return "https:" + href;
+    if (href.indexOf("http") === 0) return href;
+    return "https://www.aliexpress.com" + href;
+  }
+
+  // 3. precio EUR -> float (coma decimal ES). "0,33€" -> 0.33. "GRATIS" -> null.
+  function parsePrice(txt) {
+    if (!txt) return null;
+    // primer token monetario con € o EUR
+    var m = txt.match(/([\d.]+,\d+)\s*(?:€|EUR)/);
+    if (!m) m = txt.match(/(?:€|EUR)\s*([\d.]+,\d+)/);
+    if (!m) m = txt.match(/([\d.]+)\s*(?:€|EUR)/);
+    if (!m) return null;
+    var raw = m[1].replace(/\./g, "").replace(",", ".");
+    var v = parseFloat(raw);
+    return isFinite(v) ? v : null;
+  }
+
+  // 4. pedidos: "100K+ vendidos", "50.000+ vendidos", "1.000+ sold", "234 sold".
+  function parseOrders(txt) {
+    if (!txt) return null;
+    var m = txt.match(/([\d.,]+)\s*([KkMm])?\s*\+?\s*(?:vendidos|sold|orders|pedidos)/);
+    if (!m) return null;
+    var num = m[1].replace(/\./g, "").replace(/,/g, ".");
+    var val = parseFloat(num);
+    if (!isFinite(val)) return null;
+    var suf = (m[2] || "").toLowerCase();
+    if (suf === "k") val *= 1000;
+    else if (suf === "m") val *= 1000000;
+    return Math.round(val);
+  }
+
+  // 5. rating: primer "4.9" / "4,9" tras el bloque de precio (0-5).
+  function parseRating(txt) {
+    if (!txt) return null;
+    var matches = txt.match(/\b([0-5][.,]\d)\b/g);
+    if (!matches) return null;
+    for (var i = 0; i < matches.length; i++) {
+      var v = parseFloat(matches[i].replace(",", "."));
+      if (v >= 0 && v <= 5) return v;
+    }
+    return null;
+  }
+
+  var anchors = Array.prototype.slice.call(
+    document.querySelectorAll("a.search-card-item")
+  );
+  var seen = {};
+  var rows = [];
+
+  for (var i = 0; i < anchors.length; i++) {
+    var a = anchors[i];
+    var card = a.closest(".search-item-card-wrapper-gallery") || a;
+
+    // href al producto: primero un <a href*="/item/"> dentro del card.
+    var href = null;
+    var inner = card.querySelectorAll("a");
+    for (var j = 0; j < inner.length; j++) {
+      var h = inner[j].getAttribute("href") || "";
+      if (/\/item\/\d+\.html/.test(h)) { href = h; break; }
+    }
+    if (!href) href = a.getAttribute("href") || "";
+
+    var pid = productIdFromHref(href);
+    if (!pid || seen[pid]) continue;
+    seen[pid] = true;
+
+    var img = card.querySelector("img");
+    var title = img ? (img.getAttribute("alt") || "") : "";
+    if (!title) title = (a.innerText || "").trim();
+    title = (title || "").trim() || null;
+
+    var text = card.innerText || "";
+
+    rows.push({
+      product_id: pid,
+      title: title,
+      price: parsePrice(text),
+      currency: "EUR",
+      orders: parseOrders(text),
+      rating: parseRating(text),
+      url: absUrl(href)
+    });
+  }
+
+  return JSON.stringify(rows);
+})()
+"""
+
+
def _sanitize_rows(rows: list, query: str, limit: int) -> list[dict]:
    """Convert raw rows from the in-page extractor into table-ready dicts.

    Entries that are not dicts or that lack a ``product_id`` are skipped,
    duplicates (by ``product_id``) are dropped, and at most ``limit`` rows are
    returned. ``query`` is copied into every row as ``category``.
    """

    def _number(value, cast):
        # bool is a subclass of int: a stray true/false must not become 1/0.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return cast(value)

    out: list[dict] = []
    seen: set[str] = set()
    for row in rows:
        if len(out) >= limit:
            break
        if not isinstance(row, dict):
            continue
        pid = row.get("product_id")
        if not pid:
            continue
        pid = str(pid)
        if pid in seen:
            continue
        seen.add(pid)
        out.append(
            {
                "category": query,
                "product_id": pid,
                "title": row.get("title"),
                "price": _number(row.get("price"), float),
                "currency": row.get("currency") or "EUR",
                "orders": _number(row.get("orders"), int),
                "rating": _number(row.get("rating"), float),
                "url": row.get("url"),
            }
        )
    return out


def cdp_scrape_aliexpress_trending(
    query: str = "gadgets",
    limit: int = 40,
    ship_to: str = "ES",
    port: int = 9222,
) -> list[dict]:
    """Scrape AliExpress search results by driving a real Chrome over CDP.

    Opens a tab on the AliExpress search page sorted by
    ``SortType=total_tranpro_desc``, waits for the initial render, scrolls to
    trigger lazy-loading of product cards, and extracts the products with a
    single ``cdp_eval`` of ``_EXTRACT_JS``. The tab is always closed afterwards
    (best-effort).

    Args:
        query: Search term. Also stored as ``category`` in every row.
        limit: Maximum number of products returned after de-duplicating by
            ``product_id``. A value <= 0 returns ``[]`` without opening a tab.
        ship_to: Country code sent as the ``shipCountry`` URL parameter.
        port: Remote-debugging port of the Chrome instance. Default 9222.

    Returns:
        List of dicts with the keys ``category, product_id, title, price,
        currency, orders, rating, url``. ``price`` and ``rating`` are
        ``float | None`` and ``orders`` is ``int | None``.

    Raises:
        RuntimeError: if the tab cannot be opened, if ``cdp_eval`` reports a
            failed extraction, or if the extraction result is not a valid JSON
            list.
    """
    # Nothing to collect: avoid opening (and having to close) a browser tab.
    if limit <= 0:
        return []

    base = "http://localhost:%d" % port
    quoted_query = urllib.parse.quote(query)
    target_url = (
        "https://www.aliexpress.com/w/wholesale-%s.html"
        "?SortType=total_tranpro_desc&shipCountry=%s"
        % (quoted_query, urllib.parse.quote(ship_to))
    )

    # 1. Open a tab through the DevTools HTTP API (this Chrome build requires
    #    PUT on /json/new).
    tab_id = ""
    try:
        new_url = "%s/json/new?%s" % (base, urllib.parse.quote(target_url, safe=""))
        resp = requests.put(new_url, timeout=10)
        if resp.status_code != 200:
            # Fall back to POST for compatibility with older builds.
            resp = requests.post(new_url, timeout=10)
        resp.raise_for_status()
        tab = resp.json()
        tab_id = tab.get("id", "")
        if not tab_id:
            raise RuntimeError("DevTools /json/new no devolvio id de pestana")
    except Exception as exc:  # noqa: BLE001 — network/HTTP/JSON boundary
        # Chain the cause so the original traceback is preserved for debugging.
        raise RuntimeError(
            "no se pudo abrir pestana en %s: %s" % (base, exc)
        ) from exc

    # cdp_eval locates the tab by a substring of its URL.
    substr = "aliexpress.com/w/wholesale-%s" % quoted_query

    try:
        # 2. Wait for the initial render.
        time.sleep(6.0)

        # 3. Scroll in a loop to force lazy-loading until at least `limit`
        #    cards exist or the card count stops growing twice in a row.
        count_js = (
            'document.querySelectorAll("a.search-card-item").length'
        )
        prev = -1
        stable = 0
        for _ in range(15):
            cdp_eval(
                "window.scrollBy(0, 2500)",
                port=port,
                target_url_substr=substr,
            )
            time.sleep(1.2)
            res = cdp_eval(count_js, port=port, target_url_substr=substr)
            n = res.get("value") if res.get("ok") else None
            # A failed or non-numeric count is treated as "no cards yet".
            n = int(n) if isinstance(n, (int, float)) else 0
            if n >= limit:
                break
            if n <= prev:
                stable += 1
                if stable >= 2:
                    break
            else:
                stable = 0
            prev = n

        # 4. Extract with a single cdp_eval (returns JSON.stringify of the rows).
        res = cdp_eval(_EXTRACT_JS, port=port, target_url_substr=substr)
        if not res.get("ok"):
            raise RuntimeError(
                "cdp_eval fallo en la extraccion: %s" % res.get("error", "")
            )
        raw = res.get("value")
        if not raw:
            return []
        try:
            rows = json.loads(raw)
        except (TypeError, ValueError) as exc:  # malformed or non-string payload
            raise RuntimeError("JSON de extraccion invalido: %s" % exc) from exc
        if not isinstance(rows, list):
            # The extractor contract is a JSON array; anything else is invalid.
            raise RuntimeError(
                "JSON de extraccion invalido: se esperaba una lista, no %s"
                % type(rows).__name__
            )

        # 5. Add category, de-duplicate, coerce types and truncate to limit.
        return _sanitize_rows(rows, query, limit)
    finally:
        # 6. Always close the tab; failures here are deliberately ignored so
        #    they never mask the real result or error.
        if tab_id:
            try:
                requests.get("%s/json/close/%s" % (base, tab_id), timeout=5)
            except Exception:  # noqa: BLE001 — best-effort close
                pass
+
+
if __name__ == "__main__":
    import sys

    # Command line: [query] [limit]; both optional.
    cli_args = sys.argv[1:]
    q = cli_args[0] if cli_args else "gadgets"
    lim = int(cli_args[1]) if len(cli_args) > 1 else 40

    products = cdp_scrape_aliexpress_trending(query=q, limit=lim, port=9222)

    # Print the total count and a preview of the first five rows.
    print("%d productos" % len(products))
    preview = products[:5]
    print(json.dumps(preview, ensure_ascii=False, indent=2))
@@ -0,0 +1,81 @@
+---
+name: scrape_aliexpress_trending
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_aliexpress_trending(query: str | None = None, category: str | None = None, limit: int = 40, ship_to: str = 'ES') -> list[dict]"
+description: "Capta productos populares de AliExpress como señal de e-commerce/dropshipping (orders, rating, precio). Hace una request HTTP a la página de listado ordenada por número de pedidos y extrae el JSON embebido en el HTML (window.runParams / _dida_config_ / _init_data_). Best-effort: ante anti-bot lanza RuntimeError, ante HTML sin JSON devuelve []. NUNCA inventa datos."
+tags: [aliexpress, ecommerce, dropshipping, trends, market-intel, datascience]
+params:
+  - name: query
+    desc: "Texto de búsqueda (ej. 'kitchen gadgets'). Si se da, manda en la URL sobre category."
+  - name: category
+    desc: "ID numérico de categoría AliExpress o slug. Ignorado si hay query. None usa un listado 'hot products' genérico."
+  - name: limit
+    desc: "Número máximo de productos a devolver. Default 40."
+  - name: ship_to
+    desc: "Código de país ISO-2 (ES, US, GB, DE, ...) que fija región y moneda via cookies de AliExpress. Default 'ES'."
+output: "Lista de dicts con claves exactas (casan 1:1 con la tabla Postgres aliexpress_trends, sin id/snapshot_date/scraped_at): category (str|None), product_id (str), title (str|None), price (float|None), currency (str|None), orders (int|None), rating (float|None), url (str). Lista vacía si el HTML no traía JSON parseable."
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [requests]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/datascience/scrape_aliexpress_trending.py"
+---
+
+## Ejemplo
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience.scrape_aliexpress_trending import scrape_aliexpress_trending
+
+# Top productos por número de pedidos para una búsqueda concreta, enviando a España.
+rows = scrape_aliexpress_trending(query="phone holder", limit=20, ship_to="ES")
+for r in rows[:3]:
+    print(r["title"], "->", r["orders"], "pedidos |", r["price"], r["currency"])
+
+# Cada dict (las 8 claves casan con la tabla aliexpress_trends):
+# {"category": "phone holder", "product_id": "100500...", "title": "...",
+#  "price": 3.21, "currency": "EUR", "orders": 12000, "rating": 4.8,
+#  "url": "https://www.aliexpress.com/item/100500....html"}
+```
+
+## Cuando usarla
+
+Cuando necesites una señal de qué productos están vendiendo bien en AliExpress para
+research de dropshipping o market-intel: detectar tendencias, sourcing de productos
+ganadores, o alimentar un histórico (tabla `aliexpress_trends`) que cruce orders /
+rating / precio por categoría. Úsala antes de decidir un nicho o para vigilar
+periódicamente una keyword. El output va directo a un `INSERT` Postgres (las 8 claves
+coinciden con las columnas no autogeneradas).
+
+## Gotchas
+
+- **Anti-bot fuerte (CRÍTICO):** AliExpress bloquea agresivamente headless/datacenter
+  con captcha (`/_____tmd_____/punish`), 403/429 y fingerprinting. Desde una IP de
+  datacenter o un patrón de scraping evidente, esta función **lanzará `RuntimeError`**
+  con frecuencia. Para extracción fiable y sostenida, la alternativa robusta es el
+  **browser MCP/CDP con sesión real** (Chrome del usuario, cookies legítimas), no
+  `requests`. Esta función es la vía barata; si falla repetidamente, sube de nivel.
+- **JSON embebido volátil:** el nombre/estructura del blob (`window.runParams`,
+  `_dida_config_`, `_init_data_`) cambia con frecuencia. Se prueban varios patrones y
+  un walk genérico, pero si AliExpress cambia el layout la función devuelve `[]`
+  (HTML válido sin JSON parseable) — **NO inventa datos**. Diferencia clave:
+  `RuntimeError` = bloqueado; `[]` = layout cambiado o shell vacío.
+- **Región/moneda dependen de `ship_to`:** se setean por cookies (`aep_usuc_f`,
+  `intl_locale`). Un `ship_to` no mapeado cae a `ES`/`EUR`. El `currency` devuelto
+  depende de lo que AliExpress decida servir, no se fuerza tras el fetch.
+- **`orders`/`price`/`rating` pueden venir `None`** si el item no expone ese campo en
+  el JSON (productos nuevos sin ventas, listados sin rating). No asumir no-null.
+- **Una sola página:** devuelve hasta `limit` items de la primera página de resultados;
+  no pagina. Para más volumen, llamar con queries/categorías distintas.
+- **Sin reintentos ni rotación de proxy/UA:** es una request única con headers fijos.
+  Para uso periódico, orquestar reintentos y backoff fuera de la función.
@@ -0,0 +1,393 @@
+"""Capta productos populares de AliExpress como señal de e-commerce/dropshipping.
+
+Extrae el JSON que AliExpress embebe en el HTML de su página de búsqueda/listado
+(``window.runParams`` / ``_dida_config_`` / ``_init_data_``) en lugar de parsear
+el DOM renderizado por JS. AliExpress es anti-bot fuerte (captcha, 403, fingerprint
+sobre headless/datacenter), por lo que esta función es best-effort: cuando el fetch
+real es bloqueado lanza ``RuntimeError`` con un mensaje claro. NUNCA inventa datos.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+
+_BASE = "https://www.aliexpress.com"
+_WHOLESALE = f"{_BASE}/wholesale"
+
+# Headers realistas de un navegador desktop. AliExpress fingerprint-ea agresivamente,
+# así que enviamos un perfil coherente (Chrome estable + Accept-Language acorde a region).
+_DESKTOP_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;q=0.9,"
+        "image/avif,image/webp,image/apng,*/*;q=0.8"
+    ),
+    "Accept-Encoding": "gzip, deflate, br",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Cache-Control": "max-age=0",
+}
+
+# AliExpress decide moneda/region por estas cookies. Mapa ship_to -> (region, locale, currency).
+_REGION_MAP: dict[str, tuple[str, str, str]] = {
+    "ES": ("ES", "es_ES", "EUR"),
+    "US": ("US", "en_US", "USD"),
+    "GB": ("GB", "en_GB", "GBP"),
+    "FR": ("FR", "fr_FR", "EUR"),
+    "DE": ("DE", "de_DE", "EUR"),
+    "IT": ("IT", "it_IT", "EUR"),
+    "PT": ("PT", "pt_PT", "EUR"),
+    "MX": ("MX", "es_MX", "USD"),
+    "BR": ("BR", "pt_BR", "BRL"),
+}
+
+# Señales de bloqueo anti-bot en la respuesta.
+_BLOCK_MARKERS = (
+    "punish",          # /_____tmd_____/punish — captcha slider de AliExpress
+    "nc_token",        # NoCaptcha de Alibaba
+    "captcha",
+    "Access Denied",
+    "baxia-dialog",    # widget de verificacion
+)
+
+
def _region_cookies(ship_to: str) -> dict[str, str]:
    """Build the AliExpress region/locale/currency cookies for ``ship_to``.

    The country code is looked up case-insensitively in ``_REGION_MAP``; an
    unmapped code falls back to the ``"ES"`` entry.
    """
    fallback = _REGION_MAP["ES"]
    country = ship_to.upper()
    region, locale, currency = _REGION_MAP.get(country, fallback)

    cookies: dict[str, str] = {}
    cookies["aep_usuc_f"] = (
        f"site=glo&c_tp={currency}&region={region}&b_locale={locale}"
    )
    cookies["intl_locale"] = locale
    cookies["xman_us_f"] = (
        f"x_l=0&no_popup_today=n&zero_order=n&x_locale={locale}"
    )
    return cookies
+
+
def _build_url(query: str | None, category: str | None) -> str:
    """Return the listing URL, always sorted by ``total_tranpro_desc``.

    Precedence: a non-empty ``query`` wins and is used as search text; otherwise
    an all-digit ``category`` maps to ``/category/<id>/x.html`` and any other
    non-empty ``category`` is used as search text; with neither, a generic
    "hot products" search is returned.
    """
    from urllib.parse import quote_plus

    # Numeric category ids have their own listing path (only when no query).
    if not query and category and category.isdigit():
        return f"{_BASE}/category/{category}/x.html?SortType=total_tranpro_desc"

    # Query text, or a category slug treated as text.
    term = query or category
    if term:
        return f"{_WHOLESALE}?SearchText={quote_plus(term)}&SortType=total_tranpro_desc"

    # Neither query nor category: generic best-selling listing.
    return f"{_WHOLESALE}?SearchText=hot+products&SortType=total_tranpro_desc"
+
+
def _looks_blocked(html: str, status_code: int) -> bool:
    """Tell whether a response looks like an anti-bot block.

    True for status 403, 429 or 503, or when any ``_BLOCK_MARKERS`` entry
    appears (case-insensitively) in the first 6000 characters of ``html``.
    """
    if status_code == 403 or status_code == 429 or status_code == 503:
        return True

    # Only the start of the document is inspected.
    snippet = html[:6000].lower()
    for marker in _BLOCK_MARKERS:
        if marker.lower() in snippet:
            return True
    return False
+
+
def _extract_embedded_json(html: str) -> dict[str, Any] | None:
    """Return the first JSON object embedded in ``html`` under a known name.

    Looks, in order, for ``window.runParams = {...}``,
    ``window._dida_config_ = {...}`` and ``_init_data_ = { data: {...} }``.
    For each name every occurrence is tried, and the object is decoded with
    ``json.JSONDecoder.raw_decode`` starting at its opening brace. Decoding
    with a real JSON parser (instead of a non-greedy regex up to the next
    ``};``) keeps objects intact when a string value contains ``}`` or ``;``,
    and does not depend on what follows the object (``;``, ``</script>``, ...).

    Args:
        html: Raw HTML of the listing page.

    Returns:
        The first decoded value that is a dict, or ``None`` when no anchor is
        followed by a parseable JSON object.
    """
    anchors = (
        r"window\.runParams\s*=\s*",
        r"window\._dida_config_\s*=\s*",
        r"_init_data_\s*=\s*\{\s*data:\s*",
    )
    decoder = json.JSONDecoder()
    for anchor in anchors:
        # The lookahead guarantees match.end() sits exactly on the "{".
        for match in re.finditer(anchor + r"(?=\{)", html):
            try:
                data, _end = decoder.raw_decode(html, match.end())
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(data, dict):
                return data
    return None
+
+
def _dig_items(data: dict[str, Any]) -> list[dict[str, Any]]:
    """Find the product list inside the embedded JSON, whatever its nesting.

    Performs a depth-first, pre-order walk (dict values and list elements in
    their stored order) and returns the product-like dicts of the first list
    that contains at least two of them. A dict is product-like when it has at
    least one id key and at least one title key. Returns ``[]`` when no such
    list exists.
    """
    id_keys = {"productId", "product_id", "productid", "id"}
    title_keys = {"title", "subject", "name"}

    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped (visited) in their original order.
    pending: list[Any] = [data]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            candidates = [
                entry
                for entry in node
                if isinstance(entry, dict)
                and not id_keys.isdisjoint(entry)
                and not title_keys.isdisjoint(entry)
            ]
            if len(candidates) >= 2:
                return candidates
            pending.extend(reversed(node))
        elif isinstance(node, dict):
            pending.extend(reversed(list(node.values())))
    return []
+
+
def _to_float(value: Any) -> float | None:
    """Parse a price/rating-like value into a float.

    Numbers are returned as ``float``. For anything else the first numeric
    token of ``str(value)`` is parsed, ignoring currency symbols and text.

    Separator rules:
      * both ``,`` and ``.`` present: whichever appears last is the decimal
        mark, the other one is a thousands separator
        (``"1.234,56"`` and ``"1,234.56"`` both give ``1234.56``);
      * the same separator repeated: thousands separator (``"1,234,567"``);
      * a single ``,``: decimal comma (``"3,21"`` gives ``3.21``);
      * trailing separators are dropped (``"12.99."``).

    Returns:
        The parsed float, or ``None`` for ``None``, text without digits or an
        unparseable token.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)

    match = re.search(r"\d[\d.,]*", str(value))
    if not match:
        return None
    # Sentence punctuation right after the number is not part of it.
    num = match.group(0).rstrip(".,")

    last_comma = num.rfind(",")
    last_dot = num.rfind(".")
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # European style: 1.234,56
            num = num.replace(".", "").replace(",", ".")
        else:
            # US/UK style: 1,234.56
            num = num.replace(",", "")
    elif num.count(",") > 1:
        num = num.replace(",", "")
    elif num.count(".") > 1:
        num = num.replace(".", "")
    elif last_comma != -1:
        num = num.replace(",", ".")

    try:
        return float(num)
    except ValueError:
        return None
+
+
def _to_orders(value: Any) -> int | None:
    """Parse an order/sales count such as ``"2.3k sold"`` into an int.

    Integers are returned unchanged and floats are truncated. Text is parsed
    from its first numeric token, with an optional ``k`` (x1,000) or ``m``
    (x1,000,000) suffix that must directly follow the number and must not be
    the start of a longer word (so the "k" of "week" is not a multiplier).

    Separator rules:
      * without a suffix, ``1,234`` / ``50.000`` / ``1.234.567`` (groups of
        exactly three digits) are thousands-grouped integers;
      * with both ``,`` and ``.``, the last one is the decimal mark;
      * a repeated separator is a thousands separator;
      * otherwise a single ``,`` or ``.`` is a decimal mark (``"2,3k"``).

    Returns:
        The count as ``int``, or ``None`` when nothing numeric can be parsed.
    """
    if value is None:
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        try:
            return int(value)
        except (OverflowError, ValueError):  # inf / nan
            return None

    text = str(value).lower()
    match = re.search(r"(\d[\d.,]*)\s*([km](?![a-z]))?", text)
    if not match:
        return None
    num = match.group(1).rstrip(".,")
    suffix = match.group(2)

    if suffix is None and re.fullmatch(r"\d{1,3}(?:[.,]\d{3})+", num):
        # Plain thousands grouping in either locale: 1,234 / 50.000
        num = num.replace(",", "").replace(".", "")
    elif "," in num and "." in num:
        if num.rfind(",") > num.rfind("."):
            num = num.replace(".", "").replace(",", ".")
        else:
            num = num.replace(",", "")
    elif num.count(",") > 1 or num.count(".") > 1:
        num = num.replace(",", "").replace(".", "")
    else:
        num = num.replace(",", ".")

    try:
        base = float(num)
    except ValueError:
        return None

    multiplier = {"k": 1_000, "m": 1_000_000}.get(suffix, 1)
    # round() rather than int() so 2.3 * 1000 can never truncate to 2299.
    return int(round(base * multiplier))
+
+
def _normalize_item(
    raw: dict[str, Any], category: str | None
) -> dict[str, Any] | None:
    """Map one raw product dict from the embedded JSON to the output schema.

    Each field is read from the first non-empty of several known key variants.

    Args:
        raw: Product-like dict found in the embedded JSON.
        category: Label copied verbatim into the ``category`` field.

    Returns:
        A dict with the keys ``category, product_id, title, price, currency,
        orders, rating, url``, or ``None`` when ``raw`` carries no usable
        product id (missing or blank).
    """
    pid = (
        raw.get("productId")
        or raw.get("product_id")
        or raw.get("productid")
        or raw.get("id")
    )
    if pid is None:
        return None
    product_id = str(pid).strip()
    if not product_id:
        # A blank id cannot identify a row (and would break de-duplication).
        return None

    title = raw.get("title") or raw.get("subject") or raw.get("name")
    if isinstance(title, dict):
        title = title.get("displayTitle") or title.get("seoTitle")
    # Whitespace-only titles collapse to None rather than "".
    title = (str(title).strip() or None) if title else None

    # Price lives in 'prices.salePrice.minPrice' or in flat variants.
    prices = raw.get("prices")
    sale_price = prices.get("salePrice") if isinstance(prices, dict) else None
    if not isinstance(sale_price, dict):
        sale_price = {}
    price = _to_float(
        sale_price.get("minPrice")
        or raw.get("salePrice")
        or raw.get("price")
        or raw.get("minPrice")
    )

    currency = (
        sale_price.get("currencyCode")
        or raw.get("currency")
        or raw.get("currencyCode")
    )
    currency = str(currency) if currency else None

    trade = raw.get("trade")
    orders = _to_orders(trade.get("tradeDesc") if isinstance(trade, dict) else None)
    if orders is None:
        orders = _to_orders(
            raw.get("orders") or raw.get("tradeCount") or raw.get("sales")
        )

    evaluation = raw.get("evaluation")
    rating = _to_float(
        (evaluation.get("starRating") if isinstance(evaluation, dict) else None)
        or raw.get("rating")
        or raw.get("averageStar")
        or raw.get("starRating")
    )

    url = raw.get("productDetailUrl") or raw.get("url") or raw.get("detail_url")
    if url:
        url = str(url)
        if url.startswith("//"):
            # Protocol-relative link.
            url = "https:" + url
        elif url.startswith("/"):
            # Site-relative link: make it absolute so the row is self-contained.
            url = _BASE + url
    else:
        url = f"{_BASE}/item/{product_id}.html"

    return {
        "category": category,
        "product_id": product_id,
        "title": title,
        "price": price,
        "currency": currency,
        "orders": orders,
        "rating": rating,
        "url": url,
    }
+
+
def scrape_aliexpress_trending(
    query: str | None = None,
    category: str | None = None,
    limit: int = 40,
    ship_to: str = "ES",
) -> list[dict]:
    """Fetch popular AliExpress products as an e-commerce demand signal.

    Makes ONE HTTP request to the AliExpress listing page sorted by
    ``total_tranpro_desc`` and extracts the JSON embedded in the HTML.
    Best-effort: a response that looks blocked (403/429/503 or an anti-bot
    marker) raises ``RuntimeError``; HTML without parseable embedded JSON
    yields ``[]``. No data is ever fabricated.

    Args:
        query: Search text (e.g. "kitchen gadgets"). Takes precedence in the URL.
        category: Numeric AliExpress category id or slug. Ignored if ``query``
            is given.
        limit: Maximum number of products to return. Default 40. A value <= 0
            returns ``[]`` without performing any request.
        ship_to: ISO-2 country code used to pick region/locale/currency
            cookies. Default "ES"; unmapped codes fall back to "ES".

    Returns:
        List of dicts with exactly the keys
        ``category, product_id, title, price, currency, orders, rating, url``.
        ``price``/``rating`` are ``float | None``, ``orders`` is ``int | None``.
        Empty list if the HTML carried no parseable JSON.

    Raises:
        RuntimeError: If the response looks blocked or the network call fails.
    """
    # Nothing requested: do not hit the network at all.
    if limit <= 0:
        return []

    # Imported lazily so importing this module does not require `requests`.
    import requests

    url = _build_url(query, category)
    cookies = _region_cookies(ship_to)
    headers = dict(_DESKTOP_HEADERS)
    _, locale, _ = _REGION_MAP.get(ship_to.upper(), _REGION_MAP["ES"])
    # Keep Accept-Language coherent with the locale sent in the cookies.
    headers["Accept-Language"] = f"{locale.replace('_', '-')},en;q=0.8"

    try:
        resp = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            timeout=20,
            allow_redirects=True,
        )
    except requests.RequestException as exc:
        raise RuntimeError(
            f"scrape_aliexpress_trending: fallo de red contra {url}: {exc}"
        ) from exc

    html = resp.text or ""

    if _looks_blocked(html, resp.status_code):
        raise RuntimeError(
            f"scrape_aliexpress_trending: AliExpress bloqueó la request "
            f"(status={resp.status_code}, captcha/anti-bot). "
            f"Usa el browser MCP/CDP con sesión real para esta fuente."
        )

    data = _extract_embedded_json(html)
    if data is None:
        # HTML without the expected JSON: layout changed or an empty shell was
        # served. Return an honest [] instead of making anything up.
        return []

    # The row label mirrors URL precedence: query first, then category.
    cat_label = query or category

    out: list[dict] = []
    seen: set[str] = set()
    for raw in _dig_items(data):
        norm = _normalize_item(raw, cat_label)
        if norm is None or norm["product_id"] in seen:
            continue
        seen.add(norm["product_id"])
        out.append(norm)
        if len(out) >= limit:
            break

    return out
+
+
if __name__ == "__main__":
    # Honest self-test: the import must succeed, then ONE real fetch is tried.
    # A blocked or failed fetch is reported, never raised, so the network can
    # not break a build.
    print("import OK: scrape_aliexpress_trending")
    expected_keys = frozenset(
        (
            "category",
            "product_id",
            "title",
            "price",
            "currency",
            "orders",
            "rating",
            "url",
        )
    )
    try:
        rows = scrape_aliexpress_trending(query="phone holder", limit=5, ship_to="ES")
    except RuntimeError as exc:
        print(f"fetch real: BLOQUEADO/ERROR honesto -> {exc}")
    else:
        if not rows:
            print(
                "fetch real: 0 filas (HTML sin JSON embebido parseable "
                "— layout cambió o shell vacío). NO se inventan datos."
            )
        else:
            first_row = rows[0]
            keys_ok = set(first_row.keys()) == expected_keys
            print(
                f"fetch real: {len(rows)} filas obtenidas | "
                f"claves correctas={keys_ok}"
            )
            print(f"  muestra: {first_row}")
@@ -0,0 +1,72 @@
+---
+name: scrape_amazon_bestsellers
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_amazon_bestsellers(marketplace: str = 'amazon.es', categories: list[str] | None = None, list_type: str = 'bestsellers', max_items: int = 50) -> list[dict]"
+description: "Scrapea los rankings de Amazon (Best Sellers y Movers & Shakers) de un marketplace para captar señales de demanda de productos: rank, ASIN, titulo, precio, rating, reseñas y, en movers, el cambio porcentual."
+tags: [amazon, scraping, trends, market-intel, datascience]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [requests, bs4]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/datascience/scrape_amazon_bestsellers.py"
+params:
+  - name: marketplace
+    desc: "Dominio Amazon objetivo (amazon.es, amazon.com, amazon.co.uk, amazon.de, ...). Determina la URL, el Accept-Language enviado y la moneda fallback."
+  - name: categories
+    desc: "Lista de slugs de categoria a scrapear (ej. 'electronics', 'videogames'). Si es None, scrapea la portada general del ranking elegido. Cada slug genera una pagina/peticion."
+  - name: list_type
+    desc: "Tipo de ranking: 'bestsellers' (URL /gp/bestsellers/<cat>) o 'movers_shakers' (URL /gp/movers-and-shakers/<cat>). Cualquier otro valor lanza ValueError."
+  - name: max_items
+    desc: "Numero maximo de productos recolectados por categoria. Default 50 (una pagina de ranking suele tener ~50 items)."
+output: "Lista de dicts, uno por producto, con exactamente estas claves: marketplace, list_type, category, rank, asin, title, price, currency, rating, reviews, pct_change, url. None donde no haya dato. price/rating/pct_change son float; rank/reviews son int. pct_change solo se rellena en movers_shakers. Casa 1:1 con la tabla Postgres amazon_bestsellers (el ingest añade id/snapshot_date/scraped_at)."
+---
+
+## Ejemplo
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience.scrape_amazon_bestsellers import scrape_amazon_bestsellers
+
+# Best Sellers de electronica y videojuegos en Amazon.es
+rows = scrape_amazon_bestsellers(
+    marketplace="amazon.es",
+    categories=["electronics", "videogames"],
+    list_type="bestsellers",
+    max_items=50,
+)
+print(len(rows), "items")
+print(rows[0])
+# {'marketplace': 'amazon.es', 'list_type': 'bestsellers', 'category': 'electronics',
+#  'rank': 1, 'asin': 'B0...', 'title': '...', 'price': 29.99, 'currency': 'EUR',
+#  'rating': 4.5, 'reviews': 1234, 'pct_change': None, 'url': 'https://www.amazon.es/dp/B0...'}
+
+# Movers & Shakers (productos que mas suben) — incluye pct_change
+movers = scrape_amazon_bestsellers(
+    marketplace="amazon.com",
+    list_type="movers_shakers",
+    max_items=30,
+)
+```
+
+## Cuando usarla
+
+Usala cuando necesites captar señales de demanda de mercado desde Amazon: que se esta vendiendo mas (Best Sellers) o que esta subiendo de golpe en ventas (Movers & Shakers), por marketplace y categoria. Util como fuente de un pipeline de market intelligence / trend detection que luego ingesta a la tabla `amazon_bestsellers` y cruza snapshots diarios para detectar productos al alza. Llamala antes de cualquier analisis de tendencias de catalogo; el dict devuelto esta listo para insertar tras añadir `snapshot_date`/`scraped_at`.
+
+## Gotchas
+
+- **Anti-bot fuerte**: Amazon detecta scraping HTTP puro y puede devolver captcha, `503` o `429`. La funcion detecta el bloqueo (status 429/503 o markers de captcha en el HTML) y, tras agotar reintentos, lanza `RuntimeError` con el status. **Si HTTP puro falla repetidamente, la alternativa es el navegador del ecosistema (browser MCP / CDP)** sobre una pestaña real de Chrome, que pasa el anti-bot mejor que `requests`.
+- **HTML fragil**: Amazon cambia las plantillas del DOM con frecuencia y sirve varias a la vez segun A/B test. Los selectores estan escritos defensivamente (varios fallbacks por campo) pero **pueden necesitar mantenimiento** cuando Amazon rota plantillas. Si un campo no aparece en ninguna plantilla conocida, se devuelve `None` en vez de petar.
+- **Campos opcionales = None**: no todos los items traen precio/rating/reviews/pct_change. `pct_change` solo se rellena en `list_type="movers_shakers"`; en bestsellers siempre es `None`.
+- **rank fallback posicional**: si Amazon no renderiza el badge de rank, se usa la posición (1-indexada) del item en la pagina como rank.
+- **Una peticion por categoria**: cada slug en `categories` dispara una peticion HTTP independiente (con 2 reintentos + backoff). Listas largas de categorias multiplican el riesgo de throttling — espacia las llamadas si scrapeas muchas.
+- **Moneda best-effort**: `currency` se infiere del simbolo en el precio (€, $, £, R$) y, si no hay simbolo reconocible, del TLD del marketplace. Puede ser `None` si no se pudo determinar.
@@ -0,0 +1,425 @@
+"""Scrape Amazon Best Sellers and Movers & Shakers ranking pages for product demand signals."""
+
+from __future__ import annotations
+
+import re
+import time
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+# Accept-Language header value per marketplace domain. Domains missing from
+# this table get a generic English value at lookup time.
+_ACCEPT_LANGUAGE = {
+    "amazon.es": "es-ES,es;q=0.9,en;q=0.6",
+    "amazon.com": "en-US,en;q=0.9",
+    "amazon.co.uk": "en-GB,en;q=0.9",
+    "amazon.de": "de-DE,de;q=0.9,en;q=0.6",
+    "amazon.fr": "fr-FR,fr;q=0.9,en;q=0.6",
+    "amazon.it": "it-IT,it;q=0.9,en;q=0.6",
+    "amazon.com.mx": "es-MX,es;q=0.9,en;q=0.6",
+    "amazon.com.br": "pt-BR,pt;q=0.9,en;q=0.6",
+}
+
+# Currency code guessed from the marketplace domain. Used only as a fallback
+# when the price string carries no recognisable currency symbol.
+_CURRENCY_BY_MARKET = {
+    "amazon.es": "EUR",
+    "amazon.com": "USD",
+    "amazon.co.uk": "GBP",
+    "amazon.de": "EUR",
+    "amazon.fr": "EUR",
+    "amazon.it": "EUR",
+    "amazon.com.mx": "MXN",
+    "amazon.com.br": "BRL",
+}
+
+# Map common currency symbols to currency codes.
+# NOTE: the keys overlap ("$" is a substring of both "R$" and "US$"), so any
+# substring-based lookup has to test the longer symbols before the bare "$".
+_SYMBOL_TO_CURRENCY = {
+    "€": "EUR",
+    "$": "USD",
+    "£": "GBP",
+    "R$": "BRL",
+    "US$": "USD",
+}
+
+# Desktop-Chrome-on-Linux User-Agent string sent with every request.
+_USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+
+# Signals that Amazon served an anti-bot / captcha / throttling page instead of
+# the ranking content. Markers are matched against the lower-cased HTML, so
+# every entry must itself be lower-case.
+_BLOCK_MARKERS = (
+    "api-services-support@amazon",
+    "captcha",
+    "to discuss automated access",
+    "enter the characters you see below",
+    "robot check",
+)
+
+
+def _build_headers(marketplace: str) -> dict:
+    """Realistic browser-ish headers for the given marketplace."""
+    return {
+        "User-Agent": _USER_AGENT,
+        "Accept": (
+            "text/html,application/xhtml+xml,application/xml;q=0.9,"
+            "image/avif,image/webp,*/*;q=0.8"
+        ),
+        "Accept-Language": _ACCEPT_LANGUAGE.get(marketplace, "en-US,en;q=0.9"),
+        "Accept-Encoding": "gzip, deflate, br",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+    }
+
+
+def _build_url(marketplace: str, list_type: str, category: str | None) -> str:
+    """Compose the ranking URL for a marketplace / list type / category slug."""
+    base = "movers-and-shakers" if list_type == "movers_shakers" else "bestsellers"
+    url = f"https://www.{marketplace}/gp/{base}"
+    if category:
+        url = f"{url}/{category.strip('/')}"
+    return url
+
+
+def _looks_blocked(status_code: int, html: str) -> bool:
+    """Heuristic: did Amazon serve an anti-bot / throttling page?"""
+    if status_code in (429, 503):
+        return True
+    lowered = html.lower()
+    return any(marker in lowered for marker in _BLOCK_MARKERS)
+
+
+def _fetch(url: str, headers: dict, timeout: int, retries: int) -> requests.Response:
+    """GET with small retry + backoff. Raises on persistent failure / block."""
+    last_exc: Exception | None = None
+    for attempt in range(retries + 1):
+        try:
+            resp = requests.get(url, headers=headers, timeout=timeout)
+        except requests.RequestException as exc:  # network / timeout
+            last_exc = exc
+            if attempt < retries:
+                time.sleep(1.5 * (attempt + 1))
+                continue
+            raise RuntimeError(f"request to {url} failed: {exc}") from exc
+
+        if _looks_blocked(resp.status_code, resp.text):
+            if attempt < retries:
+                time.sleep(2.0 * (attempt + 1))
+                continue
+            raise RuntimeError(
+                f"Amazon anti-bot block on {url} (HTTP {resp.status_code}). "
+                "HTTP scraping is being throttled/captcha'd; fall back to the "
+                "browser MCP/CDP path of the ecosystem."
+            )
+
+        if resp.status_code != 200:
+            last_exc = RuntimeError(
+                f"unexpected HTTP {resp.status_code} for {url}"
+            )
+            if attempt < retries:
+                time.sleep(1.5 * (attempt + 1))
+                continue
+            raise last_exc
+
+        return resp
+
+    # Should not reach here, but be defensive.
+    raise RuntimeError(f"could not fetch {url}: {last_exc}")
+
+
+# Ten upper-case alphanumerics right after "/dp/" or "/gp/product/", followed
+# by "/", "?" or the end of the string. Group 1 is the ASIN.
+_ASIN_RE = re.compile(r"/(?:dp|gp/product)/([A-Z0-9]{10})(?:[/?]|$)")
+# First run of digits, optionally preceded by "#". Group 1 is the number.
+_RANK_RE = re.compile(r"#?\s*(\d+)")
+# First numeric token: optional sign, a digit, then digits / "." / ",".
+_PRICE_NUM_RE = re.compile(r"[-+]?\d[\d.,]*")
+# First run of digits and "." / "," separators.
+_REVIEWS_RE = re.compile(r"[\d.,]+")
+# Number that precedes a localized "out of" phrase in a star-rating label
+# (e.g. "4.5 out of 5", "4,5 de 5"). Group 1 is the number.
+_RATING_RE = re.compile(r"([\d.,]+)\s*(?:out of|de|von|su|sur|de um total de)")
+# Number immediately before "%". Group 1 excludes any sign.
+_PCT_RE = re.compile(r"([\d.,]+)\s*%")
+
+
+def _text(node) -> str:
+    return node.get_text(" ", strip=True) if node is not None else ""
+
+
+def _parse_asin(card) -> str | None:
+    """ASIN from a data-asin attribute or any /dp/<ASIN>/ link inside the card."""
+    asin = card.get("data-asin")
+    if asin and re.fullmatch(r"[A-Z0-9]{10}", asin):
+        return asin
+    for a in card.find_all("a", href=True):
+        m = _ASIN_RE.search(a["href"])
+        if m:
+            return m.group(1)
+    return None
+
+
+def _parse_url(card, marketplace: str) -> str | None:
+    """Absolute product URL from the first /dp/ link in the card."""
+    base = f"https://www.{marketplace}"
+    for a in card.find_all("a", href=True):
+        if _ASIN_RE.search(a["href"]):
+            return urljoin(base, a["href"].split("?")[0])
+    # Fall back to the first link at all.
+    first = card.find("a", href=True)
+    if first is not None:
+        return urljoin(base, first["href"].split("?")[0])
+    return None
+
+
+def _parse_rank(card) -> int | None:
+    """Rank badge. Amazon renders it as '#1', '1', etc."""
+    badge = card.select_one(".zg-bdg-text, .zg-badge-text, [class*='badge']")
+    txt = _text(badge)
+    if not txt:
+        # Sometimes the rank is in a class like a11y .zg-bdg-text sibling.
+        for sel in (".a-badge-text", "[class*='rank']"):
+            node = card.select_one(sel)
+            txt = _text(node)
+            if txt:
+                break
+    m = _RANK_RE.search(txt)
+    return int(m.group(1)) if m else None
+
+
+def _parse_title(card) -> str | None:
+    """Product title — several templates over the years."""
+    for sel in (
+        "._cDEzb_p13n-sc-css-line-clamp-3_g3dy1",
+        "._cDEzb_p13n-sc-css-line-clamp-2_EWgCb",
+        "[class*='line-clamp']",
+        ".p13n-sc-truncate",
+        ".p13n-sc-truncated",
+        "a.a-link-normal[title]",
+        "img[alt]",
+    ):
+        node = card.select_one(sel)
+        if node is None:
+            continue
+        if node.name == "img":
+            alt = node.get("alt")
+            if alt:
+                return alt.strip()
+            continue
+        if node.has_attr("title") and node["title"].strip():
+            return node["title"].strip()
+        txt = _text(node)
+        if txt:
+            return txt
+    return None
+
+
+def _parse_price(card, marketplace: str) -> tuple[float | None, str | None]:
+    """Price value (float) and ISO currency, best-effort across templates."""
+    for sel in (
+        "._cDEzb_p13n-sc-price_3mJ9Z",
+        ".p13n-sc-price",
+        "span.a-price > span.a-offscreen",
+        ".a-price .a-offscreen",
+        "[class*='price']",
+    ):
+        node = card.select_one(sel)
+        txt = _text(node)
+        if not txt:
+            continue
+
+        currency = None
+        for sym, iso in _SYMBOL_TO_CURRENCY.items():
+            if sym in txt:
+                currency = iso
+                break
+        if currency is None:
+            currency = _CURRENCY_BY_MARKET.get(marketplace)
+
+        m = _PRICE_NUM_RE.search(txt)
+        if not m:
+            continue
+        raw = m.group(0)
+        value = _to_float(raw)
+        if value is not None:
+            return value, currency
+    return None, None
+
+
+def _parse_rating(card) -> float | None:
+    """Star rating, e.g. '4,5 de 5 estrellas' / '4.5 out of 5 stars'."""
+    for sel in ("[class*='review-stars']", ".a-icon-alt", "[title*='star']", "[aria-label*='star']"):
+        node = card.select_one(sel)
+        txt = _text(node) or (node.get("title", "") if node is not None else "") or (
+            node.get("aria-label", "") if node is not None else ""
+        )
+        if not txt:
+            continue
+        m = _RATING_RE.search(txt)
+        if m:
+            return _to_float(m.group(1))
+        # Some templates only render the number ('4,5').
+        m2 = _PRICE_NUM_RE.search(txt)
+        if m2 and ("star" in txt.lower() or "estrella" in txt.lower()):
+            return _to_float(m2.group(0))
+    return None
+
+
+def _parse_reviews(card) -> int | None:
+    """Number of ratings/reviews shown next to the stars."""
+    for sel in (
+        "a.a-size-small.a-link-normal",
+        ".a-size-small.a-link-normal",
+        "[class*='review-count']",
+        "span.a-size-small",
+    ):
+        for node in card.select(sel):
+            txt = _text(node)
+            if not txt:
+                continue
+            m = _REVIEWS_RE.search(txt)
+            if not m:
+                continue
+            digits = m.group(0).replace(".", "").replace(",", "")
+            if digits.isdigit() and len(digits) >= 1:
+                # Avoid catching rank/price by requiring a plausible count token.
+                return int(digits)
+    return None
+
+
+def _parse_pct_change(card) -> float | None:
+    """Movers & Shakers percentage change ('+150%')."""
+    for sel in (".zg-percent-change", "[class*='percent']", "[class*='sales-movement']"):
+        node = card.select_one(sel)
+        txt = _text(node)
+        if not txt:
+            continue
+        m = _PCT_RE.search(txt)
+        if m:
+            value = _to_float(m.group(1))
+            if value is None:
+                continue
+            return -value if txt.strip().startswith("-") else value
+    return None
+
+
+def _to_float(raw: str) -> float | None:
+    """Parse a numeric string with EU or US decimal/grouping conventions."""
+    if raw is None:
+        return None
+    s = raw.strip().replace("\xa0", "").replace(" ", "")
+    if not s:
+        return None
+    if "," in s and "." in s:
+        # The rightmost separator is the decimal one.
+        if s.rfind(",") > s.rfind("."):
+            s = s.replace(".", "").replace(",", ".")
+        else:
+            s = s.replace(",", "")
+    elif "," in s:
+        # Treat a single comma as decimal separator (EU markets).
+        s = s.replace(",", ".")
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
+def _select_cards(soup: BeautifulSoup) -> list:
+    """Locate the list-item cards across known Amazon templates."""
+    selectors = (
+        "div.p13n-sc-uncoverable-faceout",
+        "div[id^='gridItemRoot']",
+        "div.zg-grid-general-faceout",
+        "li.zg-item-immersion",
+        "div.a-cardui[data-asin]",
+        "div[data-asin]",
+    )
+    for sel in selectors:
+        cards = soup.select(sel)
+        if cards:
+            return cards
+    return []
+
+
+def scrape_amazon_bestsellers(
+    marketplace: str = "amazon.es",
+    categories: list[str] | None = None,
+    list_type: str = "bestsellers",
+    max_items: int = 50,
+) -> list[dict]:
+    """Scrape Amazon Best Sellers / Movers & Shakers ranking pages.
+
+    Captures demand signals (rank, title, price, rating, reviews and — for
+    Movers & Shakers — percentage change) from one or more category ranking
+    pages of a given Amazon marketplace.
+
+    Args:
+        marketplace: Amazon domain, e.g. ``"amazon.es"``, ``"amazon.com"``.
+        categories: Category slugs (e.g. ``"electronics"``, ``"videogames"``).
+            If ``None`` the general front page of the chosen list is scraped.
+        list_type: ``"bestsellers"`` (URL ``/gp/bestsellers/<cat>``) or
+            ``"movers_shakers"`` (URL ``/gp/movers-and-shakers/<cat>``).
+        max_items: Maximum number of items collected per category.
+
+    Returns:
+        A list of dicts, one per product, with exactly these keys:
+        ``marketplace, list_type, category, rank, asin, title, price,
+        currency, rating, reviews, pct_change, url``. Missing values are
+        ``None``. ``price``/``rating``/``pct_change`` are floats,
+        ``rank``/``reviews`` are ints.
+
+    Raises:
+        ValueError: If ``list_type`` is not one of the allowed values.
+        RuntimeError: On network failure or when Amazon serves an anti-bot /
+            captcha / throttling page.
+    """
+    if list_type not in ("bestsellers", "movers_shakers"):
+        raise ValueError(
+            f"list_type must be 'bestsellers' or 'movers_shakers', got {list_type!r}"
+        )
+
+    cats: list[str | None] = list(categories) if categories else [None]
+    headers = _build_headers(marketplace)
+    results: list[dict] = []
+
+    for category in cats:
+        url = _build_url(marketplace, list_type, category)
+        resp = _fetch(url, headers, timeout=20, retries=2)
+        soup = BeautifulSoup(resp.text, "lxml")
+        cards = _select_cards(soup)
+
+        count = 0
+        for idx, card in enumerate(cards):
+            if count >= max_items:
+                break
+            asin = _parse_asin(card)
+            title = _parse_title(card)
+            # Skip empty / non-product wrappers.
+            if asin is None and title is None:
+                continue
+
+            rank = _parse_rank(card)
+            if rank is None:
+                rank = idx + 1  # positional fallback when no badge is rendered
+
+            price, currency = _parse_price(card, marketplace)
+            results.append(
+                {
+                    "marketplace": marketplace,
+                    "list_type": list_type,
+                    "category": category,
+                    "rank": rank,
+                    "asin": asin,
+                    "title": title,
+                    "price": price,
+                    "currency": currency,
+                    "rating": _parse_rating(card),
+                    "reviews": _parse_reviews(card),
+                    "pct_change": _parse_pct_change(card)
+                    if list_type == "movers_shakers"
+                    else None,
+                    "url": _parse_url(card, marketplace),
+                }
+            )
+            count += 1
+
+    return results
@@ -0,0 +1,73 @@
+---
+name: scrape_competitor_prices
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_competitor_prices(targets: list[dict]) -> list[dict]"
+description: "Vigila precios de la competencia: dada una lista de objetivos (URL de producto + competidor), hace GET con headers realistas (timeout + 1 reintento) y extrae el precio actual de cada pagina con una cascada de estrategias (CSS selector, JSON-LD offers, meta tags, heuristica de clases). Normaliza a float (tolera coma/punto, simbolos, miles) y detecta in_stock. Devuelve una fila por target con claves 1:1 de la tabla Postgres competitor_prices; si falla un target devuelve price=None sin abortar los demas."
+tags: [competitor, pricing, scraping, market-intel, datascience, recon]
+params:
+  - name: targets
+    desc: "Lista de dicts, uno por producto a vigilar. Cada dict: competitor (str, nombre/id del competidor), product_key (str, clave interna estable), product_name (str, nombre legible), url (str, URL de la pagina del producto), price_selector (str, opcional, selector CSS que apunta al nodo del precio — lo mas robusto), currency (str, opcional, codigo de moneda a estampar, default 'EUR')."
+output: "Lista de dicts, una fila por target, con EXACTAMENTE estas claves (casan 1:1 con la tabla Postgres competitor_prices, sin id/snapshot_date/scraped_at): competitor (str), product_key (str), product_name (str), url (str), price (float | None), currency (str), in_stock (bool | None). price=None si no se pudo extraer; in_stock=None si la pagina fallo."
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [urllib, json, re, beautifulsoup4, lxml]
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/datascience/scrape_competitor_prices.py"
+---
+
+## Ejemplo
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience.scrape_competitor_prices import scrape_competitor_prices
+
+targets = [
+    {
+        "competitor": "books-to-scrape",
+        "product_key": "light-in-the-attic",
+        "product_name": "A Light in the Attic",
+        "url": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
+        "price_selector": "p.price_color",   # el selector por target es lo mas fiable
+        "currency": "GBP",
+    },
+    {
+        "competitor": "competidor_b",
+        "product_key": "SKU-4242",
+        "product_name": "Filtro de aceite XYZ",
+        "url": "https://www.ejemplo-tienda.com/producto/4242",
+        # sin price_selector -> autodeteccion JSON-LD / meta / heuristica de clases
+        "currency": "EUR",
+    },
+]
+
+rows = scrape_competitor_prices(targets)
+# rows[0] -> {"competitor": "books-to-scrape", "product_key": "light-in-the-attic",
+#             "product_name": "A Light in the Attic", "url": "...",
+#             "price": 51.77, "currency": "GBP", "in_stock": True}
+# Listo para INSERT en la tabla competitor_prices (anade tu snapshot_date/scraped_at).
+```
+
+## Cuando usarla
+
+Cuando necesites un snapshot puntual del precio de uno o varios productos de la competencia para alimentar una tabla de market intelligence (`competitor_prices`). Util en un cron/pipeline que lee una lista de objetivos, scrapea, y persiste una fila por producto. Pasa `price_selector` por target siempre que conozcas el sitio: es la via mas robusta. Si no lo pasas, la funcion intenta autodetectar (JSON-LD `offers.price`, meta tags de precio, clases comunes de e-commerce). Las filas salen con las claves exactas de la tabla destino, asi que el caller solo anade `snapshot_date`/`scraped_at` antes del INSERT.
+
+## Gotchas
+
+- **Funcion impura**: hace I/O de red (HTTP GET). Depende del HTML real de cada sitio en el momento de la llamada.
+- **El scraping de precios es muy especifico por sitio.** Sin `price_selector`, la autodeteccion acierta en muchos e-commerce estandar (los que exponen JSON-LD `Product/Offer`, meta `og:price:amount`/`itemprop=price`, o clases tipicas `.price`), pero **falla en SPAs / paginas JS-rendered** (React/Vue/Angular que pintan el precio tras cargar) y en sitios con **anti-bot** (Cloudflare, captchas, fingerprinting). Para esos casos el GET devuelve un HTML sin el precio o un challenge, y la fila sale con `price=None`.
+- **Para sitios JS-rendered o con anti-bot usa el navegador del ecosistema** (browser MCP / CDP: `page_perceive`, `cdp_get_text`, `cdp_perceive_outline`) para renderizar la pagina y extraer el precio del DOM ya pintado, en lugar de esta funcion de HTTP puro. Esta funcion es para HTML servidor-renderizado.
+- **`price_selector` por target es lo mas fiable**: evita depender de la heuristica y sobrevive mejor a cambios de plantilla. Define uno por competidor en tu lista de objetivos.
+- **Normalizacion de precio**: tolera `1.299,99 €` (europeo: punto miles, coma decimal), `$1,299.99` (US), `29,90`, `1299.99`. Heuristica: el separador mas a la derecha es el decimal cuando hay ambos; con solo coma, se trata como decimal si quedan 2 digitos detras, si no como miles. Casos exoticos (3 decimales, formatos regionales raros) pueden malinterpretarse — verifica con `price_selector` apuntando al nodo limpio.
+- **`in_stock` es heuristico**: `True` salvo que el texto de la pagina contenga marcadores de agotado (`agotado`, `sin stock`, `out of stock`, `sold out`, etc.). Falsos positivos/negativos posibles si el sitio usa otra redaccion o muestra esos terminos en contexto no relacionado. `None` si la pagina fallo al cargar.
+- **Tolerancia a fallos por target**: si un target peta (red, timeout, HTML invalido), su fila sale con `price=None`/`in_stock=None` y **el resto del batch continua**. Nunca aborta toda la lista por un fallo individual.
+- **Reintento unico**: cada GET reintenta una vez ante error de transporte. No hay backoff exponencial ni rotacion de proxies/User-Agent; para scraping a escala o contra anti-bot fuerte, eso queda fuera del alcance de esta funcion.
@@ -0,0 +1,389 @@
+"""Scrape current prices for a list of competitor product pages.
+
+Watches competitor pricing: given a list of targets (product URL + competitor),
+fetches each page and extracts the current price using a cascade of strategies
+(CSS selector, JSON-LD offers, meta tags, common-class heuristics). Output rows
+map 1:1 to the Postgres `competitor_prices` table (minus the autogenerated
+id/snapshot_date/scraped_at columns).
+"""
+
+import json
+import re
+import urllib.request
+import urllib.error
+
+from bs4 import BeautifulSoup
+
+# Desktop-Chrome-on-Linux User-Agent string sent with every request.
+_USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+
+# Headers attached to every GET. The response body is decoded straight from
+# the raw bytes, so an uncompressed ("identity") encoding is requested.
+_REQUEST_HEADERS = {
+    "User-Agent": _USER_AGENT,
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;q=0.9,"
+        "image/avif,image/webp,*/*;q=0.8"
+    ),
+    "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
+    "Accept-Encoding": "identity",
+    "Connection": "close",
+}
+
+# Substrings that, when present, signal the product is NOT available. They
+# are matched against lower-cased page text, so entries must be lower-case.
+_OUT_OF_STOCK_MARKERS = (
+    "agotado",
+    "sin stock",
+    "sin existencias",
+    "no disponible",
+    "out of stock",
+    "sold out",
+    "unavailable",
+    "currently unavailable",
+)
+
+# Common class/attribute patterns used by mainstream e-commerce templates,
+# ordered from most to least specific; the catch-all substring match is last.
+_PRICE_HEURISTIC_SELECTORS = (
+    "[itemprop=price]",
+    "[data-price]",
+    "[data-product-price]",
+    ".price",
+    ".product-price",
+    ".price--current",
+    ".current-price",
+    ".sale-price",
+    ".a-price .a-offscreen",
+    "[class*=price]",
+)
+
+# A token that looks like a price: optional currency symbol, digits with
+# thousands/decimal separators. Captured group is the numeric part.
+# First alternative requires >=1 explicit thousands group (e.g. 1.299,99);
+# second alternative covers plain contiguous digits with optional decimals
+# (e.g. 1299.99, 29,90). The thousands branch must come first: otherwise the
+# plain-digit branch would stop at "1.29" when given "1.299,99".
+_PRICE_NUMBER_RE = re.compile(
+    r"(?:[€$£]|EUR|USD|GBP)?\s*"
+    r"(\d{1,3}(?:[.,\s]\d{3})+(?:[.,]\d{1,2})?|\d+(?:[.,]\d{1,2})?)"
+    r"\s*(?:[€$£]|EUR|USD|GBP)?",
+    re.IGNORECASE,
+)
+
+
+def _fetch_html(url: str, timeout: float = 15.0) -> str:
+    """GET a URL with realistic headers, one retry on failure.
+
+    Raises the last urllib error if both attempts fail.
+    """
+    last_err: Exception | None = None
+    for attempt in range(2):
+        try:
+            req = urllib.request.Request(url, headers=_REQUEST_HEADERS)
+            with urllib.request.urlopen(req, timeout=timeout) as resp:
+                raw = resp.read()
+            charset = resp.headers.get_content_charset() or "utf-8"
+            try:
+                return raw.decode(charset, errors="replace")
+            except (LookupError, UnicodeDecodeError):
+                return raw.decode("utf-8", errors="replace")
+        except Exception as err:  # noqa: BLE001 - retry on any transport error
+            last_err = err
+            continue
+    raise last_err if last_err is not None else RuntimeError("fetch failed")
+
+
+def _normalize_price(raw) -> float | None:
+    """Normalize a price token to float, tolerating comma/dot and symbols.
+
+    Handles "1.299,99 €", "$1,299.99", "1299.99", "29,90" etc.
+    Returns None if no numeric value can be parsed.
+    """
+    if raw is None:
+        return None
+    if isinstance(raw, (int, float)):
+        try:
+            return float(raw)
+        except (ValueError, TypeError):
+            return None
+
+    text = str(raw).strip()
+    if not text:
+        return None
+
+    match = _PRICE_NUMBER_RE.search(text)
+    if not match:
+        return None
+
+    num = match.group(1).strip().replace(" ", "")
+
+    last_comma = num.rfind(",")
+    last_dot = num.rfind(".")
+
+    if last_comma != -1 and last_dot != -1:
+        # The right-most separator is the decimal separator.
+        if last_comma > last_dot:
+            # European: 1.299,99 -> dots are thousands, comma is decimal.
+            num = num.replace(".", "").replace(",", ".")
+        else:
+            # US: 1,299.99 -> commas are thousands, dot is decimal.
+            num = num.replace(",", "")
+    elif last_comma != -1:
+        # Only commas present. Decimal if it looks like "29,90"; else thousands.
+        if len(num) - last_comma - 1 == 2:
+            num = num.replace(",", ".")
+        else:
+            num = num.replace(",", "")
+    # Only dots (or none): assume dot is already decimal / no separators.
+
+    try:
+        return float(num)
+    except ValueError:
+        return None
+
+
+def _extract_from_selector(soup: BeautifulSoup, selector: str) -> float | None:
+    """Try a single CSS selector and normalize the matched node."""
+    try:
+        node = soup.select_one(selector)
+    except Exception:  # noqa: BLE001 - invalid selector should not abort
+        return None
+    if node is None:
+        return None
+    # Prefer common price-bearing attributes, fall back to text.
+    for attr in ("content", "data-price", "data-product-price", "value"):
+        if node.has_attr(attr):
+            price = _normalize_price(node.get(attr))
+            if price is not None:
+                return price
+    return _normalize_price(node.get_text(" ", strip=True))
+
+
+def _iter_json_ld_prices(soup: BeautifulSoup):
+    """Yield candidate prices found inside ld+json offers blocks."""
+    for tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
+        payload = tag.string or tag.get_text()
+        if not payload:
+            continue
+        try:
+            data = json.loads(payload)
+        except (ValueError, TypeError):
+            continue
+        for node in _walk_json(data):
+            if not isinstance(node, dict):
+                continue
+            offers = node.get("offers")
+            for offer in _as_list(offers):
+                if isinstance(offer, dict) and "price" in offer:
+                    yield offer.get("price")
+            # Some schemas place price directly on the node.
+            if "price" in node and not isinstance(node.get("offers"), (dict, list)):
+                yield node.get("price")
+
+
+def _walk_json(node):
+    """Depth-first walk over arbitrarily nested JSON structures."""
+    if isinstance(node, dict):
+        yield node
+        for value in node.values():
+            yield from _walk_json(value)
+    elif isinstance(node, list):
+        for item in node:
+            yield from _walk_json(item)
+
+
+def _as_list(value):
+    """Wrap a value in a list unless it already is one."""
+    if value is None:
+        return []
+    return value if isinstance(value, list) else [value]
+
+
+def _extract_from_meta(soup: BeautifulSoup) -> float | None:
+    """Try common price meta tags in priority order."""
+    candidates = (
+        {"itemprop": "price"},
+        {"property": "og:price:amount"},
+        {"property": "product:price:amount"},
+        {"name": "twitter:data1"},
+    )
+    for attrs in candidates:
+        tag = soup.find("meta", attrs=attrs)
+        if tag is not None:
+            price = _normalize_price(tag.get("content"))
+            if price is not None:
+                return price
+    return None
+
+
+def _detect_in_stock(soup: BeautifulSoup) -> bool | None:
+    """Heuristic stock detection: True unless an out-of-stock marker appears."""
+    text = soup.get_text(" ", strip=True).lower()
+    if not text:
+        return None
+    for marker in _OUT_OF_STOCK_MARKERS:
+        if marker in text:
+            return False
+    return True
+
+
+def _extract_price(soup: BeautifulSoup, price_selector) -> float | None:
+    """Run the extraction cascade and return the first price found."""
+    # 1. Caller-supplied CSS selector (most robust).
+    if price_selector:
+        price = _extract_from_selector(soup, str(price_selector))
+        if price is not None:
+            return price
+
+    # 2. JSON-LD offers.
+    for candidate in _iter_json_ld_prices(soup):
+        price = _normalize_price(candidate)
+        if price is not None:
+            return price
+
+    # 3. Meta tags.
+    price = _extract_from_meta(soup)
+    if price is not None:
+        return price
+
+    # 4. Common-class heuristics.
+    for selector in _PRICE_HEURISTIC_SELECTORS:
+        price = _extract_from_selector(soup, selector)
+        if price is not None:
+            return price
+
+    return None
+
+
+def scrape_competitor_prices(targets: list[dict]) -> list[dict]:
+    """Scrape current prices for a list of competitor product pages.
+
+    For each target performs a GET with realistic headers (timeout + 1 retry)
+    and extracts the price using a cascade of strategies. Extraction failures
+    of a single target never abort the others: that row is returned with
+    price=None (and in_stock=None) so the caller still gets one row per target.
+
+    Args:
+        targets: list of dicts, each with keys:
+            - competitor (str): competitor name/id.
+            - product_key (str): stable internal product key.
+            - product_name (str): human-readable product name.
+            - url (str): product page URL to scrape.
+            - price_selector (str, optional): CSS selector pinpointing the
+              price node. Most robust when provided.
+            - currency (str, optional): currency code to stamp on the row
+              (e.g. "EUR"). Defaults to "EUR".
+
+    Returns:
+        list of dicts, one per target, with EXACTLY these keys (1:1 with the
+        Postgres `competitor_prices` table, minus id/snapshot_date/scraped_at):
+            - competitor (str)
+            - product_key (str)
+            - product_name (str)
+            - url (str)
+            - price (float | None)
+            - currency (str)
+            - in_stock (bool | None)
+    """
+    rows: list[dict] = []
+
+    for target in targets:
+        competitor = target.get("competitor")
+        product_key = target.get("product_key")
+        product_name = target.get("product_name")
+        url = target.get("url")
+        price_selector = target.get("price_selector")
+        currency = target.get("currency") or "EUR"
+
+        price: float | None = None
+        in_stock: bool | None = None
+
+        if url:
+            try:
+                html = _fetch_html(url)
+                soup = BeautifulSoup(html, "lxml")
+                price = _extract_price(soup, price_selector)
+                in_stock = _detect_in_stock(soup)
+            except Exception:  # noqa: BLE001 - never abort the whole batch
+                price = None
+                in_stock = None
+
+        rows.append(
+            {
+                "competitor": competitor,
+                "product_key": product_key,
+                "product_name": product_name,
+                "url": url,
+                "price": price,
+                "currency": currency,
+                "in_stock": in_stock,
+            }
+        )
+
+    return rows
+
+
+if __name__ == "__main__":
+    # Self-test, run only when the module is executed as a script. Reaching
+    # this point already proves the module-level imports succeeded.
+    print("self-test: import OK")
+
+    # Pure-logic checks that need no network: one case per supported format.
+    assert _normalize_price("1.299,99 €") == 1299.99, "EU thousands+decimal"
+    assert _normalize_price("$1,299.99") == 1299.99, "US thousands+decimal"
+    assert _normalize_price("29,90") == 29.90, "EU decimal only"
+    assert _normalize_price("1,299") == 1299.0, "US thousands only"
+    assert _normalize_price("1299.99") == 1299.99, "plain dot decimal"
+    assert _normalize_price("Precio: 49,95 EUR hoy") == 49.95, "embedded"
+    assert _normalize_price("no price here") is None, "no number"
+    assert _normalize_price(None) is None, "none in -> none out"
+    print("self-test: price normalization OK")
+
+    # Shape check: one row per target, exact keys, failed target -> price None.
+    # The URL is expected to be unresolvable, so this exercises the
+    # graceful-failure path of scrape_competitor_prices.
+    sample = scrape_competitor_prices(
+        [
+            {
+                "competitor": "demo",
+                "product_key": "SKU-1",
+                "product_name": "Demo product",
+                "url": "http://invalid.localhost.invalid/nope",
+                "currency": "EUR",
+            }
+        ]
+    )
+    expected_keys = {
+        "competitor",
+        "product_key",
+        "product_name",
+        "url",
+        "price",
+        "currency",
+        "in_stock",
+    }
+    assert len(sample) == 1, "one row per target"
+    assert set(sample[0].keys()) == expected_keys, "exact keys"
+    assert sample[0]["price"] is None, "failed target -> price None, no abort"
+    assert sample[0]["currency"] == "EUR", "currency default"
+    print("self-test: row shape + graceful-failure OK")
+
+    # Optional: best-effort real fetch against a public URL (never fails build).
+    # The result is only printed, not asserted, so an offline run still passes.
+    try:
+        live = scrape_competitor_prices(
+            [
+                {
+                    "competitor": "books-to-scrape",
+                    "product_key": "light-in-the-attic",
+                    "product_name": "A Light in the Attic",
+                    "url": (
+                        "http://books.toscrape.com/catalogue/"
+                        "a-light-in-the-attic_1000/index.html"
+                    ),
+                    "price_selector": "p.price_color",
+                    "currency": "GBP",
+                }
+            ]
+        )
+        print(f"self-test: live fetch -> price={live[0]['price']} "
+              f"in_stock={live[0]['in_stock']}")
+    except Exception as err:  # noqa: BLE001 - network optional
+        print(f"self-test: live fetch skipped ({type(err).__name__})")
+
+    print("self-test: ALL OK")
@@ -0,0 +1,77 @@
+---
+name: scrape_google_trends
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_google_trends(keywords: list[str], geo: str = \"ES\", timeframe: str = \"now 7-d\", include_related: bool = True) -> list[dict]"
+description: "Capta interes de busqueda de Google Trends por keyword/nicho via pytrends. El interes es relativo 0-100, NUNCA volumen absoluto. Aplana interest_over_time + related_queries (rising/top) en filas con schema fijo que casa 1:1 con la tabla Postgres google_trends. Backoff/retry ante 429."
+tags: [google-trends, pytrends, trends, market-intel, datascience]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [pytrends, time]
+params:
+  - name: keywords
+    desc: "lista de terminos/nichos a consultar (max 5 por payload, limite de Google Trends). Cada elemento es una keyword string."
+  - name: geo
+    desc: "codigo de pais ISO-3166 (ej. 'ES', 'US', '' para mundial). Default 'ES'."
+  - name: timeframe
+    desc: "ventana temporal en sintaxis pytrends (ej. 'now 7-d', 'today 3-m', 'today 12-m', '2024-01-01 2024-12-31'). Default 'now 7-d'."
+  - name: include_related
+    desc: "si True anade filas metric='rising' y metric='top' de related_queries por keyword. Si False solo interest_over_time. Default True."
+output: "lista de dicts con claves EXACTAS {geo, timeframe, keyword, metric, point_date, value, related_query}. Tres tipos de fila segun metric: 'interest_over_time' (point_date=fecha ISO, value=0-100, related_query=None), 'rising' (related_query=query, value=valor rising o BREAKOUT_SENTINEL, point_date=None), 'top' (related_query=query, value=0-100, point_date=None). No incluye id/snapshot_date/scraped_at (los anade el ingest)."
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/datascience/scrape_google_trends.py"
+---
+
+## Ejemplo
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience.scrape_google_trends import scrape_google_trends
+
+# Interes de busqueda en Espana, ultimos 7 dias, con related queries
+rows = scrape_google_trends(
+    ["coche electrico", "panel solar"],
+    geo="ES",
+    timeframe="now 7-d",
+    include_related=True,
+)
+
+# Cada fila tiene el mismo schema, listo para insertar en Postgres google_trends:
+# {"geo": "ES", "timeframe": "now 7-d", "keyword": "coche electrico",
+#  "metric": "interest_over_time", "point_date": "2026-06-12", "value": 73,
+#  "related_query": None}
+#
+# {"geo": "ES", "timeframe": "now 7-d", "keyword": "coche electrico",
+#  "metric": "rising", "point_date": None, "value": 999999,   # "Breakout"
+#  "related_query": "ayudas coche electrico 2026"}
+#
+# {"geo": "ES", "timeframe": "now 7-d", "keyword": "panel solar",
+#  "metric": "top", "point_date": None, "value": 100,
+#  "related_query": "placas solares precio"}
+
+interes = [r for r in rows if r["metric"] == "interest_over_time"]
+print(len(interes), "puntos de interes temporal")
+```
+
+## Cuando usarla
+
+Cuando necesites medir el interes/momentum de un nicho o keyword en el tiempo (market intelligence, deteccion de tendencias, validacion de demanda de producto) y vayas a persistirlo en la tabla Postgres `google_trends`. Usala antes del ingest: devuelve filas crudas con el schema exacto de la tabla, sin los campos que pone el ingest (id, snapshot_date, scraped_at). Pon `include_related=False` si solo te interesa la serie temporal y quieres minimizar la superficie de rate-limit.
+
+## Gotchas
+
+- **API no oficial + rate-limit (429).** pytrends scrapea una API interna de Google que NO es publica. Google la limita agresivamente: rafagas de llamadas devuelven HTTP 429. La funcion reintenta con backoff incremental (5s, 15s, 30s) ante 429; si tras esos reintentos sigue limitada, lanza `RuntimeError` mencionando explicitamente el rate-limit. En entornos de CI/headless es habitual recibir 429 a la primera — no es un bug de la funcion.
+- **Puede romperse sin aviso.** Al depender de un endpoint interno, Google puede cambiarlo y romper pytrends en cualquier momento. Trata los fallos como esperados y cachea resultados aguas arriba.
+- **Interes relativo, NO volumen absoluto.** Los valores 0-100 estan normalizados DENTRO del payload consultado (mismo geo + timeframe + conjunto de keywords). 100 = el pico del conjunto, no "100 busquedas". No son comparables entre payloads distintos. Cambiar el set de keywords reescala todos los valores.
+- **"Breakout" en rising.** Google marca como la cadena literal `"Breakout"` (en vez de un %) las related_queries rising cuyo crecimiento supera ~5000%. Para mantener la columna `value` numerica en Postgres se mapea al sentinel `BREAKOUT_SENTINEL = 999999`. Para detectar los breakouts, filtra las filas `metric='rising'` con `value == 999999`; un 999999 legitimo es imposible en la practica para un porcentaje.
+- **Maximo 5 keywords por payload.** Limite de Google Trends. Pasar mas keywords hace que pytrends falle o ignore las extra. Trocea en lotes de <=5 y llama varias veces (espaciando para no disparar el 429).
+- **DataFrames vacios.** `interest_over_time()` puede volver vacio (keyword sin datos en la ventana) y `related_queries()` devuelve un dict `{keyword: {'top': df|None, 'rising': df|None}}` con valores None. La funcion maneja ambos casos sin petar: simplemente no genera filas para esas combinaciones.
+- **Columna `isPartial`.** `interest_over_time()` incluye una columna `isPartial` que marca el ultimo punto como provisional. Se ignora por completo (solo se leen las columnas que coinciden con las keywords).
@@ -0,0 +1,193 @@
+"""Captación de interés de búsqueda de Google Trends vía pytrends.
+
+Google Trends NUNCA devuelve volúmenes absolutos de búsqueda: todo el interés es
+relativo y está normalizado en una escala 0-100 dentro del payload consultado
+(keywords + geo + timeframe). Esta función aplana el resultado de pytrends en una
+lista de dicts con un schema fijo que casa 1:1 con la tabla Postgres
+`google_trends`.
+"""
+
+import time
+
+
+# Numeric sentinel for "rising" related_queries that Google flags as "Breakout".
+# According to this module's notes, pytrends hands over the literal string
+# "Breakout" when the growth is too large to fit in a percentage (>5000%). It is
+# represented by this integer so the `value` column stays numeric in Postgres
+# without losing the signal.
+BREAKOUT_SENTINEL = 999999
+
+
+def _to_iso(value) -> str:
+    """Render a date-like value as an ISO ``YYYY-MM-DD`` string.
+
+    Objects exposing ``strftime`` (pandas Timestamp, ``datetime.date``,
+    ``datetime.datetime``) are formatted with it; anything else is stringified
+    and cut to its first 10 characters, which is the date part of an ISO string.
+    """
+    iso_text = (
+        value.strftime("%Y-%m-%d")
+        if hasattr(value, "strftime")
+        else str(value)[:10]
+    )
+    return iso_text
+
+
+def _coerce_value(raw):
+    """Normalize a related_queries ``value`` cell to an int, the sentinel or None.
+
+    Args:
+        raw: the cell as delivered by pytrends (int, float, string, None, ...).
+
+    Returns:
+        - For strings: ``BREAKOUT_SENTINEL`` when the text is "Breakout" (case and
+          surrounding whitespace ignored) or cannot be read as a finite number;
+          otherwise the number truncated toward zero (``"12.9"`` -> 12).
+        - For non-strings: ``int(raw)``, or None when the value cannot be
+          converted (None, NaN, infinities, arbitrary objects).
+    """
+    if isinstance(raw, str):
+        if raw.strip().lower() == "breakout":
+            return BREAKOUT_SENTINEL
+        try:
+            return int(float(raw))
+        except (ValueError, TypeError, OverflowError):
+            # OverflowError: text such as "inf" or "1e999" parses to a float
+            # infinity, which int() refuses to convert.
+            return BREAKOUT_SENTINEL
+    try:
+        return int(raw)
+    except (ValueError, TypeError, OverflowError):
+        # ValueError covers NaN, OverflowError covers +/-infinity.
+        return None
+
+
+def _is_rate_limit_error(exc: Exception) -> bool:
+    """Return True when *exc* looks like a Google Trends rate-limit (HTTP 429)."""
+    # Recent pytrends releases raise TooManyRequestsError on 429; matching the
+    # class name avoids importing pytrends at module level.
+    if type(exc).__name__ == "TooManyRequestsError":
+        return True
+    text = str(exc).lower()
+    # Explicit phrases only: a bare "rate" substring would also match unrelated
+    # messages ("generate", "iterate", ...) and trigger pointless backoff sleeps.
+    markers = ("429", "too many requests", "rate limit", "rate-limit", "ratelimit")
+    return any(marker in text for marker in markers)
+
+
+def _call_with_backoff(func, failure_prefix, delays=(5, 15, 30), sleep=time.sleep):
+    """Call ``func()`` and retry with increasing pauses while it is rate-limited.
+
+    Args:
+        func: zero-argument callable performing one pytrends request.
+        failure_prefix: text placed before the original error when the failure
+            is not a rate-limit.
+        delays: seconds to wait before each retry; ``len(delays)`` retries are
+            attempted after the first call.
+        sleep: function used to wait (injectable so callers can avoid real sleeps).
+
+    Returns:
+        Whatever ``func()`` returns on its first successful call.
+
+    Raises:
+        RuntimeError: when the call keeps being rate-limited after every retry,
+            or immediately when it fails for any other reason.
+    """
+    waits = list(delays)
+    for attempt in range(len(waits) + 1):
+        try:
+            return func()
+        except Exception as exc:  # pragma: no cover - network dependent
+            if not _is_rate_limit_error(exc):
+                raise RuntimeError(f"{failure_prefix}: {exc}") from exc
+            if attempt < len(waits):
+                sleep(waits[attempt])
+                continue
+            raise RuntimeError(
+                "Google Trends rate-limited (429): se agotaron los reintentos "
+                f"({len(waits)} backoffs {waits}s). pytrends usa una API no "
+                "oficial y Google la limita agresivamente. Reintenta más tarde."
+            ) from exc
+
+
+def scrape_google_trends(
+    keywords: list[str],
+    geo: str = "ES",
+    timeframe: str = "now 7-d",
+    include_related: bool = True,
+) -> list[dict]:
+    """Collect Google Trends search interest for a list of keywords via pytrends.
+
+    Builds a single pytrends payload (keywords + geo + timeframe) and flattens
+    interest_over_time and, optionally, related_queries (rising + top) into
+    homogeneous rows. Interest is relative (0-100), never an absolute volume.
+    Every pytrends request (build_payload, interest_over_time, related_queries)
+    is retried with backoff (5s, 15s, 30s) while Google answers with a rate-limit.
+
+    Args:
+        keywords: terms/niches to query (the module notes a Google Trends limit
+            of 5 per payload). An empty list returns [] without touching pytrends.
+        geo: ISO-3166 country code (e.g. "ES", "US", "" for worldwide).
+        timeframe: time window in pytrends syntax (e.g. "now 7-d", "today 3-m",
+            "today 12-m", "2024-01-01 2024-12-31").
+        include_related: when True, also emit metric="rising" and metric="top"
+            rows from related_queries for each keyword.
+
+    Returns:
+        List of dicts with exactly these keys (id/snapshot_date/scraped_at are
+        left to the ingest step):
+            geo, timeframe, keyword, metric, point_date, value, related_query
+        Three row families depending on ``metric``:
+          - "interest_over_time": one per (keyword, time point); point_date is an
+            ISO date, value the interest as int, related_query None.
+          - "rising": related_query is the query, value comes from
+            ``_coerce_value`` (Breakout -> BREAKOUT_SENTINEL), point_date None.
+          - "top": related_query is the query, value comes from
+            ``_coerce_value``, point_date None.
+
+    Raises:
+        RuntimeError: when Google keeps rate-limiting (429) after all retries, or
+            when a pytrends call fails in a non-recoverable way.
+    """
+    if not keywords:
+        return []
+
+    # Imported inside the function: pytrends is an external, impure dependency.
+    from pytrends.request import TrendReq
+
+    pytrends = TrendReq(hl="es-ES", tz=60)
+
+    # ---- build_payload ----
+    _call_with_backoff(
+        lambda: pytrends.build_payload(keywords, geo=geo, timeframe=timeframe),
+        "build_payload falló de forma no recuperable",
+    )
+
+    rows: list[dict] = []
+
+    # ---- interest_over_time ----
+    iot = _call_with_backoff(pytrends.interest_over_time, "interest_over_time falló")
+
+    if iot is not None and not iot.empty:
+        # The index is the date; columns are the keywords plus 'isPartial', which
+        # is skipped because only keyword columns are read.
+        for idx, record in iot.iterrows():
+            point_date = _to_iso(idx)
+            for kw in keywords:
+                if kw not in record:
+                    continue
+                rows.append(
+                    {
+                        "geo": geo,
+                        "timeframe": timeframe,
+                        "keyword": kw,
+                        "metric": "interest_over_time",
+                        "point_date": point_date,
+                        "value": int(record[kw]),
+                        "related_query": None,
+                    }
+                )
+
+    # ---- related_queries (rising + top) ----
+    if include_related:
+        related = (
+            _call_with_backoff(pytrends.related_queries, "related_queries falló")
+            or {}
+        )
+        for kw in keywords:
+            entry = related.get(kw) or {}
+            for metric in ("rising", "top"):
+                df = entry.get(metric)
+                # Missing (None) and empty frames simply produce no rows.
+                if df is None or getattr(df, "empty", True):
+                    continue
+                for _, qrow in df.iterrows():
+                    rows.append(
+                        {
+                            "geo": geo,
+                            "timeframe": timeframe,
+                            "keyword": kw,
+                            "metric": metric,
+                            "point_date": None,
+                            "value": _coerce_value(qrow.get("value")),
+                            "related_query": qrow.get("query"),
+                        }
+                    )
+
+    return rows
+
+
+if __name__ == "__main__":
+    # Self-test: reaching this point proves the module imports. A live call to
+    # Google may be rate-limited (429) in this environment, so RuntimeError is
+    # reported instead of failing the run.
+    print("import OK")
+    try:
+        filas = scrape_google_trends(["python", "rust"], geo="ES", timeframe="now 7-d")
+        # Count the rows of each metric family in a single pass.
+        tally = {"interest_over_time": 0, "rising": 0, "top": 0}
+        for fila in filas:
+            familia = fila["metric"]
+            if familia in tally:
+                tally[familia] += 1
+        resumen = (
+            f"ok: {len(filas)} filas "
+            f"(interest_over_time={tally['interest_over_time']}, "
+            f"rising={tally['rising']}, top={tally['top']})"
+        )
+        print(resumen)
+        if filas:
+            print("muestra:", filas[0])
+    except RuntimeError as exc:
+        print(f"rate-limited o error de red (esperado en este entorno): {exc}")
@@ -0,0 +1,99 @@
+---
+name: scrape_tiktok_creative
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def scrape_tiktok_creative(country: str = 'ES', kind: str = 'hashtag', limit: int = 50, period: int = 7) -> list[dict]"
+description: "Capta tendencias del TikTok Creative Center (hashtags, canciones, creadores y videos virales con metricas reales) via su API JSON interna creative_radar_api. Headers realistas con requests, paginacion, parseo tolerante a cambios de schema. Devuelve filas 1:1 con la tabla Postgres tiktok_trends. Impure: hace HTTP a un endpoint interno no publico que puede romperse o exigir anti-bot."
+tags: [tiktok, social, trends, market-intel, datascience]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [requests]
+params:
+  - name: country
+    desc: "Codigo ISO de pais del ranking (ej. 'ES', 'US', 'MX'). El Creative Center segmenta las tendencias por mercado. Default 'ES'."
+  - name: kind
+    desc: "Tipo de tendencia: 'hashtag' (default, el mas estable), 'song', 'creator' o 'video'. Cada uno usa un endpoint interno distinto. Empieza por hashtag si no estas seguro."
+  - name: limit
+    desc: "Numero maximo de filas a devolver. El endpoint pagina de 50 en 50; la funcion concatena paginas hasta alcanzar limit o agotar resultados. Default 50."
+  - name: period
+    desc: "Ventana temporal en dias. Solo acepta 7 (default), 30 o 120 — el endpoint rechaza otros valores con error de validacion."
+output: "Lista de dicts con EXACTAMENTE las claves: country (str), kind (str), name (str|None), rank (int|None), views (int|None, BIGINT), growth_pct (float|None), industry (str|None), url (str|None). Mapea 1:1 con la tabla Postgres tiktok_trends (sin id/snapshot_date/scraped_at). Devuelve [] si el endpoint responde OK pero sin items para el segmento. Lanza ValueError (kind/period invalidos) o RuntimeError (403 anti-bot, HTTP de error, JSON invalido, code de error logico)."
+tested: false
+tests: []
+test_file_path: ""
+file_path: "python/functions/datascience/scrape_tiktok_creative.py"
+notes: |
+  ESTRATEGIA: el Creative Center (ads.tiktok.com/business/creativecenter) es una
+  SPA JS-rendered, pero alimenta sus rankings desde una API interna de facto bajo
+  https://ads.tiktok.com/creative_radar_api/v1/popular_trend/... Esta funcion habla
+  directamente con ese endpoint con requests (mucho mas barato que un navegador
+  headless CUANDO responde). El parseo tolera variaciones del schema (data.list,
+  data.hashtags, data.items...) y nombres de campo distintos por kind.
+
+  REALISMO: en pruebas reales desde un entorno headless/datacenter el endpoint
+  respondio con code=40101 ("no permission") — rechazo anti-bot por falta de los
+  tokens de sesion firmados (anonymous-user-id, user-sign, timestamp) que la SPA
+  genera en cliente y que no se pueden falsear fuera del navegador. La funcion NO
+  inventa datos: en ese caso lanza RuntimeError con un mensaje claro. Se considera
+  el comportamiento esperado, no un bug de la funcion.
+---
+
+## Ejemplo
+
+```python
+from datascience.scrape_tiktok_creative import scrape_tiktok_creative
+
+# Top 50 hashtags virales en Espana, ultimos 7 dias.
+rows = scrape_tiktok_creative(country="ES", kind="hashtag", limit=50, period=7)
+# rows[0] -> {
+#   "country": "ES", "kind": "hashtag", "name": "fyp", "rank": 1,
+#   "views": 12450000, "growth_pct": 42.0, "industry": "Entertainment",
+#   "url": "https://ads.tiktok.com/business/creativecenter/hashtag/fyp/pc/en"
+# }
+
+# Canciones en tendencia en US, ventana de 30 dias.
+songs = scrape_tiktok_creative(country="US", kind="song", limit=20, period=30)
+
+# Las filas casan 1:1 con un INSERT en la tabla Postgres tiktok_trends
+# (sin id/snapshot_date/scraped_at, que los pone la BD).
+```
+
+## Cuando usarla
+
+Usala cuando necesites market intelligence de TikTok: detectar hashtags, canciones,
+creadores o videos virales por pais con metricas reales (views, ranking,
+crecimiento) para alimentar la tabla `tiktok_trends`, un dashboard de tendencias o
+un analisis de oportunidad de contenido. Empieza por `kind="hashtag"` (el endpoint
+mas estable) antes de probar song/creator/video. Si el fetch HTTP lanza
+RuntimeError por anti-bot, baja al browser MCP/CDP del ecosistema.
+
+## Gotchas
+
+- **El endpoint interno NO es una API publica versionada.** `creative_radar_api/v1/popular_trend`
+  es un contrato de facto que TikTok cambia sin aviso: ruta, parametros, schema del
+  JSON y claves de campo pueden romperse en cualquier deploy. El parseo es tolerante
+  pero no inmune; si TikTok mueve la lista a otra ruta, la funcion devuelve [] o
+  lanza RuntimeError.
+- **Anti-bot real y frecuente.** Desde IPs de datacenter o entornos headless el
+  endpoint suele responder `403` o `code=40101 (no permission)`. Los rankings se
+  sirven solo a clientes con los tokens de sesion firmados que la SPA genera en
+  navegador (`anonymous-user-id`, `user-sign`, `timestamp`). Esos tokens NO se
+  pueden falsear con requests. **Verificado en self-test: respondio code=40101.**
+- **Alternativa robusta cuando el HTTP esta bloqueado:** usar el browser MCP/CDP del
+  ecosistema (regla `flow_replay.md`) navegando el Creative Center con una sesion de
+  chrome real, dejando que el cliente genere los tokens, y leyendo el JSON de la
+  respuesta XHR o el DOM renderizado. Es mas caro pero pasa el anti-bot.
+- **No inventa datos.** Si no puede extraer de verdad, lanza una excepcion clara con
+  el codigo HTTP / code logico para diagnostico, en vez de devolver filas falsas.
+- **growth_pct heuristico:** el Creative Center expresa el crecimiento como ratio
+  (0.42) o como porcentaje (42) segun campo/version; la funcion normaliza ratios en
+  [-1, 1] a porcentaje (*100). Si TikTok cambia la convencion, revisar `_row_from_item`.
+- **Rate limiting:** la paginacion hace una request por pagina de 50. Para `limit`
+  altos la funcion encadena varias requests seguidas sin pausa entre paginas —
+  anade backoff propio si scrapeas muchos paises seguidos para no acelerar el bloqueo.
@@ -0,0 +1,287 @@
+"""Scrape de tendencias del TikTok Creative Center via su API JSON interna.
+
+El TikTok Creative Center (https://ads.tiktok.com/business/creativecenter/) es una
+SPA JS-rendered, pero alimenta sus rankings desde una API interna documentada de
+facto bajo `https://ads.tiktok.com/creative_radar_api/v1/popular_trend/...`.
+Esta funcion habla DIRECTAMENTE con ese endpoint usando `requests` con headers
+realistas, evitando el coste de un navegador headless cuando el endpoint responde.
+
+ADVERTENCIA: el endpoint interno cambia sin aviso, puede exigir token anti-bot y
+desde IPs de datacenter/headless suele devolver 403 o listas vacias. La funcion
+falla con una excepcion clara cuando el endpoint no responde como se espera. La
+alternativa robusta para entornos bloqueados es el browser MCP/CDP del ecosistema
+navegando el Creative Center con una sesion real (ver `## Gotchas` del .md).
+"""
+
+from __future__ import annotations
+
+import requests
+
+# Internal Creative Center endpoints, one per trend kind. They are de facto APIs
+# (neither public nor versioned as a contract) and may break on any TikTok
+# deploy. They are kept in one place to make patching easier.
+_BASE = "https://ads.tiktok.com/creative_radar_api/v1/popular_trend"
+_ENDPOINTS: dict[str, str] = {
+    "hashtag": f"{_BASE}/hashtag/list",
+    "song": f"{_BASE}/song/list",
+    "creator": f"{_BASE}/creator/list",
+    "video": f"{_BASE}/list",
+}
+
+# Valid Creative Center periods (in days). The endpoint rejects other values.
+_VALID_PERIODS = {7, 30, 120}
+
+# Browser-like request headers sent with every call to the internal endpoint.
+_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "en-US,en;q=0.9,es;q=0.8",
+    "Referer": "https://ads.tiktok.com/business/creativecenter/inspiration/popular/hashtag/pc/en",
+    "Origin": "https://ads.tiktok.com",
+    # Author's note: the Creative Center requires these headers to serve JSON and
+    # returns HTML without them. They are sent empty here because the signed
+    # values are generated client-side by the SPA (see the module docstring).
+    "anonymous-user-id": "",
+    "timestamp": "",
+    "user-sign": "",
+}
+
+
+def _to_int(value: object) -> int | None:
+    """Convert a numeric payload value to ``int``; return None when unparseable.
+
+    Integers and integer-like strings are converted exactly, so counters larger
+    than 2**53 keep every digit instead of being rounded through ``float``.
+    Other inputs (floats, decimal strings such as ``"12.7"``) go through
+    ``float`` and are truncated toward zero. None, non-numeric text, NaN and
+    infinities yield None.
+    """
+    if value is None:
+        return None
+    if isinstance(value, int):
+        return int(value)
+    if isinstance(value, str):
+        try:
+            return int(value)
+        except ValueError:
+            # Not a plain integer literal (e.g. "12.7", "1e6"): try float below.
+            pass
+    try:
+        return int(float(value))
+    except (TypeError, ValueError, OverflowError):
+        # ValueError covers NaN / junk text, OverflowError covers infinities.
+        return None
+
+
+def _to_float(value: object) -> float | None:
+    """Return ``float(value)``, or None for None and for values float() rejects."""
+    if value is not None:
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            # Fall through to the shared "not parseable" result.
+            pass
+    return None
+
+
+def _extract_items(payload: dict) -> list[dict]:
+    """Find the list of items inside the JSON payload, tolerating schema drift.
+
+    Looks under ``payload["data"]``: the well-known keys are tried in order and
+    the first one holding a list (even an empty one) wins; otherwise the first
+    non-empty list found among the values of ``data`` is returned. Returns []
+    when ``data`` is not a dict or holds no list.
+    """
+    container = payload.get("data")
+    if isinstance(container, dict):
+        known_keys = ("list", "hashtags", "songs", "creators", "videos", "items")
+        candidates = (container.get(name) for name in known_keys)
+        match = next((c for c in candidates if isinstance(c, list)), None)
+        if match is not None:
+            return match
+        # Fallback: first non-empty list appearing anywhere directly under data.
+        return next(
+            (entry for entry in container.values() if isinstance(entry, list) and entry),
+            [],
+        )
+    return []
+
+
+def _first_parsed(item: dict, keys: tuple, parser):
+    """Return the first non-None ``parser(item.get(key))`` over *keys*, else None."""
+    for key in keys:
+        parsed = parser(item.get(key))
+        if parsed is not None:
+            return parsed
+    return None
+
+
+def _row_from_item(item: dict, country: str, kind: str, fallback_rank: int) -> dict:
+    """Normalize a raw payload item into the canonical ``tiktok_trends`` row.
+
+    Args:
+        item: raw item dict from the endpoint payload.
+        country: country code copied verbatim into the row.
+        kind: trend kind copied verbatim into the row; "hashtag" also enables
+            building a Creative Center URL when the item carries none.
+        fallback_rank: rank used when the item has no usable (non-zero) rank.
+
+    Returns:
+        Dict with exactly the keys country, kind, name, rank, views, growth_pct,
+        industry, url. Field names differing per kind are tolerated by probing
+        several candidate keys for each output column.
+    """
+    name = (
+        item.get("hashtag_name")
+        or item.get("title")
+        or item.get("name")
+        or item.get("nickname")
+        or item.get("song_title")
+        or item.get("music_name")
+        or item.get("keyword")
+    )
+
+    # A missing or zero rank is treated as absent, so the positional fallback
+    # applies (the fallback computed by the caller is 1-based).
+    rank = (
+        _to_int(item.get("rank"))
+        or _to_int(item.get("trend_rank"))
+        or fallback_rank
+    )
+
+    # Views / publication volume depending on the trend kind. The first field
+    # that parses wins, so a legitimate 0 is kept instead of collapsing to None.
+    views = _first_parsed(
+        item,
+        ("video_views", "views", "publish_cnt", "post_count", "play_count"),
+        _to_int,
+    )
+
+    # Growth arrives either as a ratio (0.42) or as a percentage (42). Fields
+    # that are present but not numeric (e.g. a list) are skipped rather than
+    # masking the next candidate.
+    growth_pct = _first_parsed(item, ("trend", "rank_diff", "growth"), _to_float)
+    if growth_pct is not None and -1.0 <= growth_pct <= 1.0:
+        # Heuristic: values inside [-1, 1] are taken as ratios and scaled to %.
+        growth_pct = round(growth_pct * 100.0, 2)
+
+    industry = None
+    industries = item.get("industry_info") or item.get("industry")
+    if isinstance(industries, dict):
+        industry = industries.get("value") or industries.get("label")
+    elif isinstance(industries, list) and industries:
+        first = industries[0]
+        if isinstance(first, dict):
+            # Same value/label precedence as the single-dict form above.
+            industry = first.get("value") or first.get("label")
+        else:
+            industry = str(first)
+    elif isinstance(industries, str):
+        industry = industries
+
+    url = item.get("url") or item.get("link")
+    if not url and kind == "hashtag" and name:
+        slug = str(name).lstrip("#")
+        url = (
+            "https://ads.tiktok.com/business/creativecenter/hashtag/"
+            f"{slug}/pc/en"
+        )
+
+    return {
+        "country": country,
+        "kind": kind,
+        "name": str(name) if name is not None else None,
+        "rank": rank,
+        "views": views,
+        "growth_pct": growth_pct,
+        "industry": industry,
+        "url": url,
+    }
+
+
+def scrape_tiktok_creative(
+    country: str = "ES",
+    kind: str = "hashtag",
+    limit: int = 50,
+    period: int = 7,
+) -> list[dict]:
+    """Collect TikTok Creative Center trends through its internal JSON API.
+
+    Args:
+        country: country code of the ranking (e.g. "ES", "US", "MX"); sent to the
+            endpoint as ``country_code``.
+        kind: trend kind, one of "hashtag" (default), "song", "creator", "video".
+        limit: maximum number of rows to return (pages of 50 are requested until
+            the limit is reached or results run out).
+        period: time window in days. Valid values: 7 (default), 30, 120.
+
+    Returns:
+        List of dicts with exactly the keys country, kind, name, rank, views,
+        growth_pct, industry, url. Returns [] when the endpoint answers OK but
+        carries no items for the requested segment.
+
+    Raises:
+        ValueError: when ``kind`` or ``period`` is not valid.
+        RuntimeError: when the internal endpoint does not answer with usable
+            JSON (network failure, HTTP error, anti-bot 403, non-JSON or
+            non-object body, logical ``code`` != 0). The message carries the
+            HTTP status or cause for diagnosis.
+    """
+    if kind not in _ENDPOINTS:
+        raise ValueError(
+            f"kind invalido: {kind!r}. Validos: {sorted(_ENDPOINTS)}"
+        )
+    if period not in _VALID_PERIODS:
+        raise ValueError(
+            f"period invalido: {period}. Validos: {sorted(_VALID_PERIODS)}"
+        )
+
+    endpoint = _ENDPOINTS[kind]
+    rows: list[dict] = []
+    page = 1
+    page_size = 50
+
+    # The context manager closes the session (and its pooled connections) on
+    # every exit path, including the RuntimeError branches below.
+    with requests.Session() as session:
+        session.headers.update(_HEADERS)
+
+        while len(rows) < limit:
+            params = {
+                "page": page,
+                "limit": page_size,
+                "period": period,
+                "country_code": country,
+                "sort_by": "popular",
+            }
+            try:
+                resp = session.get(endpoint, params=params, timeout=15)
+            except requests.RequestException as exc:
+                raise RuntimeError(
+                    "TikTok Creative Center: fallo de red contactando el endpoint "
+                    f"interno {endpoint!r}: {exc}. Alternativa: usar el browser "
+                    "MCP/CDP del ecosistema con sesion real (ver .md ## Gotchas)."
+                ) from exc
+
+            if resp.status_code == 403:
+                raise RuntimeError(
+                    "TikTok Creative Center devolvio 403 (anti-bot / IP de "
+                    "datacenter bloqueada). El endpoint JSON interno requiere "
+                    "tokens de sesion (anonymous-user-id/user-sign) que no se "
+                    "pueden falsear desde headless. Alternativa robusta: browser "
+                    "MCP/CDP navegando el Creative Center con sesion real."
+                )
+            if resp.status_code != 200:
+                raise RuntimeError(
+                    f"TikTok Creative Center devolvio HTTP {resp.status_code} para "
+                    f"{endpoint!r}. El endpoint interno pudo cambiar de ruta o de "
+                    "contrato (no es una API publica versionada)."
+                )
+
+            try:
+                payload = resp.json()
+            except ValueError as exc:
+                raise RuntimeError(
+                    "TikTok Creative Center no devolvio JSON (probable HTML de "
+                    "challenge o pagina de login). El endpoint interno cambio o "
+                    "exige sesion real. Alternativa: browser MCP/CDP."
+                ) from exc
+
+            # Valid JSON that is not an object (list, null, string) would crash
+            # on .get() below; surface it as the documented RuntimeError instead.
+            if not isinstance(payload, dict):
+                raise RuntimeError(
+                    "TikTok Creative Center devolvio un JSON inesperado "
+                    f"(tipo {type(payload).__name__}, se esperaba un objeto). "
+                    "El endpoint interno pudo cambiar de contrato."
+                )
+
+            # The response is wrapped as {code, msg, data}; a code other than 0
+            # (or a missing one) is treated as a logical error.
+            code = payload.get("code")
+            if code not in (0, None):
+                raise RuntimeError(
+                    f"TikTok Creative Center respondio code={code} "
+                    f"({payload.get('msg', 'sin mensaje')}). El endpoint interno "
+                    "rechazo la peticion (parametros o anti-bot)."
+                )
+
+            items = _extract_items(payload)
+            if not items:
+                break
+
+            for offset, item in enumerate(items):
+                if not isinstance(item, dict):
+                    continue
+                # Positional 1-based rank, used when the item carries none.
+                rank_fallback = (page - 1) * page_size + offset + 1
+                rows.append(_row_from_item(item, country, kind, rank_fallback))
+                if len(rows) >= limit:
+                    break
+
+            # A short page means there are no further results.
+            if len(items) < page_size:
+                break
+            page += 1
+
+    return rows[:limit]
+
+
+if __name__ == "__main__":
+    # Honest self-test: the import must succeed, followed by ONE real fetch
+    # attempt that never fails the build because of the network. It reports
+    # whether TikTok answered or blocked/changed the endpoint.
+    print("import OK: scrape_tiktok_creative cargado")
+    try:
+        filas = scrape_tiktok_creative(country="ES", kind="hashtag", limit=10, period=7)
+        if not filas:
+            reporte = (
+                "FETCH REAL: el endpoint respondio pero sin items "
+                "(segmento vacio o anti-bot silencioso)."
+            )
+        else:
+            reporte = f"FETCH REAL OK: {len(filas)} filas. Primera: {filas[0]}"
+        print(reporte)
+    except Exception as exc:  # noqa: BLE001 -- honest self-test, never propagates
+        print(f"FETCH REAL FALLO (esperable desde headless/datacenter): {exc}")