e1e9bb7499
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
288 lines
11 KiB
Python
288 lines
11 KiB
Python
"""Scrape de tendencias del TikTok Creative Center via su API JSON interna.
|
|
|
|
The TikTok Creative Center (https://ads.tiktok.com/business/creativecenter/) is a
JS-rendered SPA, but it feeds its rankings from a de facto internal API under
`https://ads.tiktok.com/creative_radar_api/v1/popular_trend/...`.
This function talks DIRECTLY to that endpoint using `requests` with realistic
headers, avoiding the cost of a headless browser when the endpoint responds.

WARNING: the internal endpoint changes without notice, may require an anti-bot
token, and from datacenter/headless IPs it often returns 403 or empty lists. The
function fails with a clear exception when the endpoint does not respond as
expected. The robust alternative for blocked environments is the ecosystem's
browser MCP/CDP, browsing the Creative Center with a real session (see
`## Gotchas` in the .md).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import requests
|
|
|
|
# Endpoints internos del Creative Center por tipo de tendencia. Son APIs de facto
|
|
# (no publicas ni versionadas como contrato) y pueden romperse en cualquier deploy
|
|
# de TikTok. Se mantienen aqui en un solo sitio para facilitar el parcheo.
|
|
_BASE = "https://ads.tiktok.com/creative_radar_api/v1/popular_trend"
|
|
_ENDPOINTS: dict[str, str] = {
|
|
"hashtag": f"{_BASE}/hashtag/list",
|
|
"song": f"{_BASE}/song/list",
|
|
"creator": f"{_BASE}/creator/list",
|
|
"video": f"{_BASE}/list",
|
|
}
|
|
|
|
# Periodos validos del Creative Center (en dias). El endpoint rechaza otros valores.
|
|
_VALID_PERIODS = {7, 30, 120}
|
|
|
|
_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept": "application/json, text/plain, */*",
|
|
"Accept-Language": "en-US,en;q=0.9,es;q=0.8",
|
|
"Referer": "https://ads.tiktok.com/business/creativecenter/inspiration/popular/hashtag/pc/en",
|
|
"Origin": "https://ads.tiktok.com",
|
|
# El Creative Center exige este header para servir JSON; sin el devuelve HTML.
|
|
"anonymous-user-id": "",
|
|
"timestamp": "",
|
|
"user-sign": "",
|
|
}
|
|
|
|
|
|
def _to_int(value: object) -> int | None:
|
|
"""Convierte un valor numerico del payload a int, o None si no es parseable."""
|
|
if value is None:
|
|
return None
|
|
try:
|
|
# Algunos campos vienen como string ("1234567") o float (1234567.0).
|
|
return int(float(value))
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def _to_float(value: object) -> float | None:
|
|
"""Convierte un valor numerico del payload a float, o None si no es parseable."""
|
|
if value is None:
|
|
return None
|
|
try:
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def _extract_items(payload: dict) -> list[dict]:
|
|
"""Localiza la lista de items dentro del JSON, tolerando variaciones del schema.
|
|
|
|
El Creative Center ha servido la lista bajo distintas rutas a lo largo del
|
|
tiempo (`data.list`, `data.hashtags`, `data.items`, ...). Se prueban en orden.
|
|
"""
|
|
data = payload.get("data")
|
|
if not isinstance(data, dict):
|
|
return []
|
|
for key in ("list", "hashtags", "songs", "creators", "videos", "items"):
|
|
candidate = data.get(key)
|
|
if isinstance(candidate, list):
|
|
return candidate
|
|
# Fallback: la primera lista no vacia que aparezca dentro de data.
|
|
for value in data.values():
|
|
if isinstance(value, list) and value:
|
|
return value
|
|
return []
|
|
|
|
|
|
def _row_from_item(item: dict, country: str, kind: str, fallback_rank: int) -> dict:
    """Normalize one raw payload item into the canonical `tiktok_trends` row.

    Output keys (1:1 with the Postgres table): country, kind, name, rank, views,
    growth_pct, industry, url. Field names differ per trend kind, so several
    candidate keys are tried for each output value.

    Args:
        item: raw item dict from the endpoint payload.
        country: country code copied verbatim into the row.
        kind: trend kind copied verbatim into the row. "hashtag" also enables
            the synthesized Creative Center URL when the item carries none.
        fallback_rank: rank to use when the item has no parseable rank.

    Returns:
        A dict with exactly the eight keys listed above. `views` and
        `growth_pct` are None when no candidate field is parseable.
    """
    name = (
        item.get("hashtag_name")
        or item.get("title")
        or item.get("name")
        or item.get("nickname")
        or item.get("song_title")
        or item.get("music_name")
        or item.get("keyword")
    )

    rank = _to_int(item.get("rank")) or _to_int(item.get("trend_rank"))
    if rank is None:
        rank = fallback_rank

    # View / publication volume, depending on the trend kind. The first field
    # that parses wins, so a genuine 0 is kept instead of being treated as
    # missing and replaced by a different metric (or by None).
    views = None
    for key in ("video_views", "views", "publish_cnt", "post_count", "play_count"):
        views = _to_int(item.get(key))
        if views is not None:
            break

    # The Creative Center expresses growth as a ratio (0.42) or a percentage
    # (42). Take the first field that parses: an unparseable value (e.g. a
    # list) must not hide a later usable one, and 0 is a legitimate value.
    growth_pct = None
    for key in ("trend", "rank_diff", "growth"):
        growth_pct = _to_float(item.get(key))
        if growth_pct is not None:
            break
    if growth_pct is not None and -1.0 <= growth_pct <= 1.0:
        # Heuristic: a value in [-1, 1] is assumed to be a ratio and is
        # normalized to a percentage. A true percentage in that range is
        # indistinguishable and gets scaled as well.
        growth_pct = round(growth_pct * 100.0, 2)

    industry = None
    industries = item.get("industry_info") or item.get("industry")
    if isinstance(industries, dict):
        industry = industries.get("value") or industries.get("label")
    elif isinstance(industries, list) and industries:
        first = industries[0]
        if isinstance(first, dict):
            # Same value/label fallback as the single-dict shape above.
            industry = first.get("value") or first.get("label")
        else:
            industry = str(first)
    elif isinstance(industries, str):
        industry = industries

    url = item.get("url") or item.get("link")
    if not url and kind == "hashtag" and name:
        # Build the Creative Center detail URL from the hashtag name.
        slug = str(name).lstrip("#")
        url = (
            "https://ads.tiktok.com/business/creativecenter/hashtag/"
            f"{slug}/pc/en"
        )

    return {
        "country": country,
        "kind": kind,
        "name": str(name) if name is not None else None,
        "rank": rank,
        "views": views,
        "growth_pct": growth_pct,
        "industry": industry,
        "url": url,
    }
|
|
|
|
|
|
def _fetch_page(session: requests.Session, endpoint: str, params: dict) -> dict:
    """Request one page from the internal endpoint and return its JSON envelope.

    Raises:
        RuntimeError: on network failure, HTTP 403, any other non-200 status,
            a non-JSON body, a JSON body that is not an object, or a logical
            error reported through a non-zero `code` field.
    """
    try:
        resp = session.get(endpoint, params=params, timeout=15)
    except requests.RequestException as exc:
        raise RuntimeError(
            "TikTok Creative Center: fallo de red contactando el endpoint "
            f"interno {endpoint!r}: {exc}. Alternativa: usar el browser "
            "MCP/CDP del ecosistema con sesion real (ver .md ## Gotchas)."
        ) from exc

    if resp.status_code == 403:
        raise RuntimeError(
            "TikTok Creative Center devolvio 403 (anti-bot / IP de "
            "datacenter bloqueada). El endpoint JSON interno requiere "
            "tokens de sesion (anonymous-user-id/user-sign) que no se "
            "pueden falsear desde headless. Alternativa robusta: browser "
            "MCP/CDP navegando el Creative Center con sesion real."
        )
    if resp.status_code != 200:
        raise RuntimeError(
            f"TikTok Creative Center devolvio HTTP {resp.status_code} para "
            f"{endpoint!r}. El endpoint interno pudo cambiar de ruta o de "
            "contrato (no es una API publica versionada)."
        )

    try:
        payload = resp.json()
    except ValueError as exc:
        raise RuntimeError(
            "TikTok Creative Center no devolvio JSON (probable HTML de "
            "challenge o pagina de login). El endpoint interno cambio o "
            "exige sesion real. Alternativa: browser MCP/CDP."
        ) from exc

    # A valid JSON body can still be an array, null or a scalar. Surface that
    # as the documented RuntimeError rather than an AttributeError on `.get`.
    if not isinstance(payload, dict):
        raise RuntimeError(
            "TikTok Creative Center devolvio JSON con forma inesperada "
            f"(se esperaba un objeto, llego {type(payload).__name__}). El "
            "endpoint interno pudo cambiar de contrato."
        )

    # TikTok wraps the response in {code, msg, data}; code != 0 is a logical error.
    code = payload.get("code")
    if code not in (0, None):
        raise RuntimeError(
            f"TikTok Creative Center respondio code={code} "
            f"({payload.get('msg', 'sin mensaje')}). El endpoint interno "
            "rechazo la peticion (parametros o anti-bot)."
        )
    return payload


def scrape_tiktok_creative(
    country: str = "ES",
    kind: str = "hashtag",
    limit: int = 50,
    period: int = 7,
) -> list[dict]:
    """Fetch trends from the TikTok Creative Center through its internal JSON API.

    Args:
        country: ISO country code of the ranking (e.g. "ES", "US", "MX"). The
            Creative Center segments trends by market.
        kind: trend kind. One of "hashtag" (default, the most stable), "song",
            "creator", "video".
        limit: maximum number of rows to return (the endpoint is paged 50 at a
            time). A value <= 0 returns [] without issuing any request.
        period: time window in days. Valid values: 7 (default), 30, 120.

    Returns:
        A list of dicts with EXACTLY these keys: country, kind, name, rank,
        views, growth_pct, industry, url. Maps 1:1 onto the Postgres table
        `tiktok_trends` (without id/snapshot_date/scraped_at). Returns [] if the
        endpoint answers OK but has no items for the requested segment.

    Raises:
        ValueError: if `kind` or `period` is not valid.
        RuntimeError: if the internal endpoint does not answer with usable JSON
            (HTTP error, anti-bot, schema change, datacenter/headless block).
            The message carries the HTTP code or the cause for diagnosis.
    """
    if kind not in _ENDPOINTS:
        raise ValueError(
            f"kind invalido: {kind!r}. Validos: {sorted(_ENDPOINTS)}"
        )
    if period not in _VALID_PERIODS:
        raise ValueError(
            f"period invalido: {period}. Validos: {sorted(_VALID_PERIODS)}"
        )

    endpoint = _ENDPOINTS[kind]
    page_size = 50
    rows: list[dict] = []

    # The context manager closes the session (and its pooled connections) on
    # every exit path, including the RuntimeErrors raised while fetching.
    with requests.Session() as session:
        session.headers.update(_HEADERS)
        page = 1
        while len(rows) < limit:
            params = {
                "page": page,
                "limit": page_size,
                "period": period,
                "country_code": country,
                "sort_by": "popular",
            }
            items = _extract_items(_fetch_page(session, endpoint, params))
            if not items:
                break

            for offset, item in enumerate(items):
                if not isinstance(item, dict):
                    continue
                # Position-based rank, used only when the item carries none.
                rank_fallback = (page - 1) * page_size + offset + 1
                rows.append(_row_from_item(item, country, kind, rank_fallback))
                if len(rows) >= limit:
                    break

            # A short page means there are no more results.
            if len(items) < page_size:
                break
            page += 1

    return rows[:limit]
|
|
|
|
|
|
if __name__ == "__main__":
    # Honest self-test: a successful import is mandatory, followed by ONE real
    # fetch attempt that must NOT fail the build because of the network. It
    # reports whether TikTok answered or blocked/changed.
    print("import OK: scrape_tiktok_creative cargado")
    try:
        sample = scrape_tiktok_creative(country="ES", kind="hashtag", limit=10, period=7)
        if not sample:
            report = (
                "FETCH REAL: el endpoint respondio pero sin items "
                "(segmento vacio o anti-bot silencioso)."
            )
        else:
            report = f"FETCH REAL OK: {len(sample)} filas. Primera: {sample[0]}"
        print(report)
    except Exception as exc:  # noqa: BLE001 -- honest self-test, never propagates
        print(f"FETCH REAL FALLO (esperable desde headless/datacenter): {exc}")
|