e1e9bb7499
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
394 lines
13 KiB
Python
394 lines
13 KiB
Python
"""Capta productos populares de AliExpress como señal de e-commerce/dropshipping.
|
||
|
||
Extracts the JSON that AliExpress embeds in the HTML of its search/listing page
(``window.runParams`` / ``_dida_config`` / ``data`` scripts) instead of parsing
the JS-rendered DOM. AliExpress has strong anti-bot protection (captcha, 403,
fingerprinting of headless/datacenter clients), so this function is best-effort:
when the real fetch is blocked it raises ``RuntimeError`` with a clear message.
It NEVER fabricates data.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from typing import Any
|
||
|
||
|
||
_BASE = "https://www.aliexpress.com"
|
||
_WHOLESALE = f"{_BASE}/wholesale"
|
||
|
||
# Headers realistas de un navegador desktop. AliExpress fingerprint-ea agresivamente,
|
||
# así que enviamos un perfil coherente (Chrome estable + Accept-Language acorde a region).
|
||
_DESKTOP_HEADERS = {
|
||
"User-Agent": (
|
||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
|
||
"Chrome/124.0.0.0 Safari/537.36"
|
||
),
|
||
"Accept": (
|
||
"text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||
"image/avif,image/webp,image/apng,*/*;q=0.8"
|
||
),
|
||
"Accept-Encoding": "gzip, deflate, br",
|
||
"Connection": "keep-alive",
|
||
"Upgrade-Insecure-Requests": "1",
|
||
"Sec-Fetch-Dest": "document",
|
||
"Sec-Fetch-Mode": "navigate",
|
||
"Sec-Fetch-Site": "none",
|
||
"Sec-Fetch-User": "?1",
|
||
"Cache-Control": "max-age=0",
|
||
}
|
||
|
||
# AliExpress decide moneda/region por estas cookies. Mapa ship_to -> (region, locale, currency).
|
||
_REGION_MAP: dict[str, tuple[str, str, str]] = {
|
||
"ES": ("ES", "es_ES", "EUR"),
|
||
"US": ("US", "en_US", "USD"),
|
||
"GB": ("GB", "en_GB", "GBP"),
|
||
"FR": ("FR", "fr_FR", "EUR"),
|
||
"DE": ("DE", "de_DE", "EUR"),
|
||
"IT": ("IT", "it_IT", "EUR"),
|
||
"PT": ("PT", "pt_PT", "EUR"),
|
||
"MX": ("MX", "es_MX", "USD"),
|
||
"BR": ("BR", "pt_BR", "BRL"),
|
||
}
|
||
|
||
# Señales de bloqueo anti-bot en la respuesta.
|
||
_BLOCK_MARKERS = (
|
||
"punish", # /_____tmd_____/punish — captcha slider de AliExpress
|
||
"nc_token", # NoCaptcha de Alibaba
|
||
"captcha",
|
||
"Access Denied",
|
||
"baxia-dialog", # widget de verificacion
|
||
)
|
||
|
||
|
||
def _region_cookies(ship_to: str) -> dict[str, str]:
    """Build the cookies used to pin region, locale and currency.

    Args:
        ship_to: Country code; looked up case-insensitively in ``_REGION_MAP``.
            Unknown codes fall back to the ``"ES"`` entry.

    Returns:
        A dict with the ``aep_usuc_f``, ``intl_locale`` and ``xman_us_f`` cookies.
    """
    region, locale, currency = _REGION_MAP.get(ship_to.upper(), _REGION_MAP["ES"])
    return {
        # The separator before "region" must be a literal "&": writing it as
        # "&reg" + "ion" gets collapsed into the "®" sign by HTML-entity
        # decoding and silently drops the region parameter from the cookie.
        "aep_usuc_f": f"site=glo&c_tp={currency}&region={region}&b_locale={locale}",
        "intl_locale": locale,
        "xman_us_f": f"x_l=0&no_popup_today=n&zero_order=n&x_locale={locale}",
    }
|
||
|
||
|
||
def _build_url(query: str | None, category: str | None) -> str:
    """Return the listing URL to fetch, always sorted by ``total_tranpro_desc``.

    Resolution order:
      1. ``query`` given -> ``/wholesale`` search for that text.
      2. all-digit ``category`` -> ``/category/<id>/x.html``.
      3. any other ``category`` -> used as the search text.
      4. neither -> generic "hot products" search.
    """
    from urllib.parse import quote_plus

    sort_param = "SortType=total_tranpro_desc"

    # Only a numeric category (with no query taking precedence) maps to the
    # category page; everything else goes through the search endpoint.
    if not query and category and category.isdigit():
        return f"{_BASE}/category/{category}/x.html?{sort_param}"

    search_text = query or category
    encoded = quote_plus(search_text) if search_text else "hot+products"
    return f"{_WHOLESALE}?SearchText={encoded}&{sort_param}"
|
||
|
||
|
||
def _looks_blocked(html: str, status_code: int) -> bool:
    """Tell whether a response looks like an anti-bot block.

    A 403/429/503 status is treated as blocked outright. Otherwise the first
    6000 characters of ``html`` are searched, case-insensitively, for any of
    the ``_BLOCK_MARKERS`` substrings.
    """
    if status_code in {403, 429, 503}:
        return True
    snippet = html[:6000].lower()
    for marker in _BLOCK_MARKERS:
        if marker.lower() in snippet:
            return True
    return False
|
||
|
||
|
||
def _extract_embedded_json(html: str) -> dict[str, Any] | None:
|
||
"""Intenta varios patrones de JSON embebido que AliExpress ha usado a lo largo del tiempo.
|
||
|
||
El nombre/forma cambia con frecuencia, así que probamos en orden y nos quedamos
|
||
con el primero que parsee y contenga algo con pinta de items.
|
||
"""
|
||
patterns = (
|
||
r"window\.runParams\s*=\s*({.*?})\s*;\s*</script>",
|
||
r"window\._dida_config_\s*=\s*({.*?})\s*;",
|
||
r"_init_data_\s*=\s*{\s*data:\s*({.*?})\s*}\s*</script>",
|
||
r"window\.runParams\s*=\s*({.*?});",
|
||
)
|
||
for pat in patterns:
|
||
m = re.search(pat, html, re.DOTALL)
|
||
if not m:
|
||
continue
|
||
blob = m.group(1)
|
||
try:
|
||
data = json.loads(blob)
|
||
except (json.JSONDecodeError, ValueError):
|
||
continue
|
||
if isinstance(data, dict):
|
||
return data
|
||
return None
|
||
|
||
|
||
def _dig_items(data: dict[str, Any]) -> list[dict[str, Any]]:
|
||
"""Localiza la lista de productos dentro del JSON embebido, sea cual sea su anidación.
|
||
|
||
Las claves han variado entre 'mods.itemList.content', 'items', 'result.items'...
|
||
así que hacemos un walk genérico buscando la primera lista de dicts con pinta de
|
||
producto (tienen productId/title/trade).
|
||
"""
|
||
found: list[dict[str, Any]] = []
|
||
|
||
def _is_product(d: dict[str, Any]) -> bool:
|
||
keys = set(d.keys())
|
||
id_keys = {"productId", "product_id", "productid", "id"}
|
||
title_keys = {"title", "subject", "name"}
|
||
return bool(keys & id_keys) and bool(keys & title_keys)
|
||
|
||
def _walk(node: Any) -> None:
|
||
if found:
|
||
return
|
||
if isinstance(node, list):
|
||
product_like = [x for x in node if isinstance(x, dict) and _is_product(x)]
|
||
if len(product_like) >= 2:
|
||
found.extend(product_like)
|
||
return
|
||
for x in node:
|
||
_walk(x)
|
||
elif isinstance(node, dict):
|
||
for v in node.values():
|
||
_walk(v)
|
||
|
||
_walk(data)
|
||
return found
|
||
|
||
|
||
def _to_float(value: Any) -> float | None:
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, (int, float)):
|
||
return float(value)
|
||
s = str(value)
|
||
# Quita símbolos de moneda y separadores de miles; deja el primer número decimal.
|
||
m = re.search(r"\d[\d.,]*", s.replace(" ", " "))
|
||
if not m:
|
||
return None
|
||
num = m.group(0)
|
||
# Heurística: si hay coma y punto, asume coma = miles. Si solo coma, coma = decimal.
|
||
if "," in num and "." in num:
|
||
num = num.replace(",", "")
|
||
elif "," in num:
|
||
num = num.replace(",", ".")
|
||
try:
|
||
return float(num)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def _to_orders(value: Any) -> int | None:
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, int):
|
||
return value
|
||
s = str(value).lower()
|
||
# Formatos: "1,234 sold", "2.3k sold", "10000+ orders".
|
||
mult = 1
|
||
if "k" in s:
|
||
mult = 1000
|
||
m = re.search(r"\d[\d.,]*", s)
|
||
if not m:
|
||
return None
|
||
num = m.group(0).replace(",", "")
|
||
try:
|
||
base = float(num)
|
||
except ValueError:
|
||
return None
|
||
return int(base * mult)
|
||
|
||
|
||
def _normalize_item(
    raw: dict[str, Any], category: str | None
) -> dict[str, Any] | None:
    """Map one raw product dict onto the flat output schema.

    Each field is read from the first usable key among several known spellings.

    Args:
        raw: Product dict as found in the embedded JSON.
        category: Label copied verbatim into the ``category`` field.

    Returns:
        A dict with keys ``category, product_id, title, price, currency,
        orders, rating, url``, or ``None`` when no product id is present.
    """

    def _first(*values: Any) -> Any:
        # Same result as chaining the values with ``or``: the first truthy
        # one, otherwise the last value.
        chosen = None
        for chosen in values:
            if chosen:
                break
        return chosen

    def _nested(key: str, subkey: str) -> Any:
        # raw[key][subkey] when raw[key] is a dict, otherwise None.
        container = raw.get(key)
        return container.get(subkey) if isinstance(container, dict) else None

    pid = _first(
        raw.get("productId"),
        raw.get("product_id"),
        raw.get("productid"),
        raw.get("id"),
    )
    if pid is None:
        return None
    product_id = str(pid)

    title = _first(raw.get("title"), raw.get("subject"), raw.get("name"))
    if isinstance(title, dict):
        title = title.get("displayTitle") or title.get("seoTitle")
    title = str(title).strip() if title else None

    # Price lives under 'prices.salePrice.minPrice' or in flat variants.
    prices = raw.get("prices")
    sale_node = prices.get("salePrice", {}) if isinstance(prices, dict) else {}
    sale_is_dict = isinstance(sale_node, dict)

    nested_min = sale_node.get("minPrice") if sale_is_dict else None
    price = _to_float(
        _first(nested_min, raw.get("salePrice"), raw.get("price"), raw.get("minPrice"))
    )

    nested_currency = sale_node.get("currencyCode") if sale_is_dict else None
    currency = _first(nested_currency, raw.get("currency"), raw.get("currencyCode"))
    currency = str(currency) if currency else None

    orders = _to_orders(_nested("trade", "tradeDesc"))
    if orders is None:
        orders = _to_orders(
            _first(raw.get("orders"), raw.get("tradeCount"), raw.get("sales"))
        )

    rating = _to_float(
        _first(
            _nested("evaluation", "starRating"),
            raw.get("rating"),
            raw.get("averageStar"),
            raw.get("starRating"),
        )
    )

    link = _first(raw.get("productDetailUrl"), raw.get("url"), raw.get("detail_url"))
    if not link:
        link = f"{_BASE}/item/{product_id}.html"
    else:
        link = str(link)
        if link.startswith("//"):
            # Protocol-relative link: pin it to https.
            link = "https:" + link

    return {
        "category": category,
        "product_id": product_id,
        "title": title,
        "price": price,
        "currency": currency,
        "orders": orders,
        "rating": rating,
        "url": link,
    }
|
||
|
||
|
||
def scrape_aliexpress_trending(
    query: str | None = None,
    category: str | None = None,
    limit: int = 40,
    ship_to: str = "ES",
) -> list[dict]:
    """Collect popular AliExpress products (e-commerce/dropshipping signal).

    Performs ONE HTTP request to the listing page sorted by order count
    (``total_tranpro_desc``) and extracts the JSON embedded in the HTML. This
    is best-effort. When the response looks blocked (403/429/503 or an
    anti-bot marker) a ``RuntimeError`` with a clear message is raised. When
    the HTML carries no parseable JSON, ``[]`` is returned. Data is NEVER
    fabricated.

    Args:
        query: Search text (e.g. "kitchen gadgets"). Takes precedence in the URL.
        category: Numeric AliExpress category id, or a slug used as search
            text. Ignored when ``query`` is given.
        limit: Maximum number of products to return. Defaults to 40. A value
            of zero or less yields an empty list.
        ship_to: ISO-2 country code used to pin region/currency via cookies.
            Defaults to "ES".

    Returns:
        A list of dicts with exactly these keys:
        ``category, product_id, title, price, currency, orders, rating, url``.
        ``price``/``rating`` are ``float | None``, ``orders`` is ``int | None``.
        The list is empty when the HTML had no parseable embedded JSON.

    Raises:
        RuntimeError: If the request is blocked (captcha/403/429/503) or the
            network call fails.
    """
    import requests

    url = _build_url(query, category)
    cookies = _region_cookies(ship_to)
    headers = dict(_DESKTOP_HEADERS)
    _, locale, _ = _REGION_MAP.get(ship_to.upper(), _REGION_MAP["ES"])
    headers["Accept-Language"] = f"{locale.replace('_', '-')},en;q=0.8"

    try:
        resp = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            timeout=20,
            allow_redirects=True,
        )
    except requests.RequestException as exc:
        raise RuntimeError(
            f"scrape_aliexpress_trending: fallo de red contra {url}: {exc}"
        ) from exc

    html = resp.text or ""

    if _looks_blocked(html, resp.status_code):
        raise RuntimeError(
            f"scrape_aliexpress_trending: AliExpress bloqueó la request "
            f"(status={resp.status_code}, captcha/anti-bot). "
            f"Usa el browser MCP/CDP con sesión real para esta fuente."
        )

    data = _extract_embedded_json(html)
    if data is None:
        # No expected JSON in the HTML: the layout changed or an empty shell
        # was served. Return an honest [] instead of making anything up.
        return []

    raw_items = _dig_items(data)
    # The label is the query when one was given, otherwise the category.
    cat_label = query or category

    out: list[dict] = []
    seen: set[str] = set()
    for raw in raw_items:
        # Checked BEFORE appending so that limit <= 0 yields an empty list
        # rather than leaking the first product.
        if len(out) >= limit:
            break
        norm = _normalize_item(raw, cat_label)
        if norm is None:
            continue
        if norm["product_id"] in seen:
            continue
        seen.add(norm["product_id"])
        out.append(norm)

    return out
|
||
|
||
|
||
if __name__ == "__main__":
    # Honest self-test: the import must succeed, then ONE real fetch is tried.
    # A blocked or failed request is reported instead of raised, so the network
    # can never break the build.
    print("import OK: scrape_aliexpress_trending")
    expected_keys = set(
        (
            "category",
            "product_id",
            "title",
            "price",
            "currency",
            "orders",
            "rating",
            "url",
        )
    )
    try:
        rows = scrape_aliexpress_trending(query="phone holder", limit=5, ship_to="ES")
    except RuntimeError as exc:
        print(f"fetch real: BLOQUEADO/ERROR honesto -> {exc}")
    else:
        if not rows:
            print(
                "fetch real: 0 filas (HTML sin JSON embebido parseable "
                "— layout cambió o shell vacío). NO se inventan datos."
            )
        else:
            sample = rows[0]
            keys_ok = set(sample.keys()) == expected_keys
            print(
                f"fetch real: {len(rows)} filas obtenidas | "
                f"claves correctas={keys_ok}"
            )
            print(f" muestra: {sample}")
|