feat(eda): project_clusters_2d + describe_clusters_llm para el capitulo MODELOS
project_clusters_2d (pura): PCA(2)+KMeans sobre el MISMO subset estandarizado, devolviendo proyeccion 2D y labels alineados por fila + centroides en espacio PCA + perfiles de cluster desestandarizados. Es la pieza que garantiza la alineacion points<->labels que pca_explained y kmeans_segments no cubren (estandarizan por separado y kmeans descarta los labels). Habilita el scatter PCA coloreado por cluster (MUST-8.1). describe_clusters_llm (impura): micro-analisis LLM de los clusters en una sola llamada a ask_llm (grupo claude-direct), devuelve titulo + descripcion por cluster con degradacion dict-no-throw a titulos genericos si el LLM no responde (MUST-8.2). Ambas re-exportadas en datascience/__init__.py. Tests: 6/6 y 9/9 (sin red). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,160 @@
|
||||
"""Tests para describe_clusters_llm.
|
||||
|
||||
NO acceden a red ni a credenciales: _parse_clusters_json es testeable aislada y la
|
||||
unica via que llamaria al LLM (describe_clusters_llm) se prueba monkeypatcheando
|
||||
ask_llm con respuestas simuladas. Cubre golden (LLM ok), edge (cluster faltante,
|
||||
array envuelto en basura, lista vacia / input no-lista) y error (LLM caido, texto
|
||||
no parseable) — todos sin tocar la red.
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import json
|
||||
|
||||
from datascience.describe_clusters_llm import (
|
||||
_parse_clusters_json,
|
||||
describe_clusters_llm,
|
||||
)
|
||||
|
||||
# Sample cluster profiles shaped like the output of project_clusters_2d.
_PROFILES = [
    {
        "cluster": 0,
        "size": 60,
        "pct": 60.0,
        "centroid_original": {"acidez": 8.5, "alcohol": 9.2},
        "distinctive": ["acidez", "alcohol"],
        "centroid_z": {"acidez": 1.4, "alcohol": -0.9},
    },
    {
        "cluster": 1,
        "size": 40,
        "pct": 40.0,
        "centroid_original": {"acidez": 5.1, "alcohol": 13.0},
        "distinctive": ["alcohol"],
        "centroid_z": {"acidez": -0.7, "alcohol": 1.6},
    },
]

# Feature names handed to describe_clusters_llm together with the profiles;
# "azucar" is listed here but does not appear in any profile above.
_FEATURES = ["acidez", "alcohol", "azucar"]
|
||||
|
||||
|
||||
def _patch_ask_llm(monkeypatch, returner):
    """Replace ``ask_llm`` in the module under test with a canned-reply stub.

    The stub accepts ``prompt`` plus optional ``model``, ``system`` and
    ``echo`` arguments, ignores all of them, and always returns ``returner``
    (the simulated LLM reply, not a callable).
    """

    def _fake_ask_llm(prompt, model="x", system="", echo=True):
        return returner

    target = importlib.import_module("datascience.describe_clusters_llm")
    monkeypatch.setattr(target, "ask_llm", _fake_ask_llm)
|
||||
|
||||
|
||||
# --- _parse_clusters_json (parser puro, sin red) ---
|
||||
|
||||
|
||||
def test_parse_clusters_json_valid_array():
    """A well-formed JSON array of cluster entries parses back to the same list."""
    expected = [
        {"cluster": 0, "title": "A", "description": "desc a"},
        {"cluster": 1, "title": "B", "description": "desc b"},
    ]
    assert _parse_clusters_json(json.dumps(expected), 2) == expected
|
||||
|
||||
|
||||
def test_parse_clusters_json_wrapped_in_junk_text():
    """The JSON array is still recovered when extra text surrounds it."""
    array_json = json.dumps(
        [{"cluster": 0, "title": "Solo uno", "description": "d"}]
    )
    reply = f"Claro, aqui tienes el resultado:\n{array_json}\nEspero que sirva."
    result = _parse_clusters_json(reply, 1)
    assert result[0]["title"] == "Solo uno"
    assert result[0]["cluster"] == 0
|
||||
|
||||
|
||||
def test_parse_clusters_json_non_json_returns_none():
    """Text without a JSON array yields ``None`` instead of raising."""
    not_arrays = ("no hay json aqui", "", "{solo un objeto}")
    for text in not_arrays:
        assert _parse_clusters_json(text, 2) is None
|
||||
|
||||
|
||||
def test_parse_clusters_json_fills_missing_cluster_by_index():
    """Entries lacking a ``cluster`` key receive their position in the array."""
    entries = [
        {"title": "A", "description": "d"},
        {"title": "B", "description": "e"},
    ]
    result = _parse_clusters_json(json.dumps(entries), 2)
    assert result[0]["cluster"] == 0
    assert result[1]["cluster"] == 1
    assert result[0]["title"] == "A"
|
||||
|
||||
|
||||
# --- describe_clusters_llm (con ask_llm monkeypatcheado, sin red) ---
|
||||
|
||||
|
||||
def test_describe_clusters_llm_ok_with_monkeypatched_llm(monkeypatch):
    """Golden path: a valid stubbed LLM reply is surfaced with an empty note."""
    canned_entries = [
        {
            "cluster": 0,
            "title": "Vinos de alta acidez",
            "description": "Acidez por encima de la media y graduacion baja.",
        },
        {
            "cluster": 1,
            "title": "Vinos de alta graduacion",
            "description": "Alcohol claramente por encima de la media.",
        },
    ]
    _patch_ask_llm(monkeypatch, json.dumps(canned_entries))

    result = describe_clusters_llm(_PROFILES, _FEATURES)

    assert result["note"] == ""
    assert result["model"] == "claude-haiku-4-5-20251001"
    clusters = result["clusters"]
    assert len(clusters) == 2
    assert clusters[0]["title"] == "Vinos de alta acidez"
    # Each entry must expose exactly these three keys and nothing else.
    assert set(clusters[0].keys()) == {"cluster", "title", "description"}
|
||||
|
||||
|
||||
def test_describe_clusters_llm_degrades_on_empty_response(monkeypatch):
    """An empty ``ask_llm`` reply falls back to generic titles plus a note."""
    # The stub returns "" to simulate an LLM error or a network outage.
    _patch_ask_llm(monkeypatch, "")

    result = describe_clusters_llm(_PROFILES, _FEATURES)

    clusters = result["clusters"]
    assert clusters[0]["title"] == "Cluster 0"
    assert clusters[1]["title"] == "Cluster 1"
    assert clusters[0]["description"] == ""
    assert result["note"] == "LLM no disponible"
    assert result["model"] == "claude-haiku-4-5-20251001"
|
||||
|
||||
|
||||
def test_describe_clusters_llm_degrades_on_unparseable_response(monkeypatch):
    """A reply with no parseable JSON falls back to generic titles and a parse note."""
    _patch_ask_llm(monkeypatch, "lo siento, no puedo ayudarte con eso")

    result = describe_clusters_llm(_PROFILES, _FEATURES)

    clusters = result["clusters"]
    assert clusters[0]["title"] == "Cluster 0"
    assert clusters[1]["title"] == "Cluster 1"
    assert result["note"] == "parse fallido"
|
||||
|
||||
|
||||
def test_describe_clusters_llm_empty_list_skips_llm(monkeypatch):
    """An empty profile list short-circuits without ever calling ``ask_llm``."""

    def fail_if_called(*args, **kwargs):
        # Any call to the LLM entry point is itself the test failure.
        raise AssertionError("ask_llm no debe llamarse con lista vacia")

    target = importlib.import_module("datascience.describe_clusters_llm")
    monkeypatch.setattr(target, "ask_llm", fail_if_called)

    result = describe_clusters_llm([], _FEATURES)

    assert result["clusters"] == []
    assert result["note"] == "sin clusters"
|
||||
|
||||
|
||||
def test_describe_clusters_llm_non_list_input_skips_llm(monkeypatch):
    """A non-list input (``None``) yields no clusters and never calls ``ask_llm``.

    ``ask_llm`` is replaced with a guard that raises, so the "skips LLM"
    claim is enforced and a regression cannot reach the real client (and the
    network) from this test.
    """

    def fail_if_called(*args, **kwargs):
        # Any call to the LLM entry point is itself the test failure.
        raise AssertionError("ask_llm no debe llamarse con input no-lista")

    target = importlib.import_module("datascience.describe_clusters_llm")
    monkeypatch.setattr(target, "ask_llm", fail_if_called)

    result = describe_clusters_llm(None, _FEATURES)

    assert result["clusters"] == []
    assert result["note"] == "sin clusters"
    assert result["model"] == "claude-haiku-4-5-20251001"
|
||||
Reference in New Issue
Block a user